Beispiel #1
0
 def tb_paths(self):
     """Add one row to ``self.dataframe`` per usable ``path`` entry of each ticket URL.

     For every row of ``self.ticpaths.dataframe`` the ``tic_url`` value is wrapped
     in ``URL`` and positions 0-9 of its ``path`` are inspected. Each entry that
     differs from the string "NaN" yields a row holding the path entry, the client
     public id, the 1-based level, a recurrence of 1, the ticket id and a client
     score.

     NOTE(review): what ``URL(...).path[i]`` yields (a character or a path
     segment) is not visible here, and nothing guards against ``path`` having
     fewer than ten entries -- confirm against the URL class.

     :return: ``self.dataframe`` after the rows have been written
     """
     # `url` is a row position in self.ticpaths.dataframe, not a URL string.
     for url in range(len(self.ticpaths.dataframe)):
         for i in range(0, 10):
             if URL(self.ticpaths.dataframe[['tic_url']].iloc[url].tic_url).path[0 + i] != "NaN":
                 # Written at label len(self.dataframe); the following writes use
                 # len(self.dataframe) - 1, so this is presumably expected to
                 # create a new last row (assumes a default 0..n-1 index -- TODO confirm).
                 self.dataframe.at[len(self.dataframe), 'alz_path_path'] =URL(self.ticpaths.dataframe[['tic_url']].iloc[url].tic_url).path[0 + i]
                 self.dataframe.at[len(self.dataframe) - 1, 'alz_path_public_id'] = self.ticpaths.dataframe.at[url, 'cli_public_id']
                 self.dataframe.at[len(self.dataframe) - 1, 'alz_path_level'] = i + 1
                 self.dataframe.at[len(self.dataframe) - 1, 'alz_path_recurrence'] = 1
                 self.dataframe.at[len(self.dataframe) - 1, 'alz_path_ticket_id'] = self.ticpaths.dataframe.at[url, 'tic_public_id_ticket']
                 # The score combines two accumulated 'alz_path_recurrence' searches:
                 # the first keyed on (public id, path), the second on the path alone.
                 self.dataframe.at[len(self.dataframe) - 1, 'alz_client_score'] = Operations.clientscore(self,
                                                                                                         Table.tb_acumulative(self,
                                                                                                                              Database.db_perform_search(self,
                                                                                                                                                         self.table_name,
                                                                                                                                                         'alz_path_recurrence',
                                                                                                                                                         ['alz_path_public_id','alz_path_path'],
                                                                                                                                                         (self.dataframe.loc[len(self.dataframe) - 1][['alz_path_public_id','alz_path_path']]).tolist()
                                                                                                                                                         )
                                                                                                                              ),
                                                                                                         Table.tb_acumulative(self,
                                                                                                                              Database.db_perform_search(self,
                                                                                                                                                         self.table_name,
                                                                                                                                                         'alz_path_recurrence',
                                                                                                                                                         ['alz_path_path'],
                                                                                                                                                         (self.dataframe.loc[len(self.dataframe) - 1][['alz_path_path']]).tolist()
                                                                                                                                                         )
                                                                                                                              )
                                                                                                         )
     return self.dataframe
Beispiel #2
0
 def test_canonicalFormForValidURLs(self):
     """Check that URLs already in canonical form come back unchanged from ``normalized``."""
     canonical_urls = (
         "http://example.com/",
         "http://example.com/?q=%C3%871",
         "http://example.com/?q=%E2%85%A0",
         "http://example.com/?q=%5C",
         "http://example.com/~jane",
         "http://example.com/a/b",
         "http://example.com:8080/",
         "http://*****:*****@example.com/",
         "ftp://ftp.is.co.za/rfc/rfc1808.txt",
         "http://www.ietf.org/rfc/rfc2396.txt",
     )
     # Checked one at a time, in order, so the first mismatch aborts the test.
     for canonical in canonical_urls:
         self.assertEqual(URL(canonical).normalized, canonical)
 def set_url(self, url):
     """Store *url* as given text in ``self.url`` and as a ``URL`` object in ``self.url_split``."""
     if not isinstance(url, URL):
         # Anything that is not already a URL is kept as given and wrapped.
         self.url = url
         self.url_split = URL(self.url)
         return
     # A URL instance is reused as-is; only its string form is derived.
     self.url = str(url)
     self.url_split = url
Beispiel #4
0
 def _get_urls(self):
     """Build a URL object for every 'Url' element carrying both a template and a type."""
     collected = []
     for node in self._get_elements('Url'):
         url_template = node.getAttribute('template')
         url_type = node.getAttribute('type')
         # Elements missing either attribute are ignored.
         if not (url_template and url_type):
             continue
         entry = URL()
         entry.template = url_template
         entry.type = url_type
         collected.append(entry)
     return collected
Beispiel #5
0
 def _get_urls(self):
     """Return URL objects for the 'Url' elements that define a template and a type."""
     found = []
     for url_element in self._get_elements('Url'):
         template_attr = url_element.getAttribute('template')
         type_attr = url_element.getAttribute('type')
         if template_attr and type_attr:
             # Both attributes present: copy them onto a fresh URL object.
             new_url = URL()
             new_url.template = template_attr
             new_url.type = type_attr
             found.append(new_url)
     return found
Beispiel #6
0
class TestSettings(unittest.TestCase):
    """Tests for the optional keyword settings accepted by the URL constructor."""

    def test_defaults(self):
        """A bare host built with useDefaults=True should expose the full default URL."""
        bare_host = 'front1.example.co.uk'
        self.url = URL(bare_host, useDefaults=True)
        expected_url = 'http://front1.example.co.uk:80/'
        self.assertEqual(self.url.url, expected_url)

    def test_file_ext_optional(self):
        """With fileExtensionOptional=True the last segment counts as a file when moving up."""
        self.url = URL('example.com/path/to/index', fileExtensionOptional=True)
        path_before = self.url.path
        self.assertEqual(path_before, '/path/to/index')
        self.url.move_up_level()
        path_after = self.url.path
        self.assertEqual(path_after, '/path/')
Beispiel #7
0
 def test_validator(self):
   """Scheme-less hosts must be reported invalid; full https URLs must be valid."""
   rejected = ('www.google.com', 'google.com')
   accepted = (
       'https://www.google.com/googleplus/whousesthis',
       'https://www.google.com/googleplus/whousesthis?true=false&not=really',
       'https://www.google.com/',
   )
   for candidate in rejected:
     url = URL(candidate)
     self.assertFalse(url.isValid())
   for candidate in accepted:
     url = URL(candidate)
     self.assertTrue(url.isValid())
Beispiel #8
0
def fetch_url(protocol,
              hostname,
              port,
              useragent,
              timeout,
              retries,
              storage=None):
    """Fetch one URL and append its result to *storage*.

    A ``URL`` object is built from the arguments (the port is passed as a
    string), ``getdata()`` is called on it and its ``result`` attribute is
    appended to *storage*.

    :param storage: list that receives the result; a new list is created when
        omitted. The previous ``storage=[]`` default was one list shared by
        every call, so results piled up in an object no caller could reach.
    :return: the list the result was appended to (previously ``None``), so the
        result stays reachable when *storage* is omitted
    """
    if storage is None:
        storage = []
    url_obj = URL(protocol, hostname, str(port), useragent, timeout,
                  retries)  # define url object
    url_obj.getdata()
    storage.append(url_obj.result)
    return storage
Beispiel #9
0
 def runTest(self):
     """Compare URL(value1) with URL(value2) using ``comp`` and check it against ``expected``.

     ``value1``, ``value2``, ``comp`` and ``expected`` come from the enclosing
     scope. An operator other than '==', '>' or '<' performs no check.
     """
     left = URL(value1)
     right = URL(value2)
     if comp == '==':
         outcome = left == right
     elif comp == '>':
         outcome = left > right
     elif comp == '<':
         outcome = left < right
     else:
         return
     assert outcome == expected, (expected, comp, value1, value2)
Beispiel #10
0
 def test_canonicalizer(self):
     """getNormalized() must add scheme and trailing slash and keep full URLs unchanged."""
     expectations = (
         ('www.google.com', 'http://www.google.com/'),
         ('google.com', 'http://google.com/'),
         ('https://www.google.com/googleplus/whousesthis',
          'https://www.google.com/googleplus/whousesthis'),
         ('https://www.google.com/googleplus/whousesthis?true=false&not=really',
          'https://www.google.com/googleplus/whousesthis?true=false&not=really'),
     )
     for raw, canonical in expectations:
         url = URL(raw).getNormalized()
         self.assertEqual(url, canonical)
Beispiel #11
0
    def login(self, email, password, **kwargs):
        """
        Log in with an e-mail address and password.

        The captcha needed for login is saved in the current directory; the
        user has to read it and type it in.

        :return: True when the server answers with ``r == 0``, otherwise False
        """
        # The XSRF token is fetched before the captcha (dict literals evaluate in order).
        form = {
            'email': email,
            'password': password,
            '_xsrf': self._get_xsrf(**kwargs),
            "captcha": self._get_captcha(**kwargs),
            'remember_me': 'true'
        }

        response = self._session.post(URL.login(), data=form, **kwargs)
        if not response.ok:
            self.logger.error(response.content)
            return False

        data = response.json()
        if data.get("r") != 0:
            self.logger.info("登录失败, %s" % data.get("msg"))
            return False

        # Login succeeded: persist the session cookies.
        self._session.cookies.save()
        self.logger.info("登录成功")
        return True
Beispiel #12
0
    def send_message(self,
                     content,
                     user_id=None,
                     profile_url=None,
                     user_slug=None,
                     **kwargs):
        """
        Send a private message to the given user.

        :param content: text of the private message
        :param user_id: id of the user
        :param profile_url: URL of the user's profile page
        :param user_slug: the user's personal domain (slug)
        :return: the decoded JSON response
        :raises ZhihuError: if none of user_id, profile_url, user_slug is given

        >>> send_message(profile_url = "https://www.zhihu.com/people/xiaoxiaodouzi")
        >>> send_message(user_slug = "xiaoxiaodouzi")
        >>> send_message(user_id = "1da75b85900e00adb072e91c56fd9149")
        """

        if not (user_id or profile_url or user_slug):
            raise ZhihuError("至少指定一个关键字参数")

        if user_id is None:
            # Resolve slug -> id; the slug itself is derived from the profile URL if absent.
            if user_slug is None:
                user_slug = self._user_slug(profile_url)
            user_id = self._user_id(user_slug)

        payload = {"type": "common", "content": content, "receiver_hash": user_id}
        response = self._session.post(URL.message(), json=payload, **kwargs)
        result = response.json()
        if result.get("error"):
            self.logger.info("私信发送失败, %s" % result.get("error").get("message"))
        else:
            self.logger.info("发送成功")
        return result
Beispiel #13
0
    def user(self, user_slug=None, profile_url=None):
        """
        Fetch a user's profile information.

        :param user_slug: the user's personal domain (slug)
        :param profile_url: URL of the user's profile page
        :return: dict decoded from the JSON response, or None when the
                 request is not OK (the status code is logged)
        :raises Exception: if neither user_slug nor profile_url is given

        >>> user(profile_url = "https://www.zhihu.com/people/xiaoxiaodouzi")
        >>> user(user_slug = "xiaoxiaodouzi")

        """

        if not any([profile_url, user_slug]):
            raise Exception("至少指定一个关键字参数")

        if not user_slug and profile_url:
            # Raw string: in a normal literal the backslash-w sequence is an
            # invalid escape (a warning on current Python versions). The dots
            # are escaped so they match only literal dots of the host name.
            pattern = re.compile(r"https?://www\.zhihu\.com/people/([\w-]+)")
            match = pattern.search(profile_url)
            if match:
                user_slug = match.group(1)
        response = self._session.get(URL.profile(user_slug))
        if response.ok:
            return response.json()
        self.logger.error(u"获取用户信息失败, status code: %s" %
                          response.status_code)
        return None
Beispiel #14
0
 def set_url(self, url):
     """Keep *url* as text in ``self.url`` and as a ``URL`` object in ``self.url_split``."""
     already_parsed = isinstance(url, URL)
     if already_parsed:
         # Reuse the parsed object and derive the text from it.
         self.url = str(url)
         self.url_split = url
         return
     # Otherwise store the value as given and parse it.
     self.url = url
     self.url_split = URL(self.url)
Beispiel #15
0
 def _get_captcha(self, **kwargs):
     """Download the captcha image to 'captcha.jpg' and return what the user types in."""
     # Millisecond timestamp passed to the captcha URL builder.
     timestamp_ms = str(int(time.time() * 1000))
     response = self._session.get(URL.captcha(timestamp_ms), **kwargs)
     with open('captcha.jpg', 'wb') as image_file:
         image_file.write(response.content)
     return input("验证码:")
Beispiel #16
0
  def test_eq(self):
    """Exercise URL.__eq__ on pairs that should and should not compare equal."""
    cases = (
      ('www.google.com', 'www.google.com/', True),
      ('google.com', 'www.google.com/', False),
      ('www.google.com/hello', 'www.google.com/', False),
      ('https://www.google.com/', 'http://www.google.com/', False),
      ('google.com', 'google.com', True),
    )
    for first, second, should_match in cases:
      url1 = URL(first)
      url2 = URL(second)
      # __eq__ is called directly, on the second URL, as the test intends.
      outcome = url2.__eq__(url1)
      if should_match:
        self.assertTrue(outcome)
      else:
        self.assertFalse(outcome)
Beispiel #17
0
    def process_header(self, ctx: Context):
        """Attach the per-domain config to *ctx* and adjust its header arguments.

        The config is looked up by the host part of the request's Host header.
        If 'x-forward-for' is set, the source address is written to the
        'X-Forwarded-For' header argument; every name listed under '-' is
        removed from the header arguments.
        """
        domain = URL(ctx.header.host).host
        cfg = self.cfg_by_domain(domain)
        ctx.cfg = cfg

        if cfg.get('x-forward-for'):
            ctx.header.args['X-Forwarded-For'] = ctx.src_addr[0]

        # A missing or empty '-' entry simply removes nothing.
        for unwanted in cfg.get('-') or ():
            utils.del_key(ctx.header.args, unwanted)
Beispiel #18
0
 def __process(self, task):
     """
     Collect the link targets found on the fetched page.

     Links whose netloc is set but not listed in ``Worker.Worker_basedomain``
     are dropped; the rest are standardized against the task URL.

     :return: set of standardized URLs
     """
     task_url = URL(task)
     soup = bs(self.__response.content, "lxml")

     def has_usable_href(tag):
         # Keep tags with an href that does not contain "javascript".
         return tag.get("href") and "javascript" not in tag.get("href")

     # A set comprehension removes duplicated links.
     hrefs = {tag.get("href") for tag in soup.find_all(has_usable_href)}

     target = set()
     for href in hrefs:
         link = URL(href)
         if not link.netloc or link.netloc in Worker.Worker_basedomain:
             link.standardize(task_url.url)
             target.add(link.url)
     return target
Beispiel #19
0
    def config(self, **config):
        """Apply keyword settings as instance attributes, validate them and start the service.

        :raises ValueError: if ``path`` or ``urls`` is falsy after the settings are applied
        """
        self._config = config
        # Every keyword becomes an attribute of the same name.
        self.__dict__.update(config)

        if not self.path:
            raise ValueError("Path is Missing")
        if not self.urls:
            raise ValueError("Uri is Missing")

        self._urls = list(map(URL, self.urls))
        self.init_service()
Beispiel #20
0
    def follow(self, user_slug=None, profile_url=None, **kwargs):
        """
        Follow a user.

        :param user_slug: the user's personal domain (slug)
        :param profile_url: URL of the user's profile page
        :return: {"follower_count": int}; None when the request is not OK
        :raises ZhihuError: if neither user_slug nor profile_url is given

        >>> follow(profile_url = "https://www.zhihu.com/people/xiaoxiaodouzi")
        >>> follow(user_slug = "xiaoxiaodouzi")
        """
        if not (profile_url or user_slug):
            raise ZhihuError("至少指定一个关键字参数")

        if user_slug is None:
            user_slug = self._user_slug(profile_url)

        logging.info(URL.follow(user_slug))
        response = self._session.post(URL.follow(user_slug), **kwargs)
        logging.info(response.text)
        if not response.ok:
            self.logger.error(u"关注失败, status code: %s" % response.status_code)
            return None
        return response.json()
Beispiel #21
0
 def setupURLs(self):
     """Fill ``self.urls`` with one URL per origin, all aimed at the first destination."""
     self.urls = []
     origins = self.getOrigins()
     destination = self.destinations[0]
     for origin in origins:
         # Dates are re-read for every origin, start date first.
         first_day = self.getStartDate()
         last_day = self.getEndDate()
         search = dict(origin=origin,
                       destination=destination,
                       date1=first_day,
                       date2=last_day)
         self.urls.append(URL(search))
Beispiel #22
0
  def test_gt(self):
    """Each second URL must compare greater than the first via URL.__gt__."""
    ordered_pairs = (
      ('www.google.com', 'www.google.com/hello/world'),
      ('www.google.com', 'www.google.com/?hello=world'),
      ('www.google.com/a', 'www.google.com/b/oh'),
      ('www.foogle.com/what', 'www.google.com/'),
    )
    for smaller, larger in ordered_pairs:
      url1 = URL(smaller)
      url2 = URL(larger)
      self.assertTrue(url2.__gt__(url1))
def construct_mongo_url():
    """Build the MongoDB connection URL from ``MONGO_CONFIG`` and return it as a string.

    A host of '127.0.0.1' or 'localhost' gives a plain ``mongodb`` URL with
    host and port. Any other host gives a ``mongodb+srv`` URL with
    credentials, the database as path and the ``retryWrites``/``w`` options.
    """
    url = URL('')
    if MONGO_CONFIG.host in ('127.0.0.1', 'localhost'):
        url.scheme = 'mongodb'
        url.netloc = f'{MONGO_CONFIG.host}:{MONGO_CONFIG.port}'
        return str(url)

    url.scheme = 'mongodb+srv'
    url.netloc = f'{MONGO_CONFIG.username}:{MONGO_CONFIG.password}@{MONGO_CONFIG.host}'
    url.path = MONGO_CONFIG.database
    for option, value in (('retryWrites', 'true'), ('w', 'majority')):
        url.query_param.set(option, value)
    return str(url)
Beispiel #24
0
def url_to_X(url_name, features_names_file):
    """Return the feature matrix (a list holding one row) for a single URL.

    The feature names are loaded with ``joblib`` from *features_names_file*.
    The URL is processed and its static and dynamic features are merged, with
    dynamic values overriding static ones of the same name. The row holds the
    values in feature-name order; a name the URL has no value for becomes 0.

    :param url_name: the URL to analyse
    :param features_names_file: path handed to ``joblib.load``
    :return: ``[row]`` where ``row`` has one entry per feature name
    """
    METHOD = 'Selenium'  # 'Selenium' or 'urllib2'
    UA = 'PhantomJS'  # 'firefox' or None
                      # PhantomJS requires command: phantomjs  --webdriver 28042
    f_names = joblib.load(features_names_file)

    u = URL(url_name)
    u.process(user_agent=UA, method=METHOD, to_reload=True, collection=None)
    features_url = {}
    features_url.update(u.static_features)
    features_url.update(u.dynamic_features)

    # dict.get replaces the former bare ``except: pass`` around the lookup,
    # and iterating the names directly removes the Python-2-only ``xrange``.
    x = [features_url.get(name, 0) for name in f_names]
    return [x]
Beispiel #25
0
def main():
    """Print a validity / canonical-form / uniqueness report for every URL in a file.

    The file name is taken from ``sys.argv[1]``; with any other argument count
    a usage line is printed and the program exits with status 1. Every print
    call passes one string, so the output is the same on Python 2 and 3.
    """
    # system prompt
    # ``!=`` instead of ``is not``: identity comparison with an int literal
    # only works by accident of small-integer caching.
    if len(sys.argv) != 2:
        print('use: python main.py <input-file>\n')
        sys.exit(1)

    # read input file and create URL objects; the file is closed even on error
    with open(sys.argv[1]) as f:
        urls = [URL(x) for x in readfile(f)]

    # count occurrences of original and canonical URLS
    dic_source = {}
    dic_canonical = {}
    for url in urls:
        source = url.getURL()
        dic_source[source] = dic_source.get(source, 0) + 1
        canonical = url.getNormalized()
        dic_canonical[canonical] = dic_canonical.get(canonical, 0) + 1

    # print properties of each URL read
    for url in urls:
        source = url.getURL()
        print("Source: " + source)
        print("Valid: True" if url.isValid() else "Valid: False")

        canonical = url.getNormalized()
        print("Canonical: " + canonical)

        if dic_source[source] == 1:
            print("Source unique: True")
        else:
            print("Source unique: False")

        if dic_canonical[canonical] == 1:
            print("Canonicalized URL unique: True\n")
        else:
            print("Canonicalized URL unique: False\n")
Beispiel #26
0
    def forward(self, ctx):
        """Connect to the configured upstream, send the header and relay both directions.

        A Unix-protocol upstream is reached through an AF_UNIX stream socket
        at ``host``; any other upstream through a default socket at
        ``(host, port)``. Both connections are closed once relaying finishes.
        """
        upstream = URL(ctx.cfg['upstream'])

        if upstream.protocol != Protocol.unix:
            right_conn = socket.socket()
            address = (upstream.host, upstream.port)
        else:
            right_conn = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
            address = upstream.host
        right_conn.connect(address)
        right_conn.sendall(ctx.header.encode())

        # One direction is handed to spawn(), the other runs here; then wait for it.
        relay_task = spawn(self.relay, (ctx.left_conn, right_conn))
        self.relay(right_conn, ctx.left_conn)
        relay_task.join()

        ctx.left_conn.close()
        right_conn.close()
Beispiel #27
0
 def __init__(self):
     """Create the helper objects, the User-Agent string and the POS blacklist patterns."""
     # Helpers are created in this order; all but the database receive this instance.
     self.db = Database()
     self.nlpir = PyNLPIR(self)
     self.renren = Renren(self)
     self.url = URL(self)
     self.UA = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv: 17.0) "
                "Gecko/17.0 Firefox/17.0")
     # Regular expressions kept under the name ``pos_blacklist_regexs``.
     self.pos_blacklist_regexs = [
         "^emoticon$", "^title$", "^ude.", "^w.*",
         "^vshi", "^vyou", "^p.*", "^ule",
         "^m.*", "^cc", "^session$",
     ]
Beispiel #28
0
 def __make_request(self, task):
     """
     Request ``task`` through the session and keep the response on the instance.

     An HTTPError's own response is stored without further checks. Any other
     request failure, a final URL outside ``Worker.Worker_basedomain`` or a
     4xx/5xx status is logged where applicable and aborts the task.

     :return: None
     :raises TaskAbort: when the task must not be processed further
     """
     self.__response = None
     try:
         self.__response = self.__session.get(task)
     except requests.exceptions.HTTPError as http_error:
         # requests raises HTTPError for a 4xx/5xx answer; keep the
         # response that came with it.
         self.__response = http_error.response
     except requests.exceptions.RequestException as request_error:
         report = "{url} - {error}".format(url=task.encode("utf-8"),
                                           error=request_error)
         Worker.Worker_error_logger.warning(report)
         Worker.Worker_debug_logger.warning(report)
         raise TaskAbort()
     except Exception as unexpected:
         # Safety net for anything that is not a requests error.
         report = "{url} - An unexpected error occurred: {err}".format(
             url=task.encode("utf-8"), err=unexpected)
         Worker.Worker_sys_logger.exception(report)
         Worker.Worker_debug_logger.exception(report)
         raise TaskAbort()
     else:
         # Reached only when get() returned normally.
         allowed_domains = Worker.Worker_basedomain
         if allowed_domains and URL(
                 self.__response.url).netloc not in allowed_domains:
             # Redirected to a domain outside the base domains: ignore it.
             raise TaskAbort()

         status = self.__response.status_code
         if 400 <= status <= 599:
             # HTTP error status: log it and give up on this task.
             report = "{url} - {status}".format(url=task.encode("utf-8"),
                                                status=status)
             Worker.Worker_error_logger.warning(report)
             Worker.Worker_debug_logger.warning(report)
             raise TaskAbort()
Beispiel #29
0
    def send_message(self,
                     content,
                     user_id=None,
                     profile_url=None,
                     user_slug=None):
        """
        Send a private message to the given user.

        :param content: text of the private message
        :param user_id: id of the user
        :param profile_url: URL of the user's profile page
        :param user_slug: the user's personal domain (slug)
        :return: the decoded JSON response
        :raises Exception: if none of user_id, profile_url, user_slug is given

        >>> send_message(profile_url = "https://www.zhihu.com/people/xiaoxiaodouzi")
        >>> send_message(user_slug = "xiaoxiaodouzi")
        >>> send_message(user_id = "1da75b85900e00adb072e91c56fd9149")
        """

        if not any([user_id, profile_url, user_slug]):
            raise Exception("至少指定一个关键字参数")

        if not user_id:
            if not user_slug and profile_url:
                # Raw string: in a normal literal the backslash-w sequence is
                # an invalid escape (a warning on current Python versions).
                # The dots are escaped so they match only literal dots.
                pattern = re.compile(r"https?://www\.zhihu\.com/people/([\w-]+)")
                match = pattern.search(profile_url)
                if match:
                    user_slug = match.group(1)
            if user_slug:
                # One lookup path for a given and for an extracted slug.
                profile = self.user(user_slug)
                user_id = profile.get("id")

        data = {"type": "common", "content": content, "receiver_hash": user_id}
        response = self._session.post(URL.message(), json=data)
        data = response.json()
        if data.get("error"):
            self.logger.info("私信发送失败, %s" % data.get("error").get("message"))
        else:
            self.logger.info("发送成功")
        return data
Beispiel #30
0
    def send_subscription_confirmation(cls, email: str,
                                       subs_lst: List[DB.EmailSubscription]):
        """Send the e-mail asking the user to confirm their subscriptions.

        The confirmation link carries the e-mail address plus one
        ``visa_type``/``code``/``till`` triple per subscription. Sending is
        tried up to ten times.

        :return: the truthy result of the first successful ``send_email``
            call, or False when all ten attempts fail
        """
        confirmation_url = URL(
            f'https://{FRONTEND_BASE_URI}/visa/email/subscription')
        confirmation_url.query_param.set('email', email)
        for visa_type, code, till in subs_lst:
            for key, value in (('visa_type', visa_type.value),
                               ('code', code.value),
                               ('till', till)):
                confirmation_url.query_param.append(key, value)

        # One <li> per subscription; datetime.max means an open-ended subscription.
        bullet_points = []
        for vt, ec, tl in subs_lst:
            visa_label = VISA_TYPE_DETAILS[vt]
            embassy_name = next(
                (e.name_en for e in USEmbassy.get_embassy_lst() if e.code == ec),
                'None')
            until = 'FOREVER' if tl == datetime.max else tl.strftime('%Y/%m/%d')
            bullet_points.append('<li>{} Visa at {} till {}.</li>'.format(
                visa_label, embassy_name, until))
        subscription_str = '<ul>\n{}\n</ul>'.format('\n'.join(bullet_points))

        content = SUBSCRIPTION_CONFIRMATION_CONTENT.format(
            user=email.split('@')[0],
            email=email,
            subscription_str=subscription_str,
            confirmation_url=confirmation_url,
        )

        attempts_left = 10  # for robust
        while attempts_left > 0:
            sent = cls.send_email(
                title=SUBSCRIPTION_CONFIRMATION_TITLE.format(email=email),
                content=content,
                receivers=[email])
            if sent:
                return sent
            attempts_left -= 1
        return False
def main():
    """Print a validity / canonical-form / uniqueness report for every URL in a file.

    The file name is taken from ``sys.argv[1]``; with any other argument count
    a usage line is printed and the program exits with status 1. Every print
    call passes one string, so the output is the same on Python 2 and 3.
    """
    # ``!=`` instead of ``is not``: identity comparison with an int literal
    # only works by accident of small-integer caching.
    if len(sys.argv) != 2:
        print('Usage: python main.py input-file')
        sys.exit(1)

    # The with-block closes the input file, which was previously left open.
    with open(sys.argv[1]) as inputfile:
        urls = [URL(x) for x in get_strings(inputfile)]

    # Count how often each original and each normalized form occurs.
    normalized = {}
    original = {}
    for url in urls:
        source = url.getURL()
        original[source] = original.get(source, 0) + 1
        canonical = url.getNormalized()
        normalized[canonical] = normalized.get(canonical, 0) + 1

    for url in urls:
        source = url.getURL()
        print("Source: " + source)
        print("Valid: True" if url.isValid() else "Valid: False")
        canonical = url.getNormalized()
        print("Canonical: " + canonical)
        if original[source] == 1:
            print("Source unique: True")
        else:
            print("Source unique: False")
        if normalized[canonical] == 1:
            print("Canonicalized unique: True")
        else:
            print("Canonicalized unique: False")
Beispiel #32
0
    def user(self, user_slug=None, profile_url=None, **kwargs):
        """
        Fetch a user's profile information.

        :param user_slug: the user's personal domain (slug)
        :param profile_url: URL of the user's profile page

        :return: dict; None when the request is not OK (the status is logged)
        :raises ZhihuError: if neither user_slug nor profile_url is given

        >>> user(profile_url = "https://www.zhihu.com/people/xiaoxiaodouzi")
        >>> user(user_slug = "xiaoxiaodouzi")

        """

        if not (profile_url or user_slug):
            raise ZhihuError("至少指定一个关键字参数")

        if user_slug is None:
            user_slug = self._user_slug(profile_url)

        response = self._session.get(URL.profile(user_slug), **kwargs)
        if not response.ok:
            self.logger.error(u"获取用户信息失败, status code: %s" %
                              response.status_code)
            return None
        return response.json()
Beispiel #33
0
 def test_defaults(self):
     """A bare host built with useDefaults=True should expose the full default URL."""
     bare_host = 'front1.example.co.uk'
     self.url = URL(bare_host, useDefaults=True)
     expected_url = 'http://front1.example.co.uk:80/'
     self.assertEqual(self.url.url, expected_url)
Beispiel #34
0
class CompatRequest(object):
    """Request object exposing the interface urllib / cookielib expect.

    See also: http://docs.python.org/library/cookielib.html
    """

    def __init__(self, url, method='GET', headers=None, payload=None):
        """Remember the target URL, its original host, and the request data."""
        self.set_url(url)
        self.original_host = self.url_split.host
        self.method = method
        self.headers = headers
        self.payload = payload

    def set_url(self, url):
        """Store *url* as text (``url``) and as a URL object (``url_split``)."""
        if not isinstance(url, URL):
            self.url = url
            self.url_split = URL(self.url)
            return
        self.url = str(url)
        self.url_split = url

    def get_full_url(self):
        """Return the URL text."""
        return self.url

    def get_host(self):
        """Return the host of the current URL."""
        return self.url_split.host

    def get_type(self):
        """Return the scheme of the current URL."""
        return self.url_split.scheme

    def get_origin_req_host(self):
        """Return the host recorded when the request was created."""
        return self.original_host

    def is_unverifiable(self):
        """Always False. See http://tools.ietf.org/html/rfc2965.html -- not fully implemented!"""
        return False

    def get_header(self, header_name, default=None):
        """Return the named header, or *default* when it is absent."""
        return self.headers.get(header_name, default)

    def has_header(self, header_name):
        """Tell whether the named header is present."""
        return header_name in self.headers

    def header_items(self):
        """Return the header (name, value) items."""
        return self.headers.items()

    def add_unredirected_header(self, key, val):
        """Add a header through the header container's ``add`` method."""
        self.headers.add(key, val)

    def _drop_payload(self):
        """Turn the request into a body-less GET and discard the content headers."""
        self.method, self.payload = 'GET', None
        for content_header in ('content-length', 'content-type',
                               'content-encoding'):
            self.headers.discard(content_header)

    def _drop_cookies(self):
        """Discard the cookie headers."""
        for cookie_header in ('cookie', 'cookie2'):
            self.headers.discard(cookie_header)

    def redirect(self, code, location):
        """Modify the request in place to point to the new location.

        The payload is dropped for codes 302 and 303; cookies are always dropped.
        """
        new_url = self.url_split.redirect(location)
        self.set_url(new_url)
        if code in (302, 303):
            self._drop_payload()
        self._drop_cookies()
Beispiel #35
0
 def test_file_ext_optional(self):
     """With fileExtensionOptional=True the last segment counts as a file when moving up."""
     self.url = URL('example.com/path/to/index', fileExtensionOptional=True)
     path_before = self.url.path
     self.assertEqual(path_before, '/path/to/index')
     self.url.move_up_level()
     path_after = self.url.path
     self.assertEqual(path_after, '/path/')
Beispiel #36
0
from url import URL

def get_strings(f):
  """Return the non-empty lines of the text file object *f* without their newline.

  The previous version always cut off the last character. That truncated a
  final line lacking a trailing newline and skipped it entirely when it was
  a single character long. Now only a newline that is actually present is
  removed.

  :param f: an iterable text file object (anything yielding str lines)
  :return: list of lines in file order, empty lines left out
  """
  strings = []
  for line in f:
    if line.endswith('\n'):
      line = line[:-1]
    # do not add empty line
    if line:
      strings.append(line)
  return strings

if __name__ == "__main__":
  # Script entry point: read URLs from the input file, analyse them and write
  # one result per line to the output file.
  # Local import: the visible part of this file imports only URL.
  import sys

  # Exactly two arguments are required (the old test was ``not in (3,3)``).
  if len(sys.argv) != 3:
    print('Usage: python main.py input-file output-file')
    sys.exit(1)

  # with-blocks close both files even when reading or analysing fails.
  with open(sys.argv[1]) as inputfile:
    urls = [URL(x) for x in get_strings(inputfile)]

  with open(sys.argv[2], 'w+') as outputfile:
    # NOTE(review): ``algorithms`` is not imported in the visible code --
    # confirm it is brought into scope elsewhere in the file.
    results = algorithms.analyzeURLs(urls)
    for item in results:
      outputfile.write('%s\n' % item)
Beispiel #37
0
class TestUrlMethods(unittest.TestCase):
    """Tests for the path, query and domain helpers of URL."""

    def setUp(self):
        """Give every test a fresh URL with a sub-domain, a file path and two query parameters."""
        raw = 'sub.example.co.uk/path/to/file.ext?query=parameter&foo=bar'
        self.url = URL(raw)

    def test_set_path(self):
        """Assigning a relative path should read back with a leading slash."""
        self.url.path = 'path/to/file.ext'
        stored_path = self.url.path
        self.assertEqual(stored_path, '/path/to/file.ext')

    def test_set_basename(self):
        """Assigning the basename should replace only the file part of the path."""
        self.url.basename = 'newfile.ext'
        stored_path = self.url.path
        self.assertEqual(stored_path, '/path/to/newfile.ext')

    def test_get_query(self):
        """get_query() without arguments should return the whole query string."""
        whole_query = self.url.get_query()
        self.assertEqual(whole_query, 'query=parameter&foo=bar')

    def test_get_single_query(self):
        """get_query(name) should return the value of that parameter."""
        value = self.url.get_query('foo')
        self.assertEqual(value, 'bar')

    def test_update_query(self):
        """A parameter set with update_query should be readable afterwards."""
        self.url.update_query('biz', 'bazz')
        value = self.url.get_query('biz')
        self.assertEqual(value, 'bazz')

    def test_overwrite_query(self):
        """Set 'biz' with update_query and read the value back."""
        self.url.update_query('biz', 'booz')
        value = self.url.get_query('biz')
        self.assertEqual(value, 'booz')

    def test_return_updated_query(self):
        """The ``query`` attribute should show the added parameter at the end."""
        self.url.update_query('biz', 'booz')
        expected_query = 'query=parameter&foo=bar&biz=booz'
        self.assertEqual(self.url.query, expected_query)

    def test_is_subdomain_of(self):
        """The fixture host should be a subdomain of example.co.uk."""
        is_sub = self.url.is_subdomain_of('example.co.uk')
        self.assertEqual(is_sub, True)

    def test_is_sub_subdomain_of(self):
        """A host two levels down should be a subdomain of its direct parent."""
        self.url = URL('http://dev.front1.example.co.uk')
        is_sub = self.url.is_subdomain_of('front1.example.co.uk')
        self.assertEqual(is_sub, True)

    def test_is_parent_domain_of(self):
        """The fixture host should be a parent of dev1.sub.example.co.uk."""
        is_parent = self.url.is_parent_domain_of('dev1.sub.example.co.uk')
        self.assertEqual(is_parent, True)

    def test_move_up_level(self):
        """Moving up once from the file should leave its directory."""
        self.url.move_up_level()
        stored_path = self.url.path
        self.assertEqual(stored_path, '/path/')

    def test_move_up_to_top_level(self):
        """Moving up three times should end at the root path."""
        for _ in range(3):
            self.url.move_up_level()
        self.assertEqual(self.url.path, '/')

    def test_validate(self):
        """The URL's own text should validate."""
        verdict = self.url.validate(self.url.url)
        self.assertEqual(verdict, True)

    def test_validate_fails(self):
        """'h://test' should not validate."""
        verdict = self.url.validate('h://test')
        self.assertEqual(verdict, False)
Beispiel #38
0
 def setUp(self):
     """Give every test a fresh URL with a sub-domain, a file path and two query parameters."""
     raw = 'sub.example.co.uk/path/to/file.ext?query=parameter&foo=bar'
     self.url = URL(raw)
Beispiel #39
0
class Request:
    ## Builder for an API request whose response is fetched on first use.
    #
    # Attribute access either appends a method name to a copy of the URL or
    # returns a setter for a query parameter; both produce a new Request.
    # Calling the instance appends IDs to its own URL. The URL is only fetched
    # when the response is first indexed, iterated or measured with len().

    # A list of all non-top-level methods including network and site-specific ones
    # This list is needed because __getattr__ needs to differentiate between methods and parameters
    # Note: 'create' is omitted here because it duplicates functionality found in the Filter class
    _methods = ['add',
                'advanced',
                'answers',
                'associated',
                'badges',
                'comments',
                'de-authenticate',
                'delete',
                'edit',
                'elected',
                'faq',
                'favorites',
                'featured',
                'full',
                'inbox',
                'info',
                'invalidate',
                'linked',
                'mentioned',
                'merges',
                'moderator-only',
                'moderators',
                'name',
                'no-answers',
                'notifications',
                'privileges',
                'questions',
                'recipients',
                'related',
                'reputation',
                'reputation-history',
                'required',
                'revisions',
                'suggested-edits',
                'synonyms',
                'tags',
                'timeline',
                'top-answer-tags',
                'top-answerers',
                'top-answers',
                'top-askers',
                'top-question-tags',
                'top-questions',
                'unaccepted',
                'unanswered',
                'unread',
                'wikis',
                'write-permissions',]

    # The presence of any of these methods will force all parameters to be
    # passed as POST parameters instead of with GET.
    _post_methods = ['add',
                     'delete',
                     'edit',]

    ## Creates a request object.
    # @param url the domain name to initialize the URL to or a URL instance
    # @param method a method name to append to the URL
    # @param response_type an optional type to use for returning the response
    #
    # Note: a string is wrapped in URL(); any other value is stored as given,
    # so passing a method together with url=None fails on the add_method call.
    def __init__(self, url=None, method=None, response_type=Item):
        self._url = URL(url) if isinstance(url, basestring) else url
        if method is not None:
            self._url.add_method(method)
        self._response_type = response_type
        self._data = None

    ## Provides a way to specify IDs.
    # @param items either a single item or a list/tuple of items
    # @return this same instance (its URL is modified in place)
    def __call__(self, items):
        self._url.add_method(self._string_list(items), True)
        return self

    ## Appends the specified item to the appropriate part of the URL.
    # @param raw_item the item to be added
    # @return a new Request for a known method name, otherwise a function
    #         that accepts the parameter value and returns a new Request
    #
    # Note: any underscores in the item name are converted to dashes.
    # Note: names starting with an underscore raise AttributeError instead.
    def __getattr__(self, raw_item):
        # __getattr__ is consulted for every name that normal lookup misses,
        # including protocol probes such as __deepcopy__ or __getstate__ and
        # this class's own private attributes before __init__ has set them.
        # Answering those with a parameter setter breaks copy/pickle and makes
        # a missing _url recurse forever, so they are rejected outright.
        if raw_item.startswith('_'):
            raise AttributeError(raw_item)
        # access_token is a singular exception to this rule
        item = raw_item if raw_item == 'access_token' else raw_item.replace('_', '-')
        # No matter what, we're going to be modifying the URL, so make
        # a deep copy of it
        url = deepcopy(self._url)
        if item in self._methods:
            if item in self._post_methods:
                url.switch_to_post()
            # NOTE(review): the new Request falls back to the default
            # response_type rather than inheriting this one's - confirm intent.
            return Request(url, item)

        # This is a neat trick - we return a local function that will
        # finish setting the parameter in the URL once the user provides
        # the value for the specified parameter (which may be a list).
        def set_parameter(value):
            url.add_parameter(item, self._string_list(value))
            return Request(url)
        return set_parameter

    ## Retrieves the item or data at the specified index and returns it.
    # @param index the index to retrieve the item / data from
    # @return the item / data at the specified index
    #
    # This method serves a dual purpose - if supplied with an integer value
    # (or a slice) it will return the item(s) at that position. If however,
    # supplied with a string, it will return the appropriate value from the
    # response. For example, given the value 'total', it will return the
    # total number of items in the set.
    def __getitem__(self, index):
        data = self._fetch()
        if isinstance(index, (int, slice)):
            return data['items'][index]
        return data[index]

    ## Provides a means of iterating through the response.
    # @return an iterator for the response
    def __iter__(self):
        return iter(self._fetch()['items'])

    ## Returns the total number of items in the response.
    # @return the number of items in the response
    def __len__(self):
        return len(self._fetch()['items'])

    ## Returns an internal representation of the current instance.
    # @return the internal representation
    def __repr__(self):
        return "<Request '%s'>" % self._url

    ## Either fetches the data for the request or returns the data.
    # @return the data for the request
    def _fetch(self):
        if self._data is None:
            # Work on a local so that a failure while converting the items
            # does not leave a half-processed response cached in self._data.
            data = self._url.fetch()
            base_method = self._url.base_method()
            if base_method in METHOD_TO_TYPE_MAPPING:
                item_type = METHOD_TO_TYPE_MAPPING[base_method]
            else:
                item_type = data['type'] if 'type' in data else ''
            # Replace the 'items' entry with initialized response objects
            data['items'] = [self._response_type(i, item_type) for i in data['items']]
            self._data = data
        return self._data

    ## Converts the provided item or list of items into a string.
    # @param items the list of items to join
    # @return a string with the items joined together by semicolons
    def _string_list(self, items):
        # A string is iterable but must be treated as a single item.
        if isinstance(items, basestring):
            items = [items,]
        else:
            # Ensure that items is iterable - if not, put it in a list
            try:
                iter(items)
            except (KeyError, TypeError):
                items = [items,]
        # Item instances contribute their id(); everything else is stringified.
        return ';'.join([str(i.id() if isinstance(i, Item) else i) for i in items])
Beispiel #40
0
 def test_canonical_url(self):
   # A URL built from the canonical fixture must canonicalize to itself.
   canonicalized = URL(self.canonical).getCanonicalized()
   self.assertEqual(canonicalized, self.canonical)
Beispiel #41
0
class RecBySNS(object):
    """Interactive labelling of entities mentioned in Weibo statuses.

    Holds the database handle plus the NLPIR, Renren and URL helpers, and a
    list of regular expressions matched against part-of-speech tags.
    """

    def __init__(self):
        """Create the helper objects and the part-of-speech blacklist."""
        self.db = Database()
        self.nlpir = PyNLPIR(self)
        self.renren = Renren(self)
        self.url = URL(self)
        self.UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv: 17.0) Gecko/17.0 Firefox/17.0"
        # Patterns applied with re.search to the POS tag of a word.
        self.pos_blacklist_regexs = [
            "^emoticon$",
            "^title$",
            "^ude.",
            "^w.*",
            "^vshi",
            "^vyou",
            "^p.*",
            "^ule",
            "^m.*",
            "^cc",
            "^session$",
        ]

    def assign_recbysns_entity_sentiment(self):
        """Prompt on stdin for a type and sentiment for every entity found.

        Statuses are selected from ``weibo_status`` when their text contains
        a 《...》 title or an http(s) link. Each status is segmented into
        sessions; for every entity in a session the status text, session text
        and entity are printed, two integers are read from stdin, and one row
        is inserted into ``recbysns_entity`` and committed.

        Raises:
            ValueError: if a line read from stdin is not an integer.
        """
        statuses = self.db.select_table(
            "weibo_status",
            "text like '%%《%%》%%' or \
                                            text like '%%http://%%' or \
                                            text like '%%https://%%'",
            12696,
            5,
        )
        for status in statuses:
            sessions = self.nlpir.segment_weibo_status(status["text"])
            for session_index, session in enumerate(sessions):
                entities = []
                session_text = ""
                for segment in session:
                    # The text before the last "/" is the word; the rest is its tag.
                    session_text += segment.rsplit("/", 1)[0]
                    if self._is_entity_segment(segment):
                        entities.append(segment)
                # Number repeated entities within one session: 0, 1, 2, ...
                positions = {}
                for entity in entities:
                    position = positions.get(entity, -1) + 1
                    positions[entity] = position
                    print(status["text"])
                    print(session_text)
                    print(entity)
                    print("Type:")
                    entity_type = int(sys.stdin.readline())
                    print("Sentiment:")
                    sentiment = int(sys.stdin.readline())
                    self.db.query(
                        "INSERT INTO recbysns_entity( \
                                   entity, status_id, session, position, \
                                   type, score) \
                                   VALUES(%s, %s, %s, %s, %s, %s)",
                        (entity, status["id"], session_index, position, entity_type, sentiment),
                    )
                    self.db.commit()

    def _is_entity_segment(self, segment):
        """Return True when a tagged segment names a known title or a video URL.

        A segment that carries the expected tag but cannot be parsed, or a
        short URL with no stored record, is reported on stdout and rejected.
        """
        pos = self.nlpir.get_POS(segment)
        if pos == "title":
            match = re.match(u"《(.*?)》/title", segment)
            if match is None:
                # Tagged as a title but not in 《...》 form: skip it instead of
                # failing on None.group(), as the URL branch below does.
                print("###########%s###########" % segment)
                return False
            title = match.group(1)
            return bool(self.db.select_douban_movie_by_title(title)
                        or self.db.select_douban_book_by_title(title))
        if pos == "url":
            match = re.search(u"(http.*)/url", segment)
            if match is None:
                print("###########%s###########" % segment)
                return False
            record = self.db.select_recbysns_url_by_short_url(match.group(1))
            if record is None:
                print("***********%s***********" % segment)
                return False
            return bool(self.url.is_video_url(record["origin_url"]))
        return False

    def is_blacklist_word(self, word):
        """Return True if the word's POS tag matches any blacklist pattern."""
        pos = self.nlpir.get_POS(word)
        return any(re.search(pattern, pos) for pattern in self.pos_blacklist_regexs)
Beispiel #42
0
 def test_invalid(self):
   # A URL built from the invalid fixture must report itself as not valid.
   is_valid = URL(self.invalid).isValid()
   self.assertFalse(is_valid)
Beispiel #43
0
 def test_valid_not_canonical(self):
   # A URL built from the non-canonical fixture must still report as valid.
   is_valid = URL(self.notcanonical).isValid()
   self.assertTrue(is_valid)
def main():
  """Run ten ad-hoc checks against URL and print one verdict line per check.

  Checks one to seven exercise isValid(); eight to ten exercise the ==, <
  and != operators. Each line reads "Pass <ordinal> test" or
  "Failed <ordinal> test".
  """
  def report(ordinal, passed):
    # Single place that formats the verdict, so no check can print "Pass"
    # on its failure branch.
    if passed:
      print("Pass %s test" % ordinal)
    else:
      print("Failed %s test" % ordinal)

  url1 = URL("example.com")
  url2 = URL(" ")
  url3 = URL("http://example.com/")
  url4 = URL("http://www.example.com")
  url5 = URL("http://z.com/")
  url6 = URL("http://example.com alsdfkj")
  url7 = URL("http://example.com ()")

  # testing validity
  report("first", not url1.isValid())
  report("second", not url2.isValid())
  report("third", url3.isValid())
  report("fourth", not url4.isValid())
  report("fifth", url5.isValid())
  report("sixth", not url6.isValid())
  report("seventh", not url7.isValid())

  # testing comparison operators
  report("eighth", url3 == url3)
  report("ninth", url3 < url5)
  report("tenth", url1 != url3)
Beispiel #45
0
def testValidity(url, expected):
    """Print a PASS/FAIL line for whether URL(url).isValid() equals expected."""
    matches = URL(url).isValid() == expected
    print("+ Validity PASS" if matches else "- Validity FAIL")
Beispiel #46
0
    def send_unsubscription_confirmation(cls, email: str):
        """Email the unsubscription confirmation for the given address.

        When the address has no subscriptions, the "empty subscriptions"
        message is sent instead. Otherwise the message lists one unsubscribe
        link per subscription plus a link covering all of them.

        Returns:
            The first truthy result of ``cls.send_email``, or ``False`` after
            ten unsuccessful attempts.
        """

        def deliver(**message):
            # Up to ten attempts; stop at the first truthy send_email result.
            for _attempt in range(10):
                outcome = cls.send_email(receivers=[email], **message)
                if outcome:
                    return outcome
            return False

        subscriptions = DB.Subscription.get_subscriptions_by_email(email)

        # No subscription on record (or the email is unknown).
        if len(subscriptions) == 0:
            return deliver(
                title=UNSUBSCRIPTION_EMPTY_SUBS_TITLE.format(email=email),
                content=UNSUBSCRIPTION_EMPTY_SUBS_CONTENT.format(
                    user=email.split('@')[0],
                    email=email,
                    base_uri=FRONTEND_BASE_URI,
                ),
            )

        # Base confirmation URL; every link carries the email parameter.
        base_url = URL(f'https://{FRONTEND_BASE_URI}/visa/email/unsubscription')
        base_url.query_param.set('email', email)

        all_subs_url = base_url.copy()
        rows = []
        for sub in subscriptions:
            fields = (
                ('visa_type', sub['visa_type']),
                ('code', sub['embassy_code']),
                ('till', sub['till']),
            )
            # The per-subscription link sets each key once...
            single_url = base_url.copy()
            for key, value in fields:
                single_url.query_param.set(key, value)
            rows.append((sub['visa_type'], sub['embassy_code'], sub['till'],
                         sub['expired'], single_url))
            # ...while the "all" link accumulates the keys of every subscription.
            for key, value in fields:
                all_subs_url.query_param.append(key, value)

        item_template = '<li>{} Visa at {} {} on {}: click <a href="{}">this link</a> to unsubscribe.</li>'
        list_items = []
        for visa_type, embassy_code, till, expired, link in rows:
            visa_label = VISA_TYPE_DETAILS[visa_type]
            # English embassy name for the code, or the literal 'None' if unknown.
            embassy_name = next(
                (emb.name_en for emb in USEmbassy.get_embassy_lst()
                 if emb.code == embassy_code),
                'None',
            )
            state = 'expired' if expired else 'expiring'
            until = till.strftime('%Y/%m/%d') if till.year < 9999 else 'FOREVER'
            list_items.append(
                item_template.format(visa_label, embassy_name, state, until, link))
        unsubscription_str = '\n'.join(list_items)

        return deliver(
            title=UNSUBSCRIPTION_CONFIRMATION_TITLE,
            content=UNSUBSCRIPTION_CONFIRMATION_CONTENT.format(
                user=email.split('@')[0],
                email=email,
                unsubscription_str=unsubscription_str,
                unsubscribe_all_url=all_subs_url,
            ),
        )
Beispiel #47
0
 def __init__(self, url=None, method=None, response_type=Item):
     """Store the URL, optionally append a method, and reset cached data.

     A string `url` is wrapped in URL(); any other value is kept as given.
     """
     if isinstance(url, basestring):
         url = URL(url)
     self._url = url
     if method is not None:
         self._url.add_method(method)
     self._response_type = response_type
     # Nothing has been fetched yet.
     self._data = None
Beispiel #48
0
    for row in csvreader:
        legit_urls.append(row[0])

with open(more_legit_url_data, 'r', encoding='utf8') as more_legit_file:
    csvreader = csv.reader(more_legit_file)

    data_list = list(csvreader)
    for row in data_list[:50000]:
        legit_urls.append(row[1])

num_legit_urls = len(legit_urls)
num_phish_urls = len(
    phish_urls) if len(phish_urls) <= len(legit_urls) else len(legit_urls)

print(f'num legit: {num_legit_urls}')
print(f'num phish: {num_phish_urls}')

# iterate through urls, making url objects
print('setting up urls')
url_objs = [URL(u, 0).to_json() for u in phish_urls[:num_phish_urls]] + \
    [URL(u, 1).to_json() for u in legit_urls[:num_legit_urls]]

# bulk save them into mongodb databases
print('inserting urls')
new_result = urls.insert_many(url_objs)
print(f'Number of inserts: {len(new_result.inserted_ids)}')

# Disconnect from MongoDB
client.close()
Beispiel #49
0
def has_new_features_to_add(url_name, collection):
    """Compare the URL's 'All' feature names with those stored for a collection.

    Returns True when both name lists are equal after sorting.

    NOTE(review): the function name suggests True should mean "there are
    new features", yet equality yields True - confirm against the callers.
    """
    url_feature_names = URL(url_name).get_feature_names()
    stored_feature_names = get_features_names(collection)
    return sorted(url_feature_names['All']) == sorted(stored_feature_names)
def main():
    """Run ten ad-hoc checks against URL and print one verdict line per check.

    Checks one to seven exercise isValid(); eight to ten exercise the ==, <
    and != operators. Each line reads "Pass <ordinal> test" or
    "Failed <ordinal> test".
    """
    def report(ordinal, passed):
        # Single place that formats the verdict, so no check can print
        # "Pass" on its failure branch.
        if passed:
            print("Pass %s test" % ordinal)
        else:
            print("Failed %s test" % ordinal)

    url1 = URL("example.com")
    url2 = URL(" ")
    url3 = URL("http://example.com/")
    url4 = URL("http://www.example.com")
    url5 = URL("http://z.com/")
    url6 = URL("http://example.com alsdfkj")
    url7 = URL("http://example.com ()")

    # testing validity
    report("first", not url1.isValid())
    report("second", not url2.isValid())
    report("third", url3.isValid())
    report("fourth", not url4.isValid())
    report("fifth", url5.isValid())
    report("sixth", not url6.isValid())
    report("seventh", not url7.isValid())

    # testing comparison operators
    report("eighth", url3 == url3)
    report("ninth", url3 < url5)
    report("tenth", url1 != url3)
Beispiel #51
0
 def test_is_sub_subdomain_of(self):
     # Replace the fixture with a two-level sub-domain and check it against
     # its direct parent host.
     self.url = URL('http://dev.front1.example.co.uk')
     result = self.url.is_subdomain_of('front1.example.co.uk')
     self.assertEqual(result, True)