def tb_paths(self):
    """Explode each ticket URL into per-level path rows in self.dataframe.

    For every row of self.ticpaths.dataframe the ticket URL is parsed and
    up to 10 path segments are appended to self.dataframe, one row per
    segment, together with the client/ticket identifiers and a freshly
    computed client score.

    Returns:
        The (mutated) self.dataframe.
    """
    for row_idx in range(len(self.ticpaths.dataframe)):
        # Parse the ticket URL once per source row; the original re-parsed
        # it twice per inner iteration (condition + assignment).
        parsed = URL(self.ticpaths.dataframe[['tic_url']].iloc[row_idx].tic_url)
        for level in range(10):
            # Segments extracted as the string "NaN" mark absent path levels.
            if parsed.path[level] != "NaN":
                # `.at` with len(df) appends a new row; cache the index so
                # every later write targets the same row (the original used
                # len(df) then len(df) - 1 after the implicit append).
                new_row = len(self.dataframe)
                self.dataframe.at[new_row, 'alz_path_path'] = parsed.path[level]
                self.dataframe.at[new_row, 'alz_path_public_id'] = \
                    self.ticpaths.dataframe.at[row_idx, 'cli_public_id']
                self.dataframe.at[new_row, 'alz_path_level'] = level + 1
                self.dataframe.at[new_row, 'alz_path_recurrence'] = 1
                self.dataframe.at[new_row, 'alz_path_ticket_id'] = \
                    self.ticpaths.dataframe.at[row_idx, 'tic_public_id_ticket']
                self.dataframe.at[new_row, 'alz_client_score'] = Operations.clientscore(
                    self,
                    Table.tb_acumulative(
                        self,
                        Database.db_perform_search(
                            self, self.table_name, 'alz_path_recurrence',
                            ['alz_path_public_id', 'alz_path_path'],
                            (self.dataframe.loc[new_row][
                                ['alz_path_public_id', 'alz_path_path']]).tolist())),
                    Table.tb_acumulative(
                        self,
                        Database.db_perform_search(
                            self, self.table_name, 'alz_path_recurrence',
                            ['alz_path_path'],
                            (self.dataframe.loc[new_row][['alz_path_path']]).tolist())))
    return self.dataframe
def test_canonicalFormForValidURLs(self):
    """URLs already in canonical form must normalize to themselves."""
    alreadyCanonicalForm = [
        "http://example.com/",
        "http://example.com/?q=%C3%871",
        "http://example.com/?q=%E2%85%A0",
        "http://example.com/?q=%5C",
        "http://example.com/~jane",
        "http://example.com/a/b",
        "http://example.com:8080/",
        "http://*****:*****@example.com/",
        "ftp://ftp.is.co.za/rfc/rfc1808.txt",
        "http://www.ietf.org/rfc/rfc2396.txt",
    ]
    # One assertion per URL; the original spelled out ten identical
    # copy-pasted asserts indexed 0..9.
    for canonical in alreadyCanonicalForm:
        self.assertEqual(URL(canonical).normalized, canonical)
def set_url(self, url):
    """Store both the raw string and the parsed form of *url*."""
    # Accept either an already-parsed URL instance or a plain string.
    if isinstance(url, URL):
        self.url, self.url_split = str(url), url
    else:
        self.url, self.url_split = url, URL(url)
def _get_urls(self):
    """Build URL objects for every <Url> element that carries both a
    ``template`` and a ``type`` attribute.

    Returns:
        list[URL]: one entry per well-formed <Url> element; malformed
        elements are silently skipped.
    """
    urls = []
    for element in self._get_elements('Url'):
        template = element.getAttribute('template')
        # Renamed from `type`: don't shadow the builtin.
        url_type = element.getAttribute('type')
        # Both attributes are required; skip elements missing either.
        if template and url_type:
            url = URL()
            url.template = template
            url.type = url_type
            urls.append(url)
    return urls
class TestSettings(unittest.TestCase):
    """Behaviour of the URL constructor flags."""

    def test_defaults(self):
        # useDefaults fills in scheme, port, and the trailing slash.
        self.url = URL('front1.example.co.uk', useDefaults=True)
        expected = 'http://front1.example.co.uk:80/'
        self.assertEqual(self.url.url, expected)

    def test_file_ext_optional(self):
        # Extension-less paths are kept verbatim; moving up one level
        # drops the final path component.
        self.url = URL('example.com/path/to/index', fileExtensionOptional=True)
        self.assertEqual(self.url.path, '/path/to/index')
        self.url.move_up_level()
        self.assertEqual(self.url.path, '/path/')
def test_validator(self):
    """isValid() rejects scheme-less strings and accepts absolute URLs."""
    # NOTE(review): the '¬' character below looks like an HTML-entity
    # ('&not=') decoding artifact — confirm it is intentional.
    cases = [
        ('www.google.com', False),
        ('google.com', False),
        ('https://www.google.com/googleplus/whousesthis', True),
        ('https://www.google.com/googleplus/whousesthis?true=false¬=really', True),
        ('https://www.google.com/', True),
    ]
    for raw, expected in cases:
        if expected:
            self.assertTrue(URL(raw).isValid())
        else:
            self.assertFalse(URL(raw).isValid())
def fetch_url(protocol, hostname, port, useragent, timeout, retries, storage=None):
    """Fetch one URL and append its result to *storage*.

    Args:
        protocol, hostname, port, useragent, timeout, retries:
            forwarded to the URL constructor (port is stringified).
        storage: accumulator list for results; a fresh list is created when
            omitted. The original used a mutable default argument (`[]`),
            so results silently leaked across calls relying on the default.

    Returns:
        None; the fetched result is appended to *storage*.
    """
    if storage is None:
        storage = []
    url_obj = URL(protocol, hostname, str(port), useragent, timeout, retries)  # define url object
    url_obj.getdata()
    storage.append(url_obj.result)
def runTest(self):
    """Compare two URLs with the operator named by `comp` and assert the
    result matches `expected`."""
    url1 = URL(value1)
    url2 = URL(value2)
    # Dispatch table replaces the original if/elif chain; unknown
    # comparison tokens are (as before) silently ignored.
    checks = {
        '==': lambda: url1 == url2,
        '>': lambda: url1 > url2,
        '<': lambda: url1 < url2,
    }
    if comp in checks:
        assert checks[comp]() == expected, (expected, comp, value1, value2)
def test_canonicalizer(self):
    """getNormalized() adds a scheme/trailing slash and keeps queries."""
    # NOTE(review): '¬' below is likely an '&not=' entity-decoding
    # artifact — confirm before changing it.
    cases = [
        ('www.google.com', 'http://www.google.com/'),
        ('google.com', 'http://google.com/'),
        ('https://www.google.com/googleplus/whousesthis',
         'https://www.google.com/googleplus/whousesthis'),
        ('https://www.google.com/googleplus/whousesthis?true=false¬=really',
         'https://www.google.com/googleplus/whousesthis?true=false¬=really'),
    ]
    for raw, expected in cases:
        self.assertEqual(URL(raw).getNormalized(), expected)
def login(self, email, password, **kwargs):
    """Log in to the site.

    The captcha required for login is saved into the current directory;
    the user must recognise it manually and type it in when prompted.

    :param email: account email address.
    :param password: account password.
    :param kwargs: extra options forwarded to the session and helper calls.
    :return: True on successful login, False otherwise.
    """
    request_body = {
        'email': email,
        'password': password,
        '_xsrf': self._get_xsrf(**kwargs),
        "captcha": self._get_captcha(**kwargs),
        'remember_me': 'true'
    }
    response = self._session.post(URL.login(), data=request_body, **kwargs)
    if response.ok:
        data = response.json()
        if data.get("r") == 0:  # login succeeded
            # Persist cookies so subsequent runs stay logged in.
            self._session.cookies.save()
            self.logger.info("登录成功")
            return True
        else:
            self.logger.info("登录失败, %s" % data.get("msg"))
    else:
        self.logger.error(response.content)
    return False
def send_message(self, content, user_id=None, profile_url=None, user_slug=None, **kwargs):
    """Send a private message to the given user.

    At least one of user_id / profile_url / user_slug must identify the
    recipient; an explicit user_id takes precedence.

    :param content: message body.
    :param user_id: the user's id hash.
    :param profile_url: the user's profile page URL.
    :param user_slug: the user's custom domain name.
    :return: the JSON response dict from the message endpoint.

    >>> send_message(profile_url = "https://www.zhihu.com/people/xiaoxiaodouzi")
    >>> send_message(user_slug = "xiaoxiaodouzi")
    >>> send_message(user_id = "1da75b85900e00adb072e91c56fd9149")
    """
    if not any([user_id, profile_url, user_slug]):
        raise ZhihuError("至少指定一个关键字参数")
    if user_id is None:
        # Resolve the slug from the profile URL, then the id from the slug.
        user_slug = self._user_slug(
            profile_url) if user_slug is None else user_slug
        user_id = self._user_id(user_slug)
    data = {"type": "common", "content": content, "receiver_hash": user_id}
    response = self._session.post(URL.message(), json=data, **kwargs)
    data = response.json()
    if data.get("error"):
        self.logger.info("私信发送失败, %s" % data.get("error").get("message"))
    else:
        self.logger.info("发送成功")
    return data
def user(self, user_slug=None, profile_url=None):
    """Fetch a user's profile information.

    :param user_slug: the user's custom domain name.
    :param profile_url: the user's profile page URL.
    :return: profile dict on success, None otherwise.

    >>> user(profile_url = "https://www.zhihu.com/people/xiaoxiaodouzi")
    >>> user(user_slug = "xiaoxiaodouzi")
    """
    if not any([profile_url, user_slug]):
        raise Exception("至少指定一个关键字参数")
    if not user_slug and profile_url:
        # Raw string: the original non-raw "\w" relied on a deprecated
        # (now illegal) string escape.
        pattern = re.compile(r"https?://www.zhihu.com/people/([\w-]+)")
        match = pattern.search(profile_url)
        if match:
            user_slug = match.group(1)
    response = self._session.get(URL.profile(user_slug))
    if response.ok:
        return response.json()
    else:
        self.logger.error(u"获取用户信息失败, status code: %s" % response.status_code)
def _get_captcha(self, **kwargs):
    """Download the current captcha image to ./captcha.jpg and ask the
    operator to transcribe it on stdin."""
    # Millisecond timestamp works as a cache-buster for the captcha endpoint.
    timestamp = str(int(time.time() * 1000))
    response = self._session.get(URL.captcha(timestamp), **kwargs)
    with open('captcha.jpg', 'wb') as image_file:
        image_file.write(response.content)
    return input("验证码:")
def test_eq(self):
    """__eq__ ignores a trailing slash but not host, path, or scheme."""
    cases = [
        ('www.google.com', 'www.google.com/', True),
        ('google.com', 'www.google.com/', False),
        ('www.google.com/hello', 'www.google.com/', False),
        ('https://www.google.com/', 'http://www.google.com/', False),
        ('google.com', 'google.com', True),
    ]
    for left, right, expected in cases:
        url1 = URL(left)
        url2 = URL(right)
        if expected:
            self.assertTrue(url2.__eq__(url1))
        else:
            self.assertFalse(url2.__eq__(url1))
def process_header(self, ctx: Context):
    """Attach the per-domain config to *ctx* and rewrite its headers."""
    cfg = self.cfg_by_domain(URL(ctx.header.host).host)
    ctx.cfg = cfg
    # Propagate the client address when the domain config asks for it.
    if cfg.get('x-forward-for'):
        ctx.header.args['X-Forwarded-For'] = ctx.src_addr[0]
    # Strip any headers the config blacklists under the '-' key.
    for arg in cfg.get('-') or ():
        utils.del_key(ctx.header.args, arg)
def __process(self, task):
    """ Collect urls from the web page.

    Parses the last fetched response (self.__response) with BeautifulSoup,
    extracts every href that is not a javascript: link, drops hosts
    outside Worker.Worker_basedomain, standardizes the rest against the
    current task URL, and returns them.

    :param task: URL string of the page that was just fetched.
    :return: set of standardized URL strings to crawl next.
    """
    task_url = URL(task)
    soup = bs(self.__response.content, "lxml")
    # Get all links from current page. Remove duplicated links.
    url_set = set(
        item.get("href") for item in soup.find_all(lambda tag: tag.get(
            "href") and "javascript" not in tag.get("href")))
    target = set()
    # Construct new target links
    for item in url_set:
        u = URL(item)
        # Absolute links pointing outside the crawl's base domains are skipped.
        if u.netloc and u.netloc not in Worker.Worker_basedomain:
            continue
        else:
            # Relative links are resolved against the page that linked them.
            u.standardize(task_url.url)
            std_url = u.url
            target.add(std_url)
    return target
def config(self, **config):
    """Store the configuration, mirror every entry onto the instance,
    validate the required keys, then build URL objects and start the
    service.

    Raises:
        ValueError: when `path` or `urls` is missing/empty.
    """
    self._config = config
    # Expose each config entry directly as an instance attribute
    # (written into __dict__, bypassing any descriptors).
    for name, setting in config.items():
        self.__dict__[name] = setting
    if not self.path:
        raise ValueError("Path is Missing")
    if not self.urls:
        raise ValueError("Uri is Missing")
    self._urls = [URL(raw) for raw in self.urls]
    self.init_service()
def follow(self, user_slug=None, profile_url=None, **kwargs):
    """Follow a user.

    :param user_slug: the user's custom domain name.
    :param profile_url: the user's profile page URL.
    :return: dict like {"follower_count": int} on success, None otherwise.

    >>> follow(profile_url = "https://www.zhihu.com/people/xiaoxiaodouzi")
    >>> follow(user_slug = "xiaoxiaodouzi")
    """
    if not any([profile_url, user_slug]):
        raise ZhihuError("至少指定一个关键字参数")
    user_slug = self._user_slug(
        profile_url) if user_slug is None else user_slug
    # Consistency fix: use the instance logger like the sibling methods do
    # (the original called the root `logging` module directly here).
    self.logger.info(URL.follow(user_slug))
    response = self._session.post(URL.follow(user_slug), **kwargs)
    self.logger.info(response.text)
    if response.ok:
        return response.json()
    else:
        self.logger.error(u"关注失败, status code: %s" % response.status_code)
def setupURLs(self):
    """Build one URL per origin, all targeting the first destination."""
    self.urls = []
    origins = self.getOrigins()
    destination = self.destinations[0]
    for origin in origins:
        # Dates are re-read for every origin, matching the original order
        # of side effects.
        self.urls.append(URL({
            "origin": origin,
            "destination": destination,
            "date1": self.getStartDate(),
            "date2": self.getEndDate(),
        }))
def test_gt(self):
    """__gt__ orders URLs across host, path, and query differences."""
    pairs = [
        ('www.google.com', 'www.google.com/hello/world'),
        ('www.google.com', 'www.google.com/?hello=world'),
        ('www.google.com/a', 'www.google.com/b/oh'),
        ('www.foogle.com/what', 'www.google.com/'),
    ]
    # In every pair the second URL must compare greater than the first.
    for smaller, bigger in pairs:
        url1 = URL(smaller)
        url2 = URL(bigger)
        self.assertTrue(url2.__gt__(url1))
def construct_mongo_url():
    """ Construct URL for connecting to MongoDB."""
    url = URL('')
    # Local instances use the plain mongodb scheme with an explicit port;
    # any other host is treated as an SRV-style cluster with credentials.
    if MONGO_CONFIG.host in ['127.0.0.1', 'localhost']:
        url.scheme = 'mongodb'
        url.netloc = f'{MONGO_CONFIG.host}:{MONGO_CONFIG.port}'
    else:
        url.scheme = 'mongodb+srv'
        url.netloc = f'{MONGO_CONFIG.username}:{MONGO_CONFIG.password}@{MONGO_CONFIG.host}'
    url.path = MONGO_CONFIG.database
    # Standard retryable-writes / write-concern options.
    for param, value in (('retryWrites', 'true'), ('w', 'majority')):
        url.query_param.set(param, value)
    return str(url)
def url_to_X(url_name, features_names_file):
    """Build a one-row feature matrix X for *url_name*.

    Loads the ordered feature-name list from *features_names_file*,
    processes the URL (static + dynamic features), and emits one row
    aligned with that ordering; features the URL lacks stay 0.

    :param url_name: URL string to featurize.
    :param features_names_file: joblib dump of the feature-name list.
    :return: list containing a single feature row.
    """
    METHOD = 'Selenium'  # 'Selenium' or 'urllib2'
    UA = 'PhantomJS'  # 'firefox' or None
    # PhantomJS requires command: phantomjs --webdriver 28042
    f_names = joblib.load(features_names_file)
    n = len(f_names)
    X = []
    u = URL(url_name)
    u.process(user_agent=UA, method=METHOD, to_reload=True, collection=None)
    static_features_url = u.static_features
    dynamic_features_url = u.dynamic_features
    features_url = {}
    features_url.update(static_features_url)
    features_url.update(dynamic_features_url)
    x = [0] * n
    for i in xrange(n):
        try:
            x[i] = features_url[f_names[i]]
        except KeyError:
            # Missing feature: keep the default 0. The original bare
            # `except:` also swallowed genuine errors (e.g. KeyboardInterrupt).
            pass
    X.append(x)
    return X
def main():
    """Read URLs from the input file named on the command line and print,
    for each one: its source form, validity, canonical form, and whether
    the source and canonical forms are unique within the input."""
    # system prompt
    # NOTE(review): `is not` on small ints works only via CPython int
    # caching; left untouched in this documentation-only pass.
    if len(sys.argv) is not 2:
        print 'use: python main.py <input-file>\n'
        exit(1)
    # read input file
    f = open(sys.argv[1])
    lines = readfile(f)
    # create URL objects
    urls = [URL(x) for x in lines]
    # count occurrences of original and canonical URLS
    dic_source = {}
    dic_canonical = {}
    for url in urls:
        if url.getURL() in dic_source:
            dic_source[url.getURL()] = dic_source[url.getURL()] + 1
        else:
            dic_source[url.getURL()] = 1
        if url.getNormalized() in dic_canonical:
            dic_canonical[
                url.getNormalized()] = dic_canonical[url.getNormalized()] + 1
        else:
            dic_canonical[url.getNormalized()] = 1
    # print properties of each URL read
    for url in urls:
        print "Source: " + url.getURL()
        if url.isValid():
            print "Valid: True"
        else:
            print "Valid: False"
        print "Canonical: " + url.getNormalized()
        if dic_source[url.getURL()] == 1:
            print "Source unique: True"
        else:
            print "Source unique: False"
        if dic_canonical[url.getNormalized()] == 1:
            print "Canonicalized URL unique: True\n"
        else:
            print "Canonicalized URL unique: False\n"
def forward(self, ctx):
    """Connect to the configured upstream, replay the request header, then
    relay bytes in both directions until either side closes."""
    upstream = URL(ctx.cfg['upstream'])
    # Unix-socket upstreams connect by path; TCP upstreams by (host, port).
    if upstream.protocol == Protocol.unix:
        right_conn = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        right_conn.connect(upstream.host)
    else:
        right_conn = socket.socket()
        right_conn.connect((upstream.host, upstream.port))
    right_conn.sendall(ctx.header.encode())
    # One relay direction runs in a spawned task, the other inline.
    relay_task = spawn(self.relay, (ctx.left_conn, right_conn))
    self.relay(right_conn, ctx.left_conn)
    relay_task.join()
    ctx.left_conn.close()
    right_conn.close()
def __init__(self):
    """Wire up collaborators; each helper receives a back-reference."""
    self.db = Database()
    self.nlpir = PyNLPIR(self)
    self.renren = Renren(self)
    self.url = URL(self)
    # Desktop Firefox user-agent string used for outgoing HTTP requests.
    self.UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv: 17.0) Gecko/17.0 Firefox/17.0"
    # Part-of-speech tag regexes to ignore during word filtering
    # (see is_blacklist_word elsewhere in this class).
    self.pos_blacklist_regexs = [
        "^emoticon$",
        "^title$",
        "^ude.",
        "^w.*",
        "^vshi",
        "^vyou",
        "^p.*",
        "^ule",
        "^m.*",
        "^cc",
        "^session$",
    ]
def __make_request(self, task):
    """ Send request to target.

    Fetches *task* with the shared session and stores the response in
    self.__response. Network failures, off-domain redirects, and 4xx/5xx
    statuses are logged and converted into TaskAbort so the caller can
    drop the task.

    :return: None
    """
    self.__response = None
    try:
        self.__response = self.__session.get(task)
    except requests.exceptions.HTTPError as e:
        # If the remote server return a response with 4xx or 5xx code,
        # requests will raise a HTTPError
        self.__response = e.response
    except requests.exceptions.RequestException as e:
        msg = "{url} - {error}".format(url=task.encode("utf-8"), error=e)
        Worker.Worker_error_logger.warning(msg)
        Worker.Worker_debug_logger.warning(msg)
        raise TaskAbort()
    except Exception as e:
        # In case of unexpected errors.
        msg = "{url} - An unexpected error occurred: {err}".format(
            url=task.encode("utf-8"), err=e)
        Worker.Worker_sys_logger.exception(msg)
        Worker.Worker_debug_logger.exception(msg)
        raise TaskAbort()
    else:
        # If we are redirected to a domain that not in basedomain ,
        # ignore it.
        if Worker.Worker_basedomain and URL(
                self.__response.url
        ).netloc not in Worker.Worker_basedomain:
            raise TaskAbort()
        else:
            # Deal with http errors
            if 400 <= self.__response.status_code <= 599:
                msg = "{url} - {status}".format(
                    url=task.encode("utf-8"),
                    status=self.__response.status_code)
                Worker.Worker_error_logger.warning(msg)
                Worker.Worker_debug_logger.warning(msg)
                raise TaskAbort()
def send_message(self, content, user_id=None, profile_url=None, user_slug=None):
    """Send a private message to the given user.

    At least one of user_id / profile_url / user_slug must identify the
    recipient; an explicit user_id takes precedence.

    :param content: message body.
    :param user_id: the user's id hash.
    :param profile_url: the user's profile page URL.
    :param user_slug: the user's custom domain name.
    :return: the JSON response dict from the message endpoint.

    >>> send_message(profile_url = "https://www.zhihu.com/people/xiaoxiaodouzi")
    >>> send_message(user_slug = "xiaoxiaodouzi")
    >>> send_message(user_id = "1da75b85900e00adb072e91c56fd9149")
    """
    if not any([user_id, profile_url, user_slug]):
        raise Exception("至少指定一个关键字参数")
    if not user_id and user_slug:
        profile = self.user(user_slug)
        user_id = profile.get("id")
    elif not user_id and profile_url:
        # Raw string: the original non-raw "\w" relied on a deprecated
        # (now illegal) string escape.
        pattern = re.compile(r"https?://www.zhihu.com/people/([\w-]+)")
        match = pattern.search(profile_url)
        if match:
            user_slug = match.group(1)
            profile = self.user(user_slug)
            user_id = profile.get("id")
    data = {"type": "common", "content": content, "receiver_hash": user_id}
    response = self._session.post(URL.message(), json=data)
    data = response.json()
    if data.get("error"):
        self.logger.info("私信发送失败, %s" % data.get("error").get("message"))
    else:
        self.logger.info("发送成功")
    return data
def send_subscription_confirmation(cls, email: str, subs_lst: List[DB.EmailSubscription]):
    """ Send the email for confirmation of email subscription.

    Builds a frontend confirmation link carrying one (visa_type, code,
    till) triple per subscription as repeated query parameters, renders
    the HTML summary list, and retries sending up to 10 times.

    :param email: recipient address.
    :param subs_lst: subscriptions awaiting confirmation.
    :return: True once the email is sent, False after 10 failed attempts.
    """
    confirmation_url = URL(
        f'https://{FRONTEND_BASE_URI}/visa/email/subscription')
    confirmation_url.query_param.set('email', email)
    for visa_type, code, till in subs_lst:
        confirmation_url.query_param.append('visa_type', visa_type.value)
        confirmation_url.query_param.append('code', code.value)
        confirmation_url.query_param.append('till', till)
    subscription_str = '<ul>\n{}\n</ul>'.format('\n'.join([
        '<li>{} Visa at {} till {}.</li>'.format(
            VISA_TYPE_DETAILS[vt],
            # Embassy code -> English name; falls back to 'None'.
            next((e.name_en for e in USEmbassy.get_embassy_lst()
                  if e.code == ec), 'None'),
            # datetime.max is the sentinel for a perpetual subscription.
            tl.strftime('%Y/%m/%d') if tl != datetime.max else 'FOREVER',
        ) for vt, ec, tl in subs_lst
    ]))
    content = SUBSCRIPTION_CONFIRMATION_CONTENT.format(
        user=email.split('@')[0],
        email=email,
        subscription_str=subscription_str,
        confirmation_url=confirmation_url,
    )
    for _ in range(10):  # for robust
        sent = cls.send_email(
            title=SUBSCRIPTION_CONFIRMATION_TITLE.format(email=email),
            content=content,
            receivers=[email])
        if sent:
            break
    else:
        sent = False
    return sent
def main():
    """Read URL strings from the input file named on the command line and
    print, for each: source form, validity, canonical form, and the
    uniqueness of both the source and canonical forms."""
    filename = None  # NOTE(review): assigned but never used
    # NOTE(review): `is not` on small ints works only via CPython int
    # caching; left untouched in this documentation-only pass.
    if len(sys.argv) is not 2:
        print 'Usage: python main.py input-file'
        exit(1)
    inputfile = open(sys.argv[1])
    strings = get_strings(inputfile)
    urls = [URL(x) for x in strings]
    normalized = {}
    original = {}
    # Count occurrences of each original and normalized URL.
    for url in urls:
        if url.getURL() in original:
            original[url.getURL()] = original[url.getURL()] + 1
        else:
            original[url.getURL()] = 1
        if url.getNormalized() in normalized:
            normalized[
                url.getNormalized()] = normalized[url.getNormalized()] + 1
        else:
            normalized[url.getNormalized()] = 1
    # Report the per-URL properties.
    for url in urls:
        print "Source: " + url.getURL()
        if url.isValid():
            print "Valid: True"
        else:
            print "Valid: False"
        print "Canonical: " + url.getNormalized()
        if original[url.getURL()] == 1:
            print "Source unique: True"
        else:
            print "Source unique: False"
        if normalized[url.getNormalized()] == 1:
            print "Canonicalized unique: True"
        else:
            print "Canonicalized unique: False"
def user(self, user_slug=None, profile_url=None, **kwargs):
    """Fetch a user's profile information.

    :param user_slug: the user's custom domain name.
    :param profile_url: the user's profile page URL.
    :return: profile dict on success, None otherwise.

    >>> user(profile_url = "https://www.zhihu.com/people/xiaoxiaodouzi")
    >>> user(user_slug = "xiaoxiaodouzi")
    """
    if not any([profile_url, user_slug]):
        raise ZhihuError("至少指定一个关键字参数")
    # Derive the slug from the profile URL when it wasn't given directly.
    if user_slug is None:
        user_slug = self._user_slug(profile_url)
    response = self._session.get(URL.profile(user_slug), **kwargs)
    if not response.ok:
        self.logger.error(u"获取用户信息失败, status code: %s" % response.status_code)
        return None
    return response.json()
def test_defaults(self):
    """A bare host with useDefaults expands to a full http URL on port 80."""
    self.url = URL('front1.example.co.uk', useDefaults=True)
    expected = 'http://front1.example.co.uk:80/'
    self.assertEqual(self.url.url, expected)
class CompatRequest(object):
    """ urllib / cookielib compatible request class.
    See also: http://docs.python.org/library/cookielib.html
    """

    def __init__(self, url, method='GET', headers=None, payload=None):
        # set_url populates both self.url (string) and self.url_split (URL).
        self.set_url(url)
        # Remember the starting host for get_origin_req_host().
        self.original_host = self.url_split.host
        self.method = method
        self.headers = headers
        self.payload = payload

    def set_url(self, url):
        # Accept either an already-parsed URL instance or a plain string.
        if isinstance(url, URL):
            self.url = str(url)
            self.url_split = url
        else:
            self.url = url
            self.url_split = URL(self.url)

    def get_full_url(self):
        return self.url

    def get_host(self):
        return self.url_split.host

    def get_type(self):
        return self.url_split.scheme

    def get_origin_req_host(self):
        return self.original_host

    def is_unverifiable(self):
        """ See http://tools.ietf.org/html/rfc2965.html. Not fully implemented!
        """
        return False

    def get_header(self, header_name, default=None):
        return self.headers.get(header_name, default)

    def has_header(self, header_name):
        return header_name in self.headers

    def header_items(self):
        return self.headers.items()

    def add_unredirected_header(self, key, val):
        self.headers.add(key, val)

    def _drop_payload(self):
        # After a redirect that downgrades to GET, the body and the
        # headers describing it must go.
        self.method = 'GET'
        self.payload = None
        for item in ('content-length', 'content-type', 'content-encoding'):
            self.headers.discard(item)

    def _drop_cookies(self):
        for item in ('cookie', 'cookie2'):
            self.headers.discard(item)

    def redirect(self, code, location):
        """ Modify the request inplace to point to the new location """
        self.set_url(self.url_split.redirect(location))
        # 302/303 responses switch the method to GET and drop credentials.
        if code in (302, 303):
            self._drop_payload()
            self._drop_cookies()
def test_file_ext_optional(self):
    """Extension-less paths survive parsing and level navigation."""
    self.url = URL('example.com/path/to/index', fileExtensionOptional=True)
    expected_before, expected_after = '/path/to/index', '/path/'
    self.assertEqual(self.url.path, expected_before)
    self.url.move_up_level()
    self.assertEqual(self.url.path, expected_after)
from url import URL def get_strings(f): strings = [] line = f.readline() while len(line) > 0: # do not add empty line if len(line) > 1: strings.append(line[:len(line) - 1]) line = f.readline() return strings if __name__ == "__main__": filename = None if len(sys.argv) not in (3,3): print 'Usage: python main.py input-file output-file' exit(1) inputfile = open(sys.argv[1]) strings = get_strings(inputfile) urls = [URL(x) for x in strings] outputfile = open(sys.argv[2], 'w+') results = algorithms.analyzeURLs(urls) for item in results: outputfile.write('%s\n' % item) inputfile.close() outputfile.close()
class TestUrlMethods(unittest.TestCase):
    """Exercises path/query manipulation and domain-relationship helpers."""

    def setUp(self):
        # Shared fixture: subdomain host, three-segment path, two query params.
        self.url = URL('sub.example.co.uk/path/to/file.ext?query=parameter&foo=bar')

    def test_set_path(self):
        # Assigning a relative path normalises it with a leading slash.
        self.url.path = 'path/to/file.ext'
        self.assertEqual(self.url.path, '/path/to/file.ext')

    def test_set_basename(self):
        # Replacing the basename keeps the directory part intact.
        self.url.basename = 'newfile.ext'
        self.assertEqual(self.url.path, '/path/to/newfile.ext')

    def test_get_query(self):
        self.assertEqual(self.url.get_query(), 'query=parameter&foo=bar')

    def test_get_single_query(self):
        self.assertEqual(self.url.get_query('foo'), 'bar')

    def test_update_query(self):
        self.url.update_query('biz', 'bazz')
        self.assertEqual(self.url.get_query('biz'), 'bazz')

    def test_overwrite_query(self):
        # NOTE(review): 'biz' is not pre-set by setUp, so this behaves the
        # same as test_update_query — confirm the intended fixture.
        self.url.update_query('biz', 'booz')
        self.assertEqual(self.url.get_query('biz'), 'booz')

    def test_return_updated_query(self):
        # New parameters are appended at the end of the query string.
        self.url.update_query('biz', 'booz')
        self.assertEqual(self.url.query, 'query=parameter&foo=bar&biz=booz')

    def test_is_subdomain_of(self):
        self.assertEqual(self.url.is_subdomain_of('example.co.uk'), True)

    def test_is_sub_subdomain_of(self):
        self.url = URL('http://dev.front1.example.co.uk')
        self.assertEqual(self.url.is_subdomain_of('front1.example.co.uk'), True)

    def test_is_parent_domain_of(self):
        self.assertEqual(self.url.is_parent_domain_of('dev1.sub.example.co.uk'), True)

    def test_move_up_level(self):
        self.url.move_up_level()
        self.assertEqual(self.url.path, '/path/')

    def test_move_up_to_top_level(self):
        # Calls past the root must leave the path at '/'.
        self.url.move_up_level()
        self.url.move_up_level()
        self.url.move_up_level()
        self.assertEqual(self.url.path, '/')

    def test_validate(self):
        self.assertEqual(self.url.validate(self.url.url), True)

    def test_validate_fails(self):
        self.assertEqual(self.url.validate('h://test'), False)
def setUp(self):
    # Fresh fixture for every test: subdomain host, multi-segment path,
    # and two query parameters.
    self.url = URL('sub.example.co.uk/path/to/file.ext?query=parameter&foo=bar')
class Request:
    """Fluent builder for API requests.

    Methods and IDs are accumulated onto an internal URL object; the
    response is fetched lazily on first access and its items wrapped in
    `response_type`.
    """

    # A list of all non-top-level methods including network and site-specific ones
    # This list is needed because __getattr__ needs to differentiate between methods and parameters
    # Note: 'create' is omitted here because it duplicates functionality found in the Filter class
    _methods = ['add', 'advanced', 'answers', 'associated', 'badges',
                'comments', 'de-authenticate', 'delete', 'edit', 'elected',
                'faq', 'favorites', 'featured', 'full', 'inbox', 'info',
                'invalidate', 'linked', 'mentioned', 'merges',
                'moderator-only', 'moderators', 'name', 'no-answers',
                'notifications', 'privileges', 'questions', 'recipients',
                'related', 'reputation', 'reputation-history', 'required',
                'revisions', 'suggested-edits', 'synonyms', 'tags',
                'timeline', 'top-answer-tags', 'top-answerers',
                'top-answers', 'top-askers', 'top-question-tags',
                'top-questions', 'unaccepted', 'unanswered', 'unread',
                'wikis', 'write-permissions',]

    # The presence of any of these methods will force all parameters to be
    # passed as POST parameters instead of with GET.
    _post_methods = ['add', 'delete', 'edit',]

    ## Creates a request object.
    # @param url the domain name to initialize the URL to or a URL instance
    # @param method a method name to append to the URL
    # @param response_type an optional type to use for returning the response
    def __init__(self, url=None, method=None, response_type=Item):
        self._url = URL(url) if isinstance(url, basestring) else url
        if not method is None:
            self._url.add_method(method)
        self._response_type = response_type
        self._data = None

    ## Provides a way to specify IDs.
    # @param items either a single item or a list/tuple of items
    def __call__(self, items):
        self._url.add_method(self._string_list(items), True)
        return self

    ## Appends the specified item to the appropriate part of the URL.
    # @param raw_item the item to be added
    #
    # Note: any underscores in the item name are converted to dashes.
    def __getattr__(self, raw_item):
        # access_token is a singular exception to this rule
        item = raw_item if raw_item == 'access_token' else raw_item.replace('_', '-')
        # No matter what, we're going to be modifying the URL, so make
        # a deep copy of it
        url = deepcopy(self._url)
        if item in self._methods:
            if item in self._post_methods:
                url.switch_to_post()
            return Request(url, item)
        else:
            # This is a neat trick - we return a local function that will
            # finish setting the parameter in the URL once the user provides
            # the value for the specified parameter (which may be a list).
            def set_parameter(value):
                url.add_parameter(item, self._string_list(value))
                return Request(url)
            return set_parameter

    ## Retrieves the item or data at the specified index and returns it.
    # @param index the index to retrieve the item / data from
    # @return the item / data at the specified index
    #
    # This method serves a dual purpose - if supplied with an integer value it
    # will return the item at such an index. If however, supplied with a string,
    # it will return the appropriate value from the response. For example, given
    # the value 'total', it will return the total number of items in the set.
    def __getitem__(self, index):
        return self._fetch()['items'][index] if type(index) == int else self._fetch()[index]

    ## Provides a means of iterating through the response.
    # @return an iterator for the response
    def __iter__(self):
        return iter(self._fetch()['items'])

    ## Returns the total number of items in the response.
    # @return the number of items in the response
    def __len__(self):
        return len(self._fetch()['items'])

    ## Returns an internal representation of the current instance.
    # @return the internal representation
    def __repr__(self):
        return "<Request '%s'>" % self._url

    ## Either fetches the data for the request or returns the data.
    # @return the data for the request
    def _fetch(self):
        if self._data is None:
            # Fetch the data and replace the 'items' entry with initialized response objects
            self._data = self._url.fetch()
            if self._url.base_method() in METHOD_TO_TYPE_MAPPING:
                item_type = METHOD_TO_TYPE_MAPPING[self._url.base_method()]
            else:
                item_type = self._data['type'] if 'type' in self._data else ''
            self._data['items'] = [self._response_type(i, item_type) for i in self._data['items']]
        return self._data

    ## Converts the provided item or list of items into a string.
    # @param items the list of items to join
    # @return a string with the items joined together
    def _string_list(self, items):
        # Ensure that items is iterable - if not, put it in a list
        try:
            # Trigger the TypeError exception if this object is a string
            # so that it isn't treated like a list
            if isinstance(items, basestring):
                raise TypeError
            iter(items)
        except (KeyError, TypeError):
            items = [items,]
        return ';'.join([str(i.id() if issubclass(i.__class__, Item) else i) for i in items])
def test_canonical_url(self):
    """An already-canonical URL canonicalizes to itself."""
    parsed = URL(self.canonical)
    self.assertEqual(parsed.getCanonicalized(), self.canonical)
class RecBySNS(object):
    """Interactive labelling tool: walks weibo statuses that mention a
    title or URL, finds candidate entities, and records type/sentiment
    labels typed by the operator on stdin."""

    def __init__(self):
        # Wire up collaborators; each helper receives a back-reference.
        self.db = Database()
        self.nlpir = PyNLPIR(self)
        self.renren = Renren(self)
        self.url = URL(self)
        # Desktop Firefox user-agent string for outgoing HTTP requests.
        self.UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv: 17.0) Gecko/17.0 Firefox/17.0"
        # Part-of-speech tag regexes rejected by is_blacklist_word().
        self.pos_blacklist_regexs = [
            "^emoticon$",
            "^title$",
            "^ude.",
            "^w.*",
            "^vshi",
            "^vyou",
            "^p.*",
            "^ule",
            "^m.*",
            "^cc",
            "^session$",
        ]

    def assign_recbysns_entity_sentiment(self):
        """Iterate statuses containing a 《title》 or a link, collect entity
        segments (known movie/book titles, video URLs), then prompt the
        operator for a type and sentiment per entity and insert a
        recbysns_entity row for each answer."""
        for status in self.db.select_table(
            "weibo_status",
            "text like '%%《%%》%%' or \
            text like '%%http://%%' or \
            text like '%%https://%%'",
            12696,
            5,
        ):
            sessions = self.nlpir.segment_weibo_status(status["text"])
            i = 0
            while i < len(sessions):
                session = sessions[i]
                entities = []
                session_text = ""
                for segment in session:
                    # Segments look like "word/POS"; strip the POS suffix.
                    session_text += segment.rsplit("/", 1)[0]
                    if self.nlpir.get_POS(segment) == "title":
                        title = re.match(u"《(.*?)》/title", segment).group(1)
                        # Only titles known as douban movies/books count.
                        if self.db.select_douban_movie_by_title(title) or self.db.select_douban_book_by_title(title):
                            entities.append(segment)
                    elif self.nlpir.get_POS(segment) == "url":
                        match = re.search(u"(http.*)/url", segment)
                        if match is None:
                            print "###########%s###########" % segment
                            continue
                        url = match.group(1)
                        # Resolve the short URL via the database.
                        url = self.db.select_recbysns_url_by_short_url(url)
                        if url is None:
                            print "***********%s***********" % segment
                            continue
                        # Only video links count as entities.
                        if self.url.is_video_url(url["origin_url"]):
                            entities.append(segment)
                positions = {}
                for entity in entities:
                    # Track the occurrence index of repeated entities.
                    if entity in positions:
                        position = positions[entity] + 1
                        positions[entity] = position
                    else:
                        position = 0
                        positions[entity] = position
                    print status["text"]
                    print session_text
                    print entity
                    print "Type:"
                    # NOTE(review): `type` shadows the builtin; left as-is
                    # in this documentation-only pass.
                    type = int(sys.stdin.readline())
                    print "Sentiment:"
                    sentiment = int(sys.stdin.readline())
                    self.db.query(
                        "INSERT INTO recbysns_entity( \
                        entity, status_id, session, position, \
                        type, score) \
                        VALUES(%s, %s, %s, %s, %s, %s)",
                        (entity, status["id"], i, position, type, sentiment),
                    )
                    self.db.commit()
                i = i + 1

    def is_blacklist_word(self, word):
        """Return True when *word*'s POS tag matches any blacklist regex."""
        for pos_blacklist_regex in self.pos_blacklist_regexs:
            if re.search(pos_blacklist_regex, self.nlpir.get_POS(word)):
                return True
        return False
def test_invalid(self):
    """The known-invalid fixture must fail validation."""
    parsed = URL(self.invalid)
    self.assertFalse(parsed.isValid())
def test_valid_not_canonical(self):
    """A URL can be valid without being in canonical form."""
    parsed = URL(self.notcanonical)
    self.assertTrue(parsed.isValid())
def main(): url1 = URL("example.com") url2 = URL(" ") url3 = URL("http://example.com/") url4 = URL("http://www.example.com") url5 = URL("http://z.com/") url6 = URL("http://example.com alsdfkj") url7 = URL("http://example.com ()") url8 = URL("google.com") #testing validity if not url1.isValid(): print "Pass first test" else: print "Failed first test" if not url2.isValid(): print "Pass second test" else: print "Failed second test" if url3.isValid(): print "Pass third test" else: print "Failed third test" if not url4.isValid(): print "Pass fourth test" else: print "Failed fourth test" if url5.isValid(): print "Pass fifth test" else: print "Failed fifth test" if not url6.isValid(): print "Pass sixth test" else: print "Failed sixth test" if not url7.isValid(): print "Pass seventh test" else: print "Failed seventh test" if url3 == url3: print "Pass eighth test" else: print "Failed eighth test" if url3 < url5: print "Pass ninth test" else: print "Failed ninth test" if url1 != url3: print "Pass tenth test" else: print "Pass tenth test"
def testValidity(url, expected): if URL(url).isValid() == expected: print "+ Validity PASS" else: print "- Validity FAIL"
def send_unsubscription_confirmation(cls, email: str):
    """ Send the email for confirmation of email unsubscription.

    When the address has no subscriptions, a dedicated "nothing to
    unsubscribe" email is sent instead. Otherwise one unsubscribe link per
    subscription plus a single unsubscribe-all link are rendered into the
    confirmation email. Sending is retried up to 10 times.

    :param email: recipient address.
    :return: True once an email is sent, False after 10 failed attempts.
    """
    subs_lst_by_email = DB.Subscription.get_subscriptions_by_email(email)
    if len(subs_lst_by_email
           ) == 0:  # If the user has no subscription/email doesn't exist
        for _ in range(10):
            sent = cls.send_email(
                title=UNSUBSCRIPTION_EMPTY_SUBS_TITLE.format(email=email),
                content=UNSUBSCRIPTION_EMPTY_SUBS_CONTENT.format(
                    user=email.split('@')[0],
                    email=email,
                    base_uri=FRONTEND_BASE_URI),
                receivers=[email],
            )
            if sent:
                break
        else:
            sent = False
        return sent
    unsubs_url = URL(
        f'https://{FRONTEND_BASE_URI}/visa/email/unsubscription'
    )  # Unsubscription confirmation url
    unsubs_url.query_param.set('email', email)
    unsubs_all_url = unsubs_url.copy()
    unsubs_info = []
    for subs in subs_lst_by_email:
        # Per-subscription link: base url plus this row's identifiers.
        url = unsubs_url.copy()
        url.query_param.set('visa_type', subs['visa_type'])
        url.query_param.set('code', subs['embassy_code'])
        url.query_param.set('till', subs['till'])
        unsubs_info.append((subs['visa_type'], subs['embassy_code'],
                            subs['till'], subs['expired'], url))
        # The unsubscribe-all link accumulates every subscription.
        unsubs_all_url.query_param.append('visa_type', subs['visa_type'])
        unsubs_all_url.query_param.append('code', subs['embassy_code'])
        unsubs_all_url.query_param.append('till', subs['till'])
    unsubscription_str = '{}'.format('\n'.join([
        '<li>{} Visa at {} {} on {}: click <a href="{}">this link</a> to unsubscribe.</li>'
        .format(
            VISA_TYPE_DETAILS[vt],
            # Embassy code -> English name; falls back to 'None'.
            next((e.name_en for e in USEmbassy.get_embassy_lst()
                  if e.code == ec), 'None'),
            'expired' if exp else 'expiring',
            # Year 9999 is the sentinel for a perpetual subscription.
            tl.strftime('%Y/%m/%d') if tl.year < 9999 else 'FOREVER',
            url,
        ) for vt, ec, tl, exp, url in unsubs_info
    ]))
    content = UNSUBSCRIPTION_CONFIRMATION_CONTENT.format(
        user=email.split('@')[0],
        email=email,
        unsubscription_str=unsubscription_str,
        unsubscribe_all_url=unsubs_all_url,
    )
    for _ in range(10):
        sent = cls.send_email(title=UNSUBSCRIPTION_CONFIRMATION_TITLE,
                              content=content,
                              receivers=[email])
        if sent:
            break
    else:
        sent = False
    return sent
## Creates a request object.
# @param url the domain name to initialize the URL to or a URL instance
# @param method a method name to append to the URL
# @param response_type an optional type to use for returning the response
def __init__(self, url=None, method=None, response_type=Item):
    # Strings are parsed into URL objects; URL instances are used as-is.
    self._url = URL(url) if isinstance(url, basestring) else url
    # Idiomatic form of the original `not method is None`.
    if method is not None:
        self._url.add_method(method)
    self._response_type = response_type
    self._data = None
# NOTE(review): this fragment continues an earlier section — `csvreader`,
# `phish_urls`, `legit_urls`, `more_legit_url_data`, `urls` (a MongoDB
# collection) and `client` are defined above this chunk; confirm against
# the full script.
for row in csvreader:
    legit_urls.append(row[0])
with open(more_legit_url_data, 'r', encoding='utf8') as more_legit_file:
    csvreader = csv.reader(more_legit_file)
    data_list = list(csvreader)
    # Cap the extra legitimate URLs at 50k rows; column 1 holds the URL.
    for row in data_list[:50000]:
        legit_urls.append(row[1])
num_legit_urls = len(legit_urls)
# Balance the classes: use at most as many phishing URLs as legit ones.
num_phish_urls = len(
    phish_urls) if len(phish_urls) <= len(legit_urls) else len(legit_urls)
print(f'num legit: {num_legit_urls}')
print(f'num phish: {num_phish_urls}')
# iterate through urls, making url objects
# Label convention here: 0 = phishing, 1 = legitimate (second URL arg).
print('setting up urls')
url_objs = [URL(u, 0).to_json() for u in phish_urls[:num_phish_urls]] + \
    [URL(u, 1).to_json() for u in legit_urls[:num_legit_urls]]
# bulk save them into mongodb databases
print('inserting urls')
new_result = urls.insert_many(url_objs)
print(f'Number of inserts: {len(new_result.inserted_ids)}')
# Disconnect from MongoDB
client.close()
def has_new_features_to_add(url_name, collection):
    """Return True when the URL's feature names exactly match the names
    already stored for *collection*.

    NOTE(review): despite the name, this returns True on *equality* of
    the two name sets — confirm the intended polarity with the caller.
    """
    url_names = URL(url_name).get_feature_names()
    db_names = get_features_names(collection)
    return sorted(url_names['All']) == sorted(db_names)
def test_is_sub_subdomain_of(self):
    """A second-level subdomain counts as a subdomain of its parent."""
    self.url = URL('http://dev.front1.example.co.uk')
    parent = 'front1.example.co.uk'
    self.assertEqual(self.url.is_subdomain_of(parent), True)