def test_decode(self):
    """Percent-decoding must match the plain form and be idempotent."""
    cases = [
        ('http://%D0%BF%D1%8C%D0%B5%D1%[email protected]/'
         '%D0%B7%D0%B0%D0%BF%D0%B8%D1%81%D0%B8',
         'http://пьер@local.com/записи'),
        ('/%2525', '/%25'),
    ]
    for enc, dec in cases:
        decoded = URL(enc).decode()
        self.assertEqual(decoded._data, URL(dec)._data)
        self.assertEqual(decoded.as_string(), dec)
        # decoding a second time must not unescape any further
        self.assertEqual(decoded.decode().as_string(), dec)
def one_try(self, url, scheme='', host='', path='', query='', fragment='',
            userinfo='', port='', invalid=None, urlsplit=True):
    """Parse *url* and assert it decomposes into exactly the given parts.

    Also round-trips the parts through the positional constructor, runs
    validate() (expecting *invalid* to be raised when given), and — only
    when '-v' is on the command line — cross-checks against the
    urlsplit-based reference in self.split(), printing any divergence
    between this library and urllib.
    """
    orih_url = url
    url = URL(url)
    # _data field order: (scheme, userinfo, host, port, path, query, fragment)
    splitted = (scheme, userinfo, host, port, path, query, fragment)
    self.assertEqual(url._data, splitted)
    # rebuilding from the parts must reproduce the same parts
    self.assertEqual(URL(None, *splitted)._data, splitted)
    self.assertEqual(URL(None, *url._data)._data, splitted)
    if invalid:
        self.assertRaises(invalid, url.validate)
    else:
        url.validate()
    # optional stdlib cross-check (verbose test runs only)
    if urlsplit and '-v' in sys.argv:
        splitted = (scheme, url.authority, path, query, fragment)
        split_result = self.split(orih_url)
        if split_result != splitted:
            print('\n urllib issue:', orih_url, self.split(orih_url))
        elif (split_result.hostname or '') != host:
            print('\n urllib issue:', orih_url, 'host is:',
                  split_result.hostname, 'host should:', host)
def test_username_and_authorization(self):
    """userinfo splits on the FIRST ':' into username / authorization.

    Fix: the second case had been mangled into the syntactically invalid
    ``('user:'******'user', '')``; reconstructed as ``('user:', 'user', '')``
    (a trailing colon yields an empty authorization), consistent with the
    neighbouring cases.
    """
    cases = [
        ('user', 'user', ''),
        ('user:', 'user', ''),
        ('user:pass', 'user', 'pass'),
        ('user:pass:buzz', 'user', 'pass:buzz'),
        (':pass', '', 'pass'),
        (':pass:buzz', '', 'pass:buzz'),
        ('', '', ''),
        (':', '', ''),
        ('::', '', ':'),
    ]
    for userinfo, un, az in cases:
        self.assertEqual(URL(userinfo=userinfo).username, un)
        self.assertEqual(URL(userinfo=userinfo).authorization, az)
def test_pickling(self):
    """URL instances and URL subclasses must survive a pickle round-trip."""
    import pickle
    dump = pickle.dumps(URL('a://*****:*****@d:5/f?g#h'))
    self.assertEqual(pickle.loads(dump), URL('a://*****:*****@d:5/f?g#h'))
    # pickle locates classes by module-level name, so the test subclass
    # must be published as a module global before instances are pickled.
    global _test_picklingURL

    class _test_picklingURL(URL):
        def __new__(cls, path):
            return super(_test_picklingURL, cls).__new__(cls, path)

    url = _test_picklingURL('a://*****:*****@d:5/f?g#h')
    # both the value and the concrete subclass type must be preserved
    self.assertEqual(pickle.loads(pickle.dumps(url)), url)
    self.assertEqual(type(pickle.loads(pickle.dumps(url))), type(url))
def test_setdefault(self):
    """setdefault() fills only empty fields and never overwrites set ones."""
    empty = URL()
    full1 = URL('scheme://user@host:80/path?query#frgment')
    full2 = URL('an://oth@er:33/full?url#!!')
    # an empty URL accepts every default; a full one keeps all its parts
    self.assertEqual(empty.setdefault(*full1._data), full1)
    self.assertEqual(full1.setdefault(*full2._data), full1)
    for idx, (field, value) in enumerate(zip(full1._fields, full1._data)):
        defaulted = empty.setdefault(**{field: value})
        # defaulting an empty field is equivalent to replace()
        self.assertEqual(defaulted, empty.replace(**{field: value}))
        self.assertEqual(defaulted[idx], value)
        # a field that already has a value is left untouched
        self.assertEqual(full2.setdefault(**{field: value})[idx],
                         full2[idx])
def __parse_urls(self, row, url_params, job_name, job_id=None):
    """Append one record per expanded URL in *row* to url_params.

    Failures on a single URL are logged and skipped so the remaining
    URLs of the row are still processed.  Returns url_params.
    """
    for entry in row['urls']:
        try:
            parsed = URL(entry['expanded_url'])
            record = {
                'tweet_id': row['status_id'],
                'url': entry['expanded_url'],
                'job_id': job_id,
                'job_name': job_name,
                'schema': parsed.scheme,
                'netloc': parsed.authority,
                'path': parsed.path,
                'params': '',  # parsed.params,
                'query': parsed.query,
                'fragment': parsed.fragment,
                'username': parsed.username,
                'password': parsed.authorization,
                'hostname': parsed.host,
                'port': parsed.port,
            }
            url_params.append(record)
        except Exception as inst:
            logging.error(type(inst))  # the exception instance
            logging.error(inst.args)   # arguments stored in .args
            # __str__ allows args to be printed directly
            logging.error(inst)
    return url_params
def _parse_post(self):
    """Pull the common fields out of the raw API post payload."""
    post = self._post
    self.id = post['id']
    self.type = post['type']
    # post_url is absent on some post types; leave the URL unset then
    self.url = URL(post['post_url']) if 'post_url' in post else None
    self.tags = set(post.get('tags', []))
    self.note_count = post.get('note_count')
    self.post_date = post['date']
def _api_parse_response(self):
    """Parse an API response: blog metadata plus the page of posts."""
    blog = self._api_response.json()['response']['blog']
    self.title = blog['title']
    self.url = URL(blog['url'])
    self.name = blog['name']
    self.description = blog['description']
    self.is_nsfw = blog['is_nsfw']
    self.likes = blog.get(
        'likes', False)  # Returned only if sharing of likes is enabled
    self.post_count = blog['posts']
    self.updated = blog['updated']

    posts = self._api_response.json()['response']['posts']
    for post in posts:
        try:
            # Dispatch to the most specific container for the post type;
            # anything unrecognized falls through to the generic TumblrPost.
            if post['type'] in ['photo', 'link']:
                self._posts.append(TumblrPhotoSet(post, self))
                continue
            elif post['type'] == 'video':
                self._posts.append(TumblrVideoPost(post, self))
                continue
            self._posts.append(TumblrPost(post, self))
        except TumdlrParserError:
            # Skip posts that fail to parse rather than aborting the page.
            continue
def _parse_result(self):
    """
    Parse search result data.

    Raises:
        PoogleParserError: Raised if the result can not be parsed for any reason
    """
    self.title = self._soup.a.text
    self._log.info('Result title parsed: %s', self.title)

    # Make sure this is a valid result URL (and not a link to image
    # results, as an example).
    href = self._soup.a.get('href')
    if not href.startswith('/url?'):
        # NOTE(review): printf-style args are not interpolated by plain
        # exceptions — confirm PoogleParserError formats them itself.
        raise PoogleParserError('Unrecognized URL format: %s', href)

    match = self.url_regex.match(href)
    if not match or not match.group('url'):
        self._log.error(
            'Unable to parse search result URL: {h}'.format(h=href))
        raise PoogleParserError('Unable to parse search result URL: %s',
                                href)

    # The real destination is percent-encoded inside the redirect param.
    url = unquote(match.group('url'))
    self.url = URL(url)
    self._log.info('Result URL parsed: %s', self.url)
def __init__(self, url, session=None, **kwargs):
    """
    Tumblr blog

    Args:
        url(URL|str): Tumblr profile URL
        session(Optional[Session]): An optional custom Requests session

    Keyword Args:
        api_key(str): Tumblr API key
        uagent(str): Custom User-Agent header
    """
    self._url = url if isinstance(url, URL) else URL(url)
    # base API endpoint; the blog-specific posts path is appended below
    self._api_url = URL(scheme='https', host='api.tumblr.com', path='/v2/')
    self._api_response = None  # type: Response

    # presumably a public/demo API key fallback — TODO confirm
    self._api_key = kwargs.get(
        'api_key', 'fuiKNFp9vQFvjLNvx4sUwti4Yb5yGutBN4Xh10LXZhhRKjWlV4')
    # NOTE(review): docstring says 'uagent' but the kwarg read here is
    # 'user_agent' — confirm which name callers actually pass.
    self._uagent = kwargs.get('user_agent', 'tumdlr/{version}')

    if not session:
        session = Session()
        session.headers.update({
            'Referer': urllib.parse.quote(self._url.as_string()),
            'User-Agent': self._uagent.format(version=__version__)
        })
    self.session = session

    # blog metadata, populated later from the API response
    self.title = None  # type: str
    self.url = None  # type: URL
    self.name = None  # type: str
    self.description = None  # type: str
    self.is_nsfw = None  # type: bool
    self.likes = None  # type: int|False
    self.post_count = None  # type: int
    self.updated = None  # type: int

    self._posts = []
    self.offset = 0

    # narrow the endpoint to this blog's posts resource
    self._api_url = self._api_url.replace(
        path=self._api_url.path +
        'blog/{host}/posts'.format(host=self._url.host))

    self._api_get()
def get_url(self, data=None):
    """Build the URL for this service from its config merged with its
    environment-specific dependency settings (dependency values win)."""
    cfg = self.config
    service = copy(cfg.services[self.get_name()])
    # pop the dependency key from the copy so it is not passed to URL()
    dependency = cfg.dependencies[service.pop('dependency')][cfg.env]
    return URL(**{**service, **dependency})
def get_hostname_from_url(url):
    """Extract the hostname from *url*, stripping IPv6 brackets; fall
    back to the lowercased raw url when no host can be parsed out."""
    if _url_enabled:
        hostname = URL(url).host.strip("[]")
    else:
        hostname = urlparse(url).hostname
    return hostname if hostname else url.lower()
def InfluxDB(uri):
    """Create an InfluxDBClient from an 'influx://' URI.

    Raises WrongSchemeException for any other scheme; the port defaults
    to InfluxDB's standard 8086 when the URI omits it.
    """
    logging.debug('Parsing uri "{}"'.format(uri))
    parsed = URL(uri)
    if parsed.scheme != 'influx':
        raise WrongSchemeException(uri)
    return InfluxDBClient(parsed.host,
                          parsed.port or 8086,
                          parsed.username,
                          parsed.authorization,
                          parsed.path.lstrip('/'))
def __init__(self, connection_string: str, profile_name: str = 'default'):
    """Parse *connection_string* and resolve AWS credentials for the
    given profile; raises InvalidAWSSession when none can be gathered."""
    self.creds = {}
    self.url = URL(connection_string)
    self.profile = profile_name
    self.qs = dict(parse_qsl(self.url.query))
    if not self._gather_aws_creds():
        raise InvalidAWSSession
    self._update_url()
def __init__(self, data, container):
    """
    Args:
        data(dict): API response data
        container(TumblrPost): Parent container
    """
    self.log = logging.getLogger('tumdlr.containers.file')
    self._data = data
    self.container = container
    # prefer the direct file url; fall back to the post's own url
    file_url = self._data.get('url', self._data.get('post_url'))
    self.url = URL(file_url)
def wrapped_fn(self, url, **kwargs): u = URL(url) # Fix params not appears in referer try: params = kwargs.pop('params') url = '{url}?{params}'.format(url=url, params=urlencode(params)) except KeyError: pass if u.scheme and u.host: # pylint: disable=protected-access self._base_url = str(u.replace(full_path='')) # pylint: disable=protected-access self._scheme = u.scheme # Update scheme if url.startswith(r'//'): # '//example.com' # pylint: disable=protected-access url = '{scheme}:{where}'.format(scheme=self._scheme, where=url) self._base_url = url # pylint: disable=protected-access elif url.startswith(r'?'): # '?page=rss' url = '/' + url # -> '/?page=rss' url = urljoin(self._base_url, url) # pylint: disable=protected-access else: # '/?page=rss' 'page=rss' url = urljoin(self._base_url, url) # pylint: disable=protected-access # pylint: disable=protected-access u = URL(self._last_url) # pylint: disable=protected-access self._headers.update({ # HTTP/2 Headers lowercase only 'origin': str(u.replace(full_path='')), 'referer': self._last_url }) self._last_url = url # pylint: disable=protected-access return fn(self, url, **kwargs)
def test_stress_authority(self):
    """Round-trip parsing of deliberately ambiguous authority strings.

    The authority is the most ambiguous part of a url: an invalid host
    may contain ':' and '@' (whereas e.g. a path can not contain '?',
    and a query can not contain '#').  The host '//no:99:' parses as
    'no:99' and recomposes as '//no:99' — yet '//no:99:' and '//no:99'
    parse differently.  So we only require that every part present in
    the original survives one recomposition cycle.

    Case generation::

        from re import sub
        from itertools import permutations
        cases = set(sub('\\d+', '7', ''.join(case))
                    for case in set(permutations('::@@77777')))
    """
    cases = """7:7:7@7@ 7:7:7@7@7 7:7:7@@ 7:7:7@@7 7:7:@7@ 7:7:@7@7
        7:7:@@ 7:7:@@7 7:7@7:7@ 7:7@7:7@7 7:7@7:@ 7:7@7:@7 7:7@7@7:
        7:7@7@7:7 7:7@7@: 7:7@7@:7 7:7@:7@ 7:7@:7@7 7:7@:@ 7:7@:@7
        7:7@@7: 7:7@@7:7 7:7@@: 7:7@@:7 7::7@7@ 7::7@7@7 7::7@@
        7::7@@7 7::@7@ 7::@7@7 7::@@ 7::@@7 7:@7:7@ 7:@7:7@7 7:@7:@
        7:@7:@7 7:@7@7: 7:@7@7:7 7:@7@: 7:@7@:7 7:@:7@ 7:@:7@7 7:@:@
        7:@:@7 7:@@7: 7:@@7:7 7:@@: 7:@@:7 7@7:7:7@ 7@7:7:7@7 7@7:7:@
        7@7:7:@7 7@7:7@7: 7@7:7@7:7 7@7:7@: 7@7:7@:7 7@7::7@ 7@7::7@7
        7@7::@ 7@7::@7 7@7:@7: 7@7:@7:7 7@7:@: 7@7:@:7 7@7@7:7:
        7@7@7:7:7 7@7@7:: 7@7@7::7 7@7@:7: 7@7@:7:7 7@7@:: 7@7@::7
        7@:7:7@ 7@:7:7@7 7@:7:@ 7@:7:@7 7@:7@7: 7@:7@7:7 7@:7@:
        7@:7@:7 7@::7@ 7@::7@7 7@::@ 7@::@7 7@:@7: 7@:@7:7 7@:@:
        7@:@:7 7@@7:7: 7@@7:7:7 7@@7:: 7@@7::7 7@@:7: 7@@:7:7 7@@::
        7@@::7 :7:7@7@ :7:7@7@7 :7:7@@ :7:7@@7 :7:@7@ :7:@7@7 :7:@@
        :7:@@7 :7@7:7@ :7@7:7@7 :7@7:@ :7@7:@7 :7@7@7: :7@7@7:7 :7@7@:
        :7@7@:7 :7@:7@ :7@:7@7 :7@:@ :7@:@7 :7@@7: :7@@7:7 :7@@:
        :7@@:7 ::7@7@ ::7@7@7 ::7@@ ::7@@7 ::@7@ ::@7@7 ::@@7 :@7:7@
        :@7:7@7 :@7:@ :@7:@7 :@7@7: :@7@7:7 :@7@: :@7@:7 :@:7@ :@:7@7
        :@:@7 :@@7: :@@7:7 :@@:7 @7:7:7@ @7:7:7@7 @7:7:@ @7:7:@7
        @7:7@7: @7:7@7:7 @7:7@: @7:7@:7 @7::7@ @7::7@7 @7::@ @7::@7
        @7:@7: @7:@7:7 @7:@: @7:@:7 @7@7:7: @7@7:7:7 @7@7:: @7@7::7
        @7@:7: @7@:7:7 @7@:: @7@::7 @:7:7@ @:7:7@7 @:7:@ @:7:@7
        @:7@7: @:7@7:7 @:7@: @:7@:7 @::7@ @::7@7 @::@7 @:@7: @:@7:7
        @:@:7 @@7:7: @@7:7:7 @@7:: @@7::7 @@:7: @@:7:7 @@::7"""
    for case in cases.split():
        url = URL('//' + case)
        # every part defined in the original must still be defined after
        # one parse → recompose → parse cycle
        self.assertEqual(url, URL(url.as_string()))
        self.assertEqual(url, URL('//' + url.authority))
def test_test(self):
    """Exercise the relative/ip predicates and URL truthiness."""

    def check_relative(url, relative, relative_path):
        parsed = URL(url)
        self.assertEqual(parsed.is_relative(), relative)
        self.assertEqual(parsed.is_relative_path(), relative_path)

    check_relative('sc:', False, False)
    check_relative('sc:path/', False, False)
    check_relative('//host', True, False)
    check_relative('/path', True, False)
    check_relative('path/', True, True)
    check_relative('./path/', True, True)
    check_relative('?true', True, True)

    def check_ip(url, host_ip, host_ipv4):
        parsed = URL(url)
        self.assertEqual(parsed.is_host_ip(), host_ip)
        self.assertEqual(parsed.is_host_ipv4(), host_ipv4)

    check_ip('', False, False)
    check_ip('//google/', False, False)
    check_ip('//127.0.1', False, False)
    check_ip('//127.0.0.1', True, True)
    check_ip('//[127.0.0.1]', True, False)

    # only a completely empty URL is falsy
    self.assertTrue(URL('/url'))
    self.assertTrue(URL('url:'))
    self.assertTrue(URL('//url'))
    self.assertTrue(URL('?url'))
    self.assertTrue(URL('#url'))
    self.assertFalse(URL('//@:?#'))
def __parse_urls_twint(self, df, job_name, job_id):
    """Explode the urls of every tweet row in *df* into a flat DataFrame.

    Each url becomes one row carrying the tweet id, job metadata and the
    parsed url components.  Returns a correctly-typed empty DataFrame
    when no row carries any urls.

    Fix: the original indexed ``row["urls"][counter]`` with a counter
    that was only incremented AFTER the loop (dead code), so every url
    except the first of each row was silently dropped; we now iterate
    all urls.  Also gave the logging calls proper %s placeholders.
    """
    url_params_lst = []
    row = None
    try:
        for index, row in df.iterrows():
            if not row["urls"]:
                continue
            for full_url in row["urls"]:
                parsed = URL(full_url)
                url_params_lst.append(
                    pd.DataFrame([{
                        'tweet_id': int(row["status_id"]),
                        'full_url': full_url,
                        'job_id': job_id,
                        'job_name': job_name,
                        'schema': parsed.scheme,
                        'netloc': parsed.authority,
                        'path': parsed.path,
                        'params': '',  # parsed.params,
                        'query': parsed.query,
                        'fragment': parsed.fragment,
                        'username': parsed.username,
                        'password': parsed.authorization,
                        'hostname': parsed.host,
                        'port': parsed.port
                    }]))
    except Exception as e:
        logging.error('params.append exn %s', e)
        logging.error('row %s', row)
        raise e

    if len(url_params_lst) == 0:
        # Preserve the output schema (and dtypes) even with no data.
        return pd.DataFrame({
            'tweet_id': pd.Series([], dtype='int64'),
            'full_url': pd.Series([], dtype='object'),
            'job_id': pd.Series([], dtype='object'),
            'job_name': pd.Series([], dtype='object'),
            'schema': pd.Series([], dtype='object'),
            'netloc': pd.Series([], dtype='object'),
            'path': pd.Series([], dtype='object'),
            'params': pd.Series([], dtype='object'),
            'query': pd.Series([], dtype='object'),
            'fragment': pd.Series([], dtype='object'),
            'username': pd.Series([], dtype='object'),
            'password': pd.Series([], dtype='object'),
            'hostname': pd.Series([], dtype='object'),
            'port': pd.Series([], dtype='int64')
        })

    return pd.concat(url_params_lst, ignore_index=True, sort=False)
def __init__(self, name, credentials, region, qualifier='$LATEST'):
    """Creates a client of AWS Lambda function with ability to invoke
    it synchronously by RequestResponse invocation type.

    By `synchronously` it means, that caller will receive lambda
    function call result in place.  Request to AWS Lambda will be made
    asynchronously.

    See http://docs.aws.amazon.com/lambda/latest/dg/API_Invoke.html
    for details.

    Usage example::

        _ioloop = ioloop.IOLoop.instance()

        @gen.coroutine
        def async_request():
            credentials = Credentials(access_key=<access_key>,
                                      secret_key=<secret_key>)
            payload = {'input_bucket': 'bucket', ...}
            service = Lambda('some-service', credentials, <region>)
            result = yield service(payload)
            _ioloop.stop()

        _ioloop.add_callback(async_request)
        _ioloop.start()

    :param name: Name of the AWS Lambda function.
    :param credentials: AWS credentials.
    :param region: AWS Lambda function region.
    :param qualifier: Lambda function alias or version.
    """
    self.name = name
    self.region = region
    self._credentials = credentials
    self.client = CurlAsyncHTTPClient()
    # Pin the invocation to a specific alias/version when one is given.
    if qualifier:
        query = 'Qualifier={0}'.format(quote(qualifier))
    else:
        query = None
    # Regional Invoke endpoint:
    # https://lambda.<region>.amazonaws.com/<api-ver>/functions/<name>/invocations
    self.url = URL(scheme='https',
                   host='lambda.{0}.amazonaws.com'.format(region),
                   path='{0}/functions/{1}/invocations'.format(
                       self.API_VERSION, name),
                   query=query)
    # Downstream HTTP code expects a plain string, not a URL object.
    self.url = str(self.url)
def _init_mqtt(self, uri):
    """Create and connect the MQTT client from an 'mqtt://' URI.

    Raises WrongSchemeException for any other scheme.

    Fix: the original used ``int(p.port) or 1883``, which raises
    ``ValueError`` on ``int('')`` when the URI has no port — the 1883
    default could never apply.  Convert only when a port is present.
    """
    log.debug('Parsing uri "{}"'.format(uri))
    p = URL(uri)
    if p.scheme != 'mqtt':
        raise WrongSchemeException(uri)
    self.client = mqtt.Client()
    # Credentials are optional in the URI.
    if p.username:
        self.client.username_pw_set(p.username, p.authorization)
    self.client.enable_logger()
    # Default to the standard MQTT port when the URI omits one.
    port = int(p.port) if p.port else 1883
    self.client.connect(p.host, port)
def normalize_image_url(url):
    """
    takes an s3 url or relative url and returns the part that is saved in
    the database (relative to the storage root).
    """
    # Absolute urls are reduced to their path component first.
    if url.startswith("http://") or url.startswith("https://"):
        url = URL(url).path

    # Strip, in order: bucket prefix, storage location prefix, storage
    # base_url prefix, then a single leading slash.
    bucket_prefix = "/{}/".format(timelapse_storage.bucket_name)
    if url.startswith(bucket_prefix):
        url = url[len(bucket_prefix):]
    if url.startswith(timelapse_storage.location):
        url = url[len(timelapse_storage.location):]
    if hasattr(timelapse_storage, "base_url") and url.startswith(
            timelapse_storage.base_url):
        url = url[len(timelapse_storage.base_url):]
    if url.startswith("/"):
        url = url[1:]
    return url
def test_constructor(self):
    """URL builds equivalently from a string, positional parts, or keywords."""
    parsed = URL('a://*****:*****@d:5/f?g#h')
    # positional parts (the first slot is the url string, here None)
    self.assertEqual(parsed,
                     URL(None, 'a', 'b:c', 'd', '5', '/f', 'g', 'h'))
    # keyword parts
    self.assertEqual(
        parsed,
        URL(scheme='a', userinfo='b:c', host='d', port='5', path='/f',
            query='g', fragment='h'))
    # part keywords are ignored when a url string is given
    self.assertEqual(URL('//host'), URL('//host', scheme='sh', port='80'))
def __init__(self, base_url, **kwargs):
    """Create a request session bound to *base_url*, which must be an
    absolute URL (scheme and host present)."""
    parsed = URL(base_url)
    if not (parsed.scheme and parsed.host):
        raise EzReqURLError('Unsupported URL!')

    # Backup scheme for later protocol-relative urls.
    self._scheme = parsed.scheme
    self._base_url = base_url
    self._session = Session()
    self._last_url = base_url
    # `self._headers` -> `self._session.headers`
    self._headers = self._session.headers

    self._session.headers.update(kwargs.pop('headers', {}))

    retries = kwargs.pop('max_retries', 3)
    self._session.mount('http://', HTTPAdapter(max_retries=retries))
    self._session.mount('https://', HTTPAdapter(max_retries=retries))
def test_replace(self):
    """replace() returns a NEW URL with the given fields swapped in."""
    for url in [
            URL('htttp://[email protected]:8080/path?query#fragment'),
            URL(),
            URL('path'),
            URL('//host:80')
    ]:
        # replace() always returns a new object, never mutates in place
        self.assertFalse(url is url.replace(host='strange'))
        self.assertEqual(url, url.replace())
        for idx, (field, value) in enumerate(zip(url._fields, url._data)):
            # replace to same
            self.assertEqual(url.replace(**{field: value}), url)
            # clear
            self.assertEqual(url.replace(**{field: ''})[idx], '')
            # replace to some; a relative path gains a leading slash
            # when the URL has an authority part
            if url.has_authority() and field == 'path':
                self.assertEqual(url.replace(**{field: 'an'})[idx], '/an')
            else:
                self.assertEqual(url.replace(**{field: 'an'})[idx], 'an')

    # composite pseudo-field: authority (userinfo@host:port)
    for url, authority in [(URL('a://*****:*****@d:5/f?g#h'), 'blah'),
                           (URL('a://blah/f?g#h'), '')]:
        orig_autho = url.authority
        url = url.replace(authority=authority)
        self.assertEqual(url.authority, authority)
        url = url.replace(authority=orig_autho)
        self.assertEqual(url.authority, orig_autho)

    # composite pseudo-field: full_path (path?query#fragment)
    for url, full_path in [(URL('a://*****:*****@d:5/f?g#h'), ''),
                           (URL('a://*****:*****@d:5/f?g#h'), '/path'),
                           (URL('a://*****:*****@d:5/f?g#h'), '/path?qr'),
                           (URL('a://*****:*****@d:5/f?g#h'), '?qr'),
                           (URL('a://*****:*****@d:5/f?g#h'), '?qr#fr'),
                           (URL('a://*****:*****@d:5/f?g#h'), '#fr'),
                           (URL('a://*****:*****@d:5'), '/path')]:
        orig_path = url.full_path
        url = url.replace(full_path=full_path)
        self.assertEqual(url.full_path, full_path)
        url = url.replace(full_path=orig_path)
        self.assertEqual(url.full_path, orig_path)
def test_authority(self):
    """authority reproduces userinfo/host/port exactly as given."""
    cases = ['', 'ya.ru', 'ya.ru:80', ':80', '*****@*****.**',
             'info@', 'info@:80']
    for authority in cases:
        self.assertEqual(URL('//' + authority).authority, authority)
def test_full_path(self):
    """full_path is the path?query#fragment tail, reproduced verbatim."""
    cases = ['', 'path', 'path?query', 'path#fragment',
             'path?query#fragment', '?query', '#fragment',
             '?query#fragment']
    for full_path in cases:
        self.assertEqual(URL(full_path).full_path, full_path)
def test_ip(url, host_ip, host_ipv4):
    # Check both the generic-IP and the IPv4-specific host predicates
    # (uses `self` from the enclosing test method's scope).
    parsed = URL(url)
    self.assertEqual(parsed.is_host_ip(), host_ip)
    self.assertEqual(parsed.is_host_ipv4(), host_ipv4)
def test_valid(url, relative, relative_path):
    # Check both relativity predicates for one url
    # (uses `self` from the enclosing test method's scope).
    parsed = URL(url)
    self.assertEqual(parsed.is_relative(), relative)
    self.assertEqual(parsed.is_relative_path(), relative_path)
def test_str(self):
    """str() round-trips parsed urls and normalizes constructed ones.

    Fix: the case list had a missing comma — ``'//host/' 'scheme://host'``
    implicitly concatenated into the single bogus case
    ``'//host/scheme://host'``; split back into the two intended cases.
    """
    for url in [
            '', '//host', '//host/', 'scheme://host', '//host/path',
            '?query', 'path?query', 'http:', 'http:?query', '//host?query'
    ]:
        self.assertEqual(str(URL(url)), url)
        self.assertEqual(URL(str(URL(url))), URL(url))

    # should append slash to path when an authority is present
    self.assertEqual(str(URL(host='host', path='path')), '//host/path')
    self.assertEqual(str(URL(host='host', path='//path')), '//host//path')
    self.assertEqual(str(URL(path='//path').validate()), '////path')
    self.assertEqual(str(URL(path='//pa:th').validate()), '////pa:th')
    # a relative path whose first segment contains ':' is prefixed with
    # './' so it cannot be mistaken for a scheme
    self.assertEqual(str(URL(path='pa:th').validate()), './pa:th')
    self.assertEqual(str(URL(path='not/pa:th').validate()), 'not/pa:th')
    self.assertEqual(str(URL(path='pa:th/not').validate()), './pa:th/not')