def set_language_and_region(request, region_code="GL", language_code="en"):
    """
    Adapted from Django's set_language view.

    Redirect to a given URL while setting the chosen language in the session
    or cookie. The URL and the language code need to be specified in the
    request parameters, or will be taken from HTTP_REFERER.
    """
    next_url = request.POST.get('next', request.GET.get('next'))
    if not is_safe_url(url=next_url, host=request.get_host()):
        next_url = request.META.get('HTTP_REFERER')
        if not is_safe_url(url=next_url, host=request.get_host()):
            next_url = '/GL/en/'  # Default global region with English language.

    # In case of bogus information, fall back to the default region, or the
    # default language for that region if one exists.
    region, new_language_code = get_region(region_code, language_code)
    if new_language_code != language_code:
        language_code = new_language_code

    old_path = URL(next_url).path
    if old_path == "/":
        new_path = "/%s/" % "/".join([region.code, language_code])
    else:
        # Replace the leading region/language segments, keep the rest of the path
        new_path = "/" + "/".join([region.code, language_code] + old_path.split("/")[3:])

    next_url = URL(next_url).replace(path=new_path)
    response = http.HttpResponseRedirect(next_url)

    if hasattr(request, 'session'):
        request.session[LANGUAGE_SESSION_KEY] = language_code
    else:
        response.set_cookie(settings.LANGUAGE_COOKIE_NAME, language_code,
                            max_age=settings.LANGUAGE_COOKIE_AGE,
                            path=settings.LANGUAGE_COOKIE_PATH,
                            domain=settings.LANGUAGE_COOKIE_DOMAIN)
    return response
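# A quick sketch of the path rewrite above, assuming the URL class used
# throughout these snippets is yurl's (region/language values are illustrative):
from yurl import URL

old_path = URL('/GL/en/products/shoes/').path
segments = old_path.split('/')[3:]               # drop '', 'GL', 'en'
print('/' + '/'.join(['DE', 'de'] + segments))   # -> '/DE/de/products/shoes/'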
def test_decode(self):
    for enc, dec in [('http://%D0%BF%D1%8C%D0%B5%D1%80@local.com/'
                      '%D0%B7%D0%B0%D0%BF%D0%B8%D1%81%D0%B8',
                      'http://пьер@local.com/записи'),
                     ('/%2525', '/%25')]:
        self.assertEqual(URL(enc).decode()._data, URL(dec)._data)
        self.assertEqual(URL(enc).decode().as_string(), dec)
        self.assertEqual(URL(enc).decode().decode().as_string(), dec)
class TumblrFile:
    """
    This is the base container class for all downloadable resources
    associated with Tumblr posts.
    """

    CATEGORY = 'misc'

    def __init__(self, data, container):
        """
        Args:
            data(dict):             API response data
            container(TumblrPost):  Parent container
        """
        self.log = logging.getLogger('tumdlr.containers.file')

        self._data = data
        self.container = container
        self.url = URL(self._data.get('url', self._data.get('post_url')))

    def download(self, context, **kwargs):
        """
        Args:
            context(tumdlr.main.Context):   CLI request context
            kwargs(dict):                   Additional arguments to send with the download request

        Returns:
            str: Path to the saved file
        """
        try:
            download(self.url.as_string(), str(self.filepath(context, kwargs)), **kwargs)
        except Exception as e:
            self.log.warning('Post download failed: %r', self, exc_info=e)
            raise TumdlrDownloadError(error_message=str(e), download_url=self.url.as_string())

    def filepath(self, context, request_data):
        """
        Args:
            context(tumdlr.main.Context):   CLI request context
            request_data(Optional[dict]):   Additional arguments to send with the download request

        Returns:
            Path
        """
        # Construct the save basedir
        basedir = Path(context.config['Tumdlr']['SavePath'])

        # Are we categorizing by user?
        if context.config['Categorization']['User']:
            self.log.debug('Categorizing by user: %s', self.container.blog.name)
            basedir = basedir.joinpath(sanitize_filename(self.container.blog.name))

        # Are we categorizing by post type?
        if context.config['Categorization']['PostType']:
            self.log.debug('Categorizing by type: %s', self.CATEGORY)
            basedir = basedir.joinpath(self.CATEGORY)

        self.log.debug('Basedir constructed: %s', basedir)
        return basedir
def test_username_and_authorization(self):
    for userinfo, un, az in [('user', 'user', ''),
                             ('user:', 'user', ''),
                             ('user:pass', 'user', 'pass'),
                             ('user:pass:buzz', 'user', 'pass:buzz'),
                             (':pass', '', 'pass'),
                             (':pass:buzz', '', 'pass:buzz'),
                             ('', '', ''),
                             (':', '', ''),
                             ('::', '', ':')]:
        self.assertEqual(URL(userinfo=userinfo).username, un)
        self.assertEqual(URL(userinfo=userinfo).authorization, az)
def __init__(self, connection_string: str, profile_name: str = 'default'):
    self.creds = {}
    self.url = URL(connection_string)
    self.profile = profile_name
    self.qs = dict(parse_qsl(self.url.query))

    if self._gather_aws_creds():
        self._update_url()
    else:
        raise InvalidAWSSession
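# Shape of the parsed pieces, with a made-up connection string (the real
# scheme and query parameters depend on _gather_aws_creds, which is not shown):
from urllib.parse import parse_qsl

url = URL('s3://bucket/prefix?profile=dev&region=us-east-1')
print(dict(parse_qsl(url.query)))  # -> {'profile': 'dev', 'region': 'us-east-1'}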
def __init__(self, data, container): """ Args: data(dict): API response data container(TumblrPost): Parent container """ self.log = logging.getLogger('tumdlr.containers.file') self._data = data self.container = container self.url = URL(self._data.get('url', self._data.get('post_url')))
def test_pickling(self):
    import pickle

    dump = pickle.dumps(URL('a://b:c@d:5/f?g#h'))
    self.assertEqual(pickle.loads(dump), URL('a://b:c@d:5/f?g#h'))

    global _test_picklingURL

    class _test_picklingURL(URL):
        def __new__(cls, path):
            return super(_test_picklingURL, cls).__new__(cls, path)

    url = _test_picklingURL('a://b:c@d:5/f?g#h')
    self.assertEqual(pickle.loads(pickle.dumps(url)), url)
    self.assertEqual(type(pickle.loads(pickle.dumps(url))), type(url))
def _api_parse_response(self):
    """
    Parse an API response
    """
    blog = self._api_response.json()['response']['blog']

    self.title = blog['title']
    self.url = URL(blog['url'])
    self.name = blog['name']
    self.description = blog['description']
    self.is_nsfw = blog['is_nsfw']
    self.likes = blog.get('likes', False)  # Returned only if sharing of likes is enabled
    self.post_count = blog['posts']
    self.updated = blog['updated']

    posts = self._api_response.json()['response']['posts']
    for post in posts:
        try:
            if post['type'] in ['photo', 'link']:
                self._posts.append(TumblrPhotoSet(post, self))
            elif post['type'] == 'video':
                self._posts.append(TumblrVideoPost(post, self))
            else:
                self._posts.append(TumblrPost(post, self))
        except TumdlrParserError:
            continue
def _parse_result(self):
    """
    Parse search result data.

    Raises:
        PoogleParserError: Raised if the result can not be parsed for any reason
    """
    self.title = self._soup.a.text
    self._log.info('Result title parsed: %s', self.title)

    # Make sure this is a valid result URL (and not a link to image results, as an example).
    href = self._soup.a.get('href')
    if not href.startswith('/url?'):
        raise PoogleParserError('Unrecognized URL format: %s', href)

    match = self.url_regex.match(href)
    if not match or not match.group('url'):
        self._log.error('Unable to parse search result URL: {h}'.format(h=href))
        raise PoogleParserError('Unable to parse search result URL: %s', href)

    url = unquote(match.group('url'))
    self.url = URL(url)
    self._log.info('Result URL parsed: %s', self.url)
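# The url_regex referenced above is not shown in this snippet; a plausible
# (assumed, not the project's actual) pattern for Google's '/url?q=...' hrefs:
import re

url_regex = re.compile(r'^/url\?(?:.*&)?q=(?P<url>[^&]+)')
match = url_regex.match('/url?q=https://example.com/page&sa=U')
print(match.group('url'))  # -> 'https://example.com/page'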
def __parse_urls(self, row, url_params, job_name, job_id=None):
    for u in row['urls']:
        try:
            parsed = URL(u['expanded_url'])
            url_params.append({
                'tweet_id': row['status_id'],
                'url': u['expanded_url'],
                'job_id': job_id,
                'job_name': job_name,
                'schema': parsed.scheme,
                'netloc': parsed.authority,
                'path': parsed.path,
                'params': '',  # parsed.params
                'query': parsed.query,
                'fragment': parsed.fragment,
                'username': parsed.username,
                'password': parsed.authorization,
                'hostname': parsed.host,
                'port': parsed.port,
            })
        except Exception as inst:
            logging.error(type(inst))  # the exception instance
            logging.error(inst.args)   # arguments stored in .args
            logging.error(inst)        # __str__ allows args to be printed directly
    return url_params
def normalize_image_url(url):
    """
    Takes an S3 URL or relative URL and returns the part that is saved in
    the database (relative to the storage root).
    """
    if url.startswith('http://') or url.startswith('https://'):
        url = URL(url).path
        bucket = '/{}/'.format(timelapse_storage.bucket_name)
        if url.startswith(bucket):
            url = url[len(bucket):]
    if url.startswith(timelapse_storage.location):
        url = url[len(timelapse_storage.location):]
    if hasattr(timelapse_storage, 'base_url') and url.startswith(timelapse_storage.base_url):
        url = url[len(timelapse_storage.base_url):]
    if url.startswith('/'):
        url = url[1:]
    return url
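# Illustration of the intent with a stand-in storage object (attribute values
# are assumptions, not the project's real configuration):
class _FakeStorage:
    bucket_name = 'my-bucket'
    location = 'uploads'

timelapse_storage = _FakeStorage()
print(normalize_image_url('https://s3.amazonaws.com/my-bucket/uploads/img/1.jpg'))
# -> 'img/1.jpg'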
def get_url(self, data=None):
    cfg = self.config
    service = cfg.services[self.get_name()]
    dependency = cfg.dependencies[service['dependency']][cfg.env]

    serv = copy(service)
    del serv['dependency']

    url_options = {**serv, **dependency}
    return URL(**url_options)
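# For reference, composing a URL from keyword options with the yurl-style
# constructor (the option values here are made up):
url_options = {'scheme': 'https', 'host': 'api.internal', 'port': '8443', 'path': '/v1/health'}
print(str(URL(**url_options)))  # -> 'https://api.internal:8443/v1/health'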
def test_setdefault(self):
    empty = URL()
    full1 = URL('scheme://user@host:80/path?query#frgment')
    full2 = URL('an://oth@er:33/full?url#!!')

    self.assertEqual(empty.setdefault(*full1._data), full1)
    self.assertEqual(full1.setdefault(*full2._data), full1)

    for idx, (field, value) in enumerate(zip(full1._fields, full1._data)):
        self.assertEqual(empty.setdefault(**{field: value}),
                         empty.replace(**{field: value}))
        self.assertEqual(empty.setdefault(**{field: value})[idx], value)
        self.assertEqual(full2.setdefault(**{field: value})[idx], full2[idx])
def get_hostname_from_url(url):
    hostname = ""

    if _url_enabled:
        hostname = URL(url).host.strip("[]")
    else:
        hostname = urlparse(url).hostname

    if not hostname:
        hostname = url.lower()

    return hostname
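# The strip("[]") above exists because URL(...).host keeps the brackets
# around IPv6 literals (see the is_host_ip tests elsewhere in this section),
# while urlparse(...).hostname drops them:
print(URL('http://[::1]:8080/x').host)           # -> '[::1]'
print(urlparse('http://[::1]:8080/x').hostname)  # -> '::1'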
def InfluxDB(uri):
    logging.debug('Parsing uri "{}"'.format(uri))
    p = URL(uri)
    if p.scheme != 'influx':
        raise WrongSchemeException(uri)
    # URL.port is a string and may be empty; fall back to the default port
    return InfluxDBClient(p.host,
                          int(p.port) if p.port else 8086,
                          p.username,
                          p.authorization,
                          p.path.lstrip('/'))
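# Usage sketch; host, credentials and database name are placeholders:
client = InfluxDB('influx://writer:s3cret@metrics.local:8086/telemetry')
# equivalent to InfluxDBClient('metrics.local', 8086, 'writer', 's3cret', 'telemetry')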
def test_stress_authority(self):
    # Authority is the most ambiguous part of a url. An invalid host can
    # contain ':' and '@' (path, for example, can not contain '?', and query
    # can not contain '#'). The host '//no:99:' will be parsed as 'no:99'
    # and in the next recomposition it can be written as '//no:99'. But
    # parsing of '//no:99:' and '//no:99' will be different.
    #
    # case generation:
    #   from re import sub
    #   from itertools import permutations
    #   cases = set(sub('\d+', '7', ''.join(case))
    #               for case in set(permutations('::@@77777')))
    cases = """7:7:7@7@ 7:7:7@7@7 7:7:7@@ 7:7:7@@7 7:7:@7@ 7:7:@7@7 7:7:@@ 7:7:@@7
               7:7@7:7@ 7:7@7:7@7 7:7@7:@ 7:7@7:@7 7:7@7@7: 7:7@7@7:7 7:7@7@: 7:7@7@:7
               7:7@:7@ 7:7@:7@7 7:7@:@ 7:7@:@7 7:7@@7: 7:7@@7:7 7:7@@: 7:7@@:7
               7::7@7@ 7::7@7@7 7::7@@ 7::7@@7 7::@7@ 7::@7@7 7::@@ 7::@@7
               7:@7:7@ 7:@7:7@7 7:@7:@ 7:@7:@7 7:@7@7: 7:@7@7:7 7:@7@: 7:@7@:7
               7:@:7@ 7:@:7@7 7:@:@ 7:@:@7 7:@@7: 7:@@7:7 7:@@: 7:@@:7
               7@7:7:7@ 7@7:7:7@7 7@7:7:@ 7@7:7:@7 7@7:7@7: 7@7:7@7:7 7@7:7@: 7@7:7@:7
               7@7::7@ 7@7::7@7 7@7::@ 7@7::@7 7@7:@7: 7@7:@7:7 7@7:@: 7@7:@:7
               7@7@7:7: 7@7@7:7:7 7@7@7:: 7@7@7::7 7@7@:7: 7@7@:7:7 7@7@:: 7@7@::7
               7@:7:7@ 7@:7:7@7 7@:7:@ 7@:7:@7 7@:7@7: 7@:7@7:7 7@:7@: 7@:7@:7
               7@::7@ 7@::7@7 7@::@ 7@::@7 7@:@7: 7@:@7:7 7@:@: 7@:@:7
               7@@7:7: 7@@7:7:7 7@@7:: 7@@7::7 7@@:7: 7@@:7:7 7@@:: 7@@::7
               :7:7@7@ :7:7@7@7 :7:7@@ :7:7@@7 :7:@7@ :7:@7@7 :7:@@ :7:@@7
               :7@7:7@ :7@7:7@7 :7@7:@ :7@7:@7 :7@7@7: :7@7@7:7 :7@7@: :7@7@:7
               :7@:7@ :7@:7@7 :7@:@ :7@:@7 :7@@7: :7@@7:7 :7@@: :7@@:7
               ::7@7@ ::7@7@7 ::7@@ ::7@@7 ::@7@ ::@7@7 ::@@7
               :@7:7@ :@7:7@7 :@7:@ :@7:@7 :@7@7: :@7@7:7 :@7@: :@7@:7
               :@:7@ :@:7@7 :@:@7 :@@7: :@@7:7 :@@:7
               @7:7:7@ @7:7:7@7 @7:7:@ @7:7:@7 @7:7@7: @7:7@7:7 @7:7@: @7:7@:7
               @7::7@ @7::7@7 @7::@ @7::@7 @7:@7: @7:@7:7 @7:@: @7:@:7
               @7@7:7: @7@7:7:7 @7@7:: @7@7::7 @7@:7: @7@:7:7 @7@:: @7@::7
               @:7:7@ @:7:7@7 @:7:@ @:7:@7 @:7@7: @:7@7:7 @:7@: @:7@:7
               @::7@ @::7@7 @::@7 @:@7: @:@7:7 @:@:7
               @@7:7: @@7:7:7 @@7:: @@7::7 @@:7: @@:7:7 @@::7""".split()

    for case in cases:
        url = URL('//' + case)
        # check that all parts defined in the original url survive reparsing
        self.assertEqual(url, URL(url.as_string()))
        self.assertEqual(url, URL('//' + url.authority))
def test_test(self):
    def test_valid(url, relative, relative_path):
        self.assertEqual(URL(url).is_relative(), relative)
        self.assertEqual(URL(url).is_relative_path(), relative_path)

    test_valid('sc:', False, False)
    test_valid('sc:path/', False, False)
    test_valid('//host', True, False)
    test_valid('/path', True, False)
    test_valid('path/', True, True)
    test_valid('./path/', True, True)
    test_valid('?true', True, True)

    def test_ip(url, host_ip, host_ipv4):
        self.assertEqual(URL(url).is_host_ip(), host_ip)
        self.assertEqual(URL(url).is_host_ipv4(), host_ipv4)

    test_ip('', False, False)
    test_ip('//google/', False, False)
    test_ip('//127.0.1', False, False)
    test_ip('//127.0.0.1', True, True)
    test_ip('//[127.0.0.1]', True, False)

    self.assertTrue(URL('/url'))
    self.assertTrue(URL('url:'))
    self.assertTrue(URL('//url'))
    self.assertTrue(URL('?url'))
    self.assertTrue(URL('#url'))
    self.assertFalse(URL('//@:?#'))
def one_try(self, url, scheme='', host='', path='', query='', fragment='',
            userinfo='', port='', invalid=None, urlsplit=True):
    orig_url = url
    url = URL(url)
    splitted = (scheme, userinfo, host, port, path, query, fragment)
    self.assertEqual(url._data, splitted)
    self.assertEqual(URL(None, *splitted)._data, splitted)
    self.assertEqual(URL(None, *url._data)._data, splitted)

    if invalid:
        self.assertRaises(invalid, url.validate)
    else:
        url.validate()

    if urlsplit and '-v' in sys.argv:
        splitted = (scheme, url.authority, path, query, fragment)
        split_result = self.split(orig_url)
        if split_result != splitted:
            print('\n  urllib issue:', orig_url, self.split(orig_url))
        elif (split_result.hostname or '') != host:
            print('\n  urllib issue:', orig_url, 'host is:', split_result.hostname,
                  'host should:', host)
def __parse_urls_twint(self, df, job_name, job_id):
    url_params_lst = []
    try:
        for index, row in df.iterrows():
            if row["urls"]:
                # NOTE: only the first expanded URL of each tweet is parsed
                url = row["urls"][0]
                parsed = URL(url)
                url_params_lst.append(pd.DataFrame([{
                    'tweet_id': int(row["status_id"]),
                    'full_url': url,
                    'job_id': job_id,
                    'job_name': job_name,
                    'schema': parsed.scheme,
                    'netloc': parsed.authority,
                    'path': parsed.path,
                    'params': '',  # parsed.params
                    'query': parsed.query,
                    'fragment': parsed.fragment,
                    'username': parsed.username,
                    'password': parsed.authorization,
                    'hostname': parsed.host,
                    'port': parsed.port,
                }]))
    except Exception as e:
        logging.error('params.append exn: %s', e)
        logging.error('row: %s', row)
        raise e

    if not url_params_lst:
        # Return an empty, correctly-typed frame when no URLs were found
        return pd.DataFrame({
            'tweet_id': pd.Series([], dtype='int64'),
            'full_url': pd.Series([], dtype='object'),
            'job_id': pd.Series([], dtype='object'),
            'job_name': pd.Series([], dtype='object'),
            'schema': pd.Series([], dtype='object'),
            'netloc': pd.Series([], dtype='object'),
            'path': pd.Series([], dtype='object'),
            'params': pd.Series([], dtype='object'),
            'query': pd.Series([], dtype='object'),
            'fragment': pd.Series([], dtype='object'),
            'username': pd.Series([], dtype='object'),
            'password': pd.Series([], dtype='object'),
            'hostname': pd.Series([], dtype='object'),
            'port': pd.Series([], dtype='int64'),
        })

    return pd.concat(url_params_lst, ignore_index=True, sort=False)
def __init__(self, name, credentials, region, qualifier='$LATEST'):
    """Creates a client of an AWS Lambda function with the ability to invoke
    it synchronously by the RequestResponse invocation type.

    `Synchronously` means that the caller receives the lambda function call
    result in place. The request to AWS Lambda is made asynchronously.

    See http://docs.aws.amazon.com/lambda/latest/dg/API_Invoke.html for details.

    Usage example:

        _ioloop = ioloop.IOLoop.instance()

        @gen.coroutine
        def async_request():
            credentials = Credentials(access_key=<access_key>,
                                      secret_key=<secret_key>)
            payload = {'input_bucket': 'bucket', ...}
            service = Lambda('some-service', credentials, <region>)
            result = yield service(payload)
            _ioloop.stop()

        _ioloop.add_callback(async_request)
        _ioloop.start()

    :param name: Name of the AWS Lambda function.
    :param credentials: AWS credentials.
    :param region: AWS Lambda function region.
    :param qualifier: Lambda function alias or version.
    """
    self.name = name
    self.region = region
    self._credentials = credentials
    self.client = CurlAsyncHTTPClient()

    if qualifier:
        query = 'Qualifier={0}'.format(quote(qualifier))
    else:
        query = None

    self.url = URL(scheme='https',
                   host='lambda.{0}.amazonaws.com'.format(region),
                   path='{0}/functions/{1}/invocations'.format(self.API_VERSION, name),
                   query=query)
    self.url = str(self.url)
def _init_mqtt(self, uri):
    log.debug('Parsing uri "{}"'.format(uri))
    p = URL(uri)
    if p.scheme != 'mqtt':
        raise WrongSchemeException(uri)

    self.client = mqtt.Client()
    if p.username:
        self.client.username_pw_set(p.username, p.authorization)
    self.client.enable_logger()
    # URL.port is a string and may be empty; fall back to the default MQTT port
    self.client.connect(p.host, int(p.port) if p.port else 1883)
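# How a broker URI splits with the URL class used above (placeholder values,
# consistent with the username/authorization tests in this section):
p = URL('mqtt://sensor:s3cret@broker.local:1883')
print(p.scheme, p.username, p.authorization, p.host, p.port)
# -> mqtt sensor s3cret broker.local 1883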
def test_constructor(self):
    # args
    self.assertEqual(URL('a://b:c@d:5/f?g#h'),
                     URL(None, 'a', 'b:c', 'd', '5', '/f', 'g', 'h'))
    # kwargs
    self.assertEqual(URL('a://b:c@d:5/f?g#h'),
                     URL(scheme='a', userinfo='b:c', host='d', port='5',
                         path='/f', query='g', fragment='h'))
    # ignore
    self.assertEqual(URL('//host'), URL('//host', scheme='sh', port='80'))
def __init__(self, base_url, **kwargs):
    u = URL(base_url)
    if not (u.scheme and u.host):
        raise EzReqURLError('Unsupported URL!')

    # Backup scheme
    self._scheme = u.scheme
    self._base_url = base_url
    self._session = Session()
    self._last_url = base_url

    # `self._headers` -> `self._session.headers`
    self._headers = self._session.headers

    headers = kwargs.pop('headers', {})
    self._session.headers.update(headers)

    max_retries = kwargs.pop('max_retries', 3)
    self._session.mount('http://', HTTPAdapter(max_retries=max_retries))
    self._session.mount('https://', HTTPAdapter(max_retries=max_retries))
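# Construction sketch; the URL and header values are placeholders:
client = EzReq('https://example.com',
               headers={'accept-language': 'en'},
               max_retries=5)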
def wrapped_fn(self, url, **kwargs):
    u = URL(url)

    # Fix params not appearing in the referer
    try:
        params = kwargs.pop('params')
        url = '{url}?{params}'.format(url=url, params=urlencode(params))
    except KeyError:
        pass

    if u.scheme and u.host:
        # pylint: disable=protected-access
        self._base_url = str(u.replace(full_path=''))
        self._scheme = u.scheme  # update scheme

    if url.startswith(r'//'):
        # '//example.com' -> '<scheme>://example.com'
        url = '{scheme}:{where}'.format(scheme=self._scheme, where=url)
        self._base_url = url
    elif url.startswith(r'?'):
        # '?page=rss' -> '/?page=rss'
        url = '/' + url
        url = urljoin(self._base_url, url)
    else:
        # '/?page=rss', 'page=rss'
        url = urljoin(self._base_url, url)

    u = URL(self._last_url)
    self._headers.update({
        # HTTP/2 headers are lowercase only
        'origin': str(u.replace(full_path='')),
        'referer': self._last_url,
    })
    self._last_url = url
    return fn(self, url, **kwargs)
class Scanner(object):
    FINGERPRINTS = [
        {
            "type": "git",
            "base": ".git",
            "files": ["index"]
        },
        {
            "type": "svn",
            "base": ".svn",
            "files": ["wc.db"]
        },
        {
            "type": "svn_old",
            "base": ".svn",
            "files": ["entries"]
        },
        # {
        #     "type": "hg",
        #     "base": ".hg",
        #     "files": ["store/00manifest.i"]
        # }
    ]

    SCHEMES = ["HTTP", "HTTPS"]

    def __init__(self, host):
        self.host = URL(host).replace(path="", query="", fragment="")
        self.session = HTTP()
        self.session.headers['User-Agent'] = ("Mozilla/5.0 (Windows NT 6.1; WOW64) "
                                              "AppleWebKit/537.36 (KHTML, like Gecko) "
                                              "Chrome/45.0.2454.101 Safari/537.36")

    def scan_host(self):
        for scheme in self.SCHEMES:
            for fingerprint in self.FINGERPRINTS:
                for file in fingerprint['files']:
                    url = self.host.replace(path=fingerprint['base'] + "/" + file,
                                            scheme=scheme)
                    response = self.session.get(str(url), verify=False)

                    if response.status_code == 200 and \
                            self._filter_false_positive(response.content, fingerprint['type']):
                        return {
                            "file": file,
                            "type": fingerprint['type'],
                            "scheme": scheme,
                            "data": response.content,
                            "host": self.host.replace(scheme=scheme)
                        }
                    # Otherwise: either the file exists but failed verification,
                    # or we got a non-200 response; keep trying the rest.
        return False

    def _filter_false_positive(self, data, type):
        if "<html" in data and "</html>" in data:
            return False
        if type == "git":
            if data[0:4] != "DIRC":
                return False
        if type == "svn_old":
            if "dir" not in data or "file" not in data:
                return False
        if type == "svn":
            if data[0:13] != "SQLite format":
                return False
        if type == "hg":
            if not data.startswith(".hgtag"):
                return False
        return True
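# Usage sketch (the target host is a placeholder; HTTP is whatever session
# class the original module imports):
scanner = Scanner('http://target.example.com/some/page')
result = scanner.scan_host()
if result:
    print('Exposed {type} repository found via {scheme}'.format(**result))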
class TumblrBlog:
    def __init__(self, url, session=None, **kwargs):
        """
        Tumblr blog

        Args:
            url(URL|str): Tumblr profile URL
            session(Optional[Session]): An optional custom Requests session

        Keyword Args:
            api_key(str): Tumblr API key
            uagent(str): Custom User-Agent header
        """
        self._url = url if isinstance(url, URL) else URL(url)
        self._api_url = URL(scheme='https', host='api.tumblr.com', path='/v2/')
        self._api_response = None  # type: Response
        self._api_key = kwargs.get('api_key', 'fuiKNFp9vQFvjLNvx4sUwti4Yb5yGutBN4Xh10LXZhhRKjWlV4')
        self._uagent = kwargs.get('user_agent', 'tumdlr/{version}')

        if not session:
            session = Session()
            session.headers.update({
                'Referer': urllib.parse.quote(self._url.as_string()),
                'User-Agent': self._uagent.format(version=__version__)
            })

        self.session = session

        self.title = None        # type: str
        self.url = None          # type: URL
        self.name = None         # type: str
        self.description = None  # type: str
        self.is_nsfw = None      # type: bool
        self.likes = None        # type: int|False
        self.post_count = None   # type: int
        self.updated = None      # type: int

        self._posts = []
        self.offset = 0

        self._api_url = self._api_url.replace(
            path=self._api_url.path + 'blog/{host}/posts'.format(host=self._url.host)
        )

        self._api_get()

    def _api_get(self, query=None, parse=True):
        """
        Execute an API query

        Args:
            query(Optional[dict]): Extra query parameters
            parse(Optional[bool]): Parse the API response immediately
        """
        # Parse extra query parameters
        query_extra = []

        if query:
            for key, value in query.items():
                query_extra.append('{key}={value}'.format(
                    key=urllib.parse.quote(key),
                    value=urllib.parse.quote(value)
                ))

        # Only prepend an ampersand if we have extra attributes, otherwise default to an empty string
        if query_extra:
            query_extra = '&' + '&'.join(query_extra)
        else:
            query_extra = ''

        endpoint = self._api_url.replace(
            query='api_key={api_key}&filter=text&offset={offset}{extra}'.format(
                api_key=self._api_key,
                offset=self.offset,
                extra=query_extra
            )
        )

        response = self.session.get(endpoint.as_string())  # type: Response
        response.raise_for_status()

        self._api_response = response
        if parse:
            self._api_parse_response()

    def _api_parse_response(self):
        """
        Parse an API response
        """
        blog = self._api_response.json()['response']['blog']

        self.title = blog['title']
        self.url = URL(blog['url'])
        self.name = blog['name']
        self.description = blog['description']
        self.is_nsfw = blog['is_nsfw']
        self.likes = blog.get('likes', False)  # Returned only if sharing of likes is enabled
        self.post_count = blog['posts']
        self.updated = blog['updated']

        posts = self._api_response.json()['response']['posts']
        for post in posts:
            try:
                if post['type'] in ['photo', 'link']:
                    self._posts.append(TumblrPhotoSet(post, self))
                elif post['type'] == 'video':
                    self._posts.append(TumblrVideoPost(post, self))
                else:
                    self._posts.append(TumblrPost(post, self))
            except TumdlrParserError:
                continue

    def posts(self):
        """
        Yields:
            TumblrPost
        """
        while True:
            # Out of posts?
            if not self._posts:
                # Do we have any more to query?
                self._api_get()

                if not self._posts:
                    # Nope, we've queried everything, break now
                    break

            # Pop our next post and increment the offset
            post = self._posts.pop(0)
            self.offset += 1

            yield post
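# Usage sketch, assuming a valid API key (the blog address is a placeholder):
blog = TumblrBlog('https://staff.tumblr.com')
print(blog.title, blog.post_count)

for post in blog.posts():  # paginates transparently via _api_get()
    print(post)            # TumblrPost.__str__ returns the post URL
    if blog.offset >= 20:  # stop after the first 20 posts
        break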
class TumblrPost:
    """
    This is the base container class for all Tumblr post types. It contains
    data that is always available with any type of post.

    Additional supported post types may extend this class to provide
    additional metadata parsing
    """

    def __init__(self, post, blog):
        """
        Args:
            post(dict): API response
            blog(tumdlr.api.TumblrBlog): Parent blog
        """
        self._post = post
        self.blog = blog
        self.log = logging.getLogger('tumdlr.containers.post')

        self.id = None          # type: int
        self.type = None        # type: str
        self.url = None         # type: URL
        self.tags = set()
        self.post_date = None   # type: str
        self.note_count = None  # type: int

        self.files = []

        try:
            self._parse_post()
        except Exception as e:
            self.log.warning('Failed to parse post data: %r', self, exc_info=e)
            raise TumdlrParserError(post_data=post)

    @property
    def is_text(self):
        """
        Returns:
            bool
        """
        return self.type == 'text'

    @property
    def is_photo(self):
        """
        Returns:
            bool
        """
        return self.type in ['photo', 'link']

    @property
    def is_video(self):
        """
        Returns:
            bool
        """
        return self.type == 'video'

    def _parse_post(self):
        self.id = self._post['id']
        self.type = self._post['type']
        self.url = URL(self._post['post_url']) if 'post_url' in self._post else None
        self.tags = set(self._post.get('tags', []))
        self.note_count = self._post.get('note_count')
        self.post_date = self._post['date']

    def __repr__(self):
        return "<TumblrPost id='{id}' type='{type}' url='{url}'>" \
            .format(id=self.id, type=self.type, url=self.url)

    def __str__(self):
        return self.url.as_string() if self.url else ''
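# Minimal construction sketch with a hand-written post dict (field values are
# illustrative; blog would normally be the parent TumblrBlog, but the base
# parser does not touch it):
data = {
    'id': 1234,
    'type': 'photo',
    'post_url': 'https://example.tumblr.com/post/1234',
    'tags': ['art', 'photography'],
    'note_count': 42,
    'date': '2016-01-01 00:00:00 GMT',
}
post = TumblrPost(data, blog=None)
print(post.is_photo, sorted(post.tags))  # -> True ['art', 'photography']
print(post)                              # -> https://example.tumblr.com/post/1234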
def test_full_path(self):
    for url in ['', 'path', 'path?query', 'path#fragment',
                'path?query#fragment', '?query', '#fragment',
                '?query#fragment']:
        self.assertEqual(URL(url).full_path, url)
def test_str(self):
    for url in ['', '//host', '//host/', 'scheme://host', '//host/path',
                '?query', 'path?query', 'http:', 'http:?query', '//host?query']:
        self.assertEqual(str(URL(url)), url)
        self.assertEqual(URL(str(URL(url))), URL(url))

    # should append slash to path
    self.assertEqual(str(URL(host='host', path='path')), '//host/path')
    self.assertEqual(str(URL(host='host', path='//path')), '//host//path')
    self.assertEqual(str(URL(path='//path').validate()), '////path')
    self.assertEqual(str(URL(path='//pa:th').validate()), '////pa:th')
    self.assertEqual(str(URL(path='pa:th').validate()), './pa:th')
    self.assertEqual(str(URL(path='not/pa:th').validate()), 'not/pa:th')
    self.assertEqual(str(URL(path='pa:th/not').validate()), './pa:th/not')
def test_replace(self):
    for url in [URL('htttp://[email protected]:8080/path?query#fragment'),
                URL(), URL('path'), URL('//host:80')]:
        self.assertFalse(url is url.replace(host='strange'))
        self.assertEqual(url, url.replace())

        for idx, (field, value) in enumerate(zip(url._fields, url._data)):
            # replace with the same value
            self.assertEqual(url.replace(**{field: value}), url)
            # clear
            self.assertEqual(url.replace(**{field: ''})[idx], '')
            # replace with some value
            if url.has_authority() and field == 'path':
                self.assertEqual(url.replace(**{field: 'an'})[idx], '/an')
            else:
                self.assertEqual(url.replace(**{field: 'an'})[idx], 'an')

    for url, authority in [(URL('a://b:c@d:5/f?g#h'), 'blah'),
                           (URL('a://blah/f?g#h'), '')]:
        orig_autho = url.authority
        url = url.replace(authority=authority)
        self.assertEqual(url.authority, authority)
        url = url.replace(authority=orig_autho)
        self.assertEqual(url.authority, orig_autho)

    for url, full_path in [(URL('a://b:c@d:5/f?g#h'), ''),
                           (URL('a://b:c@d:5/f?g#h'), '/path'),
                           (URL('a://b:c@d:5/f?g#h'), '/path?qr'),
                           (URL('a://b:c@d:5/f?g#h'), '?qr'),
                           (URL('a://b:c@d:5/f?g#h'), '?qr#fr'),
                           (URL('a://b:c@d:5/f?g#h'), '#fr'),
                           (URL('a://b:c@d:5'), '/path')]:
        orig_path = url.full_path
        url = url.replace(full_path=full_path)
        self.assertEqual(url.full_path, full_path)
        url = url.replace(full_path=orig_path)
        self.assertEqual(url.full_path, orig_path)