Esempio n. 1
0
 def _parse_post(self):
     """
     Copy the common post attributes out of the raw post payload.

     ``id``, ``type`` and ``date`` are read with plain indexing, so a payload
     missing any of them raises ``KeyError``. ``post_url``, ``tags`` and
     ``note_count`` are optional.
     """
     post = self._post

     self.id = post['id']
     self.type = post['type']

     # The URL is only wrapped when the payload actually carries one
     if 'post_url' in post:
         self.url = URL(post['post_url'])
     else:
         self.url = None

     self.tags = set(post.get('tags', []))
     self.note_count = post.get('note_count')
     self.post_date = post['date']
Esempio n. 2
0
def set_language_and_region(request, region_code="GL", language_code="en"):
    """
    Redirect to a URL whose path is prefixed with the chosen region and language.

    Adapted from Django's ``set_language`` view. The target URL is taken from
    the ``next`` request parameter (POST first, then GET), then from
    ``HTTP_REFERER``, and finally defaults to ``/GL/en/`` when neither passes
    ``is_safe_url``. The language code is stored in the session when the
    request has one, otherwise in the language cookie on the response.
    """
    next_url = request.POST.get('next', request.GET.get('next'))
    if not is_safe_url(url=next_url, host=request.get_host()):
        next_url = request.META.get('HTTP_REFERER')
        if not is_safe_url(url=next_url, host=request.get_host()):
            # Default global region with English language.
            next_url = '/GL/en/'

    # get_region may substitute a different language for bogus input;
    # whatever it returns is the language used from here on.
    region, language_code = get_region(region_code, language_code)

    target = URL(next_url)
    prefix = [region.code, language_code]
    if target.path == "/":
        new_path = "/%s/" % "/".join(prefix)
    else:
        # Drop the first two path segments of the old path and put the
        # new region/language pair in their place.
        new_path = "/" + "/".join(prefix + target.path.split("/")[3:])
    next_url = target.replace(path=new_path)
    response = http.HttpResponseRedirect(next_url)

    if not hasattr(request, 'session'):
        response.set_cookie(settings.LANGUAGE_COOKIE_NAME, language_code,
                            max_age=settings.LANGUAGE_COOKIE_AGE,
                            path=settings.LANGUAGE_COOKIE_PATH,
                            domain=settings.LANGUAGE_COOKIE_DOMAIN)
    else:
        request.session[LANGUAGE_SESSION_KEY] = language_code
    return response
Esempio n. 3
0
 def test_decode(self):
     """Check that ``decode()`` yields the expected URL, also when applied twice."""
     cases = [
         # Percent-encoded UTF-8 in the userinfo and in the path
         ('http://%D0%BF%D1%8C%D0%B5%D1%80@local.com/'
          '%D0%B7%D0%B0%D0%BF%D0%B8%D1%81%D0%B8',
          'http://пьер@local.com/записи'),
         # Doubly encoded percent sign
         ('/%2525', '/%25'),
     ]
     for enc, dec in cases:
         self.assertEqual(URL(enc).decode()._data, URL(dec)._data)
         self.assertEqual(URL(enc).decode().as_string(), dec)
         self.assertEqual(URL(enc).decode().decode().as_string(), dec)
Esempio n. 4
0
class TumblrFile:
    """
    This is the base container class for all downloadable resources associated with Tumblr posts.
    """

    # Directory name used when downloads are categorized by post type
    CATEGORY = 'misc'

    def __init__(self, data, container):
        """
        Args:
            data(dict): API response data
            container(TumblrPost): Parent container
        """
        self.log = logging.getLogger('tumdlr.containers.file')

        self._data      = data
        self.container  = container
        # Prefer the resource's own URL, falling back to the URL of the post
        self.url        = URL(self._data.get('url', self._data.get('post_url')))

    def download(self, context, **kwargs):
        """
        Download this resource to the location given by ``filepath``.

        Args:
            context(tumdlr.main.Context): CLI request context
            kwargs(dict): Additional arguments to send with the download request

        Returns:
            str: Path to the saved file

        Raises:
            TumdlrDownloadError: If resolving the path or downloading fails for any reason
        """
        try:
            source = self.url.as_string()
            save_path = str(self.filepath(context, kwargs))
            download(source, save_path, **kwargs)
        except Exception as e:
            # Logger.warn is a deprecated alias; warning() is the supported spelling
            self.log.warning('Post download failed: %r', self, exc_info=e)
            # Chain the original exception so the root cause stays in the traceback
            raise TumdlrDownloadError(error_message=str(e), download_url=self.url.as_string()) from e

        return save_path

    def filepath(self, context, request_data):
        """
        Build the directory this resource is saved under.

        Args:
            context(tumdlr.main.Context): CLI request context
            request_data(Optional[dict]): Additional arguments to send with the download request
                (not used by this base implementation)

        Returns:
            Path
        """
        # Construct the save basedir
        basedir = Path(context.config['Tumdlr']['SavePath'])

        # Are we categorizing by user?
        if context.config['Categorization']['User']:
            self.log.debug('Categorizing by user: %s', self.container.blog.name)
            basedir = basedir.joinpath(sanitize_filename(self.container.blog.name))

        # Are we categorizing by post type?
        if context.config['Categorization']['PostType']:
            self.log.debug('Categorizing by type: %s', self.CATEGORY)
            basedir = basedir.joinpath(self.CATEGORY)

        self.log.debug('Basedir constructed: %s', basedir)

        return basedir
Esempio n. 5
0
 def test_username_and_authorization(self):
     """Check how ``userinfo`` is split into ``username`` and ``authorization``."""
     # Each case is (userinfo, expected username, expected authorization)
     cases = [
         ('user', 'user', ''),
         ('user:', 'user', ''),
         ('user:pass', 'user', 'pass'),
         ('user:pass:buzz', 'user', 'pass:buzz'),
         (':pass', '', 'pass'),
         (':pass:buzz', '', 'pass:buzz'),
         ('', '', ''),
         (':', '', ''),
         ('::', '', ':'),
     ]
     for userinfo, un, az in cases:
         self.assertEqual(URL(userinfo=userinfo).username, un)
         self.assertEqual(URL(userinfo=userinfo).authorization, az)
    def __init__(self, connection_string: str, profile_name: str = 'default'):
        """
        Parse the connection string and prepare the AWS-backed URL.

        Args:
            connection_string: URL-style connection string; its query string
                is exposed as the ``qs`` dict
            profile_name: Profile name stored on ``self.profile``

        Raises:
            InvalidAWSSession: If ``_gather_aws_creds()`` returns a falsy value
        """
        self.creds = {}
        self.url = URL(connection_string)
        self.profile = profile_name
        self.qs = dict(parse_qsl(self.url.query))

        # Without credentials there is nothing to build the final URL from
        if not self._gather_aws_creds():
            raise InvalidAWSSession
        self._update_url()
Esempio n. 7
0
    def __init__(self, data, container):
        """
        Store the API data and resolve the resource URL.

        Args:
            data(dict): API response data
            container(TumblrPost): Parent container
        """
        self.log = logging.getLogger('tumdlr.containers.file')
        self._data = data
        self.container = container

        # Use the 'url' entry when present, otherwise fall back to 'post_url'
        fallback_url = self._data.get('post_url')
        self.url = URL(self._data.get('url', fallback_url))
Esempio n. 8
0
    def test_pickling(self):
        """URL instances, and instances of URL subclasses, survive a pickle round trip."""
        import pickle

        payload = pickle.dumps(URL('a://*****:*****@d:5/f?g#h'))
        restored = pickle.loads(payload)
        self.assertEqual(restored, URL('a://*****:*****@d:5/f?g#h'))

        # The subclass is declared global so that it is bound as a
        # module-level name rather than a local of this test.
        global _test_picklingURL

        class _test_picklingURL(URL):
            def __new__(cls, path):
                return super(_test_picklingURL, cls).__new__(cls, path)

        url = _test_picklingURL('a://*****:*****@d:5/f?g#h')

        # Value equality first, then the concrete type must be preserved too
        self.assertEqual(pickle.loads(pickle.dumps(url)), url)
        restored_type = type(pickle.loads(pickle.dumps(url)))
        self.assertEqual(restored_type, type(url))
Esempio n. 9
0
File: api.py Progetto: socr8s/tumdlr
    def _api_parse_response(self):
        """
        Parse an API response

        Copies the blog metadata onto this instance and appends one post
        container per entry of the response's ``posts`` list to ``self._posts``.
        Posts whose container raises ``TumdlrParserError`` are skipped.
        """
        # Decode the JSON body once and reuse it for both the blog and the posts
        response = self._api_response.json()['response']
        blog = response['blog']

        self.title = blog['title']
        self.url = URL(blog['url'])
        self.name = blog['name']
        self.description = blog['description']
        self.is_nsfw = blog['is_nsfw']
        self.likes = blog.get('likes', False)  # Returned only if sharing of likes is enabled
        self.post_count = blog['posts']
        self.updated = blog['updated']

        for post in response['posts']:
            # Pick the container class for this post type
            post_type = post['type']
            if post_type in ('photo', 'link'):
                container_cls = TumblrPhotoSet
            elif post_type == 'video':
                container_cls = TumblrVideoPost
            else:
                container_cls = TumblrPost

            try:
                self._posts.append(container_cls(post, self))
            except TumdlrParserError:
                # Deliberate best-effort: an unparsable post must not abort the rest
                continue
Esempio n. 10
0
    def _parse_result(self):
        """
        Parse search result data.

        Reads the title and target URL from the first anchor of ``self._soup``
        and stores them on ``self.title`` and ``self.url``.

        Raises:
            PoogleParserError:  Raised if the result can not be parsed for any reason
        """
        self.title = self._soup.a.text
        self._log.info('Result title parsed: %s', self.title)

        # Make sure this is a valid result URL (and not a link to image results, as an example).
        # An anchor without an href yields None; treat it as unrecognized instead of
        # letting AttributeError escape past the documented PoogleParserError.
        href = self._soup.a.get('href')
        if not href or not href.startswith('/url?'):
            raise PoogleParserError('Unrecognized URL format: %s', href)

        match = self.url_regex.match(href)
        if not match or not match.group('url'):
            self._log.error('Unable to parse search result URL: %s', href)
            raise PoogleParserError('Unable to parse search result URL: %s',
                                    href)

        url = unquote(match.group('url'))
        self.url = URL(url)
        self._log.info('Result URL parsed: %s', self.url)
 def __parse_urls(self, row, url_params, job_name, job_id=None):
     """
     Append one record of URL components to ``url_params`` per entry of ``row['urls']``.

     A failure on one entry is logged (exception type, args, then the
     exception itself) and that entry is skipped. Returns the same
     ``url_params`` list that was passed in.
     """
     for entry in row['urls']:
         try:
             parsed = URL(entry['expanded_url'])
             record = {
                 'tweet_id': row['status_id'],
                 'url': entry['expanded_url'],
                 'job_id': job_id,
                 'job_name': job_name,
                 'schema': parsed.scheme,
                 'netloc': parsed.authority,
                 'path': parsed.path,
                 'params': '',  # parsed.params is not populated
                 'query': parsed.query,
                 'fragment': parsed.fragment,
                 'username': parsed.username,
                 'password': parsed.authorization,
                 'hostname': parsed.host,
                 'port': parsed.port,
             }
             url_params.append(record)
         except Exception as inst:
             # Three separate log lines: the type, the stored args, the instance
             for detail in (type(inst), inst.args, inst):
                 logging.error(detail)
     return url_params
Esempio n. 12
0
def normalize_image_url(url):
    """
    Reduce an s3 url or relative url to the part that is saved in the
    database (relative to the storage root).

    For absolute http(s) URLs only the path is kept, and a leading
    ``/<bucket_name>/`` and storage ``location`` prefix are stripped. For any
    input, a leading storage ``base_url`` (when the storage defines one) and a
    single leading slash are removed.
    """
    if url.startswith(('http://', 'https://')):
        url = URL(url).path

        bucket_prefix = '/{}/'.format(timelapse_storage.bucket_name)
        if url.startswith(bucket_prefix):
            url = url[len(bucket_prefix):]

        location = timelapse_storage.location
        if url.startswith(location):
            url = url[len(location):]

    if hasattr(timelapse_storage, 'base_url'):
        base_url = timelapse_storage.base_url
        if url.startswith(base_url):
            url = url[len(base_url):]

    return url[1:] if url.startswith('/') else url
Esempio n. 13
0
File: api.py Progetto: socr8s/tumdlr
    def __init__(self, url, session=None, **kwargs):
        """
        Tumblr blog

        Builds the posts endpoint URL for the blog's host and immediately
        issues the first API request via ``_api_get()``.

        Args:
            url(URL|str): Tumblr profile URL
            session(Optional[Session]): An optional custom Requests session;
                when omitted a new one is created with Referer and User-Agent headers

        Keyword Args:
            api_key(str): Tumblr API key
            user_agent(str): Custom User-Agent header ('{version}' is filled in
                for sessions created here)
        """
        self._url = URL(url) if not isinstance(url, URL) else url

        # Posts endpoint: /v2/blog/<host>/posts on the API host
        api_root = URL(scheme='https', host='api.tumblr.com', path='/v2/')
        posts_path = api_root.path + 'blog/{host}/posts'.format(host=self._url.host)
        self._api_url = api_root.replace(path=posts_path)

        self._api_response = None  # type: Response
        self._api_key = kwargs.get(
            'api_key', 'fuiKNFp9vQFvjLNvx4sUwti4Yb5yGutBN4Xh10LXZhhRKjWlV4')
        self._uagent = kwargs.get('user_agent', 'tumdlr/{version}')

        if not session:
            default_headers = {
                'Referer': urllib.parse.quote(self._url.as_string()),
                'User-Agent': self._uagent.format(version=__version__),
            }
            session = Session()
            session.headers.update(default_headers)
        self.session = session

        # Blog metadata, filled in once an API response has been parsed
        self.title = None  # type: str
        self.url = None  # type: URL
        self.name = None  # type: str
        self.description = None  # type: str
        self.is_nsfw = None  # type: bool
        self.likes = None  # type: int|False
        self.post_count = None  # type: int
        self.updated = None  # type: int

        self._posts = []
        self.offset = 0

        self._api_get()
Esempio n. 14
0
 def get_url(self, data=None):
     """
     Build the URL for this service.

     The service's own settings (minus the ``dependency`` key) are merged with
     the settings of that dependency for the current environment; dependency
     values win on conflict. ``data`` is accepted but not used.
     """
     cfg = self.config
     service = cfg.services[self.get_name()]
     dependency = cfg.dependencies[service['dependency']][cfg.env]

     # 'dependency' only names the lookup entry; it is not a URL component
     url_options = {
         key: value for key, value in service.items() if key != 'dependency'
     }
     url_options.update(dependency)
     return URL(**url_options)
Esempio n. 15
0
File: test.py Progetto: homm/yurl
    def test_setdefault(self):
        """``setdefault`` fills only the components that are still empty."""
        empty = URL()
        full1 = URL('scheme://user@host:80/path?query#frgment')
        full2 = URL('an://oth@er:33/full?url#!!')

        # Positional form: everything is filled on an empty URL, nothing on a full one
        self.assertEqual(empty.setdefault(*full1._data), full1)
        self.assertEqual(full1.setdefault(*full2._data), full1)

        # Keyword form, one component at a time
        pairs = zip(full1._fields, full1._data)
        for idx, (field, value) in enumerate(pairs):
            single = {field: value}
            self.assertEqual(empty.setdefault(**single),
                             empty.replace(**single))
            self.assertEqual(empty.setdefault(**single)[idx], value)
            self.assertEqual(full2.setdefault(**single)[idx],
                             full2[idx])
Esempio n. 16
0
def get_hostname_from_url(url):
    """
    Return the host part of ``url``.

    When ``_url_enabled`` is set, the host comes from ``URL`` with surrounding
    square brackets stripped; otherwise from ``urlparse``. If no host is
    found, the lower-cased input is returned instead.
    """
    if _url_enabled:
        hostname = URL(url).host.strip("[]")
    else:
        hostname = urlparse(url).hostname
    return hostname or url.lower()
Esempio n. 17
0
def InfluxDB(uri):
    """
    Create an ``InfluxDBClient`` from an ``influx://`` URI.

    Host, port (8086 when the URI has none), username, authorization and the
    path without leading slashes are passed to the client, in that order.

    Raises:
        WrongSchemeException: If the URI scheme is not ``influx``
    """
    logging.debug('Parsing uri "{}"'.format(uri))
    parsed = URL(uri)

    if parsed.scheme != 'influx':
        raise WrongSchemeException(uri)

    port = parsed.port or 8086
    database = parsed.path.lstrip('/')
    return InfluxDBClient(parsed.host, port, parsed.username,
                          parsed.authorization, database)
Esempio n. 18
0
    def test_stress_authority(self):
        """Every generated authority must survive recomposition and re-parsing."""
        # Authority is the most ambiguous part of a url. An invalid host can
        # contain ':' and '@' (whereas a path, for example, can not contain '?',
        # and a query can not contain '#'). The host '//no:99:' will be parsed
        # as 'no:99' and in the next recomposition it can be written as
        # '//no:99'. But parsing of '//no:99:' and '//no:99' will be different.

        # # case generation:
        # from re import sub
        # from itertools import permutations
        # cases = set(sub('\d+', '7', ''.join(case))
        #             for case in set(permutations('::@@77777')))

        cases = """7:7:7@7@ 7:7:7@7@7 7:7:7@@ 7:7:7@@7 7:7:@7@ 7:7:@7@7 7:7:@@
            7:7:@@7 7:7@7:7@ 7:7@7:7@7 7:7@7:@ 7:7@7:@7 7:7@7@7: 7:7@7@7:7
            7:7@7@: 7:7@7@:7 7:7@:7@ 7:7@:7@7 7:7@:@ 7:7@:@7 7:7@@7: 7:7@@7:7
            7:7@@: 7:7@@:7 7::7@7@ 7::7@7@7 7::7@@ 7::7@@7 7::@7@ 7::@7@7 7::@@
            7::@@7 7:@7:7@ 7:@7:7@7 7:@7:@ 7:@7:@7 7:@7@7: 7:@7@7:7 7:@7@:
            7:@7@:7 7:@:7@ 7:@:7@7 7:@:@ 7:@:@7 7:@@7: 7:@@7:7 7:@@: 7:@@:7
            7@7:7:7@ 7@7:7:7@7 7@7:7:@ 7@7:7:@7 7@7:7@7: 7@7:7@7:7 7@7:7@:
            7@7:7@:7 7@7::7@ 7@7::7@7 7@7::@ 7@7::@7 7@7:@7: 7@7:@7:7 7@7:@:
            7@7:@:7 7@7@7:7: 7@7@7:7:7 7@7@7:: 7@7@7::7 7@7@:7: 7@7@:7:7 7@7@::
            7@7@::7 7@:7:7@ 7@:7:7@7 7@:7:@ 7@:7:@7 7@:7@7: 7@:7@7:7 7@:7@:
            7@:7@:7 7@::7@ 7@::7@7 7@::@ 7@::@7 7@:@7: 7@:@7:7 7@:@: 7@:@:7
            7@@7:7: 7@@7:7:7 7@@7:: 7@@7::7 7@@:7: 7@@:7:7 7@@:: 7@@::7 :7:7@7@
            :7:7@7@7 :7:7@@ :7:7@@7 :7:@7@ :7:@7@7 :7:@@ :7:@@7 :7@7:7@
            :7@7:7@7 :7@7:@ :7@7:@7 :7@7@7: :7@7@7:7 :7@7@: :7@7@:7 :7@:7@
            :7@:7@7 :7@:@ :7@:@7 :7@@7: :7@@7:7 :7@@: :7@@:7 ::7@7@ ::7@7@7
            ::7@@ ::7@@7 ::@7@ ::@7@7 ::@@7 :@7:7@ :@7:7@7 :@7:@ :@7:@7 :@7@7:
            :@7@7:7 :@7@: :@7@:7 :@:7@ :@:7@7 :@:@7 :@@7: :@@7:7 :@@:7 @7:7:7@
            @7:7:7@7 @7:7:@ @7:7:@7 @7:7@7: @7:7@7:7 @7:7@: @7:7@:7 @7::7@
            @7::7@7 @7::@ @7::@7 @7:@7: @7:@7:7 @7:@: @7:@:7 @7@7:7: @7@7:7:7
            @7@7:: @7@7::7 @7@:7: @7@:7:7 @7@:: @7@::7 @:7:7@ @:7:7@7 @:7:@
            @:7:@7 @:7@7: @:7@7:7 @:7@: @:7@:7 @::7@ @::7@7 @::@7 @:@7: @:@7:7
            @:@:7 @@7:7: @@7:7:7 @@7:: @@7::7 @@:7: @@:7:7 @@::7""".split()

        for case in cases:
            url = URL('//' + case)
            # check that the parsed url is unchanged after being rebuilt from
            # its string form and from its authority alone
            self.assertEqual(url, URL(url.as_string()))
            self.assertEqual(url, URL('//' + url.authority))
Esempio n. 19
0
File: test.py Progetto: homm/yurl
    def test_stress_authority(self):
        """Every generated authority must survive recomposition and re-parsing."""
        # Authority is the most ambiguous part of a url. An invalid host can
        # contain ':' and '@' (whereas a path, for example, can not contain '?',
        # and a query can not contain '#'). The host '//no:99:' will be parsed
        # as 'no:99' and in the next recomposition it can be written as
        # '//no:99'. But parsing of '//no:99:' and '//no:99' will be different.

        # # case generation:
        # from re import sub
        # from itertools import permutations
        # cases = set(sub('\d+', '7', ''.join(case))
        #             for case in set(permutations('::@@77777')))

        cases = """7:7:7@7@ 7:7:7@7@7 7:7:7@@ 7:7:7@@7 7:7:@7@ 7:7:@7@7 7:7:@@
            7:7:@@7 7:7@7:7@ 7:7@7:7@7 7:7@7:@ 7:7@7:@7 7:7@7@7: 7:7@7@7:7
            7:7@7@: 7:7@7@:7 7:7@:7@ 7:7@:7@7 7:7@:@ 7:7@:@7 7:7@@7: 7:7@@7:7
            7:7@@: 7:7@@:7 7::7@7@ 7::7@7@7 7::7@@ 7::7@@7 7::@7@ 7::@7@7 7::@@
            7::@@7 7:@7:7@ 7:@7:7@7 7:@7:@ 7:@7:@7 7:@7@7: 7:@7@7:7 7:@7@:
            7:@7@:7 7:@:7@ 7:@:7@7 7:@:@ 7:@:@7 7:@@7: 7:@@7:7 7:@@: 7:@@:7
            7@7:7:7@ 7@7:7:7@7 7@7:7:@ 7@7:7:@7 7@7:7@7: 7@7:7@7:7 7@7:7@:
            7@7:7@:7 7@7::7@ 7@7::7@7 7@7::@ 7@7::@7 7@7:@7: 7@7:@7:7 7@7:@:
            7@7:@:7 7@7@7:7: 7@7@7:7:7 7@7@7:: 7@7@7::7 7@7@:7: 7@7@:7:7 7@7@::
            7@7@::7 7@:7:7@ 7@:7:7@7 7@:7:@ 7@:7:@7 7@:7@7: 7@:7@7:7 7@:7@:
            7@:7@:7 7@::7@ 7@::7@7 7@::@ 7@::@7 7@:@7: 7@:@7:7 7@:@: 7@:@:7
            7@@7:7: 7@@7:7:7 7@@7:: 7@@7::7 7@@:7: 7@@:7:7 7@@:: 7@@::7 :7:7@7@
            :7:7@7@7 :7:7@@ :7:7@@7 :7:@7@ :7:@7@7 :7:@@ :7:@@7 :7@7:7@
            :7@7:7@7 :7@7:@ :7@7:@7 :7@7@7: :7@7@7:7 :7@7@: :7@7@:7 :7@:7@
            :7@:7@7 :7@:@ :7@:@7 :7@@7: :7@@7:7 :7@@: :7@@:7 ::7@7@ ::7@7@7
            ::7@@ ::7@@7 ::@7@ ::@7@7 ::@@7 :@7:7@ :@7:7@7 :@7:@ :@7:@7 :@7@7:
            :@7@7:7 :@7@: :@7@:7 :@:7@ :@:7@7 :@:@7 :@@7: :@@7:7 :@@:7 @7:7:7@
            @7:7:7@7 @7:7:@ @7:7:@7 @7:7@7: @7:7@7:7 @7:7@: @7:7@:7 @7::7@
            @7::7@7 @7::@ @7::@7 @7:@7: @7:@7:7 @7:@: @7:@:7 @7@7:7: @7@7:7:7
            @7@7:: @7@7::7 @7@:7: @7@:7:7 @7@:: @7@::7 @:7:7@ @:7:7@7 @:7:@
            @:7:@7 @:7@7: @:7@7:7 @:7@: @:7@:7 @::7@ @::7@7 @::@7 @:@7: @:@7:7
            @:@:7 @@7:7: @@7:7:7 @@7:: @@7::7 @@:7: @@:7:7 @@::7""".split()

        for case in cases:
            url = URL('//' + case)
            # check that the parsed url is unchanged after being rebuilt from
            # its string form and from its authority alone
            self.assertEqual(url, URL(url.as_string()))
            self.assertEqual(url, URL('//' + url.authority))
Esempio n. 20
0
    def test_test(self):
        """Exercise the ``is_*`` predicates and the truthiness of URL objects."""
        # (url, expected is_relative(), expected is_relative_path())
        relative_cases = [
            ('sc:', False, False),
            ('sc:path/', False, False),
            ('//host', True, False),
            ('/path', True, False),
            ('path/', True, True),
            ('./path/', True, True),
            ('?true', True, True),
        ]
        for url, relative, relative_path in relative_cases:
            self.assertEqual(URL(url).is_relative(), relative)
            self.assertEqual(URL(url).is_relative_path(), relative_path)

        # (url, expected is_host_ip(), expected is_host_ipv4())
        ip_cases = [
            ('', False, False),
            ('//google/', False, False),
            ('//127.0.1', False, False),
            ('//127.0.0.1', True, True),
            ('//[127.0.0.1]', True, False),
        ]
        for url, host_ip, host_ipv4 in ip_cases:
            self.assertEqual(URL(url).is_host_ip(), host_ip)
            self.assertEqual(URL(url).is_host_ipv4(), host_ipv4)

        # A URL with any non-empty component is truthy
        for text in ('/url', 'url:', '//url', '?url', '#url'):
            self.assertTrue(URL(text))
        self.assertFalse(URL('//@:?#'))
Esempio n. 21
0
    def one_try(self,
                url,
                scheme='',
                host='',
                path='',
                query='',
                fragment='',
                userinfo='',
                port='',
                invalid=None,
                urlsplit=True):
        """
        Parse ``url`` and check that it splits into the given components.

        The components must also round-trip through the positional
        constructor. When ``invalid`` is given, ``validate()`` must raise it;
        otherwise ``validate()`` must succeed. With ``urlsplit`` set and
        ``-v`` on the command line, differences from ``self.split`` are
        printed (not asserted).
        """
        source = url
        parsed = URL(source)
        expected = (scheme, userinfo, host, port, path, query, fragment)

        self.assertEqual(parsed._data, expected)
        # Rebuilding from the expected tuple and from the parsed data must agree
        for components in (expected, parsed._data):
            self.assertEqual(URL(None, *components)._data, expected)

        if invalid:
            self.assertRaises(invalid, parsed.validate)
        else:
            parsed.validate()

        if not (urlsplit and '-v' in sys.argv):
            return

        # Informational comparison against self.split; mismatches are only reported
        expected_split = (scheme, parsed.authority, path, query, fragment)
        split_result = self.split(source)
        if split_result != expected_split:
            print('\n  urllib issue:', source, self.split(source))
        elif (split_result.hostname or '') != host:
            print('\n  urllib issue:', source, 'host is:',
                  split_result.hostname, 'host should:', host)
Esempio n. 22
0
File: test.py Progetto: homm/yurl
    def one_try(self, url, scheme='', host='', path='', query='', fragment='',
                userinfo='', port='', invalid=None, urlsplit=True):
        """
        Assert that ``url`` parses into exactly the given components.

        Also checks that the positional constructor reproduces the same data,
        and that ``validate()`` raises ``invalid`` when one is expected (and
        passes otherwise). With ``urlsplit`` set and ``-v`` in ``sys.argv``,
        disagreements with ``self.split`` are printed rather than asserted.
        """
        raw_url = url
        url_obj = URL(raw_url)
        wanted = (scheme, userinfo, host, port, path, query, fragment)

        self.assertEqual(url_obj._data, wanted)
        self.assertEqual(URL(None, *wanted)._data, wanted)
        self.assertEqual(URL(None, *url_obj._data)._data, wanted)

        if not invalid:
            url_obj.validate()
        else:
            self.assertRaises(invalid, url_obj.validate)

        verbose = '-v' in sys.argv
        if urlsplit and verbose:
            # Five-part form, comparable with the result of self.split
            wanted_split = (scheme, url_obj.authority, path, query, fragment)
            split_result = self.split(raw_url)
            hostname = split_result.hostname
            if split_result != wanted_split:
                print('\n  urllib issue:', raw_url, self.split(raw_url))
            elif (hostname or '') != host:
                print('\n  urllib issue:', raw_url, 'host is:',
                      hostname, 'host should:', host)
 def __parse_urls_twint(self, df, job_name, job_id):
     """
     Build a DataFrame of URL components for the tweets in ``df``.

     For every row whose ``urls`` value is truthy, the first URL is parsed and
     one record is produced from its components together with the row's
     ``status_id`` and the given ``job_id`` / ``job_name``.

     Returns:
         pd.DataFrame: The concatenated records, or an empty frame with the
         same columns (and explicit dtypes) when no row produced a record.

     Raises:
         Exception: Any error raised while building the records is logged
         together with the current row and then re-raised unchanged.
     """
     url_params_lst = []
     row = None  # keeps the error handler valid if iteration fails before the first row
     try:
         for _, row in df.iterrows():
             if row["urls"]:
                 # NOTE(review): only the first URL of each row is recorded, as
                 # before (the old index counter never advanced inside the
                 # loop) -- confirm whether every URL should be emitted.
                 first_url = list(row["urls"])[0]
                 parsed = URL(first_url)
                 url_params_lst.append(
                     pd.DataFrame([{
                         'tweet_id': int(row["status_id"]),
                         'full_url': first_url,
                         'job_id': job_id,
                         'job_name': job_name,
                         'schema': parsed.scheme,
                         'netloc': parsed.authority,
                         'path': parsed.path,
                         'params': '',  # parsed.params is not populated
                         'query': parsed.query,
                         'fragment': parsed.fragment,
                         'username': parsed.username,
                         'password': parsed.authorization,
                         'hostname': parsed.host,
                         'port': parsed.port
                     }]))
     except Exception as e:
         # The message needs a placeholder for its argument; without one the
         # logging module reports a formatting error and the details are lost.
         logging.error('params.append exn: %s', e)
         logging.error('row: %s', row)
         raise
     if not url_params_lst:
         # Typed empty frame so callers always see the same columns
         return pd.DataFrame({
             'tweet_id': pd.Series([], dtype='int64'),
             'full_url': pd.Series([], dtype='object'),
             'job_id': pd.Series([], dtype='object'),
             'job_name': pd.Series([], dtype='object'),
             'schema': pd.Series([], dtype='object'),
             'netloc': pd.Series([], dtype='object'),
             'path': pd.Series([], dtype='object'),
             'params': pd.Series([], dtype='object'),
             'query': pd.Series([], dtype='object'),
             'fragment': pd.Series([], dtype='object'),
             'username': pd.Series([], dtype='object'),
             'password': pd.Series([], dtype='object'),
             'hostname': pd.Series([], dtype='object'),
             'port': pd.Series([], dtype='int64')
         })
     return pd.concat(url_params_lst, ignore_index=True, sort=False)
Esempio n. 24
0
    def __init__(self, name, credentials, region, qualifier='$LATEST'):
        """Create a client for invoking an AWS Lambda function.

        The invocation URL is built here from the region, ``API_VERSION`` and
        the function name, and stored on ``self.url`` as a plain string.
        According to the original notes, the caller receives the function
        result in place (RequestResponse invocation type) while the HTTP
        request itself is made asynchronously.

        See http://docs.aws.amazon.com/lambda/latest/dg/API_Invoke.html
        for details.

        Usage example:

          _ioloop = ioloop.IOLoop.instance()

          @gen.coroutine
          def async_request():
            credentials = Credentials(access_key=<access_key>,
                                      secret_key=<secret_key>)
            payload = {'input_bucket': 'bucket', ...}
            service = Lambda('some-service', credentials, <region>)
            result = yield service(payload)
            _ioloop.stop()

          _ioloop.add_callback(async_request)
          _ioloop.start()

        :param name: Name of the AWS Lambda function.
        :param credentials: AWS credentials.
        :param region: AWS Lambda function region.
        :param qualifier: Lambda function alias or version; a falsy value
            leaves the query string unset.

        """
        self.name = name
        self.region = region
        self._credentials = credentials
        self.client = CurlAsyncHTTPClient()

        # The qualifier is URL-quoted before it goes into the query string
        query = 'Qualifier={0}'.format(quote(qualifier)) if qualifier else None
        endpoint_host = 'lambda.{0}.amazonaws.com'.format(region)
        invoke_path = '{0}/functions/{1}/invocations'.format(
            self.API_VERSION, name)

        endpoint = URL(scheme='https',
                       host=endpoint_host,
                       path=invoke_path,
                       query=query)
        self.url = str(endpoint)
Esempio n. 25
0
    def _init_mqtt(self, uri):
        """
        Create the MQTT client described by ``uri`` and connect it.

        Args:
            uri(str): Broker URI with the ``mqtt`` scheme; username,
                authorization, host and port are read from it

        Raises:
            WrongSchemeException: If the URI scheme is not ``mqtt``
        """
        log.debug('Parsing uri "{}"'.format(uri))
        p = URL(uri)

        if p.scheme != 'mqtt':
            raise WrongSchemeException(uri)

        self.client = mqtt.Client()
        if p.username:
            self.client.username_pw_set(p.username, p.authorization)

        self.client.enable_logger()

        # Apply the fallback before converting: with no port in the URI the
        # value is presumably empty, and int() of an empty value raises
        # instead of reaching the 1883 default. A port of 0 still maps to 1883.
        port = int(p.port or 0) or 1883
        self.client.connect(
            p.host,
            port,
        )
Esempio n. 26
0
    def __init__(self, url, session=None, **kwargs):
        """
        Tumblr blog

        Prepares the API endpoint for the blog and performs the initial
        request through ``_api_get()``.

        Args:
            url(URL|str): Tumblr profile URL
            session(Optional[Session]): An optional custom Requests session

        Keyword Args:
            api_key(str): Tumblr API key
            user_agent(str): Custom User-Agent header template; '{version}' is
                substituted when the session is created here
        """
        if isinstance(url, URL):
            self._url = url
        else:
            self._url = URL(url)

        self._api_url = URL(scheme='https', host='api.tumblr.com', path='/v2/')
        self._api_response = None  # type: Response
        self._api_key = kwargs.get('api_key', 'fuiKNFp9vQFvjLNvx4sUwti4Yb5yGutBN4Xh10LXZhhRKjWlV4')
        self._uagent = kwargs.get('user_agent', 'tumdlr/{version}')

        # Only a session created here gets the default headers; a session
        # supplied by the caller is used untouched.
        if not session:
            session = Session()
            referer = urllib.parse.quote(self._url.as_string())
            user_agent = self._uagent.format(version=__version__)
            session.headers.update({'Referer': referer, 'User-Agent': user_agent})

        self.session = session

        self.title          = None  # type: str
        self.url            = None  # type: URL
        self.name           = None  # type: str
        self.description    = None  # type: str
        self.is_nsfw        = None  # type: bool
        self.likes          = None  # type: int|False
        self.post_count     = None  # type: int
        self.updated        = None  # type: int

        self._posts = []
        self.offset = 0

        # Extend the API root to this blog's posts endpoint
        endpoint_suffix = 'blog/{host}/posts'.format(host=self._url.host)
        self._api_url = self._api_url.replace(path=self._api_url.path + endpoint_suffix)
        self._api_get()
Esempio n. 27
0
 def test_constructor(self):
     """A URL built from components must equal the same URL parsed from a string."""
     # args
     self.assertEqual(URL('a://b:c@d:5/f?g#h'),
                      URL(None, 'a', 'b:c', 'd', '5', '/f', 'g', 'h'))
     # kwargs
     self.assertEqual(
         URL('a://b:c@d:5/f?g#h'),
         URL(scheme='a',
             userinfo='b:c',
             host='d',
             port='5',
             path='/f',
             query='g',
             fragment='h'))
     # ignore: explicit components are disregarded when a url string is given
     self.assertEqual(URL('//host'), URL('//host', scheme='sh', port='80'))
Esempio n. 28
0
    def __init__(self, base_url, **kwargs):
        """
        Set up a session bound to ``base_url``.

        Keyword Args:
            headers(dict): Extra headers merged into the session headers
            max_retries(int): Retry count for the http/https adapters (default 3)

        Raises:
            EzReqURLError: If ``base_url`` lacks a scheme or a host
        """
        parsed = URL(base_url)
        if not parsed.scheme or not parsed.host:
            raise EzReqURLError('Unsupported URL!')

        # Backup scheme
        self._scheme = parsed.scheme

        self._base_url = base_url
        self._session = Session()
        self._last_url = base_url

        # `self._headers` is the same object as `self._session.headers`
        self._headers = self._session.headers
        self._session.headers.update(kwargs.pop('headers', {}))

        retries = kwargs.pop('max_retries', 3)
        for prefix in ('http://', 'https://'):
            self._session.mount(prefix, HTTPAdapter(max_retries=retries))
Esempio n. 29
0
def normalize_image_url(url):
    """
    Convert an s3 url or relative url into the value stored in the database,
    i.e. the path relative to the storage root.

    Absolute http(s) URLs are cut down to their path, from which a leading
    ``/<bucket_name>/`` and the storage ``location`` are removed. After that a
    leading ``base_url`` (if the storage has one) and one leading slash are
    removed from any input.
    """
    is_absolute = url.startswith("http://") or url.startswith("https://")
    if is_absolute:
        url = URL(url).path
        # Strip the bucket first, then the storage location inside it
        for prefix in ("/{}/".format(timelapse_storage.bucket_name),
                       timelapse_storage.location):
            if url.startswith(prefix):
                url = url[len(prefix):]

    has_base_url = hasattr(timelapse_storage, "base_url")
    if has_base_url and url.startswith(timelapse_storage.base_url):
        url = url[len(timelapse_storage.base_url):]

    if url.startswith("/"):
        return url[1:]
    return url
Esempio n. 30
0
    def wrapped_fn(self, url, **kwargs):
        """
        Resolve ``url`` against the stored base URL, refresh the ``origin`` and
        ``referer`` headers from the previous request URL, then call ``fn``.

        ``params`` (if given) is removed from ``kwargs`` and encoded into the
        URL itself before the call.
        """
        # Parsed from the URL as passed in, before any params are appended
        u = URL(url)

        # Fold `params` into the URL so that they show up in the referer
        try:
            params = kwargs.pop('params')
            url = '{url}?{params}'.format(url=url, params=urlencode(params))
        except KeyError:
            pass

        # An absolute URL replaces the remembered base URL and scheme
        if u.scheme and u.host:
            # pylint: disable=protected-access
            self._base_url = str(u.replace(full_path=''))
            # pylint: disable=protected-access
            self._scheme = u.scheme  # Update scheme

        if url.startswith(r'//'):
            # '//example.com'
            # Scheme-relative: prepend the remembered scheme
            # pylint: disable=protected-access
            url = '{scheme}:{where}'.format(scheme=self._scheme, where=url)
            self._base_url = url  # pylint: disable=protected-access
        elif url.startswith(r'?'):
            # '?page=rss'
            url = '/' + url  # -> '/?page=rss'
            url = urljoin(self._base_url, url)  # pylint: disable=protected-access
        else:
            # '/?page=rss' 'page=rss'
            url = urljoin(self._base_url, url)  # pylint: disable=protected-access

        # The headers describe the *previous* request, so they are built from
        # `_last_url` before it is overwritten below.
        # pylint: disable=protected-access
        u = URL(self._last_url)

        # pylint: disable=protected-access
        self._headers.update({
            # HTTP/2 Headers lowercase only
            'origin': str(u.replace(full_path='')),
            'referer': self._last_url
        })

        self._last_url = url  # pylint: disable=protected-access
        return fn(self, url, **kwargs)
Esempio n. 31
0
class Scanner(object):
    """Probe a host for exposed version-control metadata files.

    For every scheme in ``SCHEMES`` and every entry in ``FINGERPRINTS`` the
    scanner requests ``<base>/<file>`` on the host and accepts the first
    response whose status is 200 and whose body passes
    ``_filter_false_positive``.
    """

    # Each fingerprint gives the reported type, the metadata directory
    # ("base") and the files inside that directory which are requested.
    FINGERPRINTS = [
        {
            "type": "git",
            "base": ".git",
            "files": ["index"]
        },
        {
            "type": "svn",
            "base": ".svn",
            "files": ["wc.db"]
        },
        {
            "type": "svn_old",
            "base": ".svn",
            "files": ["entries"]
        },
        # Disabled fingerprint, kept for reference:
        #{
        #    "type": "hg",
        #    "base": ".hg",
        #    "files": ["store/00manifest.i"]
        #}
    ]

    SCHEMES = ["HTTP", "HTTPS"]

    def __init__(self, host):
        """
        Args:
            host: Host URL; its path, query and fragment are cleared
        """
        self.host = URL(host).replace(path="", query="", fragment="")
        self.session = HTTP()
        self.session.headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"

    def scan_host(self):
        """Request every fingerprint file over every scheme.

        Returns:
            dict with keys ``file``, ``type``, ``scheme``, ``data`` and
            ``host`` for the first verified hit, or ``False`` when nothing
            matched.
        """
        for scheme in self.SCHEMES:
            for fingerprint in self.FINGERPRINTS:
                for filename in fingerprint['files']:
                    url = str(self.host.replace(
                        path=fingerprint['base'] + "/" + filename,
                        scheme=scheme))

                    # verify=False is handed to the session unchanged
                    # (presumably disables TLS certificate checks - confirm
                    # against the HTTP class in use).
                    response = self.session.get(url, verify=False)

                    if response.status_code != 200:
                        continue
                    if not self._filter_false_positive(response.content,
                                                       fingerprint['type']):
                        # File exists, but failed verification
                        continue

                    return {
                        "file": filename,
                        "type": fingerprint['type'],
                        "scheme": scheme,
                        "data": response.content,
                        "host": self.host.replace(scheme=scheme)
                    }

        return False

    def _filter_false_positive(self, data, type):
        """Check that ``data`` looks like the metadata file for ``type``.

        Args:
            data: Response body (bytes, or text which is encoded first)
            type: Fingerprint type ("git", "svn", "svn_old" or "hg")

        Returns:
            bool: ``False`` for HTML pages or bodies lacking the expected
            signature, ``True`` otherwise (including unknown types).
        """
        # Compare as bytes so the checks work on raw response bodies under
        # both Python 2 and Python 3.
        if not isinstance(data, bytes):
            data = data.encode("utf-8", "replace")

        # A full HTML document is most likely a custom error/landing page
        if b"<html" in data and b"</html>" in data:
            return False

        if type == "git":
            return data.startswith(b"DIRC")

        if type == "svn_old":
            return b"dir" in data and b"file" in data

        if type == "svn":
            return data.startswith(b"SQLite format")

        if type == "hg":
            return data.startswith(b".hgtag")

        return True
Esempio n. 32
0
class TumblrBlog:
    """A Tumblr blog whose metadata and posts are loaded through the v2 API.

    Constructing an instance immediately performs the first API request.
    ``posts()`` then yields the buffered posts and fetches further pages on
    demand.
    """

    # Characters left unescaped when the blog URL is sent as Referer, so the
    # URL keeps its structure (quote() would otherwise turn ':' into '%3A').
    _REFERER_SAFE = "/:?#[]@!$&'()*+,;=%"

    # Number of posts the API returned that could not be parsed; they are
    # never yielded but still have to be counted in the API offset.
    _skipped = 0
    # Number of raw posts contained in the most recently parsed API page.
    _page_size = 0

    def __init__(self, url, session=None, **kwargs):
        """
        Tumblr blog

        Args:
            url(URL|str): Tumblr profile URL
            session(Optional[Session]): An optional custom Requests session

        Keyword Args:
            api_key(str): Tumblr API key
            user_agent(str): Custom User-Agent header (``uagent`` is accepted as an alias)
        """
        self._url = url if isinstance(url, URL) else URL(url)
        self._api_url = URL(scheme='https', host='api.tumblr.com', path='/v2/')
        self._api_response = None  # type: Response
        self._api_key = kwargs.get('api_key', 'fuiKNFp9vQFvjLNvx4sUwti4Yb5yGutBN4Xh10LXZhhRKjWlV4')
        # The keyword was documented as ``uagent`` but read as ``user_agent``; honour both
        self._uagent = kwargs.get('user_agent', kwargs.get('uagent', 'tumdlr/{version}'))

        if not session:
            session = Session()
            session.headers.update({
                'Referer': urllib.parse.quote(self._url.as_string(), safe=self._REFERER_SAFE),
                'User-Agent': self._uagent.format(version=__version__)
            })

        self.session = session

        self.title          = None  # type: str
        self.url            = None  # type: URL
        self.name           = None  # type: str
        self.description    = None  # type: str
        self.is_nsfw        = None  # type: bool
        self.likes          = None  # type: int|False
        self.post_count     = None  # type: int
        self.updated        = None  # type: int

        self._posts = []
        self.offset = 0  # Number of posts yielded by posts() so far
        self._skipped = 0
        self._page_size = 0

        self._api_url = self._api_url.replace(
            path=self._api_url.path + 'blog/{host}/posts'.format(host=self._url.host)
        )
        self._api_get()

    def _api_get(self, query=None, parse=True):
        """
        Execute an API query

        Args:
            query(Optional[dict]): Extra query parameters
            parse(Optional[bool]): Parse the API response immediately

        Raises:
            Whatever ``response.raise_for_status()`` raises for a failed request
        """
        # Each extra parameter is appended as '&key=value'; values are passed
        # through str() first so numbers (e.g. a limit) can be given as-is.
        query_extra = ''
        if query:
            query_extra = ''.join(
                '&{key}={value}'.format(
                    key=urllib.parse.quote(str(key)),
                    value=urllib.parse.quote(str(value))
                )
                for key, value in query.items()
            )

        # Unparseable posts were consumed from the API listing as well, so
        # they count towards the position of the next page.
        endpoint = self._api_url.replace(
            query='api_key={api_key}&filter=text&offset={offset}{extra}'.format(
                api_key=self._api_key, offset=self.offset + self._skipped, extra=query_extra
            )
        )

        response = self.session.get(endpoint.as_string())  # type: Response
        response.raise_for_status()

        self._api_response = response
        if parse:
            self._api_parse_response()

    def _api_parse_response(self):
        """
        Parse an API response

        Refreshes the blog metadata and appends a container object for every
        post in the response to the internal post buffer. Posts that raise
        ``TumdlrParserError`` are skipped and counted.
        """
        payload = self._api_response.json()['response']
        blog = payload['blog']

        self.title          = blog['title']
        self.url            = URL(blog['url'])
        self.name           = blog['name']
        self.description    = blog['description']
        self.is_nsfw        = blog['is_nsfw']
        self.likes          = blog.get('likes', False)  # Returned only if sharing of likes is enabled
        self.post_count     = blog['posts']
        self.updated        = blog['updated']

        posts = payload['posts']
        self._page_size = len(posts)

        for post in posts:
            if post['type'] in ('photo', 'link'):
                container = TumblrPhotoSet
            elif post['type'] == 'video':
                container = TumblrVideoPost
            else:
                container = TumblrPost

            try:
                self._posts.append(container(post, self))
            except TumdlrParserError:
                self._skipped += 1

    def posts(self):
        """
        Yields:
            TumblrPost
        """
        while True:
            # Out of posts? Keep querying until something parseable arrives;
            # a page in which every post failed to parse is not the end.
            while not self._posts:
                self._api_get()

                if not self._page_size:
                    # The API returned no posts at all, we've queried everything
                    return

            # Pop our next post and increment the offset
            post = self._posts.pop(0)
            self.offset += 1

            yield post
Esempio n. 33
0
class TumblrPost:
    """
    This is the base container class for all Tumblr post types. It contains data that is always available with any
    type of post.

    Additional supported post types may extend this class to provide additional metadata parsing
    """
    def __init__(self, post, blog):
        """
        Args:
            post(dict): API response
            blog(tumdlr.api.TumblrBlog): Parent blog

        Raises:
            TumdlrParserError: If the post data could not be parsed
        """
        self._post = post
        self.blog = blog
        self.log = logging.getLogger('tumdlr.containers.post')

        self.id         = None  # type: int
        self.type       = None  # type: str
        self.url        = None  # type: URL
        self.tags       = set()
        self.post_date  = None  # type: str
        self.note_count = None  # type: int

        self.files = []

        try:
            self._parse_post()
        except Exception as e:
            # Logger.warn() is a deprecated alias; warning() is the supported name
            self.log.warning('Failed to parse post data: %r', self, exc_info=e)
            # Any parsing failure is reported as a parser error, keeping the original cause attached
            raise TumdlrParserError(post_data=post) from e

    @property
    def is_text(self):
        """
        Returns:
            bool
        """
        return self.type == 'text'

    @property
    def is_photo(self):
        """
        Returns:
            bool
        """
        # Link posts are treated the same as photo posts
        return self.type in ('photo', 'link')

    @property
    def is_video(self):
        """
        Returns:
            bool
        """
        return self.type == 'video'

    def _parse_post(self):
        """
        Copy the common fields from the API response onto this object.

        ``id``, ``type`` and ``date`` are required (a missing key raises ``KeyError``); ``post_url``, ``tags`` and
        ``note_count`` are optional.
        """
        self.id         = self._post['id']
        self.type       = self._post['type']
        self.url        = URL(self._post['post_url']) if 'post_url' in self._post else None
        self.tags       = set(self._post.get('tags', []))
        self.note_count = self._post.get('note_count')
        self.post_date  = self._post['date']

    def __repr__(self):
        return "<TumblrPost id='{id}' type='{type}' url='{url}'>"\
            .format(id=self.id, type=self.type, url=self.url)

    def __str__(self):
        # The post URL, or an empty string when the API gave none
        return self.url.as_string() if self.url else ''
Esempio n. 34
0
 def test_full_path(self):
     """``full_path`` of each relative reference must equal the string it was parsed from."""
     samples = (
         '', 'path', 'path?query', 'path#fragment',
         'path?query#fragment', '?query', '#fragment', '?query#fragment',
     )
     for raw in samples:
         parsed = URL(raw)
         self.assertEqual(parsed.full_path, raw)
Esempio n. 35
0
 def __init__(self, host):
     """Keep the host URL with path, query and fragment cleared and set up the HTTP session."""
     parsed = URL(host)
     self.host = parsed.replace(path="", query="", fragment="")

     self.session = HTTP()
     headers = self.session.headers
     headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"
Esempio n. 36
0
 def test_str(self):
     """``str()`` must round-trip parsed URLs; ``validate()`` must disambiguate tricky paths."""
     # NOTE: '//host/' and 'scheme://host' used to be fused into a single
     # string by a missing comma, so neither case was actually exercised.
     for url in [
             '', '//host', '//host/',
             'scheme://host', '//host/path', '?query', 'path?query',
             'http:', 'http:?query', '//host?query'
     ]:
         self.assertEqual(str(URL(url)), url)
         self.assertEqual(URL(str(URL(url))), URL(url))
     # a leading slash is added to the path when a host is present
     self.assertEqual(str(URL(host='host', path='path')), '//host/path')
     self.assertEqual(str(URL(host='host', path='//path')), '//host//path')
     # without a host, validate() prefixes paths that would otherwise be
     # read as an authority ('//...') or as a scheme ('pa:th')
     self.assertEqual(str(URL(path='//path').validate()), '////path')
     self.assertEqual(str(URL(path='//pa:th').validate()), '////pa:th')
     self.assertEqual(str(URL(path='pa:th').validate()), './pa:th')
     self.assertEqual(str(URL(path='not/pa:th').validate()), 'not/pa:th')
     self.assertEqual(str(URL(path='pa:th/not').validate()), './pa:th/not')
Esempio n. 37
0
 def test_ip(url, host_ip, host_ipv4):
     """Assert both IP-detection predicates of ``url`` against the expected flags."""
     actual_ip = URL(url).is_host_ip()
     self.assertEqual(actual_ip, host_ip)

     actual_ipv4 = URL(url).is_host_ipv4()
     self.assertEqual(actual_ipv4, host_ipv4)
Esempio n. 38
0
 def test_valid(url, relative, relative_path):
     """Assert both relativity predicates of ``url`` against the expected flags."""
     actual_relative = URL(url).is_relative()
     self.assertEqual(actual_relative, relative)

     actual_relative_path = URL(url).is_relative_path()
     self.assertEqual(actual_relative_path, relative_path)
Esempio n. 39
0
    def test_replace(self):
        """Exercise ``replace()`` per field, for the authority and for the full path."""
        samples = [
            URL('htttp://[email protected]:8080/path?query#fragment'),
            URL(),
            URL('path'),
            URL('//host:80'),
        ]
        for url in samples:
            # replace() always hands back a new object, equal when nothing changes
            self.assertFalse(url is url.replace(host='strange'))
            self.assertEqual(url, url.replace())

            for idx, (field, value) in enumerate(zip(url._fields, url._data)):
                # replacing a field with its current value is a no-op
                unchanged = url.replace(**{field: value})
                self.assertEqual(unchanged, url)

                # clearing a field leaves an empty string
                cleared = url.replace(**{field: ''})
                self.assertEqual(cleared[idx], '')

                # a new value is stored as given, except that a path gets a
                # leading slash when the URL has an authority
                if url.has_authority() and field == 'path':
                    expected = '/an'
                else:
                    expected = 'an'
                self.assertEqual(url.replace(**{field: 'an'})[idx], expected)

        # authority can be swapped out and restored
        authority_cases = [
            (URL('a://*****:*****@d:5/f?g#h'), 'blah'),
            (URL('a://blah/f?g#h'), ''),
        ]
        for url, authority in authority_cases:
            original = url.authority
            url = url.replace(authority=authority)
            self.assertEqual(url.authority, authority)
            url = url.replace(authority=original)
            self.assertEqual(url.authority, original)

        # full_path can be swapped out and restored
        full_path_cases = [
            (URL('a://*****:*****@d:5/f?g#h'), candidate)
            for candidate in ('', '/path', '/path?qr', '?qr', '?qr#fr', '#fr')
        ]
        full_path_cases.append((URL('a://*****:*****@d:5'), '/path'))
        for url, full_path in full_path_cases:
            original = url.full_path
            url = url.replace(full_path=full_path)
            self.assertEqual(url.full_path, full_path)
            url = url.replace(full_path=original)
            self.assertEqual(url.full_path, original)
Esempio n. 40
0
    def test_setdefault(self):
        """``setdefault()`` must fill empty fields and leave populated ones untouched."""
        empty = URL()
        full1 = URL('scheme://user@host:80/path?query#frgment')
        full2 = URL('an://oth@er:33/full?url#!!')

        # Positional form: an empty URL takes every default, a full URL takes none
        self.assertEqual(empty.setdefault(*full1._data), full1)
        self.assertEqual(full1.setdefault(*full2._data), full1)

        # Keyword form, one field at a time
        for idx, (field, value) in enumerate(zip(full1._fields, full1._data)):
            # on an empty URL, setdefault behaves like replace
            self.assertEqual(empty.setdefault(**{field: value}),
                             empty.replace(**{field: value}))
            self.assertEqual(empty.setdefault(**{field: value})[idx], value)
            # on a populated URL, the existing field value wins
            self.assertEqual(
                full2.setdefault(**{field: value})[idx], full2[idx])