Exemple #1
0
    def test_strict_escape(self):
        '''Test strict mode escaping'''
        # (bad, good) pairs: parsing `bad` and strict-escaping it must
        # produce exactly `good` (already-escaped sequences are preserved,
        # spaces and non-ASCII become percent-encoded UTF-8).
        examples = [
            ('danny%27s pub'                , 'danny%27s%20pub'                  ),
            ('http://*****:*****@foo.com'     , 'http://*****:*****@foo.com'         ),
            (u'http://José:no [email protected]'  , 'http://Jos%C3%A9:no%[email protected]'),
            ('http://oops!:don%[email protected]' , 'http://oops!:don%[email protected]'     ),
            (u'española,nm%2cusa.html?gunk=junk+glunk&foo=bar baz',
                'espa%C3%B1ola,nm%2Cusa.html?gunk=junk+glunk&foo=bar%20baz')
        ]

        base = 'http://testing.com/'
        for bad, good in examples:
            bad = base + bad
            good = base + good
            self.assertEqual(url.parse(bad).escape(strict=True).utf8(), good)
            # Escaping should also be idempotent
            self.assertEqual(
                url.parse(bad).escape(strict=True).escape(strict=True).utf8(), good)

        # Examples with userinfo: the userinfo section must not be
        # double-escaped either.
        examples = [
            ('http://user%[email protected]/', 'http://user%[email protected]/')
        ]
        for bad, good in examples:
            self.assertEqual(url.parse(bad).escape(strict=True).utf8(), good)
            # Escaping should also be idempotent
            self.assertEqual(
                url.parse(bad).escape(strict=True).escape(strict=True).utf8(), good)

        # Test Unicode escaping in strict mode: after escape() the path
        # must be a native (byte) string, percent-encoded as UTF-8.
        u = url.URL(u'http', u'foo.com', None, u'española,nm%2cusa.html', u'', u'gunk=junk+glunk&foo=bar baz', u'')
        u.escape(strict=True)
        self.assertTrue(isinstance(u._path, str))
        self.assertEqual(u._path, 'espa%C3%B1ola,nm%2Cusa.html')
Exemple #2
0
 def test(rules, example, pld, tld):
     '''Verify pld/tld of *example* while *rules* is the active PSL.'''
     try:
         url.set_psl(rules)
         assert_equal(url.parse(example).pld, pld)
         assert_equal(url.parse(example).tld, tld)
     finally:
         # Always restore the library's bundled PSL snapshot so later
         # tests see the default rules.
         url.set_psl(pkgutil.get_data('url', 'psl/2016-08-16.psl'))
Exemple #3
0
 def test(rules, example, pld, tld):
     '''Verify pld/tld of *example* while *rules* is the active PSL.'''
     try:
         url.set_psl(rules)
         assert_equal(url.parse(example).pld, pld)
         assert_equal(url.parse(example).tld, tld)
     finally:
         # Always restore the library's bundled PSL snapshot so later
         # tests see the default rules.
         url.set_psl(pkgutil.get_data('url', 'psl/2016-08-16.psl'))
Exemple #4
0
 def __init__(self, target, **params):
     '''Store the parsed host URL and request parameters.

     *target* may be a URL string or a (host, port) tuple/list.
     '''
     if isinstance(target, basestring):
         parsed = url.parse(target)
     elif isinstance(target, (tuple, list)):
         # Build an http URL out of the (host, port) pair.
         parsed = url.parse('http://%s:%s/' % target)
     else:
         raise TypeError('Host must be a string or tuple')
     self._host = parsed
     self._params = params
Exemple #5
0
 def test(uni, puny):
     '''Punycode `uni` and compare with `puny`; check both directions.'''
     assert_equal(url.parse(uni).escape().punycode().utf8(), puny)
     # Also make sure punycode is idempotent
     assert_equal(
         url.parse(uni).escape().punycode().punycode().utf8(), puny)
     # Make sure that we can reverse the procedure correctly
     assert_equal(
         url.parse(uni).escape().punycode().unpunycode().unescape(), uni)
     # And we get what we'd expect going the opposite direction
     assert_equal(url.parse(puny).unescape().unpunycode().unicode(), uni)
Exemple #6
0
    def test_str_repr(self):
        '''str() and repr() render the parsed URL in a readable form.'''
        cases = (
            ('http://foo.com/', 'http://foo.com/'),
            ('http://FOO.com/', 'http://foo.com/'),
        )
        for raw, rendered in cases:
            self.assertEqual(str(url.parse(raw)), rendered)
            self.assertEqual(
                repr(url.parse(raw)), '<url.URL object "%s" >' % rendered)
Exemple #7
0
    def test_str_repr(self):
        '''str() and repr() render the parsed URL in a readable form.'''
        cases = (
            ('http://foo.com/', 'http://foo.com/'),
            ('http://FOO.com/', 'http://foo.com/'),
        )
        for raw, rendered in cases:
            self.assertEqual(str(url.parse(raw)), rendered)
            self.assertEqual(
                repr(url.parse(raw)), '<url.URL object "%s" >' % rendered)
Exemple #8
0
 def test(example):
     '''Punycode round-trips for an already-ASCII `example` URL.'''
     assert_equal(url.parse(example).escape().punycode().unicode, example)
     # Also make sure punycode is idempotent
     assert_equal(
         url.parse(example).escape().punycode().punycode().unicode, example)
     # Make sure that we can reverse the procedure correctly
     assert_equal(
         url.parse(example).escape().punycode().unpunycode().unescape(),
         example)
     # And we get what we'd expect going the opposite direction
     assert_equal(
         url.parse(example).unescape().unpunycode().unicode, example)
Exemple #9
0
 def test_empty_hostname(self):
     '''URLs with an empty hostname parse and round-trip unchanged.'''
     cases = (
         'http:///path',
         'http://userinfo@/path',
         'http://:80/path',
     )
     for example in cases:
         # The parsed form compares equal to the raw string...
         self.assertEqual(url.parse(example), example)
         # ...and serialises back to exactly the same text.
         self.assertEqual(url.parse(example).utf8(), example)
Exemple #10
0
 def test_empty_hostname(self):
     '''URLs with an empty hostname parse and round-trip unchanged.'''
     cases = (
         'http:///path',
         'http://userinfo@/path',
         'http://:80/path',
     )
     for example in cases:
         # The parsed form compares equal to the raw string...
         self.assertEqual(url.parse(example), example)
         # ...and serialises back to exactly the same text.
         self.assertEqual(url.parse(example).utf8(), example)
Exemple #11
0
 def test(uni, puny, upuny, epuny):
     '''Check escape/punycode conversions between unicode and ASCII forms.

     NOTE(review): `upuny` is accepted but unused here -- presumably kept
     for signature parity with the test-case tuples; confirm upstream.
     '''
     assert_equal(url.parse(uni).escape().punycode().utf8(), epuny)
     # Also make sure punycode is idempotent
     assert_equal(
         url.parse(uni).escape().punycode().punycode().utf8(), epuny)
     # Make sure that we can reverse the procedure correctly
     assert_equal(
         url.parse(uni).escape().punycode().unpunycode().unescape(),
         uni)
     # And we get what we'd expect going the opposite direction
     assert_equal(
         url.parse(puny).unescape().unpunycode().unicode(), uni)
Exemple #12
0
    def _initialize_crawl(self):
        '''Prepare the crawl: verify gevent monkey-patching, resume the
        queue, and seed it with the (redirect-resolved) home page when
        both the seen-cache and the queue are empty.'''
        if self.crawl_requires_gevent:
            from gevent.monkey import saved
            if 'socket' not in saved:
                # we're not gevent-monkey-patched
                raise RuntimeError(
                    "Spider.crawl() needs gevent monkey patching to have been applied"
                )

        self._resume_queue()

        # An empty `seen` table together with an empty queue means nothing
        # has been crawled yet in this storage.
        if len(
                list(
                    self._scraper.cache_storage._conn.execute(
                        "SELECT * FROM seen LIMIT 1"))
        ) == 0 and self._queue.empty():
            # we're at the beginning, so start with the home page
            # follow any homepage redirects, so we get the right protocol and domain
            tmp_response = requests.get("http://%s/" % self.domain)

            first_url = moz_url.parse(tmp_response.url)
            # The redirect target's host may differ from self.domain, so
            # make sure it is crawlable.
            if first_url._host not in self._allowed_hosts:
                self._allowed_hosts.add(first_url._host)

            self._add_to_queue(first_url)
Exemple #13
0
def _response_to_features(response):
    """Extract a feature set from an HTML response.

    Features include tag names, CSS classes, element ids, URL path
    segments (keyed by their position) and query-string keys/values.
    Returns a set of feature strings.
    """
    features = set()
    tree = etree.HTML(response.text)

    for item in tree.iter(tag=etree.Element):
        features.add("tag-%s" % item.tag)

        if "class" in item.attrib and item.attrib["class"].strip():
            classes = whitespace.split(item.attrib["class"])
            for _c in classes:
                c = _c.strip()
                if c:
                    features.add("class-%s" % c)

        if "id" in item.attrib:
            features.add("id-%s" % item.attrib["id"])

    # path parts
    u = moz_url.parse(response.url)
    path = u._path.split("/")[1:]
    for idx, part in enumerate(path):
        # Bug fix: emit the individual segment (`part`), not the whole
        # `path` list, so each position contributes a distinct feature.
        features.add("path-%s-%s" % (idx, part))

    if u._query:
        for k, vl in urlparse.parse_qs(u._query).iteritems():
            features.add("qse-%s" % k)
            for v in vl:
                features.add("qsv-%s-%s" % (k, v))

    return features
Exemple #14
0
def canonical_url(uri):
    """
    Return the canonical representation of a given URI.
    This assumes the `uri` has a scheme.

    * When a default port corresponding for the scheme is explicitly declared
      (such as port 80 for http), the port will be removed from the output.
    * Fragments '#' are not removed.
     * Params and query string arguments are not reordered.

    Returns None when the URI cannot be parsed, lacks a scheme or host,
    or when any canonicalization step raises.
    """
    try:
        parsed = urlpy.parse(uri)
        if not parsed:
            return
        # Both a scheme and a host are required for a canonical form.
        if not (getattr(parsed, '_scheme', None)
                and getattr(parsed, '_host', None)):
            return

        if TRACE: logger_debug('canonical_url: parsed:', parsed)
        sanitized = parsed.sanitize()
        if TRACE:
            logger_debug('canonical_url: sanitized:', sanitized)

        punycoded = sanitized.punycode()
        if TRACE:
            logger_debug('canonical_url: punycoded:', punycoded)

        # Drop the port when it is just the scheme's default (e.g. 80/http).
        if punycoded._port == urlpy.PORTS.get(punycoded._scheme, None):
            punycoded._port = None
        return punycoded.utf8()
    except Exception as e:
        if TRACE:
            logger_debug('canonical_url: failed for:', uri, 'with:', repr(e))
        # ignore it: best-effort canonicalization, caller gets None
        pass
Exemple #15
0
def _response_to_features(response):
    """Extract a feature set from an HTML response.

    Features include tag names, CSS classes, element ids, URL path
    segments (keyed by their position) and query-string keys/values.
    Returns a set of feature strings.
    """
    features = set()
    tree = etree.HTML(response.text)

    for item in tree.iter(tag=etree.Element):
        features.add("tag-%s" % item.tag)

        if 'class' in item.attrib and item.attrib['class'].strip():
            classes = whitespace.split(item.attrib['class'])
            for _c in classes:
                c = _c.strip()
                if c:
                    features.add("class-%s" % c)

        if 'id' in item.attrib:
            features.add("id-%s" % item.attrib['id'])

    # path parts
    u = moz_url.parse(response.url)
    path = u._path.split("/")[1:]
    for idx, part in enumerate(path):
        # Bug fix: emit the individual segment (`part`), not the whole
        # `path` list, so each position contributes a distinct feature.
        features.add('path-%s-%s' % (idx, part))

    if u._query:
        for k, vl in urlparse.parse_qs(u._query).iteritems():
            features.add('qse-%s' % k)
            for v in vl:
                features.add('qsv-%s-%s' % (k, v))

    return features
Exemple #16
0
    def test_abspath(self):
        '''abspath() collapses '.', '..' and repeated slashes.'''
        cases = (
            ('howdy', 'howdy'),
            ('hello//how//are', 'hello/how/are'),
            ('hello/../how/are', 'how/are'),
            ('hello//..//how/', 'how/'),
            ('a/b/../../c', 'c'),
            ('../../../c', 'c'),
            ('./hello', 'hello'),
            ('./././hello', 'hello'),
            ('a/b/c/', 'a/b/c/'),
            ('a/b/c/..', 'a/b/'),
            ('a/b/.', 'a/b/'),
            ('a/b/./././', 'a/b/'),
            ('a/b/../', 'a/'),
            ('.', ''),
            ('../../..', ''),
            ('////foo', 'foo'),
        )
        prefix = 'http://testing.com/'
        for messy, resolved in cases:
            self.assertEqual(
                url.parse(prefix + messy).abspath().utf8(), prefix + resolved)
Exemple #17
0
    def filter_links(self, links):
        '''Split crawl links: return directory-like links (to keep
        following) and enqueue normalized video-file URLs into redis for
        later indexing.'''
        filteredLinks = []

        for link in links:
            # if link is a directory, then follow it
            # TODO: "?dir=" is provided to properly recognize as dirs the ones which
            #       are specified as queries (it is an in-place fix and should be removed)
            if link.url.endswith("/") or link.url.find("?dir=")>=0:
                filteredLinks.append(link)

            # if not, verify whether it is a video file: if it is then save it, otherwise skip
            else:
                if isVideoURL(link.url):
                    # normalize the URL so equivalent forms dedupe to the
                    # same redis key
                    normLinkURL = url.parse(link.url).canonical().escape().punycode().utf8()

                    # save the url... but only if it has not been indexed yet
                    # check if the URL exists in redis
                    if not self.r.exists(normLinkURL):
                        # if not, add it to the toIndex queue
                        # (NOTE: it might be already present in toIndex, but we don't mind as it is a set)
                        self._logger.info("sadd %s %s " % (self._conf['key_toIndex'],normLinkURL))
                        self.r.sadd(self._conf['key_toIndex'], normLinkURL)
        return filteredLinks
Exemple #18
0
    def test_abspath(self):
        '''abspath() collapses '.', '..' and repeated slashes.'''
        cases = (
            ('howdy', 'howdy'),
            ('hello//how//are', 'hello/how/are'),
            ('hello/../how/are', 'how/are'),
            ('hello//..//how/', 'how/'),
            ('a/b/../../c', 'c'),
            ('../../../c', 'c'),
            ('./hello', 'hello'),
            ('./././hello', 'hello'),
            ('a/b/c/', 'a/b/c/'),
            ('a/b/c/..', 'a/b/'),
            ('a/b/.', 'a/b/'),
            ('a/b/./././', 'a/b/'),
            ('a/b/../', 'a/'),
            ('.', ''),
            ('../../..', ''),
            ('////foo', 'foo'),
        )
        prefix = 'http://testing.com/'
        for messy, resolved in cases:
            self.assertEqual(
                url.parse(prefix + messy).abspath().utf8(), prefix + resolved)
def save(url_, path="", wait=60):
    """Download *url_* and write it to *path*; return the path used.

    Accepts either a URL string or an object exposing a ``url`` attribute.
    When *path* does not look like a filename (too short, or no '.' near
    the end), the filename derived from the URL is appended to it.
    """
    if hasattr(url_, "url"):
        url_ = url_.url
    if len(path) < 5 or "." not in path[-5:-3]:
        # Derive a filename from the URL itself; avoid shadowing the
        # `file` builtin while we're at it.
        filename = url.parse(str(url_)).filename
        path = os.path.join(path, filename)
    # Context manager guarantees the handle is closed even on error.
    with open(path, "w") as handle:
        handle.write(download(url_, wait))
    return path
Exemple #20
0
def save(url_, path="", wait=60):
    """Download *url_* and write it to *path*; return the path used.

    Accepts either a URL string or an object exposing a ``url`` attribute.
    When *path* does not look like a filename (too short, or no '.' near
    the end), the filename derived from the URL is appended to it.
    """
    if hasattr(url_, "url"):
        url_ = url_.url
    if len(path) < 5 or "." not in path[-5:-3]:
        # Derive a filename from the URL itself; avoid shadowing the
        # `file` builtin while we're at it.
        filename = url.parse(str(url_)).filename
        path = os.path.join(path, filename)
    # Context manager guarantees the handle is closed even on error.
    with open(path, "w") as handle:
        handle.write(download(url_, wait))
    return path
Exemple #21
0
 def test_deuserinfo(self):
     '''deuserinfo() strips any credentials from the URL.'''
     cases = (
         ('http://*****:*****@foo.com/', 'http://foo.com/'),
         ('http://[email protected]/', 'http://foo.com/'),
     )
     for withinfo, stripped in cases:
         self.assertEqual(url.parse(withinfo).deuserinfo().utf8(), stripped)
Exemple #22
0
 def test_deuserinfo(self):
     '''deuserinfo() strips any credentials from the URL.'''
     cases = (
         ('http://*****:*****@foo.com/', 'http://foo.com/'),
         ('http://[email protected]/', 'http://foo.com/'),
     )
     for withinfo, stripped in cases:
         self.assertEqual(url.parse(withinfo).deuserinfo().utf8(), stripped)
Exemple #23
0
 def test_tld(self):
     '''tld() returns the top-level domain; relative URLs yield ''.'''
     cases = (
         ('http://foo.com/bar', 'com'),
         ('http://bar.foo.com/bar', 'com'),
         ('/foo', ''),
     )
     for link, expected in cases:
         self.assertEqual(url.parse(link).tld(), expected)
Exemple #24
0
 def test_tld(self):
     '''tld() returns the top-level domain; relative URLs yield ''.'''
     cases = (
         ('http://foo.com/bar', 'com'),
         ('http://bar.foo.com/bar', 'com'),
         ('/foo', ''),
     )
     for link, expected in cases:
         self.assertEqual(url.parse(link).tld(), expected)
Exemple #25
0
def handle(request):
    '''Dispatch an aiohttp request to a dynamically imported handler
    module `api.<project>.<module>.<action>`, manage the session cookie,
    and build the JSON or plain-text response.'''

    # URL Parsing
    # Build the dotted import path from the sanitised (alphanumeric-only)
    # route components.
    s = "api."
    if 'project' in request.match_info:
        p = re.sub('[^0-9a-zA-Z]+', '', request.match_info['project'])
        s += p + "."
    if 'module' in request.match_info:
        p = re.sub('[^0-9a-zA-Z]+', '', request.match_info['module'])
        s += p + "."
    if 'action' in request.match_info:
        p = re.sub('[^0-9a-zA-Z]+', '', request.match_info['action'])
        s += p
    if verbose_logging:
        print("Incoming request - " + str(request.raw_path))

    # Session loading
    # Reuse an existing session when a 32-char AIOHTTP_SESSION cookie is
    # present; otherwise create a fresh one.
    if "cookie" in request.headers and "AIOHTTP_SESSION" in request.headers[
            "cookie"]:
        cookies = request.headers.getall("cookie")[0]
        aiocookie = re.search("AIOHTTP_SESSION=([0-9a-z]{32})", cookies)
        if aiocookie:
            session = sessions.Session(aiocookie.group(1))
            if verbose_logging:
                print("Session ID - " + session.id)
        else:
            session = sessions.Session()
            if verbose_logging:
                print("New Session with ID - " + session.id)
    else:
        session = sessions.Session()
        if verbose_logging:
            print("New Session with ID - " + session.id)

    # Response building
    try:
        module = importlib.import_module(s)
        # NOTE(review): this `url.parse` appears to match the query string
        # against the module's declared arguments -- confirm in the helper.
        arguments = url.parse(request.GET, module.arguments)
        response = yield from module.process(session, arguments)
        headers = response.get('headers', {})
        if session.is_new_session:
            headers.update({"Set-Cookie": "AIOHTTP_SESSION=" + session.id})
        if 'json' in response:
            # Dump to json if the module wants to return json
            return respond(headers=headers,
                           status=response.get('status', 200),
                           text=json.dumps(response.get("json", "")),
                           content_type="application/json")
        else:
            return respond(headers=headers,
                           status=response.get('status', 200),
                           text=response.get("text", ""))
    except ImportError:
        # Unknown project/module/action -> 404.
        return respond(status=404, text="Page does not exist")
    except Exception as e:
        # Surface handler failures as a 500, with details only when enabled.
        s = str(e.args) if verbose_errors else "No verbose errors."
        return respond(status=500, text="Error while querying data.\n" + s)
Exemple #26
0
 def test(first, second):
     '''equiv() is symmetric, reflexive, and accepts strings or URLs.'''
     # Object-to-object equivalence.
     assert url.parse(first).equiv(url.parse(second))
     # Object-to-string equivalence.
     assert url.parse(first).equiv(second)
     # Symmetry, object and string forms.
     assert url.parse(second).equiv(url.parse(first))
     assert url.parse(second).equiv(first)
     # Every URL is equivalent to itself.
     assert url.parse(first).equiv(first)
     assert url.parse(second).equiv(second)
Exemple #27
0
 def test(first, second):
     '''equiv() is symmetric, reflexive, and accepts strings or URLs.'''
     # Object-to-object equivalence.
     assert url.parse(first).equiv(url.parse(second))
     # Object-to-string equivalence.
     assert url.parse(first).equiv(second)
     # Symmetry, object and string forms.
     assert url.parse(second).equiv(url.parse(first))
     assert url.parse(second).equiv(first)
     # Every URL is equivalent to itself.
     assert url.parse(first).equiv(first)
     assert url.parse(second).equiv(second)
Exemple #28
0
    def test_absolute(self):
        '''absolute() distinguishes absolute from relative URLs.'''
        cases = (
            ('http://foo.com/bar', True),
            ('foo/', False),
            ('http://foo.com', True),
            ('/foo/bar/../', False),
        )
        for candidate, expected in cases:
            self.assertEqual(url.parse(candidate).absolute(), expected)
Exemple #29
0
    def test_escape(self):
        '''Make sure we escape paths correctly'''
        # (bad, good) pairs relative to `base`; already-escaped sequences
        # must be preserved rather than double-escaped.
        examples = [
            ('hello%20and%20how%20are%20you', 'hello%20and%20how%20are%20you'),
            ('danny\'s pub'                 , 'danny%27s%20pub'              ),
            ('danny%27s pub?foo=bar&yo'     , 'danny%27s%20pub?foo=bar&yo'   ),
            # Thanks to @myronmarston for these test cases
            ('foo?bar none=foo bar'         , 'foo?bar%20none=foo%20bar'     ),
            ('foo;a=1;b=2?a=1&b=2'          , 'foo;a=1;b=2?a=1&b=2'          ),
            ('foo?bar=["hello","howdy"]'    ,
                'foo?bar=%5B%22hello%22,%22howdy%22%5D'),
        ]

        base = 'http://testing.com/'
        for bad, good in examples:
            bad = base + bad
            good = base + good
            self.assertEqual(url.parse(bad).escape().utf8(), good)
            # Escaping should also be idempotent
            self.assertEqual(url.parse(bad).escape().escape().utf8(), good)
Exemple #30
0
 def test_lower(self):
     '''Hostnames are lowercased on parse; paths keep their case.'''
     cases = (
         ('www.TESTING.coM', 'www.testing.com/'),
         ('WWW.testing.com', 'www.testing.com/'),
         ('WWW.testing.com/FOO', 'www.testing.com/FOO'),
     )
     for raw, expected in cases:
         self.assertEqual(
             url.parse('http://' + raw).utf8(), 'http://' + expected)
Exemple #31
0
 def test_userinfo(self):
     '''Userinfo sections survive parsing untouched.'''
     cases = (
         ('http://*****:*****@foo.com', 'http://*****:*****@foo.com'),
         ('http://[email protected]', 'http://[email protected]'),
     )
     for raw, expected in cases:
         self.assertEqual(
             url.parse(raw + '/page.html').utf8(), expected + '/page.html')
Exemple #32
0
 def test_lower(self):
     '''Hostnames are lowercased on parse; paths keep their case.'''
     cases = (
         ('www.TESTING.coM', 'www.testing.com/'),
         ('WWW.testing.com', 'www.testing.com/'),
         ('WWW.testing.com/FOO', 'www.testing.com/FOO'),
     )
     for raw, expected in cases:
         self.assertEqual(
             url.parse('http://' + raw).utf8(), 'http://' + expected)
Exemple #33
0
    def test_absolute(self):
        '''absolute() distinguishes absolute from relative URLs.'''
        cases = (
            ('http://foo.com/bar', True),
            ('foo/', False),
            ('http://foo.com', True),
            ('/foo/bar/../', False),
        )
        for candidate, expected in cases:
            self.assertEqual(url.parse(candidate).absolute(), expected)
Exemple #34
0
    def test_defrag(self):
        '''defrag() drops the fragment portion of a URL.'''
        prefix = 'http://testing.com/'
        for fragged, plain in [('foo#bar', 'foo')]:
            self.assertEqual(
                url.parse(prefix + fragged).defrag().utf8(), prefix + plain)
Exemple #35
0
    def test_sanitize(self):
        '''sanitize() combines path normalisation and escaping.'''
        prefix = 'http://testing.com/'
        for messy, clean in [('../foo/bar none', 'foo/bar%20none')]:
            self.assertEqual(
                url.parse(prefix + messy).sanitize().utf8(), prefix + clean)
Exemple #36
0
    def test_sanitize(self):
        '''sanitize() combines path normalisation and escaping.'''
        prefix = 'http://testing.com/'
        for messy, clean in [('../foo/bar none', 'foo/bar%20none')]:
            self.assertEqual(
                url.parse(prefix + messy).sanitize().utf8(), prefix + clean)
Exemple #37
0
 def test_userinfo(self):
     '''Userinfo sections survive parsing untouched.'''
     cases = (
         ('http://*****:*****@foo.com', 'http://*****:*****@foo.com'),
         ('http://[email protected]', 'http://[email protected]'),
     )
     for raw, expected in cases:
         self.assertEqual(
             url.parse(raw + '/page.html').utf8(), expected + '/page.html')
Exemple #38
0
    def test_defrag(self):
        '''defrag() drops the fragment portion of a URL.'''
        prefix = 'http://testing.com/'
        for fragged, plain in [('foo#bar', 'foo')]:
            self.assertEqual(
                url.parse(prefix + fragged).defrag().utf8(), prefix + plain)
Exemple #39
0
def url_host_domain(url):
    """
    Return a (host, domain) tuple for a URL, or (None, None) when the
    URL has no host. Assumes that the URL has a scheme.
    """
    result = urlpy.parse(url)
    hostname = result._host
    if not hostname:
        # No network location at all -- nothing to report.
        return None, None
    return hostname.lower(), result.pld().lower()
Exemple #40
0
    def test_canonical(self):
        '''canonical() sorts query and param arguments.'''
        prefix = 'http://testing.com/'
        cases = (
            ('?b=2&a=1&c=3', '?a=1&b=2&c=3'),
            (';b=2;a=1;c=3', ';a=1;b=2;c=3'),
        )
        for scrambled, ordered in cases:
            self.assertEqual(
                url.parse(prefix + scrambled).canonical().utf8(),
                prefix + ordered)
Exemple #41
0
    def test_canonical(self):
        '''canonical() sorts query and param arguments.'''
        prefix = 'http://testing.com/'
        cases = (
            ('?b=2&a=1&c=3', '?a=1&b=2&c=3'),
            (';b=2;a=1;c=3', ';a=1;b=2;c=3'),
        )
        for scrambled, ordered in cases:
            self.assertEqual(
                url.parse(prefix + scrambled).canonical().utf8(),
                prefix + ordered)
Exemple #42
0
def url_host_domain(url):
    """
    Return a (host, domain) tuple for a URL, or (None, None) when the
    URL has no host. Assumes that the URL has a scheme.
    """
    result = urlpy.parse(url)
    hostname = result._host
    if not hostname:
        # No network location at all -- nothing to report.
        return None, None
    return hostname.lower(), result.pld().lower()
Exemple #43
0
    def test_punycode(self):
        '''Make sure punycode encoding works correctly'''
        # (unicode, punycoded) pairs; non-ASCII path segments stay
        # percent-encoded rather than punycoded.
        examples = [
            (u'http://www.kündigen.de/',
                'http://www.xn--kndigen-n2a.de/'),
            (u'http://россия.иком.museum/',
                'http://xn--h1alffa9f.xn--h1aegh.museum/'),
            (u'http://россия.иком.museum/испытание.html',
                'http://xn--h1alffa9f.xn--h1aegh.museum/%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5.html')
        ]

        for uni, puny in examples:
            self.assertEqual(url.parse(uni).escape().punycode().utf8(), puny)
            # Also make sure punycode is idempotent
            self.assertEqual(
                url.parse(uni).escape().punycode().punycode().utf8(), puny)
            # Make sure that we can reverse the procedure correctly
            self.assertEqual(
                url.parse(uni).escape().punycode().unpunycode().unescape(),
                uni)
            # And we get what we'd expect going the opposite direction
            self.assertEqual(
                url.parse(puny).unescape().unpunycode().unicode(), uni)

        # Make sure that we can't punycode or unpunycode relative urls
        examples = ['foo', '../foo', '/bar/foo']
        for relative in examples:
            self.assertRaises(TypeError, url.parse(relative).punycode)
            self.assertRaises(TypeError, url.parse(relative).unpunycode)
Exemple #44
0
    def test_punycode(self):
        '''Make sure punycode encoding works correctly'''
        # (unicode, punycoded) pairs; non-ASCII path segments stay
        # percent-encoded rather than punycoded.
        examples = [
            (u'http://www.kündigen.de/',
                'http://www.xn--kndigen-n2a.de/'),
            (u'http://россия.иком.museum/',
                'http://xn--h1alffa9f.xn--h1aegh.museum/'),
            (u'http://россия.иком.museum/испытание.html',
                'http://xn--h1alffa9f.xn--h1aegh.museum/%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5.html')
        ]

        for uni, puny in examples:
            self.assertEqual(url.parse(uni).escape().punycode().utf8(), puny)
            # Also make sure punycode is idempotent
            self.assertEqual(
                url.parse(uni).escape().punycode().punycode().utf8(), puny)
            # Make sure that we can reverse the procedure correctly
            self.assertEqual(
                url.parse(uni).escape().punycode().unpunycode().unescape(),
                uni)
            # And we get what we'd expect going the opposite direction
            self.assertEqual(
                url.parse(puny).unescape().unpunycode().unicode(), uni)

        # Make sure that we can't punycode or unpunycode relative urls
        examples = ['foo', '../foo', '/bar/foo']
        for relative in examples:
            self.assertRaises(TypeError, url.parse(relative).punycode)
            self.assertRaises(TypeError, url.parse(relative).unpunycode)
Exemple #45
0
def normalize(url):
    """Normalise a URL string and return its UTF-8 form.

    Prefixes an http scheme when one is missing, lowercases the whole
    string, then defrags, resolves relative path segments, canonicalises
    and punycodes. Returns None when the URL has no network location.
    """
    if url[:4] != 'http':
        url = 'http://' + url
    url = url.lower()
    if urlparse(url).netloc:
        # Chain of normalisation steps from the Moz url library.
        return parse(url).defrag().abspath().canonical().punycode().utf8()
Exemple #46
0
def clean_url(u):
    """Normalise a URL string: strip params listed in `utm`, canonicalise,
    and recurse into an embedded http(s) redirect target when present."""
    u = url.parse(u)
    # In-place normalisation steps on the parsed URL object.
    u.deparam(utm)
    u.strip()
    u.canonical()
    u.abspath()
    u.unescape()
    # If the query embeds exactly one http(s) URL, clean that instead.
    https_param = get_http_param(u.query)
    if len(https_param)==1:
        u = https_param.pop()
        return clean_url(u)
    u = str(u)
    return u
Exemple #47
0
def normalize(url):
    """Normalise a URL string and return its UTF-8 form.

    Prefixes an http scheme when one is missing, lowercases the whole
    string, then defrags, resolves relative path segments, canonicalises
    and punycodes. Returns None when the URL has no network location.
    """
    if url[:4] != 'http':
        url = 'http://' + url
    url = url.lower()
    if urlparse(url).netloc:
        # Chain of normalisation steps from the Moz url library.
        return parse(url).defrag().abspath().canonical().punycode().utf8()
Exemple #48
0
 def test(bad, good, ugood, egood):
     '''Escaping `bad` must yield `good` (str), `egood` (utf8 bytes) and
     `ugood` (unicode); a second escape must change nothing.'''
     assert_equal(str(url.parse(bad).escape()), good)
     assert_equal(url.parse(bad).escape().utf8(), egood)
     assert_equal(url.parse(bad).escape().unicode(), ugood)
     # Escaping should also be idempotent
     assert_equal(str(url.parse(bad).escape().escape()), good)
     assert_equal(url.parse(bad).escape().escape().utf8(), egood)
     assert_equal(url.parse(bad).escape().escape().unicode(), ugood)
Exemple #49
0
    def test_strict_escape(self):
        '''Test strict mode escaping'''
        # (bad, good) pairs: parsing `bad` and strict-escaping it must
        # produce exactly `good` (already-escaped sequences preserved,
        # unnecessary escapes like %5F decoded, newlines percent-encoded).
        examples = [
            ('danny%27s pub'                , 'danny%27s%20pub'                  ),
            ('this%5Fand%5Fthat'            , 'this_and_that'                    ),
            ('http://*****:*****@foo.com'     , 'http://*****:*****@foo.com'         ),
            (u'http://José:no [email protected]'  , 'http://Jos%C3%A9:no%[email protected]'),
            ('http://oops!:don%[email protected]' , 'http://oops!:don%[email protected]'     ),
            (u'española,nm%2cusa.html?gunk=junk+glunk&foo=bar baz',
                'espa%C3%B1ola,nm%2Cusa.html?gunk=junk+glunk&foo=bar%20baz'),
            ('http://foo.com/bar\nbaz.html\n', 'http://foo.com/bar%0Abaz.html%0A'),
            ('http://foo.com/bar.jsp?param=\n/value%2F', 'http://foo.com/bar.jsp?param=%0A/value%2F'),
        ]

        base = 'http://testing.com/'
        for bad, good in examples:
            bad = base + bad
            good = base + good
            self.assertEqual(url.parse(bad).escape(strict=True).utf8(), good)
            # Escaping should also be idempotent
            self.assertEqual(
                url.parse(bad).escape(strict=True).escape(strict=True).utf8(), good)

        # Examples with userinfo: the userinfo section must not be
        # double-escaped either.
        examples = [
            ('http://user%[email protected]/', 'http://user%[email protected]/')
        ]
        for bad, good in examples:
            self.assertEqual(url.parse(bad).escape(strict=True).utf8(), good)
            # Escaping should also be idempotent
            self.assertEqual(
                url.parse(bad).escape(strict=True).escape(strict=True).utf8(), good)

        # Test Unicode escaping in strict mode: after escape() the path
        # must be a native (byte) string, percent-encoded as UTF-8.
        u = url.URL(u'http', u'foo.com', None, u'española,nm%2cusa.html', u'', u'gunk=junk+glunk&foo=bar baz', u'')
        u.escape(strict=True)
        self.assertTrue(isinstance(u._path, str))
        self.assertEqual(u._path, 'espa%C3%B1ola,nm%2Cusa.html')
Exemple #50
0
def test_component_assignment_unicode():
    '''Unicode values assigned to each URL component round-trip via .unicode.'''
    parsed = url.parse('http://[email protected]:80/path;params?query#fragment')
    # Overwrite every component in order; port takes an int, the rest unicode.
    assignments = [
        ('scheme', u'https'),
        ('userinfo', u'username'),
        ('host', u'foo.example.com'),
        ('port', 443),
        ('path', u'/another/path'),
        ('params', u'no-params'),
        ('query', u'no-query'),
        ('fragment', u'no-fragment'),
    ]
    for attribute, value in assignments:
        setattr(parsed, attribute, value)
    assert_equal(
        parsed.unicode,
        'https://[email protected]:443/another/path;no-params?no-query#no-fragment'
    )
Exemple #51
0
def canonical_url(uri):
    """
    Return the canonical representation of a given URI.

    This assumes the `uri` has a scheme.

    * When a default port corresponding for the scheme is explicitly declared
      (such as port 80 for http), the port will be removed from the output.
    * Fragments '#' are not removed.
    * Params and query string arguments are not reordered.
    """
    canonical = urlpy.parse(uri).sanitize().punycode()
    # Drop the port when it matches the scheme's well-known default.
    default_port = urlpy.PORTS.get(canonical._scheme, None)
    if canonical._port == default_port:
        canonical._port = None
    return canonical.utf8()
Exemple #52
0
def test_relative():
    '''Nose generator: relative references resolve correctly against a base.'''
    base = url.parse('http://testing.com/a/b/c')

    def check(rel, expected):
        assert_equal(base.relative(rel).utf8(), expected)

    cases = [
        ('../foo', 'http://testing.com/a/foo'),
        ('./foo', 'http://testing.com/a/b/foo'),
        ('foo', 'http://testing.com/a/b/foo'),
        ('/foo', 'http://testing.com/foo'),
        ('http://foo.com/bar', 'http://foo.com/bar'),
        (u'/foo', 'http://testing.com/foo'),
    ]
    for rel, expected in cases:
        yield check, rel, expected
Exemple #53
0
    def test_relative(self):
        '''Relative references resolve correctly against a fixed base URL.'''
        base = url.parse('http://testing.com/a/b/c')

        def check(rel, expected):
            self.assertEqual(base.relative(rel).utf8(), expected)

        check('../foo', 'http://testing.com/a/foo')
        check('./foo', 'http://testing.com/a/b/foo')
        check('foo', 'http://testing.com/a/b/foo')
        check('/foo', 'http://testing.com/foo')
        # An absolute reference replaces the base entirely.
        check('http://foo.com/bar', 'http://foo.com/bar')
        # A unicode relative path resolves like its byte-string form.
        check(u'/foo', 'http://testing.com/foo')
Exemple #54
0
    def test_relative(self):
        '''Resolving relative URLs against a base yields absolute URLs.'''
        base = url.parse('http://testing.com/a/b/c')
        for rel, expected in (
                ('../foo', 'http://testing.com/a/foo'),
                ('./foo', 'http://testing.com/a/b/foo'),
                ('foo', 'http://testing.com/a/b/foo'),
                ('/foo', 'http://testing.com/foo'),
                # Absolute references are returned unchanged.
                ('http://foo.com/bar', 'http://foo.com/bar'),
                # Unicode input resolves identically to byte strings.
                (u'/foo', 'http://testing.com/foo')):
            self.assertEqual(base.relative(rel).utf8(), expected)
Exemple #55
0
def test_component_assignment_unicode():
    '''Every URL component accepts unicode assignment; .unicode reflects it.'''
    parsed = url.parse('http://[email protected]:80/path;params?query#fragment')
    # Replace each component in turn (port is the lone integer field).
    updates = (
        ('scheme', u'https'),
        ('userinfo', u'username'),
        ('host', u'foo.example.com'),
        ('port', 443),
        ('path', u'/another/path'),
        ('params', u'no-params'),
        ('query', u'no-query'),
        ('fragment', u'no-fragment'),
    )
    for name, value in updates:
        setattr(parsed, name, value)
    expected = 'https://[email protected]:443/another/path;no-params?no-query#no-fragment'
    assert_equal(parsed.unicode, expected)
Exemple #56
0
def canonical_url(uri):
    """
    Return the canonical representation of a given URI.

    This assumes the `uri` has a scheme.

    * When a default port corresponding for the scheme is explicitly declared
      (such as port 80 for http), the port will be removed from the output.
    * Fragments '#' are not removed.
    * Params and query string arguments are not reordered.
    """
    result = urlpy.parse(uri).sanitize().punycode()
    # The scheme's default port (e.g. 80 for http) is redundant: strip it.
    uses_default_port = result._port == urlpy.PORTS.get(result._scheme, None)
    if uses_default_port:
        result._port = None
    return result.utf8()
Exemple #57
0
    def test_escape(self):
        '''Non-strict escaping normalises paths and queries, idempotently.'''
        def check(raw, expected):
            # A single escape must produce the expected form...
            self.assertEqual(url.parse(raw).escape().utf8(), expected)
            # ...and escaping again must be a no-op (idempotence).
            self.assertEqual(url.parse(raw).escape().escape().utf8(), expected)

        base = 'http://testing.com/'
        relative_cases = [
            ('hello%20and%20how%20are%20you', 'hello%20and%20how%20are%20you'),
            ('danny\'s pub'                 , 'danny\'s%20pub'               ),
            ('danny%27s pub'                , 'danny\'s%20pub'               ),
            ('danny\'s pub?foo=bar&yo'      , 'danny\'s%20pub?foo=bar&yo'    ),
            ('hello%2c world'               , 'hello,%20world'               ),
            ('%3f%23%5b%5d'                 , '%3F%23%5B%5D'                 ),
            # Thanks to @myronmarston for these test cases
            ('foo?bar none=foo bar'         , 'foo?bar%20none=foo%20bar'     ),
            ('foo;a=1;b=2?a=1&b=2'          , 'foo;a=1;b=2?a=1&b=2'          ),
            ('foo?bar=["hello","howdy"]'    ,
                'foo?bar=%5B%22hello%22,%22howdy%22%5D'),
        ]
        for raw, expected in relative_cases:
            check(base + raw, base + expected)

        # A real-world URL: unescape then escape must round-trip losslessly.
        example = 'http://www.balset.com/DE3FJ4Yg/p:h=300&m=2011~07~25~2444705.png&ma=cb&or=1&w=400/2011/10/10/2923710.jpg'
        self.assertEqual(
            url.parse(example).unescape().escape().utf8(), example)

        # Userinfo components are escaped as well.
        check('http://user%[email protected]/', 'http://*****:*****@foo.com/')
Exemple #58
0
 def test(first, second):
     '''Assert two URLs are strictly unequal (both ways, parsed or string),
     while each still compares equal to its own string form.'''
     inequality_cases = [
         # Parsed vs parsed, and parsed vs raw string, in both directions.
         (url.parse(first), url.parse(second),
             'URL(%s) should not equal URL(%s)' % (first, second)),
         (url.parse(first), second,
             'URL(%s) should not equal %s' % (first, second)),
         (url.parse(second), url.parse(first),
             'URL(%s) should not equal URL(%s)' % (second, first)),
         (url.parse(second), first,
             'URL(%s) should not equal %s' % (second, first)),
     ]
     for left, right, message in inequality_cases:
         assert_not_equal(left, right, message)
     # Reflexivity: each URL equals its own string representation.
     assert_equal(url.parse(first), first,
         'URL(%s) should equal itself' % first)
     assert_equal(url.parse(second), second,
         'URL(%s) should equal itself' % second)