def test(rules, example, pld, tld):
    try:
        url.set_psl(rules)
        assert_equal(url.parse(example).pld, pld)
        assert_equal(url.parse(example).tld, tld)
    finally:
        url.set_psl(pkgutil.get_data('url', 'psl/2016-08-16.psl'))

def __init__(self, target, **params):
    if isinstance(target, basestring):
        self._host = url.parse(target)
    elif isinstance(target, (tuple, list)):
        # tuple(...) so that list targets also work with % formatting
        self._host = url.parse('http://%s:%s/' % tuple(target))
    else:
        raise TypeError('Host must be a string or tuple')
    self._params = params

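# Hypothetical usage of the constructor above, assuming the enclosing class
# is named `Host` (a name not shown in the original snippet); both target
# forms resolve to the same parsed URL:
#
#   client = Host('http://foo.com:8080/', timeout=5)
#   client = Host(('foo.com', 8080), timeout=5)
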
def test(uni, puny):
    assert_equal(url.parse(uni).escape().punycode().utf8(), puny)
    # Also make sure punycode is idempotent
    assert_equal(
        url.parse(uni).escape().punycode().punycode().utf8(), puny)
    # Make sure that we can reverse the procedure correctly
    assert_equal(
        url.parse(uni).escape().punycode().unpunycode().unescape(), uni)
    # And we get what we'd expect going the opposite direction
    assert_equal(url.parse(puny).unescape().unpunycode().unicode(), uni)

def test_str_repr(self):
    '''Make sure str and repr produce reasonable results'''
    examples = [
        ('http://foo.com/', 'http://foo.com/'),
        ('http://FOO.com/', 'http://foo.com/')
    ]
    for toparse, strng in examples:
        self.assertEqual(str(url.parse(toparse)), strng)
        self.assertEqual(repr(url.parse(toparse)),
            '<url.URL object "%s" >' % strng)

def test(example):
    assert_equal(url.parse(example).escape().punycode().unicode, example)
    # Also make sure punycode is idempotent
    assert_equal(
        url.parse(example).escape().punycode().punycode().unicode, example)
    # Make sure that we can reverse the procedure correctly
    assert_equal(
        url.parse(example).escape().punycode().unpunycode().unescape(),
        example)
    # And we get what we'd expect going the opposite direction
    assert_equal(
        url.parse(example).unescape().unpunycode().unicode, example)

def test_empty_hostname(self):
    '''Allow empty hostnames'''
    examples = [
        'http:///path',
        'http://userinfo@/path',
        'http://:80/path',
    ]
    for example in examples:
        # Equal to itself
        self.assertEqual(url.parse(example), example)
        # String representation equal to the provided example
        self.assertEqual(url.parse(example).utf8(), example)

def test(uni, puny, upuny, epuny):
    assert_equal(url.parse(uni).escape().punycode().utf8(), epuny)
    # Also make sure punycode is idempotent
    assert_equal(
        url.parse(uni).escape().punycode().punycode().utf8(), epuny)
    # Make sure that we can reverse the procedure correctly
    assert_equal(
        url.parse(uni).escape().punycode().unpunycode().unescape(), uni)
    # And we get what we'd expect going the opposite direction
    assert_equal(
        url.parse(puny).unescape().unpunycode().unicode(), uni)

def _initialize_crawl(self):
    if self.crawl_requires_gevent:
        from gevent.monkey import saved
        if 'socket' not in saved:
            # we're not gevent-monkey-patched
            raise RuntimeError(
                "Spider.crawl() needs gevent monkey patching to have been applied")
    self._resume_queue()
    if len(list(self._scraper.cache_storage._conn.execute(
            "SELECT * FROM seen LIMIT 1"))) == 0 and self._queue.empty():
        # we're at the beginning, so start with the home page;
        # follow any homepage redirects, so we get the right protocol and domain
        tmp_response = requests.get("http://%s/" % self.domain)
        first_url = moz_url.parse(tmp_response.url)
        if first_url._host not in self._allowed_hosts:
            self._allowed_hosts.add(first_url._host)
        self._add_to_queue(first_url)

def _response_to_features(response):
    features = set()
    tree = etree.HTML(response.text)
    for item in tree.iter(tag=etree.Element):
        features.add("tag-%s" % item.tag)
        if "class" in item.attrib and item.attrib["class"].strip():
            classes = whitespace.split(item.attrib["class"])
            for _c in classes:
                c = _c.strip()
                if c:
                    features.add("class-%s" % c)
        if "id" in item.attrib:
            features.add("id-%s" % item.attrib["id"])
    # path parts
    u = moz_url.parse(response.url)
    path = u._path.split("/")[1:]
    for idx, part in enumerate(path):
        # use the individual segment, not the whole list
        features.add("path-%s-%s" % (idx, part))
    if u._query:
        for k, vl in urlparse.parse_qs(u._query).iteritems():
            features.add("qse-%s" % k)
            for v in vl:
                features.add("qsv-%s-%s" % (k, v))
    return features

def canonical_url(uri):
    """
    Return the canonical representation of a given URI. This assumes the
    `uri` has a scheme.

    * When a default port corresponding to the scheme is explicitly
      declared (such as port 80 for http), the port will be removed from
      the output.
    * Fragments '#' are not removed.
    * Params and query string arguments are not reordered.
    """
    try:
        parsed = urlpy.parse(uri)
        if not parsed:
            return
        if not (getattr(parsed, '_scheme', None) and getattr(parsed, '_host', None)):
            return
        if TRACE:
            logger_debug('canonical_url: parsed:', parsed)
        sanitized = parsed.sanitize()
        if TRACE:
            logger_debug('canonical_url: sanitized:', sanitized)
        punycoded = sanitized.punycode()
        if TRACE:
            logger_debug('canonical_url: punycoded:', punycoded)
        if punycoded._port == urlpy.PORTS.get(punycoded._scheme, None):
            punycoded._port = None
        return punycoded.utf8()
    except Exception as e:
        if TRACE:
            logger_debug('canonical_url: failed for:', uri, 'with:', repr(e))
        # ignore the failure and fall through, returning None
        pass

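# A minimal usage sketch for canonical_url(); the expected value is inferred
# from the abspath/port-stripping behavior exercised elsewhere in this file,
# not from separate library documentation.
print(canonical_url('http://testing.com:80/hello/../how/are'))
# expected: 'http://testing.com/how/are' (path collapsed, default port dropped)
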
def test_abspath(self):
    '''Make sure absolute path checking works correctly'''
    examples = [
        ('howdy'           , 'howdy'        ),
        ('hello//how//are' , 'hello/how/are'),
        ('hello/../how/are', 'how/are'      ),
        ('hello//..//how/' , 'how/'         ),
        ('a/b/../../c'     , 'c'            ),
        ('../../../c'      , 'c'            ),
        ('./hello'         , 'hello'        ),
        ('./././hello'     , 'hello'        ),
        ('a/b/c/'          , 'a/b/c/'       ),
        ('a/b/c/..'        , 'a/b/'         ),
        ('a/b/.'           , 'a/b/'         ),
        ('a/b/./././'      , 'a/b/'         ),
        ('a/b/../'         , 'a/'           ),
        ('.'               , ''             ),
        ('../../..'        , ''             ),
        ('////foo'         , 'foo'          )
    ]
    base = 'http://testing.com/'
    for bad, good in examples:
        bad = base + bad
        good = base + good
        self.assertEqual(url.parse(bad).abspath().utf8(), good)

def filter_links(self, links):
    filteredLinks = []
    for link in links:
        # if link is a directory, then follow it
        # TODO: "?dir=" is provided to properly recognize as dirs the ones
        # which are specified as queries (it is an in-place fix and should
        # be removed)
        if link.url.endswith("/") or link.url.find("?dir=") >= 0:
            filteredLinks.append(link)
        # if not, verify whether it is a video file: if it is, save it;
        # otherwise skip it
        elif isVideoURL(link.url):
            # normalize the URL
            normLinkURL = url.parse(link.url).canonical().escape().punycode().utf8()
            # save the url, but only if it has not been indexed yet:
            # check whether the URL already exists in redis
            if not self.r.exists(normLinkURL):
                # if not, add it to the toIndex queue (NOTE: it might already
                # be present in toIndex, but since it is a set we don't mind)
                self._logger.info("sadd %s %s" % (self._conf['key_toIndex'], normLinkURL))
                self.r.sadd(self._conf['key_toIndex'], normLinkURL)
    return filteredLinks

def save(url_, path="", wait=60):
    if hasattr(url_, "url"):
        url_ = url_.url
    # derive a filename from the URL when the path doesn't already look
    # like a file with an extension
    if len(path) < 5 or "." not in path[-5:-3]:
        file = url.parse(str(url_)).filename
        path = os.path.join(path, file)
    with open(path, "w") as f:
        f.write(download(url_, wait))
    return path

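# Hypothetical call to save(); `download` is referenced above but not defined
# in this snippet, so this only illustrates the intended flow.
saved = save('http://testing.com/docs/report.pdf', path='/tmp')
print(saved)  # e.g. '/tmp/report.pdf', with the filename taken from the URL
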
def test_deuserinfo(self):
    '''Correctly removes userinfo'''
    examples = [
        ('http://*****:*****@foo.com/', 'http://foo.com/'),
        ('http://[email protected]/', 'http://foo.com/')
    ]
    for bad, good in examples:
        self.assertEqual(url.parse(bad).deuserinfo().utf8(), good)

def test_tld(self):
    '''Test the top-level domain functionality'''
    examples = [
        ('http://foo.com/bar'    , 'com'),
        ('http://bar.foo.com/bar', 'com'),
        ('/foo'                  , ''   )
    ]
    for query, result in examples:
        self.assertEqual(url.parse(query).tld(), result)

def handle(request):
    # URL Parsing
    s = "api."
    if 'project' in request.match_info:
        p = re.sub('[^0-9a-zA-Z]+', '', request.match_info['project'])
        s += p + "."
    if 'module' in request.match_info:
        p = re.sub('[^0-9a-zA-Z]+', '', request.match_info['module'])
        s += p + "."
    if 'action' in request.match_info:
        p = re.sub('[^0-9a-zA-Z]+', '', request.match_info['action'])
        s += p
    if verbose_logging:
        print("Incoming request - " + str(request.raw_path))

    # Session loading
    if "cookie" in request.headers and "AIOHTTP_SESSION" in request.headers["cookie"]:
        cookies = request.headers.getall("cookie")[0]
        aiocookie = re.search("AIOHTTP_SESSION=([0-9a-z]{32})", cookies)
        if aiocookie:
            session = sessions.Session(aiocookie.group(1))
            if verbose_logging:
                print("Session ID - " + session.id)
        else:
            session = sessions.Session()
            if verbose_logging:
                print("New Session with ID - " + session.id)
    else:
        session = sessions.Session()
        if verbose_logging:
            print("New Session with ID - " + session.id)

    # Response building
    try:
        module = importlib.import_module(s)
        arguments = url.parse(request.GET, module.arguments)
        response = yield from module.process(session, arguments)
        headers = response.get('headers', {})
        if session.is_new_session:
            headers.update({"Set-Cookie": "AIOHTTP_SESSION=" + session.id})
        if 'json' in response:
            # Dump to json if the module wants to return json
            return respond(headers=headers,
                           status=response.get('status', 200),
                           text=json.dumps(response.get("json", "")),
                           content_type="application/json")
        else:
            return respond(headers=headers,
                           status=response.get('status', 200),
                           text=response.get("text", ""))
    except ImportError:
        return respond(status=404, text="Page does not exist")
    except Exception as e:
        s = str(e.args) if verbose_errors else "No verbose errors."
        return respond(status=500, text="Error while querying data.\n" + s)

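# A wiring sketch (an assumption; the surrounding application is not shown):
# in the older yield-from style of aiohttp this handler targets, it would be
# registered with match_info placeholders for project, module and action.
from aiohttp import web

app = web.Application()
app.router.add_route('GET', '/{project}/{module}/{action}', handle)
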
def test(first, second):
    # Equiv with another URL object
    assert url.parse(first).equiv(url.parse(second))
    # Equiv with a string
    assert url.parse(first).equiv(second)
    # Make sure it's also symmetric
    assert url.parse(second).equiv(url.parse(first))
    # Symmetric with string arg
    assert url.parse(second).equiv(first)
    # Should be equivalent to self
    assert url.parse(first).equiv(first)
    assert url.parse(second).equiv(second)

def test_absolute(self):
    '''Can it recognize if it's a relative or absolute url?'''
    examples = [
        ('http://foo.com/bar', True ),
        ('foo/'              , False),
        ('http://foo.com'    , True ),
        ('/foo/bar/../'      , False)
    ]
    for query, result in examples:
        self.assertEqual(url.parse(query).absolute(), result)

def test_escape(self):
    '''Make sure we escape paths correctly'''
    examples = [
        ('hello%20and%20how%20are%20you', 'hello%20and%20how%20are%20you'),
        ('danny\'s pub'                 , 'danny%27s%20pub'              ),
        ('danny%27s pub?foo=bar&yo'     , 'danny%27s%20pub?foo=bar&yo'   ),
        # Thanks to @myronmarston for these test cases
        ('foo?bar none=foo bar'         , 'foo?bar%20none=foo%20bar'     ),
        ('foo;a=1;b=2?a=1&b=2'          , 'foo;a=1;b=2?a=1&b=2'          ),
        ('foo?bar=["hello","howdy"]'    ,
            'foo?bar=%5B%22hello%22,%22howdy%22%5D'),
    ]
    base = 'http://testing.com/'
    for bad, good in examples:
        bad = base + bad
        good = base + good
        self.assertEqual(url.parse(bad).escape().utf8(), good)
        # Escaping should also be idempotent
        self.assertEqual(url.parse(bad).escape().escape().utf8(), good)

def test_lower(self):
    '''Can lowercase the domain name correctly'''
    examples = [
        ('www.TESTING.coM'    , 'www.testing.com/'   ),
        ('WWW.testing.com'    , 'www.testing.com/'   ),
        ('WWW.testing.com/FOO', 'www.testing.com/FOO')
    ]
    for bad, good in examples:
        bad = 'http://' + bad
        good = 'http://' + good
        self.assertEqual(url.parse(bad).utf8(), good)

def test_userinfo(self):
    '''Allow a userinfo section'''
    examples = [
        ('http://*****:*****@foo.com', 'http://*****:*****@foo.com'),
        ('http://[email protected]', 'http://[email protected]')
    ]
    suffix = '/page.html'
    for bad, good in examples:
        bad = bad + suffix
        good = good + suffix
        self.assertEqual(url.parse(bad).utf8(), good)

def test_defrag(self):
    '''Correctly defrags urls'''
    examples = [
        ('foo#bar', 'foo')
    ]
    base = 'http://testing.com/'
    for bad, good in examples:
        bad = base + bad
        good = base + good
        self.assertEqual(url.parse(bad).defrag().utf8(), good)

def test_sanitize(self):
    '''Make sure the sanitize method does all that it should'''
    examples = [
        ('../foo/bar none', 'foo/bar%20none')
    ]
    base = 'http://testing.com/'
    for bad, good in examples:
        bad = base + bad
        good = base + good
        self.assertEqual(url.parse(bad).sanitize().utf8(), good)

def url_host_domain(url):
    """
    Return a tuple of the (host, domain) of a URL, or (None, None).
    Assumes that the URL has a scheme.
    """
    parsed = urlpy.parse(url)
    host = parsed._host
    if not host:
        return None, None
    host = host.lower()
    domain = parsed.pld().lower()
    return host, domain

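# Example call for url_host_domain(); the expected pay-level domain assumes
# a standard public-suffix list in which 'co.uk' is a suffix.
print(url_host_domain('http://www.bar.foo.co.uk/path'))
# expected: ('www.bar.foo.co.uk', 'foo.co.uk')
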
def test_canonical(self):
    '''Correctly canonicalizes urls'''
    examples = [
        ('?b=2&a=1&c=3', '?a=1&b=2&c=3'),
        (';b=2;a=1;c=3', ';a=1;b=2;c=3')
    ]
    base = 'http://testing.com/'
    for bad, good in examples:
        bad = base + bad
        good = base + good
        self.assertEqual(url.parse(bad).canonical().utf8(), good)

def test_punycode(self):
    '''Make sure punycode encoding works correctly'''
    examples = [
        (u'http://www.kündigen.de/',
            'http://www.xn--kndigen-n2a.de/'),
        (u'http://россия.иком.museum/',
            'http://xn--h1alffa9f.xn--h1aegh.museum/'),
        (u'http://россия.иком.museum/испытание.html',
            'http://xn--h1alffa9f.xn--h1aegh.museum/%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5.html')
    ]
    for uni, puny in examples:
        self.assertEqual(url.parse(uni).escape().punycode().utf8(), puny)
        # Also make sure punycode is idempotent
        self.assertEqual(
            url.parse(uni).escape().punycode().punycode().utf8(), puny)
        # Make sure that we can reverse the procedure correctly
        self.assertEqual(
            url.parse(uni).escape().punycode().unpunycode().unescape(), uni)
        # And we get what we'd expect going the opposite direction
        self.assertEqual(
            url.parse(puny).unescape().unpunycode().unicode(), uni)

    # Make sure that we can't punycode or unpunycode relative urls
    examples = ['foo', '../foo', '/bar/foo']
    for relative in examples:
        self.assertRaises(TypeError, url.parse(relative).punycode)
        self.assertRaises(TypeError, url.parse(relative).unpunycode)

def normalize(url):
    """
    Uses the Moz URL library to normalise and strip the URLs of extraneous
    information, and the urlparse library to ensure it is not a blank URL.
    """
    if url[:4] != 'http':
        url = 'http://' + url
    url = url.lower()
    url_parts = urlparse(url)
    if url_parts.netloc:
        url_obj = parse(url).defrag().abspath().canonical().punycode()
        return url_obj.utf8()

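# Usage sketch for normalize(); scheme-less input is given an http:// prefix
# before parsing, per the guard at the top of the function.
print(normalize('WWW.Testing.com/a/../b#frag'))
# expected: 'http://www.testing.com/b' (lowercased, defragged, path collapsed)
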
def clean_url(u):
    u = url.parse(u)
    u.deparam(utm)
    u.strip()
    u.canonical()
    u.abspath()
    u.unescape()
    # if the query string carries exactly one embedded http(s) URL,
    # recurse into it
    https_param = get_http_param(u.query)
    if len(https_param) == 1:
        u = https_param.pop()
        return clean_url(u)
    u = str(u)
    return u

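# `get_http_param` is referenced above but never defined in this snippet; a
# minimal stand-in (an assumption, not the original implementation) could
# collect query values that are themselves http(s) URLs:
def get_http_param(query):
    from urlparse import parse_qs  # Python 2, matching the snippets above
    return set(v for vl in parse_qs(query or '').values()
               for v in vl if v.startswith(('http://', 'https://')))
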
def test(bad, good, ugood, egood):
    assert_equal(str(url.parse(bad).escape()), good)
    assert_equal(url.parse(bad).escape().utf8(), egood)
    assert_equal(url.parse(bad).escape().unicode(), ugood)
    # Escaping should also be idempotent
    assert_equal(str(url.parse(bad).escape().escape()), good)
    assert_equal(url.parse(bad).escape().escape().utf8(), egood)
    assert_equal(url.parse(bad).escape().escape().unicode(), ugood)

def test_strict_escape(self):
    '''Test strict mode escaping'''
    examples = [
        ('danny%27s pub'              , 'danny%27s%20pub'                      ),
        ('this%5Fand%5Fthat'          , 'this_and_that'                        ),
        ('http://*****:*****@foo.com' , 'http://*****:*****@foo.com'           ),
        (u'http://José:no [email protected]', 'http://Jos%C3%A9:no%[email protected]'),
        ('http://oops!:don%[email protected]', 'http://oops!:don%[email protected]'      ),
        (u'española,nm%2cusa.html?gunk=junk+glunk&foo=bar baz',
            'espa%C3%B1ola,nm%2Cusa.html?gunk=junk+glunk&foo=bar%20baz'),
        ('http://foo.com/bar\nbaz.html\n',
            'http://foo.com/bar%0Abaz.html%0A'),
        ('http://foo.com/bar.jsp?param=\n/value%2F',
            'http://foo.com/bar.jsp?param=%0A/value%2F'),
    ]
    base = 'http://testing.com/'
    for bad, good in examples:
        bad = base + bad
        good = base + good
        self.assertEqual(url.parse(bad).escape(strict=True).utf8(), good)
        # Escaping should also be idempotent
        self.assertEqual(
            url.parse(bad).escape(strict=True).escape(strict=True).utf8(),
            good)

    # Examples with userinfo
    examples = [
        ('http://user%[email protected]/', 'http://user%[email protected]/')
    ]
    for bad, good in examples:
        self.assertEqual(url.parse(bad).escape(strict=True).utf8(), good)
        # Escaping should also be idempotent
        self.assertEqual(
            url.parse(bad).escape(strict=True).escape(strict=True).utf8(),
            good)

    # Test Unicode escaping in strict mode
    u = url.URL(u'http', u'foo.com', None, u'española,nm%2cusa.html', u'',
                u'gunk=junk+glunk&foo=bar baz', u'')
    u.escape(strict=True)
    self.assertTrue(isinstance(u._path, str))
    self.assertEqual(u._path, 'espa%C3%B1ola,nm%2Cusa.html')

def test_component_assignment_unicode():
    parsed = url.parse('http://[email protected]:80/path;params?query#fragment')
    parsed.scheme = u'https'
    parsed.userinfo = u'username'
    parsed.host = u'foo.example.com'
    parsed.port = 443
    parsed.path = u'/another/path'
    parsed.params = u'no-params'
    parsed.query = u'no-query'
    parsed.fragment = u'no-fragment'
    assert_equal(
        parsed.unicode,
        'https://[email protected]:443/another/path;no-params?no-query#no-fragment')

def canonical_url(uri):
    """
    Return the canonical representation of a given URI. This assumes the
    `uri` has a scheme.

    * When a default port corresponding to the scheme is explicitly
      declared (such as port 80 for http), the port will be removed from
      the output.
    * Fragments '#' are not removed.
    * Params and query string arguments are not reordered.
    """
    normalized = urlpy.parse(uri).sanitize().punycode()
    if normalized._port == urlpy.PORTS.get(normalized._scheme, None):
        normalized._port = None
    return normalized.utf8()

def test_relative():
    def test(rel, absolute):
        assert_equal(base.relative(rel).utf8(), absolute)

    base = url.parse('http://testing.com/a/b/c')
    examples = [
        ('../foo'            , 'http://testing.com/a/foo'  ),
        ('./foo'             , 'http://testing.com/a/b/foo'),
        ('foo'               , 'http://testing.com/a/b/foo'),
        ('/foo'              , 'http://testing.com/foo'    ),
        ('http://foo.com/bar', 'http://foo.com/bar'        ),
        (u'/foo'             , 'http://testing.com/foo'    )
    ]
    for rel, absolute in examples:
        yield test, rel, absolute

def test_relative(self):
    '''Test relative url parsing'''
    base = url.parse('http://testing.com/a/b/c')
    examples = [
        ('../foo'            , 'http://testing.com/a/foo'  ),
        ('./foo'             , 'http://testing.com/a/b/foo'),
        ('foo'               , 'http://testing.com/a/b/foo'),
        ('/foo'              , 'http://testing.com/foo'    ),
        ('http://foo.com/bar', 'http://foo.com/bar'        ),
        (u'/foo'             , 'http://testing.com/foo'    )
    ]
    for rel, absolute in examples:
        self.assertEqual(base.relative(rel).utf8(), absolute)

def test_escape(self):
    '''Make sure we escape paths correctly'''
    examples = [
        ('hello%20and%20how%20are%20you', 'hello%20and%20how%20are%20you'),
        ('danny\'s pub'                 , 'danny\'s%20pub'               ),
        ('danny%27s pub'                , 'danny\'s%20pub'               ),
        ('danny\'s pub?foo=bar&yo'      , 'danny\'s%20pub?foo=bar&yo'    ),
        ('hello%2c world'               , 'hello,%20world'               ),
        ('%3f%23%5b%5d'                 , '%3F%23%5B%5D'                 ),
        # Thanks to @myronmarston for these test cases
        ('foo?bar none=foo bar'         , 'foo?bar%20none=foo%20bar'     ),
        ('foo;a=1;b=2?a=1&b=2'          , 'foo;a=1;b=2?a=1&b=2'          ),
        ('foo?bar=["hello","howdy"]'    ,
            'foo?bar=%5B%22hello%22,%22howdy%22%5D'),
    ]
    base = 'http://testing.com/'
    for bad, good in examples:
        bad = base + bad
        good = base + good
        self.assertEqual(url.parse(bad).escape().utf8(), good)
        # Escaping should also be idempotent
        self.assertEqual(url.parse(bad).escape().escape().utf8(), good)

    # This example's from the wild:
    example = 'http://www.balset.com/DE3FJ4Yg/p:h=300&m=2011~07~25~2444705.png&ma=cb&or=1&w=400/2011/10/10/2923710.jpg'
    self.assertEqual(
        url.parse(example).unescape().escape().utf8(), example)

    # Examples with userinfo
    examples = [
        ('http://user%[email protected]/', 'http://*****:*****@foo.com/')
    ]
    for bad, good in examples:
        self.assertEqual(url.parse(bad).escape().utf8(), good)
        # Escaping should also be idempotent
        self.assertEqual(url.parse(bad).escape().escape().utf8(), good)

def test(first, second):
    # None of these examples should evaluate as strictly equal
    assert_not_equal(url.parse(first), url.parse(second),
        'URL(%s) should not equal URL(%s)' % (first, second))
    # Using a string
    assert_not_equal(url.parse(first), second,
        'URL(%s) should not equal %s' % (first, second))
    # Symmetric
    assert_not_equal(url.parse(second), url.parse(first),
        'URL(%s) should not equal URL(%s)' % (second, first))
    # Using a string, symmetric
    assert_not_equal(url.parse(second), first,
        'URL(%s) should not equal %s' % (second, first))
    # Should equal self
    assert_equal(url.parse(first), first,
        'URL(%s) should equal itself' % first)
    assert_equal(url.parse(second), second,
        'URL(%s) should equal itself' % second)