def test_unparse_parse(self): for u in [ 'Python', './Python', 'x-newscheme://foo.com/stuff', 'x://y', 'x:/y', 'x:/', '/', ]: self.assertEqual(urlparse.urlunsplit(urlparse.urlsplit(u)), u) self.assertEqual(urlparse.urlunparse(urlparse.urlparse(u)), u)
def checkRoundtrips(self, url, parsed, split):
    """Check that *url* parses to *parsed* (6-tuple) and *split* (5-tuple),
    that unparsing/unsplitting reproduces *url*, and that geturl() is a
    fixpoint under re-parsing."""
    # Attributes compared between an original result and its re-parse.
    parse_attrs = ('scheme', 'netloc', 'path', 'params', 'query', 'fragment',
                   'username', 'password', 'hostname', 'port')
    split_attrs = ('scheme', 'netloc', 'path', 'query', 'fragment',
                   'username', 'password', 'hostname', 'port')

    result = urlparse.urlparse(url)
    self.assertEqual(result, parsed)
    self.assertEqual((result.scheme, result.netloc, result.path,
                      result.params, result.query, result.fragment), parsed)
    # put it back together and it should be the same
    result2 = urlparse.urlunparse(result)
    self.assertEqual(result2, url)
    self.assertEqual(result2, result.geturl())
    # the result of geturl() is a fixpoint; we can always parse it
    # again to get the same result:
    result3 = urlparse.urlparse(result.geturl())
    self.assertEqual(result3.geturl(), result.geturl())
    self.assertEqual(result3, result)
    for name in parse_attrs:
        self.assertEqual(getattr(result3, name), getattr(result, name))

    # check the roundtrip using urlsplit() as well
    result = urlparse.urlsplit(url)
    self.assertEqual(result, split)
    self.assertEqual((result.scheme, result.netloc, result.path,
                      result.query, result.fragment), split)
    result2 = urlparse.urlunsplit(result)
    self.assertEqual(result2, url)
    self.assertEqual(result2, result.geturl())
    # check the fixpoint property of re-parsing the result of geturl()
    result3 = urlparse.urlsplit(result.geturl())
    self.assertEqual(result3.geturl(), result.geturl())
    self.assertEqual(result3, result)
    for name in split_attrs:
        self.assertEqual(getattr(result3, name), getattr(result, name))
def __init__(self, url, check_encoding=False):
    """Store *url* as a UTF-8 byte string.

    When *check_encoding* is true and the stored URL is not pure ASCII,
    its path, query and fragment components are percent-encoded.
    (Python 2: *url* may be ``unicode`` or ``str``.)
    """
    # Normalize unicode input down to a UTF-8 encoded byte string.
    self.url = url.encode("utf-8") if isinstance(url, unicode) else url
    if not check_encoding:
        return
    try:
        self.url.decode('ascii')
    except UnicodeDecodeError:
        # Non-ASCII bytes present: percent-encode the URL components.
        scheme, netloc, path, query, fragment = urlparse.urlsplit(self.url)
        # TODO: check the rightfulness of this!
        self.url = urlparse.urlunsplit((
            scheme,
            netloc,
            urllib.quote(path, safe="/"),
            urllib.quote(query, safe="&?="),
            urllib.quote(fragment)
        ))
def __init__(self, url, check_encoding=False):
    """Store *url* as a UTF-8 byte string.

    When *check_encoding* is true and the stored URL is not pure ASCII,
    its path, query and fragment components are percent-encoded.
    (*url* may be a ``py2_unicode`` text string or a byte string.)
    """
    # Normalize text input down to a UTF-8 encoded byte string.
    self.url = url.encode("utf-8") if isinstance(url, py2_unicode) else url
    if not check_encoding:
        return
    try:
        self.url.decode('ascii')
    except UnicodeDecodeError:
        # Non-ASCII bytes present: percent-encode the URL components.
        scheme, netloc, path, query, fragment = urlparse.urlsplit(self.url)
        # TODO: check the rightfulness of this!
        self.url = urlparse.urlunsplit((
            scheme,
            netloc,
            urllib.quote(path, safe=b"/"),
            urllib.quote(query, safe=b"&?="),
            urllib.quote(fragment)
        ))
def __getattr__(self, attr):  # pylint: disable=redefined-variable-type
    """Lazily compute derived URL attributes (str variant).

    Called only when *attr* is not yet in ``self.__dict__``; the computed
    value is memoized there at the end, so each attribute is computed at
    most once per instance (except ``normalized_path``, which returns
    early and is deliberately never cached).
    """
    if attr == "parsed":
        # SplitResult of the raw URL; all other attributes derive from it.
        # try:
        value = urlparse.urlsplit(self.url)
        # except ValueError:
        #     value = urlparse.urlsplit("about:blank")
    elif attr == "tldextracted":
        # (subdomain, domain, suffix) triple from the public-suffix list.
        value = tld_extract(self.parsed.netloc)
        # value = _tldextractor(self.url)
    elif attr == "normalized":
        # Scheme-less "domain/path?query" form; scheme=None is falsy so
        # urlunsplit emits no "scheme:" prefix.
        value = urlparse.urlunsplit((
            None,
            self.normalized_domain,
            self.parsed.path if self.parsed.path else "/",
            self.parsed.query,
            ""
        )).lstrip("/")
        # A single "/" means path was just "/": drop the trailing slash too.
        if value.count("/") == 1:
            value = value.strip("/")
    elif attr == "normalized_without_query":
        # Same as "normalized" but with the query string dropped.
        value = urlparse.urlunsplit((
            None,
            self.normalized_domain,
            self.parsed.path if self.parsed.path else "/",
            "",
            ""
        )).lstrip("/")
        if value.count("/") == 1:
            value = value.strip("/")
    elif attr == "homepage":
        # "scheme://domain" with any trailing slash removed.
        value = urlparse.urlunsplit((
            self.parsed.scheme,
            self.domain,
            "/",
            "",
            ""
        )).strip("/")
    # Pay-level domain
    elif attr == "pld":
        # e.g. "bbc.co.uk" from ("www", "bbc", "co.uk").
        value = "%s.%s" % (self.tldextracted[1], self.tldextracted[2])
    elif attr == "domain":
        value = self.parsed.netloc
    elif attr == "subdomain":
        value = self.tldextracted[0]
    elif attr == "normalized_domain":
        # Strip surrounding dots, leading "www." prefixes, then default ports.
        value = self.domain.strip(".")
        while value.startswith("www."):
            value = value[4:]
        if value.endswith(':80'):
            value = value[:-3]
        elif value.endswith(':443'):
            value = value[:-4]
        value = value.strip(".")
    elif attr == "normalized_subdomain":
        value = self.subdomain.strip(".")
        # A bare "www" subdomain counts as no subdomain at all.
        if value == "www":
            value = ""
        else:
            while value.startswith("www."):
                value = value[4:]
    elif attr == "normalized_path":
        # NOTE: early returns — this attribute is intentionally not memoized.
        if self.parsed.path == "/":
            return ""
        return self.parsed.path
    # https://en.wikipedia.org/wiki/Public_Suffix_List
    # Returns the domain name suffix ("co.uk" for "bbc.co.uk")
    elif attr == "suffix":
        value = self.tldextracted[2]
    else:
        raise Exception("Unknown attribute %s !" % attr)
    # Memoize so __getattr__ is not triggered again for this attribute.
    self.__dict__[attr] = value
    return value
def test_unparse_parse(self): for u in ['Python', './Python','x-newscheme://foo.com/stuff','x://y','x:/y','x:/','/',]: self.assertEqual(urlparse.urlunsplit(urlparse.urlsplit(u)), u) self.assertEqual(urlparse.urlunparse(urlparse.urlparse(u)), u)
def __getattr__(self, attr):  # pylint: disable=redefined-variable-type
    """Lazily compute derived URL attributes (bytes variant).

    Called only when *attr* is not yet in ``self.__dict__``; the computed
    value is memoized there at the end, so each attribute is computed at
    most once per instance (except ``normalized_path``, which returns
    early and is deliberately never cached). Values are byte strings.
    """
    if attr == "parsed":
        # SplitResultBytes of the raw URL; other attributes derive from it.
        # try:
        value = urlparse.urlsplit(self.url)
        # except ValueError:
        #     value = urlparse.urlsplit("about:blank")
    elif attr == "tldextracted":
        # (subdomain, domain, suffix) triple from the public-suffix list.
        value = tld_extract(self.parsed.netloc)
        # value = _tldextractor(self.url)
    elif attr == "normalized":
        # Scheme-less b"domain/path?query" form; scheme=None is falsy so
        # urlunsplit emits no scheme prefix.
        value = urlparse.urlunsplit((
            None,
            self.normalized_domain,
            self.parsed.path if self.parsed.path else b"/",
            self.parsed.query,
            b""
        )).lstrip(b"/")
        # A single b"/" means path was just b"/": drop the trailing slash too.
        if value.count(b"/") == 1:
            value = value.strip(b"/")
    elif attr == "normalized_without_query":
        # Same as "normalized" but with the query string dropped.
        value = urlparse.urlunsplit((
            None,
            self.normalized_domain,
            self.parsed.path if self.parsed.path else b"/",
            b"",
            b""
        )).lstrip(b"/")
        if value.count(b"/") == 1:
            value = value.strip(b"/")
    elif attr == "homepage":
        # b"scheme://domain" with any trailing slash removed.
        value = urlparse.urlunsplit((
            self.parsed.scheme,
            self.domain,
            b"/",
            b"",
            b""
        )).strip(b"/")
    # Pay-level domain
    elif attr == "pld":
        # e.g. b"bbc.co.uk" from (b"www", b"bbc", b"co.uk").
        value = b"%s.%s" % (self.tldextracted[1], self.tldextracted[2])
    elif attr == "domain":
        value = self.parsed.netloc
    elif attr == "subdomain":
        value = self.tldextracted[0]
    elif attr == "normalized_domain":
        # Strip surrounding dots, leading b"www." prefixes, then default ports.
        value = self.domain.strip(b".")
        while value.startswith(b"www."):
            value = value[4:]
        if value.endswith(b':80'):
            value = value[:-3]
        elif value.endswith(b':443'):
            value = value[:-4]
        value = value.strip(b".")
    elif attr == "normalized_subdomain":
        value = self.subdomain.strip(b".")
        # A bare b"www" subdomain counts as no subdomain at all.
        if value == b"www":
            value = b""
        else:
            while value.startswith(b"www."):
                value = value[4:]
    elif attr == "normalized_path":
        # NOTE: early returns — this attribute is intentionally not memoized.
        if self.parsed.path == b"/":
            return b""
        return self.parsed.path
    # https://en.wikipedia.org/wiki/Public_Suffix_List
    # Returns the domain name suffix ("co.uk" for "bbc.co.uk")
    elif attr == "suffix":
        value = self.tldextracted[2]
    else:
        raise Exception("Unknown attribute %s !" % attr)
    # Memoize so __getattr__ is not triggered again for this attribute.
    self.__dict__[attr] = value
    return value