Beispiel #1
0
 def test_unparse_parse(self):
     """Check that split/unsplit and parse/unparse each reproduce the input URL."""
     samples = (
         'Python',
         './Python',
         'x-newscheme://foo.com/stuff',
         'x://y',
         'x:/y',
         'x:/',
         '/',
     )
     for url in samples:
         # urlsplit() -> urlunsplit() must give back the original string.
         via_split = urlparse.urlunsplit(urlparse.urlsplit(url))
         self.assertEqual(via_split, url)
         # urlparse() -> urlunparse() must do the same.
         via_parse = urlparse.urlunparse(urlparse.urlparse(url))
         self.assertEqual(via_parse, url)
    def checkRoundtrips(self, url, parsed, split):
        """Verify that *url* parses, unparses and re-parses consistently.

        ``parsed`` is the 6-tuple expected from ``urlparse.urlparse(url)`` and
        ``split`` is the 5-tuple expected from ``urlparse.urlsplit(url)``.
        Every check is made through ``self.assertEqual``.
        """
        # Result attributes compared in addition to the tuple fields.
        extra_attrs = ('username', 'password', 'hostname', 'port')

        # ---- urlparse() / urlunparse() ----
        parse_fields = ('scheme', 'netloc', 'path',
                        'params', 'query', 'fragment')
        first = urlparse.urlparse(url)
        self.assertEqual(first, parsed)
        by_name = tuple(getattr(first, field) for field in parse_fields)
        self.assertEqual(by_name, parsed)

        # Reassembling the pieces must give back the original URL.
        rebuilt = urlparse.urlunparse(first)
        self.assertEqual(rebuilt, url)
        self.assertEqual(rebuilt, first.geturl())

        # geturl() is a fixpoint: parsing its output yields the same result.
        again = urlparse.urlparse(first.geturl())
        self.assertEqual(again.geturl(), first.geturl())
        self.assertEqual(again, first)
        for field in parse_fields + extra_attrs:
            self.assertEqual(getattr(again, field), getattr(first, field))

        # ---- urlsplit() / urlunsplit() ----
        split_fields = ('scheme', 'netloc', 'path', 'query', 'fragment')
        first = urlparse.urlsplit(url)
        self.assertEqual(first, split)
        by_name = tuple(getattr(first, field) for field in split_fields)
        self.assertEqual(by_name, split)

        rebuilt = urlparse.urlunsplit(first)
        self.assertEqual(rebuilt, url)
        self.assertEqual(rebuilt, first.geturl())

        # Same fixpoint property for the split form.
        again = urlparse.urlsplit(first.geturl())
        self.assertEqual(again.geturl(), first.geturl())
        self.assertEqual(again, first)
        for field in split_fields + extra_attrs:
            self.assertEqual(getattr(again, field), getattr(first, field))
Beispiel #3
0
    def checkRoundtrips(self, url, parsed, split):
        """Assert that *url* survives parse/unparse and split/unsplit intact.

        ``parsed`` is the expected ``urlparse.urlparse(url)`` 6-tuple and
        ``split`` the expected ``urlparse.urlsplit(url)`` 5-tuple.  For each
        API the URL is taken apart, compared with the expectation, put back
        together, and the ``geturl()`` output is parsed once more to confirm
        that it is a fixpoint.
        """
        # Result attributes compared on top of the tuple fields.
        extra_attrs = ("username", "password", "hostname", "port")
        variants = (
            (urlparse.urlparse, urlparse.urlunparse, parsed,
             ("scheme", "netloc", "path", "params", "query", "fragment")),
            # check the roundtrip using urlsplit() as well
            (urlparse.urlsplit, urlparse.urlunsplit, split,
             ("scheme", "netloc", "path", "query", "fragment")),
        )

        for take_apart, reassemble, expected, fields in variants:
            result = take_apart(url)
            self.assertEqual(result, expected)
            as_tuple = tuple(getattr(result, name) for name in fields)
            self.assertEqual(as_tuple, expected)

            # put it back together and it should be the same
            rebuilt = reassemble(result)
            self.assertEqual(rebuilt, url)
            self.assertEqual(rebuilt, result.geturl())

            # the result of geturl() is a fixpoint; we can always parse it
            # again to get the same result:
            reparsed = take_apart(result.geturl())
            self.assertEqual(reparsed.geturl(), result.geturl())
            self.assertEqual(reparsed, result)
            for name in fields + extra_attrs:
                self.assertEqual(getattr(reparsed, name),
                                 getattr(result, name))
Beispiel #4
0
    def __init__(self, url, check_encoding=False):
        """Store *url* on the instance, optionally quoting non-ASCII URLs.

        A ``unicode`` *url* is encoded to UTF-8; any other value is stored
        unchanged.  When ``check_encoding`` is true and the stored URL does
        not decode as ASCII, it is split and its path, query and fragment are
        passed through ``urllib.quote`` before being reassembled.  Scheme and
        netloc are left as they are.
        """
        self.url = url.encode("utf-8") if isinstance(url, unicode) else url

        if not check_encoding:
            return

        try:
            self.url.decode('ascii')
        except UnicodeDecodeError:
            scheme, netloc, path, query, fragment = urlparse.urlsplit(self.url)

            # TODO: check the rightfulness of this!
            # NOTE(review): "%" is in none of the safe sets, so percent-escapes
            # already present in the URL look like they get quoted a second
            # time -- confirm whether that is intended.
            quoted_parts = (
                scheme,
                netloc,
                urllib.quote(path, safe="/"),
                urllib.quote(query, safe="&?="),
                urllib.quote(fragment),
            )
            self.url = urlparse.urlunsplit(quoted_parts)
Beispiel #5
0
    def __init__(self, url, check_encoding=False):
        """Keep *url* on the instance, optionally quoting non-ASCII URLs.

        A ``py2_unicode`` *url* is encoded to UTF-8 first; any other value is
        kept unchanged.  With ``check_encoding`` enabled, a URL that cannot be
        decoded as ASCII is split, its path, query and fragment are run
        through ``urllib.quote``, and the pieces are joined again.  Scheme and
        netloc are not modified.
        """
        if isinstance(url, py2_unicode):
            url = url.encode("utf-8")
        self.url = url

        if check_encoding:
            try:
                self.url.decode('ascii')
            except UnicodeDecodeError:
                pieces = urlparse.urlsplit(self.url)
                scheme, netloc = pieces[0], pieces[1]

                # TODO: check the rightfulness of this!
                # NOTE(review): "%" is not listed as safe anywhere, so escapes
                # already in the URL appear to be quoted again -- confirm.
                path = urllib.quote(pieces[2], safe=b"/")
                query = urllib.quote(pieces[3], safe=b"&?=")
                fragment = urllib.quote(pieces[4])

                self.url = urlparse.urlunsplit(
                    (scheme, netloc, path, query, fragment)
                )
Beispiel #6
0
    def __getattr__(self, attr):
        """Lazily compute a derived URL attribute and cache it on the instance.

        Python only calls ``__getattr__`` when normal attribute lookup fails,
        so each value is computed once, stored in ``self.__dict__`` and served
        from there afterwards.

        Supported names:

        * ``parsed`` -- ``urlparse.urlsplit(self.url)``
        * ``tldextracted`` -- ``tld_extract(self.parsed.netloc)``
        * ``normalized`` -- normalized domain + path + query, without scheme
          or fragment
        * ``normalized_without_query`` -- like ``normalized``, without query
        * ``homepage`` -- scheme and domain only, no trailing slash
        * ``pld`` -- items 1 and 2 of ``tldextracted`` joined by a dot
        * ``domain`` -- the netloc of ``parsed``
        * ``subdomain`` -- item 0 of ``tldextracted``
        * ``normalized_domain`` -- ``domain`` without leading ``www.`` and
          without a trailing ``:80`` / ``:443``
        * ``normalized_subdomain`` -- ``subdomain`` without leading ``www``
        * ``normalized_path`` -- the path, with a bare ``/`` mapped to ``""``
        * ``suffix`` -- item 2 of ``tldextracted``

        Raises:
            AttributeError: if ``attr`` is not one of the names above.
        """
        # pylint: disable=redefined-variable-type

        if attr == "parsed":
            # Errors raised by urlsplit() are deliberately not caught here.
            value = urlparse.urlsplit(self.url)

        elif attr == "tldextracted":
            value = tld_extract(self.parsed.netloc)

        elif attr == "normalized":
            # With no scheme, urlunsplit() yields "//domain/path?query"; the
            # leading slashes are then removed.
            value = urlparse.urlunsplit((
                None,
                self.normalized_domain,
                self.parsed.path if self.parsed.path else "/",
                self.parsed.query,
                ""
            )).lstrip("/")

            # A single "/" is dropped only when it sits at an end of the
            # string (e.g. "domain/" -> "domain").
            if value.count("/") == 1:
                value = value.strip("/")

        elif attr == "normalized_without_query":
            value = urlparse.urlunsplit((
                None,
                self.normalized_domain,
                self.parsed.path if self.parsed.path else "/",
                "",
                ""
            )).lstrip("/")

            if value.count("/") == 1:
                value = value.strip("/")

        elif attr == "homepage":
            value = urlparse.urlunsplit((
                self.parsed.scheme,
                self.domain,
                "/",
                "",
                ""
            )).strip("/")

        # Pay-level domain
        elif attr == "pld":
            value = "%s.%s" % (self.tldextracted[1], self.tldextracted[2])

        elif attr == "domain":
            value = self.parsed.netloc

        elif attr == "subdomain":
            value = self.tldextracted[0]

        elif attr == "normalized_domain":

            value = self.domain.strip(".")

            while value.startswith("www."):
                value = value[4:]

            # Drop the port when it is 80 or 443.
            if value.endswith(':80'):
                value = value[:-3]
            elif value.endswith(':443'):
                value = value[:-4]

            value = value.strip(".")

        elif attr == "normalized_subdomain":

            value = self.subdomain.strip(".")

            if value == "www":
                value = ""
            else:
                while value.startswith("www."):
                    value = value[4:]

        elif attr == "normalized_path":
            # Assign instead of returning early so this value is cached in
            # __dict__ like every other attribute.
            value = "" if self.parsed.path == "/" else self.parsed.path

        # https://en.wikipedia.org/wiki/Public_Suffix_List
        # Returns the domain name suffix ("co.uk" for "bbc.co.uk")
        elif attr == "suffix":
            value = self.tldextracted[2]

        else:
            # __getattr__ must signal a missing attribute with AttributeError,
            # otherwise getattr(obj, name, default), hasattr() on Python 3 and
            # copy.deepcopy() see an unexpected error.  AttributeError is an
            # Exception subclass, so "except Exception" callers still work.
            raise AttributeError("Unknown attribute %s !" % attr)

        self.__dict__[attr] = value
        return value
 def test_unparse_parse(self):
     """Round-trip sample URLs through split/unsplit and parse/unparse."""
     roundtrips = (
         (urlparse.urlsplit, urlparse.urlunsplit),
         (urlparse.urlparse, urlparse.urlunparse),
     )
     for u in ('Python', './Python', 'x-newscheme://foo.com/stuff',
               'x://y', 'x:/y', 'x:/', '/'):
         # Split form is checked first, then the parse form.
         for take_apart, reassemble in roundtrips:
             self.assertEqual(reassemble(take_apart(u)), u)
Beispiel #8
0
    def __getattr__(self, attr):
        """Lazily compute a derived URL attribute and cache it on the instance.

        Python only calls ``__getattr__`` when normal attribute lookup fails,
        so each value is computed once, stored in ``self.__dict__`` and served
        from there afterwards.  The literals used here are byte strings.

        Supported names:

        * ``parsed`` -- ``urlparse.urlsplit(self.url)``
        * ``tldextracted`` -- ``tld_extract(self.parsed.netloc)``
        * ``normalized`` -- normalized domain + path + query, without scheme
          or fragment
        * ``normalized_without_query`` -- like ``normalized``, without query
        * ``homepage`` -- scheme and domain only, no trailing slash
        * ``pld`` -- items 1 and 2 of ``tldextracted`` joined by a dot
        * ``domain`` -- the netloc of ``parsed``
        * ``subdomain`` -- item 0 of ``tldextracted``
        * ``normalized_domain`` -- ``domain`` without leading ``www.`` and
          without a trailing ``:80`` / ``:443``
        * ``normalized_subdomain`` -- ``subdomain`` without leading ``www``
        * ``normalized_path`` -- the path, with a bare ``/`` mapped to ``b""``
        * ``suffix`` -- item 2 of ``tldextracted``

        Raises:
            AttributeError: if ``attr`` is not one of the names above.
        """
        # pylint: disable=redefined-variable-type

        if attr == "parsed":
            # Errors raised by urlsplit() are deliberately not caught here.
            value = urlparse.urlsplit(self.url)

        elif attr == "tldextracted":
            value = tld_extract(self.parsed.netloc)

        elif attr == "normalized":
            # With no scheme, urlunsplit() yields "//domain/path?query"; the
            # leading slashes are then removed.
            value = urlparse.urlunsplit((
                None,
                self.normalized_domain,
                self.parsed.path if self.parsed.path else b"/",
                self.parsed.query,
                b""
            )).lstrip(b"/")

            # A single "/" is dropped only when it sits at an end of the
            # string (e.g. "domain/" -> "domain").
            if value.count(b"/") == 1:
                value = value.strip(b"/")

        elif attr == "normalized_without_query":
            value = urlparse.urlunsplit((
                None,
                self.normalized_domain,
                self.parsed.path if self.parsed.path else b"/",
                b"",
                b""
            )).lstrip(b"/")

            if value.count(b"/") == 1:
                value = value.strip(b"/")

        elif attr == "homepage":
            value = urlparse.urlunsplit((
                self.parsed.scheme,
                self.domain,
                b"/",
                b"",
                b""
            )).strip(b"/")

        # Pay-level domain
        elif attr == "pld":
            value = b"%s.%s" % (self.tldextracted[1], self.tldextracted[2])

        elif attr == "domain":
            value = self.parsed.netloc

        elif attr == "subdomain":
            value = self.tldextracted[0]

        elif attr == "normalized_domain":

            value = self.domain.strip(b".")

            while value.startswith(b"www."):
                value = value[4:]

            # Drop the port when it is 80 or 443.
            if value.endswith(b':80'):
                value = value[:-3]
            elif value.endswith(b':443'):
                value = value[:-4]

            value = value.strip(b".")

        elif attr == "normalized_subdomain":

            value = self.subdomain.strip(b".")

            if value == b"www":
                value = b""
            else:
                while value.startswith(b"www."):
                    value = value[4:]

        elif attr == "normalized_path":
            # Assign instead of returning early so this value is cached in
            # __dict__ like every other attribute.
            value = b"" if self.parsed.path == b"/" else self.parsed.path

        # https://en.wikipedia.org/wiki/Public_Suffix_List
        # Returns the domain name suffix ("co.uk" for "bbc.co.uk")
        elif attr == "suffix":
            value = self.tldextracted[2]

        else:
            # __getattr__ must signal a missing attribute with AttributeError,
            # otherwise getattr(obj, name, default), hasattr() on Python 3 and
            # copy.deepcopy() see an unexpected error.  AttributeError is an
            # Exception subclass, so "except Exception" callers still work.
            raise AttributeError("Unknown attribute %s !" % attr)

        self.__dict__[attr] = value
        return value