Beispiel #1
0
def test_options():
    """Check the ``query_lowercase`` option on both IA canonicalizers.

    Each canonicalizer is run twice on the same URL: once with its default
    options, where the query string is expected back in lower case, and once
    with ``query_lowercase=False``, where it is expected back unchanged.
    """
    raw_url = 'http://example.com/foo?X=Y'
    lowered_url = 'http://example.com/foo?x=y'

    for canonicalizer in (surt.IAURLCanonicalizer,
                          surt.DefaultIAURLCanonicalizer):
        with_defaults = canonicalizer.canonicalize(handyurl.parse(raw_url))
        assert with_defaults.getURLString() == lowered_url

        case_preserved = canonicalizer.canonicalize(
            handyurl.parse(raw_url), query_lowercase=False)
        assert case_preserved.getURLString() == raw_url
Beispiel #2
0
def normalize_url(url: typ.URL) -> typ.NormalizedURL:
    """Return the canonical form of *url* with secure schemes downgraded.

    The URL is parsed with ``handyurl``, canonicalized and rendered back to
    a string. ``https://`` is then rewritten to ``http://`` and ``sftp://``
    to ``ftp://``.
    """
    canonical = canonicalize(handyurl.parse(url)).geturl()
    # NOTE(review): str.replace rewrites every occurrence, so an
    # "https://" embedded later in the URL (e.g. in a query value) is
    # rewritten as well - confirm this is intended.
    without_https = canonical.replace('https://', 'http://')
    return without_https.replace('sftp://', 'ftp://')
Beispiel #3
0
def test_DefaultIAURLCanonicalizer():
    """Run the DefaultIAURLCanonicalizer cases as (input, expected) pairs."""
    # These cases are from DefaultIAURLCanonicalizerTest.java
    cases = [
        ("http://www.alexa.com/",
         'http://alexa.com/'),
        ("http://archive.org/index.html",
         'http://archive.org/index.html'),
        ("http://archive.org/index.html?",
         'http://archive.org/index.html'),
        ("http://archive.org/index.html?a=b",
         'http://archive.org/index.html?a=b'),
        ("http://archive.org/index.html?b=b&a=b",
         'http://archive.org/index.html?a=b&b=b'),
        ("http://archive.org/index.html?b=a&b=b&a=b",
         'http://archive.org/index.html?a=b&b=a&b=b'),
        ("http://www34.archive.org/index.html?b=a&b=b&a=b",
         'http://archive.org/index.html?a=b&b=a&b=b'),
    ]

    for raw, expected in cases:
        canonical = surt.DefaultIAURLCanonicalizer.canonicalize(
            handyurl.parse(raw))
        assert canonical.getURLString() == expected
Beispiel #4
0
def test_IAURLCanonicalizer():
    """Run the IAURLCanonicalizer cases as (input, expected) pairs."""
    # These cases are from IAURLCanonicalizerTest.java
    cases = [
        ("http://ARCHIVE.ORG/", 'http://archive.org/'),
        ("http://www.archive.org:80/", 'http://archive.org/'),
        ("https://www.archive.org:80/", 'https://archive.org:80/'),
        ("http://www.archive.org:443/", 'http://archive.org:443/'),
        ("https://www.archive.org:443/", 'https://archive.org/'),
        ("http://www.archive.org/big/", 'http://archive.org/big'),
        ("dns:www.archive.org", 'dns:www.archive.org'),
    ]

    for raw, expected in cases:
        canonical = surt.IAURLCanonicalizer.canonicalize(handyurl.parse(raw))
        assert canonical.getURLString() == expected
Beispiel #5
0
def getCanonicalUrl(URL):
    """Return a scheme-less canonical form of *URL*.

    The URL is stripped of surrounding whitespace, parsed with ``handyurl``,
    canonicalized, and split with ``urllib.parse.urlparse``. The result is
    the plain concatenation ``netloc + path + params + query + fragment``
    (no ``;``, ``?`` or ``#`` delimiters are re-inserted) with one trailing
    ``/`` removed.

    Args:
        URL: URL string to canonicalize.

    Returns:
        The canonical string, or ``''`` when *URL* is empty or consists
        only of whitespace.
    """
    URL = URL.strip()
    if not URL:
        # Nothing to canonicalize. Returning early avoids indexing into an
        # empty string, which used to raise IndexError for blank input.
        return ''

    canonicalURL = canonicalize(handyurl.parse(URL)).getURLString()

    # The scheme is deliberately discarded; only the remaining parts are kept.
    _scheme, netloc, path, params, query, fragment = urllib.parse.urlparse(
        canonicalURL)
    returnValue = netloc + path + params + query + fragment

    # Normalize: drop a single trailing slash. endswith() is also safe if
    # every component happened to be empty.
    if returnValue.endswith('/'):
        returnValue = returnValue[:-1]

    return returnValue
Beispiel #6
0
def test_DefaultIAURLCanonicalizer():
    """Check DefaultIAURLCanonicalizer output for a fixed set of URLs."""

    def canon(raw):
        # Parse, canonicalize and render back to a string in one step.
        parsed = handyurl.parse(raw)
        return surt.DefaultIAURLCanonicalizer.canonicalize(
            parsed).getURLString()

    # These tests are from DefaultIAURLCanonicalizerTest.java
    assert canon("http://www.alexa.com/") == 'http://alexa.com/'

    plain = 'http://archive.org/index.html'
    assert canon("http://archive.org/index.html") == plain
    assert canon("http://archive.org/index.html?") == plain

    assert canon("http://archive.org/index.html?a=b") == plain + '?a=b'
    assert canon("http://archive.org/index.html?b=b&a=b") == (
        'http://archive.org/index.html?a=b&b=b')

    three_params = 'http://archive.org/index.html?a=b&b=a&b=b'
    assert canon(
        "http://archive.org/index.html?b=a&b=b&a=b") == three_params
    assert canon(
        "http://www34.archive.org/index.html?b=a&b=b&a=b") == three_params
Beispiel #7
0
def test_IAURLCanonicalizer():
    """Check IAURLCanonicalizer output for a fixed set of URLs."""

    def canon(raw):
        # Parse, canonicalize and render back to a string in one step.
        parsed = handyurl.parse(raw)
        return surt.IAURLCanonicalizer.canonicalize(parsed).getURLString()

    # These tests are from IAURLCanonicalizerTest.java
    assert canon("http://ARCHIVE.ORG/") == 'http://archive.org/'

    # Scheme / port combinations.
    assert canon("http://www.archive.org:80/") == 'http://archive.org/'
    assert canon("https://www.archive.org:80/") == 'https://archive.org:80/'
    assert canon("http://www.archive.org:443/") == 'http://archive.org:443/'
    assert canon("https://www.archive.org:443/") == 'https://archive.org/'

    assert canon("http://www.archive.org/big/") == 'http://archive.org/big'
    assert canon("dns:www.archive.org") == 'dns:www.archive.org'
Beispiel #8
0
def test_GoogleURLCanonicalizer():
    """Run the GoogleURLCanonicalizer cases as (input, expected) pairs.

    The pairs are checked in the listed order; each input is parsed with
    ``handyurl.parse``, canonicalized, and compared via ``getURLString()``.
    """
    # The tests are copied from GoogleURLCanonicalizerTest.java
    cases = [
        ("http://host/%25%32%35", 'http://host/%25'),
        ("http://host/%25%32%35%25%32%35", 'http://host/%25%25'),
        ("http://host/%2525252525252525", 'http://host/%25'),
        ("http://host/asdf%25%32%35asd", 'http://host/asdf%25asd'),
        ("http://host/%%%25%32%35asd%%", 'http://host/%25%25%25asd%25%25'),
        ("http://www.google.com/", 'http://www.google.com/'),
        ("http://%31%36%38%2e%31%38%38%2e%39%39%2e%32%36/%2E%73%65%63%75%72%65/%77%77%77%2E%65%62%61%79%2E%63%6F%6D/",
         'http://168.188.99.26/.secure/www.ebay.com/'),
        ("http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/",
         'http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/'),
        ("http://host%23.com/%257Ea%2521b%2540c%2523d%2524e%25f%255E00%252611%252A22%252833%252944_55%252B",
         'http://host%23.com/~a!b@c%23d$e%25f^00&11*22(33)44_55+'),
        ("http://3279880203/blah", 'http://195.127.0.11/blah'),
        ("http://www.google.com/blah/..", 'http://www.google.com/'),
        ("www.google.com/", 'http://www.google.com/'),
        ("www.google.com", 'http://www.google.com/'),
        ("http://www.evil.com/blah#frag", 'http://www.evil.com/blah'),
        ("http://www.GOOgle.com/", 'http://www.google.com/'),
        ("http://www.google.com.../", 'http://www.google.com/'),

        ("http://www.google.com/foo\tbar\rbaz\n2",
         'http://www.google.com/foobarbaz2'),

        ("http://www.google.com/q?", 'http://www.google.com/q?'),
        ("http://www.google.com/q?r?", 'http://www.google.com/q?r?'),
        ("http://www.google.com/q?r?s", 'http://www.google.com/q?r?s'),
        ("http://evil.com/foo#bar#baz", 'http://evil.com/foo'),
        ("http://evil.com/foo;", 'http://evil.com/foo;'),
        ("http://evil.com/foo?bar;", 'http://evil.com/foo?bar;'),

        # This case differs from the Java version, which returns
        # 'http://%01%80.com/'. If idna/punycode encoding of a hostname is
        # not possible, the python version encodes unicode domains as utf-8
        # before percent encoding, so we get 'http://%01%C2%80.com/'.
        (u"http://\u0001\u0080.com/", 'http://%01%C2%80.com/'),

        # Unicode hostnames.
        (u'B\xfccher.ch:8080', 'http://xn--bcher-kva.ch:8080/'),
        ('☃.com', 'http://xn--n3h.com/'),

        # Percent-encoded unicode and assorted host/path shapes.
        ("http://www.t%EF%BF%BD%04.82.net/",
         'http://www.t%EF%BF%BD%04.82.net/'),
        ("http://notrailingslash.com", 'http://notrailingslash.com/'),
        ("http://www.gotaport.com:1234/", 'http://www.gotaport.com:1234/'),
        ("  http://www.google.com/  ", 'http://www.google.com/'),
        ("http:// leadingspace.com/", 'http://%20leadingspace.com/'),
        ("http://%20leadingspace.com/", 'http://%20leadingspace.com/'),
        ("%20leadingspace.com/", 'http://%20leadingspace.com/'),
        ("https://www.securesite.com/", 'https://www.securesite.com/'),
        ("http://host.com/ab%23cd", 'http://host.com/ab%23cd'),
        ("http://host.com//twoslashes?more//slashes",
         'http://host.com/twoslashes?more//slashes'),
        # NOTE(review): the address below looks like an e-mail obfuscation
        # placeholder rather than a real address - confirm the intended
        # literal.
        ("mailto:[email protected]", 'mailto:[email protected]'),
    ]

    for raw, expected in cases:
        canonical = surt.GoogleURLCanonicalizer.canonicalize(
            handyurl.parse(raw))
        assert canonical.getURLString() == expected
Beispiel #9
0
def test_handyurl_parse():
    """Check ``handyurl.parse`` round-trips and scheme detection.

    Each (input, expected) pair is parsed and rendered back with
    ``geturl()``. A mailto URL is additionally checked for its ``scheme``
    attribute.
    """
    # These tests come from URLParserTest.java
    round_trips = [
        ("http://www.archive.org/index.html#foo",
         'http://www.archive.org/index.html#foo'),
        ("http://www.archive.org/", 'http://www.archive.org/'),
        ("http://www.archive.org", 'http://www.archive.org'),
        ("http://www.archive.org?", 'http://www.archive.org?'),
        ("http://www.archive.org:8080/index.html?query#foo",
         'http://www.archive.org:8080/index.html?query#foo'),
        ("http://www.archive.org:8080/index.html?#foo",
         'http://www.archive.org:8080/index.html#foo'),
        ("http://www.archive.org:8080?#foo",
         'http://www.archive.org:8080/#foo'),
        # Non-ASCII hosts, expected once as a literal and once as an escape.
        (u"http://bücher.ch:8080?#foo", u'http://bücher.ch:8080/#foo'),
        (u"dns:bücher.ch", u'dns:bücher.ch'),
        (u"http://bücher.ch:8080?#foo", u"http://b\xfccher.ch:8080/#foo"),
        (u"dns:bücher.ch", u"dns:b\xfccher.ch"),
        ###From Tymm:
        ("http:////////////////www.vikings.com", 'http://www.vikings.com/'),
    ]
    for raw, expected in round_trips:
        assert handyurl.parse(raw).geturl() == expected

    # The scheme check previously parsed "http://*****:*****@archive.org",
    # which cannot yield a mailto scheme. It now uses the same mailto URL as
    # the round-trip assertion below.
    # NOTE(review): the address looks like an e-mail obfuscation placeholder
    # rather than a real address - confirm the intended literal.
    mailto_url = "mailto:[email protected]"
    assert handyurl.parse(mailto_url).scheme == b'mailto'
    assert handyurl.parse(mailto_url).geturl() == 'mailto:[email protected]'
Beispiel #10
0
def test_handyurl_parse():
    """Check ``handyurl.parse`` round-trips and scheme detection.

    Each (input, expected) pair is parsed and rendered back with
    ``geturl()``. A mailto URL is additionally checked for its ``scheme``
    attribute.
    """
    # These tests come from URLParserTest.java
    round_trips = [
        ("http://www.archive.org/index.html#foo", 'http://www.archive.org/index.html#foo'),
        ("http://www.archive.org/", 'http://www.archive.org/'),
        ("http://www.archive.org", 'http://www.archive.org'),
        ("http://www.archive.org?", 'http://www.archive.org?'),
        ("http://www.archive.org:8080/index.html?query#foo", 'http://www.archive.org:8080/index.html?query#foo'),
        ("http://www.archive.org:8080/index.html?#foo", 'http://www.archive.org:8080/index.html#foo'),
        ("http://www.archive.org:8080?#foo", 'http://www.archive.org:8080/#foo'),
        # Non-ASCII hosts, expected once as a literal and once as an escape.
        (u"http://bücher.ch:8080?#foo", u'http://bücher.ch:8080/#foo'),
        (u"dns:bücher.ch", u'dns:bücher.ch'),
        (u"http://bücher.ch:8080?#foo", u"http://b\xfccher.ch:8080/#foo"),
        (u"dns:bücher.ch", u"dns:b\xfccher.ch"),
        ###From Tymm:
        ("http:////////////////www.vikings.com", 'http://www.vikings.com/'),
    ]
    for raw, expected in round_trips:
        assert handyurl.parse(raw).geturl() == expected

    # The scheme check previously parsed "http://*****:*****@archive.org",
    # which cannot yield a mailto scheme. It now uses the same mailto URL as
    # the round-trip assertion below.
    # NOTE(review): the address looks like an e-mail obfuscation placeholder
    # rather than a real address - confirm the intended literal.
    mailto_url = "mailto:[email protected]"
    assert handyurl.parse(mailto_url).scheme == 'mailto'
    assert handyurl.parse(mailto_url).geturl() == 'mailto:[email protected]'
Beispiel #11
0
def test_GoogleURLCanonicalizer():
    """Check GoogleURLCanonicalizer output for a fixed sequence of URLs.

    Every case goes through the local ``check`` helper, which parses the
    input with ``handyurl.parse``, canonicalizes it, and asserts on the
    string returned by ``getURLString()``.
    """

    def check(raw, expected):
        canonical = surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse(raw))
        assert canonical.getURLString() == expected

    # The tests are copied from GoogleURLCanonicalizerTest.java
    check("http://host/%25%32%35", 'http://host/%25')
    check("http://host/%25%32%35%25%32%35", 'http://host/%25%25')
    check("http://host/%2525252525252525", 'http://host/%25')
    check("http://host/asdf%25%32%35asd", 'http://host/asdf%25asd')
    check("http://host/%%%25%32%35asd%%", 'http://host/%25%25%25asd%25%25')
    check("http://www.google.com/", 'http://www.google.com/')
    check("http://%31%36%38%2e%31%38%38%2e%39%39%2e%32%36/%2E%73%65%63%75%72%65/%77%77%77%2E%65%62%61%79%2E%63%6F%6D/", 'http://168.188.99.26/.secure/www.ebay.com/')
    check("http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/", 'http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/')
    check("http://host%23.com/%257Ea%2521b%2540c%2523d%2524e%25f%255E00%252611%252A22%252833%252944_55%252B", 'http://host%23.com/~a!b@c%23d$e%25f^00&11*22(33)44_55+')
    check("http://3279880203/blah", 'http://195.127.0.11/blah')
    check("http://www.google.com/blah/..", 'http://www.google.com/')
    check("www.google.com/", 'http://www.google.com/')
    check("www.google.com", 'http://www.google.com/')
    check("http://www.evil.com/blah#frag", 'http://www.evil.com/blah')
    check("http://www.GOOgle.com/", 'http://www.google.com/')
    check("http://www.google.com.../", 'http://www.google.com/')

    check("http://www.google.com/foo\tbar\rbaz\n2", 'http://www.google.com/foobarbaz2')

    check("http://www.google.com/q?", 'http://www.google.com/q?')
    check("http://www.google.com/q?r?", 'http://www.google.com/q?r?')
    check("http://www.google.com/q?r?s", 'http://www.google.com/q?r?s')
    check("http://evil.com/foo#bar#baz", 'http://evil.com/foo')
    check("http://evil.com/foo;", 'http://evil.com/foo;')
    check("http://evil.com/foo?bar;", 'http://evil.com/foo?bar;')

    # This case differs from the Java version, which returns
    # 'http://%01%80.com/'. If idna/punycode encoding of a hostname is not
    # possible, the python version encodes unicode domains as utf-8 before
    # percent encoding, so we get 'http://%01%C2%80.com/'.
    check(u"http://\u0001\u0080.com/", 'http://%01%C2%80.com/')

    # Unicode hostnames.
    check(u'B\xfccher.ch:8080', 'http://xn--bcher-kva.ch:8080/')
    check('☃.com', 'http://xn--n3h.com/')

    # Percent-encoded unicode and assorted host/path shapes.
    check("http://www.t%EF%BF%BD%04.82.net/", 'http://www.t%EF%BF%BD%04.82.net/')
    check("http://notrailingslash.com", 'http://notrailingslash.com/')
    check("http://www.gotaport.com:1234/", 'http://www.gotaport.com:1234/')
    check("  http://www.google.com/  ", 'http://www.google.com/')
    check("http:// leadingspace.com/", 'http://%20leadingspace.com/')
    check("http://%20leadingspace.com/", 'http://%20leadingspace.com/')
    check("%20leadingspace.com/", 'http://%20leadingspace.com/')
    check("https://www.securesite.com/", 'https://www.securesite.com/')
    check("http://host.com/ab%23cd", 'http://host.com/ab%23cd')
    check("http://host.com//twoslashes?more//slashes", 'http://host.com/twoslashes?more//slashes')
    # NOTE(review): the address below looks like an e-mail obfuscation
    # placeholder rather than a real address - confirm the intended literal.
    check("mailto:[email protected]", 'mailto:[email protected]')
Beispiel #12
0
def test_options():
    """Check the ``query_lowercase`` option on both IA canonicalizers.

    Each row names a canonicalizer, the keyword options passed to its
    ``canonicalize`` call, and the URL string expected back.
    """
    source = 'http://example.com/foo?X=Y'
    lowered = 'http://example.com/foo?x=y'
    keep_case = {'query_lowercase': False}

    expectations = (
        (surt.IAURLCanonicalizer, {}, lowered),
        (surt.IAURLCanonicalizer, keep_case, source),
        (surt.DefaultIAURLCanonicalizer, {}, lowered),
        (surt.DefaultIAURLCanonicalizer, keep_case, source),
    )
    for canonicalizer, options, expected in expectations:
        result = canonicalizer.canonicalize(handyurl.parse(source), **options)
        assert result.getURLString() == expected