Python canonicalize_url Exemples, crawlmi.utils.url.canonicalize_url Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : request.py Projet : Mimino666/crawlmi

def request_fingerprint(request, include_headers=None):
    '''Return the request fingerprint.

    The request fingerprint is a hash that uniquely identifies the resource the
    request points to. For example, take the following two urls:

    http://www.example.com/query?id=111&cat=222
    http://www.example.com/query?cat=222&id=111

    Even though those are two different URLs, both point to the same resource
    and are equivalent (i.e. they should return the same response).

    Another example is cookies used to store session ids. Suppose the
    following page is only accessible to authenticated users:

    http://www.example.com/members/offers.html

    Lots of sites use a cookie to store the session id, which adds a random
    component to the HTTP Request and thus should be ignored when calculating
    the fingerprint.

    For this reason, request headers are ignored by default when calculating
    the fingerprint. If you want to include specific headers use the
    include_headers argument, which is an iterable of Request header names to
    include. Header names are lowercased and sorted, so neither their case
    nor their order affects the result.

    The hash is fed, in this order: request.method, the canonicalized
    request.url, request.body (or an empty string when the body is falsy) and,
    for every selected header present in request.headers, the lowercased
    header name followed by each value returned by
    request.headers.getlist().

    Fingerprints are memoized in _fingerprint_cache, per request and per
    normalized header selection.

    Returns the SHA-1 hex digest as a string.
    '''

    if include_headers:
        # Lowercase *before* sorting: sorting the raw names first makes the
        # resulting order depend on the caller's capitalisation (e.g.
        # ['B', 'a'] vs ['b', 'A']), which yields different fingerprints for
        # the very same set of headers.
        include_headers = tuple(sorted(h.lower() for h in include_headers))
    else:
        # None, [] and () all mean "no headers". Collapse them to None so the
        # cache key is always hashable (a bare list would raise TypeError).
        include_headers = None
    cache = _fingerprint_cache.setdefault(request, {})
    if include_headers not in cache:
        fp = hashlib.sha1()
        fp.update(request.method)
        fp.update(canonicalize_url(request.url))
        fp.update(request.body or '')
        for hdr in include_headers or ():
            if hdr in request.headers:
                fp.update(hdr)
                for v in request.headers.getlist(hdr):
                    fp.update(v)
        cache[include_headers] = fp.hexdigest()
    return cache[include_headers]

Exemple #2

0

Afficher le fichier

Fichier : test_utils_url.py Projet : Mimino666/crawlmi

    def test_canonicalize_url(self):
        '''Check canonicalize_url() against known input/expected URL pairs.

        Covers query-argument sorting, the keep_blank_values option, space and
        percent-encoding normalisation, auth and ports, fragments (and the
        keep_fragments option), relative paths, host lowercasing, utm tag
        removal and the strip_www option.
        '''
        # simplest case
        self.assertEqual(canonicalize_url('http://www.example.com/'),
                         'http://www.example.com/')

        # always return a str
        # (a unittest assertion is used because a bare ``assert`` is stripped
        # when Python runs with -O, silently disabling the check)
        self.assertTrue(
            isinstance(canonicalize_url(u'http://www.example.com'), str))

        # append missing path
        self.assertEqual(canonicalize_url('http://www.example.com'),
                         'http://www.example.com/')
        # typical usage
        self.assertEqual(canonicalize_url('http://www.example.com/do?a=1&b=2&c=3'),
                         'http://www.example.com/do?a=1&b=2&c=3')
        self.assertEqual(canonicalize_url('http://www.example.com/do?c=1&b=2&a=3'),
                         'http://www.example.com/do?a=3&b=2&c=1')
        self.assertEqual(canonicalize_url('http://www.example.com/do?&a=1'),
                         'http://www.example.com/do?a=1')

        # sorting by argument values
        self.assertEqual(canonicalize_url('http://www.example.com/do?c=3&b=5&b=2&a=50'),
                         'http://www.example.com/do?a=50&b=2&b=5&c=3')

        # using keep_blank_values
        self.assertEqual(canonicalize_url('http://www.example.com/do?b=&a=2', keep_blank_values=False),
                         'http://www.example.com/do?a=2')
        self.assertEqual(canonicalize_url('http://www.example.com/do?b=&a=2'),
                         'http://www.example.com/do?a=2&b=')
        self.assertEqual(canonicalize_url('http://www.example.com/do?b=&c&a=2', keep_blank_values=False),
                         'http://www.example.com/do?a=2')
        self.assertEqual(canonicalize_url('http://www.example.com/do?b=&c&a=2'),
                         'http://www.example.com/do?a=2&b=&c=')

        self.assertEqual(canonicalize_url(u'http://www.example.com/do?1750,4'),
                         'http://www.example.com/do?1750%2C4=')

        # spaces
        self.assertEqual(canonicalize_url('http://www.example.com/do?q=a space&a=1'),
                         'http://www.example.com/do?a=1&q=a+space')
        self.assertEqual(canonicalize_url('http://www.example.com/do?q=a+space&a=1'),
                         'http://www.example.com/do?a=1&q=a+space')
        self.assertEqual(canonicalize_url('http://www.example.com/do?q=a%20space&a=1'),
                         'http://www.example.com/do?a=1&q=a+space')

        # normalize percent-encoding case (in paths)
        self.assertEqual(canonicalize_url('http://www.example.com/a%a3do'),
                         'http://www.example.com/a%A3do')
        # normalize percent-encoding case (in query arguments)
        self.assertEqual(canonicalize_url('http://www.example.com/do?k=b%a3'),
                         'http://www.example.com/do?k=b%A3')

        # non-ASCII percent-encoding in paths
        self.assertEqual(canonicalize_url('http://www.example.com/a do?a=1'),
                         'http://www.example.com/a%20do?a=1')
        self.assertEqual(canonicalize_url('http://www.example.com/a %20do?a=1'),
                         'http://www.example.com/a%20%20do?a=1')
        self.assertEqual(canonicalize_url('http://www.example.com/a do\xc2\xa3.html?a=1'),
                         'http://www.example.com/a%20do%C2%A3.html?a=1')
        # non-ASCII percent-encoding in query arguments
        self.assertEqual(canonicalize_url(u'http://www.example.com/do?price=\xa3500&a=5&z=3'),
                         u'http://www.example.com/do?a=5&price=%C2%A3500&z=3')
        self.assertEqual(canonicalize_url('http://www.example.com/do?price=\xc2\xa3500&a=5&z=3'),
                         'http://www.example.com/do?a=5&price=%C2%A3500&z=3')
        self.assertEqual(canonicalize_url('http://www.example.com/do?price(\xc2\xa3)=500&a=1'),
                         'http://www.example.com/do?a=1&price%28%C2%A3%29=500')

        # urls containing auth and ports
        # NOTE(review): the '*****:*****' userinfo looks like masked
        # credentials -- confirm against the upstream test data.
        self.assertEqual(canonicalize_url(u'http://*****:*****@www.example.com:81/do?now=1'),
                         u'http://*****:*****@www.example.com:81/do?now=1')

        # remove fragments
        self.assertEqual(canonicalize_url(u'http://*****:*****@www.example.com/do?a=1#frag'),
                         u'http://*****:*****@www.example.com/do?a=1')
        self.assertEqual(canonicalize_url(u'http://*****:*****@www.example.com/do?a=1#frag', keep_fragments=True),
                         u'http://*****:*****@www.example.com/do?a=1#frag')

        # don't convert safe characters to percent encoding representation
        self.assertEqual(canonicalize_url(
            'http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html'),
            'http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html')

        # relative paths in url
        self.assertEqual(canonicalize_url(
            'http://www.test.com/./a/./b/../c'),
            'http://www.test.com/a/c')

        # urllib.quote uses a mapping cache of encoded characters. when parsing
        # an already percent-encoded url, it will fail if that url was not
        # percent-encoded as utf-8, that's why canonicalize_url must always
        # convert the urls to string. the following test asserts that
        # functionality.
        self.assertEqual(canonicalize_url(u'http://www.example.com/caf%E9-con-leche.htm'),
                         'http://www.example.com/caf%E9-con-leche.htm')

        # domains are case insensitive
        self.assertEqual(canonicalize_url('http://www.EXAMPLE.com/'),
                         'http://www.example.com/')

        # quoted slash and question sign
        self.assertEqual(canonicalize_url('http://foo.com/AC%2FDC+rocks%3f/?yeah=1'),
                         'http://foo.com/AC%2FDC+rocks%3F/?yeah=1')
        self.assertEqual(canonicalize_url('http://foo.com/AC%2FDC/'),
                         'http://foo.com/AC%2FDC/')

        # utm tags
        self.assertEqual(canonicalize_url('http://gh.com/test?msg=hello&utm_source=gh&utm_medium=320banner&utm_campaign=bpp'),
            'http://gh.com/test?msg=hello')
        self.assertEqual(canonicalize_url('http://gh.com/test?msg=hello#utm_source=gh&utm_medium=320banner&utm_campaign=bpp', keep_fragments=True),
            'http://gh.com/test?msg=hello')
        # when fragment is not query-like, keep utm
        self.assertEqual(canonicalize_url('http://gh.com/test?msg=hello#a;b;utm_source=gh', keep_fragments=True),
            'http://gh.com/test?msg=hello#a;b;utm_source=gh')

        # strip www prefix
        self.assertEqual(canonicalize_url(
            'http://www.example.com/', strip_www=True),
            'http://example.com/')
        self.assertEqual(canonicalize_url(
            'http://example.com/', strip_www=True),
            'http://example.com/')
        self.assertEqual(canonicalize_url(
            'http://www2.example.com/', strip_www=True),
            'http://www2.example.com/')
        self.assertEqual(canonicalize_url(
            'http://www.www.example.com/', strip_www=True),
            'http://www.example.com/')
        self.assertEqual(canonicalize_url(
            'http://*****:*****@www.example.com/', strip_www=True),
            'http://*****:*****@example.com/')
        self.assertEqual(canonicalize_url(
            'http://*****:*****@example.com/', strip_www=True),
            'http://*****:*****@example.com/')

Exemple #3

0

Afficher le fichier

    def test_canonicalize_url(self):
        '''Exercise canonicalize_url() with a series of URL normalisation cases.

        Each assertion pairs an input URL (plus optional keyword arguments such
        as keep_blank_values, keep_fragments or strip_www) with the exact
        canonical form expected back.
        '''
        # simplest case
        self.assertEqual(canonicalize_url('http://www.example.com/'),
                         'http://www.example.com/')

        # always return a str
        # (bare ``assert`` statements are removed under ``python -O``; use a
        # unittest assertion so the type check can never be skipped)
        self.assertTrue(
            isinstance(canonicalize_url(u'http://www.example.com'), str))

        # append missing path
        self.assertEqual(canonicalize_url('http://www.example.com'),
                         'http://www.example.com/')
        # typical usage
        self.assertEqual(
            canonicalize_url('http://www.example.com/do?a=1&b=2&c=3'),
            'http://www.example.com/do?a=1&b=2&c=3')
        self.assertEqual(
            canonicalize_url('http://www.example.com/do?c=1&b=2&a=3'),
            'http://www.example.com/do?a=3&b=2&c=1')
        self.assertEqual(canonicalize_url('http://www.example.com/do?&a=1'),
                         'http://www.example.com/do?a=1')

        # sorting by argument values
        self.assertEqual(
            canonicalize_url('http://www.example.com/do?c=3&b=5&b=2&a=50'),
            'http://www.example.com/do?a=50&b=2&b=5&c=3')

        # using keep_blank_values
        self.assertEqual(
            canonicalize_url('http://www.example.com/do?b=&a=2',
                             keep_blank_values=False),
            'http://www.example.com/do?a=2')
        self.assertEqual(canonicalize_url('http://www.example.com/do?b=&a=2'),
                         'http://www.example.com/do?a=2&b=')
        self.assertEqual(
            canonicalize_url('http://www.example.com/do?b=&c&a=2',
                             keep_blank_values=False),
            'http://www.example.com/do?a=2')
        self.assertEqual(
            canonicalize_url('http://www.example.com/do?b=&c&a=2'),
            'http://www.example.com/do?a=2&b=&c=')

        self.assertEqual(canonicalize_url(u'http://www.example.com/do?1750,4'),
                         'http://www.example.com/do?1750%2C4=')

        # spaces
        self.assertEqual(
            canonicalize_url('http://www.example.com/do?q=a space&a=1'),
            'http://www.example.com/do?a=1&q=a+space')
        self.assertEqual(
            canonicalize_url('http://www.example.com/do?q=a+space&a=1'),
            'http://www.example.com/do?a=1&q=a+space')
        self.assertEqual(
            canonicalize_url('http://www.example.com/do?q=a%20space&a=1'),
            'http://www.example.com/do?a=1&q=a+space')

        # normalize percent-encoding case (in paths)
        self.assertEqual(canonicalize_url('http://www.example.com/a%a3do'),
                         'http://www.example.com/a%A3do')
        # normalize percent-encoding case (in query arguments)
        self.assertEqual(canonicalize_url('http://www.example.com/do?k=b%a3'),
                         'http://www.example.com/do?k=b%A3')

        # non-ASCII percent-encoding in paths
        self.assertEqual(canonicalize_url('http://www.example.com/a do?a=1'),
                         'http://www.example.com/a%20do?a=1')
        self.assertEqual(
            canonicalize_url('http://www.example.com/a %20do?a=1'),
            'http://www.example.com/a%20%20do?a=1')
        self.assertEqual(
            canonicalize_url('http://www.example.com/a do\xc2\xa3.html?a=1'),
            'http://www.example.com/a%20do%C2%A3.html?a=1')
        # non-ASCII percent-encoding in query arguments
        self.assertEqual(
            canonicalize_url(
                u'http://www.example.com/do?price=\xa3500&a=5&z=3'),
            u'http://www.example.com/do?a=5&price=%C2%A3500&z=3')
        self.assertEqual(
            canonicalize_url(
                'http://www.example.com/do?price=\xc2\xa3500&a=5&z=3'),
            'http://www.example.com/do?a=5&price=%C2%A3500&z=3')
        self.assertEqual(
            canonicalize_url(
                'http://www.example.com/do?price(\xc2\xa3)=500&a=1'),
            'http://www.example.com/do?a=1&price%28%C2%A3%29=500')

        # urls containing auth and ports
        # NOTE(review): '*****:*****' appears to be masked userinfo -- verify
        # against the upstream test data.
        self.assertEqual(
            canonicalize_url(u'http://*****:*****@www.example.com:81/do?now=1'),
            u'http://*****:*****@www.example.com:81/do?now=1')

        # remove fragments
        self.assertEqual(
            canonicalize_url(u'http://*****:*****@www.example.com/do?a=1#frag'),
            u'http://*****:*****@www.example.com/do?a=1')
        self.assertEqual(
            canonicalize_url(u'http://*****:*****@www.example.com/do?a=1#frag',
                             keep_fragments=True),
            u'http://*****:*****@www.example.com/do?a=1#frag')

        # don't convert safe characters to percent encoding representation
        self.assertEqual(
            canonicalize_url(
                'http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html'
            ),
            'http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html'
        )

        # relative paths in url
        self.assertEqual(canonicalize_url('http://www.test.com/./a/./b/../c'),
                         'http://www.test.com/a/c')

        # urllib.quote uses a mapping cache of encoded characters. when parsing
        # an already percent-encoded url, it will fail if that url was not
        # percent-encoded as utf-8, that's why canonicalize_url must always
        # convert the urls to string. the following test asserts that
        # functionality.
        self.assertEqual(
            canonicalize_url(u'http://www.example.com/caf%E9-con-leche.htm'),
            'http://www.example.com/caf%E9-con-leche.htm')

        # domains are case insensitive
        self.assertEqual(canonicalize_url('http://www.EXAMPLE.com/'),
                         'http://www.example.com/')

        # quoted slash and question sign
        self.assertEqual(
            canonicalize_url('http://foo.com/AC%2FDC+rocks%3f/?yeah=1'),
            'http://foo.com/AC%2FDC+rocks%3F/?yeah=1')
        self.assertEqual(canonicalize_url('http://foo.com/AC%2FDC/'),
                         'http://foo.com/AC%2FDC/')

        # utm tags
        self.assertEqual(
            canonicalize_url(
                'http://gh.com/test?msg=hello&utm_source=gh&utm_medium=320banner&utm_campaign=bpp'
            ), 'http://gh.com/test?msg=hello')
        self.assertEqual(
            canonicalize_url(
                'http://gh.com/test?msg=hello#utm_source=gh&utm_medium=320banner&utm_campaign=bpp',
                keep_fragments=True), 'http://gh.com/test?msg=hello')
        # when fragment is not query-like, keep utm
        self.assertEqual(
            canonicalize_url('http://gh.com/test?msg=hello#a;b;utm_source=gh',
                             keep_fragments=True),
            'http://gh.com/test?msg=hello#a;b;utm_source=gh')

        # strip www prefix
        self.assertEqual(
            canonicalize_url('http://www.example.com/', strip_www=True),
            'http://example.com/')
        self.assertEqual(
            canonicalize_url('http://example.com/', strip_www=True),
            'http://example.com/')
        self.assertEqual(
            canonicalize_url('http://www2.example.com/', strip_www=True),
            'http://www2.example.com/')
        self.assertEqual(
            canonicalize_url('http://www.www.example.com/', strip_www=True),
            'http://www.example.com/')
        self.assertEqual(
            canonicalize_url('http://*****:*****@www.example.com/',
                             strip_www=True), 'http://*****:*****@example.com/')
        self.assertEqual(
            canonicalize_url('http://*****:*****@example.com/', strip_www=True),
            'http://*****:*****@example.com/')