def request_fingerprint(request, include_headers=None):
    '''Return the request fingerprint.

    The request fingerprint is a hash that uniquely identifies the resource
    the request points to. For example, take the following two urls:

    http://www.example.com/query?id=111&cat=222
    http://www.example.com/query?cat=222&id=111

    Even though those are two different URLs, both point to the same
    resource and are equivalent (i.e. they should return the same response).

    Another example are cookies used to store session ids. Suppose the
    following page is only accessible to authenticated users:

    http://www.example.com/members/offers.html

    Lots of sites use a cookie to store the session id, which adds a random
    component to the HTTP Request and thus should be ignored when
    calculating the fingerprint.

    For this reason, request headers are ignored by default when calculating
    the fingerprint. If you want to include specific headers use the
    include_headers argument, which is a list of Request headers to include.

    :param request: object exposing ``method``, ``url``, ``body`` and
        ``headers`` (the latter supporting ``in`` and ``getlist(name)``).
    :param include_headers: optional iterable of header names to mix into
        the hash. Names are compared case-insensitively and their order is
        irrelevant; ``None`` or an empty iterable means "no headers".
    :return: hexadecimal SHA-1 digest of the method, the canonicalized URL,
        the body and the selected headers.
    '''
    # Build a canonical, hashable cache key for the header selection.
    # Lowercase *before* sorting: sorting the raw names first makes the
    # resulting order depend on the caller's capitalisation (e.g.
    # ['B', 'a'] vs ['a', 'b']), so the same header set could hash
    # differently. Any falsy selection (None, [], ()) collapses to None so
    # an empty list is never used as an (unhashable) dict key.
    if include_headers:
        include_headers = tuple(sorted(h.lower() for h in include_headers))
    else:
        include_headers = None

    # One fingerprint dict per request, keyed by the header selection.
    cache = _fingerprint_cache.setdefault(request, {})
    if include_headers not in cache:
        fp = hashlib.sha1()
        fp.update(request.method)
        fp.update(canonicalize_url(request.url))
        fp.update(request.body or '')
        for hdr in include_headers or ():
            # Headers absent from the request contribute nothing.
            if hdr in request.headers:
                fp.update(hdr)
                for v in request.headers.getlist(hdr):
                    fp.update(v)
        cache[include_headers] = fp.hexdigest()
    return cache[include_headers]
def test_canonicalize_url(self):
    """Exercise canonicalize_url over ordering, encoding, fragment,
    utm-tag and www-stripping cases."""
    # simplest case
    self.assertEqual(canonicalize_url('http://www.example.com/'),
                     'http://www.example.com/')

    # always return a str
    # (assertTrue instead of a bare assert, which `python -O` would strip)
    self.assertTrue(
        isinstance(canonicalize_url(u'http://www.example.com'), str))

    # append missing path
    self.assertEqual(canonicalize_url('http://www.example.com'),
                     'http://www.example.com/')

    # typical usage
    self.assertEqual(canonicalize_url('http://www.example.com/do?a=1&b=2&c=3'),
                     'http://www.example.com/do?a=1&b=2&c=3')
    self.assertEqual(canonicalize_url('http://www.example.com/do?c=1&b=2&a=3'),
                     'http://www.example.com/do?a=3&b=2&c=1')
    self.assertEqual(canonicalize_url('http://www.example.com/do?&a=1'),
                     'http://www.example.com/do?a=1')

    # sorting by argument values
    self.assertEqual(
        canonicalize_url('http://www.example.com/do?c=3&b=5&b=2&a=50'),
        'http://www.example.com/do?a=50&b=2&b=5&c=3')

    # using keep_blank_values
    self.assertEqual(
        canonicalize_url('http://www.example.com/do?b=&a=2',
                         keep_blank_values=False),
        'http://www.example.com/do?a=2')
    self.assertEqual(canonicalize_url('http://www.example.com/do?b=&a=2'),
                     'http://www.example.com/do?a=2&b=')
    self.assertEqual(
        canonicalize_url('http://www.example.com/do?b=&c&a=2',
                         keep_blank_values=False),
        'http://www.example.com/do?a=2')
    self.assertEqual(canonicalize_url('http://www.example.com/do?b=&c&a=2'),
                     'http://www.example.com/do?a=2&b=&c=')

    self.assertEqual(canonicalize_url(u'http://www.example.com/do?1750,4'),
                     'http://www.example.com/do?1750%2C4=')

    # spaces
    self.assertEqual(
        canonicalize_url('http://www.example.com/do?q=a space&a=1'),
        'http://www.example.com/do?a=1&q=a+space')
    self.assertEqual(
        canonicalize_url('http://www.example.com/do?q=a+space&a=1'),
        'http://www.example.com/do?a=1&q=a+space')
    self.assertEqual(
        canonicalize_url('http://www.example.com/do?q=a%20space&a=1'),
        'http://www.example.com/do?a=1&q=a+space')

    # normalize percent-encoding case (in paths)
    self.assertEqual(canonicalize_url('http://www.example.com/a%a3do'),
                     'http://www.example.com/a%A3do')
    # normalize percent-encoding case (in query arguments)
    self.assertEqual(canonicalize_url('http://www.example.com/do?k=b%a3'),
                     'http://www.example.com/do?k=b%A3')

    # non-ASCII percent-encoding in paths
    self.assertEqual(canonicalize_url('http://www.example.com/a do?a=1'),
                     'http://www.example.com/a%20do?a=1')
    self.assertEqual(canonicalize_url('http://www.example.com/a %20do?a=1'),
                     'http://www.example.com/a%20%20do?a=1')
    self.assertEqual(
        canonicalize_url('http://www.example.com/a do\xc2\xa3.html?a=1'),
        'http://www.example.com/a%20do%C2%A3.html?a=1')

    # non-ASCII percent-encoding in query arguments
    self.assertEqual(
        canonicalize_url(u'http://www.example.com/do?price=\xa3500&a=5&z=3'),
        u'http://www.example.com/do?a=5&price=%C2%A3500&z=3')
    self.assertEqual(
        canonicalize_url('http://www.example.com/do?price=\xc2\xa3500&a=5&z=3'),
        'http://www.example.com/do?a=5&price=%C2%A3500&z=3')
    self.assertEqual(
        canonicalize_url('http://www.example.com/do?price(\xc2\xa3)=500&a=1'),
        'http://www.example.com/do?a=1&price%28%C2%A3%29=500')

    # urls containing auth and ports
    self.assertEqual(
        canonicalize_url(u'http://*****:*****@www.example.com:81/do?now=1'),
        u'http://*****:*****@www.example.com:81/do?now=1')

    # remove fragments
    self.assertEqual(
        canonicalize_url(u'http://*****:*****@www.example.com/do?a=1#frag'),
        u'http://*****:*****@www.example.com/do?a=1')
    self.assertEqual(
        canonicalize_url(u'http://*****:*****@www.example.com/do?a=1#frag',
                         keep_fragments=True),
        u'http://*****:*****@www.example.com/do?a=1#frag')

    # don't convert safe characters to percent encoding representation
    self.assertEqual(
        canonicalize_url(
            'http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html'),
        'http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html')

    # relative paths in url
    self.assertEqual(canonicalize_url('http://www.test.com/./a/./b/../c'),
                     'http://www.test.com/a/c')

    # urllib.quote uses a mapping cache of encoded characters. when parsing
    # an already percent-encoded url, it will fail if that url was not
    # percent-encoded as utf-8, that's why canonicalize_url must always
    # convert the urls to string. the following test asserts that
    # functionality.
    self.assertEqual(
        canonicalize_url(u'http://www.example.com/caf%E9-con-leche.htm'),
        'http://www.example.com/caf%E9-con-leche.htm')

    # domains are case insensitive
    self.assertEqual(canonicalize_url('http://www.EXAMPLE.com/'),
                     'http://www.example.com/')

    # quoted slash and question sign
    self.assertEqual(
        canonicalize_url('http://foo.com/AC%2FDC+rocks%3f/?yeah=1'),
        'http://foo.com/AC%2FDC+rocks%3F/?yeah=1')
    self.assertEqual(canonicalize_url('http://foo.com/AC%2FDC/'),
                     'http://foo.com/AC%2FDC/')

    # utm tags
    self.assertEqual(
        canonicalize_url(
            'http://gh.com/test?msg=hello&utm_source=gh&utm_medium=320banner&utm_campaign=bpp'),
        'http://gh.com/test?msg=hello')
    self.assertEqual(
        canonicalize_url(
            'http://gh.com/test?msg=hello#utm_source=gh&utm_medium=320banner&utm_campaign=bpp',
            keep_fragments=True),
        'http://gh.com/test?msg=hello')

    # when fragment is not query-like, keep utm
    self.assertEqual(
        canonicalize_url('http://gh.com/test?msg=hello#a;b;utm_source=gh',
                         keep_fragments=True),
        'http://gh.com/test?msg=hello#a;b;utm_source=gh')

    # strip www prefix
    self.assertEqual(
        canonicalize_url('http://www.example.com/', strip_www=True),
        'http://example.com/')
    self.assertEqual(
        canonicalize_url('http://example.com/', strip_www=True),
        'http://example.com/')
    self.assertEqual(
        canonicalize_url('http://www2.example.com/', strip_www=True),
        'http://www2.example.com/')
    self.assertEqual(
        canonicalize_url('http://www.www.example.com/', strip_www=True),
        'http://www.example.com/')
    self.assertEqual(
        canonicalize_url('http://*****:*****@www.example.com/', strip_www=True),
        'http://*****:*****@example.com/')
    self.assertEqual(
        canonicalize_url('http://*****:*****@example.com/', strip_www=True),
        'http://*****:*****@example.com/')
def test_canonicalize_url(self):
    """Exercise canonicalize_url over an ordered table of cases.

    Each table entry is ``(url, keyword_arguments, expected_result)``; the
    cases are evaluated in order and the first mismatch fails the test.
    """
    # always return a str
    # (assertTrue instead of a bare assert, which `python -O` would strip)
    self.assertTrue(
        isinstance(canonicalize_url(u'http://www.example.com'), str))

    keep_frag = {'keep_fragments': True}
    drop_blank = {'keep_blank_values': False}
    no_www = {'strip_www': True}

    cases = [
        # simplest case
        ('http://www.example.com/', {}, 'http://www.example.com/'),
        # append missing path
        ('http://www.example.com', {}, 'http://www.example.com/'),

        # typical usage
        ('http://www.example.com/do?a=1&b=2&c=3', {},
         'http://www.example.com/do?a=1&b=2&c=3'),
        ('http://www.example.com/do?c=1&b=2&a=3', {},
         'http://www.example.com/do?a=3&b=2&c=1'),
        ('http://www.example.com/do?&a=1', {},
         'http://www.example.com/do?a=1'),

        # sorting by argument values
        ('http://www.example.com/do?c=3&b=5&b=2&a=50', {},
         'http://www.example.com/do?a=50&b=2&b=5&c=3'),

        # using keep_blank_values
        ('http://www.example.com/do?b=&a=2', drop_blank,
         'http://www.example.com/do?a=2'),
        ('http://www.example.com/do?b=&a=2', {},
         'http://www.example.com/do?a=2&b='),
        ('http://www.example.com/do?b=&c&a=2', drop_blank,
         'http://www.example.com/do?a=2'),
        ('http://www.example.com/do?b=&c&a=2', {},
         'http://www.example.com/do?a=2&b=&c='),
        (u'http://www.example.com/do?1750,4', {},
         'http://www.example.com/do?1750%2C4='),

        # spaces
        ('http://www.example.com/do?q=a space&a=1', {},
         'http://www.example.com/do?a=1&q=a+space'),
        ('http://www.example.com/do?q=a+space&a=1', {},
         'http://www.example.com/do?a=1&q=a+space'),
        ('http://www.example.com/do?q=a%20space&a=1', {},
         'http://www.example.com/do?a=1&q=a+space'),

        # normalize percent-encoding case (in paths)
        ('http://www.example.com/a%a3do', {},
         'http://www.example.com/a%A3do'),
        # normalize percent-encoding case (in query arguments)
        ('http://www.example.com/do?k=b%a3', {},
         'http://www.example.com/do?k=b%A3'),

        # non-ASCII percent-encoding in paths
        ('http://www.example.com/a do?a=1', {},
         'http://www.example.com/a%20do?a=1'),
        ('http://www.example.com/a %20do?a=1', {},
         'http://www.example.com/a%20%20do?a=1'),
        ('http://www.example.com/a do\xc2\xa3.html?a=1', {},
         'http://www.example.com/a%20do%C2%A3.html?a=1'),

        # non-ASCII percent-encoding in query arguments
        (u'http://www.example.com/do?price=\xa3500&a=5&z=3', {},
         u'http://www.example.com/do?a=5&price=%C2%A3500&z=3'),
        ('http://www.example.com/do?price=\xc2\xa3500&a=5&z=3', {},
         'http://www.example.com/do?a=5&price=%C2%A3500&z=3'),
        ('http://www.example.com/do?price(\xc2\xa3)=500&a=1', {},
         'http://www.example.com/do?a=1&price%28%C2%A3%29=500'),

        # urls containing auth and ports
        (u'http://*****:*****@www.example.com:81/do?now=1', {},
         u'http://*****:*****@www.example.com:81/do?now=1'),

        # remove fragments
        (u'http://*****:*****@www.example.com/do?a=1#frag', {},
         u'http://*****:*****@www.example.com/do?a=1'),
        (u'http://*****:*****@www.example.com/do?a=1#frag', keep_frag,
         u'http://*****:*****@www.example.com/do?a=1#frag'),

        # don't convert safe characters to percent encoding representation
        ('http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html',
         {},
         'http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html'),

        # relative paths in url
        ('http://www.test.com/./a/./b/../c', {}, 'http://www.test.com/a/c'),

        # urllib.quote uses a mapping cache of encoded characters. when
        # parsing an already percent-encoded url, it will fail if that url
        # was not percent-encoded as utf-8, that's why canonicalize_url must
        # always convert the urls to string. the following case asserts
        # that functionality.
        (u'http://www.example.com/caf%E9-con-leche.htm', {},
         'http://www.example.com/caf%E9-con-leche.htm'),

        # domains are case insensitive
        ('http://www.EXAMPLE.com/', {}, 'http://www.example.com/'),

        # quoted slash and question sign
        ('http://foo.com/AC%2FDC+rocks%3f/?yeah=1', {},
         'http://foo.com/AC%2FDC+rocks%3F/?yeah=1'),
        ('http://foo.com/AC%2FDC/', {}, 'http://foo.com/AC%2FDC/'),

        # utm tags
        ('http://gh.com/test?msg=hello&utm_source=gh&utm_medium=320banner&utm_campaign=bpp',
         {},
         'http://gh.com/test?msg=hello'),
        ('http://gh.com/test?msg=hello#utm_source=gh&utm_medium=320banner&utm_campaign=bpp',
         keep_frag,
         'http://gh.com/test?msg=hello'),

        # when fragment is not query-like, keep utm
        ('http://gh.com/test?msg=hello#a;b;utm_source=gh', keep_frag,
         'http://gh.com/test?msg=hello#a;b;utm_source=gh'),

        # strip www prefix
        ('http://www.example.com/', no_www, 'http://example.com/'),
        ('http://example.com/', no_www, 'http://example.com/'),
        ('http://www2.example.com/', no_www, 'http://www2.example.com/'),
        ('http://www.www.example.com/', no_www, 'http://www.example.com/'),
        ('http://*****:*****@www.example.com/', no_www,
         'http://*****:*****@example.com/'),
        ('http://*****:*****@example.com/', no_www,
         'http://*****:*****@example.com/'),
    ]

    for url, options, expected in cases:
        self.assertEqual(canonicalize_url(url, **options), expected)