def _set_canonical(self, obj):
    """
    Decide whether a redirected object keeps its target URL as canonical.

    When ``obj.meta`` carries ``b'redirect_urls'``, the first URL of that list
    is treated as the requested URL and the method returns without touching
    ``obj`` if either:

    * the requested URL is a home page (empty path, ``/`` or one of the usual
      index documents, with or without a leading slash), or
    * the requested URL and ``obj.url`` share the same hostname (anything
      after the first ``:`` of the netloc is ignored).

    In every other case the decision is delegated to the parent class.

    :param obj: object exposing ``meta`` (mapping) and ``url``
    :return: None
    """
    if b'redirect_urls' in obj.meta:
        # if home page is requested then leave the target page as canonical
        urls = obj.meta[b'redirect_urls']
        source = parse_url(urls[0])
        path = source.path
        # Parsed paths normally keep their leading slash ('/index.html'), so
        # strip it before comparing against the bare index document names;
        # otherwise only '' and '/' could ever be recognised as a home page.
        if not path or path.lstrip('/') in ('', 'index.html', 'index.htm', 'default.htm'):
            return

        # check if redirect is within the same hostname
        target = parse_url(obj.url)
        src_hostname, _, _ = source.netloc.partition(':')
        trg_hostname, _, _ = target.netloc.partition(':')
        if src_hostname == trg_hostname:
            return

    # otherwise default behavior
    super(CorporateWebsiteFriendly, self)._set_canonical(obj)
def hostname_local_fingerprint(key):
    """
    Build a storage fingerprint for a URL that groups documents by host.

    The fingerprint is the hex encoding of 20 packed bytes: a 4-byte Crc32 of
    the hostname followed by the 16-byte MD5 digest of the hostname joined with
    the remaining URL parts (path, ``;``, params, query, fragment). When the
    URL has no hostname, ``'-'`` is used in its place.

    The shared host prefix is the default choice in order to make use of the
    HBase block cache: all documents of an average website are expected to fit
    within one cache block, which can be read from disk efficiently at once.

    :param key: str URL
    :return: hex representation of the 20-byte fingerprint
    """
    parsed = parse_url(key)
    host = parsed.hostname or '-'
    crc = get_crc32(host)

    # Host first, then the rest of the URL; ';' separates path from params.
    url_parts = (host, parsed.path, ';', parsed.params, parsed.query, parsed.fragment)
    encoded_url = to_bytes(''.join(url_parts), 'utf8', 'ignore')
    md5_digest = hashlib.md5(encoded_url).digest()

    # Big-endian signed 32-bit checksum followed by the 16 digest bytes.
    return hexlify(pack(">i16s", crc, md5_digest))
def hostname_local_fingerprint(key):
    """
    Build a storage fingerprint for a URL that groups documents by host.

    The fingerprint is the hex encoding of 20 packed bytes: a 4-byte Crc32 of
    the hostname followed by the 16-byte MD5 digest of the rest of the URL
    (path, ``;``, params, query, fragment). URLs for which no hostname is
    parsed are fingerprinted with ``sha1(key)`` instead.

    The shared host prefix is the default choice in order to make use of the
    HBase block cache: all documents of an average website are expected to fit
    within one cache block, which can be read from disk efficiently at once.

    :param key: str URL
    :return: str hex representation of the 20-byte fingerprint
    """
    parsed = parse_url(key)
    host = parsed.hostname
    if not host:
        # Nothing to derive a host prefix from; hash the whole key.
        return sha1(key)

    crc = get_crc32(host)

    # Everything after the host; ';' separates path from params.
    uri_parts = (parsed.path, ';', parsed.params, parsed.query, parsed.fragment)
    encoded_uri = to_bytes(''.join(uri_parts), 'utf8', 'ignore')
    md5_digest = hashlib.md5(encoded_uri).digest()

    # Big-endian signed 32-bit checksum followed by the 16 digest bytes.
    hex_fprint = hexlify(pack(">i16s", crc, md5_digest))
    return to_native_str(hex_fprint, 'utf8')
def test_already_parsed(self):
    """Feeding ``parse_url`` its own output must give back an equal result."""
    parsed_once = parse_url(simple_url)
    parsed_twice = parse_url(parsed_once)
    self.assertEqual(parsed_twice, parsed_once)
def test_complete_url(self):
    """``complete_url`` must parse into the expected six components."""
    # NOTE(review): the netloc literal contains '[email protected]', which looks
    # like an e-mail obfuscation artifact -- confirm it matches ``complete_url``.
    expected = (
        'http',
        'username:[email protected]:80',
        '/some/page/do',
        '',
        'a=1&b=2&c=3',
        'frag',
    )
    self.assertEqual(parse_url(complete_url), expected)
def test_simple_url(self):
    """``simple_url`` must parse into a scheme and netloc with all other parts empty."""
    expected = ('http', 'www.example.com', '', '', '', '')
    actual = parse_url(simple_url)
    self.assertEqual(actual, expected)