Example #1
0
    def _set_canonical(self, obj):
        """
        Apply the parent's canonical handling to ``obj`` only for cross-host, non-home-page redirects.

        Nothing happens when ``obj.meta`` has no ``b'redirect_urls'`` entry, when the first
        redirect URL addresses a home page, or when the first redirect URL and ``obj.url`` share
        the same netloc text before the first ``':'``. In every remaining case the parent
        class's ``_set_canonical`` is called.

        :param obj: object exposing a ``meta`` mapping and a ``url`` attribute
        :return: None
        """
        if b'redirect_urls' not in obj.meta:
            return

        # if home page is requested then leave the target page as canonical
        urls = obj.meta[b'redirect_urls']
        source = parse_url(urls[0])
        # A non-empty parsed path keeps its leading '/', so the bare file names alone could
        # never match an absolute URL; the slash-prefixed forms are the ones that matter.
        # The bare names are kept so previously matching input still matches.
        home_paths = (
            '/', '/index.html', '/index.htm', '/default.htm',
            'index.html', 'index.htm', 'default.htm',
        )
        if not source.path or source.path in home_paths:
            return

        # check if redirect is within the same hostname
        target = parse_url(obj.url)
        src_hostname, _, _ = source.netloc.partition(':')
        trg_hostname, _, _ = target.netloc.partition(':')
        if src_hostname == trg_hostname:
            return

        # otherwise default behavior
        super(CorporateWebsiteFriendly, self)._set_canonical(obj)
Example #2
0
def hostname_local_fingerprint(key):
    """
    Build a storage fingerprint for a URL that keeps documents of one host close together.

    The fingerprint packs a 4-byte checksum of the hostname (``get_crc32``) in front of the
    16-byte MD5 digest of ``hostname + path + ';' + params + query + fragment``. A URL without
    a hostname uses ``'-'`` in its place. Per the original author, the host prefix is meant to
    make use of the HBase block cache: all documents of an average website are expected to fit
    in one cache block, which can be read from disk once.

    :param key: str URL
    :return: the packed 20-byte fingerprint, hex-encoded by ``hexlify``
    """
    parsed = parse_url(key)
    host = parsed.hostname or '-'
    host_crc = get_crc32(host)

    pieces = [host, parsed.path, ';', parsed.params, parsed.query, parsed.fragment]
    payload = to_bytes(''.join(pieces), 'utf8', 'ignore')
    digest = hashlib.md5(payload).digest()

    # big-endian signed 32-bit checksum followed by the raw 16-byte digest
    packed = pack(">i16s", host_crc, digest)
    return hexlify(packed)
Example #3
0
def hostname_local_fingerprint(key):
    """
    Build a storage fingerprint for a URL that keeps documents of one host close together.

    The fingerprint packs a 4-byte checksum of the hostname (``get_crc32``) in front of the
    16-byte MD5 digest of ``path + ';' + params + query + fragment``. When the URL has no
    hostname, ``sha1(key)`` is returned instead. Per the original author, the host prefix is
    meant to make use of the HBase block cache: all documents of an average website are
    expected to fit in one cache block, which can be read from disk once.

    :param key: str URL
    :return: hex string of the packed 20-byte fingerprint, or ``sha1(key)`` for host-less URLs
    """
    parsed = parse_url(key)
    host = parsed.hostname
    if not host:
        # no host to derive a prefix from: fall back to hashing the whole key
        return sha1(key)

    host_crc = get_crc32(host)
    pieces = [parsed.path, ';', parsed.params, parsed.query, parsed.fragment]
    payload = to_bytes(''.join(pieces), 'utf8', 'ignore')
    digest = hashlib.md5(payload).digest()

    # big-endian signed 32-bit checksum followed by the raw 16-byte digest
    packed = pack(">i16s", host_crc, digest)
    return to_native_str(hexlify(packed), 'utf8')
Example #4
0
 def test_already_parsed(self):
     """Feeding the output of ``parse_url`` back into ``parse_url`` yields an equal result."""
     parsed_once = parse_url(simple_url)
     parsed_twice = parse_url(parsed_once)
     self.assertEqual(parsed_twice, parsed_once)
Example #5
0
 def test_complete_url(self):
     """``parse_url(complete_url)`` must equal the expected six-component tuple."""
     # NOTE(review): the netloc literal looks like it went through e-mail obfuscation;
     # confirm it against the definition of ``complete_url``.
     expected = (
         'http',
         'username:[email protected]:80',
         '/some/page/do',
         '',
         'a=1&b=2&c=3',
         'frag',
     )
     self.assertEqual(parse_url(complete_url), expected)
Example #6
0
 def test_simple_url(self):
     """``parse_url(simple_url)`` must yield only a scheme and a netloc; the rest stays empty."""
     expected = ('http', 'www.example.com', '', '', '', '')
     self.assertEqual(parse_url(simple_url), expected)
Example #7
0
 def test_simple_url(self):
     """``parse_url(simple_url)`` must yield only a scheme and a netloc; the rest stays empty."""
     expected = ('http', 'www.example.com', '', '', '', '')
     self.assertEqual(parse_url(simple_url), expected)
Example #8
0
 def test_already_parsed(self):
     """Feeding the output of ``parse_url`` back into ``parse_url`` yields an equal result."""
     parsed_once = parse_url(simple_url)
     parsed_twice = parse_url(parsed_once)
     self.assertEqual(parsed_twice, parsed_once)
Example #9
0
 def test_complete_url(self):
     """``parse_url(complete_url)`` must equal the expected six-component tuple."""
     # NOTE(review): the netloc literal looks like it went through e-mail obfuscation;
     # confirm it against the definition of ``complete_url``.
     expected = (
         'http',
         'username:[email protected]:80',
         '/some/page/do',
         '',
         'a=1&b=2&c=3',
         'frag',
     )
     self.assertEqual(parse_url(complete_url), expected)