예제 #1
0
 def hash_factory(seed):
     """Build a one-argument hash function bound to ``seed``.

     The returned callable hashes ``repr(x)`` with ``CityHash64WithSeed``.
     ``universe_size`` is a free variable defined outside this function; when
     it is not ``None`` the hash value is reduced modulo ``universe_size``.
     """
     if universe_size is None:
         def fun(x):
             return CityHash64WithSeed(repr(x), seed)
     else:
         def fun(x):
             # universe_size is looked up each time fun is called.
             return CityHash64WithSeed(repr(x), seed) % universe_size
     return fun
예제 #2
0
 def _hash_fun_64(item, seed=0):
     """Hash ``item`` with ``CityHash64WithSeed`` using ``seed``.

     ``str`` values are hashed as-is, ``unicode`` values are hashed as their
     UTF-8 encoding, and every other object is hashed through its ``repr()``.
     """
     kind = type(item)
     # Exact type matches only: subclasses of str/unicode take the repr() path.
     if kind is str:
         return CityHash64WithSeed(item, seed)
     if kind is unicode:
         return CityHash64WithSeed(item.encode("utf-8"), seed)
     return CityHash64WithSeed(repr(item), seed)
예제 #3
0
    def minhashSignature(self, html):
        """Return the set of per-seed minimum shingle hashes for ``html``.

        ``html`` is passed through ``self.parseHtml`` and ``self.shingle``.
        For every seed in ``range(self.permutations)`` each shingle is hashed
        with ``CityHash64WithSeed`` and the smallest value, capped at
        ``self.max_hash_value``, is added to the result.

        The result is a ``set``: equal minima from different seeds collapse
        into one entry and the per-seed order is not retained.
        """
        shingles = self.shingle(self.parseHtml(html))

        signature = set()
        for seed in range(self.permutations):
            # Start from the cap so that an empty shingle collection still
            # contributes self.max_hash_value for this seed.
            candidates = [self.max_hash_value]
            candidates.extend(
                CityHash64WithSeed(shingle, seed) for shingle in shingles
            )
            signature.add(min(candidates))

        return signature
예제 #4
0
 def test_unicode_2_64(self):
     """Accepts Unicode input outside of ASCII range"""
     # U+2661 lies outside the ASCII range.
     hashed = CityHash64WithSeed(u'\u2661')
     # assertIsInstance reports the offending value and expected type on
     # failure, unlike assertTrue(isinstance(...)).
     self.assertIsInstance(hashed, long)
예제 #5
0
 def test_unicode_1_64(self):
     """Accepts Unicode input"""
     hashed = CityHash64WithSeed(u"abc")
     # assertIsInstance reports the offending value and expected type on
     # failure, unlike assertTrue(isinstance(...)).
     self.assertIsInstance(hashed, long)
예제 #6
0
 def test_consistent_encoding_64(self):
     """ASCII-range Unicode text hashes the same as its UTF-8 encoded form
     """
     text = u"abracadabra"
     hash_of_unicode = CityHash64WithSeed(text)
     hash_of_encoded = CityHash64WithSeed(text.encode("utf-8"))
     self.assertEqual(hash_of_unicode, hash_of_encoded)
예제 #7
0
 def test_string_unicode_64(self):
     """The empty plain string and the empty Unicode string hash identically
     """
     hash_of_str = CityHash64WithSeed("")
     hash_of_unicode = CityHash64WithSeed(u"")
     self.assertEqual(hash_of_str, hash_of_unicode)