def hash_factory(seed):
    """Build a one-argument hash function bound to ``seed``.

    The returned callable hashes ``repr(x)`` with ``CityHash64WithSeed``.
    ``universe_size`` is not a parameter of this function; it is resolved
    from the surrounding scope. When it is ``None`` the raw hash value is
    returned, otherwise the hash is reduced modulo ``universe_size``.
    """
    def _raw(x):
        # Hash the repr so that arbitrary objects can be fed in.
        return CityHash64WithSeed(repr(x), seed)

    def _bounded(x):
        # universe_size is looked up when the hash is computed, not here.
        return _raw(x) % universe_size

    return _raw if universe_size is None else _bounded
def _hash_fun_64(item, seed=0):
    """Hash ``item`` with ``CityHash64WithSeed`` using ``seed``.

    The value handed to the hash depends on the exact type of ``item``:

    * ``str``      -- passed through unchanged,
    * ``unicode``  -- encoded to UTF-8 first,
    * anything else -- its ``repr()`` is hashed.

    Exact type comparison is used, so subclasses of ``str`` / ``unicode``
    take the ``repr()`` path.
    """
    kind = type(item)
    if kind == str:
        return CityHash64WithSeed(item, seed)
    if kind == unicode:
        return CityHash64WithSeed(item.encode("utf-8"), seed)
    return CityHash64WithSeed(repr(item), seed)
def minhashSignature(self, html):
    """Return the set of per-seed minimum hash values for ``html``.

    ``html`` is run through ``self.parseHtml`` and the result through
    ``self.shingle``. For every seed in ``range(self.permutations)`` each
    shingle is hashed with ``CityHash64WithSeed(shingle, seed)`` and the
    smallest value -- capped from above by ``self.max_hash_value`` -- is
    added to the result.

    The result is a ``set``, so equal minima from different seeds collapse
    into a single entry.
    """
    shingles = self.shingle(self.parseHtml(html))
    signature = set()
    for seed in range(self.permutations):
        # Seed the candidates with the ceiling so an empty shingle
        # collection yields max_hash_value, and ties keep the ceiling.
        candidates = [self.max_hash_value]
        candidates.extend(CityHash64WithSeed(piece, seed) for piece in shingles)
        signature.add(min(candidates))
    return signature
def test_unicode_2_64(self):
    """Hashing a non-ASCII Unicode string yields a ``long``."""
    heart = u'\u2661'
    digest = CityHash64WithSeed(heart)
    self.assertTrue(isinstance(digest, long))
def test_unicode_1_64(self):
    """Hashing a Unicode string yields a ``long``."""
    sample = u"abc"
    digest = CityHash64WithSeed(sample)
    self.assertTrue(isinstance(digest, long))
def test_consistent_encoding_64(self):
    """An ASCII-range Unicode string hashes like its UTF-8 encoded form."""
    text = u"abracadabra"
    from_unicode = CityHash64WithSeed(text)
    from_encoded = CityHash64WithSeed(text.encode("utf-8"))
    self.assertEqual(from_unicode, from_encoded)
def test_string_unicode_64(self):
    """The empty plain string and the empty Unicode string hash the same."""
    plain_digest = CityHash64WithSeed("")
    unicode_digest = CityHash64WithSeed(u"")
    self.assertEqual(plain_digest, unicode_digest)