def test_double_hash_singularity(self):
    """Compare plain and non-singular double hashing on selected n-grams.

    For the n-grams "635" and "1402" the plain double-hash encoding is
    expected to set exactly one bit, while the non-singular variant must
    set more than one, so the two filters differ. For "666" and "1401"
    both encoders are expected to set more than one bit and to produce
    equal filters.
    """
    # NOTE(review): presumably the "singular" n-grams make every derived
    # index collapse onto one position for these keys -- confirm against
    # the encoder implementation.
    keys = (b'secret1', b'secret2')
    cases = (
        (("635", "1402"), True),
        (("666", "1401"), False),
    )

    for tokens, is_singular in cases:
        for token in tokens:
            plain = double_hash_encode_ngrams(
                [token], keys, 20, 1024, 'ascii')
            if is_singular:
                self.assertEqual(plain.count(), 1)
            else:
                self.assertGreater(plain.count(), 1)

            patched = double_hash_encode_ngrams_non_singular(
                [token], keys, 20, 1024, 'ascii')
            # The non-singular encoder must never degenerate to one bit.
            self.assertGreater(patched.count(), 1)

            if is_singular:
                self.assertNotEqual(plain, patched)
            else:
                self.assertEqual(plain, patched)
def test_order_of_ngrams(self):
    """Run the shared n-gram ordering check against each encoder.

    Each encoder is wrapped so that it takes only the n-gram list; the
    keys, ``self.ks``, the length 1024 and the 'ascii' encoding are fixed.
    The helper ``_test_order_of_ngrams`` receives a fresh copy of
    ``self.ngrams`` for every encoder.
    """
    # NOTE(review): presumably the helper verifies the encoding does not
    # depend on n-gram order -- its body is not visible here.

    def encode_blake(ngrams):
        return blake_encode_ngrams(
            ngrams, (self.key_sha1, ), self.ks, 1024, 'ascii')

    def encode_double_hash(ngrams):
        return double_hash_encode_ngrams(
            ngrams, (self.key_sha1, self.key_md5), self.ks, 1024, 'ascii')

    def encode_double_hash_non_singular(ngrams):
        return double_hash_encode_ngrams_non_singular(
            ngrams, (self.key_sha1, self.key_md5), self.ks, 1024, 'ascii')

    encoders = (
        encode_blake,
        encode_double_hash,
        encode_double_hash_non_singular,
    )
    for encoder in encoders:
        self._test_order_of_ngrams(encoder, copy(self.ngrams))
def test_bug210(self):
    """Regression test for https://github.com/data61/clkhash/issues/210.

    Two token lists sharing 65 common tokens (66 and 67 tokens in total)
    are encoded with ``double_hash_encode_ngrams``. The Dice coefficient
    of the resulting bit sets must exceed 90% of the Dice coefficient of
    the token lists themselves.
    """
    common_tokens = [str(i) for i in range(65)]
    e1 = common_tokens + ['e1']  # 66 tokens
    e2 = common_tokens + ['e2a', 'e2b']  # 67 tokens
    tok_sim = 2.0 * len(common_tokens) / (len(e1) + len(e2))

    fhp = FieldHashingProperties(ngram=2, num_bits=100,
                                 hash_type='doubleHash')

    def encode(tokens):
        # The per-token k values are derived from the token count.
        return double_hash_encode_ngrams(
            tokens, (self.key_sha1, self.key_md5),
            fhp.ks(len(tokens)), 1024, fhp.encoding)

    b1 = encode(e1)
    b2 = encode(e2)
    intersect = b1 & b2
    sim = 2.0 * intersect.count() / (b1.count() + b2.count())

    # Surface the diagnostic values only when the assertion fails,
    # instead of keeping a commented-out debug print around.
    details = ('test_bug210: bit counts: b1 = {}, b2 = {}, intersect = {}'
               ', tok_sim = {}, sim = {}'
               .format(b1.count(),
                       b2.count(),
                       intersect.count(),
                       tok_sim, sim))
    self.assertGreater(sim, 0.9 * tok_sim, details)
def test_double_hash_encoding(self):
    """Check the set-bit count of a double-hash encoding of self.ngrams.

    Encodes ``self.ngrams`` with both keys into a filter of length 1024
    and passes the number of set bits, ``self.k`` and the n-gram count to
    the shared ``_test_bit_range`` helper.
    """
    keys = (self.key_sha1, self.key_md5)
    encoded = double_hash_encode_ngrams(
        self.ngrams, keys, self.ks, 1024, 'ascii')
    set_bits = encoded.count()
    self._test_bit_range(set_bits, self.k, len(self.ngrams))