Ejemplo n.º 1
0
 def test_double_hash_singularity(self):
     """Compare the plain and the non-singular double-hash encoders.

     Each n-gram is encoded on its own with both encoders. For the
     n-grams flagged as singular the plain encoding must report a
     ``count()`` of exactly 1 and differ from the non-singular encoding;
     for the remaining n-grams both encodings must report a ``count()``
     above 1 and compare equal.
     """
     hash_keys = (b'secret1', b'secret2')
     # (n-gram, whether the plain encoder is expected to collapse it)
     cases = [
         ("635", True),
         ("1402", True),
         ("666", False),
         ("1401", False),
     ]
     for token, collapses in cases:
         plain = double_hash_encode_ngrams(
             [token], hash_keys, 20, 1024, 'ascii')
         if collapses:
             self.assertEqual(plain.count(), 1)
         else:
             self.assertGreater(plain.count(), 1)

         repaired = double_hash_encode_ngrams_non_singular(
             [token], hash_keys, 20, 1024, 'ascii')
         # The non-singular variant must never collapse to a single count.
         self.assertGreater(repaired.count(), 1)

         if collapses:
             self.assertNotEqual(plain, repaired)
         else:
             self.assertEqual(plain, repaired)
Ejemplo n.º 2
0
 def test_order_of_ngrams(self):
     """Run the shared n-gram ordering check against each encoder.

     The BLAKE encoder, the double-hash encoder and the non-singular
     double-hash encoder are each handed to ``_test_order_of_ngrams``
     together with a fresh copy of ``self.ngrams``.
     """
     def encode_blake(ngrams):
         return blake_encode_ngrams(
             ngrams, (self.key_sha1, ), self.ks, 1024, 'ascii')

     def encode_double_hash(ngrams):
         return double_hash_encode_ngrams(
             ngrams, (self.key_sha1, self.key_md5), self.ks, 1024, 'ascii')

     def encode_non_singular(ngrams):
         return double_hash_encode_ngrams_non_singular(
             ngrams, (self.key_sha1, self.key_md5), self.ks, 1024, 'ascii')

     encoders = (encode_blake, encode_double_hash, encode_non_singular)
     for encoder in encoders:
         # Each run gets its own copy of the fixture n-grams.
         self._test_order_of_ngrams(encoder, copy(self.ngrams))
Ejemplo n.º 3
0
    def test_bug210(self):
        """Regression test for clkhash issue 210.

        Two token lists sharing 65 of their 66 and 67 tokens are encoded
        with ``double_hash_encode_ngrams``. The similarity computed from
        the encodings (twice the ``count()`` of their intersection over
        the sum of their ``count()`` values) must exceed 90% of the same
        ratio computed directly on the token lists.
        """
        # https://github.com/data61/clkhash/issues/210
        common_tokens = [str(i) for i in range(65)]
        e1 = common_tokens + ['e1']  # 66 tokens
        e2 = common_tokens + ['e2a', 'e2b']  # 67 tokens
        tok_sim = 2.0 * len(common_tokens) / (len(e1) + len(e2))

        fhp = FieldHashingProperties(ngram=2,
                                     num_bits=100,
                                     hash_type='doubleHash')

        def encode(tokens):
            # ks is derived per token list from its length.
            return double_hash_encode_ngrams(
                tokens, (self.key_sha1, self.key_md5),
                fhp.ks(len(tokens)), 1024, fhp.encoding)

        b1 = encode(e1)
        b2 = encode(e2)
        intersect = b1 & b2
        sim = 2.0 * intersect.count() / (b1.count() + b2.count())
        # Report the intermediate values when the check fails, so the
        # failure can be diagnosed without adding a debug print.
        self.assertGreater(
            sim, 0.9 * tok_sim,
            'test_bug210: bit counts: b1 = {}, b2 = {}, intersect = {}'
            ', tok_sim = {}, sim = {}'
            .format(b1.count(),
                    b2.count(),
                    intersect.count(),
                    tok_sim, sim))
Ejemplo n.º 4
0
 def test_double_hash_encoding(self):
     """Encode the fixture n-grams and range-check the resulting count.

     The ``count()`` of the double-hash encoding of ``self.ngrams`` is
     passed to ``_test_bit_range`` along with ``self.k`` and the number
     of n-grams.
     """
     hash_keys = (self.key_sha1, self.key_md5)
     encoded = double_hash_encode_ngrams(
         self.ngrams, hash_keys, self.ks, 1024, 'ascii')
     observed = encoded.count()
     self._test_bit_range(observed, self.k, len(self.ngrams))