Example #1
 def test_invalid_n(self):
     fhp = FieldHashingProperties(ngram=2, k=20, positional=True)
     fhp.ngram = -6
     with self.assertRaises(
             ValueError,
             msg='Expected ValueError on invalid n.'):
         tok = get_tokenizer(fhp)
         tok('prawn')
Example #2
    def test_compare_strategies(self):
        def mkSchema(hashing_properties):
            return Schema(
                l=1024,
                xor_folds=1,
                kdf_type='HKDF',
                kdf_hash='SHA256',
                kdf_salt=base64.b64decode(
                    'SCbL2zHNnmsckfzchsNkZY9XoHk96P'
                    '/G5nUBrM7ybymlEFsMV6PAeDZCNp3r'
                    'fNUPCtLDMOGQHG4pCQpfhiHCyA=='),
                kdf_info=base64.b64decode('c2NoZW1hX2V4YW1wbGU='),
                kdf_key_size=64,
                fields=[
                    StringSpec(
                        identifier='name',
                        hashing_properties=hashing_properties,
                        description=None,
                        case=StringSpec._DEFAULT_CASE,
                        min_length=1,
                        max_length=50
                    )
                ]
            )

        pii = [('An',), ('Fred',), ('Philhowe',), ('MuhlbachBereznyz',)]
        secret = 'secret'

        schema_k = mkSchema(FieldHashingProperties(
            encoding=FieldHashingProperties._DEFAULT_ENCODING,
            comparator=bigram_tokenizer,
            strategy=BitsPerTokenStrategy(20),
            hash_type='doubleHash'
        ))

        mean_k, std_k = _test_stats(pii, schema_k, secret)
        print('test_compare_k_and_num_bits k: ', mean_k, std_k)

        schema_num_bits = mkSchema(FieldHashingProperties(
            encoding=FieldHashingProperties._DEFAULT_ENCODING,
            comparator=bigram_tokenizer,
            strategy=BitsPerFeatureStrategy(int(round(mean_k))),
            hash_type='doubleHash'
        ))
        mean_num_bits, std_num_bits = _test_stats(pii, schema_num_bits, secret)
        print('test_compare_k_and_num_bits num_bits: ', mean_num_bits,
              std_num_bits)

        self.assertGreater(std_k, 2 * std_num_bits,
                           'Standard deviation for num_bits should be'
                           ' < half that for the equivalent k')
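The _test_stats helper used above is not part of the listing. As a rough, hypothetical sketch only (the key handling and the exact clkhash signatures are guessed from Example #5 and vary between library versions), such a helper would hash each record with the given schema and summarise the Bloom-filter popcounts:

import statistics

from clkhash.bloomfilter import stream_bloom_filters
from clkhash.key_derivation import generate_key_lists


def _test_stats(pii, schema, secret):
    # Hypothetical reimplementation for illustration; the real helper is not
    # shown in the example. One key list per field, HKDF as in Example #5.
    keys = generate_key_lists([secret.encode()], len(pii[0]), kdf='HKDF')
    popcounts = [clk[0].count()
                 for clk in stream_bloom_filters(pii, keys, schema)]
    return statistics.mean(popcounts), statistics.stdev(popcounts)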
Example #3
 def test_different_weights(self):
     schema = Schema(
         version=1,
         hashing_globals=GlobalHashingProperties(
             k=30,
             kdf_hash='SHA256',
             kdf_info=base64.b64decode('c2NoZW1hX2V4YW1wbGU='),
             kdf_key_size=64,
             kdf_salt=base64.b64decode(
                 'SCbL2zHNnmsckfzchsNkZY9XoHk96P/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA=='
             ),
             kdf_type='HKDF',
             l=1024,
             hash_type='blakeHash',
             xor_folds=0,
         ),
         fields=[
             StringSpec(
                 identifier='some info',
                 hashing_properties=FieldHashingProperties(
                     encoding=FieldHashingProperties._DEFAULT_ENCODING,
                     ngram=2,
                     positional=False,
                     weight=1),
                 description=None,
                 case=StringSpec._DEFAULT_CASE,
                 min_length=0,
                 max_length=None)
         ])
Example #4
 def setUp(self):
     ascii_hashing = FieldHashingProperties(
         encoding='ascii',
         comparator=get_comparator({
             'type': 'ngram',
             'n': 2
         }),
         strategy=BitsPerTokenStrategy(20))
     self.fields = [
         StringSpec(identifier='given name',
                    case='lower',
                    min_length=1,
                    max_length=None,
                    hashing_properties=ascii_hashing),
         StringSpec(identifier='surname',
                    case='upper',
                    min_length=1,
                    max_length=None,
                    hashing_properties=ascii_hashing),
         StringSpec(identifier='email address',
                    regex=r'.+@.+\..+',
                    hashing_properties=ascii_hashing),
         IntegerSpec(identifier='age',
                     minimum=18,
                     maximum=99,
                     hashing_properties=ascii_hashing),
         DateSpec(identifier='join date',
                  format='%Y-%m-%d',
                  hashing_properties=ascii_hashing),
         EnumSpec(identifier='account type',
                  values=['free', 'paid'],
                  hashing_properties=ascii_hashing)
     ]
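The specs above follow the same pattern as the schema fields in Example #2. A minimal sketch of combining them into a schema, assuming the Schema constructor arguments shown in Examples #2 and #5 and assuming base64 and Schema are imported as there (the salt and info values are simply reused from those examples):

    def make_schema(self):
        # Hypothetical helper for illustration: wrap the field specs from
        # setUp in a schema, reusing the KDF parameters from Example #2.
        return Schema(
            l=1024,
            kdf_salt=base64.b64decode(
                'SCbL2zHNnmsckfzchsNkZY9XoHk96P'
                '/G5nUBrM7ybymlEFsMV6PAeDZCNp3r'
                'fNUPCtLDMOGQHG4pCQpfhiHCyA=='),
            kdf_info=base64.b64decode('c2NoZW1hX2V4YW1wbGU='),
            kdf_key_size=64,
            fields=self.fields)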
Example #5
    def test_compare_to_legacy(self):
        # Identifier: 'ANY freetext'

        fhp = FieldHashingProperties(ngram=2, hash_type='doubleHash', k=10)

        schema = Schema(
            l=1024,
            kdf_info=base64.b64decode('c2NoZW1hX2V4YW1wbGU='),
            kdf_key_size=64,
            kdf_salt=base64.b64decode(
                'SCbL2zHNnmsckfzchsNkZY9XoHk96P'
                '/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA=='),
            fields=[
                StringSpec(identifier='ANY text {}'.format(i + 1),
                           hashing_properties=fhp) for i in range(4)
            ])

        row = ['Bobby', 'Bobby', 'Bobby', 'Bobby']
        master_secrets = [
            'No, I am your father'.encode(),
            "No... that's not true! That's impossible!".encode()
        ]
        keys_hkdf = generate_key_lists(master_secrets, len(row), kdf='HKDF')
        keys_legacy = generate_key_lists(master_secrets,
                                         len(row),
                                         kdf='legacy')
        bloom_hkdf = next(stream_bloom_filters([row], keys_hkdf, schema))
        bloom_legacy = next(stream_bloom_filters([row], keys_legacy, schema))
        hkdf_count = bloom_hkdf[0].count()
        legacy_count = bloom_legacy[0].count()
        # legacy will map the 4 Bobbys to the same bits, whereas hkdf will
        # map each Bobby to different bits.
        self.assertLessEqual(legacy_count, fhp.k * 6)  # 6 bi-grams
        self.assertLess(legacy_count, hkdf_count)
        self.assertLessEqual(hkdf_count, len(row) * legacy_count)
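For the assertions above: each 'Bobby' produces 6 padded bigrams, so the legacy KDF, which derives the same keys for every field and therefore maps the four identical values onto the same bits, can set at most k * 6 = 10 * 6 = 60 bits; HKDF derives distinct keys per field, so its count can be up to len(row) = 4 times the legacy count.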
Example #6
 def test_different_weights(self):
     schema = Schema(
         l=1024,
         xor_folds=0,
         kdf_hash='SHA256',
         kdf_info=base64.b64decode('c2NoZW1hX2V4YW1wbGU='),
         kdf_key_size=64,
         kdf_salt=base64.b64decode(
             'SCbL2zHNnmsckfzchsNkZY9XoHk96P'
             '/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA=='),
         kdf_type='HKDF',
         fields=[
             StringSpec(
                 identifier='some info',
                 hashing_properties=FieldHashingProperties(
                     encoding=FieldHashingProperties._DEFAULT_ENCODING,
                     comparator=bigram_tokenizer,
                     strategy=BitsPerTokenStrategy(20)
                 ),
                 description=None,
                 case=StringSpec._DEFAULT_CASE,
                 min_length=0,
                 max_length=None
             )
         ]
     )
Example #7
 def test_from_properties_invalid_hash(self):
     fhp = FieldHashingProperties(
         comparator=bigram_tokenizer,
         strategy=BitsPerTokenStrategy(30),
         hash_type='jakubHash'  # <- this is invalid.
     )
     with self.assertRaises(ValueError,
                            msg='Expected ValueError on invalid hash type.'):
         hashing_function_from_properties(fhp)
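For contrast, a minimal sketch of the passing case, assuming that hashing_function_from_properties simply returns a hashing callable for the hash types that do appear elsewhere in these examples ('doubleHash', 'blakeHash'):

 def test_from_properties_valid_hash(self):
     fhp = FieldHashingProperties(
         comparator=bigram_tokenizer,
         strategy=BitsPerTokenStrategy(30),
         hash_type='doubleHash'  # <- valid, used in Examples #2 and #5
     )
     # No ValueError expected for a supported hash type.
     hashing_function_from_properties(fhp)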
Example #8
 def test_from_properties_invalid_hash(self):
     fhp = FieldHashingProperties(
         ngram=2,
         k=30,
         hash_type='jakubHash'  # <- this is invalid.
     )
     with self.assertRaises(ValueError,
                            msg='Expected ValueError on invalid hash type.'):
         hashing_function_from_properties(fhp)
Example #9
    def test_bug210(self):
        # https://github.com/data61/clkhash/issues/210
        common_tokens = [str(i) for i in range(65)]
        e1 = common_tokens + ['e1']  # 66 tokens
        e2 = common_tokens + ['e2a', 'e2b']  # 67 tokens
        tok_sim = 2.0 * len(common_tokens) / (len(e1) + len(e2))

        fhp = FieldHashingProperties(ngram=2,
                                     num_bits=100,
                                     hash_type='doubleHash')
        def f(tokens):
            return double_hash_encode_ngrams(
                tokens, (self.key_sha1, self.key_md5),
                fhp.ks(len(tokens)), 1024, fhp.encoding)
        b1 = f(e1)
        b2 = f(e2)
        intersect = b1 & b2
        sim = 2.0 * intersect.count() / (b1.count() + b2.count())
        # print('test_bug210: bit counts: b1 = {}, b2 = {}, intersect = {}'
        #       ', tok_sim = {}, sim = {}'
        #       .format(b1.count(),
        #               b2.count(),
        #               intersect.count(),
        #               tok_sim, sim))
        self.assertGreater(sim, 0.9 * tok_sim)
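For reference, with 65 shared tokens tok_sim = 2 * 65 / (66 + 67) ≈ 0.977, so the assertion requires the Dice coefficient of the two Bloom filters to exceed roughly 0.9 * 0.977 ≈ 0.88.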
Example #10
 def test_positional_unigram_duplicate(self):
     properties = FieldHashingProperties(ngram=1, positional=True)
     self.assertEqual(list(get_tokenizer(properties)("111")),
                      ['1 1', '2 1', '3 1'])
Example #11
 def test_unigram_duplicate(self):
     properties = FieldHashingProperties(ngram=1, positional=False)
     self.assertEqual(list(get_tokenizer(properties)("1212")),
                      ['1', '2', '1', '2'])
Example #12
 def test_bigram_2(self):
     properties = FieldHashingProperties(ngram=2, positional=False)
     self.assertEqual(list(get_tokenizer(properties)("steve", ignore='e')),
                      [' s', 'st', 'tv', 'v '])
Example #13
 def test_bigram_1(self):
     properties = FieldHashingProperties(ngram=2, positional=False)
     self.assertEqual(list(get_tokenizer(properties)("steve")),
                      [' s', 'st', 'te', 'ev', 've', 'e '])
Example #14
 def test_bigram_duplicate(self):
     properties = FieldHashingProperties(ngram=2, positional=False)
     self.assertEqual(list(get_tokenizer(properties)("abab")),
                      [' a', 'ab', 'ba', 'ab', 'b '])
Example #15
 def test_unigram_1(self):
     properties = FieldHashingProperties(ngram=1, positional=False)
     self.assertEqual(list(get_tokenizer(properties)("1/2/93", ignore='/')),
                      ['1', '2', '9', '3'])
Example #16
import unittest

from clkhash.field_formats import FieldHashingProperties
from clkhash.tokenizer import get_tokenizer

__author__ = 'shardy'

# some tokenizers

p1_20 = get_tokenizer(
    FieldHashingProperties(ngram=1, k=20)
)

p2_20 = get_tokenizer(
    FieldHashingProperties(ngram=2, k=20)
)

p1_20_true = get_tokenizer(
    FieldHashingProperties(ngram=1, k=20, positional=True)
)

dummy = get_tokenizer(None)

class TestTokenizer(unittest.TestCase):

    def test_unigram_1(self):
        self.assertEqual(list(p1_20("1/2/93", ignore='/')),
                         ['1', '2', '9', '3'])

    def test_unigram_2(self):
        self.assertEqual(list(p1_20("1*2*93", ignore='*')),
Example #17
 def test_compare_to_legacy(self):
     # Identifier: 'ANY freetext'
     schema = Schema(
         version=1,
         hashing_globals=GlobalHashingProperties(
             k=10,
             kdf_hash='SHA256',
             kdf_info=base64.b64decode('c2NoZW1hX2V4YW1wbGU='),
             kdf_key_size=64,
             kdf_salt=base64.b64decode(
                 'SCbL2zHNnmsckfzchsNkZY9XoHk96P/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA=='
             ),
             kdf_type='HKDF',
             l=1024,
             hash_type='doubleHash',
             hash_prevent_singularity=False,
             xor_folds=0),
         fields=[
             StringSpec(
                 identifier='ANY text 1',
                 hashing_properties=FieldHashingProperties(
                     encoding=FieldHashingProperties._DEFAULT_ENCODING,
                     ngram=2,
                     positional=False,
                     weight=1),
                 description=None,
                 case=StringSpec._DEFAULT_CASE,
                 min_length=0,
                 max_length=None),
             StringSpec(
                 identifier='ANY text 2',
                 hashing_properties=FieldHashingProperties(
                     encoding=FieldHashingProperties._DEFAULT_ENCODING,
                     ngram=2,
                     positional=False,
                     weight=1),
                 description=None,
                 case=StringSpec._DEFAULT_CASE,
                 min_length=0,
                 max_length=None),
             StringSpec(
                 identifier='ANY text 3',
                 hashing_properties=FieldHashingProperties(
                     encoding=FieldHashingProperties._DEFAULT_ENCODING,
                     ngram=2,
                     positional=False,
                     weight=1),
                 description=None,
                 case=StringSpec._DEFAULT_CASE,
                 min_length=0,
                 max_length=None),
             StringSpec(
                 identifier='ANY text 4',
                 hashing_properties=FieldHashingProperties(
                     encoding=FieldHashingProperties._DEFAULT_ENCODING,
                     ngram=2,
                     positional=False,
                     weight=1),
                 description=None,
                 case=StringSpec._DEFAULT_CASE,
                 min_length=0,
                 max_length=None)
         ])
Example #18
 def test_positional_unigram_2(self):
     properties = FieldHashingProperties(ngram=1, positional=True)
     self.assertEqual(list(get_tokenizer(properties)("1*2*")),
                      ['1 1', '2 *', '3 2', '4 *'])
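As Examples #10 and #18 show, the positional tokenizer emits tokens of the form '<1-based position> <gram>', so repeated characters map to distinct tokens. A minimal sketch with the same API:

 properties = FieldHashingProperties(ngram=1, positional=True)
 tokens = list(get_tokenizer(properties)("ab"))
 # Following Examples #10 and #18, this should give ['1 a', '2 b'].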