def setUp(self):
    # Build the field specs stored on self.fields. All six specs share one
    # FieldHashingProperties: ASCII encoding, an n-gram comparator with
    # n=2, and BitsPerTokenStrategy(20).
    bigram_comparator = get_comparator({'type': 'ngram', 'n': 2})
    shared_hashing = FieldHashingProperties(
        encoding='ascii',
        comparator=bigram_comparator,
        strategy=BitsPerTokenStrategy(20),
    )

    # Construct the specs one by one, in the order they appear in the list.
    given_name = StringSpec(
        identifier='given name',
        case='lower',
        min_length=1,
        max_length=None,
        hashing_properties=shared_hashing,
    )
    surname = StringSpec(
        identifier='surname',
        case='upper',
        min_length=1,
        max_length=None,
        hashing_properties=shared_hashing,
    )
    email = StringSpec(
        identifier='email address',
        regex=r'.+@.+\..+',
        hashing_properties=shared_hashing,
    )
    age = IntegerSpec(
        identifier='age',
        minimum=18,
        maximum=99,
        hashing_properties=shared_hashing,
    )
    join_date = DateSpec(
        identifier='join date',
        format='%Y-%m-%d',
        hashing_properties=shared_hashing,
    )
    account_type = EnumSpec(
        identifier='account type',
        values=['free', 'paid'],
        hashing_properties=shared_hashing,
    )

    self.fields = [given_name, surname, email, age, join_date, account_type]
def test_compare_to_legacy(self):
    # Identifier: 'ANY freetext'
    #
    # Hash one row of four identical values twice -- once with keys from
    # kdf='HKDF' and once with kdf='legacy' -- and compare how many bits
    # are set in the resulting Bloom filters.
    #
    # NOTE(review): another test_compare_to_legacy is defined later in this
    # source; if both live in the same class the later one replaces this
    # one -- confirm.
    hashing = FieldHashingProperties(ngram=2, hash_type='doubleHash', k=10)

    info = base64.b64decode('c2NoZW1hX2V4YW1wbGU=')
    salt = base64.b64decode(
        'SCbL2zHNnmsckfzchsNkZY9XoHk96P'
        '/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA==')
    text_fields = [
        StringSpec(identifier='ANY text {}'.format(idx + 1),
                   hashing_properties=hashing)
        for idx in range(4)
    ]
    schema = Schema(
        l=1024,
        kdf_info=info,
        kdf_key_size=64,
        kdf_salt=salt,
        fields=text_fields,
    )

    row = ['Bobby', 'Bobby', 'Bobby', 'Bobby']
    master_secrets = [
        'No, I am your father'.encode(),
        "No... that's not true! That's impossible!".encode(),
    ]
    n_fields = len(row)

    hkdf_keys = generate_key_lists(master_secrets, n_fields, kdf='HKDF')
    legacy_keys = generate_key_lists(master_secrets, n_fields, kdf='legacy')

    # Each stream yields one result per row; element 0 of that result is
    # the object whose set bits are counted.
    hkdf_result = next(stream_bloom_filters([row], hkdf_keys, schema))
    legacy_result = next(stream_bloom_filters([row], legacy_keys, schema))

    hkdf_count = hkdf_result[0].count()
    legacy_count = legacy_result[0].count()

    # legacy will map the 4 Bobbys to the same bits, whereas hkdf will
    # map each Bobby to different bits.
    self.assertLessEqual(legacy_count, hashing.k * 6)  # 6 bi-grams
    self.assertLess(legacy_count, hkdf_count)
    self.assertLessEqual(hkdf_count, n_fields * legacy_count)
def test_different_weights(self):
    # Build a version-1 schema with global hashing properties (k=30,
    # 'blakeHash') and a single 'some info' string field of weight 1.
    # Only the schema construction is visible in this span; nothing is
    # asserted here.
    #
    # NOTE(review): another test_different_weights is defined later in this
    # source; if both live in the same class the later one replaces this
    # one -- confirm.
    globals_ = GlobalHashingProperties(
        k=30,
        kdf_hash='SHA256',
        kdf_info=base64.b64decode('c2NoZW1hX2V4YW1wbGU='),
        kdf_key_size=64,
        kdf_salt=base64.b64decode(
            'SCbL2zHNnmsckfzchsNkZY9XoHk96P/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA=='
        ),
        kdf_type='HKDF',
        l=1024,
        hash_type='blakeHash',
        xor_folds=0,
    )

    info_hashing = FieldHashingProperties(
        encoding=FieldHashingProperties._DEFAULT_ENCODING,
        ngram=2,
        positional=False,
        weight=1,
    )
    info_field = StringSpec(
        identifier='some info',
        hashing_properties=info_hashing,
        description=None,
        case=StringSpec._DEFAULT_CASE,
        min_length=0,
        max_length=None,
    )

    schema = Schema(
        version=1,
        hashing_globals=globals_,
        fields=[info_field],
    )
def test_different_weights(self):
    # Build a 1024-bit HKDF/SHA256 schema with a single 'some info' string
    # field hashed via bigram_tokenizer and BitsPerTokenStrategy(20).
    # Only the schema construction is visible in this span; nothing is
    # asserted here.
    info = base64.b64decode('c2NoZW1hX2V4YW1wbGU=')
    salt = base64.b64decode(
        'SCbL2zHNnmsckfzchsNkZY9XoHk96P'
        '/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA==')

    info_hashing = FieldHashingProperties(
        encoding=FieldHashingProperties._DEFAULT_ENCODING,
        comparator=bigram_tokenizer,
        strategy=BitsPerTokenStrategy(20),
    )
    info_field = StringSpec(
        identifier='some info',
        hashing_properties=info_hashing,
        description=None,
        case=StringSpec._DEFAULT_CASE,
        min_length=0,
        max_length=None,
    )

    schema = Schema(
        l=1024,
        xor_folds=0,
        kdf_hash='SHA256',
        kdf_info=info,
        kdf_key_size=64,
        kdf_salt=salt,
        kdf_type='HKDF',
        fields=[info_field],
    )
def mkSchema(hashing_properties):
    """Return a Schema with a single 'name' string field.

    The schema is fixed at l=1024 with one XOR fold and HKDF/SHA256 key
    derivation (64-byte keys); only the field's hashing properties vary.

    :param hashing_properties: Passed through unchanged as the
        ``hashing_properties`` of the 'name' StringSpec.
    :return: The constructed Schema.
    """
    salt = base64.b64decode(
        'SCbL2zHNnmsckfzchsNkZY9XoHk96P'
        '/G5nUBrM7ybymlEFsMV6PAeDZCNp3r'
        'fNUPCtLDMOGQHG4pCQpfhiHCyA==')
    info = base64.b64decode('c2NoZW1hX2V4YW1wbGU=')

    name_field = StringSpec(
        identifier='name',
        hashing_properties=hashing_properties,
        description=None,
        case=StringSpec._DEFAULT_CASE,
        min_length=1,
        max_length=50,
    )

    return Schema(
        l=1024,
        xor_folds=1,
        kdf_type='HKDF',
        kdf_hash='SHA256',
        kdf_salt=salt,
        kdf_info=info,
        kdf_key_size=64,
        fields=[name_field],
    )
def test_compare_to_legacy(self):
    # Identifier: 'ANY freetext'
    #
    # Build a version-1 schema with global hashing properties (k=10,
    # 'doubleHash') and four identically configured string fields.
    # Only the schema construction is visible in this span; nothing is
    # asserted here.
    globals_ = GlobalHashingProperties(
        k=10,
        kdf_hash='SHA256',
        kdf_info=base64.b64decode('c2NoZW1hX2V4YW1wbGU='),
        kdf_key_size=64,
        kdf_salt=base64.b64decode(
            'SCbL2zHNnmsckfzchsNkZY9XoHk96P/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA=='
        ),
        kdf_type='HKDF',
        l=1024,
        hash_type='doubleHash',
        hash_prevent_singularity=False,
        xor_folds=0,
    )

    # The four fields differ only in their identifier; each one still gets
    # its own FieldHashingProperties instance.
    identifiers = ('ANY text 1', 'ANY text 2', 'ANY text 3', 'ANY text 4')
    text_fields = [
        StringSpec(
            identifier=name,
            hashing_properties=FieldHashingProperties(
                encoding=FieldHashingProperties._DEFAULT_ENCODING,
                ngram=2,
                positional=False,
                weight=1,
            ),
            description=None,
            case=StringSpec._DEFAULT_CASE,
            min_length=0,
            max_length=None,
        )
        for name in identifiers
    ]

    schema = Schema(
        version=1,
        hashing_globals=globals_,
        fields=text_fields,
    )