def setUp(self):
    """Build the list of field specifications shared by the tests.

    Every field uses the same hashing properties: ASCII encoding, a
    bigram comparator and a 20 bits-per-token strategy.
    """
    bigram_comparator = get_comparator({'type': 'ngram', 'n': 2})
    shared_hashing = FieldHashingProperties(
        encoding='ascii',
        comparator=bigram_comparator,
        strategy=BitsPerTokenStrategy(20))

    given_name = StringSpec(
        identifier='given name',
        case='lower',
        min_length=1,
        max_length=None,
        hashing_properties=shared_hashing)
    surname = StringSpec(
        identifier='surname',
        case='upper',
        min_length=1,
        max_length=None,
        hashing_properties=shared_hashing)
    email = StringSpec(
        identifier='email address',
        regex=r'.+@.+\..+',
        hashing_properties=shared_hashing)
    age = IntegerSpec(
        identifier='age',
        minimum=18,
        maximum=99,
        hashing_properties=shared_hashing)
    join_date = DateSpec(
        identifier='join date',
        format='%Y-%m-%d',
        hashing_properties=shared_hashing)
    account_type = EnumSpec(
        identifier='account type',
        values=['free', 'paid'],
        hashing_properties=shared_hashing)

    # Order matters: it is the column order of the records under test.
    self.fields = [
        given_name,
        surname,
        email,
        age,
        join_date,
        account_type,
    ]
def fhp_from_json_dict(
        json_dict  # type: Dict[str, Any]
):
    # type: (...) -> FieldHashingProperties
    """
    Make a :class:`FieldHashingProperties` object from a dictionary.

    The input dictionary is left untouched: defaults are filled into a
    copy of the comparison configuration, not into ``json_dict`` itself.

    :param dict json_dict:
        Conforming to the `hashingConfig` definition
        in the `v2` linkage schema.
    :return: A :class:`FieldHashingProperties` instance.
    :raises KeyError: If ``json_dict`` has no ``'comparison'`` or
        ``'strategy'`` entry, or if a supplied ``'hash'`` entry has no
        ``'type'``.
    """
    # Fall back to blake hashing when no hash configuration is given.
    hash_config = json_dict.get('hash', {'type': 'blakeHash'})

    # Shallow copy so that inserting the default below does not leak
    # into the caller's (possibly shared or re-serialised) dictionary.
    comparison_config = dict(json_dict['comparison'])
    if comparison_config.get('type', '') == 'ngram':
        # setting default
        comparison_config.setdefault(
            'positional', FieldHashingProperties._DEFAULT_POSITIONAL)

    comparator = comparators.get_comparator(comparison_config)

    return FieldHashingProperties(
        comparator=comparator,
        hash_type=hash_config['type'],
        prevent_singularity=hash_config.get('prevent_singularity'),
        strategy=StrategySpec.from_json_dict(json_dict['strategy']),
        # 'missingValue' is optional; None signals that it is absent.
        missing_value=MissingValueSpec.from_json_dict(
            json_dict['missingValue'])
        if 'missingValue' in json_dict else None)
def test_compare_to_legacy(self):
    # Compares the bit counts of bloom filters built from the same row
    # with HKDF-derived keys and with legacy-derived keys.
    # Identifier: 'ANY freetext'
    bigram_comparator = comparators.get_comparator({'type': 'ngram', 'n': 2})
    fhp = FieldHashingProperties(
        comparator=bigram_comparator,
        hash_type='doubleHash',
        strategy=BitsPerTokenStrategy(bits_per_token=10))

    info = base64.b64decode('c2NoZW1hX2V4YW1wbGU=')
    salt = base64.b64decode(
        'SCbL2zHNnmsckfzchsNkZY9XoHk96P'
        '/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA==')
    text_fields = [
        StringSpec(identifier='ANY text {}'.format(number),
                   hashing_properties=fhp)
        for number in range(1, 5)
    ]
    linkage_schema = Schema(
        l=1024,
        kdf_info=info,
        kdf_key_size=64,
        kdf_salt=salt,
        fields=text_fields)

    row = ['Bobby', 'Bobby', 'Bobby', 'Bobby']
    secret = "No, I am your father. No... that's not true! That's impossible!".encode(
    )
    n_fields = len(row)
    keys_hkdf = generate_key_lists(secret, n_fields, kdf='HKDF')
    keys_legacy = generate_key_lists(secret, n_fields, kdf='legacy')

    hkdf_result = next(stream_bloom_filters([row], keys_hkdf, linkage_schema))
    legacy_result = next(
        stream_bloom_filters([row], keys_legacy, linkage_schema))
    hkdf_count = hkdf_result[0].count()
    legacy_count = legacy_result[0].count()

    # legacy will map the 4 Bobbys to the same bits, whereas hkdf will
    # map each Bobby to different bits.
    bits_for_one_token = fhp.strategy.bits_per_token(1)[0]
    self.assertLessEqual(legacy_count, bits_for_one_token * 6)  # 6 bi-grams
    self.assertLess(legacy_count, hkdf_count)
    self.assertLessEqual(hkdf_count, len(row) * legacy_count)
import base64
import os
import unittest

from clkhash import bloomfilter, clk, randomnames, schema
from clkhash.field_formats import FieldHashingProperties, StringSpec, BitsPerTokenStrategy, BitsPerFeatureStrategy
from clkhash.key_derivation import generate_key_lists
from clkhash.schema import Schema
from clkhash.serialization import deserialize_bitarray
from clkhash.stats import OnlineMeanVariance
from clkhash.comparators import get_comparator

# Directory holding the fixture files, located next to this module.
TEST_DATA_DIRECTORY = os.path.join(os.path.dirname(__file__), 'testdata')

# Comparator configured for 2-grams.
bigram_tokenizer = get_comparator({'type': 'ngram', 'n': 2})


def _test_data_file_path(file_name):
    """Return the path of ``file_name`` inside the test data directory."""
    full_path = os.path.join(TEST_DATA_DIRECTORY, file_name)
    return full_path


def _test_schema(file_name):
    """Load the schema stored in the test data file ``file_name``."""
    schema_path = _test_data_file_path(file_name)
    with open(schema_path) as schema_file:
        loaded = schema.from_json_file(schema_file)
    return loaded


def _test_stats(pii, schema, keys):
    """Feed the bit counts of the CLKs generated for ``pii`` into an
    :class:`OnlineMeanVariance` accumulator, printing the counts first.
    """
    counts = []
    for serialized in clk.generate_clks(pii, schema, keys):
        counts.append(deserialize_bitarray(serialized).count())
    print('_test_stats: counts = ', counts)
    stats = OnlineMeanVariance()
    stats.update(counts)
    # NOTE(review): nothing is returned in the visible code — confirm
    # whether the accumulator is meant to be handed back to the caller.
def test_invalid_n():
    """A negative ``n`` for an ngram comparator must raise ValueError."""
    negative_ngram_config = {'type': 'ngram', 'n': -6}
    with pytest.raises(ValueError):
        comparators.get_comparator(negative_ngram_config)
def test_invalid_comparison():
    """An unrecognised comparison type must raise ValueError."""
    unknown_config = {"type": "apples_and_oranges"}
    with pytest.raises(ValueError):
        comparators.get_comparator(unknown_config)