def test_different_weights(self):
    schema = Schema(
        version=1,
        hashing_globals=GlobalHashingProperties(
            k=30,
            kdf_hash='SHA256',
            kdf_info=base64.b64decode('c2NoZW1hX2V4YW1wbGU='),
            kdf_key_size=64,
            kdf_salt=base64.b64decode(
                'SCbL2zHNnmsckfzchsNkZY9XoHk96P'
                '/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA=='),
            kdf_type='HKDF',
            l=1024,
            hash_type='blakeHash',
            xor_folds=0,
        ),
        fields=[
            StringSpec(
                identifier='some info',
                hashing_properties=FieldHashingProperties(
                    encoding=FieldHashingProperties._DEFAULT_ENCODING,
                    ngram=2,
                    positional=False,
                    weight=1),
                description=None,
                case=StringSpec._DEFAULT_CASE,
                min_length=0,
                max_length=None)
        ])
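
# A minimal sketch of what this test exercises (hedged illustration, not
# clkhash's exact internals): in this schema version a field's `weight`
# scales the number of bits (k) inserted per n-gram, so doubling the
# weight should roughly double the bits contributed by that field.
def effective_k(k, weight):
    # assumed behaviour: bits per n-gram scale linearly with weight
    return int(round(k * weight))

assert effective_k(30, 1) == 30
assert effective_k(30, 2) == 60
assert effective_k(30, 0.5) == 15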
def test_compare_to_legacy(self):
    # Identifier: 'ANY freetext'
    fhp = FieldHashingProperties(ngram=2, hash_type='doubleHash', k=10)
    schema = Schema(
        l=1024,
        kdf_info=base64.b64decode('c2NoZW1hX2V4YW1wbGU='),
        kdf_key_size=64,
        kdf_salt=base64.b64decode(
            'SCbL2zHNnmsckfzchsNkZY9XoHk96P'
            '/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA=='),
        fields=[
            StringSpec(identifier='ANY text {}'.format(i + 1),
                       hashing_properties=fhp)
            for i in range(4)
        ])

    row = ['Bobby', 'Bobby', 'Bobby', 'Bobby']
    master_secrets = [
        'No, I am your father'.encode(),
        "No... that's not true! That's impossible!".encode()
    ]
    keys_hkdf = generate_key_lists(master_secrets, len(row), kdf='HKDF')
    keys_legacy = generate_key_lists(master_secrets, len(row), kdf='legacy')
    bloom_hkdf = next(stream_bloom_filters([row], keys_hkdf, schema))
    bloom_legacy = next(stream_bloom_filters([row], keys_legacy, schema))
    hkdf_count = bloom_hkdf[0].count()
    legacy_count = bloom_legacy[0].count()
    # The legacy KDF maps the four identical 'Bobby' fields to the same
    # bits, whereas HKDF maps each 'Bobby' to different bits.
    self.assertLessEqual(legacy_count, fhp.k * 6)  # 6 bi-grams
    self.assertLess(legacy_count, hkdf_count)
    self.assertLessEqual(hkdf_count, len(row) * legacy_count)
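
# Why the `fhp.k * 6` bound: with non-positional bigrams over a padded
# token, 'Bobby' yields six bigrams, and each bigram sets at most k bits.
# A small sketch (assumes single-space padding, as used for ngram > 1):
word = 'Bobby'
padded = ' {} '.format(word)
bigrams = [padded[i:i + 2] for i in range(len(padded) - 1)]
assert bigrams == [' B', 'Bo', 'ob', 'bb', 'by', 'y ']  # 6 bigrams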
def test_different_weights(self):
    schema = Schema(
        l=1024,
        xor_folds=0,
        kdf_hash='SHA256',
        kdf_info=base64.b64decode('c2NoZW1hX2V4YW1wbGU='),
        kdf_key_size=64,
        kdf_salt=base64.b64decode(
            'SCbL2zHNnmsckfzchsNkZY9XoHk96P'
            '/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA=='),
        kdf_type='HKDF',
        fields=[
            StringSpec(
                identifier='some info',
                hashing_properties=FieldHashingProperties(
                    encoding=FieldHashingProperties._DEFAULT_ENCODING,
                    comparator=bigram_tokenizer,
                    strategy=BitsPerTokenStrategy(20)
                ),
                description=None,
                case=StringSpec._DEFAULT_CASE,
                min_length=0,
                max_length=None
            )
        ]
    )
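
# How the two strategies budget bits (a hedged summary of the clkhash
# docs): BitsPerTokenStrategy(20) inserts ~20 bits for every token the
# comparator produces, so longer values set more bits, while
# BitsPerFeatureStrategy(200) spreads ~200 bits across all tokens of the
# feature regardless of its length.
def bits_per_token(bits, num_tokens):
    return bits * num_tokens           # BitsPerTokenStrategy: grows with tokens

def bits_per_feature(bits, num_tokens):
    return bits                        # BitsPerFeatureStrategy: fixed total

assert bits_per_token(20, 6) == 120    # e.g. 'Bobby' -> 6 bigrams -> 120 bits
assert bits_per_feature(200, 6) == 200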
def test_missing_value_integration():
    # We create two CLKs: one from PII containing the 'replaceWith'
    # values, and one containing the sentinels. If everything goes
    # right, the two CLKs will be identical.
    schema_dict = dict(
        version=1,
        clkConfig=dict(
            l=1024,
            k=20,
            hash=dict(type='doubleHash'),
            kdf=dict(type='HKDF')),
        features=[
            dict(
                identifier='name',
                format=dict(type='string', encoding='utf-8'),
                hashing=dict(
                    ngram=2,
                    missingValue=dict(sentinel='null', replaceWith='Bob'))),
            dict(
                identifier='age',
                format=dict(type='integer'),
                hashing=dict(
                    ngram=1,
                    missingValue=dict(sentinel='NA', replaceWith='42')))
        ])
    schema = Schema.from_json_dict(schema_dict)

    pii = [['Bob', '42'], ['null', 'NA']]

    clks = generate_clks(pii, schema=schema, keys=('sec1', 'sec2'),
                         validate=True, callback=None)
    assert len(clks) == 2
    assert clks[0] == clks[1]
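
# The missingValue rule amounts to substituting the sentinel before
# hashing. A hypothetical one-liner illustrating that rule (clkhash's
# real handling lives in its MissingValueSpec machinery):
def replace_missing(value, sentinel, replace_with):
    return replace_with if value == sentinel else value

assert replace_missing('null', 'null', 'Bob') == 'Bob'
assert replace_missing('NA', 'NA', '42') == '42'
assert replace_missing('Alice', 'null', 'Bob') == 'Alice'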
def mkSchema(hashing_properties):
    return Schema(
        l=1024,
        xor_folds=1,
        kdf_type='HKDF',
        kdf_hash='SHA256',
        kdf_salt=base64.b64decode(
            'SCbL2zHNnmsckfzchsNkZY9XoHk96P'
            '/G5nUBrM7ybymlEFsMV6PAeDZCNp3r'
            'fNUPCtLDMOGQHG4pCQpfhiHCyA=='),
        kdf_info=base64.b64decode('c2NoZW1hX2V4YW1wbGU='),
        kdf_key_size=64,
        fields=[
            StringSpec(
                identifier='name',
                hashing_properties=hashing_properties,
                description=None,
                case=StringSpec._DEFAULT_CASE,
                min_length=1,
                max_length=50)
        ])
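
# Example use of the helper (hedged: assumes the comparator/strategy
# style FieldHashingProperties constructor used in the other snippets
# here; older clkhash versions took ngram/k arguments instead):
schema = mkSchema(FieldHashingProperties(
    comparator=NgramComparison(2),
    strategy=BitsPerFeatureStrategy(100)))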
def test_xor_folding_integration(self):
    namelist = randomnames.NameList(1)
    schema_0 = namelist.SCHEMA
    assert schema_0.hashing_globals.xor_folds == 0

    schema_1 = Schema(version=schema_0.version,
                      hashing_globals=copy(schema_0.hashing_globals),
                      fields=schema_0.fields)
    schema_1.hashing_globals.xor_folds = 1
    schema_1.hashing_globals.l //= 2

    key_lists = generate_key_lists(('secret', 'sshh'),
                                   len(namelist.schema_types))
    bf_original, _, _ = next(bloomfilter.stream_bloom_filters(
        namelist.names, key_lists, schema_0))
    bf_folded, _, _ = next(bloomfilter.stream_bloom_filters(
        namelist.names, key_lists, schema_1))

    self.assertEqual(
        bf_folded,
        bf_original[:len(bf_original) // 2]
            ^ bf_original[len(bf_original) // 2:],
        'Folded filter is not an XOR of the two halves of the original.')
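
# XOR folding in isolation: a minimal sketch using bitarray directly,
# mirroring what the assertion above checks. One fold halves the filter
# by XOR-ing its two halves together.
from bitarray import bitarray

bf = bitarray('10110010')
half = len(bf) // 2
folded = bf[:half] ^ bf[half:]
assert folded == bitarray('1001')  # 1011 XOR 0010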
fields = [
    Ignore('rec_id'),
    StringSpec('given_name',
               FieldHashingProperties(comparator=NgramComparison(2),
                                      strategy=BitsPerFeatureStrategy(200))),
    StringSpec('surname',
               FieldHashingProperties(comparator=NgramComparison(2),
                                      strategy=BitsPerFeatureStrategy(200))),
    IntegerSpec('street_number',
                FieldHashingProperties(comparator=NgramComparison(1, True),
                                       strategy=BitsPerFeatureStrategy(100),
                                       missing_value=MissingValueSpec(sentinel=''))),
    StringSpec('address_1',
               FieldHashingProperties(comparator=NgramComparison(2),
                                      strategy=BitsPerFeatureStrategy(100))),
    StringSpec('address_2',
               FieldHashingProperties(comparator=NgramComparison(2),
                                      strategy=BitsPerFeatureStrategy(100))),
    StringSpec('suburb',
               FieldHashingProperties(comparator=NgramComparison(2),
                                      strategy=BitsPerFeatureStrategy(100))),
    IntegerSpec('postcode',
                FieldHashingProperties(comparator=NgramComparison(1, True),
                                       strategy=BitsPerFeatureStrategy(100))),
    StringSpec('state',
               FieldHashingProperties(comparator=NgramComparison(2),
                                      strategy=BitsPerFeatureStrategy(100))),
    IntegerSpec('date_of_birth',
                FieldHashingProperties(comparator=NgramComparison(1, True),
                                       strategy=BitsPerFeatureStrategy(200),
                                       missing_value=MissingValueSpec(sentinel=''))),
    Ignore('soc_sec_id')
]

secret = 'secret'
schema = Schema(fields, 1024)

a_csv.seek(0)
clks_a = generate_clk_from_csv(a_csv, secret, schema)
print(len(clks_a))

b_csv = io.StringIO()
dfB.to_csv(b_csv)
b_csv.seek(0)
clks_b = generate_clk_from_csv(b_csv, secret, schema)
print(len(clks_b))

# find matches
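
# A hedged sketch of the Dice-coefficient comparison that "find matches"
# performs next (assumes the CLKs come back base64-encoded, as
# generate_clk_from_csv produces; a real pipeline would use anonlink's
# optimised similarity code instead):
import base64

def popcount(x):
    return bin(x).count('1')

def dice(clk_a, clk_b):
    a = int.from_bytes(base64.b64decode(clk_a), 'big')
    b = int.from_bytes(base64.b64decode(clk_b), 'big')
    return 2 * popcount(a & b) / (popcount(a) + popcount(b))

# e.g. pairs with dice(clks_a[i], clks_b[j]) above a threshold such as
# 0.9 would then be treated as candidate matches.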
def test_compare_to_legacy(self):
    # Identifier: 'ANY freetext'
    schema = Schema(
        version=1,
        hashing_globals=GlobalHashingProperties(
            k=10,
            kdf_hash='SHA256',
            kdf_info=base64.b64decode('c2NoZW1hX2V4YW1wbGU='),
            kdf_key_size=64,
            kdf_salt=base64.b64decode(
                'SCbL2zHNnmsckfzchsNkZY9XoHk96P'
                '/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA=='),
            kdf_type='HKDF',
            l=1024,
            hash_type='doubleHash',
            hash_prevent_singularity=False,
            xor_folds=0),
        fields=[
            StringSpec(
                identifier='ANY text {}'.format(i + 1),
                hashing_properties=FieldHashingProperties(
                    encoding=FieldHashingProperties._DEFAULT_ENCODING,
                    ngram=2,
                    positional=False,
                    weight=1),
                description=None,
                case=StringSpec._DEFAULT_CASE,
                min_length=0,
                max_length=None)
            for i in range(4)
        ])
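
# What 'doubleHash' does, in outline (a sketch of the standard
# double-hashing scheme, not clkhash's exact code): the i-th bit index
# is (h1 + i*h2) mod l. If h2 % l == 0, every index collapses to
# h1 % l; that is the singularity hash_prevent_singularity=True guards
# against.
def double_hash_indices(h1, h2, k, l):
    return [(h1 + i * h2) % l for i in range(k)]

assert len(set(double_hash_indices(7, 0, 10, 1024))) == 1   # singularity
assert len(set(double_hash_indices(7, 3, 10, 1024))) == 10  # k distinct bits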
class NameList:
    """ Randomly generated PII records.
    """

    with open(os.path.join(os.path.dirname(__file__),
                           'data',
                           'randomnames-schema.json')) as f:
        SCHEMA = Schema.from_json_file(f)
    del f  # don't leave the loop variable in the class namespace

    def __init__(self, n):
        # type: (int) -> None
        # Declare the name pools before load_names() populates them.
        self.all_male_first_names = None    # type: Optional[Sequence[str]]
        self.all_female_first_names = None  # type: Optional[Sequence[str]]
        self.all_last_names = None          # type: Optional[Sequence[str]]
        self.load_names()

        self.earliest_birthday = datetime(year=1916, month=1, day=1)
        self.latest_birthday = datetime(year=2016, month=1, day=1)

        self.names = [person for person in self.generate_random_person(n)]

    @property
    def schema_types(self):
        # type: () -> Sequence[FieldSpec]
        return self.SCHEMA.fields

    def generate_random_person(self, n):
        # type: (int) -> Iterable[Tuple[str, str, str, str]]
        """ Generator that yields details on a person with plausible
            name, sex and age.

            :yields: Generated data for one person
                tuple - (id: str, name: str('First Last'),
                birthdate: str('YYYY/MM/DD'), sex: str('M' | 'F'))
        """
        assert self.all_male_first_names is not None
        assert self.all_female_first_names is not None
        assert self.all_last_names is not None
        for i in range(n):
            sex = 'M' if random.random() > 0.5 else 'F'
            dob = random_date(self.earliest_birthday,
                              self.latest_birthday).strftime("%Y/%m/%d")
            first_name = (random.choice(self.all_male_first_names)
                          if sex == 'M'
                          else random.choice(self.all_female_first_names))
            last_name = random.choice(self.all_last_names)

            yield (str(i), first_name + ' ' + last_name, dob, sex)

    def load_names(self):
        # type: () -> None
        """ Loads a name database from package data.

            Uses data files sourced from
            http://www.quietaffiliate.com/free-first-name-and-last-name-databases-csv-and-sql/
        """
        self.all_male_first_names = load_csv_data('male-first-names.csv')
        self.all_female_first_names = load_csv_data('female-first-names.csv')
        self.all_last_names = load_csv_data('CSV_Database_of_Last_Names.csv')

    def generate_subsets(self, sz, overlap=0.8, subsets=2):
        # type: (int, float, int) -> Tuple[List, ...]
        """ Return random subsets with nonempty intersection.

            The random subsets are of specified size. If an element is
            common to two subsets, then it is common to all subsets.
            This overlap is controlled by a parameter.

            :param sz: size of subsets to generate
            :param overlap: size of the intersection, as fraction of the
                subset length
            :param subsets: number of subsets to generate
            :raises ValueError: if there aren't sufficiently many names
                in the list to satisfy the request; more precisely,
                raises if
                (1 - subsets) * floor(overlap * sz) + subsets * sz
                > len(self.names).
            :return: tuple of subsets
        """
        overlap_sz = int(math.floor(overlap * sz))
        unique_sz = sz - overlap_sz            # Unique names per subset
        total_unique_sz = unique_sz * subsets  # Uniques in all subsets
        total_sz = overlap_sz + total_unique_sz

        if total_sz > len(self.names):
            msg = 'insufficient names for requested size and overlap'
            raise ValueError(msg)

        sset = random.sample(self.names, total_sz)

        # Overlapping subset, pool of unique names
        sset_overlap, sset_unique = sset[:overlap_sz], sset[overlap_sz:]
        assert len(sset_unique) == subsets * unique_sz

        # Split pool of unique names into `subsets` chunks
        uniques = (sset_unique[p * unique_sz:(p + 1) * unique_sz]
                   for p in range(subsets))

        return tuple(sset_overlap + u for u in uniques)
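
# Example use (sizes are illustrative): generate 300 records, then split
# them into two subsets of 100 that share floor(0.8 * 100) = 80 records,
# as generate_subsets promises.
names = NameList(300)
subset_a, subset_b = names.generate_subsets(100, overlap=0.8)
assert len(subset_a) == len(subset_b) == 100
assert len(set(subset_a) & set(subset_b)) == 80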