def test_compare_v1_and_v2(self):
    """Check that the v1 schema and its v2 counterpart yield identical CLKs.

    The same 100 generated records are encoded once per schema with the
    same keys, and the results are compared pairwise.
    """
    records = randomnames.NameList(100).names
    keys = ('secret', 'sshh')

    schema_v1 = randomnames.NameList.SCHEMA
    # The v2 schema file is expected to be equivalent to the v1 schema above.
    schema_v2 = _test_schema('randomnames-schema-v2.json')

    clks_v1 = clk.generate_clks(records, schema_v1, keys)
    clks_v2 = clk.generate_clks(records, schema_v2, keys)
    for clkv1, clkv2 in zip(clks_v1, clks_v2):
        self.assertEqual(clkv1, clkv2)
def test_compare_v1_v2_and_v3(self):
    """Check that the v1, v2 and v3 schemas all yield the same CLKs.

    The same 100 generated records are encoded once per schema with the
    same secret; the v1 result is compared against both v2 and v3.
    """
    records = randomnames.NameList(100).names
    secret = 'secret'

    schema_v3 = randomnames.NameList.SCHEMA
    # The v2 schema file is expected to be equivalent to the v3 schema above;
    # the assertions below require the same of the v1 schema file.
    schema_v2 = _test_schema('randomnames-schema-v2.json')
    schema_v1 = _test_schema('randomnames-schema-v1.json')

    # One CLK sequence per schema version, in v1, v2, v3 order.
    clks_per_version = [
        clk.generate_clks(records, versioned_schema, secret)
        for versioned_schema in (schema_v1, schema_v2, schema_v3)
    ]
    for clkv1, clkv2, clkv3 in zip(*clks_per_version):
        self.assertEqual(clkv1, clkv2)
        self.assertEqual(clkv1, clkv3)
def _test_stats(pii, schema, keys):
    """Return the mean and standard deviation of per-CLK bit counts.

    Each CLK generated from ``pii`` with ``schema`` and ``keys`` is
    deserialized and its ``count()`` recorded. The counts are printed for
    debugging, then summarised with ``OnlineMeanVariance``.

    :return: tuple ``(mean, std)`` as reported by ``OnlineMeanVariance``.
    """
    counts = []
    for encoded in clk.generate_clks(pii, schema, keys):
        counts.append(deserialize_bitarray(encoded).count())
    print('_test_stats: counts = ', counts)

    stats = OnlineMeanVariance()
    stats.update(counts)
    return stats.mean(), stats.std()
def test_missing_value_integration():
    """Sentinel values must hash exactly like their 'replaceWith' values.

    Two records are encoded with a version 1 schema: one holding the
    'replaceWith' values directly and one holding the sentinels. If missing
    value handling works, both CLKs are identical.

    NOTE(review): another function with this same name is defined in this
    file; if both live in the same scope, the later definition shadows this
    one -- confirm that both tests are actually collected.
    """
    clk_config = {
        'l': 1024,
        'k': 20,
        'hash': {'type': 'doubleHash'},
        'kdf': {'type': 'HKDF'},
    }
    name_feature = {
        'identifier': 'name',
        'format': {'type': 'string', 'encoding': 'utf-8'},
        'hashing': {
            'ngram': 2,
            'missingValue': {'sentinel': 'null', 'replaceWith': 'Bob'},
        },
    }
    age_feature = {
        'identifier': 'age',
        'format': {'type': 'integer'},
        'hashing': {
            'ngram': 1,
            'missingValue': {'sentinel': 'NA', 'replaceWith': '42'},
        },
    }
    schema_dict = {
        'version': 1,
        'clkConfig': clk_config,
        'features': [name_feature, age_feature],
    }
    parsed = schema.from_json_dict(schema_dict)

    # First row: the replacement values themselves; second row: the sentinels.
    records = [['Bob', '42'], ['null', 'NA']]
    encoded = generate_clks(records, schema=parsed, keys=('sec1', 'sec2'))

    assert len(encoded) == 2
    assert encoded[0] == encoded[1]
def test_missing_value_integration():
    """Sentinel values must hash exactly like their 'replaceWith' values.

    Two records are encoded with a version 2 schema (given as JSON): one
    holding the 'replaceWith' values directly and one holding the sentinels.
    If missing value handling works, both CLKs are identical.

    NOTE(review): another function with this same name is defined in this
    file; if both live in the same scope, only the later definition is
    kept -- confirm that both tests are actually collected.
    """
    schema_json = """
    {
      "version": 2,
      "clkConfig": {
        "l": 1024,
        "kdf": {
          "type": "HKDF"
        }
      },
      "features": [
        {
          "identifier": "name",
          "format": {
            "type": "string",
            "encoding": "utf-8"
          },
          "hashing": {
            "ngram": 2,
            "strategy": {
              "k": 20
            },
            "missingValue": {
              "sentinel": "null",
              "replaceWith": "Bob"
            }
          }
        },
        {
          "identifier": "age",
          "format": {
            "type": "integer"
          },
          "hashing": {
            "ngram": 1,
            "strategy": {
              "k": 20
            },
            "missingValue": {
              "sentinel": "NA",
              "replaceWith": "42"
            }
          }
        }
      ]
    }
    """
    parsed = schema.from_json_dict(json.loads(schema_json))

    # First row: the replacement values themselves; second row: the sentinels.
    records = [['Bob', '42'], ['null', 'NA']]
    encoded = generate_clks(records, schema=parsed, keys=('sec1', 'sec2'))

    assert len(encoded) == 2
    assert encoded[0] == encoded[1]
def test_describe(self):
    """Plot the CLKs of 1000 generated records and check the reported count.

    The CLKs are serialized as a JSON document under the 'clks' key, handed
    to ``plot`` through an in-memory text stream, and the captured standard
    output is checked for the number of observations.
    """
    size = 1000
    pii_data = randomnames.NameList(size)
    clks = generate_clks(pii_data.names, pii_data.SCHEMA, 'secret', validate=True)

    # clkutil describe
    plot(StringIO(json.dumps({'clks': clks})))

    captured = self.temp_std_out.getvalue()
    assert ' observations: {} '.format(size) in captured
def test_namelist_hashable(self):
    """Check that CLKs are hashable and that overlapping subsets share CLKs.

    Two subsets of 100 records are drawn via ``generate_subsets(100, 0.8)``
    and encoded with the same schema and keys. The resulting CLKs are put
    into sets, and at least 80 of them must be exactly equal across the two.
    """
    namelist = randomnames.NameList(1000)
    subset_a, subset_b = namelist.generate_subsets(100, 0.8)
    for subset in (subset_a, subset_b):
        self.assertEqual(len(subset), 100)

    name_schema = randomnames.NameList.SCHEMA
    keys = ('secret', 'sshh')
    clks_a = clk.generate_clks(subset_a, name_schema, keys)
    clks_b = clk.generate_clks(subset_b, name_schema, keys)
    for clks in (clks_a, clks_b):
        self.assertEqual(len(clks), 100)

    # An "exact match" bloomfilter comparison: only identical CLKs count.
    shared = set(clks_a).intersection(set(clks_b))
    self.assertGreaterEqual(
        len(shared), 80,
        "Expected at least 80 hashes to be exactly the same")