def setUp(self):
    super(CLITestHelper, self).setUp()
    self.pii_file = create_temp_file()
    self.pii_file_2 = create_temp_file()

    # Get random PII
    pii_data = randomnames.NameList(self.SAMPLES)
    data = [(name, dob) for _, name, dob, _ in pii_data.names]

    headers = ['NAME freetext', 'DOB YYYY/MM/DD']
    randomnames.save_csv(data, headers, self.pii_file)

    random.shuffle(data)
    randomnames.save_csv(data[::2], headers, self.pii_file_2)

    self.default_schema = [
        {"identifier": "INDEX"},
        {"identifier": "NAME freetext"},
        {"identifier": "DOB YYYY/MM/DD"},
        {"identifier": "GENDER M or F"}
    ]

    self.pii_file.close()
    self.pii_file_2.close()

def test_xor_folding_integration(self):
    namelist = randomnames.NameList(1)
    schema_0 = namelist.SCHEMA
    assert schema_0.xor_folds == 0

    schema_1 = copy(schema_0)
    schema_1.xor_folds = 1
    schema_1.l //= 2

    key_lists = generate_key_lists('secret', len(namelist.schema_types))
    bf_original, _, _ = next(bloomfilter.stream_bloom_filters(
        namelist.names, key_lists, schema_0))
    bf_folded, _, _ = next(bloomfilter.stream_bloom_filters(
        namelist.names, key_lists, schema_1))

    self.assertEqual(
        bf_folded,
        bf_original[:len(bf_original) // 2]
            ^ bf_original[len(bf_original) // 2:],
        'Folded filter is not an XOR of the two halves of the original.')

def setUp(self):
    self.pii_file = create_temp_file()
    pii_data = randomnames.NameList(TestHasherDefaultSchema.samples)
    randomnames.save_csv(
        pii_data.names,
        [f.identifier for f in pii_data.SCHEMA.fields],
        self.pii_file)
    self.pii_file.flush()

def test_generate_subsets(self):
    nl = rn.NameList(20)
    s1, s2 = nl.generate_subsets(10, 0.8)

    # With an overlap of 0.8, exactly 8 of the 10 records should
    # appear in both subsets.
    counteq = 0
    for s in s1:
        for t in s2:
            if s == t:
                counteq += 1
    self.assertEqual(counteq, 8)

def test_compare_v1_and_v2(self):
    pii = randomnames.NameList(100).names
    schema_v1 = randomnames.NameList.SCHEMA
    # this v2 schema should be equivalent to the above v1 schema
    schema_v2 = _test_schema('randomnames-schema-v2.json')
    keys = ('secret', 'sshh')
    for clkv1, clkv2 in zip(clk.generate_clks(pii, schema_v1, keys),
                            clk.generate_clks(pii, schema_v2, keys)):
        self.assertEqual(clkv1, clkv2)

def create_test_data(entities, crossover=0.8, save_raw=True):
    """
    Uses the NameList data and schema to create local files of
    raw data and CLK data:

    - e1_NUM_raw.csv
    - e1_NUM.json
    - e2_NUM_raw.csv
    - e2_NUM.json

    :param bool save_raw: Set to False to skip saving raw files
    """
    print("Generating random test data for {} individuals".format(entities))
    from timeit import default_timer as timer

    t0 = timer()
    nl = randomnames.NameList(entities * 2)
    s1, s2 = nl.generate_subsets(entities, crossover)
    t1 = timer()
    print("generated data in {:.3f} s".format(t1 - t0))

    def save_subset_data(s, f):
        print(",".join(nl.schema), file=f)
        for entity in s:
            print(",".join(map(str, entity)), file=f)

    def save_filter_data(filters, f):
        print("Serializing filters")
        serialized_filters = serialize_filters(filters)
        json.dump(serialized_filters, f)

    keys = ('something', 'secret')

    if save_raw:
        with open("data/e1_{}_raw.csv".format(entities), "w") as f:
            save_subset_data(s1, f)
        with open("data/e2_{}_raw.csv".format(entities), "w") as f:
            save_subset_data(s2, f)
        t2 = timer()
        print("Saved raw data in {:.3f} s".format(t2 - t1))
    else:
        # Still need a reference point for timing the hashing step.
        t2 = timer()

    print("Locally hashing identity data to create bloom filters")

    # Save serialized filters
    with open("data/e1_{}.json".format(entities), 'w') as f1:
        save_filter_data(
            bloomfilter.calculate_bloom_filters(s1, nl.schema, keys), f1)
    with open("data/e2_{}.json".format(entities), 'w') as f2:
        save_filter_data(
            bloomfilter.calculate_bloom_filters(s2, nl.schema, keys), f2)

    t3 = timer()
    print("Hashed and serialized data in {:.3f} s".format(t3 - t2))

def test_describe(self):
    size = 1000
    pii_data = randomnames.NameList(size)
    clks = generate_clks(pii_data.names, pii_data.SCHEMA, 'secret',
                         validate=True)
    json_clks = json.dumps({'clks': clks})
    plot(StringIO(json_clks))  # clkutil describe
    assert ' observations: {} '.format(size) in self.temp_std_out.getvalue()

def test_generate_large_subsets(self):
    nl = rn.NameList(2000)
    s1, s2 = nl.generate_subsets(1000, 0.5)

    # Records match when they share the same index (first column).
    counteq = 0
    for s in s1:
        for t in s2:
            if s[0] == t[0]:
                counteq += 1
    self.assertEqual(counteq, 500)

def setUpClass(cls):
    cls.proportion = 0.8
    nl = randomnames.NameList(300)
    s1, s2 = nl.generate_subsets(200, cls.proportion)
    keys = generate_key_lists(('test1', 'test2'), len(nl.schema))
    cls.filters1 = bloomfilter.calculate_bloom_filters(
        s1, schema.get_schema_types(nl.schema), keys)
    cls.filters2 = bloomfilter.calculate_bloom_filters(
        s2, schema.get_schema_types(nl.schema), keys)

def generate(size, output, schema):
    """Generate fake PII data for testing"""
    pii_data = randomnames.NameList(size)

    if schema is not None:
        raise NotImplementedError

    randomnames.save_csv(
        pii_data.names,
        [f.identifier for f in pii_data.SCHEMA.fields],
        output)

def generate_data(samples, proportion=0.75):
    nl = randomnames.NameList(samples * 2)
    s1, s2 = nl.generate_subsets(samples, proportion)

    keys = generate_key_lists(('test1', 'test2'), len(nl.schema))
    filters1 = bloomfilter.calculate_bloom_filters(
        s1, schema.get_schema_types(nl.schema), keys)
    filters2 = bloomfilter.calculate_bloom_filters(
        s2, schema.get_schema_types(nl.schema), keys)

    return (s1, s2, filters1, filters2)

def test_generate_subsets_raises(self):
    # sz = 999
    # n = floor(sz * 1.2) = 1198
    # overlap = floor(0.8 * 999) = 799
    # notoverlap = sz - overlap = 200.
    # Thus sz + notoverlap = 1199 > n.
    sz = 999
    n = int(math.floor(sz * 1.2))
    names = rn.NameList(n)
    with pytest.raises(ValueError):
        s1, s2 = names.generate_subsets(sz, 0.8)

def generate_data(samples, proportion=0.75):
    nl = randomnames.NameList(samples * 2)
    s1, s2 = nl.generate_subsets(samples, proportion)

    keys = generate_key_lists('secret', len(nl.schema_types))
    filters1 = list(map(
        itemgetter(0),
        bloomfilter.stream_bloom_filters(s1, keys, nl.SCHEMA)))
    filters2 = list(map(
        itemgetter(0),
        bloomfilter.stream_bloom_filters(s2, keys, nl.SCHEMA)))

    return (s1, s2, filters1, filters2)

def test_compare_v1_v2_and_v3(self):
    pii = randomnames.NameList(100).names
    schema_v3 = randomnames.NameList.SCHEMA
    # the v1 and v2 schemas below should be equivalent to the v3 schema above
    schema_v2 = _test_schema('randomnames-schema-v2.json')
    schema_v1 = _test_schema('randomnames-schema-v1.json')
    secret = 'secret'
    for clkv1, clkv2, clkv3 in zip(
            clk.generate_clks(pii, schema_v1, secret),
            clk.generate_clks(pii, schema_v2, secret),
            clk.generate_clks(pii, schema_v3, secret)):
        self.assertEqual(clkv1, clkv2)
        self.assertEqual(clkv1, clkv3)

def test_cffi_manual(self):
    nl = randomnames.NameList(30)
    s1, s2 = nl.generate_subsets(5, 1.0)
    keys = generate_key_lists(('test1', 'test2'), len(nl.schema_types))
    f1 = tuple(
        f[0] for f in bloomfilter.stream_bloom_filters(s1, keys, nl.SCHEMA))
    f2 = tuple(
        f[0] for f in bloomfilter.stream_bloom_filters(s2, keys, nl.SCHEMA))

    py_similarity = similarities.dice_coefficient_python(
        (f1, f2), self.default_threshold, self.default_k)
    c_similarity = similarities.dice_coefficient_accelerated(
        (f1, f2), self.default_threshold, self.default_k)
    self.assert_similarity_matrices_equal(py_similarity, c_similarity)

def setup_class(cls):
    cls.proportion = 0.8
    nl = randomnames.NameList(300)
    s1, s2 = nl.generate_subsets(200, cls.proportion)
    keys = generate_key_lists(('test1', 'test2'), len(nl.schema_types))
    cls.filters1 = tuple(
        f[0] for f in bloomfilter.stream_bloom_filters(s1, keys, nl.SCHEMA))
    cls.filters2 = tuple(
        f[0] for f in bloomfilter.stream_bloom_filters(s2, keys, nl.SCHEMA))
    cls.filters = cls.filters1, cls.filters2

    cls.default_k = 10
    cls.default_threshold = 0.5

def test_cffi_manual(self):
    nl = randomnames.NameList(30)
    s1, s2 = nl.generate_subsets(5, 1.0)
    keys = generate_key_lists(('test1', 'test2'), len(nl.schema))
    f1 = bloomfilter.calculate_bloom_filters(
        s1, schema.get_schema_types(nl.schema), keys)
    f2 = bloomfilter.calculate_bloom_filters(
        s2, schema.get_schema_types(nl.schema), keys)

    ps = entitymatch.python_filter_similarity(f1, f2)
    cs = entitymatch.cffi_filter_similarity_k(f1, f2, 1, 0.0)

    python_scores = [p[1] for p in ps]
    c_scores = [c[1] for c in cs]

    # Compare the score lists element-wise; assertAlmostEqual is not
    # defined for lists, so check length and each pair of scores.
    self.assertEqual(len(python_scores), len(c_scores))
    for p_score, c_score in zip(python_scores, c_scores):
        self.assertAlmostEqual(p_score, c_score)

def test_cffi_k(self):
    nl = randomnames.NameList(300)
    s1, s2 = nl.generate_subsets(150, 0.8)
    keys = ('test1', 'test2')
    key_lists = generate_key_lists(keys, len(nl.schema))
    f1 = bloomfilter.calculate_bloom_filters(
        s1, schema.get_schema_types(nl.schema), key_lists)
    f2 = bloomfilter.calculate_bloom_filters(
        s2, schema.get_schema_types(nl.schema), key_lists)

    threshold = 0.8
    similarity = entitymatch.cffi_filter_similarity_k(f1, f2, 4, threshold)
    mapping = network_flow.map_entities(similarity, threshold=threshold,
                                        method=None)

    for indexA in mapping:
        self.assertEqual(s1[indexA], s2[mapping[indexA]])

def test_hashing_json_schema(self):
    runner = CliRunner()

    pii_data = randomnames.NameList(self.SAMPLES)
    pii_file = create_temp_file()
    randomnames.save_csv(pii_data.names,
                         [f.identifier for f in pii_data.SCHEMA.fields],
                         pii_file)
    pii_file.close()

    with temporary_file() as output_filename:
        with open(output_filename) as output:
            cli_result = runner.invoke(
                cli.cli,
                ['hash', pii_file.name, 'secret',
                 RANDOMNAMES_SCHEMA_PATH, output.name])
        self.assertEqual(cli_result.exit_code, 0, msg=cli_result.output)

        with open(output_filename) as output:
            self.assertIn('clks', json.load(output))

def test_namelist_hashable(self):
    namelist = randomnames.NameList(1000)
    s1, s2 = namelist.generate_subsets(100, 0.8)

    self.assertEqual(len(s1), 100)
    self.assertEqual(len(s2), 100)

    schema = randomnames.NameList.SCHEMA
    keys = ('secret', 'sshh')

    bf1 = clk.generate_clks(s1, schema, keys)
    bf2 = clk.generate_clks(s2, schema, keys)

    self.assertEqual(len(bf1), 100)
    self.assertEqual(len(bf2), 100)

    # An "exact match" bloomfilter comparison:
    set1 = set(bf1)
    set2 = set(bf2)

    self.assertGreaterEqual(
        len(set1 & set2), 80,
        "Expected at least 80 hashes to be exactly the same")

def setUp(self):
    self.nl = randomnames.NameList(300)
    self.s1, self.s2 = self.nl.generate_subsets(self.sample,
                                                self.proportion)
    self.key_lists = generate_key_lists('secret',
                                        len(self.nl.schema_types))

def test_generate_subsets_raises(self):
    # 15 records are not enough to build five subsets of 10 with an
    # overlap of 0.8, so this should raise.
    names = rn.NameList(15)
    with pytest.raises(ValueError):
        names.generate_subsets(10, 0.8, subsets=5)

def test_generate_large_subsets(self):
    nl = rn.NameList(5000)
    subsets = map(set, nl.generate_subsets(1000, 0.5, subsets=3))
    for s1, s2 in itertools.combinations(subsets, 2):
        self.assertEqual(len(s1 & s2), 500, msg='unexpected overlap size')

def setUp(self):
    self.nl = randomnames.NameList(300)
    self.s1, self.s2 = self.nl.generate_subsets(self.sample,
                                                self.proportion)
    keys = ('test1', 'test2')
    self.key_lists = generate_key_lists(keys, len(self.nl.schema_types))