Example #1
0
    def setUp(self):
        """Create two temporary PII CSV files with overlapping random records."""
        super(CLITestHelper, self).setUp()
        self.pii_file = create_temp_file()
        self.pii_file_2 = create_temp_file()

        # Generate random PII, keeping only the name and date-of-birth columns.
        name_list = randomnames.NameList(self.SAMPLES)
        rows = [(full_name, birth_date)
                for _, full_name, birth_date, _ in name_list.names]

        csv_headers = ['NAME freetext', 'DOB YYYY/MM/DD']
        randomnames.save_csv(rows, csv_headers, self.pii_file)

        # The second file holds a shuffled half-sample of the same rows.
        random.shuffle(rows)
        randomnames.save_csv(rows[::2], csv_headers, self.pii_file_2)

        self.default_schema = [
            {"identifier": "INDEX"},
            {"identifier": "NAME freetext"},
            {"identifier": "DOB YYYY/MM/DD"},
            {"identifier": "GENDER M or F"},
        ]

        self.pii_file.close()
        self.pii_file_2.close()
Example #2
0
    def test_xor_folding_integration(self):
        """A once-folded filter equals the XOR of the halves of the unfolded one."""
        namelist = randomnames.NameList(1)
        unfolded_schema = namelist.SCHEMA
        assert unfolded_schema.xor_folds == 0

        # Derive a schema with a single fold and half the filter length.
        folded_schema = copy(unfolded_schema)
        folded_schema.xor_folds = 1
        folded_schema.l //= 2

        key_lists = generate_key_lists('secret',
                                       len(namelist.schema_types))
        bf_original = next(bloomfilter.stream_bloom_filters(
            namelist.names, key_lists, unfolded_schema))[0]
        bf_folded = next(bloomfilter.stream_bloom_filters(
            namelist.names, key_lists, folded_schema))[0]

        half = len(bf_original) // 2
        self.assertEqual(
            bf_folded,
            bf_original[:half] ^ bf_original[half:],
            'Folded filter is not an XOR of the two halves of the original.')
Example #3
0
    def setUp(self):
        """Write a random PII CSV covering every field of the default schema."""
        self.pii_file = create_temp_file()

        name_list = randomnames.NameList(TestHasherDefaultSchema.samples)
        column_names = [field.identifier for field in name_list.SCHEMA.fields]
        randomnames.save_csv(name_list.names, column_names, self.pii_file)
        self.pii_file.flush()
Example #4
0
 def test_generate_subsets(self):
     """Two subsets of 10 drawn with 0.8 overlap share exactly 8 records."""
     name_list = rn.NameList(20)
     subset_a, subset_b = name_list.generate_subsets(10, 0.8)
     # Count record pairs that are identical across the two subsets.
     shared = sum(1 for a in subset_a for b in subset_b if a == b)
     self.assertEqual(shared, 8)
Example #5
0
 def test_compare_v1_and_v2(self):
     """CLKs from the v1 schema must equal those from the equivalent v2 schema."""
     records = randomnames.NameList(100).names
     schema_v1 = randomnames.NameList.SCHEMA
     # this v2 schema should be equivalent to the above v1 schema
     schema_v2 = _test_schema('randomnames-schema-v2.json')
     keys = ('secret', 'sshh')
     clks_v1 = clk.generate_clks(records, schema_v1, keys)
     clks_v2 = clk.generate_clks(records, schema_v2, keys)
     for clk_a, clk_b in zip(clks_v1, clks_v2):
         self.assertEqual(clk_a, clk_b)
def create_test_data(entities, crossover=0.8, save_raw=True):
    """
    Uses the NameList data and schema and creates
    local files for raw data and clk data:

    - e1_NUM_raw.csv
    - e1_NUM.json
    - e2_NUM_raw.csv
    - e2_NUM.json

    :param int entities: Number of entities in each of the two generated
        datasets (the underlying NameList holds ``entities * 2`` records).
    :param float crossover: Proportion of entities shared between the two
        subsets, passed to ``NameList.generate_subsets``.
    :param bool save_raw: Set to False to skip saving raw files
    """
    print("Generating random test data for {} individuals".format(entities))

    # Imported locally so importing this module stays side-effect free.
    from timeit import default_timer as timer

    t0 = timer()

    nl = randomnames.NameList(entities * 2)
    s1, s2 = nl.generate_subsets(entities, crossover)
    t1 = timer()
    print("generated data in {:.3f} s".format(t1 - t0))

    def save_subset_data(s, f):
        # CSV header row comes from the NameList schema identifiers.
        print(",".join(nl.schema), file=f)
        for entity in s:
            print(",".join(map(str, entity)), file=f)

    def save_filter_data(filters, f):
        # Bloom filters are serialized to a JSON-compatible form first.
        print("Serializing filters")
        serialized_filters = serialize_filters(filters)

        json.dump(serialized_filters, f)

    keys = ('something', 'secret')

    if save_raw:
        with open("data/e1_{}_raw.csv".format(entities), "w") as f:
            save_subset_data(s1, f)

        with open("data/e2_{}_raw.csv".format(entities), "w") as f:
            save_subset_data(s2, f)
    t2 = timer()
    print("Saved raw data in {:.3f} s".format(t2 - t1))
    print("Locally hashing identity data to create bloom filters")

    # Save serialized filters
    with open("data/e1_{}.json".format(entities), 'w') as f1:
        save_filter_data(
            bloomfilter.calculate_bloom_filters(s1, nl.schema, keys), f1)

    with open("data/e2_{}.json".format(entities), 'w') as f2:
        save_filter_data(
            bloomfilter.calculate_bloom_filters(s2, nl.schema, keys), f2)

    t3 = timer()
    print("Hashed and serialized data in {:.3f} s".format(t3 - t2))
Example #7
0
    def test_describe(self):
        """The describe plot reports the expected number of observations."""
        sample_count = 1000
        name_list = randomnames.NameList(sample_count)

        clks = generate_clks(name_list.names, name_list.SCHEMA, 'secret', validate=True)
        json_clks = json.dumps({'clks': clks})

        plot(StringIO(json_clks))   # clkutil describe

        expected_fragment = ' observations: {} '.format(sample_count)
        assert expected_fragment in self.temp_std_out.getvalue()
Example #8
0
    def test_generate_large_subsets(self):
        """Subsets of 1000 with 0.5 overlap share exactly 500 first-column ids."""
        name_list = rn.NameList(2000)
        subset_a, subset_b = name_list.generate_subsets(1000, 0.5)
        # Records match when their first column (the index/id) is equal.
        matches = sum(1 for a in subset_a for b in subset_b if a[0] == b[0])
        self.assertEqual(matches, 500)
Example #9
0
    def setUpClass(cls):
        """Build two overlapping Bloom-filter populations shared by all tests."""
        cls.proportion = 0.8
        name_list = randomnames.NameList(300)
        subset_a, subset_b = name_list.generate_subsets(200, cls.proportion)

        keys = generate_key_lists(('test1', 'test2'), len(name_list.schema))
        schema_types = schema.get_schema_types(name_list.schema)
        cls.filters1 = bloomfilter.calculate_bloom_filters(
            subset_a, schema_types, keys)
        cls.filters2 = bloomfilter.calculate_bloom_filters(
            subset_b, schema_types, keys)
Example #10
0
def generate(size, output, schema):
    """Generate fake PII data for testing.

    :param int size: Number of fake records to generate.
    :param output: Writable file-like object to receive the CSV data.
    :param schema: Custom schema; only ``None`` (the default NameList
        schema) is currently supported.
    :raises NotImplementedError: If a custom schema is provided.
    """
    # Fail fast: reject unsupported custom schemas before doing the
    # potentially expensive random data generation.
    if schema is not None:
        raise NotImplementedError

    pii_data = randomnames.NameList(size)
    randomnames.save_csv(pii_data.names,
                         [f.identifier for f in pii_data.SCHEMA.fields],
                         output)
Example #11
0
def generate_data(samples, proportion=0.75):
    """Return two overlapping record subsets and their Bloom filters.

    :param int samples: Size of each subset (pool is twice this size).
    :param float proportion: Fraction of records shared by the subsets.
    :return: tuple ``(s1, s2, filters1, filters2)``.
    """
    name_list = randomnames.NameList(samples * 2)
    subset_a, subset_b = name_list.generate_subsets(samples, proportion)

    keys = generate_key_lists(('test1', 'test2'), len(name_list.schema))
    filters_a = bloomfilter.calculate_bloom_filters(
        subset_a, schema.get_schema_types(name_list.schema), keys)
    filters_b = bloomfilter.calculate_bloom_filters(
        subset_b, schema.get_schema_types(name_list.schema), keys)

    return (subset_a, subset_b, filters_a, filters_b)
Example #12
0
 def test_generate_subsets_raises(self):
     """Subsets whose combined size exceeds the pool must raise ValueError."""
     # sz = 999, n = floor(sz * 1.2) = 1198
     # overlap = floor(0.8 * 999) = 799
     # notoverlap = sz - overlap = 200, so sz + notoverlap = 1199 > n.
     subset_size = 999
     pool_size = int(math.floor(subset_size * 1.2))
     names = rn.NameList(pool_size)
     with pytest.raises(ValueError):
         names.generate_subsets(subset_size, 0.8)
Example #13
0
def generate_data(samples, proportion=0.75):
    """Return two overlapping record subsets and their streamed Bloom filters.

    :param int samples: Size of each subset (pool is twice this size).
    :param float proportion: Fraction of records shared by the subsets.
    :return: tuple ``(s1, s2, filters1, filters2)``.
    """
    name_list = randomnames.NameList(samples * 2)
    subset_a, subset_b = name_list.generate_subsets(samples, proportion)

    keys = generate_key_lists('secret', len(name_list.schema_types))
    # Each streamed result is a tuple; the Bloom filter is its first element.
    filters_a = [result[0] for result in bloomfilter.stream_bloom_filters(
        subset_a, keys, name_list.SCHEMA)]
    filters_b = [result[0] for result in bloomfilter.stream_bloom_filters(
        subset_b, keys, name_list.SCHEMA)]

    return (subset_a, subset_b, filters_a, filters_b)
Example #14
0
 def test_compare_v1_v2_and_v3(self):
     """CLKs produced by the v1, v2 and v3 schemas must be pairwise equal."""
     records = randomnames.NameList(100).names
     schema_v3 = randomnames.NameList.SCHEMA
     # this v2 schema should be equivalent to the above v3 schema
     schema_v2 = _test_schema('randomnames-schema-v2.json')
     schema_v1 = _test_schema('randomnames-schema-v1.json')
     secret = 'secret'
     clks_v1 = clk.generate_clks(records, schema_v1, secret)
     clks_v2 = clk.generate_clks(records, schema_v2, secret)
     clks_v3 = clk.generate_clks(records, schema_v3, secret)
     for clkv1, clkv2, clkv3 in zip(clks_v1, clks_v2, clks_v3):
         self.assertEqual(clkv1, clkv2)
         self.assertEqual(clkv1, clkv3)
Example #15
0
    def test_cffi_manual(self):
        """Python and accelerated dice coefficients agree on identical subsets."""
        name_list = randomnames.NameList(30)
        subset_a, subset_b = name_list.generate_subsets(5, 1.0)
        keys = generate_key_lists(('test1', 'test2'),
                                  len(name_list.schema_types))
        filters_a = tuple(
            result[0] for result in bloomfilter.stream_bloom_filters(
                subset_a, keys, name_list.SCHEMA))
        filters_b = tuple(
            result[0] for result in bloomfilter.stream_bloom_filters(
                subset_b, keys, name_list.SCHEMA))

        py_similarity = similarities.dice_coefficient_python(
            (filters_a, filters_b), self.default_threshold, self.default_k)
        c_similarity = similarities.dice_coefficient_accelerated(
            (filters_a, filters_b), self.default_threshold, self.default_k)
        self.assert_similarity_matrices_equal(py_similarity, c_similarity)
Example #16
0
    def setup_class(cls):
        """Precompute two overlapping Bloom-filter sets and default parameters."""
        cls.proportion = 0.8
        name_list = randomnames.NameList(300)
        subset_a, subset_b = name_list.generate_subsets(200, cls.proportion)

        keys = generate_key_lists(('test1', 'test2'),
                                  len(name_list.schema_types))
        # Keep only the Bloom filter (first element) of each streamed result.
        cls.filters1 = tuple(
            result[0] for result in bloomfilter.stream_bloom_filters(
                subset_a, keys, name_list.SCHEMA))
        cls.filters2 = tuple(
            result[0] for result in bloomfilter.stream_bloom_filters(
                subset_b, keys, name_list.SCHEMA))
        cls.filters = cls.filters1, cls.filters2

        cls.default_k = 10
        cls.default_threshold = 0.5
Example #17
0
    def test_cffi_manual(self):
        """Check the C-accelerated similarity scores match the Python reference.

        Note: ``assertAlmostEqual`` on two *lists* only succeeds when they
        compare exactly equal (otherwise ``round(first - second, 7)`` raises
        TypeError), so the scores are compared pairwise instead.
        """
        nl = randomnames.NameList(30)
        s1, s2 = nl.generate_subsets(5, 1.0)
        keys = generate_key_lists(('test1', 'test2'), len(nl.schema))
        f1 = bloomfilter.calculate_bloom_filters(
            s1, schema.get_schema_types(nl.schema), keys)
        f2 = bloomfilter.calculate_bloom_filters(
            s2, schema.get_schema_types(nl.schema), keys)

        ps = entitymatch.python_filter_similarity(f1, f2)
        cs = entitymatch.cffi_filter_similarity_k(f1, f2, 1, 0.0)

        python_scores = [p[1] for p in ps]
        c_scores = [c[1] for c in cs]

        # Compare score-by-score so tiny float differences are tolerated.
        self.assertEqual(len(python_scores), len(c_scores))
        for py_score, c_score in zip(python_scores, c_scores):
            self.assertAlmostEqual(py_score, c_score)
Example #18
0
    def test_cffi_k(self):
        """Entities mapped via the C similarity at k=4 must be true matches."""
        name_list = randomnames.NameList(300)
        subset_a, subset_b = name_list.generate_subsets(150, 0.8)
        key_lists = generate_key_lists(('test1', 'test2'),
                                       len(name_list.schema))
        filters_a = bloomfilter.calculate_bloom_filters(
            subset_a, schema.get_schema_types(name_list.schema), key_lists)
        filters_b = bloomfilter.calculate_bloom_filters(
            subset_b, schema.get_schema_types(name_list.schema), key_lists)

        threshold = 0.8
        similarity = entitymatch.cffi_filter_similarity_k(
            filters_a, filters_b, 4, threshold)
        mapping = network_flow.map_entities(
            similarity, threshold=threshold, method=None)

        # Every mapped pair must be the same underlying record.
        for index_a, index_b in mapping.items():
            self.assertEqual(subset_a[index_a], subset_b[index_b])
Example #19
0
    def test_hashing_json_schema(self):
        """End-to-end CLI hash run produces a JSON file containing CLKs."""
        cli_runner = CliRunner()

        name_list = randomnames.NameList(self.SAMPLES)
        pii_file = create_temp_file()
        column_names = [field.identifier for field in name_list.SCHEMA.fields]
        randomnames.save_csv(name_list.names, column_names, pii_file)
        pii_file.close()

        with temporary_file() as output_filename:
            with open(output_filename) as output:
                cli_result = cli_runner.invoke(
                    cli.cli,
                    ['hash', pii_file.name, 'secret', RANDOMNAMES_SCHEMA_PATH, output.name])

            self.assertEqual(cli_result.exit_code, 0, msg=cli_result.output)

            # The CLI must have written a JSON document with a 'clks' key.
            with open(output_filename) as output:
                self.assertIn('clks', json.load(output))
Example #20
0
    def test_namelist_hashable(self):
        """Subsets with 0.8 overlap should share at least 80 identical CLKs."""
        namelist = randomnames.NameList(1000)
        subset_a, subset_b = namelist.generate_subsets(100, 0.8)

        self.assertEqual(len(subset_a), 100)
        self.assertEqual(len(subset_b), 100)

        hashing_schema = randomnames.NameList.SCHEMA
        keys = ('secret', 'sshh')

        clks_a = clk.generate_clks(subset_a, hashing_schema, keys)
        clks_b = clk.generate_clks(subset_b, hashing_schema, keys)

        self.assertEqual(len(clks_a), 100)
        self.assertEqual(len(clks_b), 100)

        # An "exact match" bloomfilter comparison:
        shared_clks = set(clks_a) & set(clks_b)

        self.assertGreaterEqual(
            len(shared_clks), 80,
            "Expected at least 80 hashes to be exactly the same")
Example #21
0
 def setUp(self):
     """Create overlapping name subsets and hashing keys for each test."""
     self.nl = randomnames.NameList(300)
     self.s1, self.s2 = self.nl.generate_subsets(self.sample,
                                                 self.proportion)
     self.key_lists = generate_key_lists('secret',
                                         len(self.nl.schema_types))
Example #22
0
 def test_generate_subsets_raises(self):
     """Requesting more subset records than the pool allows raises ValueError."""
     name_list = rn.NameList(15)
     with pytest.raises(ValueError):
         name_list.generate_subsets(10, 0.8, subsets=5)
Example #23
0
    def test_generate_large_subsets(self):
        """Every pair among three 1000-record subsets overlaps in 500 records."""
        name_list = rn.NameList(5000)
        subset_sets = [set(subset) for subset
                       in name_list.generate_subsets(1000, 0.5, subsets=3)]

        for first, second in itertools.combinations(subset_sets, 2):
            self.assertEqual(len(first & second), 500,
                             msg='unexpected overlap size')
Example #24
0
 def setUp(self):
     """Prepare overlapping name subsets and key lists used by each test."""
     self.nl = randomnames.NameList(300)
     self.s1, self.s2 = self.nl.generate_subsets(self.sample, self.proportion)
     self.key_lists = generate_key_lists(('test1', 'test2'),
                                         len(self.nl.schema_types))