def test_compare_to_legacy(self):
        # Identifier: 'ANY freetext'

        fhp = FieldHashingProperties(ngram=2, hash_type='doubleHash', k=10)

        schema = Schema(
            l=1024,
            kdf_info=base64.b64decode('c2NoZW1hX2V4YW1wbGU='),
            kdf_key_size=64,
            kdf_salt=base64.b64decode(
                'SCbL2zHNnmsckfzchsNkZY9XoHk96P'
                '/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA=='),
            fields=[
                StringSpec(identifier='ANY text {}'.format(i + 1),
                           hashing_properties=fhp) for i in range(4)
            ])

        row = ['Bobby', 'Bobby', 'Bobby', 'Bobby']
        master_secrets = [
            'No, I am your father'.encode(),
            "No... that's not true! That's impossible!".encode()
        ]
        keys_hkdf = generate_key_lists(master_secrets, len(row), kdf='HKDF')
        keys_legacy = generate_key_lists(master_secrets,
                                         len(row),
                                         kdf='legacy')
        bloom_hkdf = next(stream_bloom_filters([row], keys_hkdf, schema))
        bloom_legacy = next(stream_bloom_filters([row], keys_legacy, schema))
        hkdf_count = bloom_hkdf[0].count()
        legacy_count = bloom_legacy[0].count()
        # The legacy KDF maps the four 'Bobby' values to the same bits,
        # whereas HKDF maps each 'Bobby' to different bits.
        self.assertLessEqual(legacy_count, fhp.k * 6)  # 6 bi-grams
        self.assertLess(legacy_count, hkdf_count)
        self.assertLessEqual(hkdf_count, len(row) * legacy_count)
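
# Aside: a toy illustration (plain hashlib, not clkhash internals) of the
# effect tested above. Under the legacy KDF every field shares one key, so
# equal field values always set the same k bits; HKDF derives a distinct
# key per field, so equal values land on different bits.
import hashlib

def toy_bits(value, key, l=1024, k=10):
    # Toy double hashing: derive k bit positions for `value` under `key`.
    h1 = int.from_bytes(hashlib.sha1(key + value).digest(), 'big')
    h2 = int.from_bytes(hashlib.md5(key + value).digest(), 'big')
    return {(h1 + i * h2) % l for i in range(k)}

shared_key = [toy_bits(b'Bobby', b'key') for _ in range(4)]
per_field = [toy_bits(b'Bobby', b'key%d' % i) for i in range(4)]
assert all(s == shared_key[0] for s in shared_key)      # identical bits
assert len(set.union(*per_field)) > len(per_field[0])   # more bits set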
Example #2
    def test_concurrent(self):
        f1 = tuple(
            map(
                itemgetter(0),
                bloomfilter.stream_bloom_filters(self.s1, self.key_lists,
                                                 self.nl.SCHEMA)))
        f2 = tuple(
            map(
                itemgetter(0),
                bloomfilter.stream_bloom_filters(self.s2, self.key_lists,
                                                 self.nl.SCHEMA)))

        threshold = 0.9
        candidate_pairs = anonlink.concurrency.process_chunk(
            [{
                "datasetIndex": 0,
                "range": [0, len(f1)]
            }, {
                "datasetIndex": 1,
                "range": [0, len(f2)]
            }], (f1, f2),
            anonlink.similarities.dice_coefficient,
            threshold,
            k=4)
        groups = anonlink.solving.greedy_solve(candidate_pairs)
        mapping = dict(anonlink.solving.pairs_from_groups(groups))

        self.check_accuracy(mapping)
Example #3
    def test_xor_folding_integration(self):
        namelist = randomnames.NameList(1)
        schema_0 = namelist.SCHEMA
        assert schema_0.xor_folds == 0

        schema_1 = copy(schema_0)
        schema_1.xor_folds = 1
        schema_1.l //= 2

        key_lists = generate_key_lists('secret',
                                       len(namelist.schema_types))
        bf_original, _, _ = next(bloomfilter.stream_bloom_filters(
            namelist.names,
            key_lists,
            schema_0))
        bf_folded, _, _ = next(bloomfilter.stream_bloom_filters(
            namelist.names,
            key_lists,
            schema_1))

        self.assertEqual(
            bf_folded,
            bf_original[:len(bf_original) // 2]
                ^ bf_original[len(bf_original) // 2:],
            'Folded filter is not an XOR of the two halves of the original.')
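
# Aside: XOR folding in isolation. clkhash Bloom filters are bitarray
# objects, so a single fold XORs the first half of the filter with the
# second half, halving its length:
from bitarray import bitarray

bf = bitarray('10110010')
folded = bf[:4] ^ bf[4:]
assert folded == bitarray('1001')  # '1011' XOR '0010'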
Example #4
def generate_data(samples, proportion=0.75):
    """Generate two overlapping subsets of random names along with
    their Bloom filters.

    :param samples: Number of records in each subset.
    :param proportion: Fraction of records shared between the subsets.
    :return: Tuple of (subset1, subset2, filters1, filters2).
    """
    nl = randomnames.NameList(samples * 2)
    s1, s2 = nl.generate_subsets(samples, proportion)

    keys = generate_key_lists('secret', len(nl.schema_types))
    filters1 = list(map(itemgetter(0),
                        bloomfilter.stream_bloom_filters(s1, keys, nl.SCHEMA)))
    filters2 = list(map(itemgetter(0),
                        bloomfilter.stream_bloom_filters(s2, keys, nl.SCHEMA)))

    return s1, s2, filters1, filters2
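
# Usage sketch for the helper above: two 100-record subsets that share
# roughly 75% of their records, plus their Bloom filters.
s1, s2, filters1, filters2 = generate_data(100, proportion=0.75)
assert len(s1) == len(filters1) == 100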
Example #5
    def test_cffi_manual(self):
        nl = randomnames.NameList(30)
        s1, s2 = nl.generate_subsets(5, 1.0)
        keys = generate_key_lists(('test1', 'test2'), len(nl.schema_types))
        f1 = tuple(
            f[0]
            for f in bloomfilter.stream_bloom_filters(s1, keys, nl.SCHEMA))
        f2 = tuple(
            f[0]
            for f in bloomfilter.stream_bloom_filters(s2, keys, nl.SCHEMA))

        py_similarity = similarities.dice_coefficient_python(
            (f1, f2), self.default_threshold, self.default_k)
        c_similarity = similarities.dice_coefficient_accelerated(
            (f1, f2), self.default_threshold, self.default_k)
        self.assert_similarity_matrices_equal(py_similarity, c_similarity)
Example #6
    def setup_class(cls):
        cls.proportion = 0.8
        nl = randomnames.NameList(300)
        s1, s2 = nl.generate_subsets(200, cls.proportion)

        keys = generate_key_lists(('test1', 'test2'), len(nl.schema_types))
        cls.filters1 = tuple(
            f[0]
            for f in bloomfilter.stream_bloom_filters(s1, keys, nl.SCHEMA))
        cls.filters2 = tuple(
            f[0]
            for f in bloomfilter.stream_bloom_filters(s2, keys, nl.SCHEMA))
        cls.filters = cls.filters1, cls.filters2

        cls.default_k = 10
        cls.default_threshold = 0.5
Example #7
def compare_python_c(ntotal=10000, nsubset=6000, frac=0.8):
    """Compare results and running time of python and C++ versions.

    :param ntotal: Total number of data points to generate
    :param nsubset: Number of points for each database
    :param frac: Fraction of overlap between subsets

    :raises: AssertionError if the results differ
    :return: dict with 'c' and 'python' keys with values of the total time taken
             for each implementation
    """

    nml = NameList(ntotal)
    sl1, sl2 = nml.generate_subsets(nsubset, frac)

    keys = generate_key_lists(('test1', 'test2'), len(nml.schema_types))
    filters1 = tuple(
        map(operator.itemgetter(0),
            stream_bloom_filters(sl1, keys, nml.SCHEMA)))
    filters2 = tuple(
        map(operator.itemgetter(0),
            stream_bloom_filters(sl2, keys, nml.SCHEMA)))

    # Pure Python version
    start = timer()
    result = anonlink.candidate_generation.find_candidate_pairs(
        (filters1, filters2),
        anonlink.similarities.dice_coefficient_python,
        0.0,
        k=1)
    end = timer()
    python_time = end - start

    # C++ accelerated version
    start = timer()
    result3 = anonlink.candidate_generation.find_candidate_pairs(
        (filters1, filters2),
        anonlink.similarities.dice_coefficient_accelerated,
        0.0,
        k=1)
    end = timer()
    cffi_time = end - start

    assert result == result3, "Results are different between C++ cffi and Python"

    # Results are the same
    return {"c": cffi_time, "python": python_time}
Example #8
    def test_cffi_k(self):

        f1 = tuple(map(itemgetter(0),
                       bloomfilter.stream_bloom_filters(
                           self.s1, self.key_lists, self.nl.SCHEMA)))
        f2 = tuple(map(itemgetter(0),
                       bloomfilter.stream_bloom_filters(
                           self.s2, self.key_lists, self.nl.SCHEMA)))

        threshold = 0.9
        candidate_pairs = anonlink.candidate_generation.find_candidate_pairs(
            (f1, f2),
            anonlink.similarities.dice_coefficient_accelerated,
            threshold,
            k=4)
        groups = anonlink.solving.greedy_solve(candidate_pairs)
        mapping = dict(anonlink.solving.pairs_from_groups(groups))

        self.check_accuracy(mapping)
Example #9
def hash_and_serialize_chunk(
        chunk_pii_data,  # type: Sequence[Sequence[str]]
        keys,  # type: Sequence[Sequence[bytes]]
        schema  # type: Schema
):
    # type: (...) -> Tuple[List[str], Sequence[int]]
    """
    Generate Bloom filters (ie hash) from chunks of PII then serialize
    the generated Bloom filters. It also computes and outputs the Hamming weight (or popcount) -- the number of bits
    set to one -- of the generated Bloom filters.

    :param chunk_pii_data: An iterable of indexable records.
    :param keys: A tuple of two lists of secret keys used in the HMAC.
    :param Schema schema: Schema specifying the entry formats and
            hashing settings.
    :return: A list of serialized Bloom filters and a list of corresponding popcounts
    """
    clk_data = []
    clk_popcounts = []
    for clk in stream_bloom_filters(chunk_pii_data, keys, schema):
        clk_data.append(serialize_bitarray(clk[0]).strip())
        clk_popcounts.append(clk[2])
    return clk_data, clk_popcounts
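
# Usage sketch (assumes a two-field `schema` and matching `keys` built as
# in the earlier examples; the records below are hypothetical):
chunk = [['Deckard', 'Rick'], ['Tyrell', 'Eldon']]
clks, popcounts = hash_and_serialize_chunk(chunk, keys, schema)
# `clks` holds the serialized filters, `popcounts` their set-bit counts.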
Example #10
                    hashing_properties=FieldHashingProperties(
                        encoding=FieldHashingProperties._DEFAULT_ENCODING,
                        ngram=2,
                        positional=False,
                        weight=1),
                    description=None,
                    case=StringSpec._DEFAULT_CASE,
                    min_length=0,
                    max_length=None)
            ])

        pii = [['Deckard']]
        keys = generate_key_lists(('secret', ), 1)

        schema.fields[0].hashing_properties.weight = 0
        bf0 = next(bloomfilter.stream_bloom_filters(pii, keys, schema))

        schema.fields[0].hashing_properties.weight = 1
        bf1 = next(bloomfilter.stream_bloom_filters(pii, keys, schema))

        schema.fields[0].hashing_properties.weight = 2
        bf2 = next(bloomfilter.stream_bloom_filters(pii, keys, schema))

        schema.fields[0].hashing_properties.weight = 1.5
        bf15 = next(bloomfilter.stream_bloom_filters(pii, keys, schema))

        self.assertEqual(bf0[0].count(), 0)
        n1 = bf1[0].count()
        n2 = bf2[0].count()
        n15 = bf15[0].count()
        self.assertGreater(n1, 0)
Example #11
                    description=None,
                    case=StringSpec._DEFAULT_CASE,
                    min_length=0,
                    max_length=None)
            ])

        row = ['Bobby', 'Bobby', 'Bobby', 'Bobby']
        master_secrets = [
            'No, I am your father'.encode(),
            "No... that's not true! That's impossible!".encode()
        ]
        keys_hkdf = generate_key_lists(master_secrets, len(row), kdf='HKDF')
        keys_legacy = generate_key_lists(master_secrets,
                                         len(row),
                                         kdf='legacy')
        bloom_hkdf = next(stream_bloom_filters([row], keys_hkdf, schema))
        bloom_legacy = next(stream_bloom_filters([row], keys_legacy, schema))
        hkdf_count = bloom_hkdf[0].count()
        legacy_count = bloom_legacy[0].count()
        # The legacy KDF maps the four 'Bobby' values to the same bits,
        # whereas HKDF maps each 'Bobby' to different bits.
        self.assertLessEqual(legacy_count,
                             schema.hashing_globals.k * 6)  # 6 bi-grams
        self.assertLess(legacy_count, hkdf_count)
        self.assertLessEqual(hkdf_count, len(row) * legacy_count)

    def test_wrong_kdf(self):
        with self.assertRaises(ValueError):
            generate_key_lists([b'0'], 1, kdf='breakMe')

    def test_wrong_hash_function(self):
        with self.assertRaises(ValueError):