Example #1
    def test_compare_to_legacy(self):
        # Identifier: 'ANY freetext'

        fhp = FieldHashingProperties(ngram=2, hash_type='doubleHash', k=10)

        schema = Schema(
            l=1024,
            kdf_info=base64.b64decode('c2NoZW1hX2V4YW1wbGU='),
            kdf_key_size=64,
            kdf_salt=base64.b64decode(
                'SCbL2zHNnmsckfzchsNkZY9XoHk96P'
                '/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA=='),
            fields=[
                StringSpec(identifier='ANY text {}'.format(i + 1),
                           hashing_properties=fhp) for i in range(4)
            ])

        row = ['Bobby', 'Bobby', 'Bobby', 'Bobby']
        master_secrets = [
            'No, I am your father'.encode(),
            "No... that's not true! That's impossible!".encode()
        ]
        keys_hkdf = generate_key_lists(master_secrets, len(row), kdf='HKDF')
        keys_legacy = generate_key_lists(master_secrets,
                                         len(row),
                                         kdf='legacy')
        bloom_hkdf = next(stream_bloom_filters([row], keys_hkdf, schema))
        bloom_legacy = next(stream_bloom_filters([row], keys_legacy, schema))
        hkdf_count = bloom_hkdf[0].count()
        legacy_count = bloom_legacy[0].count()
        # legacy will map the 4 Bobbys to the same bits, whereas HKDF
        # will map each Bobby to different bits.
        self.assertLessEqual(legacy_count, fhp.k * 6)  # 6 bi-grams
        self.assertLess(legacy_count, hkdf_count)
        self.assertLessEqual(hkdf_count, len(row) * legacy_count)
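The test hinges on a property of the two KDFs: 'legacy' hands every identifier the same keys, so equal field values hash to the same bits, while 'HKDF' derives distinct keys per identifier. A minimal sketch of that property, assuming clkhash's key_derivation module and the generate_key_lists signature used above (the asserts are illustrative, not part of the test suite):

# Assumed import path; generate_key_lists lives in clkhash's key
# derivation module in the versions these tests target.
from clkhash.key_derivation import generate_key_lists

master_secrets = [b'No, I am your father',
                  b"No... that's not true! That's impossible!"]

# 'legacy': identical keys for all four identifiers.
keys_legacy = generate_key_lists(master_secrets, 4, kdf='legacy')
assert all(keys == keys_legacy[0] for keys in keys_legacy)

# 'HKDF': each identifier gets its own derived keys.
keys_hkdf = generate_key_lists(master_secrets, 4, kdf='HKDF')
assert all(keys_hkdf[i] != keys_hkdf[0] for i in range(1, 4))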
Example #2
    def test_xor_folding_integration(self):
        namelist = randomnames.NameList(1)
        schema_0 = namelist.SCHEMA
        assert schema_0.xor_folds == 0

        schema_1 = copy(schema_0)
        schema_1.xor_folds = 1
        schema_1.l //= 2

        key_lists = generate_key_lists('secret',
                                       len(namelist.schema_types))
        bf_original, _, _ = next(bloomfilter.stream_bloom_filters(
            namelist.names,
            key_lists,
            schema_0))
        bf_folded, _, _ = next(bloomfilter.stream_bloom_filters(
            namelist.names,
            key_lists,
            schema_1))

        self.assertEqual(
            bf_folded,
            bf_original[:len(bf_original) // 2]
                ^ bf_original[len(bf_original) // 2:],
            'Folded filter is not an XOR of the two halves of the original.')
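XOR folding halves a filter by XORing its first half with its second, which is exactly the relationship the assertion above verifies. A standalone sketch of the operation on a bitarray (the type clkhash's raw filters use; assumed here):

from bitarray import bitarray

def xor_fold(bf):
    # One fold: XOR the first half with the second, halving the
    # filter length, just as schema_1.l //= 2 above anticipates.
    half = len(bf) // 2
    return bf[:half] ^ bf[half:]

assert xor_fold(bitarray('10110010')) == bitarray('1001')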
Example #3
    def test_generate_key_lists(self):
        secret = ("No, I am your father. "
                  "No... that's not true! That's impossible!").encode()
        for num_keys in (1, 10):
            key_lists = generate_key_lists(secret, num_keys)
            self._test_key_lists(key_lists, num_keys,
                                 DEFAULT_NUM_HASHING_METHODS)
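The _test_key_lists helper is not part of this snippet. A hypothetical sketch of what it plausibly asserts, modelled on the inline checks in Example #11 below (the body is an assumption, not the project's code):

def _test_key_lists(self, key_lists, num_keys, num_hashing_methods):
    # Hypothetical helper body: one entry per identifier, one key per
    # hashing method, mirroring Example #11's inline assertions.
    self.assertEqual(len(key_lists), num_keys)
    for keys in key_lists:
        self.assertEqual(len(keys), num_hashing_methods)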
Example #4
def compare_python_c(ntotal=10000, nsubset=6000, frac=0.8):
    """Compare results and running time of python and C++ versions.

    :param ntotal: Total number of data points to generate
    :param nsubset: Number of points for each database
    :param frac: Fraction of overlap between subsets

    :raises: AssertionError if the results differ
    :return: dict with 'c' and 'python' keys with values of the total time taken
             for each implementation
    """

    nml = NameList(ntotal)
    sl1, sl2 = nml.generate_subsets(nsubset, frac)

    keys = generate_key_lists(('test1', 'test2'), len(nml.schema))
    filters1 = calculate_bloom_filters(sl1, get_schema_types(nml.schema), keys)
    filters2 = calculate_bloom_filters(sl2, get_schema_types(nml.schema), keys)

    # Pure Python version
    start = timer()
    result = python_filter_similarity(filters1, filters2)
    end = timer()
    python_time = end - start

    # C++ cffi version
    start = timer()
    result3 = cffi_filter_similarity_k(filters1, filters2, 1, 0.0)
    end = timer()
    cffi_time = end - start

    assert result == result3, "Results are different between C++ cffi and Python"

    # Results are the same
    return {"c": cffi_time, "python": python_time}
Example #5
    def test_generate_key_lists_num_hashes(self):
        secret = ("No, I am your father. "
                  "No... that's not true! That's impossible!").encode()
        num_keys = 10
        for num_hashing_methods in (1, 10):
            key_lists = generate_key_lists(
                secret, num_keys, num_hashing_methods=num_hashing_methods)
            self._test_key_lists(key_lists, num_keys, num_hashing_methods)
Example #6
    def setUpClass(cls):
        cls.proportion = 0.8
        nl = randomnames.NameList(300)
        s1, s2 = nl.generate_subsets(200, cls.proportion)

        keys = generate_key_lists(('test1', 'test2'), len(nl.schema))
        cls.filters1 = bloomfilter.calculate_bloom_filters(
            s1, schema.get_schema_types(nl.schema), keys)
        cls.filters2 = bloomfilter.calculate_bloom_filters(
            s2, schema.get_schema_types(nl.schema), keys)
Example #7
def generate_clks(
    pii_data,  # type: Sequence[Sequence[str]]
    schema,  # type: Schema
    secret,  # type: AnyStr
    validate=True,  # type: bool
    callback=None,  # type: Optional[Callable[[int, Sequence[int]], None]]
    use_multiprocessing=True  # type: bool
):
    # type: (...) -> List[str]

    # Generate two keys for each identifier from the secret, one key per
    # hashing method used when computing the Bloom filters. More keys can
    # be created if required via the `num_hashing_methods` parameter of
    # `generate_key_lists`.
    key_lists = generate_key_lists(secret,
                                   len(schema.fields),
                                   key_size=schema.kdf_key_size,
                                   salt=schema.kdf_salt,
                                   info=schema.kdf_info,
                                   kdf=schema.kdf_type,
                                   hash_algo=schema.kdf_hash)

    if validate:
        validate_entries(schema.fields, pii_data)

    # Chunks PII
    log.info("Hashing {} entities".format(len(pii_data)))
    chunk_size = 200 if len(pii_data) <= 10000 else 1000
    futures = []

    # Compute Bloom filter from the chunks and then serialise it
    pool_executor = ProcessPoolExecutor if use_multiprocessing else \
        ThreadPoolExecutor # type: Union[Type[ProcessPoolExecutor], Type[ThreadPoolExecutor]]

    with pool_executor() as executor:
        for chunk in chunks(pii_data, chunk_size):
            future = executor.submit(
                hash_and_serialize_chunk,
                chunk,
                key_lists,
                schema,
            )
            if callback is not None:
                unpacked_callback = cast(Callable[[int, Sequence[int]], None],
                                         callback)
                future.add_done_callback(
                    lambda f: unpacked_callback(len(f.result()[0]),
                                                f.result()[1]))
            futures.append(future)

        results = []
        for future in futures:
            clks, clk_stats = future.result()
            results.extend(clks)

    return results
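A hypothetical call site for generate_clks: per the callback wiring above, the callback receives the number of CLKs in each completed chunk together with that chunk's statistics. The records, secret, and callback below are invented for illustration, and schema is assumed to be loaded elsewhere:

def on_progress(n_hashed, clk_stats):
    # Called once per completed chunk.
    print('hashed {} records'.format(n_hashed))

pii = [['Bobby', 'Tables'], ['Alice', 'Jones']]
clks = generate_clks(pii, schema, 'my secret', callback=on_progress)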
Example #8
def generate_data(samples, proportion=0.75):
    nl = randomnames.NameList(samples * 2)
    s1, s2 = nl.generate_subsets(samples, proportion)

    keys = generate_key_lists('secret', len(nl.schema_types))
    filters1 = list(map(itemgetter(0),
                    bloomfilter.stream_bloom_filters(s1, keys, nl.SCHEMA)))
    filters2 = list(map(itemgetter(0),
                    bloomfilter.stream_bloom_filters(s2, keys, nl.SCHEMA)))

    return (s1, s2, filters1, filters2)
Example #9
def generate_data(samples, proportion=0.75):
    nl = randomnames.NameList(samples * 2)
    s1, s2 = nl.generate_subsets(samples, proportion)

    keys = generate_key_lists(('test1', 'test2'), len(nl.schema))
    filters1 = bloomfilter.calculate_bloom_filters(
        s1, schema.get_schema_types(nl.schema), keys)
    filters2 = bloomfilter.calculate_bloom_filters(
        s2, schema.get_schema_types(nl.schema), keys)

    return (s1, s2, filters1, filters2)
Example #10
def generate_clks(
    pii_data,  # type: Sequence[Sequence[str]]
    schema,  # type: Schema
    keys,  # type: Tuple[AnyStr, AnyStr]
    validate=True,  # type: bool
    callback=None  # type: Optional[Callable[[int, Sequence[int]], None]]
):
    # type: (...) -> List[str]

    # generate two keys for each identifier
    key_lists = generate_key_lists(
        keys,
        len(schema.fields),
        key_size=schema.hashing_globals.kdf_key_size,
        salt=schema.hashing_globals.kdf_salt,
        info=schema.hashing_globals.kdf_info,
        kdf=schema.hashing_globals.kdf_type,
        hash_algo=schema.hashing_globals.kdf_hash)

    if validate:
        validate_entries(schema.fields, pii_data)

    # Chunks PII
    log.info("Hashing {} entities".format(len(pii_data)))
    chunk_size = 200 if len(pii_data) <= 10000 else 1000
    futures = []

    # Compute Bloom filter from the chunks and then serialise it
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for chunk in chunks(pii_data, chunk_size):
            future = executor.submit(
                hash_and_serialize_chunk,
                chunk,
                key_lists,
                schema,
            )
            if callback is not None:
                unpacked_callback = cast(Callable[[int, Sequence[int]], None],
                                         callback)
                future.add_done_callback(
                    lambda f: unpacked_callback(len(f.result()[0]),
                                                f.result()[1]))
            futures.append(future)

        results = []
        for future in futures:
            clks, clk_stats = future.result()
            results.extend(clks)

    return results
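Unlike Example #7, this older signature takes a pair of secrets rather than a single one. A hypothetical call site (records and key strings invented for illustration; schema is assumed to be loaded elsewhere):

# The keys tuple is passed straight through to generate_key_lists.
pii = [['Bobby', 'Tables'], ['Alice', 'Jones']]
clks = generate_clks(pii, schema, ('key1', 'key2'))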
Example #11
    def test_generate_key_lists(self):
        master_secrets = [
            'No, I am your father'.encode(),
            "No... that's not true! That's impossible!".encode()
        ]
        for num_keys in (1, 10):
            key_lists = generate_key_lists(master_secrets, num_keys)
            self.assertEqual(len(key_lists), num_keys)
            for keys in key_lists:
                self.assertEqual(len(keys), len(master_secrets))
            for key in key_lists[0]:
                self.assertEqual(len(key),
                                 DEFAULT_KEY_SIZE,
                                 msg='key should be of size '
                                     '"default_key_size"')
Example #12
    def test_cffi_manual(self):
        nl = randomnames.NameList(30)
        s1, s2 = nl.generate_subsets(5, 1.0)
        keys = generate_key_lists(('test1', 'test2'), len(nl.schema_types))
        f1 = tuple(
            f[0]
            for f in bloomfilter.stream_bloom_filters(s1, keys, nl.SCHEMA))
        f2 = tuple(
            f[0]
            for f in bloomfilter.stream_bloom_filters(s2, keys, nl.SCHEMA))

        py_similarity = similarities.dice_coefficient_python(
            (f1, f2), self.default_threshold, self.default_k)
        c_similarity = similarities.dice_coefficient_accelerated(
            (f1, f2), self.default_threshold, self.default_k)
        self.assert_similarity_matrices_equal(py_similarity, c_similarity)
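Both implementations under test score pairs of Bloom filters with the Sorensen-Dice coefficient. For reference, a minimal pure-Python version of the score over two bitarrays (a sketch, not the library code):

from bitarray import bitarray

def dice_coefficient(a, b):
    # Dice similarity: 2|A & B| / (|A| + |B|), with bitwise AND as set
    # intersection over the filters' set bits.
    return 2 * (a & b).count() / (a.count() + b.count())

assert dice_coefficient(bitarray('1100'), bitarray('0110')) == 0.5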
Example #13
    def setup_class(cls):
        cls.proportion = 0.8
        nl = randomnames.NameList(300)
        s1, s2 = nl.generate_subsets(200, cls.proportion)

        keys = generate_key_lists(('test1', 'test2'), len(nl.schema_types))
        cls.filters1 = tuple(
            f[0]
            for f in bloomfilter.stream_bloom_filters(s1, keys, nl.SCHEMA))
        cls.filters2 = tuple(
            f[0]
            for f in bloomfilter.stream_bloom_filters(s2, keys, nl.SCHEMA))
        cls.filters = cls.filters1, cls.filters2

        cls.default_k = 10
        cls.default_threshold = 0.5
Example #14
    def test_cffi_manual(self):
        nl = randomnames.NameList(30)
        s1, s2 = nl.generate_subsets(5, 1.0)
        keys = generate_key_lists(('test1', 'test2'), len(nl.schema))
        f1 = bloomfilter.calculate_bloom_filters(
            s1, schema.get_schema_types(nl.schema), keys)
        f2 = bloomfilter.calculate_bloom_filters(
            s2, schema.get_schema_types(nl.schema), keys)

        ps = entitymatch.python_filter_similarity(f1, f2)
        cs = entitymatch.cffi_filter_similarity_k(f1, f2, 1, 0.0)

        python_scores = [p[1] for p in ps]
        c_scores = [c[1] for c in cs]

        self.assertEqual(len(python_scores), len(c_scores))
        for p_score, c_score in zip(python_scores, c_scores):
            self.assertAlmostEqual(p_score, c_score)
Example #15
def compare_python_c(ntotal=10000, nsubset=6000, frac=0.8):
    """Compare results and running time of python and C++ versions.

    :param ntotal: Total number of data points to generate
    :param nsubset: Number of points for each database
    :param frac: Fraction of overlap between subsets

    :raises: AssertionError if the results differ
    :return: dict with 'c' and 'python' keys with values of the total time taken
             for each implementation
    """

    nml = NameList(ntotal)
    sl1, sl2 = nml.generate_subsets(nsubset, frac)

    keys = generate_key_lists(('test1', 'test2'), len(nml.schema_types))
    filters1 = tuple(
        map(operator.itemgetter(0),
            stream_bloom_filters(sl1, keys, nml.SCHEMA)))
    filters2 = tuple(
        map(operator.itemgetter(0),
            stream_bloom_filters(sl2, keys, nml.SCHEMA)))

    # Pure Python version
    start = timer()
    result = anonlink.candidate_generation.find_candidate_pairs(
        (filters1, filters2),
        anonlink.similarities.dice_coefficient_python,
        0.0,
        k=1)
    end = timer()
    python_time = end - start

    # C++ accelerated version
    start = timer()
    result3 = anonlink.candidate_generation.find_candidate_pairs(
        (filters1, filters2),
        anonlink.similarities.dice_coefficient_accelerated,
        0.0,
        k=1)
    end = timer()
    cffi_time = end - start

    assert result == result3, "Results are different between C++ cffi and Python"

    # Results are the same
    return {"c": cffi_time, "python": python_time}
Example #16
    def test_cffi_k(self):
        nl = randomnames.NameList(300)
        s1, s2 = nl.generate_subsets(150, 0.8)
        keys = ('test1', 'test2')
        key_lists = generate_key_lists(keys, len(nl.schema))
        f1 = bloomfilter.calculate_bloom_filters(
            s1, schema.get_schema_types(nl.schema), key_lists)
        f2 = bloomfilter.calculate_bloom_filters(
            s2, schema.get_schema_types(nl.schema), key_lists)

        threshold = 0.8
        similarity = entitymatch.cffi_filter_similarity_k(f1, f2, 4, threshold)
        mapping = network_flow.map_entities(similarity,
                                            threshold=threshold,
                                            method=None)

        for indexA in mapping:
            self.assertEqual(s1[indexA], s2[mapping[indexA]])
Example #17
    def setUp(self):
        self.nl = randomnames.NameList(300)
        self.s1, self.s2 = self.nl.generate_subsets(self.sample,
                                                    self.proportion)
        self.key_lists = generate_key_lists('secret',
                                            len(self.nl.schema_types))
Example #18
    def test_wrong_kdf(self):
        with self.assertRaises(ValueError):
            generate_key_lists([b'0'], 1, kdf='breakMe')
Example #19
                    identifier='some info',
                    hashing_properties=FieldHashingProperties(
                        encoding=FieldHashingProperties._DEFAULT_ENCODING,
                        comparator=bigram_tokenizer,
                        strategy=BitsPerTokenStrategy(20)
                    ),
                    description=None,
                    case=StringSpec._DEFAULT_CASE,
                    min_length=0,
                    max_length=None
                )
            ]
        )

        pii = [['Deckard']]
        keys = generate_key_lists('secret', 1)

        schema.fields[0].hashing_properties.strategy = BitsPerTokenStrategy(0)
        bf0 = next(bloomfilter.stream_bloom_filters(pii, keys, schema))

        schema.fields[0].hashing_properties.strategy = BitsPerTokenStrategy(20)
        bf1 = next(bloomfilter.stream_bloom_filters(pii, keys, schema))

        schema.fields[0].hashing_properties.strategy = BitsPerTokenStrategy(40)
        bf2 = next(bloomfilter.stream_bloom_filters(pii, keys, schema))

        schema.fields[0].hashing_properties.strategy = BitsPerTokenStrategy(30)
        bf15 = next(bloomfilter.stream_bloom_filters(pii, keys, schema))

        self.assertEqual(bf0[0].count(), 0)
        n1 = bf1[0].count()
Example #20
    def test_fail_generate_key_lists(self):
        with self.assertRaises(TypeError):
            generate_key_lists([True, False], 10)
Example #21
                        encoding=FieldHashingProperties._DEFAULT_ENCODING,
                        ngram=2,
                        positional=False,
                        weight=1),
                    description=None,
                    case=StringSpec._DEFAULT_CASE,
                    min_length=0,
                    max_length=None)
            ])

        row = ['Bobby', 'Bobby', 'Bobby', 'Bobby']
        master_secrets = [
            'No, I am your father'.encode(),
            "No... that's not true! That's impossible!".encode()
        ]
        keys_hkdf = generate_key_lists(master_secrets, len(row), kdf='HKDF')
        keys_legacy = generate_key_lists(master_secrets,
                                         len(row),
                                         kdf='legacy')
        bloom_hkdf = next(stream_bloom_filters([row], keys_hkdf, schema))
        bloom_legacy = next(stream_bloom_filters([row], keys_legacy, schema))
        hkdf_count = bloom_hkdf[0].count()
        legacy_count = bloom_legacy[0].count()
        # legacy will map the 4 Bobbys to the same bits, whereas HKDF
        # will map each Bobby to different bits.
        self.assertLessEqual(legacy_count,
                             schema.hashing_globals.k * 6)  # 6 bi-grams
        self.assertLess(legacy_count, hkdf_count)
        self.assertLessEqual(hkdf_count, len(row) * legacy_count)

    def test_wrong_kdf(self):
        with self.assertRaises(ValueError):
            generate_key_lists([b'0'], 1, kdf='breakMe')
Example #22
            fields=[
                StringSpec(
                    identifier='some info',
                    hashing_properties=FieldHashingProperties(
                        encoding=FieldHashingProperties._DEFAULT_ENCODING,
                        ngram=2,
                        positional=False,
                        weight=1),
                    description=None,
                    case=StringSpec._DEFAULT_CASE,
                    min_length=0,
                    max_length=None)
            ])

        pii = [['Deckard']]
        keys = generate_key_lists(('secret', ), 1)

        schema.fields[0].hashing_properties.weight = 0
        bf0 = next(bloomfilter.stream_bloom_filters(pii, keys, schema))

        schema.fields[0].hashing_properties.weight = 1
        bf1 = next(bloomfilter.stream_bloom_filters(pii, keys, schema))

        schema.fields[0].hashing_properties.weight = 2
        bf2 = next(bloomfilter.stream_bloom_filters(pii, keys, schema))

        schema.fields[0].hashing_properties.weight = 1.5
        bf15 = next(bloomfilter.stream_bloom_filters(pii, keys, schema))

        self.assertEqual(bf0[0].count(), 0)
        n1 = bf1[0].count()
Example #23
    def setUp(self):
        self.nl = randomnames.NameList(300)
        self.s1, self.s2 = self.nl.generate_subsets(self.sample,
                                                    self.proportion)
        keys = ('test1', 'test2')
        self.key_lists = generate_key_lists(keys, len(self.nl.schema_types))
Example #24
    def test_wrong_num_hashing_methods(self):
        with self.assertRaises(ValueError):
            secret = ("No, I am your father. "
                      "No... that's not true! That's impossible!").encode()
            generate_key_lists(secret, 10, num_hashing_methods=0)
Example #25
            fields=[
                StringSpec(
                    identifier='some info',
                    hashing_properties=FieldHashingProperties(
                        encoding=FieldHashingProperties._DEFAULT_ENCODING,
                        ngram=2,
                        positional=False,
                        k=20),
                    description=None,
                    case=StringSpec._DEFAULT_CASE,
                    min_length=0,
                    max_length=None)
            ])

        pii = [['Deckard']]
        keys = generate_key_lists(('secret', ), 1)

        schema.fields[0].hashing_properties.k = 0
        bf0 = next(bloomfilter.stream_bloom_filters(pii, keys, schema))

        schema.fields[0].hashing_properties.k = 20
        bf1 = next(bloomfilter.stream_bloom_filters(pii, keys, schema))

        schema.fields[0].hashing_properties.k = 40
        bf2 = next(bloomfilter.stream_bloom_filters(pii, keys, schema))

        schema.fields[0].hashing_properties.k = 30
        bf15 = next(bloomfilter.stream_bloom_filters(pii, keys, schema))

        self.assertEqual(bf0[0].count(), 0)
        n1 = bf1[0].count()