Example #1
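# NOTE: this snippet assumes module-level imports from the surrounding
# package for NameList, generate_key_lists, calculate_bloom_filters,
# get_schema_types, python_filter_similarity and cffi_filter_similarity_k,
# plus: from timeit import default_timer as timer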
def compare_python_c(ntotal=10000, nsubset=6000, frac=0.8):
    """Compare results and running time of python and C++ versions.

    :param ntotal: Total number of data points to generate
    :param nsubset: Number of points for each database
    :param frac: Fraction of overlap between subsets

    :raises: AssertionError if the results differ
    :return: dict with 'c' and 'python' keys mapping to the total time taken
             by each implementation
    """

    nml = NameList(ntotal)
    sl1, sl2 = nml.generate_subsets(nsubset, frac)

    keys = generate_key_lists(('test1', 'test2'), len(nml.schema))
    filters1 = calculate_bloom_filters(sl1, get_schema_types(nml.schema), keys)
    filters2 = calculate_bloom_filters(sl2, get_schema_types(nml.schema), keys)

    # Pure Python version
    start = timer()
    result_python = python_filter_similarity(filters1, filters2)
    end = timer()
    python_time = end - start

    # C++ cffi version
    start = timer()
    result_cffi = cffi_filter_similarity_k(filters1, filters2, 1, 0.0)
    end = timer()
    cffi_time = end - start

    assert result_python == result_cffi, "Results differ between the C++ cffi and Python implementations"

    return {"c": cffi_time, "python": python_time}
Example #2
# NOTE: assumes module-level imports of json, randomnames, bloomfilter and
# serialize_filters from the surrounding package.
def create_test_data(entities, crossover=0.8, save_raw=True):
    """
    Uses the NameList data and schema and creates
    local files for raw data and clk data:

    - e1_NUM_raw.csv
    - e1_NUM.json
    - e2_NUM_raw.csv
    - e2_NUM.json

    :param bool save_raw: Set to False to skip saving raw files
    """
    print("Generating random test data for {} individuals".format(entities))

    from timeit import default_timer as timer

    t0 = timer()

    nl = randomnames.NameList(entities * 2)
    s1, s2 = nl.generate_subsets(entities, crossover)
    t1 = timer()
    print("generated data in {:.3f} s".format(t1 - t0))

    def save_subset_data(s, f):
        # CSV header followed by one row per entity
        print(",".join(nl.schema), file=f)
        for entity in s:
            print(",".join(map(str, entity)), file=f)

    def save_filter_data(filters, f):
        print("Serializing filters")
        serialized_filters = serialize_filters(filters)

        json.dump(serialized_filters, f)

    keys = ('something', 'secret')

    if save_raw:
        with open("data/e1_{}_raw.csv".format(entities), "w") as f:
            save_subset_data(s1, f)

        with open("data/e2_{}_raw.csv".format(entities), "w") as f:
            save_subset_data(s2, f)
    t2 = timer()
    if save_raw:
        print("Saved raw data in {:.3f} s".format(t2 - t1))
    print("Locally hashing identity data to create bloom filters")

    # Save serialized filters
    with open("data/e1_{}.json".format(entities), 'w') as f1:
        save_filter_data(
            bloomfilter.calculate_bloom_filters(s1, nl.schema, keys), f1)

    with open("data/e2_{}.json".format(entities), 'w') as f2:
        save_filter_data(
            bloomfilter.calculate_bloom_filters(s2, nl.schema, keys), f2)

    t3 = timer()
    print("Hashed and serialized data in {:.3f} s".format(t3 - t2))
Example #3
    @classmethod
    def setUpClass(cls):
        cls.proportion = 0.8
        nl = randomnames.NameList(300)
        s1, s2 = nl.generate_subsets(200, cls.proportion)

        keys = generate_key_lists(('test1', 'test2'), len(nl.schema))
        cls.filters1 = bloomfilter.calculate_bloom_filters(
            s1, schema.get_schema_types(nl.schema), keys)
        cls.filters2 = bloomfilter.calculate_bloom_filters(
            s2, schema.get_schema_types(nl.schema), keys)
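
Building the fixtures in setUpClass means the random data and both Bloom filter sets are generated once per test class rather than once per test method, so every test can reuse cls.filters1 and cls.filters2.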
Example #4
def generate_data(samples, proportion=0.75):
    """Generate two overlapping subsets of random names and their Bloom filters."""
    nl = randomnames.NameList(samples * 2)
    s1, s2 = nl.generate_subsets(samples, proportion)

    keys = generate_key_lists(('test1', 'test2'), len(nl.schema))
    filters1 = bloomfilter.calculate_bloom_filters(
        s1, schema.get_schema_types(nl.schema), keys)
    filters2 = bloomfilter.calculate_bloom_filters(
        s2, schema.get_schema_types(nl.schema), keys)

    return (s1, s2, filters1, filters2)
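
A short usage sketch for the helper above, assuming entitymatch is imported as in the later examples:

s1, s2, filters1, filters2 = generate_data(100, proportion=0.75)
similarity = entitymatch.python_filter_similarity(filters1, filters2)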
Example #5
    def test_cffi_manual(self):
        nl = randomnames.NameList(30)
        s1, s2 = nl.generate_subsets(5, 1.0)
        keys = generate_key_lists(('test1', 'test2'), len(nl.schema))
        f1 = bloomfilter.calculate_bloom_filters(
            s1, schema.get_schema_types(nl.schema), keys)
        f2 = bloomfilter.calculate_bloom_filters(
            s2, schema.get_schema_types(nl.schema), keys)

        ps = entitymatch.python_filter_similarity(f1, f2)
        cs = entitymatch.cffi_filter_similarity_k(f1, f2, 1, 0.0)

        python_scores = [p[1] for p in ps]
        c_scores = [c[1] for c in cs]

        # assertAlmostEqual cannot compare lists; check the scores pairwise
        self.assertEqual(len(python_scores), len(c_scores))
        for p_score, c_score in zip(python_scores, c_scores):
            self.assertAlmostEqual(p_score, c_score)
Example #6
    def test_cffi_k(self):
        nl = randomnames.NameList(300)
        s1, s2 = nl.generate_subsets(150, 0.8)
        keys = ('test1', 'test2')
        key_lists = generate_key_lists(keys, len(nl.schema))
        f1 = bloomfilter.calculate_bloom_filters(
            s1, schema.get_schema_types(nl.schema), key_lists)
        f2 = bloomfilter.calculate_bloom_filters(
            s2, schema.get_schema_types(nl.schema), key_lists)

        threshold = 0.8
        similarity = entitymatch.cffi_filter_similarity_k(f1, f2, 4, threshold)
        mapping = network_flow.map_entities(similarity,
                                            threshold=threshold,
                                            method=None)

        # Every mapped index pair should refer to the same underlying entity
        for indexA in mapping:
            self.assertEqual(s1[indexA], s2[mapping[indexA]])
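
Because the two subsets are generated with 0.8 overlap and scored against the same 0.8 threshold, every index pair returned by map_entities should refer to the same underlying record; the final loop checks exactly that by comparing the original rows.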