Example #1
0
def main(path_to_serialized_model):
    """Load a serialized model and start an embedded interactive shell.

    The network is deserialized from ``path_to_serialized_model``, wrapped in
    an ``API`` whose store is then initialized, and finally an
    ``InteractiveShellEmbed`` session is started using the ``init_banner``
    and ``exit_banner`` names defined elsewhere.

    :param path_to_serialized_model: location handed to
        ``fieldnetwork.deserialize_network``; it is passed through ``str()``
        only for the log line.
    """
    print('Loading: ' + str(path_to_serialized_model))
    network = fieldnetwork.deserialize_network(path_to_serialized_model)
    api = API(network)
    api.init_store()
    ip_shell = InteractiveShellEmbed(banner1=init_banner, exit_msg=exit_banner)
    # NOTE(review): an embedded IPython shell presumably exposes the caller's
    # locals (`network`, `api`) to the interactive user, so these names
    # should not be renamed casually -- confirm against the IPython docs.
    ip_shell()
Example #2
0
def test():
    """Run the query benchmark once on a small synthetic network.

    A 100-node network is generated, the nodes returned by
    ``fields_degree(3)`` are turned into an input DRS, ``run_all_queries`` is
    executed with 100 repetitions, and the 5th / 50th / 95th percentiles of
    each of the three returned timing series are printed.
    """
    network = syn.generate_network_with(num_nodes=100,
                                        num_nodes_per_table=10,
                                        num_schema_sim=200,
                                        num_content_sim=150,
                                        num_pkfk=50)
    api = API(network)

    # Build the input DRS from the ids of the selected nodes.
    node_ids = [nid for nid, _ in network.fields_degree(3)]
    hits = network.get_hits_from_info(network.get_info_for(node_ids))
    in_drs = api.drs_from_hits(hits)

    q2, q3, q4 = run_all_queries(100, api_obj=api, in_drs_obj=in_drs)

    # Same report for every series: "<label>p5 - p50 - p95".
    for label, timings in (("q2: ", q2), ("q3: ", q3), ("q4: ", q4)):
        samples = np.array(timings)
        low = np.percentile(samples, 5)
        median = np.percentile(samples, 50)
        high = np.percentile(samples, 95)
        print(label + str(low) + " - " + str(median) + " - " + str(high))
Example #3
0
def experiment_changing_input_size(repetitions=100):
    """Benchmark the queries while varying the argument to ``fields_degree``.

    One synthetic network (100000 nodes, 90000 relations of each kind) is
    generated up front.  For every value from 1 to 50 the nodes returned by
    ``fields_degree(value)`` are converted into an input DRS and
    ``run_all_queries`` is executed on it.

    :param repetitions: forwarded to ``run_all_queries``.
    :return: dict mapping each value (1..50) to the result of
        ``get_percentiles`` over the three timing series.
    """
    network = syn.generate_network_with(num_nodes=100000,
                                        num_nodes_per_table=10,
                                        num_schema_sim=90000,
                                        num_content_sim=90000,
                                        num_pkfk=90000)
    api = API(network)

    results = {}
    for degree in range(1, 51):
        node_ids = [nid for nid, _ in network.fields_degree(degree)]
        hits = network.get_hits_from_info(network.get_info_for(node_ids))
        in_drs = api.drs_from_hits(hits)

        q2, q3, q4 = run_all_queries(repetitions,
                                     api_obj=api,
                                     in_drs_obj=in_drs)
        results[degree] = get_percentiles([q2, q3, q4])
    return results
Example #4
0
def experiment_changing_graph_density_constant_size(repetitions=10):
    """Benchmark the queries on networks of equal size but growing density.

    For each relation count in 100, 1000, ..., 1000000 a fresh 100000-node
    synthetic network is generated with that many schema-similarity,
    content-similarity and PKFK relations.  The nodes returned by
    ``fields_degree(3)`` form the input DRS for ``run_all_queries``.

    :param repetitions: forwarded to ``run_all_queries``.
    :return: dict mapping each relation count to the result of
        ``get_percentiles`` over the three timing series.
    """
    node_count = 100000
    results = {}
    for relation_count in (100, 1000, 10000, 100000, 1000000):
        network = syn.generate_network_with(num_nodes=node_count,
                                            num_nodes_per_table=10,
                                            num_schema_sim=relation_count,
                                            num_content_sim=relation_count,
                                            num_pkfk=relation_count)
        api = API(network)

        node_ids = [nid for nid, _ in network.fields_degree(3)]
        hits = network.get_hits_from_info(network.get_info_for(node_ids))
        in_drs = api.drs_from_hits(hits)

        q2, q3, q4 = run_all_queries(repetitions,
                                     api_obj=api,
                                     in_drs_obj=in_drs)
        results[relation_count] = get_percentiles([q2, q3, q4])

    return results
Example #5
0
def experiment_changing_max_hops_tc_queries(repetitions=100):
    """Time ``api.traverse`` over schema-similarity edges for 1..10 max hops.

    For every hop limit a synthetic network is generated (100000 nodes and
    100000 relations of each kind), the nodes returned by
    ``fields_degree(1)`` are turned into an input DRS, and ``traverse`` is
    called ``repetitions`` times while the elapsed time of each call is
    recorded.

    :param repetitions: number of timed ``traverse`` calls per hop limit.
    :return: dict mapping each hop limit (1..10) to the result of
        ``get_percentiles`` over that limit's list of elapsed times
        (seconds).
    """
    perf_results = dict()
    for max_hops in range(1, 11):
        # The network is regenerated for every hop limit, exactly as before.
        # NOTE(review): the generation parameters never change; if generation
        # is deterministic this could be hoisted -- confirm before doing so.
        fn = syn.generate_network_with(num_nodes=100000,
                                       num_nodes_per_table=10,
                                       num_schema_sim=100000,
                                       num_content_sim=100000,
                                       num_pkfk=100000)

        api = API(fn)

        nodes = fn.fields_degree(1)
        nids = [nid for nid, _ in nodes]
        info = fn.get_info_for(nids)
        hits = fn.get_hits_from_info(info)
        in_drs = api.drs_from_hits(hits)

        query_times = []
        for _ in range(repetitions):
            # perf_counter() is a monotonic, high-resolution clock meant for
            # measuring short durations; time.time() can jump when the system
            # clock is adjusted, which would corrupt individual samples.
            start = time.perf_counter()
            api.traverse(in_drs, Relation.SCHEMA_SIM, max_hops=max_hops)
            query_times.append(time.perf_counter() - start)

        perf_results[max_hops] = get_percentiles([query_times])
    return perf_results
Example #6
0
    def test_ranking_certainty_chem(self):
        # Smoke test: load the model under '../models/chemical/', look up the
        # content-similar results for the 'activities' table and print the
        # column and table scores, first after rank_certainty() and then
        # after rank_coverage().  Nothing is asserted.
        api = API(deserialize_network('../models/chemical/'))
        api.init_store()

        sim_tables = api.similar_content_to(api.drs_from_table('activities'))

        def show_scores(columns_header, tables_header):
            # Print both score listings, each followed by an empty line.
            print(columns_header)
            sim_tables.pretty_print_columns_with_scores()
            print("")
            print(tables_header)
            sim_tables.print_tables_with_scores()
            print("")

        sim_tables.rank_certainty()
        show_scores("All columns CERTAINTY: ", "All tables CERTAINTY: ")

        sim_tables.rank_coverage()
        show_scores("All columns COVERAGE: ", "All tables COVERAGE: ")
Example #7
0
def init_system(path_to_serialized_model, reporting=True):
    """Load a serialized model and return a ready-to-use API.

    The network is deserialized, wrapped in an ``API`` whose store is
    initialized, ``api.help()`` is called, and the elapsed time of the whole
    sequence is printed.

    :param path_to_serialized_model: location handed to
        ``fieldnetwork.deserialize_network``.
    :param reporting: when truthy, a ``Report`` over the network is built
        and returned; when falsy, the value is returned unchanged.
    :return: tuple ``(api, reporting)`` where the second item is either the
        ``Report`` instance or the falsy ``reporting`` argument.
    """
    print_md('Loading: *' + str(path_to_serialized_model) + "*")
    started = time.time()
    network = fieldnetwork.deserialize_network(path_to_serialized_model)
    api = API(network)
    # The Report is created before the store is initialized.
    report = Report(network) if reporting else reporting
    api.init_store()
    api.help()
    elapsed = time.time() - started
    print("Took " + str(elapsed) + " to load all data")
    return api, report
class TestReporting(unittest.TestCase):
    """Smoke test that prints the statistics exposed by ``Report``.

    The class-level statements run when the class body is executed: they
    create a store handler, deserialize the network found at ``path`` and
    initialize an ``API`` over it.
    """

    # create store handler
    store_client = StoreHandler()
    # read graph
    path = '../test/test4/'
    network = deserialize_network(path)
    api = API(network)
    api.init_store()

    def test_compute_statistics(self):
        # Read every statistic first, then print them one per line.
        report = Report(self.network)
        statistics = (
            ("Num cols: ", report.num_columns),
            ("Num tables: ", report.num_tables),
            ("Num content sim relations: ", report.num_content_sim_relations),
            ("Num schema sim relations: ", report.num_schema_sim_relations),
            ("Num PKFK relations: ", report.num_pkfk_relations),
        )
        for label, value in statistics:
            print(label + str(value))
Example #9
0
class TestApiutils(unittest.TestCase):
    """Tests for DRS iteration, provenance tracking and set operations.

    Every fixture is built in memory from ``Hit`` objects; the shared
    ``api`` is created without a network.  Helper methods start with an
    underscore so the unittest loader does not collect them as tests.
    """

    api = API(None)

    # ------------------------------------------------------------------
    # Fixture helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _table_ab_hits():
        # Fresh hits for columns a/b of table_a and c/d of table_b (ids 0-3).
        return [
            Hit(0, "dba", "table_a", "a", -1),
            Hit(1, "dba", "table_a", "b", -1),
            Hit(2, "dba", "table_b", "c", -1),
            Hit(3, "dba", "table_b", "d", -1),
        ]

    @staticmethod
    def _table_d_hits():
        # Fresh hits for columns a/b of table_d (ids 16 and 17).
        return [
            Hit(16, "dba", "table_d", "a", -1),
            Hit(17, "dba", "table_d", "b", -1),
        ]

    @classmethod
    def _derived_pair(cls):
        # drs1: table_a/table_b hits produced by CONTENT_SIM from table_c.v
        # drs2: table_d hits produced by SCHEMA_SIM from table_a.b
        source1 = Hit(10, "dba", "table_c", "v", -1)
        drs1 = DRS(cls._table_ab_hits(),
                   Operation(OP.CONTENT_SIM, params=[source1]))
        source2 = Hit(1, "dba", "table_a", "b", -1)
        drs2 = DRS(cls._table_d_hits(),
                   Operation(OP.SCHEMA_SIM, params=[source2]))
        return drs1, drs2

    @classmethod
    def _origin_pair(cls):
        # drs1 holds five hits (ids 10, 0, 1, 2, 3); drs2 holds three hits
        # (ids 1, 16, 17).  Both are ORIGIN results; the hit with id 1
        # (table_a.b) appears in both.
        drs1 = DRS([Hit(10, "dba", "table_c", "v", -1)]
                   + cls._table_ab_hits(),
                   Operation(OP.ORIGIN))
        drs2 = DRS([Hit(1, "dba", "table_a", "b", -1)]
                   + cls._table_d_hits(),
                   Operation(OP.ORIGIN))
        return drs1, drs2

    @staticmethod
    def _print_provenance(drs):
        # Dump the nodes and (keyed) edges of the DRS provenance graph.
        prov_graph = drs.get_provenance().prov_graph()
        nodes = prov_graph.nodes()
        print("NODES")
        for node in nodes:
            print(str(node))
        print(" ")
        edges = prov_graph.edges(keys=True)
        print("EDGES")
        for edge in edges:
            print(str(edge))
        print(" ")

    # ------------------------------------------------------------------
    # Iteration (smoke tests: they pass when iteration does not raise)
    # ------------------------------------------------------------------

    def test_drs_field_iteration(self):
        print(self._testMethodName)

        drs = DRS(self._table_ab_hits(), Operation(OP.ORIGIN))
        drs.set_fields_mode()

        for el in drs:
            print(str(el))

    def test_drs_table_iteration(self):
        print(self._testMethodName)

        drs = DRS(self._table_ab_hits(), Operation(OP.ORIGIN))
        drs.set_table_mode()

        for el in drs:
            print(str(el))

    # ------------------------------------------------------------------
    # Provenance
    # ------------------------------------------------------------------

    def test_creation_initial_provenance(self):
        # Smoke test: building the provenance graph must not raise.
        print(self._testMethodName)

        source = Hit(10, "dba", "table_c", "v", -1)
        drs = DRS(self._table_ab_hits(),
                  Operation(OP.CONTENT_SIM, params=[source]))

        self._print_provenance(drs)

    def test_absorb_provenance(self):
        # After absorbing only the provenance of drs2, every element of drs1
        # must still be present in the returned DRS.
        print(self._testMethodName)

        drs1, drs2 = self._derived_pair()
        drs = drs1.absorb_provenance(drs2)

        self._print_provenance(drs)

        init_data = set(drs1)
        merged_data = set(drs)
        missing = init_data - merged_data

        print("Len must be 0: " + str(len(missing)))

        # assertEqual reports both values on failure, unlike assertTrue(a == b).
        self.assertEqual(len(missing), 0)

    def test_absorb(self):
        # The absorbed DRS must have as many elements as the set union of
        # both inputs.
        print(self._testMethodName)

        drs1, drs2 = self._derived_pair()
        drs = drs1.absorb(drs2)

        self._print_provenance(drs)

        drs1_data = set(drs1)
        drs2_data = set(drs2)
        merged_data = set(drs)

        lm = len(merged_data)
        lu = len(drs1_data.union(drs2_data))

        print("Len must be 0: " + str(lu - lm))

        self.assertEqual(lu - lm, 0)

    # ------------------------------------------------------------------
    # Set operations
    # ------------------------------------------------------------------

    def test_intersection(self):
        print(self._testMethodName)

        drs1, drs2 = self._origin_pair()
        drs = drs1.intersection(drs2)

        self._print_provenance(drs)

        ld = len(list(drs))

        print("Len must be 1: " + str(ld))

        self.assertEqual(ld, 1)

    def test_union(self):
        print(self._testMethodName)

        drs1, drs2 = self._origin_pair()
        drs = drs1.union(drs2)

        self._print_provenance(drs)

        ld = len(list(drs))

        print("Len must be 7: " + str(ld))

        self.assertEqual(ld, 7)

    def test_sdifference(self):
        print(self._testMethodName)

        drs1, drs2 = self._origin_pair()
        drs = drs1.set_difference(drs2)

        self._print_provenance(drs)

        ld = len(list(drs))

        print("Len must be 4: " + str(ld))

        self.assertEqual(ld, 4)
class TestDDApiPathQueries(unittest.TestCase):
    """Smoke tests for the path and traversal (TC) primitives of the API.

    The tests only print what the API returns; none of them asserts on the
    results.  The class-level statements run when the class body is executed
    and load the model found at ``path``.
    """

    # create store handler
    store_client = StoreHandler()
    # read graph
    path = 'models/chemical/'
    network = deserialize_network(path)
    api = API(network)
    api.init_store()

    # ---- TC primitive API ----

    @staticmethod
    def _print_results(res):
        # Materialise the result once, then report its size and each element.
        found = list(res)
        print("Total results: " + str(len(found)))
        for item in found:
            print(str(item))

    def _record_id_fields(self):
        # DRS objects for the record_id column of the two tables under test.
        source = ('chembl_21', 'drug_indication', 'record_id')
        target = ('chembl_21', 'compound_records', 'record_id')
        return (self.api.drs_from_raw_field(source),
                self.api.drs_from_raw_field(target))

    def test_paths_between_field_mode(self):
        print(self._testMethodName)

        drs1, drs2 = self._record_id_fields()

        res = self.api.paths_between(drs1, drs2, Relation.PKFK)
        self._print_results(res)

    def test_paths_between_table_mode(self):
        print(self._testMethodName)

        drs1, drs2 = self._record_id_fields()
        drs1.set_table_mode()
        drs2.set_table_mode()

        res = self.api.paths_between(drs1, drs2, Relation.PKFK)
        self._print_results(res)

        print("Paths: ")
        res.visualize_provenance()
        res.debug_print()
        for found_path in res.paths():
            print(str(found_path))

    def test_paths_between_from_tables(self):
        print(self._testMethodName)

        table1 = self.api.drs_from_table("drug_indication")
        table2 = self.api.drs_from_table("compound_records")
        table1.set_table_mode()
        table2.set_table_mode()

        res = self.api.paths_between(table1, table2, Relation.PKFK)
        self._print_results(res)

        print("Paths: ")
        for found_path in res.paths():
            print(str(found_path))

    def test_paths(self):
        # Placeholder: only announces itself.
        print(self._testMethodName)

    def test_traverse(self):
        print(self._testMethodName)

        start_field = ('chembl_21', 'drug_indication', 'record_id')
        drs_field = self.api.drs_from_raw_field(start_field)

        res = self.api.traverse(drs_field, Relation.SCHEMA_SIM, 1)
        self._print_results(res)
class TestProvenance(unittest.TestCase):
    """Smoke tests that print the provenance of several kinds of results.

    Each test obtains a result from the API, prints its provenance graph and
    asks ``why`` / ``how`` for the first element of the result.  The
    class-level statements run when the class body is executed and load the
    model found at ``path``.
    """

    # create store handler
    store_client = StoreHandler()
    # read graph
    path = '../test/test4/'
    network = deserialize_network(path)
    api = API(network)
    api.init_store()

    @staticmethod
    def _explain_first(res):
        # Print the provenance graph, then the WHY and HOW of the first
        # element.  The result is fully materialised before indexing, so an
        # empty result raises IndexError here.
        print(res.get_provenance().prov_graph().nodes())
        print(res.get_provenance().prov_graph().edges())

        first = list(res)[0]

        reason = res.why(first)
        print("WHY " + str(first) + "? " + str(reason))

        derivation = res.how(first)
        print("HOW " + str(first) + "? " + str(derivation))

    def test_keyword_provenance(self):
        print(self._testMethodName)

        res = self.api.keyword_search("Madden", max_results=10)

        self._explain_first(res)
        self.assertTrue(True)

    def test_content_sim_provenance(self):
        print(self._testMethodName)

        res = self.api.similar_content_to_table('Buildings.csv')

        self._explain_first(res)
        self.assertTrue(True)

    def test_intersection_provenance(self):
        print(self._testMethodName)

        madden = self.api.keyword_search("Madden", max_results=10)
        stonebraker = self.api.keyword_search("Stonebraker", max_results=10)

        res = madden.intersection(stonebraker)

        self._explain_first(res)
        self.assertTrue(True)

    def test_tc_table_mode_provenance(self):
        print(self._testMethodName)

        source = ('dwhsmall', 'All_olap2_uentity_desc_uses.csv',
                  'Entity Owner')
        target = ('dwhsmall', 'All_olap_entity_desc_uses.csv', 'Entity Owner')

        drs1 = self.api.drs_from_raw_field(source)
        drs2 = self.api.drs_from_raw_field(target)
        drs1.set_table_mode()
        drs2.set_table_mode()

        res = self.api.paths_between(drs1, drs2, Relation.PKFK)

        self._explain_first(res)
        self.assertTrue(True)
Example #12
0
class TestDDApi(unittest.TestCase):
    """Smoke tests that exercise the seed, primitive and combiner API calls.

    Every test prints what the API returns; none of them asserts on the
    results.  The class-level statements run when the class body is executed
    and load the model found at ``path``.
    """

    # create store handler
    store_client = StoreHandler()
    # read graph
    path = 'models/dwh/'
    network = deserialize_network(path)
    api = API(network)
    api.init_store()

    # Raw (db, table, column) fields reused by several tests.
    _AFFILIATION = ('mitdwh', 'Iap_subject_person.csv',
                    'Person Mit Affiliation')
    _BUILDING_NAME = ('mitdwh', 'Buildings.csv', 'Building Name')
    _BUILDING_KEY = ('mitdwh', 'Buildings.csv', 'Building Key')

    @staticmethod
    def _show(res):
        # Print every element of a result, one per line.
        for item in res:
            print(str(item))

    @classmethod
    def _show_sized(cls, res):
        # Print the reported size of a result followed by its elements.
        print("RES size: " + str(res.size()))
        cls._show(res)

    # ---- Seed API ----

    def test_drs_from_raw_field(self):
        print(self._testMethodName)

        self._show(self.api.drs_from_raw_field(self._AFFILIATION))

    def test_drs_from_hit(self):
        print(self._testMethodName)

        seed = self.api.drs_from_raw_field(self._AFFILIATION)
        first_hit = list(seed)[0]

        self._show(self.api.drs_from_hit(first_hit))

    def test_drs_from_table(self):
        print(self._testMethodName)

        self._show(self.api.drs_from_table('Iap_subject_person.csv'))

    def test_drs_from_table_hit(self):
        print(self._testMethodName)

        seed = self.api.drs_from_raw_field(self._AFFILIATION)
        first_hit = list(seed)[0]

        self._show(self.api.drs_from_table_hit(first_hit))

    # ---- Primitive API ----

    def test_keyword_search(self):
        print(self._testMethodName)

        self._show(self.api.keyword_search("Madden", max_results=10))

    def test_keywords_search(self):
        print(self._testMethodName)

        keywords = ["Madden", "Stonebraker", "Liskov"]
        self._show(self.api.keywords_search(keywords))

    def test_schema_name_search(self):
        print(self._testMethodName)

        self._show(self.api.schema_name_search("Name", max_results=10))

    def test_schema_names_search(self):
        print(self._testMethodName)

        names = ["Name", "Last Name", "Employee"]
        self._show(self.api.schema_names_search(names))

    def test_entity_search(self):
        print(self._testMethodName)

        print("Future Work...")

    def test_schema_neighbors(self):
        print(self._testMethodName)

        self._show(self.api.schema_neighbors(self._AFFILIATION))

    def test_schema_neighbors_of(self):
        print(self._testMethodName)

        neighbors = self.api.schema_neighbors(self._AFFILIATION)

        self._show(self.api.schema_neighbors_of(neighbors))

    def test_similar_schema_name_to_field(self):
        print(self._testMethodName)

        similar = self.api.similar_schema_name_to_field(self._BUILDING_NAME)
        self._show_sized(similar)

    def test_ids_functions(self):
        print(self._testMethodName)

        drs1 = self.api.drs_from_raw_field(self._BUILDING_KEY)

        # Same values with table and column positions swapped.
        swapped = ('mitdwh', 'Building Key', 'Buildings.csv')
        drs2 = self.api.drs_from_raw_field(swapped)

        self._show(drs1)
        self._show(drs2)

    def test_similar_schema_name_to_table(self):
        print(self._testMethodName)

        similar = self.api.similar_schema_name_to_table('Buildings.csv')
        self._show_sized(similar)

    def test_similar_schema_name_to(self):
        print(self._testMethodName)

        first_step = self.api.similar_schema_name_to_field(
            self._BUILDING_KEY)

        self._show_sized(self.api.similar_schema_name_to(first_step))

    def test_similar_content_to_field(self):
        print(self._testMethodName)

        similar = self.api.similar_content_to_field(self._BUILDING_NAME)
        self._show_sized(similar)

    def test_similar_content_to_table(self):
        print(self._testMethodName)

        self._show_sized(self.api.similar_content_to_table('Buildings.csv'))

    def test_similar_content_to(self):
        print(self._testMethodName)

        first_step = self.api.similar_content_to_field(self._BUILDING_NAME)

        self._show_sized(self.api.similar_content_to(first_step))

    def test_pkfk_field(self):
        print(self._testMethodName)

        self._show_sized(self.api.pkfk_field(self._BUILDING_NAME))

    def test_pkfk_table(self):
        print(self._testMethodName)

        self._show_sized(self.api.pkfk_table('Buildings.csv'))

    def test_pkfk_of(self):
        print(self._testMethodName)

        first_step = self.api.pkfk_field(self._BUILDING_NAME)

        self._show_sized(self.api.pkfk_of(first_step))

    # ---- Combiner API ----

    def test_intersection(self):
        print(self._testMethodName)

        left = self.api.keyword_search("Madden", max_results=10)
        right = self.api.keyword_search("Stonebraker", max_results=10)

        self._show(left.intersection(right))

    def test_union(self):
        print(self._testMethodName)

        left = self.api.keyword_search("Madden", max_results=10)
        right = self.api.schema_name_search("Stonebraker", max_results=10)

        self._show(left.union(right))

    def test_difference(self):
        print(self._testMethodName)

        left = self.api.keyword_search("Madden", max_results=10)
        right = self.api.keyword_search("Stonebraker", max_results=10)

        self._show(left.set_difference(right))

    # ---- Other, bugs, etc ----

    def test_iter_edges_with_data_bug(self):
        # Combine content-similar, schema-similar and PKFK results for one
        # table and print the tables left after intersecting all three.
        table_drs = self.api.drs_from_table("Fac_building.csv")

        by_content = self.api.similar_content_to(table_drs)
        by_schema = self.api.similar_schema_name_to(table_drs)
        by_pkfk = self.api.pkfk_of(table_drs)

        content_and_schema = self.api.intersection(by_content, by_schema)
        similar_tables = self.api.intersection(content_and_schema, by_pkfk)
        similar_tables.print_tables()
Example #13
0
class TestRanking(unittest.TestCase):
    # create store handler
    store_client = StoreHandler()

    # create synthetic graph
    network = GENSYN(5, 5, 20, 50, 10)

    api = API(network)
    api.init_store()

    def test_compute_ranking_scores_certainty(self):
        # Smoke test: build a DRS from the nodes returned by
        # fields_degree(3), fetch the schema-name-similar results, rank them
        # and print the column scores.
        # NOTE(review): the test name says "certainty" but the ranking
        # applied below is rank_coverage() -- confirm which one is intended.
        degree_nodes = self.network.fields_degree(3)
        node_ids = [nid for nid, _ in degree_nodes]

        hits = self.network.get_hits_from_info(
            self.network.get_info_for(node_ids))
        seed_drs = self.api.drs_from_hits(hits)

        similar = self.api.similar_schema_name_to(seed_drs)

        # The value returned by rank_coverage() is what gets printed.
        ranked = similar.rank_coverage()
        ranked.pretty_print_columns_with_scores()

        self.assertTrue(True)

    def test_ranking_certainty_chem(self):
        # Smoke test: load the model under '../models/chemical/', look up the
        # content-similar results for the 'activities' table and print the
        # column and table scores, first after rank_certainty() and then
        # after rank_coverage().  Nothing is asserted.
        api = API(deserialize_network('../models/chemical/'))
        api.init_store()

        sim_tables = api.similar_content_to(api.drs_from_table('activities'))

        def show_scores(columns_header, tables_header):
            # Print both score listings, each followed by an empty line.
            print(columns_header)
            sim_tables.pretty_print_columns_with_scores()
            print("")
            print(tables_header)
            sim_tables.print_tables_with_scores()
            print("")

        sim_tables.rank_certainty()
        show_scores("All columns CERTAINTY: ", "All tables CERTAINTY: ")

        sim_tables.rank_coverage()
        show_scores("All columns COVERAGE: ", "All tables COVERAGE: ")

    """