def test_benchmark_agdistis():
    """Benchmark AGDISTIS disambiguation against the gold-standard entities.

    Every regular file in ``ENTITIES_DIR`` is treated as a table id. For each
    one the gold-standard entities and the AGDISTIS result (mapped to the
    gold-standard format) are printed together with their diff.

    A table that raises is reported and skipped, so one broken table can
    neither abort the benchmark nor be retried forever.
    """
    agdistis_wrapper = AgdistisWrapper()
    onlyfiles = [
        f for f in listdir(ENTITIES_DIR) if isfile(join(ENTITIES_DIR, f))
    ]
    total = len(onlyfiles)
    for num, _id in enumerate(onlyfiles):
        try:
            print("process table %d out of %d" % (num, total), flush=True)
            print("table id %s" % (_id), flush=True)
            fixture_entities = get_gold_standard_entities(_id)
            _table = GenericTable(filename=join(TABLES_DIR, _id), _id=_id)
            _table.init()
            agdistis_entities = agdistis_wrapper.disambiguate_table(_table)
            to_compare = map_agdistis_entities_to_gold_standard_format(
                _table, agdistis_entities)
            print("", flush=True)
            print(fixture_entities, flush=True)
            print("", flush=True)
            print(to_compare, flush=True)
            print(diff_entities(fixture_entities, to_compare), flush=True)
        except Exception as e:
            # Catch Exception rather than BaseException so KeyboardInterrupt
            # and SystemExit can still stop a long benchmark run.
            print(str(e), flush=True)
Beispiel #2
0
def test_map_table_properties():
    """Check that the first mapped property carries the expected keys."""
    generic_table = GenericTable(TEST_FILENAME)
    generic_table.init()
    first_property = map_table_properties(generic_table)[0]
    for expected_key in ("uri", "prefixed_name", "score"):
        assert expected_key in first_property.keys()
def test_benchmark_dbpedia_lookup_subject_columns_only():
    """Benchmark subject-column-only disambiguation against the gold standard.

    Every regular file in ``ENTITIES_DIR`` is treated as a table id. For each
    table the subject column is identified first (the first candidate is used
    when any is returned), then only that column is disambiguated. The
    gold-standard entities, the mapped result and their diff are printed.

    A table that raises is reported and skipped, so one broken table can
    neither abort the benchmark nor be retried forever.
    """
    onlyfiles = [
        f for f in listdir(ENTITIES_DIR) if isfile(join(ENTITIES_DIR, f))
    ]
    scidentifier = SCIdentifier()
    total = len(onlyfiles)
    for num, _id in enumerate(onlyfiles):
        try:
            print("process table %d out of %d" % (num, total), flush=True)
            print("table id %s" % (_id), flush=True)
            fixture_entities = get_gold_standard_entities(_id)
            _table = GenericTable(filename=join(TABLES_DIR, _id), _id=_id)
            _table.init()
            _subject_columns = scidentifier.identify_subject_column(_table)
            if _subject_columns:
                _table.subject_column = _subject_columns[0]
            dbpedia_lookup_entities = disambiguate_table_subject_column_only(
                _table)
            to_compare = map_agdistis_entities_to_gold_standard_format(
                _table, dbpedia_lookup_entities)
            print("", flush=True)
            print(fixture_entities, flush=True)
            print("", flush=True)
            print(to_compare, flush=True)
            print(diff_entities(fixture_entities, to_compare), flush=True)
        except Exception as e:
            # Catch Exception rather than BaseException so KeyboardInterrupt
            # and SystemExit can still stop a long benchmark run.
            print(str(e), flush=True)
Beispiel #4
0
def test_disambiguate_table_subject_column_only_case_1():
    """Smoke-test subject-column-only disambiguation on ``CASE_1_TABLE``.

    The table content is injected directly and column 1 is marked as the
    subject column. No expectation on the returned entities is asserted;
    the call completing without raising is the whole check.
    """
    table = GenericTable()
    table.table = CASE_1_TABLE
    table.subject_column = 1
    # No interactive debugger breakpoint here: it would block (or fail on
    # captured stdin) when the suite runs unattended.
    disambiguate_table_subject_column_only(table)
Beispiel #5
0
def test_map_atomic_table_property():
    """Check the connectivity mapping computed for the test table."""
    generic_table = GenericTable(TEST_FILENAME)
    generic_table.init()
    expected = {'0_1': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'}
    connectivity = map_table_properties_connectivity(generic_table)
    assert connectivity == expected
Beispiel #6
0
def test_table_case():
    """Run both row-disambiguation entry points on one row of a fixed table.

    Nothing is asserted; the test only checks that both calls complete for
    the row at index 1 of the table.
    """
    from taipan.pathes import TABLES_DIR
    table_id = "34041816_1_4749054164534706977.csv"
    table_path = join(TABLES_DIR, table_id)
    generic_table = GenericTable(filename=table_path, _id=table_id)
    generic_table.init()
    target_row = generic_table.table[1]
    # Results are intentionally discarded; only successful execution matters.
    agdistis_wrapper.disambiguate_row(target_row)
    agdistis_wrapper._disambiguate_row(target_row)
Beispiel #7
0
def test_identify_subject_column_table_string():
    """Check that a CSV string table yields a non-empty list of columns."""
    csv_rows = (
        '"Region","Currency","Price","Price in ?"',
        '"Australia SA+WA","AUD","24.95","15.91"',
        '"Israel","ILS","79","15.03"',
        '"Australia","AUD","19.99","12.75"',
        '"Kuwait","KWD","4.50","11.08"',
        '"Canada","CAD","14.99","10.02"',
    )
    generic_table = GenericTable("stub", csv_string="\n".join(csv_rows))
    generic_table.init()
    detected = SCIDENTIFIER.identify_subject_column(generic_table)
    # table can not be predicted, [0] is returned by default
    assert isinstance(detected, list)
    assert len(detected) > 0
Beispiel #8
0
def test_generate_rdf():
    """Check that the generated RDF parses as N3 into more than one node."""
    generic_table = GenericTable(TEST_FILENAME)
    generic_table.init()
    serialized = generate_rdf(generic_table)

    graph = rdflib.Graph()
    graph.parse(data=serialized, format="n3")
    node_count = len(graph.all_nodes())

    assert node_count > 1
Beispiel #9
0
 def get_additional_tables(self):
     """Load the additional tables listed in ``subject_columns.csv``.

     Each row of the CSV supplies a table id and its subject column. The
     matching file under ``ADDITIONAL_DATA_DIR/tables`` is loaded, sliced to
     its first ``ROWS_TO_ANALYZE`` rows, and given that subject column.

     Returns:
         A list of the initialised ``GenericTable`` objects, in CSV order.
     """
     index_path = os.path.join(ADDITIONAL_DATA_DIR, "subject_columns.csv")
     tables_dir = os.path.join(ADDITIONAL_DATA_DIR, "tables")
     loaded = []
     for table_id, column_index in self.load_csv(index_path):
         current = GenericTable(
             filename=os.path.join(tables_dir, table_id), _id=table_id)
         current.init()
         current.table = current.table[:int(ROWS_TO_ANALYZE)]
         current.subject_column = int(column_index)
         loaded.append(current)
     return loaded
Beispiel #10
0
def test_space_delimiter():
    """Check parsing of a space-delimited CSV string into an 18x4 table."""
    parsed = GenericTable("stub", csv_string=tomtom_csv, delimiter=" ")
    parsed.init()
    expected_shape = (18, 4)
    assert parsed.table.shape == expected_shape
    assert parsed.subject_column is None
Beispiel #11
0
def test_from_string():
    """Check that a table built from ``TABLE_STRING`` has 28 rows."""
    parsed = GenericTable("stub", csv_string=TABLE_STRING)
    parsed.init()
    row_count = len(parsed.table)
    assert row_count == 28
    assert parsed.subject_column is None
Beispiel #12
0
def test_init():
    """Check that the table loaded from ``TEST_FILENAME`` has 13 rows."""
    loaded = GenericTable(TEST_FILENAME)
    loaded.init()
    row_count = len(loaded.table)
    assert row_count == 13
    assert loaded.subject_column is None
Beispiel #13
0
def test():
    """Check the subject columns identified for the ``;``-delimited table."""
    parsed = GenericTable("stub", csv_string=TABLE_STRING, delimiter=";")
    parsed.init()
    detected_columns = SCIdentifier().identify_subject_column(parsed)
    assert detected_columns == [4, 6]