def test1_smallset(self):
    """Check that all four matchers report the same (uri_1, uri_2) pairs.

    The cities input fixture is run through the relational, string
    similarity, label schema and value overlap matchers; the sorted pair
    lists of the last three must equal the relational one.
    """
    frame = pd.read_csv(
        "test/data/schema_matching/default_matches_cities_input.csv")
    pair_columns = ["uri_1", "uri_2"]

    def _sorted_pairs(matcher):
        # Keep only the pair columns and normalise row order and index so
        # that DataFrame.equals compares content, not matcher output order.
        matches = matcher(frame)
        pairs = matches[pair_columns]
        ordered = pairs.sort_values(by=pair_columns)
        return ordered.reset_index(drop=True)

    reference = _sorted_pairs(relational_matching)
    candidates = [
        _sorted_pairs(string_similarity_matching),
        _sorted_pairs(label_schema_matching),
        _sorted_pairs(value_overlap_matching),
    ]

    agreement = [reference.equals(candidate) for candidate in candidates]
    assert all(agreement)
def test3_numeric_data(self):
    """Compare value_overlap_matching on the numeric cities fixture
    against the stored expected matches.
    """
    input_csv = "test/data/schema_matching/value_matches_cities_numeric_input.csv"
    frame = pd.read_csv(input_csv)

    expected_csv = "test/data/schema_matching/value_matches_cities_numeric_expected.csv"
    expected = pd.read_csv(expected_csv)

    actual = value_overlap_matching(frame)
    # Cast the overlap column to a numeric dtype before comparing it with
    # the frame parsed from the expected CSV.
    overlap_as_number = pd.to_numeric(actual['value_overlap'])
    actual['value_overlap'] = overlap_as_number

    # check_like=True: the order of index and columns is ignored.
    pd.testing.assert_frame_equal(actual, expected, check_like=True)
def test4_no_matches_numeric_data(self):
    """Two numeric link columns are expected to yield a single pair
    whose value_overlap is 0.0.
    """
    area_metro_uri = 'http://dbpedia.org/ontology/PopulatedPlace/areaMetro'
    abstract_uri = 'http://dbpedia.org/ontology/abstract'

    # Column order is kept exactly as in the input the matcher receives.
    input_columns = {
        'city': [1, 1, 0, 1],
        'entity': ['Bremen', 'Hamburg', 'Denmark', 'Berlin'],
        'new_link_1': [
            'http://dbpedia.org/resource/Bremen',
            'http://dbpedia.org/resource/Hamburg',
            'http://dbpedia.org/resource/Denmark',
            'http://dbpedia.org/resource/Berlin',
        ],
        'Link_Out_numeric_http://dbpedia.org/ontology/PopulatedPlace/areaMetro': [1, 0, 0, 0],
        'Link_Out_numeric_http://dbpedia.org/ontology/abstract': [12, 12, 11, 12],
    }
    frame = pd.DataFrame(input_columns)

    expected_columns = {
        'uri_1': [area_metro_uri],
        'uri_2': [abstract_uri],
        'value_overlap': [0.0],
    }
    expected = pd.DataFrame(expected_columns)

    actual = value_overlap_matching(frame)

    # check_like=True: the order of index and columns is ignored.
    pd.testing.assert_frame_equal(actual, expected, check_like=True)
def test2_no_matches_boolean_data(self):
    """Two boolean link columns with complementary values are expected
    to yield a single pair whose value_overlap is 0.0.
    """
    # Column order is kept exactly as in the input the matcher receives.
    input_columns = {
        'city': [1, 1, 0, 1],
        'entity': ['Bremen', 'Hamburg', 'Denmark', 'Berlin'],
        'new_link_1': [
            'http://dbpedia.org/resource/Bremen',
            'http://dbpedia.org/resource/Hamburg',
            'http://dbpedia.org/resource/Denmark',
            'http://dbpedia.org/resource/Berlin',
        ],
        'new_link_in_boolean_http://dbpedia.org/resource/Category:German_state_capitals': [True, True, False, True],
        'new_link_in_boolean_http://dbpedia.org/resource/Category:Countries_in_Europe': [False, False, True, False],
    }
    frame = pd.DataFrame(input_columns)

    # NOTE(review): the expected uri_1 keeps the 'new_link_in_boolean_'
    # prefix while uri_2 does not -- confirm against value_overlap_matching
    # that this asymmetry is intended and not a typo in the expectation.
    expected_columns = {
        'uri_1': ['new_link_in_boolean_http://dbpedia.org/resource/Category:Countries_in_Europe'],
        'uri_2': ['http://dbpedia.org/resource/Category:German_state_capitals'],
        'value_overlap': [0.0],
    }
    expected = pd.DataFrame(expected_columns)

    actual = value_overlap_matching(frame)

    # check_like=True: the order of index and columns is ignored.
    pd.testing.assert_frame_equal(actual, expected, check_like=True)
def test2_bigset(self):
    """Check that all four matchers report the same (uri_1, uri_2) pairs
    on the big pair-equality fixture.
    """
    # WARNING: Takes long to run!
    frame = pd.read_csv(
        "test/data/schema_matching/pair_equality_test2_bigset.csv")
    pair_columns = ["uri_1", "uri_2"]

    def _sorted_pairs(matcher):
        # Keep only the pair columns and normalise row order and index so
        # that DataFrame.equals compares content, not matcher output order.
        matches = matcher(frame)
        pairs = matches[pair_columns]
        ordered = pairs.sort_values(by=pair_columns)
        return ordered.reset_index(drop=True)

    reference = _sorted_pairs(relational_matching)
    candidates = [
        _sorted_pairs(string_similarity_matching),
        _sorted_pairs(label_schema_matching),
        _sorted_pairs(value_overlap_matching),
    ]

    agreement = [reference.equals(candidate) for candidate in candidates]
    assert all(agreement)