def _assemble_topk_table(topk_heap, ltable, rtable, ret_key='_id',
                         l_output_prefix='ltable_', r_output_prefix='rtable_'):
    # Sort the heap so the most similar pairs come first.
    topk_heap.sort(key=lambda tup: tup[0], reverse=True)
    ret_data_col_name_list = ['_id', 'similarity']
    ltable_col_names = list(ltable.columns)
    rtable_col_names = list(rtable.columns)
    lkey = em.get_key(ltable)
    rkey = em.get_key(rtable)

    # Locate the key column in each table.
    lkey_index = 0
    rkey_index = 0
    for i in range(len(ltable_col_names)):
        if ltable_col_names[i] == lkey:
            lkey_index = i
    for i in range(len(rtable_col_names)):
        if rtable_col_names[i] == rkey:
            rkey_index = i

    # Key columns come first, followed by the remaining attributes.
    ret_data_col_name_list.append(l_output_prefix + lkey)
    ret_data_col_name_list.append(r_output_prefix + rkey)
    ltable_col_names.remove(lkey)
    rtable_col_names.remove(rkey)
    for i in range(len(ltable_col_names)):
        ret_data_col_name_list.append(l_output_prefix + ltable_col_names[i])
    for i in range(len(rtable_col_names)):
        ret_data_col_name_list.append(r_output_prefix + rtable_col_names[i])

    # Build one output tuple per pair in the heap. The heap stores row
    # positions, so use .iloc (.ix is removed in modern pandas).
    ret_tuple_list = []
    for i in range(len(topk_heap)):
        tup = topk_heap[i]
        lrecord = list(ltable.iloc[tup[1]])
        rrecord = list(rtable.iloc[tup[2]])
        ret_tuple = [i, tup[0]]
        ret_tuple.append(lrecord[lkey_index])
        ret_tuple.append(rrecord[rkey_index])
        for j in range(len(lrecord)):
            if j != lkey_index:
                ret_tuple.append(lrecord[j])
        for j in range(len(rrecord)):
            if j != rkey_index:
                ret_tuple.append(rrecord[j])
        ret_tuple_list.append(ret_tuple)

    data_frame = pd.DataFrame(ret_tuple_list)
    # When the returned data frame is empty, we cannot assign column names.
    if len(data_frame) == 0:
        return data_frame

    data_frame.columns = ret_data_col_name_list
    cm.set_candset_properties(data_frame, ret_key,
                              l_output_prefix + lkey,
                              r_output_prefix + rkey,
                              ltable, rtable)
    return data_frame
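# A minimal usage sketch for _assemble_topk_table, assuming py_entitymatching
# is importable as em and that this module's pd/cm aliases are in scope. The
# toy tables and the (similarity, l_row_position, r_row_position) heap entries
# below are hypothetical, chosen only to show the expected output shape.
def _demo_assemble_topk_table():
    import pandas as pd
    import py_entitymatching as em

    lt = pd.DataFrame({'ID': ['a1', 'a2'], 'name': ['Kevin', 'Mike']})
    rt = pd.DataFrame({'ID': ['b1', 'b2'], 'name': ['Kev', 'Michael']})
    em.set_key(lt, 'ID')
    em.set_key(rt, 'ID')
    # Each heap entry pairs a similarity score with row positions in lt/rt.
    heap = [(0.9, 0, 0), (0.4, 1, 1)]
    out = _assemble_topk_table(heap, lt, rt)
    # Columns: _id, similarity, ltable_ID, rtable_ID, ltable_name, rtable_name
    print(list(out.columns))
    print(out)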
def test_assemble_topk_table_2(self):
    A = read_csv_metadata(path_a, key='ID')
    B = read_csv_metadata(path_b, key='ID')
    A_key = em.get_key(A)
    B_key = em.get_key(B)
    topk_heap = [(0.2727272727272727, 1, 0), (0.23076923076923078, 0, 4),
                 (0.16666666666666666, 0, 3)]
    ret_dataframe = db._assemble_topk_table(topk_heap, A, B, A_key, B_key)
    expected_columns = ['_id', 'ltable_ID', 'rtable_ID', 'ltable_name',
                        'ltable_birth_year', 'ltable_hourly_wage',
                        'ltable_address', 'ltable_zipcode', 'rtable_name',
                        'rtable_birth_year', 'rtable_hourly_wage',
                        'rtable_address', 'rtable_zipcode']
    self.assertEqual(len(ret_dataframe), 3)
    self.assertEqual(list(ret_dataframe.columns), expected_columns)
    expected_recs = [
        [0, 'a2', 'b1', 'Michael Franklin', 1988, 27.5,
         '1652 Stockton St, San Francisco', 94122, 'Mark Levene', 1987, 29.5,
         '108 Clement St, San Francisco', 94107],
        [1, 'a1', 'b5', 'Kevin Smith', 1989, 30.0,
         '607 From St, San Francisco', 94107, 'Alfons Kemper', 1984, 35.0,
         '170 Post St, Apt 4, San Francisco', 94122],
        [2, 'a1', 'b4', 'Kevin Smith', 1989, 30.0,
         '607 From St, San Francisco', 94107, 'Joseph Kuan', 1982, 26.0,
         '108 South Park, San Francisco', 94122]]
    self.assertEqual(list(ret_dataframe.loc[0]), expected_recs[0])
    self.assertEqual(list(ret_dataframe.loc[1]), expected_recs[1])
    self.assertEqual(list(ret_dataframe.loc[2]), expected_recs[2])
def validate_metadata_two_candsets(C, D):
    assert_equal(sorted(C.columns), sorted(D.columns))
    assert_equal(em.get_key(D), em.get_key(C))
    assert_equal(em.get_property(D, 'fk_ltable'), em.get_property(C, 'fk_ltable'))
    assert_equal(em.get_property(D, 'fk_rtable'), em.get_property(C, 'fk_rtable'))
def test_select_features_1(self):
    A = read_csv_metadata(path_a, key='ID')
    B = read_csv_metadata(path_b, key='ID')
    A_key = em.get_key(A)
    B_key = em.get_key(B)
    actual_selected_features = db._select_features(A, B, A_key, B_key)
    expected_selected_features = [1, 3, 4, 2, 5]
    self.assertEqual(actual_selected_features, expected_selected_features)
def test_validate_types_1(self):
    A = read_csv_metadata(path_a, key='ID')
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B,
                          fk_ltable='ltable_ID', fk_rtable='rtable_ID',
                          key='_id')
    A_key = em.get_key(A)
    B_key = em.get_key(B)
    attr_corres = None
    db._validate_types(A, B, C, 100, attr_corres, False)
def test_assemble_topk_table_1(self):
    A = read_csv_metadata(path_a, key='ID')
    B = read_csv_metadata(path_b, key='ID')
    A_key = em.get_key(A)
    B_key = em.get_key(B)
    topk_heap = []
    ret_dataframe = db._assemble_topk_table(topk_heap, A, B, A_key, B_key)
    self.assertEqual(len(ret_dataframe), 0)
    self.assertEqual(list(ret_dataframe.columns), [])
def test_select_features_3(self):
    A = read_csv_metadata(path_a, key='ID')
    B = read_csv_metadata(path_b, key='ID')
    A_key = em.get_key(A)
    B_key = em.get_key(B)
    corres_list = [(0, 0)]
    A_filtered, B_filtered = db._get_filtered_table(A, B, A_key, B_key, corres_list)
    actual_selected_features = db._select_features(A_filtered, B_filtered, A_key)
    expected_selected_features = []
    self.assertEqual(actual_selected_features, expected_selected_features)
def test_get_field_correspondence_list_2(self):
    A = read_csv_metadata(path_a, key='ID')
    B = read_csv_metadata(path_b, key='ID')
    A_key = em.get_key(A)
    B_key = em.get_key(B)
    expected_list = [('ID', 'ID'), ('name', 'name'),
                     ('address', 'address'), ('zipcode', 'zipcode')]
    attr_corres = [('ID', 'ID'), ('name', 'name'),
                   ('address', 'address'), ('zipcode', 'zipcode')]
    corres_list = db._get_field_correspondence_list(A, B, A_key, B_key,
                                                    attr_corres)
    self.assertEqual(corres_list, expected_list)
def test_validate_types_2(self):
    A = read_csv_metadata(path_a, key='ID')
    B = read_csv_metadata(path_b, key='ID')
    A_key = em.get_key(A)
    B_key = em.get_key(B)
    C = read_csv_metadata(path_c, ltable=A, rtable=B,
                          fk_ltable='ltable_' + A_key,
                          fk_rtable='rtable_' + B_key,
                          key='_id')
    attr_corres = [('ID', 'ID'), ('name', 'name'),
                   ('birth_year', 'birth_year'),
                   ('hourly_wage', 'hourly_wage'),
                   ('address', 'address'), ('zipcode', 'zipcode')]
    db._validate_types(A, B, C, 100, attr_corres, False)
def test_select_features_2(self):
    A = read_csv_metadata(path_a, key='ID')
    B = read_csv_metadata(path_b, key='ID')
    A_key = em.get_key(A)
    B_key = em.get_key(B)
    cols_A = list(A.columns)
    cols_B = list(B.columns)
    corres_list = [(cols_A[0], cols_B[0]), (cols_A[1], cols_B[1]),
                   (cols_A[4], cols_B[4])]
    A_filtered, B_filtered = db._get_filtered_table(A, B, corres_list)
    actual_selected_features = db._select_features(A_filtered, B_filtered,
                                                   A_key, B_key)
    expected_selected_features = [1, 2]
    self.assertEqual(actual_selected_features, expected_selected_features)
def test_select_features_3(self):
    A = read_csv_metadata(path_a, key='ID')
    B = read_csv_metadata(path_b, key='ID')
    A_key = em.get_key(A)
    B_key = em.get_key(B)
    cols_A = list(A.columns)
    cols_B = list(B.columns)
    corres_list = [(cols_A[0], cols_B[0])]
    A_filtered, B_filtered = db._get_filtered_table(A, B, corres_list)
    actual_selected_features = db._select_features(A_filtered, B_filtered,
                                                   A_key, B_key)
    expected_selected_features = []
    self.assertEqual(actual_selected_features, expected_selected_features)
def test_get_feature_weight_2(self):
    A = read_csv_metadata(path_a, key='ID')
    B = read_csv_metadata(path_b, key='ID')
    A_key = em.get_key(A)
    B_key = em.get_key(B)
    corres_list = [(0, 0), (1, 1), (4, 4), (5, 5)]
    A_filtered, B_filtered = db._get_filtered_table(A, B, A_key, B_key,
                                                    corres_list)
    A_wlist = db._get_feature_weight(A_filtered)
    expected_A_wlist = [2.0, 2.0, 2.0, 1.4]
    self.assertEqual(A_wlist, expected_A_wlist)
    B_wlist = db._get_feature_weight(B_filtered)
    expected_B_wlist = [2.0, 2.0, 2.0, 1.3333333333333333]
    self.assertEqual(B_wlist, expected_B_wlist)
def test_get_field_correspondence_list_5(self):
    A = pd.DataFrame([[0, 'A', 0.11, 'ASDF']])
    A.columns = ['ID', 'name', 'price', 'desc']
    em.set_key(A, 'ID')
    A_key = em.get_key(A)
    B = pd.DataFrame([['B', 'B001', 'ASDF', 0.111]])
    B.columns = ['item_name', 'item_id', 'item_desc', 'item_price']
    em.set_key(B, 'item_id')
    B_key = em.get_key(B)
    attr_corres = [('name', 'item_name'), ('price', 'item_price')]
    actual_attr_corres = db._get_field_correspondence_list(A, B, A_key, B_key,
                                                           attr_corres)
    expected_attr_corres = [('name', 'item_name'), ('price', 'item_price'),
                            ('ID', 'item_id')]
    self.assertEqual(expected_attr_corres, actual_attr_corres)
def test_filter_corres_list_2(self):
    A = read_csv_metadata(path_a, key='ID')
    B = read_csv_metadata(path_b, key='ID')
    A_key = em.get_key(A)
    B_key = em.get_key(B)
    ltable_col_dict = db._build_col_name_index_dict(A)
    rtable_col_dict = db._build_col_name_index_dict(B)
    attr_corres = [('ID', 'ID'), ('name', 'name'),
                   ('birth_year', 'birth_year'),
                   ('hourly_wage', 'hourly_wage'),
                   ('address', 'address'), ('zipcode', 'zipcode')]
    expected_filtered_attr = [('ID', 'ID'), ('name', 'name'),
                              ('address', 'address')]
    db._filter_corres_list(A, B, A_key, B_key, ltable_col_dict,
                           rtable_col_dict, attr_corres)
    self.assertEqual(expected_filtered_attr, attr_corres)
def test_select_features_4(self):
    A = read_csv_metadata(path_a, key='ID')
    B = read_csv_metadata(path_b, key='ID')
    A_key = em.get_key(A)
    B_key = em.get_key(B)
    cols_A = list(A.columns)
    cols_B = list(B.columns)
    A_field_set = [0, 1, 2]
    B_field_set = [0, 1, 2, 3]
    A_field_set = list(itemgetter(*A_field_set)(cols_A))
    B_field_set = list(itemgetter(*B_field_set)(cols_B))
    A_filtered = A[A_field_set]
    B_filtered = B[B_field_set]
    db._select_features(A_filtered, B_filtered, A_key, B_key)
def test_get_feature_weight_2(self):
    A = read_csv_metadata(path_a, key='ID')
    B = read_csv_metadata(path_b, key='ID')
    A_key = em.get_key(A)
    B_key = em.get_key(B)
    cols_A = list(A.columns)
    cols_B = list(B.columns)
    corres_list = [(cols_A[0], cols_B[0]), (cols_A[1], cols_B[1]),
                   (cols_A[4], cols_B[4]), (cols_A[5], cols_B[5])]
    A_filtered, B_filtered = db._get_filtered_table(A, B, A_key, B_key,
                                                    corres_list)
    A_wlist = db._get_feature_weight(A_filtered)
    expected_A_wlist = [2.0, 2.0, 2.0, 1.4]
    self.assertEqual(A_wlist, expected_A_wlist)
    B_wlist = db._get_feature_weight(B_filtered)
    expected_B_wlist = [2.0, 2.0, 2.0, 1.3333333333333333]
    self.assertEqual(B_wlist, expected_B_wlist)
def test_get_feature_weight_2(self):
    A = read_csv_metadata(path_a, key='ID')
    B = read_csv_metadata(path_b, key='ID')
    A_key = em.get_key(A)
    B_key = em.get_key(B)
    cols_A = list(A.columns)
    cols_B = list(B.columns)
    corres_list = [(cols_A[0], cols_B[0]), (cols_A[1], cols_B[1]),
                   (cols_A[4], cols_B[4]), (cols_A[5], cols_B[5])]
    A_filtered, B_filtered = db._get_filtered_table(A, B, corres_list)
    A_wlist = db._get_feature_weight(A_filtered)
    expected_A_wlist = [2.0, 2.0, 2.0, 1.4]
    self.assertEqual(A_wlist, expected_A_wlist)
    B_wlist = db._get_feature_weight(B_filtered)
    expected_B_wlist = [2.0, 2.0, 2.0, 1.3333333333333333]
    self.assertEqual(B_wlist, expected_B_wlist)
def test_select_features_5(self):
    A = read_csv_metadata(path_a, key='ID')
    B = read_csv_metadata(path_b, key='ID')
    A_key = em.get_key(A)
    A_field_set = [0, 1, 2, 3]
    B_field_set = [0, 1, 2]
    # Select columns by position; an integer list passed to A[...] no longer
    # falls back to positional selection in modern pandas.
    A_filtered = A.iloc[:, A_field_set]
    B_filtered = B.iloc[:, B_field_set]
    db._select_features(A_filtered, B_filtered, A_key)
def test_get_tokenized_table_2(self):
    B = read_csv_metadata(path_b, key='ID')
    B_key = em.get_key(B)
    feature_list = [0, 1, 3]
    actual_record_list = db._get_tokenized_table(B, B_key, feature_list)
    expected_record_list = [
        [('b1', 0), ('mark', 1), ('levene', 1), ('30', 2)],
        [('b2', 0), ('bill', 1), ('bridge', 1), ('32', 2)],
        [('b3', 0), ('mike', 1), ('franklin', 1), ('28', 2)],
        [('b4', 0), ('joseph', 1), ('kuan', 1), ('26', 2)],
        [('b5', 0), ('alfons', 1), ('kemper', 1), ('35', 2)],
        [('b6', 0), ('michael', 1), ('brodie', 1), ('32', 2)]]
    self.assertEqual(actual_record_list, expected_record_list)
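# Reading the expected list above: _get_tokenized_table lower-cases each
# selected attribute value, splits it on whitespace, and tags every token with
# the position of its field in feature_list. For example, with
# feature_list = [0, 1, 3], ('mark', 1) is a token from field 1 (name) and
# ('30', 2) comes from the third selected field (hourly_wage, column index 3).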
def test_get_tokenized_table_2(self):
    B = read_csv_metadata(path_b, key='ID')
    B_key = em.get_key(B)
    feature_list = [0, 1, 3]
    actual_record_list = db._get_tokenized_table(B, B_key, feature_list)
    expected_record_list = []
    test_file_path = os.sep.join([debugblocker_datasets_path,
                                  'test_get_tokenized_table_2.txt'])
    with open(test_file_path, 'r') as f:
        for line in f:
            expected_record_list.append(line.strip().split(' '))
    self.assertEqual(actual_record_list, expected_record_list)
def test_get_tokenized_table_1(self):
    A = read_csv_metadata(path_a, key='ID')
    A_key = em.get_key(A)
    feature_list = range(len(A.columns))
    actual_record_list = db._get_tokenized_table(A, A_key, feature_list)
    expected_record_list = []
    test_file_path = os.sep.join([debugblocker_datasets_path,
                                  'test_get_tokenized_table_1.txt'])
    with open(test_file_path, 'r') as f:
        for line in f:
            expected_record_list.append(line.strip().split(' '))
    self.assertEqual(actual_record_list, expected_record_list)
def validate_metadata(C, l_output_attrs=None, r_output_attrs=None,
                      l_output_prefix='ltable_', r_output_prefix='rtable_',
                      l_key='ID', r_key='ID'):
    s1 = ['_id', l_output_prefix + l_key, r_output_prefix + r_key]
    if l_output_attrs:
        s1 += [l_output_prefix + x for x in l_output_attrs if x != l_key]
    if r_output_attrs:
        s1 += [r_output_prefix + x for x in r_output_attrs if x != r_key]
    s1 = sorted(s1)
    assert_equal(s1, sorted(C.columns))
    assert_equal(em.get_key(C), '_id')
    assert_equal(em.get_property(C, 'fk_ltable'), l_output_prefix + l_key)
    assert_equal(em.get_property(C, 'fk_rtable'), r_output_prefix + r_key)
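# A short, hypothetical example of how validate_metadata might be invoked on a
# blocker output: a sketch, assuming A.csv/B.csv exist with 'ID', 'name', and
# 'zipcode' columns and that py_entitymatching is importable as em.
def _demo_validate_metadata():
    import py_entitymatching as em

    A = em.read_csv_metadata('A.csv', key='ID')
    B = em.read_csv_metadata('B.csv', key='ID')
    ab = em.AttrEquivalenceBlocker()
    C = ab.block_tables(A, B, 'zipcode', 'zipcode',
                        l_output_attrs=['name'], r_output_attrs=['name'])
    # The candidate set should expose _id, the two prefixed foreign keys,
    # and the prefixed output attributes.
    validate_metadata(C, l_output_attrs=['name'], r_output_attrs=['name'])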
def test_get_tokenized_table_1(self):
    A = read_csv_metadata(path_a, key='ID')
    A_key = em.get_key(A)
    feature_list = range(len(A.columns))
    actual_record_list = db._get_tokenized_table(A, A_key, feature_list)
    expected_record_list = [
        [('a1', 0), ('kevin', 1), ('smith', 1), ('1989', 2), ('30', 3),
         ('607', 4), ('from', 4), ('st,', 4), ('san', 4), ('francisco', 4),
         ('94107', 5)],
        [('a2', 0), ('michael', 1), ('franklin', 1), ('1988', 2), ('28', 3),
         ('1652', 4), ('stockton', 4), ('st,', 4), ('san', 4),
         ('francisco', 4), ('94122', 5)],
        [('a3', 0), ('william', 1), ('bridge', 1), ('1986', 2), ('32', 3),
         ('3131', 4), ('webster', 4), ('st,', 4), ('san', 4),
         ('francisco', 4), ('94107', 5)],
        [('a4', 0), ('binto', 1), ('george', 1), ('1987', 2), ('32', 3),
         ('423', 4), ('powell', 4), ('st,', 4), ('san', 4),
         ('francisco', 4), ('94122', 5)],
        [('a5', 0), ('alphonse', 1), ('kemper', 1), ('1984', 2), ('35', 3),
         ('1702', 4), ('post', 4), ('street,', 4), ('san', 4),
         ('francisco', 4), ('94122', 5)]]
    self.assertEqual(actual_record_list, expected_record_list)
# NOTE: imports and the start timestamp were implied by the original script;
# they are added here so it runs standalone. The black-box function
# number_10_percent_comparision must be supplied by the user elsewhere.
import os
from datetime import datetime

import pandas as pd
import py_entitymatching as em

startTime = datetime.now()


def main():
    # WELCOME TO MY MAGELLAN RUN SCRIPT
    print("\n-------------WELCOME TO MY MAGELLAN RUN SCRIPT-------------\n")

    # Get the datasets directory
    datasets_dir = 'B:\\McMaster\\CAS 764 - Advance Topics in Data Management\\Project\\Data\\'
    print("- Dataset directory: " + datasets_dir)
    print("- List of folders/files: ")
    print(os.listdir(datasets_dir))
    print("- Please enter new dataset folder name:")
    datasets_dir += input()
    print("- Dataset directory set to: " + datasets_dir)
    dateset_dir_files = os.listdir(datasets_dir)
    print("- List of files in dataset folder: ")
    print(dateset_dir_files)

    # Get the path of the input table A
    print("- Enter an index for Table A file (0-x):")
    file_index_A = input()
    filename_A = dateset_dir_files[int(file_index_A)]
    print("Table A file set to: " + filename_A)
    path_A = datasets_dir + os.sep + filename_A

    # Get the path of the input table B
    print("- Enter an index for Table B file (0-x):")
    file_index_B = input()
    filename_B = dateset_dir_files[int(file_index_B)]
    print("Table B file set to: " + filename_B)
    path_B = datasets_dir + os.sep + filename_B

    # Print Table A column names
    A = em.read_csv_metadata(path_A)
    print("- List of columns of Table A: ")
    print(list(A.columns))
    # Get the Table A id/primary key column name
    print('- Enter Table A primary key column index (ex. 0):')
    pk_A_index = input()
    pk_A = A.columns[int(pk_A_index)]

    # Print Table B column names
    B = em.read_csv_metadata(path_B)
    print("- List of columns of Table B: ")
    print(list(B.columns))
    # Get the Table B id/primary key column name
    print('- Enter Table B primary key column index (ex. 0):')
    pk_B_index = input()
    # Fixed copy-paste bug: the original read A.columns[int(pk_A_index)] here.
    pk_B = B.columns[int(pk_B_index)]

    # READING TABLES AND SETTING METADATA
    print("\n-------------READING TABLES AND SETTING METADATA-------------\n")
    # Both read csv and set metadata id as ID column
    # A = em.read_csv_metadata(path_A, key=pk_A)
    # B = em.read_csv_metadata(path_B, key=pk_B)
    em.set_key(A, pk_A)
    em.set_key(B, pk_B)

    # Number of tuples
    print('- Number of tuples in A: ' + str(len(A)))
    print('- Number of tuples in B: ' + str(len(B)))
    print('- Number of tuples in A X B (i.e the cartesian product): ' +
          str(len(A) * len(B)))

    # Print first 5 tuples of tables
    print(A.head())
    print(B.head())

    # Display the keys of the input tables
    print("- Table A primary key: " + em.get_key(A))
    print("- Table B primary key: " + em.get_key(B))

    # DOWNSAMPLING
    print("\n-------------DOWNSAMPLING-------------\n")
    print("- Do you want to use downsampling? (y or n):")
    print("- Table A: " + str(len(A)) + ", Table B: " + str(len(B)))
    print("- NOTE: Recommended if both tables have 100K+ tuples.")
    is_downsample = input()
    if is_downsample == 'y':
        print("- Size of the downsampled tables (ex. 200):")
        downsample_size = input()
        # If the tables are large we can downsample the tables like this
        A1, B1 = em.down_sample(A, B, int(downsample_size), 1, show_progress=False)
        print("- Length of Table A1: " + str(len(A1)))
        print("- Length of Table B1: " + str(len(B1)))

    # BLOCKING
    print("\n-------------BLOCKING-------------\n")
    print("- Do you want to use blocking? (y or n):")
    is_blocking = input()
    if is_blocking == 'y':
        # The two tables must share the same column names
        if list(A.columns) == list(B.columns):
            C_attr_eq = []   # Attr Equ blocker result list
            C_overlap = []   # Overlap blocker result list
            C_blackbox = []  # BlackBox blocker result list
            # Left and right table attribute prefixes
            l_prefix = "ltable_"
            r_prefix = "rtable_"
            print("\n- List of columns: ")
            print(list(A.columns))
            # Labeling output table column selection
            print("\n- Enter the indexes of columns that you want to see in labeling table (0-" +
                  str(len(A.columns) - 1) + "):")
            out_attr = []
            for i in range(1, len(A.columns)):
                print("- Finish with empty character(enter+enter) " + str(i))
                add_to_attr = input()
                if add_to_attr == '':
                    break
                # Get indexes from user and add columns into out_attr list
                out_attr.append(A.columns[int(add_to_attr)])
            # Print output attributes
            print(out_attr)

            # Loop for adding/combining new blockers
            while True:
                # Blocker selection
                print("\n- Do you want to use Attribute Equivalence[ab] (same), "
                      "Overlap[ob] (similar) or Blackbox[bb] blocker:")
                blocker_selection = input()

                # ----- Attribute Equivalence Blocker -----
                if blocker_selection == 'ab':
                    # Create attribute equivalence blocker
                    ab = em.AttrEquivalenceBlocker()
                    # Counter for indexes
                    attr_eq_counter = 0
                    # Check if Overlap Blocker was used before
                    if C_overlap and not C_overlap[-1].empty:
                        print("\n- Do you want to work on Overlap Blocker candidate set or not (y or n):")
                        use_cand_set = input()
                        if use_cand_set == 'y':
                            C_attr_eq.append(C_overlap[-1])  # Add last output of ob
                            attr_eq_counter += 1  # Skip block_tables the first time
                    # Check if BlackBox Blocker was used before
                    if C_blackbox and not C_blackbox[-1].empty:
                        print("\n- Do you want to work on BlackBox Blocker candidate set or not (y or n):")
                        use_cand_set = input()
                        if use_cand_set == 'y':
                            C_attr_eq.append(C_blackbox[-1])  # Add last output of bb
                            attr_eq_counter += 1  # Skip block_tables the first time
                    # Loop for adding more columns/attributes into Attr Equ blocker
                    while True:
                        # List column names
                        print("\n- List of columns: ")
                        print(list(A.columns))
                        # Get blocking attribute/column
                        print("\n- Which column (w/ index) to use for equivalence blocking? (ex. 1):")
                        blocking_col_index = input()
                        blocking_col = A.columns[int(blocking_col_index)]
                        print("\n- Do you want to add missing values into blocking? (y or n):")
                        add_missing_val = (input() == 'y')
                        # First time using Attr Equ blocker, use A and B
                        if attr_eq_counter == 0:
                            # Block using selected (blocking_col) attribute on A and B
                            C_attr_eq.append(
                                ab.block_tables(A, B, blocking_col, blocking_col,
                                                l_output_attrs=out_attr,
                                                r_output_attrs=out_attr,
                                                l_output_prefix=l_prefix,
                                                r_output_prefix=r_prefix,
                                                allow_missing=add_missing_val,
                                                n_jobs=-1))
                        # Not first time, add new constraint onto previous candidate set
                        else:
                            # Block using selected attribute on previous (last=-1) candidate set
                            C_attr_eq.append(
                                ab.block_candset(C_attr_eq[-1],
                                                 l_block_attr=blocking_col,
                                                 r_block_attr=blocking_col,
                                                 allow_missing=add_missing_val,
                                                 n_jobs=-1, show_progress=False))
                        # DEBUG BLOCKING
                        print("\n- Attribute Equivalence Blocker Debugging...\n")
                        # Debug last blocker output
                        dbg = em.debug_blocker(C_attr_eq[-1], A, B, output_size=200, n_jobs=-1)
                        # Display first few tuple pairs from the debug_blocker's output
                        print("\n- Blocking debug results:")
                        print(dbg.head())
                        attr_eq_counter += 1  # Increase the counter
                        # Continue to use Attribute Equivalence Blocker or not
                        print("\n- Length of candidate set: " + str(len(C_attr_eq[-1])))
                        print("- Add another column into Attribute Equivalence Blocker[a] "
                              "OR Reset last blocker's output[r]:")
                        ab_next_operation = input().lower()
                        # Continue using Attribute Equivalence Blocker
                        if ab_next_operation == 'a':
                            continue
                        # Reset/remove last blocker's output from candidate set list
                        elif ab_next_operation == 'r':
                            del C_attr_eq[-1]
                            print("\n- Last blocker output removed!")
                            print("- Continue to use Attribute Equivalence Blocker (y or n):")
                            ab_next_operation = input()
                            if ab_next_operation == 'n':
                                break
                        # Finish Attribute Equivalence Blocker
                        else:
                            break

                # ----- Overlap Blocker -----
                elif blocker_selection == 'ob':
                    # Create overlap blocker
                    ob = em.OverlapBlocker()
                    # Counter for indexes
                    overlap_counter = 0
                    # Check if Attribute Equivalence Blocker was used before
                    if C_attr_eq and not C_attr_eq[-1].empty:
                        print("\n- Do you want to work on Attribute Equivalence Blocker candidate set or not (y or n):")
                        use_cand_set = input()
                        if use_cand_set == 'y':
                            C_overlap.append(C_attr_eq[-1])  # Add last output of ab
                            overlap_counter += 1  # Skip block_tables the first time
                    # Check if BlackBox Blocker was used before
                    if C_blackbox and not C_blackbox[-1].empty:
                        print("\n- Do you want to work on BlackBox Blocker candidate set or not (y or n):")
                        use_cand_set = input()
                        if use_cand_set == 'y':
                            C_overlap.append(C_blackbox[-1])  # Add last output of bb
                            overlap_counter += 1  # Skip block_tables the first time
                    # Loop for adding more columns/attributes into Overlap blocker
                    while True:
                        # List column names
                        print("- List of columns: ")
                        print(list(A.columns))
                        # Get blocking attribute/column
                        print("- Which column (w/ index) to use for overlap blocking? (ex. 1):")
                        blocking_col_index = input()
                        blocking_col = A.columns[int(blocking_col_index)]
                        print("\n- Do you want to add missing values into blocking? (y or n):")
                        add_missing_val = (input() == 'y')
                        print("\n- Use words as a token? (y or n):")
                        use_word_level = input()
                        if use_word_level == 'y':
                            use_word_level = True
                            q_gram_value = None
                        else:
                            use_word_level = False
                            print("\n- Q-gram q value (ex. 2 --> JO HN SM IT H):")
                            q_gram_value = int(input())
                        print("\n- Enter the overlap size (# of tokens that overlap):")
                        overlap_size = int(input())
                        print("\n- Do you want to remove (a, an, the) from token set? (y or n):")
                        use_stop_words = (input() == 'y')
                        # First time using Overlap blocker, use A and B
                        if overlap_counter == 0:
                            # Block using selected (blocking_col) attribute on A and B
                            C_overlap.append(
                                ob.block_tables(A, B, blocking_col, blocking_col,
                                                l_output_attrs=out_attr,
                                                r_output_attrs=out_attr,
                                                l_output_prefix=l_prefix,
                                                r_output_prefix=r_prefix,
                                                rem_stop_words=use_stop_words,
                                                q_val=q_gram_value,
                                                word_level=use_word_level,
                                                overlap_size=overlap_size,
                                                allow_missing=add_missing_val,
                                                n_jobs=-1))
                        # Not first time, add new constraint onto previous candidate set
                        else:
                            # Block using selected attribute on previous (last=-1) candidate set
                            C_overlap.append(
                                ob.block_candset(C_overlap[-1],
                                                 l_overlap_attr=blocking_col,
                                                 r_overlap_attr=blocking_col,
                                                 rem_stop_words=use_stop_words,
                                                 q_val=q_gram_value,
                                                 word_level=use_word_level,
                                                 overlap_size=overlap_size,
                                                 allow_missing=add_missing_val,
                                                 n_jobs=-1, show_progress=False))
                        # DEBUG BLOCKING
                        print("\n- Overlap Blocker Debugging...\n")
                        # Debug last blocker output
                        dbg = em.debug_blocker(C_overlap[-1], A, B, output_size=200, n_jobs=-1)
                        # Display first few tuple pairs from the debug_blocker's output
                        print("\n- Blocking debug results:")
                        print(dbg.head())
                        overlap_counter += 1  # Increase the counter
                        # Continue to use Overlap Blocker or not
                        print("\n- Length of candidate set: " + str(len(C_overlap[-1])))
                        print("- Add another column into Overlap Blocker[a] OR Reset last blocker's output[r]:")
                        ob_next_operation = input().lower()
                        # Continue using Overlap Blocker
                        if ob_next_operation == 'a':
                            continue
                        # Reset/remove last blocker's output from candidate set list
                        elif ob_next_operation == 'r':
                            del C_overlap[-1]
                            print("\n- Last blocker output removed!")
                            print("- Continue to use Overlap Blocker (y or n):")
                            ob_next_operation = input()
                            if ob_next_operation == 'n':
                                break
                        # Finish Overlap Blocker
                        else:
                            break

                # ----- BlackBox Blocker -----
                elif blocker_selection == 'bb':
                    # Create blackbox blocker
                    bb = em.BlackBoxBlocker()
                    # Counter for indexes
                    blackbox_counter = 0
                    # Check if Attribute Equivalence Blocker was used before
                    if C_attr_eq and not C_attr_eq[-1].empty:
                        print("\n- Do you want to work on Attribute Equivalence Blocker candidate set or not (y or n):")
                        use_cand_set = input()
                        if use_cand_set == 'y':
                            C_blackbox.append(C_attr_eq[-1])  # Add last output of ab
                            blackbox_counter += 1  # Skip block_tables the first time
                    # Check if Overlap Blocker was used before
                    if C_overlap and not C_overlap[-1].empty:
                        print("\n- Do you want to work on Overlap Blocker candidate set or not (y or n):")
                        use_cand_set = input()
                        if use_cand_set == 'y':
                            C_blackbox.append(C_overlap[-1])  # Add last output of ob
                            blackbox_counter += 1  # Skip block_tables the first time
                    # Loop for adding more columns/attributes into BlackBox blocker
                    while True:
                        # Set function (number_10_percent_comparision must be defined by the user)
                        bb.set_black_box_function(number_10_percent_comparision)
                        # First time using BlackBox blocker, use A and B
                        # (the original tested overlap_counter here, which was a bug)
                        if blackbox_counter == 0:
                            # Block on A and B
                            C_blackbox.append(
                                bb.block_tables(A, B,
                                                l_output_attrs=out_attr,
                                                r_output_attrs=out_attr,
                                                l_output_prefix=l_prefix,
                                                r_output_prefix=r_prefix,
                                                n_jobs=-1, show_progress=False))
                        # Not first time, add new constraint onto previous candidate set
                        else:
                            # Block on previous (last=-1) candidate set
                            C_blackbox.append(
                                bb.block_candset(C_blackbox[-1],
                                                 n_jobs=-1, show_progress=False))
                        # DEBUG BLOCKING
                        print("\n- BlackBox Blocker Debugging...\n")
                        # Debug last blocker output
                        dbg = em.debug_blocker(C_blackbox[-1], A, B, output_size=200, n_jobs=-1)
                        # Display first few tuple pairs from the debug_blocker's output
                        print("\n- Blocking debug results:")
                        print(dbg.head())
                        blackbox_counter += 1  # Increase the counter
                        # Continue to use BlackBox Blocker or not
                        print("\n- Length of candidate set: " + str(len(C_blackbox[-1])))
                        print("- Add another column into BlackBox Blocker[a] OR Reset last blocker's output[r]:")
                        bb_next_operation = input().lower()
                        # Continue using BlackBox Blocker
                        if bb_next_operation == 'a':
                            continue
                        # Reset/remove last blocker's output from candidate set list
                        elif bb_next_operation == 'r':
                            del C_blackbox[-1]
                            print("\n- Last blocker output removed!")
                            print("- Continue to use BlackBox Blocker (y or n):")
                            bb_next_operation = input()
                            if bb_next_operation == 'n':
                                break
                        # Finish BlackBox Blocker
                        else:
                            break

                print("\n- Do you want to add/use another blocker? (y or n):")
                blocker_decision = input()
                if blocker_decision == 'n':
                    break

            print("\n- Which blocker output do you want to use? "
                  "(Attr Equ-ab, Overlap-ob, BlackBox-bb, Union-un)")
            blocker_output_selection = input()
            # Attribute Equivalence
            if blocker_output_selection == "ab":
                C = C_attr_eq[-1]
            # Overlap
            elif blocker_output_selection == "ob":
                C = C_overlap[-1]
            # BlackBox
            elif blocker_output_selection == "bb":
                C = C_blackbox[-1]
            # Union of blockers
            elif blocker_output_selection == "un":
                # Combine/union blockers candidate sets
                print("\n- TODO: Unions Attr Equ and Overlap only!")
                if (C_attr_eq and C_overlap and
                        not C_attr_eq[-1].empty and not C_overlap[-1].empty):
                    # Both blocker types used
                    C = em.combine_blocker_outputs_via_union([C_attr_eq[-1], C_overlap[-1]])
                    print("\n- Blockers candidate set outputs combined via union.")
                else:
                    # Error
                    C = []
                    print("\n- ERROR: Candidate set C is empty! Check blockers' results.")
            # Error
            else:
                C = []
                print("\n- ERROR: Candidate set C is empty! Check blockers' results.")
            print("\n- Length of C: " + str(len(C)))
        else:
            print("\n- The two tables' column names are different; they must be the same")
            print(list(A.columns))
            print(list(B.columns))

    # SAMPLING & LABELING
    print("\n-------------SAMPLING&LABELING-------------\n")
    print("- Choose sampling size (eg. 450):")
    sampling_size = input()
    while int(sampling_size) > len(C):
        print("- Sampling size cannot be bigger than " + str(len(C)))
        sampling_size = input()
    # Sample candidate set
    S = em.sample_table(C, int(sampling_size))
    print("- New window will pop-up for " + sampling_size + " sized table.")
    print("- If there is a match, change tuple's label value to 1")
    # Label S
    G = em.label_table(S, 'label')

    # DEVELOPMENT AND EVALUATION
    print("\n-------------DEVELOPMENT AND EVALUATION-------------\n")
    # Split S into development set (I) and evaluation set (J)
    IJ = em.split_train_test(G, train_proportion=0.7, random_state=0)
    I = IJ['train']
    J = IJ['test']

    # SELECTING THE BEST MATCHER
    print("\n-------------SELECTING THE BEST MATCHER-------------\n")
    # Create a set of ML-matchers
    dt = em.DTMatcher(name='DecisionTree', random_state=0)
    svm = em.SVMMatcher(name='SVM', random_state=0)
    rf = em.RFMatcher(name='RF', random_state=0)
    lg = em.LogRegMatcher(name='LogReg', random_state=0)
    ln = em.LinRegMatcher(name='LinReg')
    nb = em.NBMatcher(name='NaiveBayes')
    print("\n- 6 different ML-matchers created: DT, SVM, RF, LogReg, LinReg, NB")

    print("\n- Creating features...")
    # Generate features
    feature_table = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)
    print("\n- Features list:")
    # List the names of the features generated
    print(feature_table['feature_name'])

    print("\n- Converting the development set to feature vectors...")
    # Convert I into a set of feature vectors using feature_table
    H = em.extract_feature_vecs(I, feature_table=feature_table,
                                attrs_after='label', show_progress=False)
    print("\n- Feature table first rows:")
    # Display first few rows
    print(H.head())

    # Primary key of tables = prefix + pk, e.g. ltable_ID, rtable_ID
    ltable_pk = l_prefix + pk_A
    rtable_pk = r_prefix + pk_B

    # Check if the feature vectors contain missing values.
    # A return value of True means that there are missing values.
    # (The original used any(pd.notnull(H)), which tests the opposite.)
    is_missing_values = pd.isnull(H).values.any()
    print("\n- Does feature vector have missing values: " + str(is_missing_values))
    if is_missing_values:
        # Impute feature vectors with the mean of the column values.
        H = em.impute_table(H,
                            exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
                            strategy='mean', val_all_nans=0.0)
        # print("\n- Feature table first rows:")
        # print(H.head())
        print("- Impute table function used for missing values.")

    print("\n- Selecting the best matcher using cross-validation...")
    # Select the best ML matcher using CV
    result = em.select_matcher(matchers=[dt, rf, svm, ln, lg, nb], table=H,
                               exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
                               k=5, target_attr='label',
                               metric_to_select_matcher='f1', random_state=0)
    print("\n- Results:")
    print(result['cv_stats'])

    # DEBUGGING THE MATCHER
    print("\n-------------DEBUGGING THE MATCHER-------------\n")
    # Split feature vectors into train and test
    UV = em.split_train_test(H, train_proportion=0.5)
    U = UV['train']
    V = UV['test']
    # Debug random forest using GUI
    em.vis_debug_rf(rf, U, V,
                    exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
                    target_attr='label')

    print("\n- Do you want to add another feature?")
    H = em.extract_feature_vecs(I, feature_table=feature_table,
                                attrs_after='label', show_progress=False)
    # Check if the feature vectors contain missing values
    is_missing_values = pd.isnull(H).values.any()
    print("\n- Does feature vector have missing values: " + str(is_missing_values))
    if is_missing_values:
        # Impute feature vectors with the mean of the column values.
        H = em.impute_table(H,
                            exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
                            strategy='mean')
        print("\n- Feature table first rows:")
        # Display first few rows
        print(H.head())
    # Select the best ML matcher using CV
    result = em.select_matcher([dt, rf, svm, ln, lg, nb], table=H,
                               exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
                               k=5, target_attr='label',
                               metric_to_select_matcher='f1', random_state=0)
    print("\n- Results:")
    print(result['cv_stats'])

    # EVALUATING THE MATCHING OUTPUT
    print("\n-------------EVALUATING THE MATCHING OUTPUT-------------\n")
    print("\n- Converting the evaluation set to feature vectors...")
    # Convert J into a set of feature vectors using feature table
    L = em.extract_feature_vecs(J, feature_table=feature_table,
                                attrs_after='label', show_progress=False)
    # Check if the feature vectors contain missing values
    is_missing_values = pd.isnull(L).values.any()
    print("\n- Does feature vector have missing values: " + str(is_missing_values))
    if is_missing_values:
        # Impute feature vectors with the mean of the column values.
        L = em.impute_table(L,
                            exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
                            strategy='mean')
        print("\n- Feature table first rows:")
        # Display first few rows
        print(L.head())

    print("\n- Training the selected matcher...")
    # Train using feature vectors from I
    rf.fit(table=H,
           exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
           target_attr='label')

    print("\n- Predicting the matches...")
    # Predict on L
    predictions = rf.predict(table=L,
                             exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
                             append=True, target_attr='predicted', inplace=False)

    print("\n- Evaluating the prediction...")
    # Evaluate the predictions; print_eval_summary prints the summary itself,
    # so wrapping it in print() would only show None.
    eval_result = em.eval_matches(predictions, 'label', 'predicted')
    em.print_eval_summary(eval_result)

    print("\n- Time elapsed:")
    print(datetime.now() - startTime)
    print("\n-------------END-------------\n")


if __name__ == '__main__':
    main()
path_A = '../data/A.csv'
path_B = '../data/B.csv'

# Load csv files as dataframes and set the key attribute in the dataframe
A = em.read_csv_metadata(path_A, key='ID')
B = em.read_csv_metadata(path_B, key='ID')

print('Number of tuples in A: ' + str(len(A)))
print('Number of tuples in B: ' + str(len(B)))
print('Number of tuples in A X B (i.e the cartesian product): ' +
      str(len(A) * len(B)))

################################## Blocker Portion ##################################
print('Begin blocking stage')

# Display the key attributes of table A and B.
em.get_key(A), em.get_key(B)

# Create attribute equivalence blocker
ab = em.AttrEquivalenceBlocker()
# Block tables on the 'Release Date' attribute: pairs with the same release
# date are included in the candidate set
C1 = ab.block_tables(A, B, 'Release Date', 'Release Date',
                     l_output_attrs=['Title', 'Genre', 'Score', 'Release Date',
                                     'Rating', 'Directed By', 'Written By',
                                     'Studio'],
                     r_output_attrs=['Title', 'Genre', 'Score', 'Release Date',
                                     'Rating', 'Directed By', 'Written By',
                                     'Studio'])
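# A possible next step (a sketch, not part of the original script): inspect the
# candidate set size and debug the blocker output with py_entitymatching's
# documented debug_blocker call before moving on to sampling and labeling.
print('Number of tuples in candidate set C1: ' + str(len(C1)))
dbg = em.debug_blocker(C1, A, B, output_size=200)
print(dbg.head())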
def test_build_id_to_index_map_1(self):
    A = read_csv_metadata(path_a, key='ID')
    key = em.get_key(A)
    actual_rec_id_to_idx = db._build_id_to_index_map(A, key)
    expected_rec_id_to_idx = {'a1': 0, 'a3': 2, 'a2': 1, 'a5': 4, 'a4': 3}
    self.assertEqual(actual_rec_id_to_idx, expected_rec_id_to_idx)
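# For intuition: as the expected dict above shows, _build_id_to_index_map pairs
# each record's key value with its row position. For a table with a default
# RangeIndex the result is equivalent to this sketch (a hypothetical helper,
# not part of the module):
def _id_to_index_sketch(table, key):
    return dict(zip(table[key], range(len(table))))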