def setup(self):
    p = mg.get_install_path()
    path_for_A = os.sep.join(
        [p, 'datasets', 'example_datasets', 'bikes', 'A.csv'])
    path_for_B = os.sep.join(
        [p, 'datasets', 'example_datasets', 'bikes', 'B.csv'])
    l_key = 'id'
    r_key = 'id'
    self.A = mg.read_csv_metadata(path_for_A)
    mg.set_key(self.A, l_key)
    self.B = mg.read_csv_metadata(path_for_B)
    mg.set_key(self.B, r_key)
    l_block_attr_1 = 'city_posted'
    r_block_attr_1 = 'city_posted'
    l_output_attrs = ['bike_name', 'city_posted', 'km_driven', 'price',
                      'color', 'model_year']
    r_output_attrs = ['bike_name', 'city_posted', 'km_driven', 'price',
                      'color', 'model_year']
    self.ab = mg.AttrEquivalenceBlocker()
    self.C = self.ab.block_tables(self.A, self.B,
                                  l_block_attr_1, r_block_attr_1,
                                  l_output_attrs, r_output_attrs,
                                  verbose=False)
    self.l_block_attr = 'model_year'
    self.r_block_attr = 'model_year'
def calc_gold_sim(gold_path, ltable_path, rtable_path, lkey, rkey, fields):
    ltable = mg.read_csv_metadata(ltable_path, key=lkey)
    rtable = mg.read_csv_metadata(rtable_path, key=rkey)
    gold = mg.read_csv_metadata(gold_path)
    sim_list = []
    for tuple in gold.itertuples():
        lrec = list(ltable.ix[int(tuple[1])])
        rrec = list(rtable.ix[int(tuple[2])])
        lset = tokenize(lrec, fields)
        rset = tokenize(rrec, fields)
        sim_list.append(
            (tuple[1], tuple[2], len(lset & rset) * 1.0 / len(lset | rset)))
    sim_list = sorted(sim_list, key=lambda x: x[2], reverse=True)
    cols = ltable.columns
    for i in range(len(sim_list)):
        tuple = sim_list[i]
        print '=====rank ' + str(i + 1) + '====='
        print 'pair:(' + str(tuple[0]) + ', ' + str(tuple[1]) + ')'
        print 'sim:', tuple[2]
        lrec = list(ltable.ix[int(tuple[0])])
        rrec = list(rtable.ix[int(tuple[1])])
        for value in fields:
            print cols[value] + ':', '<left>' + str(
                lrec[value]) + '\t<right>' + str(rrec[value])
        print '\n'
    return
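# The tokenize() helper used above is not defined in this snippet. A plausible
# minimal sketch, assuming it returns the set of whitespace-delimited tokens
# drawn from the selected field positions of a record (the set operators
# lset & rset and lset | rset above imply it returns a set):
def tokenize(rec, fields):
    tokens = set()
    for idx in fields:
        # pull tokens only from the fields selected for comparison
        tokens.update(str(rec[idx]).lower().split())
    return tokens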
def setup(self):
    path_for_A = os.sep.join([datasets_path, 'anime', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'anime', 'B.csv'])
    self.l_block_attr = 'Year'
    self.r_block_attr = 'Year'
    self.l_output_attrs = ['Title', 'Year', 'Episodes']
    self.r_output_attrs = ['Title', 'Year', 'Episodes']
    self.A = mg.read_csv_metadata(path_for_A)
    mg.set_key(self.A, 'ID')
    self.B = mg.read_csv_metadata(path_for_B)
    mg.set_key(self.B, 'ID')
def setup(self):
    path_for_A = os.sep.join([datasets_path, 'electronics', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'electronics', 'B.csv'])
    self.l_block_attr = 'Brand'
    self.r_block_attr = 'Brand'
    self.l_output_attrs = ['Brand', 'Amazon_Price']
    self.r_output_attrs = ['Brand', 'Price']
    self.A = mg.read_csv_metadata(path_for_A)
    mg.set_key(self.A, 'ID')
    self.B = mg.read_csv_metadata(path_for_B)
    mg.set_key(self.B, 'ID')
def setup(self):
    path_for_A = os.sep.join([datasets_path, 'restaurants', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'restaurants', 'B.csv'])
    self.l_block_attr = 'PHONENUMBER'
    self.r_block_attr = 'PHONENUMBER'
    self.l_output_attrs = ['NAME', 'PHONENUMBER', 'ADDRESS']
    self.r_output_attrs = ['NAME', 'PHONENUMBER', 'ADDRESS']
    self.A = mg.read_csv_metadata(path_for_A)
    mg.set_key(self.A, 'ID')
    self.B = mg.read_csv_metadata(path_for_B)
    mg.set_key(self.B, 'ID')
def setup(self):
    path_for_A = os.sep.join([datasets_path, 'citations', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'citations', 'B.csv'])
    self.l_block_attr = 'year'
    self.r_block_attr = 'year'
    self.l_output_attrs = ['title', 'author', 'year', 'ENTRYTYPE']
    self.r_output_attrs = ['title', 'author', 'year', 'ENTRYTYPE']
    self.A = mg.read_csv_metadata(path_for_A)
    mg.set_key(self.A, 'ID')
    self.B = mg.read_csv_metadata(path_for_B)
    mg.set_key(self.B, 'ID')
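# A minimal sketch of how these setup() fixtures might be exercised by a
# timing method. The method name time_block_tables and the local blocker
# instance are assumptions for illustration; the block_tables signature
# mirrors the calls seen elsewhere in this section.
def time_block_tables(self):
    ab = mg.AttrEquivalenceBlocker()
    ab.block_tables(self.A, self.B, self.l_block_attr, self.r_block_attr,
                    self.l_output_attrs, self.r_output_attrs)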
def setup(self):
    path_for_A = os.sep.join([datasets_path, 'books', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'books', 'B.csv'])
    A = mg.read_csv_metadata(path_for_A)
    mg.set_key(A, 'ID')
    B = mg.read_csv_metadata(path_for_B)
    mg.set_key(B, 'ID')
    self.C = ab.block_tables(A, B, 'Author', 'Author',
                             ['Title', 'Author', 'ISBN13', 'Publisher'],
                             ['Title', 'Author', 'ISBN13', 'Publisher'])
    self.l_block_attr = 'ISBN13'
    self.r_block_attr = 'ISBN13'
def setup(self):
    path_for_A = os.sep.join([datasets_path, 'anime', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'anime', 'B.csv'])
    A = mg.read_csv_metadata(path_for_A)
    mg.set_key(A, 'ID')
    B = mg.read_csv_metadata(path_for_B)
    mg.set_key(B, 'ID')
    self.C = ab.block_tables(A, B, 'Year', 'Year',
                             ['Title', 'Year', 'Episodes'],
                             ['Title', 'Year', 'Episodes'])
    self.l_block_attr = 'Episodes'
    self.r_block_attr = 'Episodes'
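# These two setups build a candidate set self.C and then switch the block
# attributes, suggesting a follow-on blocking step over the candidate set. A
# hedged sketch of such a timing method; block_candset and its positional
# argument order are assumptions about the AttrEquivalenceBlocker API, not
# confirmed by this file.
def time_block_candset(self):
    ab.block_candset(self.C, self.l_block_attr, self.r_block_attr)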
def setup(self):
    path_for_A = os.sep.join([datasets_path, 'books', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'books', 'B.csv'])
    self.l_block_attr = 'Author'
    self.r_block_attr = 'Author'
    self.l_output_attrs = ['Title', 'Author', 'ISBN13', 'Publisher',
                           'Publication_Date']
    self.r_output_attrs = ['Title', 'Author', 'ISBN13', 'Publisher',
                           'Publication_Date']
    self.A = mg.read_csv_metadata(path_for_A)
    mg.set_key(self.A, 'ID')
    self.B = mg.read_csv_metadata(path_for_B)
    mg.set_key(self.B, 'ID')
def setup(self):
    path_for_A = os.sep.join([datasets_path, 'books', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'books', 'B.csv'])
    self.l_block_attr = 'Author'
    self.r_block_attr = 'Author'
    self.l_output_attrs = ['Title', 'Author', 'ISBN13', 'Publisher',
                           'Publication_Date']
    self.r_output_attrs = ['Title', 'Author', 'ISBN13', 'Publisher',
                           'Publication_Date']
    self.A = mg.read_csv_metadata(path_for_A)
    mg.set_key(self.A, 'ID')
    self.B = mg.read_csv_metadata(path_for_B)
    mg.set_key(self.B, 'ID')
def test_set_key_valid():
    df = mg.read_csv_metadata(path_for_A)
    mg.set_key(df, 'ID')
    assert_equal(mg.get_key(df), 'ID')
    mg.del_property(df, 'key')
    assert_equal(len(mg.get_all_properties(df)), 0)
def test_del_property_valid():
    df = mg.read_csv_metadata(path_for_A)
    mg.set_key(df, 'ID')
    assert_equal(mg.get_key(df), 'ID')
    mg.del_property(df, 'key')
    assert_equal(mg.is_property_present_for_df(df, 'key'), False)
    mg.del_catalog()
def test_get_catalog_valid():
    df = mg.read_csv_metadata(path_for_A)
    mg.set_key(df, 'ID')
    assert_equal(len(mg.get_all_properties(df)), 1)
    c = mg.get_catalog()
    assert_equal(len(c), 1)
    mg.del_catalog()
def calc_missed_gold_not_in_recom(cmb_file_path, cand_path, gold_path, K):
    gold = mg.read_csv_metadata(gold_path)
    gold_set = set()
    for tup in gold.itertuples():
        gold_set.add((tup[1], tup[2]))
    cand_set = set()
    fin = open(cand_path, 'r')
    fin.readline()
    for line in fin:
        splits = line.split(',')
        cand_set.add((int(splits[1]), int(splits[2])))
    union_set = {}
    schema, recom_lists = read_wrapped_recom_list(cmb_file_path, K)
    for recom_list in recom_lists:
        for pair in recom_list:
            index = (int(pair[1]), int(pair[2]))
            if index not in union_set:
                union_set[index] = pair
    count = 0
    for gold_pair in gold_set:
        if gold_pair not in union_set and gold_pair not in cand_set:
            count += 1
    print count
def setup(self):
    p = mg.get_install_path()
    path_for_A = os.sep.join([p, 'datasets', 'example_datasets', 'bikes',
                              'A.csv'])
    path_for_B = os.sep.join([p, 'datasets', 'example_datasets', 'bikes',
                              'B.csv'])
    l_key = 'id'
    r_key = 'id'
    self.A = mg.read_csv_metadata(path_for_A)
    mg.set_key(self.A, l_key)
    self.B = mg.read_csv_metadata(path_for_B)
    mg.set_key(self.B, r_key)
    self.l_block_attr = 'city_posted'
    self.r_block_attr = 'city_posted'
    self.l_output_attrs = ['bike_name', 'city_posted', 'km_driven', 'price',
                           'color', 'model_year']
    self.r_output_attrs = ['bike_name', 'city_posted', 'km_driven', 'price',
                           'color', 'model_year']
    self.ab = mg.AttrEquivalenceBlocker()
def combine_recom_lists(indir_path, outpath, ltable_path, rtable_path,
                        field_corres, lkey, rkey):
    ltable = mg.read_csv_metadata(ltable_path, key=lkey)
    rtable = mg.read_csv_metadata(rtable_path, key=rkey)
    filenames = [f for f in listdir(indir_path) if f.endswith('.txt')]
    out = open(outpath, 'w')
    # columns = ltable.columns
    # out.write('@_@_@_@'.join(columns) + '\n')
    schema = [tup[0] for tup in field_corres]
    out.write('@_@_@_@'.join(schema) + '\n')
    # field_corres.remove((lkey, rkey))
    print filenames
    for filename in filenames:
        fin = open(indir_path + filename)
        lines = fin.readlines()
        out.write(str(len(lines)) + '\n')
        recom_map = {}
        for line in lines:
            splits = line.strip().split(' ')
            sim = float(splits[0])
            if sim in recom_map:
                recom_map[sim].append((splits[1], splits[2]))
            else:
                recom_map[sim] = [(splits[1], splits[2])]
        od = collections.OrderedDict(sorted(recom_map.items(), reverse=True))
        print od
        rank = 1
        for sim in od:
            items = od[sim]
            for item in items:
                lrec = ltable.iloc[int(item[0])]
                rrec = rtable.iloc[int(item[1])]
                out.write(str(rank) + '\n' + str(lrec[lkey]) + '\n' +
                          str(rrec[rkey]) + '\n')
                for tup in field_corres:
                    out.write(str(lrec[tup[0]]).replace('\n', '') + '\n')
                for tup in field_corres:
                    out.write(str(rrec[tup[1]]).replace('\n', '') + '\n')
            rank += len(items)
    out.close()
def calc_recom_not_in_gold_sim(recom_path, gold_path, ltable_path,
                               rtable_path, lkey, rkey, fields):
    fin = open(recom_path, 'r')
    lines = fin.readlines()
    recom_list = []
    for line in lines:
        splits = line.split(' ')
        recom_list.append((splits[0], int(splits[1]), int(splits[2])))
    ltable = mg.read_csv_metadata(ltable_path, key=lkey)
    rtable = mg.read_csv_metadata(rtable_path, key=rkey)
    gold = mg.read_csv_metadata(gold_path)
    gold_set = set()
    for tup in gold.itertuples():
        gold_set.add((tup[1], tup[2]))
    sim_list = []
    for tuple in recom_list:
        if (tuple[1], tuple[2]) not in gold_set:
            lrec = list(ltable.ix[tuple[1]])
            rrec = list(rtable.ix[tuple[2]])
            lset = tokenize(lrec, fields)
            rset = tokenize(rrec, fields)
            sim_list.append((tuple[1], tuple[2], tuple[0],
                             len(lset & rset) * 1.0 / len(lset | rset)))
    sim_list = sorted(sim_list, key=lambda x: x[2], reverse=True)
    cols = ltable.columns
    for i in range(len(sim_list)):
        tuple = sim_list[i]
        print '=====rank ' + str(i + 1) + '====='
        print 'pair:(' + str(tuple[0]) + ', ' + str(tuple[1]) + ')'
        print 'topk sim:', tuple[2]
        print 'new calc sim:', tuple[3]
        lrec = list(ltable.ix[int(tuple[0])])
        rrec = list(rtable.ix[int(tuple[1])])
        for value in fields:
            print cols[value] + ':', '<left>' + str(
                lrec[value]) + '\t<right>' + str(rrec[value])
        print '\n'
    return
def get_gold_in_topk_lists(topk_dir, gold_path, cmp_config_seq):
    gold = mg.read_csv_metadata(gold_path)
    gold_set = set()
    for tup in gold.itertuples():
        gold_set.add((tup[1], tup[2]))
    # print gold_set
    topk_lists = os.listdir(topk_dir)
    # print topk_lists
    catched_true_set = set()
    cmp_catched_true_set = set()
    union_set = set()
    cmp_union_set = set()
    if '.DS_Store' in topk_lists:
        topk_lists.remove('.DS_Store')
    for topk_list in topk_lists:
        selected_fields = topk_list[5:-4]
        count = 0
        fin = open(topk_dir + '/' + topk_list)
        match_list = []
        linecount = 0
        for line in fin:
            linecount += 1
            splits = line.split(' ')
            tup = (int(splits[1]) + 1, int(splits[2]) + 1)
            union_set.add(tup)
            if selected_fields in cmp_config_seq:
                cmp_union_set.add(tup)
            if tup in gold_set:
                match_list.append(linecount)
                catched_true_set.add(tup)
                if selected_fields in cmp_config_seq:
                    cmp_catched_true_set.add(tup)
                count += 1
        print topk_list + ': ' + str(count)  # , match_list
    print 'total number of candidates:', len(union_set)
    print 'catched true matches:', len(catched_true_set)
    print 'candidates by cont. configs:', len(cmp_union_set)
    print 'catched true matches by cont. configs:', len(cmp_catched_true_set)
    return len(union_set), len(catched_true_set), len(cmp_union_set), len(
        cmp_catched_true_set)
def setUp(self):
    self.A = mg.read_csv_metadata(path_for_A)
    mg.set_key(self.A, l_key)
    self.B = mg.read_csv_metadata(path_for_B)
    mg.set_key(self.B, r_key)
    self.ab = mg.AttrEquivalenceBlocker()
#####################################################################################################
output_dir = '../results/exp/new/'
use_plain = False
folder = 'Walmart-Amazon'
blocker_type = 'similarity-based'
# cand_name = 'title_overlap<3'
lkey = 'id'
rkey = 'id'
cand_list = ['title_token_cos<0.4', 'title_token_cos<0.5',
             'title_token_jac<0.4', 'title_token_jac<0.5']
for cand_name in cand_list:
    ltable = mg.read_csv_metadata('../datasets/exp_data/cleaned/' + folder +
                                  '/tableA.csv', key=lkey)
    rtable = mg.read_csv_metadata('../datasets/exp_data/cleaned/' + folder +
                                  '/tableB.csv', key=rkey)
    cand_set = mg.read_csv_metadata(
        '../datasets/exp_data/candidate_sets/' + folder + '/' + blocker_type +
        '/' + cand_name + '.csv',
        ltable=ltable, rtable=rtable,
        fk_ltable='ltable_' + lkey, fk_rtable='rtable_' + rkey, key='_id')
    output_size = 200
    output_path = (output_dir + folder + '/' + blocker_type + '/' +
                   cand_name + '/')
    debug_blocker(ltable, rtable, cand_set, use_plain, output_path,
                  output_size)  # final argument assumed; the original snippet
                                # is truncated after output_path
import magellan as mg
import pandas as pd
import os
from PyQt4 import QtCore

datasets_path = os.sep.join([mg.get_install_path(), 'datasets',
                             'test_datasets'])
path_a = os.sep.join([datasets_path, 'A.csv'])
path_b = os.sep.join([datasets_path, 'B.csv'])
path_c = os.sep.join([datasets_path, 'C.csv'])

A = mg.read_csv_metadata(path_a)
B = mg.read_csv_metadata(path_b, key='ID')
C = mg.read_csv_metadata(path_c, ltable=A, rtable=B)
D = mg.label_table(C, 'label')
print(D)

# timer = QtCore.QTimer()
# timer.setInterval(2000)  # 2 seconds
# mg._viewapp.loadFinished.connect(timer.start)
# timer.timeout.connect(mg._viewapp.quit)
# coding=utf-8
import os

import magellan as mg

datasets_path = os.sep.join([mg.get_install_path(), 'datasets',
                             'test_datasets'])
path_c = os.sep.join([datasets_path, 'C.csv'])

A = mg.load_dataset('table_A', key='ID')
B = mg.load_dataset('table_B', key='ID')
C = mg.read_csv_metadata(path_c, ltable=A, rtable=B)

feature_table = mg.get_features_for_matching(A, B)
labels = [0] * 7
labels.extend([1] * 8)
C['labels'] = labels
feature_vectors = mg.extract_feature_vecs(C, feature_table=feature_table,
                                          attrs_after='labels')
rf = mg.RFMatcher()
train_test = mg.train_test_split(feature_vectors)
train = train_test['train']
test = train_test['test']
rf.fit(table=train, exclude_attrs=['ltable_ID', 'rtable_ID', '_id'],
       target_attr='labels')
mg.debug_randomforest_matcher(rf, A.ix[1], B.ix[2],
                              feat_table=feature_table,
                              fv_columns=feature_vectors.columns,
                              exclude_attrs=['ltable_ID', 'rtable_ID', '_id',
                                             'labels'])
print('Hi')
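# A hedged follow-up, not part of the original script: applying the trained
# matcher to the held-out split. The predict() keyword arguments below mirror
# fit() but are assumptions about the RFMatcher API rather than calls taken
# from this file.
predictions = rf.predict(table=test,
                         exclude_attrs=['ltable_ID', 'rtable_ID', '_id',
                                        'labels'],
                         target_attr='predicted', append=True)
print(predictions)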
# coding=utf-8
import logging
import os

import magellan as mg

logging.basicConfig(level=logging.DEBUG)

datasets_path = os.sep.join([mg.get_install_path(), 'datasets',
                             'test_datasets', 'matcherselector'])
path_a = os.sep.join([datasets_path, 'ACM_demo.csv'])
path_b = os.sep.join([datasets_path, 'DBLP_demo.csv'])
path_c = os.sep.join([datasets_path, 'dblp_acm_demo_labels.csv'])

A = mg.read_csv_metadata(path_a, key='id')
B = mg.read_csv_metadata(path_b, key='id')
C = mg.read_csv_metadata(path_c, ltable=B, rtable=A, fk_ltable='ltable.id',
                         fk_rtable='rtable.id', key='_id')

feature_table = mg.get_features_for_matching(A, B)
feature_vectors = mg.extract_feature_vecs(C, feature_table=feature_table,
                                          attrs_after='gold', verbose=True)

# dtmatcher = mg.DTMatcher()
# nbmatcher = mg.NBMatcher()
# rfmatcher = mg.RFMatcher()
# svmmatcher = mg.SVMMatcher()
# linregmatcher = mg.LinRegMatcher()
# logregmatcher = mg.LogRegMatcher()
def test_get_all_properties_invalid():
    df = mg.read_csv_metadata(path_for_A)
    assert_equal(len(mg.get_all_properties(df)), 1)
# mg.set_key(A, 'ID')
# mg.set_metadata(A, 'key', 'ID')
# print mg.get_metadata(A, 'key')
# print mg.get_key(A)
# print mg.is_dfinfo_present(B)
# mg.set_metadata(B, 'key', 'ID')
# print mg.get_metadata(B, 'key')
# mg.set_key(B, 'ID')
# print mg.is_dfinfo_present(B)
# print mg.is_property_present_for_df(B, 'ltable')
# print mg.get_catalog_len()
# mg.del_property(B, 'key')
# print mg.is_property_present_for_df(B, 'ID')
# mg.del_all_properties(A)
# print mg.get_catalog_len()

A = mg.read_csv_metadata(path_for_A)
print A
print mg._catalog
mg.set_key(A, 'ID')
print 'xyx'
# coding=utf-8
import os

import magellan as mg
from magellan.debugmatcher.debug_gui_decisiontree_matcher import \
    _vis_debug_dt, vis_tuple_debug_dt_matcher

datasets_path = os.sep.join([mg.get_install_path(), 'datasets',
                             'test_datasets'])
path_c = os.sep.join([datasets_path, 'C.csv'])

A = mg.load_dataset('table_A', key='ID')
B = mg.load_dataset('table_B', key='ID')
C = mg.read_csv_metadata(path_c, ltable=A, rtable=B)

labels = [0] * 7
labels.extend([1] * 8)
C['labels'] = labels

feature_table = mg.get_features_for_matching(A, B)
feature_vectors = mg.extract_feature_vecs(C, feature_table=feature_table,
                                          attrs_after='labels')

dt = mg.DTMatcher()
dt.fit(table=feature_vectors,
       exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
       target_attr='labels')
vis_tuple_debug_dt_matcher(dt, feature_vectors.ix[0],
                           exclude_attrs=['_id', 'ltable_ID', 'rtable_ID',
                                          'labels'])

# feature_table = mg.get_features_for_matching(A, B)
#
# labels = [0]*7
# labels.extend([1]*8)
def test_set_key_invalid_mv():
    df = mg.read_csv_metadata(path_for_A_dup)
    status = mg.set_key(df, 'ID')
    assert_equal(status, False)
import magellan as mg

A = mg.read_csv_metadata('../magellan/datasets/table_A.csv', key='ID')
print mg.get_catalog()
# mg.to_csv(A, '../magellan/datasets/A.csv')
print 'Hello'
B = mg.read_csv_metadata('../magellan/datasets/A.csv', key='ID')
print mg.get_catalog()

# filepath = '../magellan/datasets/A.metadata_'
# metadata = dict()
# num_lines = 0
# num_lines = sum(1 for line in open(filepath))
# print num_lines
#
# if num_lines > 0:
#     with open(filepath) as f:
#         for i in range(num_lines):
#             line = next(f)
#             print line
#             # if line.startswith('#'):
#             line = line.lstrip('#')
#             tokens = line.split('=')
#             assert len(tokens) is 2, "Error in file, the num tokens is not 2"
#             key = tokens[0].strip()
#             value = tokens[1].strip()
#             if value is not "POINTER":
#                 metadata[key] = value