def get_field_correspondence_list(ltable, rtable, field_corres_list):
    """
    Resolve the list of field correspondences between two tables.

    Parameters
    ----------
    ltable, rtable : input tables, passed through unchanged to
        ``mg.get_attr_corres`` when no correspondence is supplied.
    field_corres_list : sized iterable of correspondences, or None
        User-supplied correspondences. When it is None or empty, the
        correspondence is looked up instead.

    Returns
    -------
    corres_list : list
        A fresh copy of ``field_corres_list`` when one was supplied,
        otherwise the ``'corres'`` entry of
        ``mg.get_attr_corres(ltable, rtable)``.

    Raises
    ------
    AssertionError
        If no correspondence was supplied and the looked-up one is empty.
        (``StandardError``, raised previously, does not exist on Python 3;
        ``AssertionError`` is one of its subclasses on Python 2, so
        handlers written against ``StandardError`` keep working.)
    """
    if field_corres_list is not None and len(field_corres_list) > 0:
        # Copy so the result never aliases the caller's list.
        return list(field_corres_list)

    corres_list = mg.get_attr_corres(ltable, rtable)['corres']
    if len(corres_list) == 0:
        raise AssertionError('Error: the field correspondence list is empty. Nothing can be done!')
    return corres_list
def get_field_correspondence_list(ltable, rtable, field_corres_list):
    """
    Return the field correspondences to use for ``ltable`` and ``rtable``.

    Parameters
    ----------
    ltable, rtable : input tables, only forwarded to
        ``mg.get_attr_corres`` when the correspondence must be looked up.
    field_corres_list : sized iterable of correspondences, or None
        Explicit correspondences; None or an empty container triggers
        the lookup.

    Returns
    -------
    corres_list : list
        A new list holding the items of ``field_corres_list`` when it is
        non-empty, otherwise ``mg.get_attr_corres(ltable, rtable)['corres']``.

    Raises
    ------
    AssertionError
        If the lookup is used and yields an empty correspondence list.
        The original raised ``StandardError``, which was removed in
        Python 3 (the raise itself failed with ``NameError`` there).
        ``AssertionError`` derives from ``StandardError`` on Python 2, so
        existing ``except StandardError`` handlers are unaffected.
    """
    use_lookup = field_corres_list is None or len(field_corres_list) == 0
    if not use_lookup:
        # list() builds an independent copy of the caller's sequence.
        return list(field_corres_list)

    corres_list = mg.get_attr_corres(ltable, rtable)['corres']
    if len(corres_list) == 0:
        raise AssertionError(
            'Error: the field correspondence list is empty. Nothing can be done!'
        )
    return corres_list
def get_field_correspondence_list(ltable, rtable, lkey, rkey, attr_corres):
    """
    Resolve the field correspondences and make sure the key pair is in them.

    Parameters
    ----------
    ltable, rtable : input tables, only forwarded to
        ``mg.get_attr_corres`` when ``attr_corres`` is missing.
    lkey, rkey : key attribute of the left and right table; the pair
        ``(lkey, rkey)`` is appended to the result if it is not present.
    attr_corres : sized iterable of correspondence pairs, or None
        Explicit correspondences. None or an empty container means
        "look the correspondence up".

    Returns
    -------
    corres_list : list
        The correspondence pairs, containing the key pair exactly as
        supplied or appended at the end as the tuple ``(lkey, rkey)``.

    Raises
    ------
    AssertionError
        If no correspondence was supplied and the looked-up one is empty.
    """
    if attr_corres is None or len(attr_corres) == 0:
        corres_list = mg.get_attr_corres(ltable, rtable)['corres']
        if len(corres_list) == 0:
            raise AssertionError('Error: the field correspondence list'
                                 ' is empty. Please specify the field'
                                 ' correspondence!')
    else:
        # Copy so that appending the key pair never mutates the caller's list.
        corres_list = list(attr_corres)

    key_pair = (lkey, rkey)
    # A tuple never compares equal to a list, so a plain ``in`` test misses a
    # key pair that was supplied as ['lkey', 'rkey'] and would append a
    # duplicate; compare list entries by content as well.
    has_key_pair = any(
        pair == key_pair
        or (isinstance(pair, list) and tuple(pair) == key_pair)
        for pair in corres_list
    )
    if not has_key_pair:
        corres_list.append(key_pair)
    return corres_list
def get_features_for_matching(A, B):
    """
    Build the feature table for two tables with minimal input.

    Parameters
    ----------
    A, B : MTable, Input tables

    Returns
    -------
    feature_table : pandas DataFrame
        Consists of following columns
        * feature_name - string, feature name
        * left_attribute - string, attribute name
        * right_attribute - string, attribute name
        * left_attr_tokenizer - string, tokenizer name
        * right_attr_tokenizer - string, tokenizer name
        * simfunction - string, similarity function name
        * function - function object
        * function_source - string, containing source code

    Notes
    -----
    After the feature table has been built, the inputs used to build it
    are also stored as attributes on ``mg`` so that a user can examine
    them: ``_match_t`` (tokenizers), ``_match_s`` (similarity functions),
    ``_atypes1`` / ``_atypes2`` (attribute types of A / B) and
    ``_match_c`` (attribute correspondence).
    """
    sim_funs = mg.get_sim_funs()
    tokenizers = mg.get_single_arg_tokenizers()
    attr_types_a = mg.get_attr_types(A)
    attr_types_b = mg.get_attr_types(B)
    correspondence = mg.get_attr_corres(A, B)

    feature_table = get_features(
        A, B, attr_types_a, attr_types_b, correspondence, tokenizers, sim_funs
    )

    # Publish the intermediate values on the mg module for later inspection.
    exported = (
        ('_match_t', tokenizers),
        ('_match_s', sim_funs),
        ('_atypes1', attr_types_a),
        ('_atypes2', attr_types_b),
        ('_match_c', correspondence),
    )
    for attr_name, value in exported:
        setattr(mg, attr_name, value)

    return feature_table
import sys

# NOTE(review): `mg` is used below but is not imported anywhere in the visible
# code; confirm that it is bound before this script runs.

# Raw string: the value is unchanged, but the backslashes of the Windows path
# can no longer be interpreted as (invalid) escape sequences.
sys.path.append(r'C:\Pradap\Research\Python-work\Saranam\magellan')

# Load the two toy tables, both keyed on the 'ID' column.
A = mg.read_csv('../magellan/data/toy/table_A.csv', key='ID')
B = mg.read_csv('../magellan/data/toy/table_B.csv', key='ID')

# Block the tables on 'zipcode', carrying the same three attributes from each
# side into the output.
blocker = mg.AttrEquivalenceBlocker()
C = blocker.block_tables(A, B, 'zipcode', 'zipcode',
                         ['name', 'address', 'hourly_wage'],
                         ['name', 'address', 'hourly_wage'])
D = mg.block_union_combine([C, C])
S = mg.sample_one_table(D, 10)
# L = mg.label(S, 'gold_label')

# print(mg._m_global_tokenizers)
# print(mg._m_global_sim_fns)

# Single-argument print(...) calls behave the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a syntax error on Python 3.
t = mg.get_single_arg_tokenizers()
print(t)
s = mg.get_sim_funs()
print(s)
corres = mg.get_attr_corres(A, B)
print(corres['corres'])
t_1 = mg.get_attr_types(A)
print(t_1)
t_2 = mg.get_attr_types(B)
print(t_2)