# Load the two input tables and block them with an attribute-equivalence
# blocker on the 'zipcode' column of each side.
# NOTE(review): the two ['name'] lists are presumably the per-table output
# attributes carried into the candidate set -- confirm against block_tables.
A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(
    A, B,
    'zipcode', 'zipcode',
    ['name'], ['name'],
)

mg.init_jvm()

# The two disabled calls below label the candidate set C and write it to
# 'label.csv'; the live code reloads that file instead.
# L = mg.label_table(C, 'gold')
# L.to_csv('label.csv')
L = mg.read_csv('label.csv', ltable=A, rtable=B)

# Build the feature table for (A, B) and turn the labelled pairs into feature
# vectors, keeping the 'gold' label column alongside them.
feature_table = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(
    L,
    feature_table=feature_table,
    attrs_after='gold',
)

m = mg.LinRegMatcher()

# Columns passed as exclude_attrs to both cross-validation calls. A fresh list
# is handed to each call so neither callee can affect the other's argument.
_NON_FEATURE_ATTRS = ('_id', 'ltable.ID', 'rtable.ID', 'gold')

# 5-fold run of the matcher with no triggers (second argument is None),
# scored on precision.
t = cv_matcher_and_trigger(
    m,
    None,
    table=G,
    exclude_attrs=list(_NON_FEATURE_ATTRS),
    target_attr='gold',
    k=5,
    metric='precision',
    random_state=0,
)

# Same matcher and table through mg.select_matcher, scored on f1.
res = mg.select_matcher(
    [m],
    table=G,
    exclude_attrs=list(_NON_FEATURE_ATTRS),
    target_attr='gold',
    k=5,
    metric='f1',
    random_state=0,
)

# Parenthesised single-argument form: prints the same on Python 2 and is
# valid syntax on Python 3 (the bare print statement is not).
print(res['cv_stats'])
# Impute missing values in the feature-vector table using the
# 'most_frequent' strategy, skipping the columns listed in exclude_attrs.
# NOTE(review): the id columns are spelled 'ltable.id' / 'rtable.id' here but
# 'ltable.ID' / 'rtable.ID' in the cross-validation call further down --
# confirm which casing matches the actual columns of G.
G = mg.impute_table(
    G,
    exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
    strategy='most_frequent',
)

# m = mg.LinRegMatcher()
# print G

# Trigger configured with condition status True and action 1, keyed on the
# author Levenshtein feature being exactly 1.
pos_trigger = mg.MatchTrigger()
pos_trigger.add_cond_rule(
    'author_author_lev(ltuple, rtuple) == 1',
    feature_table=feature_table,
)
pos_trigger.add_cond_status(True)
pos_trigger.add_action(1)

# Trigger configured with condition status False and action 0, keyed on the
# language Levenshtein feature exceeding 0.5. It is built here but only
# referenced from the disabled code at the bottom.
neg_trigger = mg.MatchTrigger()
neg_trigger.add_cond_rule(
    ['lang_lang_lev(ltuple, rtuple) > 0.5'],
    feature_table=feature_table,
)
neg_trigger.add_cond_status(False)
neg_trigger.add_action(0)

# 5-fold run of matcher m with only the positive trigger attached, scored
# on f1.
t = cv_matcher_and_trigger(
    m,
    [pos_trigger],
    table=G,
    exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
    target_attr='gold',
    k=5,
    metric=['f1'],
    random_state=1,
)

# Create rule-based matcher and add rules.
# rm = mg.BooleanRuleMatcher()
# rm.add_rule(['title_title_jac_qgm_3_qgm_3(ltuple, rtuple) > 0.6'
#              ],
#             feature_table=feature_table)
# rm.add_rule(['author_author_jac_qgm_3_qgm_3(ltuple, rtuple) > 0.6'],
#             feature_table=f)
# rm.add_rule(['binding_binding_jac_qgm_3_qgm_3(ltuple, rtuple) > 0.5'],
#             feature_table=f)
#
# X = rm.predict(table=L, target_attr='predicted',
#                append=True, inplace=False)
# eval_summary = mg.eval_matches(X, 'gold', 'predicted')
# q = cv_matcher_and_trigger(rm, [neg_trigger], table=G, exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
# NOTE(review): these statements repeat the trigger setup and cross-validation
# run that appear just before this point; if both copies are meant to stay,
# the run executes twice -- confirm that is intended.

# Trigger with condition status True and action 1, keyed on the author
# Levenshtein feature being exactly 1.
pos_trigger = mg.MatchTrigger()
pos_trigger.add_cond_rule(
    'author_author_lev(ltuple, rtuple) == 1',
    feature_table=feature_table,
)
pos_trigger.add_cond_status(True)
pos_trigger.add_action(1)

# Trigger with condition status False and action 0, keyed on the language
# Levenshtein feature exceeding 0.5. Not passed to the live call below.
neg_trigger = mg.MatchTrigger()
neg_trigger.add_cond_rule(
    ['lang_lang_lev(ltuple, rtuple) > 0.5'],
    feature_table=feature_table,
)
neg_trigger.add_cond_status(False)
neg_trigger.add_action(0)

# 5-fold run of matcher m with the positive trigger only, scored on f1.
t = cv_matcher_and_trigger(
    m,
    [pos_trigger],
    table=G,
    exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
    target_attr='gold',
    k=5,
    metric=['f1'],
    random_state=1,
)

# Create rule-based matcher and add rules.
# rm = mg.BooleanRuleMatcher()
# rm.add_rule(['title_title_jac_qgm_3_qgm_3(ltuple, rtuple) > 0.6'
#              ],
#             feature_table=feature_table)
# rm.add_rule(['author_author_jac_qgm_3_qgm_3(ltuple, rtuple) > 0.6'],
#             feature_table=f)
# rm.add_rule(['binding_binding_jac_qgm_3_qgm_3(ltuple, rtuple) > 0.5'],
#             feature_table=f)
#
# X = rm.predict(table=L, target_attr='predicted',
#                append=True, inplace=False)