def vis_debug_rf(matcher, train, test, exclude_attrs, target_attr):
    """
    Visual debugger for random forest matcher

    Fits the matcher on ``train``, predicts on ``test``, evaluates the
    predictions against ``target_attr`` and opens the debugger window with
    the false positives and false negatives.

    Parameters
    ----------
    matcher : object, RFMatcher object
    train : MTable, containing training data with "True" labels
    test : MTable, containing test data with "True" labels.
        The "True" labels are used for evaluation.
    exclude_attrs : List, attributes to be excluded from train and test,
        for training and testing.
    target_attr : String, column name containing the 'True' labels; it is
        passed to ``matcher.fit`` and to ``mg.eval_matches``.

    Raises
    ------
    AssertionError
        If train and test do not have the same set of columns, or if some
        of ``exclude_attrs`` are not columns of the tables.
    """
    # Raise explicitly rather than using `assert`, so the validation is not
    # stripped when Python runs with -O.
    if set(test.columns) != set(train.columns):
        raise AssertionError("The train and test columns are not same")

    excluded = set(exclude_attrs)
    if not excluded <= set(train.columns):
        raise AssertionError(
            "Some of exclude attrs are not part of train columns")
    # Redundant once the column sets are known to be equal; kept as a guard.
    if not excluded <= set(test.columns):
        raise AssertionError(
            "Some of exclude attrs are not part of test columns")

    # fit using training data
    matcher.fit(table=train, exclude_attrs=exclude_attrs,
                target_attr=target_attr)

    # predict into a fresh column on a copy of the test table
    predict_attr_name = get_name_for_predict_column(test.columns)
    predicted = matcher.predict(table=test, exclude_attrs=exclude_attrs,
                                target_attr=predict_attr_name,
                                append=True, inplace=False)

    eval_summary = mg.eval_matches(predicted, target_attr, predict_attr_name)
    metric = get_metric(eval_summary)
    fp_dataframe = get_dataframe(predicted, eval_summary['false_pos_ls'])
    fn_dataframe = get_dataframe(predicted, eval_summary['false_neg_ls'])

    app = mg._viewapp
    m = MainWindowManager(matcher, "rf", exclude_attrs, metric, predicted,
                          fp_dataframe, fn_dataframe)
    m.show()
    app.exec_()
def _vis_debug_dt(matcher, train, test, exclude_attrs, target_attr,
                  show_window=True):
    """
    Validate the inputs and build the visual debugger for a decision tree
    matcher, optionally showing its window.

    Parameters
    ----------
    matcher : DTMatcher, matcher to fit and debug
    train : table used to fit the matcher; must contain exclude_attrs and
        target_attr
    test : table used for prediction and evaluation; must contain
        exclude_attrs and target_attr
    exclude_attrs : list (or a single attribute), attributes to be excluded
        for training and testing. target_attr is added to the exclusion
        list when it is not already in it.
    target_attr : String, column name containing the 'True' labels
    show_window : bool, when true the window is shown and the application
        event loop is run; otherwise the window manager is only constructed

    Raises
    ------
    AssertionError
        If the matcher is not a DTMatcher, target_attr is not a string, or
        the required attributes are missing from train or test.
    """
    if not isinstance(matcher, DTMatcher):
        logger.error('Input matcher is not of type Decision Tree matcher')
        raise AssertionError(
            'Input matcher is not of type Decision Tree matcher')

    if not isinstance(target_attr, six.string_types):
        logger.error('Target attribute is not of type string')
        raise AssertionError('Target attribute is not of type string')

    if not check_attrs_present(train, exclude_attrs):
        logger.error('The exclude attrs are not in train table columns')
        raise AssertionError(
            'The exclude attrs are not in the train table columns')

    if not check_attrs_present(train, target_attr):
        logger.error('The target attr is not in train table columns')
        raise AssertionError(
            'The target attr is not in the train table columns')

    if not check_attrs_present(test, exclude_attrs):
        logger.error('The exclude attrs are not in test table columns')
        raise AssertionError(
            'The exclude attrs are not in the test table columns')

    # The evaluation below reads target_attr from the predicted test table,
    # so fail early (before training) when the test table lacks it.
    if not check_attrs_present(test, target_attr):
        logger.error('The target attr is not in test table columns')
        raise AssertionError(
            'The target attr is not in the test table columns')

    # Normalise exclude_attrs to a duplicate-free list that also holds the
    # label column.
    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]
    exclude_attrs = list_drop_duplicates(exclude_attrs)
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)

    # fit using training data
    matcher.fit(table=train, exclude_attrs=exclude_attrs,
                target_attr=target_attr)

    # predict using the test data, into a fresh column on a copy
    predict_attr_name = get_name_for_predict_column(test.columns)
    predicted = matcher.predict(table=test, exclude_attrs=exclude_attrs,
                                target_attr=predict_attr_name,
                                append=True, inplace=False)

    eval_summary = mg.eval_matches(predicted, target_attr, predict_attr_name)
    metric = get_metric(eval_summary)
    fp_dataframe = get_dataframe(predicted, eval_summary['false_pos_ls'])
    fn_dataframe = get_dataframe(predicted, eval_summary['false_neg_ls'])

    app = mg._viewapp
    m = MainWindowManager(matcher, "dt", exclude_attrs, metric, predicted,
                          fp_dataframe, fn_dataframe)

    if show_window:
        m.show()
        app.exec_()
def vis_debug_dt(matcher, train, test, exclude_attrs, target_attr):
    """
    Visual debugger for decision tree matcher

    Fits the matcher on ``train``, predicts on ``test``, evaluates the
    predictions against ``target_attr`` and opens the debugger window with
    the false positives and false negatives.

    Parameters
    ----------
    matcher : object, DTMatcher object
    train : MTable, containing training data with "True" labels
    test : MTable, containing test data with "True" labels.
        The "True" labels are used for evaluation.
    exclude_attrs : List, attributes to be excluded from train and test,
        for training and testing.
    target_attr : String, column name containing the 'True' labels; it is
        passed to ``matcher.fit`` and to ``mg.eval_matches``.

    Raises
    ------
    AssertionError
        If train and test do not have the same set of columns, or if some
        of ``exclude_attrs`` are not columns of the tables.
    """
    # Raise explicitly rather than using `assert`, so the validation is not
    # stripped when Python runs with -O.
    if set(test.columns) != set(train.columns):
        raise AssertionError("The train and test columns are not same")

    excluded = set(exclude_attrs)
    if not excluded <= set(train.columns):
        raise AssertionError(
            "Some of exclude attrs are not part of train columns")
    # Redundant once the column sets are known to be equal; kept as a guard.
    if not excluded <= set(test.columns):
        raise AssertionError(
            "Some of exclude attrs are not part of test columns")

    # fit using training data
    matcher.fit(table=train, exclude_attrs=exclude_attrs,
                target_attr=target_attr)

    # predict into a fresh column on a copy of the test table
    predict_attr_name = get_name_for_predict_column(test.columns)
    predicted = matcher.predict(table=test, exclude_attrs=exclude_attrs,
                                target_attr=predict_attr_name,
                                append=True, inplace=False)

    eval_summary = mg.eval_matches(predicted, target_attr, predict_attr_name)
    metric = get_metric(eval_summary)
    fp_dataframe = get_dataframe(predicted, eval_summary['false_pos_ls'])
    fn_dataframe = get_dataframe(predicted, eval_summary['false_neg_ls'])

    app = mg._viewapp
    m = MainWindowManager(matcher, "dt", exclude_attrs, metric, predicted,
                          fp_dataframe, fn_dataframe)
    m.show()
    app.exec_()
def vis_debug_rm(matcher, validation_set, target_attr, feature_table):
    """
    Open the visual debugger for a boolean rule-based matcher.

    The matcher is run on ``validation_set``, its predictions are evaluated
    against ``target_attr`` and the resulting false positives and false
    negatives are handed to the debugger window.

    Parameters
    ----------
    matcher : object, Boolean rule-based matcher object
    validation_set : MTable, used to debug
    target_attr : String, column name in validation_set containing 'True'
        labels
    feature_table : pandas dataframe, containing feature information
    """
    # Predictions go into a fresh column on a copy of the validation set.
    predicted_col = get_name_for_predict_column(validation_set.columns)
    labeled = matcher.predict(validation_set, predicted_col,
                              append=True, inplace=False)

    summary = mg.eval_matches(labeled, target_attr, predicted_col)
    metric = get_metric(summary)
    false_positives, false_negatives = [
        get_dataframe(labeled, summary[key])
        for key in ('false_pos_ls', 'false_neg_ls')
    ]

    app = mg._viewapp
    window = MainWindowManager(matcher, "rm", feature_table, metric, labeled,
                               false_positives, false_negatives)
    window.show()
    app.exec_()
# Scratch script: train a decision tree matcher on labeled pairs and open
# the visual debugger on the train/test split.
import sys

# Extend the module search path before the imports below, so the entry can
# take effect for them.
# NOTE(review): machine-specific path; presumably where the package lives.
sys.path.append('/Users/Pradap/Documents/Research/Python-Package/enrique/')

import magellan as mg
from magellan.gui.debug_gui_base import vis_debug_dt

mg.init_jvm()

# Load the input tables, block them and read the labeled pairs.
A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode', ['name'], ['name'])
L = mg.read_csv('label_demo.csv', ltable=A, rtable=B)

# Build feature vectors for the labeled pairs and split them.
feat_table = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(L, feature_table=feat_table, attrs_after='gold')
S = mg.train_test_split(G, 8, 7)

exclude_attrs = ['_id', 'ltable.ID', 'rtable.ID', 'gold']

dt = mg.DTMatcher(name='DecisionTree')
dt.fit(table=S['train'], exclude_attrs=list(exclude_attrs),
       target_attr='gold')

# Predict into a copy so S['test'] keeps the same columns as S['train'];
# the debugger requires train and test to have identical columns.
predicted = dt.predict(table=S['test'], exclude_attrs=list(exclude_attrs),
                       target_attr='predicted', append=True, inplace=False)
d = mg.eval_matches(predicted, 'gold', 'predicted')

# The debugger takes (matcher, train, test, exclude_attrs, target_attr);
# it fits and evaluates the matcher itself.
vis_debug_dt(dt, S['train'], S['test'], exclude_attrs=list(exclude_attrs),
             target_attr='gold')
print("Hi")
def _vis_debug_dt(matcher, train, test, exclude_attrs, target_attr,
                  show_window=True):
    """
    Validate the inputs and build the visual debugger for a decision tree
    matcher, optionally showing its window.

    Parameters
    ----------
    matcher : DTMatcher, matcher to fit and debug
    train : table used to fit the matcher; must contain exclude_attrs and
        target_attr
    test : table used for prediction and evaluation; must contain
        exclude_attrs and target_attr
    exclude_attrs : list (or a single attribute), attributes to be excluded
        for training and testing. target_attr is added to the exclusion
        list when it is not already in it.
    target_attr : String, column name containing the 'True' labels
    show_window : bool, when true the window is shown and the application
        event loop is run; otherwise the window manager is only constructed

    Raises
    ------
    AssertionError
        If the matcher is not a DTMatcher, target_attr is not a string, or
        the required attributes are missing from train or test.
    """
    if not isinstance(matcher, DTMatcher):
        logger.error('Input matcher is not of type Decision Tree matcher')
        raise AssertionError(
            'Input matcher is not of type Decision Tree matcher')

    if not isinstance(target_attr, six.string_types):
        logger.error('Target attribute is not of type string')
        raise AssertionError('Target attribute is not of type string')

    if not check_attrs_present(train, exclude_attrs):
        logger.error('The exclude attrs are not in train table columns')
        raise AssertionError(
            'The exclude attrs are not in the train table columns')

    if not check_attrs_present(train, target_attr):
        logger.error('The target attr is not in train table columns')
        raise AssertionError(
            'The target attr is not in the train table columns')

    if not check_attrs_present(test, exclude_attrs):
        logger.error('The exclude attrs are not in test table columns')
        raise AssertionError(
            'The exclude attrs are not in the test table columns')

    # The evaluation below reads target_attr from the predicted test table,
    # so fail early (before training) when the test table lacks it.
    if not check_attrs_present(test, target_attr):
        logger.error('The target attr is not in test table columns')
        raise AssertionError(
            'The target attr is not in the test table columns')

    # Normalise exclude_attrs to a duplicate-free list that also holds the
    # label column.
    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]
    exclude_attrs = list_drop_duplicates(exclude_attrs)
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)

    # fit using training data
    matcher.fit(table=train, exclude_attrs=exclude_attrs,
                target_attr=target_attr)

    # predict using the test data, into a fresh column on a copy
    predict_attr_name = get_name_for_predict_column(test.columns)
    predicted = matcher.predict(table=test, exclude_attrs=exclude_attrs,
                                target_attr=predict_attr_name,
                                append=True, inplace=False)

    eval_summary = mg.eval_matches(predicted, target_attr, predict_attr_name)
    metric = get_metric(eval_summary)
    fp_dataframe = get_dataframe(predicted, eval_summary['false_pos_ls'])
    fn_dataframe = get_dataframe(predicted, eval_summary['false_neg_ls'])

    app = mg._viewapp
    m = MainWindowManager(matcher, "dt", exclude_attrs, metric, predicted,
                          fp_dataframe, fn_dataframe)

    if show_window:
        m.show()
        app.exec_()
# Scratch script: train a decision tree matcher on labeled pairs and open
# the visual debugger on the train/test split.
import sys

# Extend the module search path before the imports below, so the entry can
# take effect for them.
# NOTE(review): machine-specific path; presumably where the package lives.
sys.path.append('/Users/Pradap/Documents/Research/Python-Package/enrique/')

import magellan as mg
from magellan.gui.debug_gui_base import vis_debug_dt

mg.init_jvm()

# Load the input tables, block them and read the labeled pairs.
A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode', ['name'], ['name'])
L = mg.read_csv('label_demo.csv', ltable=A, rtable=B)

# Build feature vectors for the labeled pairs and split them.
feat_table = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(L, feature_table=feat_table, attrs_after='gold')
S = mg.train_test_split(G, 8, 7)

exclude_attrs = ['_id', 'ltable.ID', 'rtable.ID', 'gold']

dt = mg.DTMatcher(name='DecisionTree')
dt.fit(table=S['train'], exclude_attrs=list(exclude_attrs),
       target_attr='gold')

# Predict into a copy so S['test'] keeps the same columns as S['train'];
# the debugger requires train and test to have identical columns.
predicted = dt.predict(table=S['test'], exclude_attrs=list(exclude_attrs),
                       target_attr='predicted', append=True, inplace=False)
d = mg.eval_matches(predicted, 'gold', 'predicted')

# The debugger takes (matcher, train, test, exclude_attrs, target_attr);
# it fits and evaluates the matcher itself.
vis_debug_dt(dt, S['train'], S['test'], exclude_attrs=list(exclude_attrs),
             target_attr='gold')
print("Hi")
# Disabled random forest experiments, kept for reference:
# rf.fit(table=train, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'], target_attr='gold')
# mg.vis_debug_rf(rf, train, test, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
#                 target_attr='gold')
# ret_val, node_list = mg.vis_tuple_debug_rf_matcher(rf, G.iloc[[0]],
#                        exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'])
# print ret_val
# print node_list

# Build a boolean rule matcher from three rules over the feature table `f`,
# added in this order.
rm = mg.BooleanRuleMatcher()
for rule_predicates in [
    ['title_title_jac_qgm_3_qgm_3(ltuple, rtuple) > 0.6',
     'lang_lang_lev(ltuple, rtuple) > 0.5'],
    ['author_author_jac_qgm_3_qgm_3(ltuple, rtuple) > 0.6'],
    ['binding_binding_jac_qgm_3_qgm_3(ltuple, rtuple) > 0.5'],
]:
    rm.add_rule(rule_predicates, feature_table=f)

# Predict on the labeled table `L` (into a copy) and report the evaluation
# against the 'gold' column.
X = rm.predict(table=L, target_attr='predicted', append=True, inplace=False)
metric = mg.eval_matches(X, 'gold', 'predicted')
print(metric)

# ret_val, node_list = mg.vis_tuple_debug_rm_matcher(rm, wal.iloc[0], bwk.iloc[1], feature_table=f)
# print ret_val
# print node_list
# mg.vis_debug_rm(rm, L, 'gold', feature_table=f)

# Configure a match trigger and run it over the predictions, leaving X
# itself untouched (inplace=False).
trigger = mg.MatchTrigger()
trigger.add_cond_rule('title_title_jac_qgm_3_qgm_3(ltuple, rtuple) < 0.8', f)
trigger.add_cond_status(True)
trigger.add_action(0)

# print P[['_id', 'gold', 'predicted']]
Y = trigger.execute(X, 'predicted', inplace=False)
print(Y)
print('Hi')