# Assumed import: the rest of this script refers to the Magellan package as mg
import magellan as mg

# Load the two input tables
A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')

# Block on zipcode equality, carrying 'name' into the candidate set
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode', ['name'], ['name'])

# Start the JVM (needed by some of Magellan's similarity functions)
mg.init_jvm()

# Label the candidate set once, then reload the labeled pairs from disk
#L = mg.label_table(C, 'gold')
#L.to_csv('label.csv')
L = mg.read_csv('label.csv', ltable=A, rtable=B)

# Generate features and extract feature vectors for the labeled pairs
feature_table = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(L, feature_table=feature_table, attrs_after='gold')

# Cross-validate a linear regression matcher (no trigger), then select it
m = mg.LinRegMatcher()
t = mg.cv_matcher_and_trigger(m, None,
                              table=G,
                              exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
                              target_attr='gold',
                              k=5, metric='precision', random_state=0)
res = mg.select_matcher([m],
                        table=G,
                        exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
                        target_attr='gold',
                        k=5, metric='f1', random_state=0)
print(res['cv_stats'])
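# A possible next step (not in the original script): split the feature
# vectors, train the selected matcher, and score its predictions. This is
# a sketch only; split_train_test, fit/predict, eval_matches, and
# print_eval_summary are assumed to exist with the same
# table/exclude_attrs/target_attr keywords used by select_matcher above.
IJ = mg.split_train_test(G, train_proportion=0.7, random_state=0)
I, J = IJ['train'], IJ['test']
m.fit(table=I,
      exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
      target_attr='gold')
P = m.predict(table=J,
              exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
              target_attr='predicted',
              append=True)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)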
# Display the feature vector table
s_prime

# In[35]:

# Fitting/Predicting
# Create a set of matchers
nb = mg.NBMatcher()  # naive bayes
dt = mg.DTMatcher()  # decision tree
rf = mg.RFMatcher()  # random forest

# In[36]:

# Select a matcher using cross validation
m = mg.select_matcher([nb, dt, rf],
                      x=s_prime[list(feat_table['feature_name'])],
                      y=s_prime['gold_label'],
                      k=5)

# In[37]:

# See what was selected and the stats
m

# In[38]:

# Instead of a single matcher, we can choose an ensemble of matchers
mc, stats = mg.selector_matcher_combiner([nb, dt, rf], ['majority'],
                                         x=s_prime[list(feat_table['feature_name'])],
                                         y=s_prime['gold_label'],
                                         k=5)

# In[39]:
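# A minimal sketch (not in the original notebook): fit both the selected
# matcher and the combined (majority-vote) matcher on the labeled feature
# vectors and predict. Assumes fit/predict accept the same scikit-learn
# style x=/y= keywords used by select_matcher above.
X = s_prime[list(feat_table['feature_name'])]
m.fit(x=X, y=s_prime['gold_label'])
pred_single = m.predict(x=X)

mc.fit(x=X, y=s_prime['gold_label'])
pred_ensemble = mc.predict(x=X)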