# Debug a decision-tree matcher on a single candidate tuple.
#
# Loads the bundled sample tables A and B plus a labeled candidate set C,
# fits a DTMatcher on extracted feature vectors, then visually debugs the
# matcher's decision for the first candidate tuple.

datasets_path = os.sep.join([mg.get_install_path(), 'datasets',
                             'test_datasets'])
path_c = os.sep.join([datasets_path, 'C.csv'])

A = mg.load_dataset('table_A', key='ID')
B = mg.load_dataset('table_B', key='ID')
C = mg.read_csv_metadata(path_c, ltable=A, rtable=B)

# Hand-assigned gold labels: first 7 candidate pairs are non-matches (0),
# the remaining 8 are matches (1) -- 15 rows total, presumably matching C.
labels = [0] * 7
labels.extend([1] * 8)
C['labels'] = labels

# Auto-generate matching features from the A/B schemas and extract the
# feature-vector table, carrying the label column through.
feature_table = mg.get_features_for_matching(A, B)
feature_vectors = mg.extract_feature_vecs(C, feature_table=feature_table,
                                          attrs_after='labels')

dt = mg.DTMatcher()
dt.fit(table=feature_vectors,
       exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
       target_attr='labels')

# FIX: DataFrame.ix is deprecated and removed in pandas >= 1.0; use
# positional access via iloc to fetch the first feature-vector row.
vis_tuple_debug_dt_matcher(dt, feature_vectors.iloc[0],
                           exclude_attrs=['_id', 'ltable_ID', 'rtable_ID',
                                          'labels'])
# End-to-end matching pipeline with visual debugging of a decision tree.
#
# Blocks A x B on zipcode, extracts feature vectors from a labeled sample,
# trains/evaluates a DTMatcher on a train/test split, and launches the
# GUI debugger on the test predictions.

import sys

import magellan as mg
from magellan.gui.debug_gui_base import vis_debug_dt

# NOTE(review): hard-coded developer-machine path -- harmless elsewhere
# (sys.path entries that don't exist are ignored) but worth removing.
sys.path.append('/Users/Pradap/Documents/Research/Python-Package/enrique/')
mg.init_jvm()

A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')

# Block on exact zipcode equality, keeping only the name columns.
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode', ['name'], ['name'])

# Labeled pairs with a 'gold' column.
L = mg.read_csv('label_demo.csv', ltable=A, rtable=B)

feat_table = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(L, feature_table=feat_table, attrs_after='gold')

# 8 training rows, 7 test rows.
S = mg.train_test_split(G, 8, 7)

dt = mg.DTMatcher(name='DecisionTree')
dt.fit(table=S['train'],
       exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
       target_attr='gold')
dt.predict(table=S['test'],
           exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
           target_attr='predicted', append=True)

d = mg.eval_matches(S['test'], 'gold', 'predicted')

vis_debug_dt(dt, d, S['test'],
             exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
             feat_table=feat_table)

# FIX: Python 2 print statement is a syntax error on Python 3; the
# function-call form behaves identically on both.
print("Hi")
# Notebook fragment: build feature vectors for the labeled pairs, then
# choose the best matcher by cross validation.

# Extract feature vectors from L, keeping the ltable/rtable name columns
# in front (for readability) and the gold label at the end.
s_prime = mg.extract_feature_vecs(L,
                                  attrs_before=['ltable.name', 'rtable.name'],
                                  feature_table=feat_table,
                                  attrs_after=['gold_label'])

# In[34]: display the feature-vector table (notebook cell output).
s_prime

# In[35]: create the candidate matchers to compare.
nb = mg.NBMatcher()  # naive bayes
dt = mg.DTMatcher()  # decision tree
rf = mg.RFMatcher()  # random forest

# In[36]: pick a matcher via 5-fold cross validation over the feature
# columns, using the gold label as the target.
m = mg.select_matcher([nb, dt, rf],
                      x=s_prime[list(feat_table['feature_name'])],
                      y=s_prime['gold_label'],
                      k=5)

# In[37]: inspect the selected matcher and its CV stats.
m