# Demo script: block two sample tables, train a decision-tree matcher on
# labeled pairs, evaluate its predictions, and pass everything to
# vis_debug_dt.
import sys

import magellan as mg
from magellan.gui.debug_gui_base import vis_debug_dt

# NOTE(review): this path is appended after magellan has already been
# imported, so it cannot influence which magellan gets loaded -- confirm
# whether it was meant to come first or is still needed at all.
sys.path.append('/Users/Pradap/Documents/Research/Python-Package/enrique/')

# Columns passed as exclude_attrs to fit, predict and vis_debug_dt.
EXCLUDE_ATTRS = ('_id', 'ltable.ID', 'rtable.ID', 'gold')

mg.init_jvm()

A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')

ab = mg.AttrEquivalenceBlocker()
# NOTE(review): C is not used again in this script; the call is kept as in
# the original -- confirm whether it can be dropped.
C = ab.block_tables(A, B, 'zipcode', 'zipcode', ['name'], ['name'])

L = mg.read_csv('label_demo.csv', ltable=A, rtable=B)
feat_table = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(L, feature_table=feat_table, attrs_after='gold')

# NOTE(review): 8 and 7 are presumably the train/test sizes -- confirm
# against train_test_split's signature.
S = mg.train_test_split(G, 8, 7)

dt = mg.DTMatcher(name='DecisionTree')
# Every call below receives its own list object, exactly as the separate
# list literals in the original did.
dt.fit(table=S['train'],
       exclude_attrs=list(EXCLUDE_ATTRS),
       target_attr='gold')
dt.predict(table=S['test'],
           exclude_attrs=list(EXCLUDE_ATTRS),
           target_attr='predicted',
           append=True)

d = mg.eval_matches(S['test'], 'gold', 'predicted')
vis_debug_dt(dt, d, S['test'],
             exclude_attrs=list(EXCLUDE_ATTRS),
             feat_table=feat_table)

# The parenthesized form prints the same text under Python 2 and is also
# valid Python 3 syntax.
print("Hi")
# Script: block the two sample tables, label the candidate pairs, save the
# labels to 'mur_labels', and run select_matcher_test on a decision-tree
# matcher built from the labeled feature vectors.
from magellan.matcherselection._mlmatcherselection import select_matcher_test
# NOTE(review): select_matcher is not referenced in the visible part of this
# script; the import is kept because more of the file may exist.
from magellan.matcherselection.mlmatcherselection import select_matcher
import magellan as mg

mg.init_jvm()

A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')

ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode', ['name'], ['name'])

# Disabled alternative kept from the original: read existing labels from a
# CSV file instead of labeling the candidate set C.
# L = mg.read_csv('../../magellan/testcases/debug-tests/label_ab_correct_books.csv', ltable=A, rtable=B)
L = mg.label_table(C, 'gold')
L.to_csv('mur_labels')

F = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(L, feature_table=F, attrs_after='gold')

dt = mg.DTMatcher()
select_matcher_test(dt,
                    table=G,
                    exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
                    target_attr='gold',
                    random_state=0)
# Script: load the Walmart and Bowker book tables plus their labeled pairs,
# extract feature vectors for a subset of features, impute the table, and
# configure a match trigger on the author_author_lev feature.
# NOTE(review): cv_matcher_and_trigger is imported but not called in the
# visible part of this script; the rest of the file may use it.
from magellan.evaluation.matcher_and_trigger_crossvalidation import cv_matcher_and_trigger
import magellan as mg
import pandas as pd

mg.init_jvm()

# Columns read as strings in both book tables. Each read_csv call below gets
# its own copy, matching the separate dict literals of the original.
BOOK_DTYPES = {
    'isbn': pd.np.str,
    'pages': pd.np.str,
    'volume': pd.np.str,
    'editionNum': pd.np.str,
}

# Read walmart books data
wal = mg.read_csv(mg.get_install_path()+'/datasets/books/walmart.csv',
                  dtype=dict(BOOK_DTYPES),
                  low_memory=False,
                  key='id')

# Read bowker books data
bwk = mg.read_csv(mg.get_install_path()+'/datasets/books/bowker.csv',
                  dtype=dict(BOOK_DTYPES),
                  low_memory=False,
                  key='id')

L = mg.read_csv('label_ab_correct_books.csv', ltable=wal, rtable=bwk)

feature_table = mg.get_features_for_matching(wal, bwk)
# Only these rows of the feature table are used for feature extraction.
f = feature_table.ix[[3, 7, 18, 26, 53]]

m = mg.DTMatcher()

# feature_table = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(L, feature_table=f, attrs_after='gold')
G = mg.impute_table(G,
                    exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
                    strategy='most_frequent')

# m = mg.LinRegMatcher()
# print G

# The rule is registered against the full feature table, not the subset f.
pos_trigger = mg.MatchTrigger()
pos_trigger.add_cond_rule('author_author_lev(ltuple, rtuple) == 1',
                          feature_table=feature_table)
pos_trigger.add_cond_status(True)
pos_trigger.add_action(1)
# Demo script: block two sample tables, train a decision-tree matcher on
# labeled pairs, evaluate its predictions, and pass everything to
# vis_debug_dt.
import sys

import magellan as mg
from magellan.gui.debug_gui_base import vis_debug_dt

# NOTE(review): this path is appended after magellan has already been
# imported, so it cannot influence which magellan gets loaded -- confirm
# whether it was meant to come first or is still needed at all.
sys.path.append('/Users/Pradap/Documents/Research/Python-Package/enrique/')

# Columns passed as exclude_attrs to fit, predict and vis_debug_dt.
EXCLUDE_ATTRS = ('_id', 'ltable.ID', 'rtable.ID', 'gold')

mg.init_jvm()

A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')

ab = mg.AttrEquivalenceBlocker()
# NOTE(review): C is not used again in this script; the call is kept as in
# the original -- confirm whether it can be dropped.
C = ab.block_tables(A, B, 'zipcode', 'zipcode', ['name'], ['name'])

L = mg.read_csv('label_demo.csv', ltable=A, rtable=B)
feat_table = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(L, feature_table=feat_table, attrs_after='gold')

# NOTE(review): 8 and 7 are presumably the train/test sizes -- confirm
# against train_test_split's signature.
S = mg.train_test_split(G, 8, 7)

dt = mg.DTMatcher(name='DecisionTree')
# Every call below receives its own list object, exactly as the separate
# list literals in the original did.
dt.fit(table=S['train'],
       exclude_attrs=list(EXCLUDE_ATTRS),
       target_attr='gold')
dt.predict(table=S['test'],
           exclude_attrs=list(EXCLUDE_ATTRS),
           target_attr='predicted',
           append=True)

d = mg.eval_matches(S['test'], 'gold', 'predicted')
vis_debug_dt(dt, d, S['test'],
             exclude_attrs=list(EXCLUDE_ATTRS),
             feat_table=feat_table)

# The parenthesized form prints the same text under Python 2 and is also
# valid Python 3 syntax.
print("Hi")
dtype={'isbn':pd.np.str, 'pages':pd.np.str, 'volume':pd.np.str, 'editionNum':pd.np.str}, low_memory=False, key='id') bwk = mg.read_csv(mg.get_install_path() + '/datasets/books/bowker.csv', dtype={'isbn':pd.np.str, 'pages':pd.np.str, 'volume':pd.np.str, 'editionNum':pd.np.str}, low_memory=False, key='id') ab = mg.AttrEquivalenceBlocker() C = ab.block_tables(wal, bwk, 'isbn', 'isbn', ['title', 'author'], ['title', 'author']) L = mg.read_csv('label_ab_correct_books.csv', ltable=wal, rtable=bwk) print len(L) feat_table = mg.get_features_for_matching(wal, bwk) f = feat_table.ix[[3,7,18,26, 53]] G = mg.extract_feature_vecs(L, feature_table=f, attrs_after='gold') dt = mg.DTMatcher() svm = mg.SVMMatcher() rf = mg.RFMatcher() nb = mg.NBMatcher() lg = mg.LogRegMatcher() # impute values #G.fillna(0, inplace=True)