# Demo script: train a decision-tree matcher on labeled tuple pairs, evaluate
# it on a held-out split, and open the visual debugger on the results.
import sys

# Extend the module search path BEFORE importing magellan. The original
# appended it after the imports, where it could no longer influence them.
# append() puts the directory last, so an installed magellan still wins.
sys.path.append('/Users/Pradap/Documents/Research/Python-Package/enrique/')

import magellan as mg
from magellan.gui.debug_gui_base import vis_debug_dt

# Columns that are passed as exclude_attrs to fit/predict/debug below.
NON_FEATURE_ATTRS = ('_id', 'ltable.ID', 'rtable.ID', 'gold')

mg.init_jvm()

# Load the two input tables.
A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')

# Block on 'zipcode'; the trailing lists are presumably the output attributes
# for each side -- confirm against block_tables. C is not used further in
# this script (the labeled pairs come from 'label_demo.csv').
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode', ['name'], ['name'])

# Labeled pairs, linked back to their source tables.
L = mg.read_csv('label_demo.csv', ltable=A, rtable=B)

# Build feature vectors for the labeled pairs.
feat_table = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(L, feature_table=feat_table, attrs_after='gold')

# NOTE(review): 8 and 7 are presumably the train/test sizes -- confirm
# against train_test_split's signature.
S = mg.train_test_split(G, 8, 7)

# Train and apply the matcher. Each call receives its own list, exactly as
# the original literals did.
dt = mg.DTMatcher(name='DecisionTree')
dt.fit(table=S['train'], exclude_attrs=list(NON_FEATURE_ATTRS),
       target_attr='gold')
dt.predict(table=S['test'], exclude_attrs=list(NON_FEATURE_ATTRS),
           target_attr='predicted', append=True)

# Compare the 'predicted' column with 'gold', then launch the debugger GUI.
d = mg.eval_matches(S['test'], 'gold', 'predicted')
vis_debug_dt(dt, d, S['test'], exclude_attrs=list(NON_FEATURE_ATTRS),
             feat_table=feat_table)

# Call form prints the same text on Python 2 and also parses on Python 3.
print("Hi")
# Locate a JVM shared library and start the JVM through magellan.
import sys
#sys.path.append('/Users/pradap/Documents/Research/Python-Package/enrique')
#sys.path.append('/scratch/pradap/python-work/enrqiue')
import os

import magellan as mg
import jpype

try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3


def _swap_client_server(path):
    """Return *path* with 'client' and 'server' path components exchanged.

    The original loop assigned the 'server' -> 'client' replacement to an
    unused name (``r``), so only the 'client' -> 'server' direction worked.
    """
    swap = {'client': 'server', 'server': 'client'}
    return os.sep.join(swap.get(part, part) for part in path.split(os.sep))


# Paths to the CSV files under the package's 'datasets' directory.
p = mg.get_install_path()
path_for_A = os.sep.join([p, 'datasets', 'table_A.csv'])
path_for_B = os.sep.join([p, 'datasets', 'table_B.csv'])

# Example of an explicit path:
# mg.init_jvm('/Library/Java/JavaVirtualMachines/jdk1.8.0_45.jdk/Contents/Home/jre/lib/server/libjvm.dylib')
jvm_path = jpype.get_default_jvm_path()
if os.path.isfile(jvm_path):
    mg.init_jvm(jvm_path)
else:
    # The default path does not exist: retry with the other JVM flavour
    # directory (client <-> server).
    jp = _swap_client_server(jvm_path)
    if os.path.isfile(jp):
        mg.init_jvm(jp)
    else:
        # Last resort: ask the user for the library location.
        jp = _read_line('Give path to jvm library (i.e libjvm.so in linux) : ')
        if os.path.isfile(jp):
            mg.init_jvm(jp)
        else:
            # NOTE(review): the message says "exiting" but nothing visible
            # here terminates the process -- confirm whether an exit call
            # is expected to follow.
            print('Invalid path; cannot run tests; exiting')
# In[8]:

import jpype


# In[9]:

# Display the JVM library path that jpype detects on this machine.
jpype.getDefaultJVMPath()


# In[10]:

# Start the JVM from an explicit, machine-specific library path.
mg.init_jvm('C:\\Program Files\\Java\\jre7\\bin\\server\\jvm.dll')


# In[11]:

# Load the two toy tables.
A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')


# In[12]:

# Display table A.
A


# In[13]:
# Demo script: train a decision-tree matcher on labeled tuple pairs, evaluate
# it on a held-out split, and open the visual debugger on the results.
import sys

# Extend the module search path BEFORE importing magellan. The original
# appended it after the imports, where it could no longer influence them.
# append() puts the directory last, so an installed magellan still wins.
sys.path.append('/Users/Pradap/Documents/Research/Python-Package/enrique/')

import magellan as mg
from magellan.gui.debug_gui_base import vis_debug_dt

# Columns that are passed as exclude_attrs to fit/predict/debug below.
NON_FEATURE_ATTRS = ('_id', 'ltable.ID', 'rtable.ID', 'gold')

mg.init_jvm()

# Load the two input tables.
A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')

# Block on 'zipcode'; the trailing lists are presumably the output attributes
# for each side -- confirm against block_tables. C is not used further in
# this script (the labeled pairs come from 'label_demo.csv').
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode', ['name'], ['name'])

# Labeled pairs, linked back to their source tables.
L = mg.read_csv('label_demo.csv', ltable=A, rtable=B)

# Build feature vectors for the labeled pairs.
feat_table = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(L, feature_table=feat_table, attrs_after='gold')

# NOTE(review): 8 and 7 are presumably the train/test sizes -- confirm
# against train_test_split's signature.
S = mg.train_test_split(G, 8, 7)

# Train and apply the matcher. Each call receives its own list, exactly as
# the original literals did.
dt = mg.DTMatcher(name='DecisionTree')
dt.fit(table=S['train'], exclude_attrs=list(NON_FEATURE_ATTRS),
       target_attr='gold')
dt.predict(table=S['test'], exclude_attrs=list(NON_FEATURE_ATTRS),
           target_attr='predicted', append=True)

# Compare the 'predicted' column with 'gold', then launch the debugger GUI.
d = mg.eval_matches(S['test'], 'gold', 'predicted')
vis_debug_dt(dt, d, S['test'], exclude_attrs=list(NON_FEATURE_ATTRS),
             feat_table=feat_table)

# Call form prints the same text on Python 2 and also parses on Python 3.
print("Hi")
# In[7]:

# Display the first 10 rows of A (A is defined outside the visible cells).
A.head(10)


# In[8]:

import jpype


# In[9]:

# Display the JVM library path that jpype detects on this machine.
jpype.getDefaultJVMPath()


# In[10]:

# Start the JVM from an explicit, machine-specific library path.
mg.init_jvm('C:\\Program Files\\Java\\jre7\\bin\\server\\jvm.dll')


# In[11]:

# Load the two toy tables.
A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')


# In[12]:

# Display table A.
A


# In[13]:

# Create the blocker that will be used to block on zipcode.
ab = mg.AttrEquivalenceBlocker()
# Demo script: run a BlackBoxBlocker twice, first over the two tables and then
# over the resulting candidate set, with a different predicate each time.
import magellan as mg

A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')
mg.init_jvm('/Library/Java/JavaVirtualMachines/jdk1.8.0_45.jdk/Contents/Home/jre/lib/server/libjvm.dylib')

# jaccard, lev and tok_qgram used below come from these wildcard imports.
from magellan.feature.simfunctions import *
from magellan.feature.tokenizers import *


def block_fn_1(ltuple, rtuple):
    """Return True when jaccard() over the 'address' q-grams is below 0.4.

    Both addresses are tokenized with ``tok_qgram(..., 3)`` before scoring.
    Presumably a True result tells the blocker to drop the pair -- confirm
    against BlackBoxBlocker.
    """
    left_grams = tok_qgram(ltuple['address'], 3)
    right_grams = tok_qgram(rtuple['address'], 3)
    return bool(jaccard(left_grams, right_grams) < 0.4)


def block_fn_2(x, y):
    """Return True when lev() of the two 'name' values is below 0.5."""
    return bool(lev(x['name'], y['name']) < 0.5)


blocker = mg.BlackBoxBlocker()
bb = blocker  # keep the original module-level name available

# First pass: block the two tables with the address predicate.
blocker.set_black_box_function(block_fn_1)
C = blocker.block_tables(A, B, l_output_attrs='name', r_output_attrs='name')
print(C)

# Second pass: block the candidate set C with the name predicate.
blocker.set_black_box_function(block_fn_2)
D = blocker.block_candset(C)
print(D)