Beispiel #1
0
import sys
import magellan as mg
from magellan.gui.debug_gui_base import vis_debug_dt

# Demo script: fit a decision-tree matcher on labeled pairs, predict on the
# held-out split, evaluate, and hand the result to `vis_debug_dt`.

# NOTE(review): machine-specific path, appended only after `magellan` was
# already imported at the top of this script -- confirm it is still needed.
sys.path.append('/Users/Pradap/Documents/Research/Python-Package/enrique/')
mg.init_jvm()

# Attributes passed as `exclude_attrs` to fit / predict / the debugger.
# Every call below receives its own copy, exactly like the separate list
# literals this constant replaces.
EXCLUDE_ATTRS = ['_id', 'ltable.ID', 'rtable.ID', 'gold']

A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')

# Run the attribute-equivalence blocker on the 'zipcode' columns.
# `C` is not referenced again in this script; labels come from disk instead.
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode', ['name'], ['name'])
L = mg.read_csv('label_demo.csv', ltable=A, rtable=B)

# Feature vectors for the labeled pairs, keeping the 'gold' column.
feat_table = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(L, feature_table=feat_table, attrs_after='gold')

# NOTE(review): 8 and 7 are presumably the train/test sizes -- confirm
# against the `train_test_split` signature.
S = mg.train_test_split(G, 8, 7)

dt = mg.DTMatcher(name='DecisionTree')
dt.fit(table=S['train'],
       exclude_attrs=list(EXCLUDE_ATTRS),
       target_attr='gold')
dt.predict(table=S['test'],
           exclude_attrs=list(EXCLUDE_ATTRS),
           target_attr='predicted',
           append=True)
d = mg.eval_matches(S['test'], 'gold', 'predicted')

vis_debug_dt(dt,
             d,
             S['test'],
             exclude_attrs=list(EXCLUDE_ATTRS),
             feat_table=feat_table)
# Parenthesized single-argument form prints the same text on Python 2 and
# also parses on Python 3 (the bare `print "Hi"` statement does not).
print("Hi")
Beispiel #2
0
from magellan.matcherselection._mlmatcherselection import select_matcher_test
from magellan.matcherselection.mlmatcherselection import select_matcher

import magellan as mg
# Demo script: label a blocked candidate set interactively, build feature
# vectors from the labels, and run `select_matcher_test` on a decision tree.
mg.init_jvm()

# Input tables.
A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')

# Candidate set from the attribute-equivalence blocker on 'zipcode'.
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(A, B,
                    'zipcode', 'zipcode',
                    ['name'], ['name'])

# Labels come from `label_table`; the read_csv variant is kept for reference.
# L = mg.read_csv('../../magellan/testcases/debug-tests/label_ab_correct_books.csv', ltable=A, rtable=B)
L = mg.label_table(C, 'gold')
L.to_csv('mur_labels')

# Feature vectors for the labeled pairs, keeping the 'gold' column.
F = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(L, feature_table=F, attrs_after='gold')

dt = mg.DTMatcher()

# Keyword arguments gathered in one place before the call.
selection_kwargs = dict(
    table=G,
    exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
    target_attr='gold',
    random_state=0
)
select_matcher_test(dt, **selection_kwargs)


Beispiel #3
0
from magellan.evaluation.matcher_and_trigger_crossvalidation import cv_matcher_and_trigger
import magellan as mg
import pandas as pd

# Demo script: load the walmart/bowker book tables, build feature vectors for
# the labeled pairs, impute missing values, and set up a positive match
# trigger.
mg.init_jvm()

# Read walmart books data.
# `str` replaces `pd.np.str`: that name was only an alias of the builtin
# `str`, reached through the deprecated `pandas.np` namespace.
wal = mg.read_csv(mg.get_install_path() + '/datasets/books/walmart.csv',
                  dtype={'isbn': str, 'pages': str, 'volume': str, 'editionNum': str},
                  low_memory=False, key='id')
# Read bowker books data (same string-typed columns as above).
bwk = mg.read_csv(mg.get_install_path() + '/datasets/books/bowker.csv',
                  dtype={'isbn': str, 'pages': str, 'volume': str, 'editionNum': str},
                  low_memory=False, key='id')

L = mg.read_csv('label_ab_correct_books.csv', ltable=wal, rtable=bwk)
feature_table = mg.get_features_for_matching(wal, bwk)

# Keep only five hand-picked feature rows.
# NOTE(review): `.ix` is deprecated in pandas and removed in 1.0; whether
# `.loc` or `.iloc` is the right replacement depends on the feature table's
# index, which is not visible here -- confirm before migrating.
f = feature_table.ix[[3, 7, 18, 26, 53]]
m = mg.DTMatcher()


# feature_table = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(L, feature_table=f, attrs_after='gold')
G = mg.impute_table(G,
                    exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
                    strategy='most_frequent')
# m = mg.LinRegMatcher()
# print G

# Trigger configured with one condition rule, a True condition status and
# action value 1. The rule is resolved against the full feature table, not
# the five-row subset `f`.
pos_trigger = mg.MatchTrigger()
pos_trigger.add_cond_rule('author_author_lev(ltuple, rtuple) == 1',
                          feature_table=feature_table)
pos_trigger.add_cond_status(True)
pos_trigger.add_action(1)
Beispiel #4
0
import sys
import magellan as mg
from magellan.gui.debug_gui_base import vis_debug_dt

# Demo script: train a decision-tree matcher, predict on the test split,
# evaluate the predictions and pass everything to `vis_debug_dt`.

# NOTE(review): hard-coded, machine-specific path; it is appended after the
# `magellan` import at the top of the script -- verify it is still required.
sys.path.append('/Users/Pradap/Documents/Research/Python-Package/enrique/')
mg.init_jvm()

A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')

# Attribute-equivalence blocking on 'zipcode'; the result `C` is not used
# again below because the labeled pairs are read from 'label_demo.csv'.
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode', ['name'], ['name'])
L = mg.read_csv('label_demo.csv', ltable=A, rtable=B)

# Feature vectors for the labeled pairs; 'gold' is carried along.
feat_table = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(L, feature_table=feat_table, attrs_after='gold')

# NOTE(review): meaning of the positional 8 and 7 is not visible here
# (presumably train/test sizes) -- confirm.
S = mg.train_test_split(G, 8, 7)

dt = mg.DTMatcher(name='DecisionTree')
dt.fit(table=S['train'],
       exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
       target_attr='gold')
# append=True is passed so the 'predicted' column evaluated below is
# available on S['test'].
dt.predict(table=S['test'],
           exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
           target_attr='predicted',
           append=True)
d = mg.eval_matches(S['test'], 'gold', 'predicted')

vis_debug_dt(dt,
             d,
             S['test'],
             exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
             feat_table=feat_table)
# Function-call form: same output on Python 2, and valid syntax on Python 3
# (the original `print "Hi"` statement is Python 2 only).
print("Hi")
Beispiel #5
0
                    dtype={'isbn':pd.np.str, 'pages':pd.np.str, 'volume':pd.np.str, 'editionNum':pd.np.str},  
                    low_memory=False, key='id')

# Read bowker books data.
# `str` replaces `pd.np.str`: that name was only an alias of the builtin
# `str`, reached through the deprecated `pandas.np` namespace.
bwk = mg.read_csv(mg.get_install_path() + '/datasets/books/bowker.csv',
                  dtype={'isbn': str, 'pages': str, 'volume': str, 'editionNum': str},
                  low_memory=False, key='id')

# Attribute-equivalence blocking on 'isbn'. `wal` is defined earlier in the
# script, outside this fragment.
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(wal, bwk, 'isbn', 'isbn', ['title', 'author'], ['title', 'author'])


L = mg.read_csv('label_ab_correct_books.csv', ltable=wal, rtable=bwk)

# Parenthesized form prints the same value on Python 2 and also parses on
# Python 3 (the original `print len(L)` statement is Python 2 only).
print(len(L))

feat_table = mg.get_features_for_matching(wal, bwk)


# Keep only five hand-picked feature rows.
# NOTE(review): `.ix` is deprecated in pandas and removed in 1.0; whether
# `.loc` or `.iloc` is the right replacement depends on the feature table's
# index, which is not visible here -- confirm before migrating.
f = feat_table.ix[[3, 7, 18, 26, 53]]

G = mg.extract_feature_vecs(L, feature_table=f, attrs_after='gold')

# One instance of each matcher type.
dt = mg.DTMatcher()
svm = mg.SVMMatcher()
rf = mg.RFMatcher()
nb = mg.NBMatcher()
lg = mg.LogRegMatcher()


# impute values
#G.fillna(0, inplace=True)