def main():
    """Run the iterative top-K debug blocker on one CS784 dataset.

    Reads tableA.csv and tableB.csv (key 'id') and the candidate set
    tableC.csv (key '_id', linked to both tables) from the directory of the
    dataset named below, then hands them, together with that dataset's
    'topK_results/' output directory, to iterative_topK_debug_blocker.
    """
    # Table pairs used with this driver in earlier runs (all under
    # '../datasets/', key='id' unless noted):
    #   magellan_builtin_test/table_A.csv, table_B.csv          (key='ID')
    #   books_test/bowker_final_custom_id.csv, walmart_final_custom_id.csv
    #   books_full/bowker_final_custom_id.csv, walmart_final_custom_id.csv
    #   products_test/walmart_final_custom_id_lowercase.csv,
    #       amazon_final_custom_id_python_lowercase.csv
    #   books/BN.csv, amazon_books.csv
    #   books/BN_small.csv, amazon_books_small.csv
    # For those runs the candidate set was produced with
    #   mg.AttrEquivalenceBlocker().block_tables(ltable, rtable, 'pubYear', 'pubYear')
    #
    # CS784 datasets: 'M_ganz' was used before the current choice, with the
    # same tableA/tableB/tableC/topK_results layout.
    dataset = 'S_hanli'
    dataset_dir = '../datasets/CS784/' + dataset

    ltable = mg.read_csv(dataset_dir + '/tableA.csv', key='id')
    rtable = mg.read_csv(dataset_dir + '/tableB.csv', key='id')
    candidate_set = mg.read_csv(dataset_dir + '/tableC.csv',
                                key='_id',
                                ltable=ltable,
                                rtable=rtable)
    outdir = dataset_dir + '/topK_results/'

    # The returned table is bound but not used any further in this function.
    pred_table = iterative_topK_debug_blocker(ltable, rtable, candidate_set,
                                              outdir)
Esempio n. 2
0
def main():
    """Block the built-in test tables on 'address' and debug the blocker.

    Reads table_A.csv and table_B.csv (key 'ID') from
    './datasets/magellan_builtin_test/', builds a candidate set with an
    AttrEquivalenceBlocker on the 'address' attribute of both tables, runs
    debug_blocker on the result and prints the table it returns.
    """
    ltable = mg.read_csv('./datasets/magellan_builtin_test/table_A.csv', key='ID')
    rtable = mg.read_csv('./datasets/magellan_builtin_test/table_B.csv', key='ID')
    # Alternative inputs kept from earlier runs:
    # ltable = mg.read_csv('./datasets/books_test/bowker_final_custom_id.csv', key='id')
    # rtable = mg.read_csv('./datasets/books_test/walmart_final_custom_id.csv', key='id')
    # ltable = mg.read_csv('./datasets/products_test/walmart_final_custom_id_lowercase.csv', key='id')
    # rtable = mg.read_csv('./datasets/products_test/amazon_final_custom_id_python_lowercase.csv', key='id')
    # ltable = mg.read_csv('./datasets/books/BN_small.csv', key='id')
    # rtable = mg.read_csv('./datasets/books/amazon_books_small.csv', key='id')
    blocker = mg.AttrEquivalenceBlocker()
    candidate_set = blocker.block_tables(ltable, rtable, 'address', 'address')
    pred_table = debug_blocker(ltable, rtable, candidate_set)
    # The parenthesised single-argument form prints the same text under the
    # Python 2 print statement and the Python 3 print function; the bare
    # statement form is a SyntaxError on Python 3.
    print(pred_table)
Esempio n. 3
0
def load_dataset(filename, key=None):
    """Load one of the CSV datasets stored under the install path.

    Args:
        filename: Dataset name without the '.csv' extension; the file is
            looked up in the 'datasets' directory under get_install_path().
        key: Name of the key attribute handed to read_csv. For the
            'table_A' and 'table_B' datasets it is always set to 'ID'.

    Returns:
        Whatever read_csv returns for the resolved path.
    """
    path = os.sep.join([get_install_path(), 'datasets', filename + '.csv'])
    # The previous condition, `filename is 'table_A' or 'table_B'`, was always
    # true: the non-empty string 'table_B' is truthy, so the caller's key was
    # overwritten with 'ID' for every dataset. It also compared strings by
    # identity instead of by value.
    if filename in ('table_A', 'table_B'):
        key = 'ID'
    return read_csv(path, key=key)
Esempio n. 4
0
def load_dataset(filename, key=None):
    """Load one of the CSV datasets stored under the install path.

    Args:
        filename: Dataset name without the '.csv' extension; the file is
            looked up in the 'datasets' directory under get_install_path().
        key: Name of the key attribute handed to read_csv. For the
            'table_A' and 'table_B' datasets it is always set to 'ID'.

    Returns:
        Whatever read_csv returns for the resolved path.
    """
    path = os.sep.join([get_install_path(), 'datasets', filename + '.csv'])
    # The previous condition, `filename is 'table_A' or 'table_B'`, was always
    # true: the non-empty string 'table_B' is truthy, so the caller's key was
    # overwritten with 'ID' for every dataset. It also compared strings by
    # identity instead of by value.
    if filename in ('table_A', 'table_B'):
        key = 'ID'
    return read_csv(path, key=key)
def main():
    """Run the iterative top-K debug blocker on the books_test tables.

    Reads the bowker and walmart CSVs from '../datasets/books_test/' (key
    'id') and calls iterative_topK_debug_blocker with them and a freshly
    constructed MTable() as the candidate set.
    """
    # Table pairs used with this driver in other runs (key='id' unless noted):
    #   ../datasets/magellan_builtin_test/table_A.csv, table_B.csv (key='ID')
    #   ../datasets/books_full/bowker_final_custom_id.csv,
    #       walmart_final_custom_id.csv
    #   ../datasets/products_test/walmart_final_custom_id_lowercase.csv,
    #       amazon_final_custom_id_python_lowercase.csv
    #   ../datasets/books/BN.csv, amazon_books.csv
    #   ../datasets/books/BN_small.csv, amazon_books_small.csv
    # An attribute-equivalence blocker on 'pubYear' was the earlier source of
    # the candidate set:
    #   blocker = mg.AttrEquivalenceBlocker()
    #   candidate_set = blocker.block_tables(ltable, rtable, 'pubYear', 'pubYear')
    ltable = mg.read_csv(
        '../datasets/books_test/bowker_final_custom_id.csv', key='id')
    rtable = mg.read_csv(
        '../datasets/books_test/walmart_final_custom_id.csv', key='id')

    # No blocker output is supplied here: the candidate set is a new MTable().
    candidate_set = MTable()

    # The returned table is bound but not used any further in this function.
    pred_table = iterative_topK_debug_blocker(
        ltable, rtable, candidate_set)
Esempio n. 6
0
import sys
import magellan as mg
from magellan.gui.debug_gui_base import vis_debug_dt

# NOTE(review): this entry is appended after `import magellan` has already
# executed, so it cannot affect that import — confirm it is still needed.
sys.path.append('/Users/Pradap/Documents/Research/Python-Package/enrique/')
mg.init_jvm()

# Load the two bundled tables and block them on equal 'zipcode'.
# NOTE(review): the two ['name'] lists are presumably the attributes to carry
# over from the left/right table — confirm against block_tables.
A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode', ['name'], ['name'])

# The labelled pairs come from 'label_demo.csv'; C is not used below.
L = mg.read_csv('label_demo.csv', ltable=A, rtable=B)
feat_table = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(L, feature_table=feat_table, attrs_after='gold')
# NOTE(review): 8 and 7 are passed positionally; their meaning is not visible
# here — check train_test_split's signature.
S = mg.train_test_split(G, 8, 7)

# Fit a decision-tree matcher on the training part, predict on the test part
# (appending a 'predicted' column) and evaluate against the 'gold' labels.
dt = mg.DTMatcher(name='DecisionTree')
dt.fit(table=S['train'], exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'], target_attr='gold')
dt.predict(table=S['test'], exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'], target_attr='predicted',
           append=True)
d = mg.eval_matches(S['test'], 'gold', 'predicted')

# Open the decision-tree debugging GUI on the evaluation result.
vis_debug_dt(dt, d, S['test'], exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'], feat_table=feat_table)
# Call form prints the same text on Python 2 and is valid on Python 3.
print("Hi")
Esempio n. 7
0
import pandas as pd
import magellan as mg

import sys
# Raw string: the backslashes in this Windows path are literal. The value is
# identical to the original (none of \P, \R, \S, \m is an escape sequence),
# but the raw form no longer relies on unrecognised escapes being kept.
# NOTE(review): `import magellan` has already run above this line, so the
# appended path cannot affect that import — confirm it is still needed.
sys.path.append(r'C:\Pradap\Research\Python-work\Saranam\magellan')

A = mg.read_csv('../magellan/data/toy/table_A.csv', key='ID')
B = mg.read_csv('../magellan/data/toy/table_B.csv', key='ID')

# Block the toy tables on equal 'zipcode'.
blocker = mg.AttrEquivalenceBlocker()
C = blocker.block_tables(A, B, 'zipcode', 'zipcode', ['name', 'address', 'hourly_wage'], ['name', 'address', 'hourly_wage'])


# Combine the candidate set with itself, then sample from the combined table.
D = mg.block_union_combine([C, C])

# NOTE(review): 10 is presumably the sample size — confirm.
S = mg.sample_one_table(D, 10)

#L = mg.label(S, 'gold_label')

#print mg._m_global_tokenizers
#print mg._m_global_sim_fns

# Print the available tokenizers, similarity functions, attribute
# correspondences and attribute types. The single-argument call form prints
# the same text on Python 2 and is valid on Python 3.
t = mg.get_single_arg_tokenizers()
print(t)
s = mg.get_sim_funs()
print(s)
corres = mg.get_attr_corres(A, B)
print(corres['corres'])
t_1 = mg.get_attr_types(A)
print(t_1)
t_2 = mg.get_attr_types(B)
Esempio n. 8
0
import sys
import magellan as mg
from magellan.gui.debug_gui_base import vis_debug_dt

# NOTE(review): this entry is appended after `import magellan` has already
# executed, so it cannot affect that import — confirm it is still needed.
sys.path.append('/Users/Pradap/Documents/Research/Python-Package/enrique/')
mg.init_jvm()

# Load the two bundled tables and block them on equal 'zipcode'.
# NOTE(review): the two ['name'] lists are presumably the attributes to carry
# over from the left/right table — confirm against block_tables.
A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode', ['name'], ['name'])

# The labelled pairs come from 'label_demo.csv'; C is not used below.
L = mg.read_csv('label_demo.csv', ltable=A, rtable=B)
feat_table = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(L, feature_table=feat_table, attrs_after='gold')
# NOTE(review): 8 and 7 are passed positionally; their meaning is not visible
# here — check train_test_split's signature.
S = mg.train_test_split(G, 8, 7)

# Fit a decision-tree matcher on the training part, predict on the test part
# (appending a 'predicted' column) and evaluate against the 'gold' labels.
dt = mg.DTMatcher(name='DecisionTree')
dt.fit(table=S['train'],
       exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
       target_attr='gold')
dt.predict(table=S['test'],
           exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
           target_attr='predicted',
           append=True)
d = mg.eval_matches(S['test'], 'gold', 'predicted')

# Open the decision-tree debugging GUI on the evaluation result.
vis_debug_dt(dt,
             d,
             S['test'],
             exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
             feat_table=feat_table)
# Call form prints the same text on Python 2 and is valid on Python 3.
print("Hi")
Esempio n. 9
0
from magellan.evaluation.matcher_and_trigger_crossvalidation import cv_matcher_and_trigger
import magellan as mg
import pandas as pd

mg.init_jvm()

# Read the walmart and bowker books tables, keeping isbn/pages/volume/
# editionNum as strings. The builtin `str` is the object the former
# `pd.np.str` alias resolved to; `pandas.np` and `numpy.str` have both been
# removed upstream, so the builtin is used directly.
# Read walmart books data
wal = mg.read_csv(mg.get_install_path() + '/datasets/books/walmart.csv',
                  dtype={'isbn': str, 'pages': str, 'volume': str, 'editionNum': str},
                  low_memory=False, key='id')
# Read bowker books data
bwk = mg.read_csv(mg.get_install_path() + '/datasets/books/bowker.csv',
                  dtype={'isbn': str, 'pages': str, 'volume': str, 'editionNum': str},
                  low_memory=False, key='id')

L = mg.read_csv('label_ab_correct_books.csv', ltable=wal, rtable=bwk)
feature_table = mg.get_features_for_matching(wal, bwk)
# Keep only five rows of the generated feature table.
# NOTE(review): `.ix` was removed in pandas 1.0; switch to `.loc` or `.iloc`
# once the index type of the feature table is confirmed.
f = feature_table.ix[[3, 7, 18, 26, 53]]
m = mg.DTMatcher()


# feature_table = mg.get_features_for_matching(A, B)
# Feature vectors are extracted with the reduced table `f`, then missing
# values are imputed with the 'most_frequent' strategy.
G = mg.extract_feature_vecs(L, feature_table=f, attrs_after='gold')
G = mg.impute_table(G, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'], strategy='most_frequent')
# m = mg.LinRegMatcher()
# print G
# Trigger setup: one rule on author_author_lev == 1 with condition status
# True and action 1. The rule is resolved against the full `feature_table`,
# not the reduced `f`.
pos_trigger = mg.MatchTrigger()
pos_trigger.add_cond_rule('author_author_lev(ltuple, rtuple) == 1',
                          feature_table=feature_table)
pos_trigger.add_cond_status(True)
pos_trigger.add_action(1)
Esempio n. 10
0
from magellan.evaluation.matcher_and_trigger_crossvalidation import cv_matcher_and_trigger
import magellan as mg
# Load the two bundled tables and block them on equal 'zipcode'.
# C is only referenced by the commented-out labelling step below.
A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode', ['name'], ['name'])
mg.init_jvm()
# One-off labelling step that produced 'label.csv':
#L = mg.label_table(C, 'gold')
#L.to_csv('label.csv')
L = mg.read_csv('label.csv', ltable=A, rtable=B)
feature_table = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(L, feature_table=feature_table, attrs_after='gold')
m = mg.LinRegMatcher()
# Cross-validate the matcher with k=5, scored on precision.
# NOTE(review): the second positional argument (None) is presumably the
# trigger — confirm against cv_matcher_and_trigger's signature.
t = cv_matcher_and_trigger(
    m,
    None,
    table=G,
    exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
    target_attr='gold',
    k=5,
    metric='precision',
    random_state=0)

# Same table and split settings through select_matcher, scored on f1.
res = mg.select_matcher(
    [m],
    table=G,
    exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
    target_attr='gold',
    k=5,
    metric='f1',
    random_state=0)
Esempio n. 11
0
import magellan as mg
import pandas as pd
mg.init_jvm()

# Load the two bundled tables.
A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')
mg.init_jvm()
# Original author's note: "not reqd".
# NOTE(review): unclear whether that refers to the repeated init_jvm() call
# above or to the blocking step below (C is not used in the visible code).
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode', ['name', 'address'], ['name', 'address'])

# Labelled pairs are read from disk; three rows of the generated feature
# table are kept for feature-vector extraction.
L = mg.read_csv('label_ab_correct_labels.csv', ltable=A, rtable=B)
feat_table = mg.get_features_for_matching(A, B)
# NOTE(review): `.ix` was removed in pandas 1.0 — replace with `.loc`/`.iloc`
# once the feature table's index type is confirmed.
f = feat_table.ix[[9, 10, 17]]
G = mg.extract_feature_vecs(L, feature_table=f, attrs_after='gold')

# Fit the decision-tree matcher on G and predict on the same table G,
# requesting a 'predicted' column (append=True, inplace=False).
dt = mg.DTMatcher()
dt.fit(table=G,  exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'], target_attr='gold')
t = dt.predict(table=G, exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'], append=True, inplace=False,
               target_attr='predicted')


# ret_val, node_list = mg.vis_tuple_debug_dt_matcher(dt, G.ix[0],
#                                   exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'], ensemble_flag=False)


# print ret_val
# print node_list


Esempio n. 12
0
import sys
# Make the local checkout importable before magellan is imported.
sys.path.append('/Users/pradap/Documents/Research/Python-Package/enrique/')
import magellan as mg
import pandas as pd
mg.init_jvm()

# Read the walmart and bowker books tables, keeping isbn/pages/volume/
# editionNum as strings. The builtin `str` is the object the former
# `pd.np.str` alias resolved to; `pandas.np` and `numpy.str` have both been
# removed upstream, so the builtin is used directly.
wal = mg.read_csv(mg.get_install_path() + '/datasets/books/walmart.csv',
                  dtype={'isbn': str, 'pages': str, 'volume': str, 'editionNum': str},
                  low_memory=False, key='id')

bwk = mg.read_csv(mg.get_install_path() + '/datasets/books/bowker.csv',
                  dtype={'isbn': str, 'pages': str, 'volume': str, 'editionNum': str},
                  low_memory=False, key='id')

# Block on equal 'isbn'. C is not used below; the labelled pairs are read
# from 'label_ab_correct_books.csv' instead.
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(wal, bwk, 'isbn', 'isbn', ['title', 'author'], ['title', 'author'])


L = mg.read_csv('label_ab_correct_books.csv', ltable=wal, rtable=bwk)

# Call form prints the same text on Python 2 and is valid on Python 3.
print(len(L))

feat_table = mg.get_features_for_matching(wal, bwk)


# Keep only five rows of the generated feature table.
# NOTE(review): `.ix` was removed in pandas 1.0; switch to `.loc` or `.iloc`
# once the index type of the feature table is confirmed.
f = feat_table.ix[[3, 7, 18, 26, 53]]

G = mg.extract_feature_vecs(L, feature_table=f, attrs_after='gold')

dt = mg.DTMatcher()
svm = mg.SVMMatcher()
Esempio n. 13
0
import magellan as mg
import pandas as pd
mg.init_jvm()

# Load the two bundled tables.
A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')
mg.init_jvm()
# Original author's note: "not reqd".
# NOTE(review): unclear whether that refers to the repeated init_jvm() call
# above or to the blocking step below (C is not used in the visible code).
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode', ['name', 'address'],
                    ['name', 'address'])

# Labelled pairs are read from disk; three rows of the generated feature
# table are kept for feature-vector extraction.
L = mg.read_csv('label_ab_correct_labels.csv', ltable=A, rtable=B)
feat_table = mg.get_features_for_matching(A, B)
# NOTE(review): `.ix` was removed in pandas 1.0 — replace with `.loc`/`.iloc`
# once the feature table's index type is confirmed.
f = feat_table.ix[[9, 10, 17]]
G = mg.extract_feature_vecs(L, feature_table=f, attrs_after='gold')

# Fit the decision-tree matcher on G and predict on the same table G,
# requesting a 'predicted' column (append=True, inplace=False).
dt = mg.DTMatcher()
dt.fit(table=G,
       exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
       target_attr='gold')
t = dt.predict(table=G,
               exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
               append=True,
               inplace=False,
               target_attr='predicted')

# ret_val, node_list = mg.vis_tuple_debug_dt_matcher(dt, G.ix[0],
#                                   exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'], ensemble_flag=False)

# print ret_val