コード例 #1
0
 def setup(self):
     """Load the bikes example tables and block them on ``city_posted``.

     Reads ``A.csv`` and ``B.csv`` from the bikes example dataset under the
     magellan install path, sets ``id`` as the key of each table and runs an
     ``AttrEquivalenceBlocker`` over the pair.  The tables, the blocker and
     the blocker output are stored on ``self`` as ``A``, ``B``, ``ab`` and
     ``C``; ``l_block_attr`` / ``r_block_attr`` are set to ``model_year``.
     """
     bikes_dir = [mg.get_install_path(), 'datasets', 'example_datasets',
                  'bikes']
     key_attr = 'id'

     self.A = mg.read_csv_metadata(os.sep.join(bikes_dir + ['A.csv']))
     mg.set_key(self.A, key_attr)
     self.B = mg.read_csv_metadata(os.sep.join(bikes_dir + ['B.csv']))
     mg.set_key(self.B, key_attr)

     output_attrs = ('bike_name', 'city_posted', 'km_driven', 'price',
                     'color', 'model_year')
     self.ab = mg.AttrEquivalenceBlocker()
     # Each side receives its own list object, as in the original call.
     self.C = self.ab.block_tables(
         self.A, self.B,
         'city_posted', 'city_posted',
         list(output_attrs), list(output_attrs),
         verbose=False)

     self.l_block_attr = 'model_year'
     self.r_block_attr = 'model_year'
コード例 #2
0
 def setup(self):
     """Load the bikes example tables and prepare blocker arguments.

     Reads ``A.csv`` and ``B.csv`` from the bikes example dataset under the
     magellan install path and sets ``id`` as the key of each.  Stores the
     tables (``A``, ``B``), the ``city_posted`` blocking attribute names,
     the left/right output attribute lists and a fresh
     ``AttrEquivalenceBlocker`` (``ab``) on ``self``.
     """
     bikes_dir = [mg.get_install_path(), 'datasets', 'example_datasets',
                  'bikes']
     key_attr = 'id'

     self.A = mg.read_csv_metadata(os.sep.join(bikes_dir + ['A.csv']))
     mg.set_key(self.A, key_attr)
     self.B = mg.read_csv_metadata(os.sep.join(bikes_dir + ['B.csv']))
     mg.set_key(self.B, key_attr)

     self.l_block_attr = self.r_block_attr = 'city_posted'

     output_attrs = ('bike_name', 'city_posted', 'km_driven', 'price',
                     'color', 'model_year')
     # Two independent lists, one per side.
     self.l_output_attrs = list(output_attrs)
     self.r_output_attrs = list(output_attrs)
     self.ab = mg.AttrEquivalenceBlocker()
コード例 #3
0
import sys
#sys.path.append('/Users/pradap/Documents/Research/Python-Package/enrique')
#sys.path.append('/scratch/pradap/python-work/enrqiue')
import os
import magellan as mg
import jpype
# Paths to the two sample tables shipped under the magellan install path.
p = mg.get_install_path()
path_for_A = os.sep.join([p, 'datasets', 'table_A.csv'])
path_for_B = os.sep.join([p, 'datasets', 'table_B.csv'])

# Start the JVM.  Candidates are tried in this order:
#   1. the default JVM library path reported by jpype,
#   2. the same path with its 'client' / 'server' component swapped,
#   3. a path typed in by the user.
jvm_path = jpype.get_default_jvm_path()
if os.path.isfile(jvm_path):
    mg.init_jvm(jvm_path)
else:
    x = []
    for t in jvm_path.split(os.sep):
        if t == 'client':
            t = 'server'
        elif t == 'server':
            # Fix: this used to assign to an unrelated name (`r`), so a
            # 'server' component was never swapped and the alternate path
            # was identical to the one that had just failed.
            t = 'client'
        x.append(t)
    jp = os.sep.join(x)
    if os.path.isfile(jp):
        mg.init_jvm(jp)
    else:
        jp = raw_input('Give path to jvm library (i.e libjvm.so in linux) : ')
        if os.path.isfile(jp):
            mg.init_jvm(jp)
        else:
            # NOTE(review): the message says "exiting" but nothing in the
            # visible code terminates the script here -- confirm whether an
            # exit follows further down.
            print('Invalid path; cannot run tests; exiting')
コード例 #4
0
ファイル: test_vis_debug_dt.py プロジェクト: paulgc/magellan
# coding=utf-8
import os

import magellan as mg
from magellan.debugmatcher.debug_gui_decisiontree_matcher import _vis_debug_dt, \
    vis_tuple_debug_dt_matcher

# Locate the candidate-set CSV inside the packaged test datasets.
datasets_path = os.sep.join(
    (mg.get_install_path(), 'datasets', 'test_datasets'))
path_c = os.sep.join((datasets_path, 'C.csv'))

# Base tables keyed on 'ID', plus the candidate set linked to both of them.
A = mg.load_dataset('table_A', key='ID')
B = mg.load_dataset('table_B', key='ID')
C = mg.read_csv_metadata(path_c, ltable=A, rtable=B)

# Label column: seven 0s followed by eight 1s.
labels = [0] * 7 + [1] * 8
C['labels'] = labels

# Turn the labelled candidate set into feature vectors.
feature_table = mg.get_features_for_matching(A, B)
feature_vectors = mg.extract_feature_vecs(
    C, feature_table=feature_table, attrs_after='labels')

# Fit a decision-tree matcher, then open the debug view for the row
# selected by ``.ix[0]``.
dt = mg.DTMatcher()
dt.fit(table=feature_vectors,
       target_attr='labels',
       exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'])
vis_tuple_debug_dt_matcher(
    dt,
    feature_vectors.ix[0],
    exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'])

# feature_table = mg.get_features_for_matching(A, B)
#
# labels = [0]*7
# labels.extend([1]*8)
コード例 #5
0
ファイル: test_delfiles.py プロジェクト: paulgc/magellan
import os
import magellan as mg

from magellan.utils.generic_helper import del_files_in_dir
# Sandbox directory under the packaged test datasets; its path is handed to
# del_files_in_dir.
sandbox_parts = (mg.get_install_path(), 'datasets', 'test_datasets', 'sandbox')
p = os.sep.join(sandbox_parts)
del_files_in_dir(p)
コード例 #6
0
ファイル: mur_test2.py プロジェクト: Yashg19/enrique
from magellan.evaluation.matcher_and_trigger_crossvalidation import cv_matcher_and_trigger
import magellan as mg
import pandas as pd

mg.init_jvm()

# Columns that are read as strings in both book tables.
_str_columns = ('isbn', 'pages', 'volume', 'editionNum')

# Walmart books table.
wal = mg.read_csv(
    mg.get_install_path() + '/datasets/books/walmart.csv',
    dtype=dict.fromkeys(_str_columns, pd.np.str),
    low_memory=False,
    key='id')

# Bowker books table.
bwk = mg.read_csv(
    mg.get_install_path() + '/datasets/books/bowker.csv',
    dtype=dict.fromkeys(_str_columns, pd.np.str),
    low_memory=False,
    key='id')

# Labelled pairs linked to both tables, and the subset of generated
# features (rows 3, 7, 18, 26 and 53 of the feature table) used below.
L = mg.read_csv('label_ab_correct_books.csv', ltable=wal, rtable=bwk)
feature_table = mg.get_features_for_matching(wal, bwk)
f = feature_table.ix[[3, 7, 18, 26, 53]]
m = mg.DTMatcher()  # an earlier revision also tried mg.LinRegMatcher()

# Feature vectors for the labelled pairs, then imputation with the
# 'most_frequent' strategy over every column not listed in exclude_attrs.
G = mg.extract_feature_vecs(L, feature_table=f, attrs_after='gold')
G = mg.impute_table(
    G,
    exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
    strategy='most_frequent')

# Trigger: condition rule on the author Levenshtein feature being equal
# to 1, condition status True, action 1.
pos_trigger = mg.MatchTrigger()
pos_trigger.add_cond_rule(
    'author_author_lev(ltuple, rtuple) == 1',
    feature_table=feature_table)
pos_trigger.add_cond_status(True)
pos_trigger.add_action(1)
コード例 #7
0
import os
import magellan as mg

from magellan.utils.generic_helper import del_files_in_dir
# Sandbox directory under the packaged test datasets; its path is handed to
# del_files_in_dir.
sandbox_parts = (mg.get_install_path(), 'datasets', 'test_datasets', 'sandbox')
p = os.sep.join(sandbox_parts)

del_files_in_dir(p)
コード例 #8
0
import sys
sys.path.append('/Users/pradap/Documents/Research/Python-Package/enrique/')
import magellan as mg
import pandas as pd
mg.init_jvm()

# Columns that are read as strings in both book tables.
_str_columns = ('isbn', 'pages', 'volume', 'editionNum')

# Walmart and Bowker book tables, both keyed on 'id'.
wal = mg.read_csv(
    mg.get_install_path() + '/datasets/books/walmart.csv',
    dtype=dict.fromkeys(_str_columns, pd.np.str),
    low_memory=False,
    key='id')
bwk = mg.read_csv(
    mg.get_install_path() + '/datasets/books/bowker.csv',
    dtype=dict.fromkeys(_str_columns, pd.np.str),
    low_memory=False,
    key='id')

# Block the two tables on equal 'isbn', carrying title/author from each side.
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(
    wal, bwk,
    'isbn', 'isbn',
    ['title', 'author'], ['title', 'author'])

# Labelled pairs linked to both tables; report how many there are.
L = mg.read_csv('label_ab_correct_books.csv', ltable=wal, rtable=bwk)
print(len(L))

# Generate features and keep rows 3, 7, 18, 26 and 53 of the feature table.
feat_table = mg.get_features_for_matching(wal, bwk)
f = feat_table.ix[[3, 7, 18, 26, 53]]

G = mg.extract_feature_vecs(L, feature_table=f, attrs_after='gold')

# Matchers instantiated for the steps that follow.
dt = mg.DTMatcher()
svm = mg.SVMMatcher()
コード例 #9
0
ファイル: test_labeler.py プロジェクト: paulgc/magellan
import magellan as mg
import pandas as pd
import os
from PyQt4 import QtCore
# Directory holding the packaged test datasets.
datasets_path = os.sep.join(
    (mg.get_install_path(), 'datasets', 'test_datasets'))

path_a = os.sep.join((datasets_path, 'A.csv'))
path_b = os.sep.join((datasets_path, 'B.csv'))
path_c = os.sep.join((datasets_path, 'C.csv'))

# NOTE(review): A is read without an explicit key while B is given
# key='ID' -- confirm that this asymmetry is intended.
A = mg.read_csv_metadata(path_a)
B = mg.read_csv_metadata(path_b, key='ID')
C = mg.read_csv_metadata(path_c, ltable=A, rtable=B)

# Run the labeler on the candidate set with 'label' as the label argument,
# then show the returned table.
D = mg.label_table(C, 'label')

print(D)
# timer = QtCore.QTimer()
# timer.setInterval(2000) # 2 seconds
# mg._viewapp.loadFinished.connect(timer.start)
# timer.timeout.connect(mg._viewapp.quit)
コード例 #10
0
# coding=utf-8
import logging
import os

import magellan as mg

# Emit DEBUG-level log records while the script runs.
logging.basicConfig(level=logging.DEBUG)

# Input files of the 'matcherselector' test dataset.
datasets_path = os.sep.join(
    (mg.get_install_path(), 'datasets', 'test_datasets', 'matcherselector'))
path_a = os.sep.join((datasets_path, 'ACM_demo.csv'))
path_b = os.sep.join((datasets_path, 'DBLP_demo.csv'))
path_c = os.sep.join((datasets_path, 'dblp_acm_demo_labels.csv'))

A = mg.read_csv_metadata(path_a, key='id')
B = mg.read_csv_metadata(path_b, key='id')
# NOTE(review): the labelled set uses B as ltable and A as rtable, while the
# features below are generated for (A, B) -- confirm this ordering is
# intended.
C = mg.read_csv_metadata(
    path_c,
    key='_id',
    ltable=B,
    rtable=A,
    fk_ltable='ltable.id',
    fk_rtable='rtable.id')

feature_table = mg.get_features_for_matching(A, B)
feature_vectors = mg.extract_feature_vecs(
    C,
    feature_table=feature_table,
    attrs_after='gold',
    verbose=True)
# dtmatcher = mg.DTMatcher()
# nbmatcher = mg.NBMatcher()
# rfmatcher = mg.RFMatcher()
# svmmatcher = mg.SVMMatcher()
# linregmatcher = mg.LinRegMatcher()
# logregmatcher = mg.LogRegMatcher()