コード例 #1
0
 def setup(self):
     """Load the 'bikes' A/B tables, block them on city and model year,
     and register the black-box function used by the benchmark.

     Exits with SystemExit if the dataset is not installed locally.
     """
     # Removed an unused `p = mg.get_install_path()` local; the paths
     # are built from the module-level `datasets_path`.
     path_for_A = os.sep.join([datasets_path, 'bikes', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'bikes', 'B.csv'])
     l_output_attrs = [
         'bike_name', 'city_posted', 'km_driven', 'price', 'color',
         'model_year'
     ]
     r_output_attrs = [
         'bike_name', 'city_posted', 'km_driven', 'price', 'color',
         'model_year'
     ]
     try:
         A = mg.read_csv_metadata(path_for_A)
         mg.set_key(A, 'id')
         B = mg.read_csv_metadata(path_for_B)
         mg.set_key(B, 'id')
         # Block tables on city, then refine the candidate set on
         # model year; self.D is the input for the benchmarked call.
         C = ab.block_tables(A, B, 'city_posted', 'city_posted',
                             l_output_attrs, r_output_attrs)
         self.D = ab.block_candset(C, 'model_year', 'model_year')
         bb.set_black_box_function(_bikes_function)
     except AssertionError:
         print("Dataset \'bikes\' not found. Please visit the project "
               "website to download the dataset.")
         raise SystemExit
コード例 #2
0
ファイル: mur_test.py プロジェクト: kvpradap/dmagellan
def _get_stop_words():
    """Read the bundled stop-words file and return its entries as a set.

    NOTE(review): the file is opened in binary mode, so the returned set
    contains bytes objects (trailing whitespace stripped) — presumably
    the callers compare against bytes; confirm before switching to text
    mode.
    """
    install_path = em.get_install_path()
    # install_path/utils/stop_words.txt, built with the platform separator.
    stop_words_path = os.sep.join([install_path, 'utils', 'stop_words.txt'])
    with open(stop_words_path, "rb") as word_file:
        return {line.rstrip() for line in word_file}
コード例 #3
0
 def setup(self):
     """Load the 'electronics' A/B tables and register the black-box
     function used by the benchmark.

     Exits with SystemExit if the dataset is not installed locally.
     """
     # Removed an unused `p = mg.get_install_path()` local; the paths
     # are built from the module-level `datasets_path`.
     path_for_A = os.sep.join([datasets_path, 'electronics', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'electronics', 'B.csv'])
     try:
         # Read A inside the try block (previously it was outside) so a
         # missing A.csv is reported the same way as a missing B.csv,
         # matching the sibling benchmark fixtures in this project.
         self.A = mg.read_csv_metadata(path_for_A)
         mg.set_key(self.A, 'ID')
         self.B = mg.read_csv_metadata(path_for_B)
         mg.set_key(self.B, 'ID')
         self.l_output_attrs = ['Brand', 'Amazon_Price']
         self.r_output_attrs = ['Brand', 'Price']
         bb.set_black_box_function(_electronics_function)
     except AssertionError:
         print(
             "Dataset \'electronics\' not found. Please visit the project "
             "website to download the dataset.")
         raise SystemExit
コード例 #4
0
    def setup(self):
        """Load the 'bikes' A/B tables and record the blocking and
        output attribute names used by the benchmark.

        Exits with SystemExit if the dataset is not installed locally.
        """
        # Removed an unused `p = mg.get_install_path()` local; the paths
        # are built from the module-level `datasets_path`.
        path_for_A = os.sep.join([datasets_path, 'bikes', 'A.csv'])
        path_for_B = os.sep.join([datasets_path, 'bikes', 'B.csv'])
        try:
            self.A = mg.read_csv_metadata(path_for_A)
            mg.set_key(self.A, 'id')
            self.B = mg.read_csv_metadata(path_for_B)
            mg.set_key(self.B, 'id')
        except AssertionError:
            print("Dataset \'bikes\' not found. Please visit the project"
                  " website to download the dataset.")
            raise SystemExit

        self.l_block_attr = 'city_posted'
        self.r_block_attr = 'city_posted'
        self.l_output_attrs = ['bike_name', 'city_posted', 'km_driven', 'price',
                               'color', 'model_year']
        self.r_output_attrs = ['bike_name', 'city_posted', 'km_driven', 'price',
                               'color', 'model_year']
コード例 #5
0
    def setup(self):
        """Load the 'bikes' A/B tables and record the blocking and
        output attribute names used by the benchmark.

        Exits with SystemExit if the dataset is not installed locally.
        """
        # Removed an unused `p = mg.get_install_path()` local; the paths
        # are built from the module-level `datasets_path`.
        path_for_A = os.sep.join([datasets_path, 'bikes', 'A.csv'])
        path_for_B = os.sep.join([datasets_path, 'bikes', 'B.csv'])
        try:
            self.A = mg.read_csv_metadata(path_for_A)
            mg.set_key(self.A, 'id')
            self.B = mg.read_csv_metadata(path_for_B)
            mg.set_key(self.B, 'id')
        except AssertionError:
            print("Dataset \'bikes\' not found. Please visit the project"
                  " website to download the dataset.")
            raise SystemExit

        self.l_block_attr = 'city_posted'
        self.r_block_attr = 'city_posted'
        self.l_output_attrs = [
            'bike_name', 'city_posted', 'km_driven', 'price', 'color',
            'model_year'
        ]
        self.r_output_attrs = [
            'bike_name', 'city_posted', 'km_driven', 'price', 'color',
            'model_year'
        ]
コード例 #6
0
import os
from nose.tools import *
import pandas as pd
import unittest

import py_entitymatching as em

# Shared fixtures for the attribute-equivalence blocker tests: locations
# of the bundled test tables plus the blocking/output attribute names
# and the expected candidate-set id pairs.
p = em.get_install_path()
path_a = os.sep.join([p, 'tests', 'test_datasets', 'A.csv'])
path_b = os.sep.join([p, 'tests', 'test_datasets', 'B.csv'])
l_block_attr_1 = 'zipcode'
l_block_attr_2 = 'birth_year'
l_block_attr_3 = 'name'
r_block_attr_1 = 'zipcode'
r_block_attr_2 = 'birth_year'
r_block_attr_3 = 'name'
l_output_attrs = ['zipcode', 'birth_year']
r_output_attrs = ['zipcode', 'birth_year']
l_output_prefix = 'l_'
r_output_prefix = 'r_'

# attribute equivalence on [l|r]_block_attr_1
expected_ids_1 = [('a1', 'b1'), ('a1', 'b2'), ('a1', 'b6'),
                  ('a2', 'b3'), ('a2', 'b4'), ('a2', 'b5'),
                  ('a3', 'b1'), ('a3', 'b2'), ('a3', 'b6'),
                  ('a4', 'b3'), ('a4', 'b4'), ('a4', 'b5'),
                  ('a5', 'b3'), ('a5', 'b4'), ('a5', 'b5')]

# attribute equivalence on [l|r]_block_attr_1 \intersection [l|r]_block_attr_2
expected_ids_2 = [('a2', 'b3'), ('a3', 'b2'), ('a5', 'b5')]
コード例 #7
0
import os
from nose.tools import *
import pandas as pd
import unittest

import py_entitymatching as em

# Shared fixtures for the overlap blocker tests: locations of the
# bundled test tables, the overlap attributes, and the expected
# candidate-set id pairs for several overlap configurations.
p = em.get_install_path()
path_a = os.sep.join([p, 'tests', 'test_datasets', 'A.csv'])
path_b = os.sep.join([p, 'tests', 'test_datasets', 'B.csv'])
l_overlap_attr_1 = 'name'
l_overlap_attr_2 = 'address'
r_overlap_attr_1 = 'name'
r_overlap_attr_2 = 'address'
l_output_attrs = ['name', 'address']
r_output_attrs = ['name', 'address']
l_output_prefix = 'l_'
r_output_prefix = 'r_'

# overlap on [r,l]_overlap_attr_1 with overlap_size=1
expected_ids_1 = [('a2', 'b3'), ('a2', 'b6'), ('a3', 'b2'), ('a5', 'b5')]

# overlap on [r,l]_overlap_attr_2 with overlap_size=4
expected_ids_2 = [('a2', 'b3'), ('a3', 'b2')]

# overlap on birth_year with q_val=3, overlap_size=2 (no padding) =6 (padding)
expected_ids_3 = [('a2', 'b3'), ('a3', 'b2'), ('a4', 'b1'), ('a4', 'b6'),
                  ('a5', 'b5')]

# block tables on [l|r]_overlap_attr_2, block candset on [l|r]overlap_attr_3
expected_ids_2_and_3 = [('a2', 'b3'), ('a3', 'b2')]
コード例 #8
0
# Write the benchmarking functions here.
# See "Writing benchmarks" in the asv docs for more information.

import os
import sys

import py_entitymatching as mg

# Resolve the example datasets shipped with py_entitymatching and share
# one AttrEquivalenceBlocker instance across the benchmarks below.
p = mg.get_install_path()
datasets_path = os.sep.join([p, 'datasets', 'example_datasets'])
ab = mg.AttrEquivalenceBlocker()


class TimeBlockTablesAnime:
    """Benchmark fixture: loads the 'anime' A/B tables for block_tables."""

    def setup(self):
        """Read both tables, set their keys, and record the blocking and
        output attribute names.

        Exits with SystemExit if the dataset is not installed locally.
        """
        self.l_block_attr = 'Year'
        self.r_block_attr = 'Year'
        self.l_output_attrs = ['Title', 'Year', 'Episodes']
        self.r_output_attrs = ['Title', 'Year', 'Episodes']
        try:
            # Load A and B the same way; read_csv_metadata raises
            # AssertionError when a file is missing.
            loaded = []
            for csv_name in ('A.csv', 'B.csv'):
                table_path = os.sep.join([datasets_path, 'anime', csv_name])
                table = mg.read_csv_metadata(table_path)
                mg.set_key(table, 'ID')
                loaded.append(table)
            self.A, self.B = loaded
        except AssertionError:
            print("Dataset \'anime\' not found. Please visit the project"
                  " website to download the dataset.")
            raise SystemExit
コード例 #9
0
import os

from dask import get
from dmagellan.feature.extractfeatures import extract_feature_vecs
from dmagellan.feature.autofeaturegen import get_features_for_matching
from dmagellan.matcher.dtmatcher import DTMatcher
from dmagellan.matcher.svmmatcher import SVMMatcher
from dmagellan.matcher.rfmatcher import RFMatcher
from dmagellan.matcher.logregmatcher import LogRegMatcher
from dmagellan.matcher.nbmatcher import NBMatcher
from dmagellan.matcher.linregmatcher import LinRegMatcher

from dmagellan.mlmatcherselection.mlmatcherselection import select_matcher

# `em` was referenced below but never imported in this snippet;
# py_entitymatching provides get_install_path/read_csv_metadata under
# this conventional alias.
import py_entitymatching as em

# Get the datasets directory shipped with py_entitymatching
datasets_dir = em.get_install_path() + os.sep + 'datasets'

path_A = datasets_dir + os.sep + 'dblp_demo.csv'
path_B = datasets_dir + os.sep + 'acm_demo.csv'
path_labeled_data = datasets_dir + os.sep + 'labeled_data_demo.csv'

# Read the two input tables; 'id' is the key column in both.
A = em.read_csv_metadata(path_A, key='id')
B = em.read_csv_metadata(path_B, key='id')
# Load the pre-labeled data, wiring its foreign keys back to A and B
S = em.read_csv_metadata(path_labeled_data,
                         key='_id',
                         ltable=A,
                         rtable=B,
                         fk_ltable='ltable_id',
                         fk_rtable='rtable_id')
# Split S into I and J
コード例 #10
0
ファイル: running_magellan.py プロジェクト: JRWu/cs848w20
#!/bin/python
# Invoke this script from /root

import sys
sys.path.append('/magellan/py_entitymatching/py_entitymatching/')

import py_entitymatching as em
import pandas as pd
import os

# Build the dataset paths with os.path.join instead of mixing os.sep
# concatenation with a hard-coded '/' separator, and call
# get_install_path() once instead of twice.
_restaurants_dir = os.path.join(em.get_install_path(), 'datasets',
                                'end-to-end', 'restaurants')
path_A = os.path.join(_restaurants_dir, 'fodors.csv')
path_B = os.path.join(_restaurants_dir, 'zagats.csv')

# 'id' is the key column in both restaurant tables.
A = em.read_csv_metadata(path_A, key='id')
B = em.read_csv_metadata(path_B, key='id')

print('Number of tuples in A: ' + str(len(A)))
print('Number of tuples in B: ' + str(len(B)))
print('Number of tuples in A X B (i.e the cartesian product): ' +
      str(len(A) * len(B)))

# Overlap blocking on the restaurant name: a pair survives when the two
# names share at least one token (overlap_size=1).
ob = em.OverlapBlocker()
C = ob.block_tables(A,
                    B,
                    'name',
                    'name',
                    l_output_attrs=['name', 'addr', 'city', 'phone'],
                    r_output_attrs=['name', 'addr', 'city', 'phone'],
                    overlap_size=1,
                    show_progress=False)
コード例 #11
0
# Write the benchmarking functions here.                                        
# See "Writing benchmarks" in the asv docs for more information.

import os
import sys

import py_entitymatching  as mg

# Resolve the example datasets shipped with py_entitymatching and share
# one SortedNeighborhoodBlocker instance across the benchmarks below.
p = mg.get_install_path()
datasets_path = os.sep.join([p, 'datasets', 'example_datasets'])
snb = mg.SortedNeighborhoodBlocker()


class TimeBlockTablesAnime:
    """Benchmark fixture: loads the 'anime' A/B tables for the
    SortedNeighborhoodBlocker block_tables benchmarks."""

    def setup(self):
        """Read both tables, set their keys, and record the blocking and
        output attribute names.

        Exits with SystemExit if the dataset is not installed locally.
        """
        anime_dir = os.sep.join([datasets_path, 'anime'])
        path_for_A = os.sep.join([anime_dir, 'A.csv'])
        path_for_B = os.sep.join([anime_dir, 'B.csv'])
        self.l_block_attr = 'Year'
        self.r_block_attr = 'Year'
        self.l_output_attrs = ['Title', 'Year', 'Episodes']
        self.r_output_attrs = ['Title', 'Year', 'Episodes']
        try:
            # read_csv_metadata raises AssertionError when a file is missing.
            self.A = mg.read_csv_metadata(path_for_A)
            mg.set_key(self.A, 'ID')
            self.B = mg.read_csv_metadata(path_for_B)
            mg.set_key(self.B, 'ID')
        except AssertionError:
            print("Dataset \'anime\' not found. Please visit the project"
                  " website to download the dataset.")
            raise SystemExit