def _get_stop_words():
    """Load the bundled stop-word list and return it as a set.

    Reads ``<install_path>/utils/stop_words.txt`` in binary mode, so the
    returned set contains ``bytes`` entries, one per line, with trailing
    whitespace stripped.
    """
    root = get_install_path()
    words_path = os.sep.join([root, 'utils', 'stop_words.txt'])
    with open(words_path, "rb") as words_file:
        return {entry.rstrip() for entry in words_file}
# Tests driving the random-forest matcher visual debugger entry points.
# NOTE(review): `os` and `unittest` are used below but not imported in this
# chunk — presumably imported earlier in the full file; confirm.
import pandas as pd
import py_entitymatching.catalog.catalog_manager as cm
import py_entitymatching.matcher.matcherutils as mu
from py_entitymatching.debugmatcher.debug_gui_randomforest_matcher import _vis_debug_rf, \
    vis_tuple_debug_rf_matcher
from py_entitymatching.debugmatcher.debug_randomforest_matcher import debug_randomforest_matcher
from py_entitymatching.feature.autofeaturegen import get_features_for_matching
from py_entitymatching.feature.extractfeatures import extract_feature_vecs
from py_entitymatching.io.parsers import read_csv_metadata
from py_entitymatching.matcher.rfmatcher import RFMatcher
from py_entitymatching.utils.generic_helper import get_install_path

# Paths to the CSV fixture tables shipped under the installed package.
datasets_path = os.sep.join([get_install_path(), 'datasets', 'test_datasets'])
path_a = os.sep.join([datasets_path, 'A.csv'])
path_b = os.sep.join([datasets_path, 'B.csv'])
path_c = os.sep.join([datasets_path, 'C.csv'])


class VisRFDebugMatcherTestCases(unittest.TestCase):
    """Test cases for the RF-matcher visual debugger."""

    def setUp(self):
        # Start each test with an empty metadata catalog.
        cm.del_catalog()

    def tearDown(self):
        # Leave no catalog state behind for the next test.
        cm.del_catalog()

    def test_vis_debug_matcher_rf_valid_1(self):
        # NOTE(review): this method appears truncated in this view — only the
        # two table loads are visible; the debugger invocation presumably
        # follows in the full file.
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
# Tests for select_matcher across the bundled ML matcher types.
# NOTE(review): `os` and `unittest` are used below but not imported in this
# chunk — presumably imported earlier in the full file; confirm.
import six
from py_entitymatching.utils.generic_helper import get_install_path, list_diff
from py_entitymatching.io.parsers import read_csv_metadata
from py_entitymatching.matcherselector.mlmatcherselection import select_matcher
from py_entitymatching.matcher.dtmatcher import DTMatcher
from py_entitymatching.matcher.linregmatcher import LinRegMatcher
from py_entitymatching.matcher.logregmatcher import LogRegMatcher
from py_entitymatching.matcher.nbmatcher import NBMatcher
from py_entitymatching.matcher.rfmatcher import RFMatcher
from py_entitymatching.matcher.svmmatcher import SVMMatcher
import py_entitymatching.catalog.catalog_manager as cm

# Fixture tables for matcher selection (DBLP/ACM demo data plus labels and
# precomputed feature vectors).
datasets_path = os.sep.join(
    [get_install_path(), 'tests', 'test_datasets', 'matcherselector'])
path_a = os.sep.join([datasets_path, 'DBLP_demo.csv'])
path_b = os.sep.join([datasets_path, 'ACM_demo.csv'])
path_c = os.sep.join([datasets_path, 'dblp_acm_demo_labels.csv'])
path_f = os.sep.join([datasets_path, 'feat_vecs.csv'])


class MLMatcherSelectionTestCases(unittest.TestCase):
    """Test cases for select_matcher."""

    def setUp(self):
        # Start each test with an empty metadata catalog.
        cm.del_catalog()

    def tearDown(self):
        cm.del_catalog()

    # @nottest
    def test_select_matcher_valid_1(self):
        # NOTE(review): method body not visible — this chunk is truncated here.
# Shared fixtures for auto feature-generation tests.
# NOTE(review): `os` and `unittest` are used below but not imported in this
# chunk — presumably imported earlier in the full file; confirm.
import pandas as pd
import six
from contextlib import contextmanager
from py_entitymatching.utils.generic_helper import get_install_path
from py_entitymatching.io.parsers import read_csv_metadata
from py_entitymatching.feature.simfunctions import get_sim_funs_for_matching
from py_entitymatching.feature.tokenizers import get_tokenizers_for_matching
import py_entitymatching.feature.autofeaturegen as afg
import py_entitymatching.feature.attributeutils as au
import py_entitymatching.catalog.catalog_manager as cm
import py_entitymatching.feature.simfunctions as simfuncs
import py_entitymatching.feature.tokenizers as toks

# Locations of the CSV fixture tables shipped with the test suite.
datasets_path = os.sep.join([get_install_path(), 'tests', 'test_datasets'])
bc_datasets_path = os.sep.join([get_install_path(), 'tests', 'test_datasets',
                                'blockercombiner'])
path_a = os.sep.join([datasets_path, 'A.csv'])
path_b = os.sep.join([datasets_path, 'B.csv'])
path_c = os.sep.join([datasets_path, 'C.csv'])


# proxy for user input. Since we use six, the way to proxy the input is to use
# the moves module. Specifically, for input we had to replace input with a
# function that will return the desired output
# NOTE(review): not exception-safe — if the `with` body raises, the original
# six.moves.input is never restored and the mock leaks into later tests.
@contextmanager
def mockInput(mock):
    original_input = six.moves.input
    six.moves.input = lambda _: mock
    yield
    six.moves.input = original_input


class AutoFeatureGenerationTestCases(unittest.TestCase):
    # NOTE(review): class body not visible — this chunk is truncated here.
# coding=utf-8 import sys import py_entitymatching import os from nose.tools import * import unittest import pandas as pd import six from py_entitymatching.utils.generic_helper import get_install_path from py_entitymatching.sampler.down_sample import _inv_index, _probe_index, down_sample, _get_str_cols_list import py_entitymatching.catalog.catalog_manager as cm from py_entitymatching.io.parsers import read_csv_metadata datasets_path = os.sep.join([get_install_path(), 'tests', 'test_datasets']) path_a = os.sep.join([datasets_path, 'restA.csv']) path_b = os.sep.join([datasets_path, 'restB.csv']) class DownSampleTestCases(unittest.TestCase): def setUp(self): self.A = read_csv_metadata(path_a, key='ID') self.B = read_csv_metadata(path_b, key='ID') def tearDown(self): del self.A del self.B def test_down_sample_table_valid_1(self): C, D = down_sample(self.A, self.B, 100, 10) self.assertEqual(len(D), 100)
# Shared fixtures for auto feature-generation tests: fixture table paths and
# a helper that mocks interactive input via six.moves.
import os  # os.sep is used below; not imported in this chunk, re-import is harmless
import pandas as pd
import six
from contextlib import contextmanager
from py_entitymatching.utils.generic_helper import get_install_path
from py_entitymatching.io.parsers import read_csv_metadata
from py_entitymatching.feature.simfunctions import get_sim_funs_for_matching
from py_entitymatching.feature.tokenizers import get_tokenizers_for_matching
import py_entitymatching.feature.autofeaturegen as afg
import py_entitymatching.feature.attributeutils as au
import py_entitymatching.catalog.catalog_manager as cm
import py_entitymatching.feature.simfunctions as simfuncs
import py_entitymatching.feature.tokenizers as toks

# Locations of the CSV fixture tables shipped with the test suite.
datasets_path = os.sep.join([get_install_path(), 'tests', 'test_datasets'])
bc_datasets_path = os.sep.join(
    [get_install_path(), 'tests', 'test_datasets', 'blockercombiner'])
path_a = os.sep.join([datasets_path, 'A.csv'])
path_b = os.sep.join([datasets_path, 'B.csv'])
path_c = os.sep.join([datasets_path, 'C.csv'])


# proxy for user input. Since we use six, the way to proxy the input is to use
# the moves module. Specifically, for input we had to replace input with a
# function that will return the desired output.
@contextmanager
def mockInput(mock):
    """Context manager that makes ``six.moves.input`` return ``mock``.

    Fix over the original: the real input function is restored in a
    ``finally`` clause, so an exception raised inside the ``with`` body no
    longer leaves the mocked input installed for subsequent tests.
    """
    original_input = six.moves.input
    six.moves.input = lambda _: mock
    try:
        yield
    finally:
        six.moves.input = original_input
# Tests for the debugblocker module.
# NOTE(review): `os` and `unittest` are used below but not imported in this
# chunk — presumably imported earlier in the full file; confirm.
import py_entitymatching as em
from py_entitymatching.utils.generic_helper import get_install_path
import py_entitymatching.catalog.catalog_manager as cm
import py_entitymatching.utils.catalog_helper as ch
from py_entitymatching.io.parsers import read_csv_metadata
#import sys
#sys.path.insert(0, '../debugblocker')
#import debugblocker as db
import py_entitymatching.debugblocker.debugblocker as db
from operator import itemgetter
from array import array

# Fixture tables for the blocker-debugging tests.
datasets_path = os.sep.join([get_install_path(), 'tests', 'test_datasets'])
catalog_datasets_path = os.sep.join([get_install_path(), 'tests',
                                     'test_datasets', 'catalog'])
debugblocker_datasets_path = os.sep.join([get_install_path(), 'tests',
                                          'test_datasets', 'debugblocker'])
path_a = os.sep.join([datasets_path, 'A.csv'])
path_b = os.sep.join([datasets_path, 'B.csv'])
path_c = os.sep.join([datasets_path, 'C.csv'])


class DebugblockerTestCases(unittest.TestCase):
    """Test cases for debugblocker."""

    def test_validate_types_1(self):
        # Load A, B, and candidate set C with candidate-set metadata so the
        # type validation below has keys and foreign keys to check.
        # NOTE(review): this method appears truncated in this view — the
        # validation call presumably follows in the full file.
        A = read_csv_metadata(path_a, key='ID')
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B,
                              fk_ltable='ltable_ID', fk_rtable='rtable_ID',
                              key = '_id')
        A_key = em.get_key(A)
        B_key = em.get_key(B)
# coding=utf-8 from __future__ import unicode_literals import os import unittest import pandas as pd from nose.tools import raises from py_entitymatching.io.parsers import read_csv_metadata, to_csv_metadata, _get_metadata_from_file from py_entitymatching.utils.generic_helper import get_install_path, del_files_in_dir, creat_dir_ifnot_exists import py_entitymatching.catalog.catalog_manager as cm datasets_path = os.sep.join([get_install_path(), 'tests', 'test_datasets']) io_datasets_path = os.sep.join([get_install_path(), 'tests', 'test_datasets', 'io']) path_a = os.sep.join([datasets_path, 'A.csv']) path_b = os.sep.join([datasets_path, 'B.csv']) path_c = os.sep.join([datasets_path, 'C.csv']) sndbx_path = os.sep.join([os.sep.join([get_install_path(), 'tests', 'test_datasets']), 'sandbox']) class ReadCSVMetadataTestCases(unittest.TestCase): def test_valid_path_wi_valid_metadata(self): cm.del_catalog() A = read_csv_metadata(path_a) pd_A = pd.read_csv(path_a) self.assertEqual(A.equals(pd_A), True) self.assertEqual(cm.get_key(A), 'ID') def test_valid_path_candset_wi_valid_metadata(self): cm.del_catalog() A = read_csv_metadata(path_a)
# coding=utf-8 from __future__ import unicode_literals import os import unittest import pandas as pd from nose.tools import raises from py_entitymatching.io.parsers import read_csv_metadata, to_csv_metadata, _get_metadata_from_file from py_entitymatching.utils.generic_helper import get_install_path, del_files_in_dir, creat_dir_ifnot_exists import py_entitymatching.catalog.catalog_manager as cm datasets_path = os.sep.join([get_install_path(), 'tests', 'test_datasets']) io_datasets_path = os.sep.join( [get_install_path(), 'tests', 'test_datasets', 'io']) path_a = os.sep.join([datasets_path, 'A.csv']) path_b = os.sep.join([datasets_path, 'B.csv']) path_c = os.sep.join([datasets_path, 'C.csv']) sndbx_path = os.sep.join( [os.sep.join([get_install_path(), 'tests', 'test_datasets']), 'sandbox']) class ReadCSVMetadataTestCases(unittest.TestCase): def test_valid_path_wi_valid_metadata(self): cm.del_catalog() A = read_csv_metadata(path_a) pd_A = pd.read_csv(path_a) self.assertEqual(A.equals(pd_A), True) self.assertEqual(cm.get_key(A), 'ID') def test_valid_path_candset_wi_valid_metadata(self): cm.del_catalog()
# Tests for the ML matcher wrappers (DT/LinReg/LogReg/NB/RF/SVM).
# NOTE(review): `os` and `unittest` are used below but not imported in this
# chunk — presumably imported earlier in the full file; confirm.
import six
from py_entitymatching.matcher.dtmatcher import DTMatcher
from py_entitymatching.matcher.linregmatcher import LinRegMatcher
from py_entitymatching.matcher.logregmatcher import LogRegMatcher
from py_entitymatching.matcher.nbmatcher import NBMatcher
from py_entitymatching.matcher.rfmatcher import RFMatcher
from py_entitymatching.matcher.svmmatcher import SVMMatcher
from py_entitymatching.io.parsers import read_csv_metadata
import py_entitymatching.matcher.matcherutils as mu
import py_entitymatching.catalog.catalog_manager as cm
from py_entitymatching.utils.generic_helper import get_install_path, list_diff

# General fixture tables plus matcher-selection fixtures (labels and
# precomputed feature vectors).
datasets_path = os.sep.join([get_install_path(), 'tests', 'test_datasets'])
path_a = os.sep.join([datasets_path, 'A.csv'])
path_b = os.sep.join([datasets_path, 'B.csv'])
path_c = os.sep.join([datasets_path, 'C.csv'])
feat_datasets_path = os.sep.join(
    [get_install_path(), 'tests', 'test_datasets', 'matcherselector'])
fpath_a = os.sep.join([feat_datasets_path, 'DBLP_demo.csv'])
fpath_b = os.sep.join([feat_datasets_path, 'ACM_demo.csv'])
fpath_c = os.sep.join([feat_datasets_path, 'dblp_acm_demo_labels.csv'])
fpath_f = os.sep.join([feat_datasets_path, 'feat_vecs.csv'])


class MLMatcherTestCases(unittest.TestCase):
    """Test cases for the ML matcher wrappers."""

    def test_valid_names_for_matchers(self):
        # NOTE(review): truncated in this view — the dict literal below is
        # cut off mid-expression.
        matchers1 = {
# Write the benchmarking functions here. # See "Writing benchmarks" in the asv docs for more information. import os import py_entitymatching as em from py_entitymatching.utils.generic_helper import get_install_path import sys if sys.version[0] == '2': reload(sys) sys.setdefaultencoding("utf-8") PATH = get_install_path() DATASET_PATH = os.sep.join([PATH, 'datasets', 'example_datasets']) class TimeDownSampleRestaurants: def setup(self): path_for_a = os.sep.join([DATASET_PATH, 'restaurants', 'A.csv']) path_for_b = os.sep.join([DATASET_PATH, 'restaurants', 'B.csv']) try: self.A = em.read_csv_metadata(path_for_a) self.B = em.read_csv_metadata(path_for_b) self.size = 500 self.y_param = 2 except AssertionError: print("Dataset \'restaurants\' not found. Please visit the project website to download the dataset.") raise SystemExit