def test_single_sorting_key(self):
    """SNI: every way of passing one sorting key gives the same pairs."""
    # Positional, ``on=`` and ``left_on=``/``right_on=`` spellings, with the
    # key given either as a string or as a one-element list.
    indexers = [
        recordlinkage.SortedNeighbourhoodIndex('var_arange'),
        recordlinkage.SortedNeighbourhoodIndex(on='var_arange'),
        recordlinkage.SortedNeighbourhoodIndex(
            left_on='var_arange', right_on='var_arange'),
        recordlinkage.SortedNeighbourhoodIndex(on=['var_arange']),
        recordlinkage.SortedNeighbourhoodIndex(
            left_on=['var_arange'], right_on=['var_arange']),
    ]

    results = [indexer.index((self.a, self.b)) for indexer in indexers]

    # The first result is the reference every other one must match.
    reference = results[0]
    for candidate in results[1:]:
        ptm.assert_index_equal(reference, candidate)
def time_sni_index(self):
    """Build sorted-neighbourhood pairs on ``given_name`` over ``self.A``."""
    # NOTE(review): other call sites pass ``window=``; confirm that ``w`` is
    # a keyword the indexer actually accepts.
    rl.SortedNeighbourhoodIndex(on='given_name', w=5).index(self.A)
def ScoreRecords():
    """Score candidate record pairs between ``train`` and ``test``.

    Reads the module-level ``train``, ``test`` and ``df``; rebinds the
    module-level ``df`` and ``features``. Writes the resulting feature
    vectors to ``feature_vectors.csv``.

    Returns:
        The feature frame, with a ``Total_Score`` column (row-wise sum of
        the comparison features) and a ``target`` column (1 when the total
        is at least 7, otherwise 0).
    """
    global features
    global df

    # Rotate the last column of ``df`` to the front.
    cols = df.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    df = df[cols]

    # Create candidate pairs.
    indexer = rl.SortedNeighbourhoodIndex(on='given_name', window=5)
    pairs = indexer.index(train, test)

    # Methodology for scoring.
    compare_cl = rl.Compare()
    compare_cl.exact('postcode', 'postcode', label='postcode')
    compare_cl.string('surname', 'surname', method='jaro',
                      threshold=.95, label='surname')
    compare_cl.string('given_name', 'given_name', method='jaro',
                      threshold=.95, label='name')
    compare_cl.string('date_of_birth', 'date_of_birth', method='jaro',
                      threshold=0.85, label='dob')
    compare_cl.string('suburb', 'suburb', method='jaro',
                      label='suburb', threshold=.85)
    compare_cl.string('state', 'state', label='state', method='jaro',
                      threshold=.85)
    compare_cl.string('address_1', 'address_1', method='jaro',
                      threshold=0.9, label='address_1')
    compare_cl.exact('rec_num', 'rec_num', label='rec_num')

    # Compute the feature vectors.
    features = compare_cl.compute(pairs, train, test)
    features["Total_Score"] = features.sum(axis=1)

    # ``fillna`` returns a new frame; the result has to be kept, otherwise
    # the call has no effect on what is written out below.
    features = features.fillna(0)

    features["target"] = [
        1 if score >= 7 else 0 for score in features["Total_Score"]
    ]

    features.to_csv('feature_vectors.csv', sep=",", encoding='utf-8')
    return features


ScoreRecords()
def test_sni_with_blocking_link(self):
    """SNI: blocking on the sorting key when linking two frames."""
    left = self.a
    # Trim the right-hand frame to the length of the left-hand one.
    right = self.b[0:len(left)]

    indexer = recordlinkage.SortedNeighbourhoodIndex(
        on='var_arange', window=3, block_on='var_arange')
    candidate_pairs = indexer.index((left, right))

    # One pair is expected per record of ``self.a``.
    self.assertEqual(len(candidate_pairs), len(left))
def test_sni_with_blocking_dedup(self):
    """SNI: blocking on the sorting key yields no pairs when deduplicating."""
    indexer = recordlinkage.SortedNeighbourhoodIndex(
        on='var_arange', window=3, block_on='var_arange')
    candidate_pairs = indexer.index(self.a)

    print(candidate_pairs.values)

    # No candidate pairs are expected.
    self.assertEqual(len(candidate_pairs), 0)
def test_sni_algorithm_dedup(self, window):
    """SNI: the number of dedup pairs follows from the window size."""
    # ``window`` is supplied by the caller (parameterized test).
    indexer = recordlinkage.SortedNeighbourhoodIndex(
        on='var_arange', window=window)
    found = indexer.index((self.a))

    # Expected count: the sum of n-1, n-2, ... with one term per step of
    # the half window on either side of a record.
    n_records = len(self.a)
    half_window = (window - 1) / 2
    per_offset = np.arange(
        n_records - 1, n_records - (half_window + 1), -1)

    self.assertEqual(len(found), np.sum(per_offset))
def test_sni_algorithm_link(self, window):
    """SNI: the number of link pairs follows from the window size."""
    # ``window`` is supplied by the caller (parameterized test).
    left = self.a
    right = self.b[0:len(left)]

    indexer = recordlinkage.SortedNeighbourhoodIndex(
        on='var_arange', window=window)
    found = indexer.index((left, right))

    # Expected count: one pair per record, plus twice the off-by-k pairs
    # (n-1, n-2, ...) for every k within the half window.
    n_records = len(left)
    half_window = (window - 1) / 2
    per_offset = np.arange(
        n_records - 1, n_records - (half_window + 1), -1)
    n_pairs_expected = len(left) + 2 * np.sum(per_offset)

    print('expected number of pairs: %s' % n_pairs_expected)
    print('number of pairs found: %s' % len(found))

    self.assertEqual(len(found), n_pairs_expected)
# Script: load the people records, build candidate pairs with a
# sorted-neighbourhood index, and start configuring the comparison step.
# Wall-clock time of the load and blocking stages is printed.
import recordlinkage as rl
from IOoperation import load_people_record, write_result, load_people_record_test, write_pair, get_result_list
import time
import os

# --- Load --------------------------------------------------------------
print("Load File Begins...")
load_start_time = time.time()
# dfA = load_people_record_test()
dfA = load_people_record()
load_end_time = time.time()
print("Load File Costs:%f s" % (load_end_time - load_start_time))

# --- Blocking ----------------------------------------------------------
# Deduplication-style call: a single frame is indexed against itself.
print("Blocking Begins...")
pcl = rl.SortedNeighbourhoodIndex(on=['EmplyeeID'], window=3)
pairs = pcl.index(dfA)
blocking_end_time = time.time()
# Measured from the end of loading, so it includes the print above.
print("Blocking Costs:%f s" % (blocking_end_time - load_end_time))

# --- Comparison --------------------------------------------------------
print("Comparison Begins...")
compare_helper = rl.Compare()
compare_helper.exact('SSN', 'SSN', label='SSN')
#compare_helper.string('EmplyeeID', 'EmplyeeID', threshold=0.85, label='EmplyeeID')
compare_helper.string('FNAME', 'FNAME', method='jarowinkler', threshold=0.85, label='FNAME')
def ScoreRecords():
    """Score record pairs drawn from two halves of ``train``.

    Reads the module-level ``train`` and ``df``; rebinds the module-level
    ``df`` and ``features``. Shows a histogram of the total scores and
    writes the feature vectors to ``feature_vectors.csv``.

    Returns:
        The feature frame, with a ``Total_Score`` column (row-wise sum of
        the comparison features) and a ``y`` column (1 when the total is
        at least 6, otherwise 0).
    """
    global features
    global df

    # Rotate the last column of ``df`` to the front.
    column_order = df.columns.tolist()
    df = df[column_order[-1:] + column_order[:-1]]

    # Create pairs between the two halves of the split.
    x_train, y_train = train_test_split(train, test_size=0.5)
    pairs = rl.SortedNeighbourhoodIndex(
        on='given_name', window=3).index(x_train, y_train)

    # (column, threshold, label) for every Jaro string comparison, in the
    # order they are registered.
    # NOTE(review): 'postcode' is registered twice with the same label;
    # kept as-is here -- confirm whether the duplicate is intended.
    jaro_specs = (
        ('postcode', 0.85, 'postcode'),
        ('surname', .95, 'surname'),
        ('given_name', .95, 'name'),
        ('date_of_birth', 0.85, 'dob'),
        ('postcode', 0.85, 'postcode'),
        ('suburb', .85, 'suburb'),
        ('state', .85, 'state'),
        ('address_1', 0.9, 'address_1'),
    )
    compare_cl = rl.Compare()
    for column, threshold, label in jaro_specs:
        compare_cl.string(column, column, method='jaro',
                          threshold=threshold, label=label)
    compare_cl.exact('rec_num', 'rec_num', label='rec_num')

    # Compute the feature vectors and their row totals.
    features = compare_cl.compute(pairs, x_train, y_train)
    features["Total_Score"] = features.sum(axis=1)

    # Look at the distribution of scores across record pairs.
    plt.hist(features['Total_Score'], bins=8)
    plt.title(r'Distribution of feature score totals for record pairs')
    plt.show()

    features["y"] = [
        1 if total >= 6 else 0 for total in features["Total_Score"]
    ]

    features.to_csv('feature_vectors.csv', sep=",", encoding='utf-8')
    return features
class TestIndexApi(TestData):
    """General unittest for the indexing API."""

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_repr(self, index_class):
        """str() and repr() agree and start with ``<ClassName``."""
        index_str = str(index_class)
        index_repr = repr(index_class)
        self.assertEqual(index_str, index_repr)

        start_str = '<{}'.format(index_class.__class__.__name__)
        self.assertTrue(index_str.startswith(start_str))

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_arguments(self, index_class):
        """Test the index method arguments"""

        # The following should work
        index_class.index(self.a)
        index_class.index(self.a, self.b)
        index_class.index((self.a))
        index_class.index([self.a])
        index_class.index((self.a, self.b))
        index_class.index([self.a, self.b])
        index_class.index(x=(self.a, self.b))

    def test_iterative(self):
        """Test the iterative behaviour."""

        # SINGLE STEP
        index_class = recordlinkage.FullIndex()
        pairs = index_class.index((self.a, self.b))
        pairs = pd.DataFrame(index=pairs).sort_index()

        # MULTI STEP
        index_class = recordlinkage.FullIndex()

        pairs1 = index_class.index((self.a[0:50], self.b))
        pairs2 = index_class.index((self.a[50:100], self.b))

        pairs_split = pairs1.append(pairs2)
        pairs_split = pd.DataFrame(index=pairs_split).sort_index()

        # Not possible to sort a MultiIndex, so made a frame out of it.
        ptm.assert_frame_equal(pairs, pairs_split)

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_empty_imput_dataframes(self, index_class):
        """Empty DataFrames"""

        # make an empty dataframe with the columns of self.a and self.b
        df_a = pd.DataFrame(columns=self.a.columns.tolist())
        df_b = pd.DataFrame(columns=self.b.columns.tolist())

        if not isinstance(index_class, recordlinkage.RandomIndex):
            # make an index
            pairs = index_class.index((df_a, df_b))

            # check if the MultiIndex has length 0
            self.assertIsInstance(pairs, pd.MultiIndex)
            self.assertEqual(len(pairs), 0)
        else:
            # RandomIndex is expected to reject empty input.
            with self.assertRaises(ValueError):
                index_class.index((df_a, df_b))

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_error_handling(self, index_class):
        """Test error handling on non-unique index."""

        # make a non_unique index
        df_a = self.a.rename(
            index={self.a.index[1]: self.a.index[0]}, inplace=False)

        with self.assertRaises(ValueError):
            index_class.index(df_a)

    # NOTE(review): the parameter lists below repeat the contents of
    # TEST_INDEXATION_OBJECTS; presumably they are separate instances
    # because some of these tests mutate ``index_class.suffixes`` --
    # confirm before sharing them.
    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=False))
    ])
    def test_index_names_dedup(self, index_class):
        """Pair level names are derived from the frame's index name."""

        index_names = ['dedup', None, 'index', int(1)]
        expected = [
            ['dedup_1', 'dedup_2'],
            [None, None],
            ['index_1', 'index_2'],
            ['1_1', '1_2'],
        ]

        for name, expected_names in zip(index_names, expected):

            index_A = pd.Index(self.a.index).rename(name)
            df_A = pd.DataFrame(self.a, index=index_A)

            pairs = index_class.index((df_A))

            self.assertEqual(pairs.names, expected_names)
            # the input frame must not be renamed in place
            self.assertEqual(df_A.index.name, name)

    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=False))
    ])
    def test_duplicated_index_names_dedup(self, index_class):
        """Dedup pair names get suffixes, default and custom."""

        # make an index for each dataframe with a new index name
        index_a = pd.Index(self.a.index, name='index')
        df_a = pd.DataFrame(self.a, index=index_a)

        # make the index
        pairs = index_class.index(df_a)
        self.assertEqual(pairs.names, ['index_1', 'index_2'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')

        # make the index
        index_class.suffixes = ['_a', '_b']
        pairs = index_class.index(df_a)
        self.assertEqual(pairs.names, ['index_a', 'index_b'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')

    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=False))
    ])
    def test_index_names_link(self, index_class):
        """Link pair names mirror the two input index names."""

        # tuples with the name of the first and second index
        index_names = [
            ('index1', 'index2'),
            ('index1', None),
            (None, 'index2'),
            (None, None),
            (10, 'index2'),
            (10, 11)
        ]

        for name_a, name_b in index_names:

            # make an index for each dataframe with a new index name
            index_a = pd.Index(self.a.index, name=name_a)
            df_a = pd.DataFrame(self.a, index=index_a)

            index_b = pd.Index(self.b.index, name=name_b)
            df_b = pd.DataFrame(self.b, index=index_b)

            pairs = index_class.index((df_a, df_b))
            self.assertEqual(pairs.names, [name_a, name_b])

            # check for inplace editing (not the intention)
            self.assertEqual(df_a.index.name, name_a)
            self.assertEqual(df_b.index.name, name_b)

    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=False))
    ])
    def test_duplicated_index_names_link(self, index_class):
        """Identically named link indexes get suffixes, default and custom."""

        # make an index for each dataframe with a new index name
        index_a = pd.Index(self.a.index, name='index')
        df_a = pd.DataFrame(self.a, index=index_a)

        index_b = pd.Index(self.b.index, name='index')
        df_b = pd.DataFrame(self.b, index=index_b)

        # make the index
        pairs = index_class.index((df_a, df_b))
        self.assertEqual(pairs.names, ['index_1', 'index_2'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')
        self.assertEqual(df_b.index.name, 'index')

        # make the index
        index_class.suffixes = ['_a', '_b']
        pairs = index_class.index((df_a, df_b))
        self.assertEqual(pairs.names, ['index_a', 'index_b'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')
        self.assertEqual(df_b.index.name, 'index')

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_pickle(self, index_class):
        """Test if it is possible to pickle the class."""

        pickle_path = os.path.join(self.test_dir, 'pickle_compare_obj.pickle')

        # pickle before indexing; ``with`` closes the handle so nothing is
        # left open on the file while it is rewritten below
        with open(pickle_path, 'wb') as pickle_file:
            pickle.dump(index_class, pickle_file)

        # compute the record pairs
        index_class.index(self.a, self.b)

        # pickle after indexing
        with open(pickle_path, 'wb') as pickle_file:
            pickle.dump(index_class, pickle_file)
name = name.strip() auth.append(name) for Investigator in investigators: if Investigator in auth and Investigator != "": print("Investigator " , Investigator , " present in Author List") indexer = recordlinkage.SortedNeighbourhoodIndex(on="Title") pairs = indexer.index(pd,ct) compare_cl = recordlinkage.Compare() compare_cl.string('Title', 'Title', method='levenshtein',label="Levenshtein - Method")
import unittest import tempfile import shutil import pickle import numpy as np import pandas as pd import pandas.util.testing as ptm from parameterized import parameterized, param import recordlinkage TEST_INDEXATION_OBJECTS = [ param(recordlinkage.FullIndex()), param(recordlinkage.BlockIndex(on='var_arange')), param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')), param(recordlinkage.RandomIndex(10, random_state=100, replace=True)), param(recordlinkage.RandomIndex(10, random_state=100, replace=False)), ] class TestData(unittest.TestCase): """Unittest object to setup test data.""" @classmethod def setUpClass(cls): n_a = 100 n_b = 150 cls.index_a = ['rec_a_%s' % i for i in range(0, n_a)] cls.index_b = ['rec_b_%s' % i for i in range(0, n_b)]
import pandas as pd
import recordlinkage as rl
import labutils as lu

# Toy data: each frame has a single 'names' column.
names_1 = ['alfred', 'bob', 'calvin', 'hobbes', 'rusty']
names_2 = ['alfred', 'danny', 'callum', 'hobie', 'rusty']

df_a = pd.DataFrame({'names': names_1})
df_b = pd.DataFrame({'names': names_2})
df_c = pd.DataFrame(
    {'names': ['alexander', 'bob', 'bruce', 'bruce', 'alexander']})
df_d = pd.DataFrame(
    {'names': ['alice', 'beth', 'amy', 'beth', 'brittany']})

# Candidate links between df_c and df_d, sorted-neighbourhood on 'names'.
indexer = rl.SortedNeighbourhoodIndex(on='names', window=3)
candidate_links = indexer.index(df_c, df_d)

lu.clip_df(candidate_links.to_frame())
fieldnames.append(cfg.name) dfFile1 = tools.load_file_as_df(config.common.filename_1, fieldnames) dfFile2 = tools.load_file_as_df(config.common.filename_2, fieldnames) perfect_match_index = tools.load_perfect_match_as_index( config.common.filename_perfect_match) # for each config item for index, config_item in enumerate(config.items): # init Random with a fixes seed (for reproducibility) tools.init_random_with_seed() print("Indexing") if config_item.index_type == "sorted_neighbourhood": indexer = rl.SortedNeighbourhoodIndex( config_item.index_field_name, window=config_item.sorted_neighborhood_window) elif config_item.index_type == "block": indexer = rl.BlockIndex(config_item.index_field_name) elif config_item.index_type == "canopy": indexer = tools.CanopyClusterIndex( config_item.index_field_name, threshold_add=config_item.canopy_threshold_add, threshold_remove=config_item.canopy_threshold_remove) elif config_item.index_type == "full": indexer = tools.FullIndex(config_item.index_field_name) else: raise ValueError( "index_type {0} is invalid: must be sorted_neighbourhood, block, canopy or full" .format(config_item.index_type))
import sys
import csv
import numpy as np
from matplotlib import pyplot
#reload(sys)
#sys.setdefaultencoding('utf8')

# Load the two data sets to be linked.
# NOTE(review): ``pd`` here is a DataFrame, not the usual pandas alias;
# ``pandas`` and ``recordlinkage`` are presumably imported above this chunk.
#pd , ct = pandas.DataFrame.from_csv("ArticleData.csv") , pandas.DataFrame.from_csv("ClinicalTrialsData.csv")
pd = pandas.read_csv("ArticleData.csv")
ct = pandas.read_csv("ClinicalTrialsData.csv")
#print pd

# Candidate pairs: sorted neighbourhood on the 'DateRevised' column.
indexer = recordlinkage.SortedNeighbourhoodIndex(on="DateRevised")
#indexer = recordlinkage.FullIndex()
pairs = indexer.index(pd, ct)
#print (len(ct),len(pd),len(pairs))

# Compare titles with two string-similarity methods, one feature each.
compare_cl = recordlinkage.Compare()
compare_cl.string('Title', 'Title', method='levenshtein', label="Levenshtein - Method")
compare_cl.string('Title', 'Title', method='jarowinkler', label="Jarowinkler - Method")
def collect_identical_rows_alg(self, schema_id, table_name, sorting_key,
                               fixed_column_names, var_column_names, alg):
    """Find clusters of near-duplicate rows in a table and store them.

    Exact duplicates are removed first; the remaining rows are paired with
    a sorted-neighbourhood index on ``sorting_key``, compared column by
    column, classified with k-means, and the matches are grouped and
    written via ``self.create_duplicate_table``.

    Args:
        schema_id: Identifier used to build the schema name
            (``'schema-<id>'``).
        table_name: Name of the table to deduplicate.
        sorting_key: Column used for the sorted-neighbourhood index; it is
            always compared exactly as well.
        fixed_column_names: Columns that must match exactly. The caller's
            list is not modified.
        var_column_names: Columns compared with a similarity measure
            chosen by dtype (numeric, date or string).
        alg: String-comparison method passed to ``Compare.string``.

    Returns:
        ``False`` when no matches are predicted, ``True`` once the
        duplicate table has been created.

    Raises:
        Exception: Any error is logged and re-raised unchanged.
    """
    schema_name = 'schema-' + str(schema_id)
    dedup_table_name = '_dedup_' + table_name + "_grouped"

    # TODO When user selects rows to remove, collect in table.
    # Afterwards when finished selecting rows of all clusters, delete
    # those rows (UNDO)

    try:
        # Remove complete duplicates before full dedup
        self.remove_identical_rows(schema_id, table_name)

        # NOTE(review): ``_ci`` presumably quotes the identifiers --
        # confirm, since they are formatted straight into the query.
        data_query = 'SELECT * FROM {}.{}'.format(
            *_ci(schema_name, table_name))
        df = pd.read_sql(data_query, con=db.engine)
        df = df.set_index('id')

        # Work on a copy so the caller's list is not extended in place.
        exact_columns = list(fixed_column_names)
        if sorting_key not in exact_columns:
            exact_columns.append(sorting_key)

        # Column groups by dtype; only used for membership tests below.
        string_columns = list(df.select_dtypes(include=['object']).columns)
        numerical_columns = list(
            df.select_dtypes(include=['int64', 'float64']).columns)
        date_columns = list(
            df.select_dtypes(include=['datetime64[ns]']).columns)

        # Clean string values
        for column_name in string_columns:
            df[column_name] = clean(df[column_name])

        # Indexation step
        indexer = recordlinkage.SortedNeighbourhoodIndex(
            on=sorting_key, window=3)
        pairs = indexer.index(df)

        # Comparison step
        compare_cl = recordlinkage.Compare()

        # Exact matches
        for column_name in exact_columns:
            compare_cl.exact(column_name, column_name, label=column_name)

        # Variable matches calculated using an alg
        # (levenshtein / numerical / date)
        for column_name in var_column_names:
            if column_name in numerical_columns:
                compare_cl.numeric(column_name, column_name,
                                   method='linear', offset=10, scale=10)
            elif column_name in date_columns:
                compare_cl.date(column_name, column_name)
            elif column_name in string_columns:
                compare_cl.string(column_name, column_name, method=alg,
                                  threshold=0.75, label=column_name)

        potential_pairs = compare_cl.compute(pairs, df)

        # Classification step
        kmeans = recordlinkage.KMeansClassifier()
        kmeans.learn(potential_pairs)
        matches = kmeans.predict(potential_pairs)

        if len(matches) == 0:
            return False

        # Grouping step: group matches (A,B), (B,C) into (A,B,C)
        groups = self.group_matches(matches)

        # TODO Create table _dedup_table_groups
        self.create_duplicate_table(schema_id, table_name, groups)

        return True

    except Exception as e:
        app.logger.error(
            "[ERROR] Unable to generate clusters of duplicate rows from table '{}'"
            .format(dedup_table_name))
        app.logger.exception(e)
        # Bare ``raise`` re-raises the active exception as-is.
        raise
def run_experiment(win_len, preproc, comparison_variant, run_only=None):
    """Run one indexing / comparison / classification experiment.

    Uses the module-level train/test frames (``dataDBLP_*``,
    ``dataScholar_*``), ``links_train`` and the ``debug`` flag.

    Args:
        win_len: ``0`` selects a block index on ``'year'``; a positive
            value selects a sorted-neighbourhood index with that window.
        preproc: Preprocessing variant, ``0``..``5``; selects the suffix
            appended to the compared field names.
        comparison_variant: ``0`` for exact comparison, ``1``..``7`` for a
            string-similarity method.
        run_only: When given, only the classifier with this short name
            (``'logreg'``, ``'bayes'``, ``'svm'``, ``'kmeans'``, ``'ecm'``)
            is run.

    Returns:
        A list with one tuple per classifier run:
        ``(index_description, preproc_description, comp_description,
        classifier_description, match, compare_ms, train_ms,
        classify_ms)``.

    Raises:
        ValueError: For a negative ``win_len`` or an unknown ``preproc`` /
            ``comparison_variant``.
    """
    # --- indexing -----------------------------------------------------
    if win_len == 0:
        index_description = "block"
        indexer = recordlinkage.BlockIndex('year')
    elif win_len > 0:
        index_description = f"nb{win_len}"
        indexer = recordlinkage.SortedNeighbourhoodIndex(
            'year', window=win_len)
    else:
        raise ValueError(f"Invalid window length {win_len}")

    pairs_train = indexer.index(dataDBLP_train, dataScholar_train)
    pairs_test = indexer.index(dataDBLP_test, dataScholar_test)

    # NOTE(review): nesting reconstructed from whitespace-collapsed
    # source; confirm both prints belong under ``if debug``.
    if debug:
        print(f"Number of candidates (index={index_description}):")
        print(f"{len(pairs_train)} (train), {len(pairs_test)} (test)")

    # --- preprocessing ------------------------------------------------
    # variant -> (message, field suffix, short description)
    preproc_variants = {
        0: ("No preprocesing", "", "none"),
        1: ("Cleaned fields", "_clean", "clean"),
        2: ("Soundex encoding", "_soundex", "soundex"),
        3: ("Nysiis encoding", "_nysiis", "nysiis"),
        4: ("Metaphone encoding", "_metaphone", "metaphone"),
        5: ("Match-rating encoding", "_match_rating", "match_rating"),
    }
    try:
        message, field_suffix, preproc_description = \
            preproc_variants[preproc]
    except (KeyError, TypeError):
        raise ValueError(
            f"Unknown preprocessing variant {preproc}") from None
    print(message)
    print(f"Preprocessing used: {preproc_description}")

    # --- comparator ---------------------------------------------------
    # variant -> description; for 1..7 the description is also the
    # ``method`` handed to ``compare.String``.
    comparison_variants = {
        0: "exact",
        1: "levenshtein",
        2: "damerau_levenshtein",
        3: "jaro",
        4: "jarowinkler",
        5: "qgram",
        6: "cosine",
        7: "smith_waterman",
    }
    try:
        comp_description = comparison_variants[comparison_variant]
    except (KeyError, TypeError):
        raise ValueError(
            f"Unknown comparison variant {comparison_variant}") from None

    comp = recordlinkage.Compare()
    for base_name in ('title', 'authors', 'venue'):
        field = base_name + field_suffix
        if comp_description == "exact":
            comp.add(compare.Exact(field, field))
        else:
            comp.add(compare.String(field, field, method=comp_description))
    print(f"String comparison: {comp_description}")

    print("Start compare for training data set")
    start = time.time()
    result_train = comp.compute(pairs_train, dataDBLP_train,
                                dataScholar_train)
    print("Compare on training data took %.2fs" % (time.time() - start))

    print("Start compare for test data set")
    start = time.time()
    result_test = comp.compute(pairs_test, dataDBLP_test, dataScholar_test)
    # save time compare for evaluation
    time_compare = time.time() - start
    print("Compare on test data took %.2fs" % (time_compare))

    # --- classification -----------------------------------------------
    # (short name, message, factory, supervised). Factories are lambdas so
    # a classifier is only looked up and built when it is actually run.
    classifier_variants = (
        ('logreg', "Logistic Regression classifier",
         lambda: recordlinkage.LogisticRegressionClassifier(), True),
        ('bayes', "Naive Bayes classifier",
         lambda: recordlinkage.NaiveBayesClassifier(binarize=0.75), True),
        ('svm', "Support Vector Machine classifier",
         lambda: recordlinkage.SVMClassifier(), True),
        ('kmeans', "KMeans classifier",
         lambda: recordlinkage.KMeansClassifier(), False),
        ('ecm', "ECM classifier",
         lambda: recordlinkage.ECMClassifier(binarize=0.75), False),
    )

    matches = []
    for (classifier_description, message, make_classifier,
         supervised) in classifier_variants:
        # skip others if only one classifier is requested
        if run_only is not None and run_only != classifier_description:
            continue

        print(message)
        classifier = make_classifier()

        if supervised:
            start = time.time()
            classifier.fit(result_train, links_train)
            time_train = time.time() - start

            start = time.time()
            match = classifier.predict(result_test)
            time_classify = time.time() - start
        else:
            # Unsupervised: fitting and predicting happen in one call on
            # the test features, so no separate training time exists.
            start = time.time()
            match = classifier.fit_predict(result_test)
            time_classify = time.time() - start
            time_train = 0

        # Times are stored in milliseconds.
        matches.append(
            (index_description, preproc_description, comp_description,
             classifier_description, match, 1000 * time_compare,
             1000 * time_train, 1000 * time_classify))

        # NOTE(review): nesting reconstructed from whitespace-collapsed
        # source; confirm the evaluation call belongs under ``if debug``.
        if debug:
            print("%d matches" % len(match))
            print_experiment_evaluation(
                match,
                "-".join((index_description, preproc_description,
                          comp_description)))

    return matches