def __init__(self, data, new_data, name, n_sample=50000, bin_size=500,
             block=None, method='jarowinkler', threshold=0.93):
    """Prepare the sample, the indexer and the string comparer.

    Parameters
    ----------
    data :
        Object providing ``.sample(n=, replace=, random_state=)``; the
        working sample is drawn from it (presumably a pandas DataFrame --
        confirm against callers).
    new_data :
        Anything accepted by ``pd.DataFrame``; stored as a DataFrame.
        Whether that frame is empty decides how the sample is partitioned.
    name :
        Column compared on both sides; also used as the feature label.
    n_sample :
        Number of rows drawn from ``data`` without replacement.
    bin_size :
        Divisor used to derive the number of sections the sample is split
        into (``round(n_sample / bin_size, 0)``).
    block :
        Passed as ``on=`` to ``recordlinkage.BlockIndex``. ``None`` selects
        ``recordlinkage.FullIndex`` instead.
    method :
        String comparison method forwarded to ``Compare.string``.
    threshold :
        Threshold forwarded to ``Compare.string``.
    """
    super(Linkage, self).__init__()

    self.data = data
    self.new_data = pd.DataFrame(new_data)
    self.n_sample = n_sample
    self.bin_size = bin_size
    self.name = name
    self.block = block
    self.method = method
    self.threshold = threshold

    # Fixed random_state so the same rows are drawn on every run.
    self.sample = data.sample(n=self.n_sample, replace=False, random_state=0)

    # Identity test instead of ``!= None``: ``!=`` dispatches to the
    # object's ``__ne__`` and is ambiguous for array-like ``block`` values.
    if self.block is not None:
        self.indexer = recordlinkage.BlockIndex(on=self.block)
    else:
        self.indexer = recordlinkage.FullIndex()

    self.compare_cl = recordlinkage.Compare(n_jobs=4)
    self.compare_cl.string(self.name, self.name, method=self.method,
                           threshold=self.threshold, label=self.name)

    # Same expression as before, evaluated once for both branches.
    n_sections = round(self.n_sample / self.bin_size, 0)

    if self.new_data.empty:
        # No new data: every unordered pair of sample sections.
        self.List = list(itertools.combinations(
            np.split(self.sample, indices_or_sections=n_sections), 2))
    else:
        # NOTE(review): this branch stores to ``Linst`` while the branch
        # above stores to ``List``. Looks like a typo, but the attribute
        # name is kept because code outside this view may read it -- confirm.
        self.Linst = list(
            np.split(self.sample, indices_or_sections=n_sections))

    self.results_df = pd.DataFrame(columns=['pairs', 'company_1', 'company_2'])
    self.results_tmp = None
def time_full_index(self):
    """Build the full pair index over ``self.A``; the result is discarded."""
    # Create the indexer and generate the pairs in a single expression.
    rl.FullIndex().index(self.A)
def test_iterative(self):
    """Test the iterative behaviour."""

    def as_sorted_frame(multi_index):
        # Wrap the MultiIndex in an empty frame so it can be sorted and
        # compared with assert_frame_equal.
        return pd.DataFrame(index=multi_index).sort_index()

    # Single step: index all of a against b at once.
    expected = as_sorted_frame(
        recordlinkage.FullIndex().index((self.a, self.b)))

    # Multi step: one indexer instance, a[0:50] and a[50:100] separately.
    stepwise_indexer = recordlinkage.FullIndex()
    first_part = stepwise_indexer.index((self.a[0:50], self.b))
    second_part = stepwise_indexer.index((self.a[50:100], self.b))
    combined = as_sorted_frame(first_part.append(second_part))

    ptm.assert_frame_equal(expected, combined)
def test_basic_link(self):
    """FULL: Test basic characteristics of full indexing (link)."""
    candidate_pairs = recordlinkage.FullIndex().index((self.a, self.b))

    # The result is a MultiIndex ...
    self.assertIsInstance(candidate_pairs, pd.MultiIndex)

    # ... containing one pair per combination of a record in a and in b ...
    expected_size = len(self.a) * len(self.b)
    self.assertEqual(len(candidate_pairs), expected_size)

    # ... with no pair listed twice.
    self.assertTrue(candidate_pairs.is_unique)
def setup(self):
    """Load the two data sets and keep the first 50,000 candidate pairs.

    Sets ``self.A`` and ``self.B`` from ``load_febrl4()``, casts their
    ``postcode`` columns to float, and stores the leading slice of the
    full index between them in ``self.pairs``.
    """
    # download data
    self.A, self.B = load_febrl4()

    # Make 'postcode' numeric (float) on both sides.
    self.A['postcode'] = self.A['postcode'].astype(float)
    self.B['postcode'] = self.B['postcode'].astype(float)

    # make pairs
    c_pairs = rl.FullIndex()
    # Slice bounds must be integers: the former float literal ``5e4`` is
    # rejected with a TypeError by current NumPy/pandas.
    self.pairs = c_pairs.index(self.A, self.B)[0:50000]
def setup(self):
    """Load the data set and prepare candidate-pair slices of four sizes.

    Sets ``self.A`` from ``load_febrl1()``, builds the full index over it,
    and stores leading slices of at most 5,000 / 50,000 / 500,000 /
    5,000,000 pairs in ``self.pairs_xsmall`` .. ``self.pairs_large``.
    """
    # download data
    self.A = load_febrl1()

    # make pairs
    c_pairs = rl.FullIndex()
    pairs = c_pairs.index(self.A)

    # Different sizes of pairs. Slice bounds must be integers: the former
    # float literals (5e3, 5e4, ...) are rejected with a TypeError by
    # current NumPy/pandas.
    self.pairs_xsmall = pairs[0:5000]
    self.pairs_small = pairs[0:50000]
    self.pairs_medium = pairs[0:500000]
    self.pairs_large = pairs[0:5000000]
def record_link_schools():
    """
    Link the retention-rates dataframe to the critical-mass dataframe.

    The record linkage is conducted on the name of the school: every
    retention record is paired with every critical-mass record, the
    'school' columns are compared with the qgram method, and only pairs
    whose score reaches the classification threshold are kept.

    Input: None

    Output:
        - link: a tuple containing the indices of the retention dataframe
          and the critical mass dataframe; AND the best matches qgram
          scores
    """
    critical_mass_df = calculate_critical_mass_var()
    retention_df = import_cleaned_retention(RETENTION_CLEAN)

    score_column = 'school_name_score'
    qgram_threshold = 0.85   # threshold handed to the qgram comparison
    accept_threshold = 1.0   # minimum score for a pair to be kept

    # Configure the comparison on the school name.
    comparer = rl.Compare()
    comparer.string('school', 'school', method='qgram',
                    threshold=qgram_threshold, label=score_column)

    # No blocking available, so pair every record with every record.
    candidate_pairs = rl.FullIndex().index(retention_df, critical_mass_df)

    # Score the pairs and keep the ones that reach the threshold.
    scores = comparer.compute(candidate_pairs, retention_df, critical_mass_df)
    accepted = scores[score_column] >= accept_threshold
    best_matches = scores[accepted]

    return (best_matches.index.values, best_matches)
def match(self, df:pd.DataFrame, key='all') -> tuple:
    """Match the records of ``df`` against each other.

    Builds the full pair index over ``df``, runs the contraster on the
    pairs, compacts the contrasts with ``operation='mean'``, writes them to
    the match cache, and passes them to ``cluster.generate_matched_ids``.

    Parameters
    ----------
    df :
        Records to match.
    key :
        Used as the last path component of the contrasts cache file and,
        as a string, as ``block_name`` for the clustering step.

    Returns
    -------
    tuple
        ``(matches, metadata)``: ``matches`` is whatever
        ``cluster.generate_matched_ids`` returns (presumably a DataFrame,
        as the previous annotation claimed -- confirm), and ``metadata``
        is a dict with the keys ``size``, ``n_pairs``,
        ``contraster_metadata`` and ``scores``.
    """
    metadata = {
        'size': len(df)
    }

    logger.debug('Indexing the data for matching!')
    indexer = rl.FullIndex()
    pairs = indexer.index(df)
    metadata['n_pairs'] = len(pairs)
    logger.debug(f"Number of pairs: {metadata['n_pairs']}")

    logger.debug("Initializing contrasting")
    contraster_obj = contraster.Contraster(self.contrast_rules)
    contrasts = contraster_obj.run(pairs, df)
    metadata['contraster_metadata'] = contraster_obj.metadata
    logger.debug("Contrasts created")

    # Name the two index levels before compacting.
    contrasts.index.rename(['matcher_index_left', 'matcher_index_right'], inplace=True)
    contrasts = rules.compactify(contrasts, operation='mean')

    logger.debug('Summary distances generated. Making you some stats about them.')
    metadata['scores'] = utils.summarize_column(contrasts.matches)

    logger.debug('Caching those contrasts and distances for you.')
    ioutils.write_dataframe(
        contrasts.reset_index(),
        filepath=f'{self.base_data_directory}/match_cache/contrasts/{self.match_job_id}/{key}'
    )

    # Diagnostics on duplicated index entries in the contrasts.
    logger.debug(f"Contrasts dataframe size: {contrasts.shape}")
    logger.debug(f"Contrasts data without duplicated indexes: {contrasts[~contrasts.index.duplicated(keep='first')].shape}")
    logger.debug("Duplicated keys:")
    logger.debug(f"{contrasts[contrasts.index.duplicated(keep=False)]}")

    matches = cluster.generate_matched_ids(
        distances=contrasts,
        DF=df,
        clustering_params=self.clustering_rules,
        base_data_directory=self.base_data_directory,  # at some point, we may want to consider making the matcher into a class
        match_job_id=self.match_job_id,                # rather than passing around keys, match_job_ids, base_data_directorys, etc.
        block_name=str(key)
    )

    return matches, metadata
import os import unittest import tempfile import shutil import pickle import numpy as np import pandas as pd import pandas.util.testing as ptm from parameterized import parameterized, param import recordlinkage TEST_INDEXATION_OBJECTS = [ param(recordlinkage.FullIndex()), param(recordlinkage.BlockIndex(on='var_arange')), param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')), param(recordlinkage.RandomIndex(10, random_state=100, replace=True)), param(recordlinkage.RandomIndex(10, random_state=100, replace=False)), ] class TestData(unittest.TestCase): """Unittest object to setup test data.""" @classmethod def setUpClass(cls): n_a = 100 n_b = 150
# This step works on exactly one field; its name is taken from the config.
assert (len(config.common.fields) == 1), "Only one Field is allowed for fields"
fieldname = config.common.fields[0].name

# NOTE(review): presumably ensure_directories creates the parent directories
# of the given path, hence the throw-away "dummy" file name -- confirm.
tools.ensure_directories(config.common.result_base_dir + "dummy")

# init Random with a fixed seed (for reproducibility)
tools.init_random_with_seed()

# load files
print("Loading files")
df_1 = tools.load_file_as_df(config.common.filename_1, [fieldname])
df_2 = tools.load_file_as_df(config.common.filename_2, [fieldname])
idx_match = tools.load_perfect_match_as_index(config.common.filename_perfect_match)

# build a full index without the matches
idx_full = rl.FullIndex().index(df_1, df_2)
# Non-matching pairs only; the second argument asks for ten times the number
# of known matches (presumably the sample size -- confirm in sample_index).
idx_distinct = sample_index(idx_full.difference(idx_match), len(idx_match) * 10)

# run compare
print("Compare matches")
df_match = run_compare(fieldname, df_1, df_2, idx_match)
print("Compare distincts")
df_distinct = run_compare(fieldname, df_1, df_2, idx_distinct)

# save result
save_result(df_match, 'cm_matches.csv')
save_result(df_distinct, 'cm_distinct.csv')
save_binned_result(df_match, df_distinct, 25, 'cm_bin_{0}.csv')

# start_time is set outside this excerpt.
print('Time elapsed (hh:mm:ss.ms) {}'.format(datetime.now() - start_time))
def link_datasets(yelp_results, dj_df, df_type="wages"):
    """
    (Assisted by Record Linkage Toolkit library and documentation)

    Compare the Yelp query results to database results and return the
    best matches based on the computed comparison scores. Depending on
    the table type, the comparison uses the zip code, business name and
    address strings (qgram), and always the latitude/longitude.

    Inputs:
        - yelp_results: a pandas dataframe of yelp business results
          based on a user's input
        - dj_df: a pandas dataframe of django results.
          Ex. labour statistics, healthcode violations, Divvy, etc.
        - df_type: a string naming which specific dataframe is being
          compared to the Yelp results ("wages", "food", "enviro", or
          anything else for a coordinates-only comparison)

    Outputs:
        - link: a tuple containing the indices of the Yelp query results
          dataframe and the database dataframe AND the best matches
          scores
    """
    # thresholds handed to the qgram string comparisons
    name_thresh = 0.55
    addr_thresh = 0.55
    strong_addr_thresh = 0.90

    # thresholds applied to the computed scores when filtering
    exact_score = 1.0
    coord_min_score = 0.99

    comparer = rl.Compare()
    is_wages_or_food = df_type in ("wages", "food")
    is_enviro = df_type == "enviro"

    if is_wages_or_food:
        # Labour & Food data: block on zip code, then compare zip,
        # business name and address.
        indexer = rl.BlockIndex(on='zip_code')
        comparer.numeric('zip_code', 'zip_code', method='linear',
                         scale=30.0, label='zip_score')
        comparer.string('name', 'name', method='qgram',
                        threshold=name_thresh, label='name_score')
        comparer.string('addr', 'addr', method='qgram',
                        threshold=addr_thresh, label='addr_score')
    elif is_enviro:
        # Environmental data: no blocking available, address only.
        indexer = rl.FullIndex()
        comparer.string('addr', 'addr', method='qgram',
                        threshold=strong_addr_thresh, label='addr_score')
    else:
        # Every other data set: coordinates only.
        indexer = rl.FullIndex()

    candidate_pairs = indexer.index(yelp_results, dj_df)

    # ALL data sets are additionally compared on latitude and longitude.
    comparer.geo('latitude', 'longitude', 'latitude', 'longitude',
                 method='linear', scale=30.0, label='coord_score')

    features = comparer.compute(candidate_pairs, yelp_results, dj_df)

    # Classification and final filtering.
    coord_ok = features['coord_score'] >= coord_min_score
    if is_wages_or_food:
        keep = ((features['zip_score'] == exact_score) &
                (features['name_score'] == exact_score) &
                (features['addr_score'] == exact_score) &
                coord_ok)
    elif is_enviro:
        keep = (features['addr_score'] == exact_score) & coord_ok
    else:
        keep = coord_ok
    best_matches = features[keep]

    # Pair the raw index values with the filtered score frame.
    return (best_matches.index.values, best_matches)
# De-duplication walkthrough: load a CSV, build candidate pairs with a full
# index and with a blocked index, then configure the field comparisons.
import recordlinkage
import pandas as pd

# NOTE(review): absolute, machine-specific path -- adjust before running.
path = "C:/Users/DELL/Desktop/Data-de-duplication/Deduplication Problem - Sample Dataset.csv"
df = pd.read_csv(path)
print(df)

# Full index: every record paired with every other record of the same frame.
indexer = recordlinkage.FullIndex()
pairs = indexer.index(df)
print(len(df), len(pairs))
print(pairs)

# Blocked index: only records agreeing on both 'dob' and 'gn' are paired.
indexer = recordlinkage.BlockIndex(on=['dob', 'gn'])
pairs = indexer.index(df)
print(len(pairs))
# For reference, a full index over 1000 records would give
# (1000*1000-1000)/2 = 499500 pairs.

# This cell can take some time to compute.
compare_cl = recordlinkage.Compare()
compare_cl.string('ln', 'ln', label='last_name')
compare_cl.string('fn', 'fn', method='jarowinkler', threshold=0.85, label='first_name')
compare_cl.exact('dob', 'dob', label='date_of_birth')
compare_cl.exact('gn', 'gn', label='gender')
class TestIndexApi(TestData):
    """General unittest for the indexing API."""

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_repr(self, index_class):
        # str() and repr() must agree and start with '<ClassName'.
        index_str = str(index_class)
        index_repr = repr(index_class)
        self.assertEqual(index_str, index_repr)

        start_str = '<{}'.format(index_class.__class__.__name__)
        self.assertTrue(index_str.startswith(start_str))

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_arguments(self, index_class):
        """Test the index method arguments"""

        # The following should work
        index_class.index(self.a)
        index_class.index(self.a, self.b)
        index_class.index((self.a))
        index_class.index([self.a])
        index_class.index((self.a, self.b))
        index_class.index([self.a, self.b])
        index_class.index(x=(self.a, self.b))

    def test_iterative(self):
        """Test the iterative behaviour."""

        # SINGLE STEP
        index_class = recordlinkage.FullIndex()
        pairs = index_class.index((self.a, self.b))
        pairs = pd.DataFrame(index=pairs).sort_index()

        # MULTI STEP
        index_class = recordlinkage.FullIndex()

        pairs1 = index_class.index((self.a[0:50], self.b))
        pairs2 = index_class.index((self.a[50:100], self.b))

        pairs_split = pairs1.append(pairs2)
        pairs_split = pd.DataFrame(index=pairs_split).sort_index()

        ptm.assert_frame_equal(pairs, pairs_split)
        # not possible to sort MultiIndex, so made a frame out of it.

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_empty_imput_dataframes(self, index_class):
        """Empty DataFrames"""

        # make an empty dataframe with the columns of self.a and self.b
        df_a = pd.DataFrame(columns=self.a.columns.tolist())
        df_b = pd.DataFrame(columns=self.b.columns.tolist())

        if not isinstance(index_class, recordlinkage.RandomIndex):
            # make an index
            pairs = index_class.index((df_a, df_b))

            # check if the MultiIndex has length 0
            self.assertIsInstance(pairs, pd.MultiIndex)
            self.assertEqual(len(pairs), 0)
        else:
            # RandomIndex is expected to reject empty input.
            with self.assertRaises(ValueError):
                index_class.index((df_a, df_b))

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_error_handling(self, index_class):
        """Test error handling on non-unique index."""

        # make a non_unique index
        df_a = self.a.rename(
            index={self.a.index[1]: self.a.index[0]}, inplace=False)

        with self.assertRaises(ValueError):
            index_class.index(df_a)

    # NOTE(review): the tests below get their own indexer instances instead
    # of TEST_INDEXATION_OBJECTS, presumably so that state changes such as
    # the ``suffixes`` assignment do not leak into other tests -- confirm.
    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=False))
    ])
    def test_index_names_dedup(self, index_class):
        # Each input index name and the level names expected in the pairs.
        index_names = ['dedup', None, 'index', int(1)]
        expected = [
            ['dedup_1', 'dedup_2'],
            [None, None],
            ['index_1', 'index_2'],
            ['1_1', '1_2'],
        ]

        for i, name in enumerate(index_names):
            index_A = pd.Index(self.a.index).rename(name)
            df_A = pd.DataFrame(self.a, index=index_A)

            pairs = index_class.index((df_A))

            self.assertEqual(pairs.names, expected[i])
            # the input frame must keep its index name
            self.assertEqual(df_A.index.name, name)

    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=False))
    ])
    def test_duplicated_index_names_dedup(self, index_class):
        # make an index for each dataframe with a new index name
        index_a = pd.Index(self.a.index, name='index')
        df_a = pd.DataFrame(self.a, index=index_a)

        # make the index
        pairs = index_class.index(df_a)
        self.assertEqual(pairs.names, ['index_1', 'index_2'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')

        # make the index
        index_class.suffixes = ['_a', '_b']
        pairs = index_class.index(df_a)
        self.assertEqual(pairs.names, ['index_a', 'index_b'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')

    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=False))
    ])
    def test_index_names_link(self, index_class):
        # tuples with the name of the first and second index
        index_names = [
            ('index1', 'index2'),
            ('index1', None),
            (None, 'index2'),
            (None, None),
            (10, 'index2'),
            (10, 11)
        ]

        for name_a, name_b in index_names:
            # make an index for each dataframe with a new index name
            index_a = pd.Index(self.a.index, name=name_a)
            df_a = pd.DataFrame(self.a, index=index_a)

            index_b = pd.Index(self.b.index, name=name_b)
            df_b = pd.DataFrame(self.b, index=index_b)

            pairs = index_class.index((df_a, df_b))
            self.assertEqual(pairs.names, [name_a, name_b])

            # check for inplace editing (not the intention)
            self.assertEqual(df_a.index.name, name_a)
            self.assertEqual(df_b.index.name, name_b)

    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=False))
    ])
    def test_duplicated_index_names_link(self, index_class):
        # make an index for each dataframe with a new index name
        index_a = pd.Index(self.a.index, name='index')
        df_a = pd.DataFrame(self.a, index=index_a)

        index_b = pd.Index(self.b.index, name='index')
        df_b = pd.DataFrame(self.b, index=index_b)

        # make the index
        pairs = index_class.index((df_a, df_b))
        self.assertEqual(pairs.names, ['index_1', 'index_2'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')
        self.assertEqual(df_b.index.name, 'index')

        # make the index
        index_class.suffixes = ['_a', '_b']
        pairs = index_class.index((df_a, df_b))
        self.assertEqual(pairs.names, ['index_a', 'index_b'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')
        self.assertEqual(df_b.index.name, 'index')

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_pickle(self, index_class):
        """Test if it is possible to pickle the class."""

        pickle_path = os.path.join(self.test_dir, 'pickle_compare_obj.pickle')

        # pickle before indexing; the context manager closes the handle
        # instead of leaving it to the garbage collector
        with open(pickle_path, 'wb') as pickle_file:
            pickle.dump(index_class, pickle_file)

        # compute the record pairs
        index_class.index(self.a, self.b)

        # pickle after indexing
        with open(pickle_path, 'wb') as pickle_file:
            pickle.dump(index_class, pickle_file)
# Link the GRID data to the scraped universities on their name columns.
import pandas as pd
import recordlinkage as rl
import helpers as h

h.pandas_setup()

# DataFrame.from_csv was removed in pandas 1.0. read_csv with index_col=0
# and parse_dates=True reproduces its defaults, so the frames come out the
# same before reset_index().
df_a = pd.read_csv(
    './data/grid-df.csv', index_col=0, parse_dates=True).reset_index()
df_b = pd.read_csv(
    './data/scraped/scraped-universities-subset.csv',
    index_col=0, parse_dates=True).reset_index()

# Pair every record of df_a with every record of df_b.
indexer = rl.FullIndex()
candidate_links = indexer.index(df_a, df_b)

# Compare the two name columns on all candidate pairs.
comp = rl.Compare(candidate_links, df_a, df_b)
comp.string('grid-name', 'scraped-name')

# Binarise the first comparison vector: 1 when the score is at least 0.9.
preds = [(1 if i >= .9 else 0) for i in comp.vectors[0]]
fuse = rl.FuseLinks()
def index_pairs(df1, df2):
    """Return the full candidate-pair index between ``df1`` and ``df2``."""
    # Alternative noted in the original: BlockIndex(on='postal_code')
    return recordlinkage.FullIndex().index(df1, df2)