def test_random_index(self):
    """Check that random indexing yields the requested number of unique pairs."""
    left = pd.DataFrame(
        self.data_A, index=pd.Index(self.index, name='index_a'))
    right = pd.DataFrame(
        self.data_B, index=pd.Index(self.index, name='index_b'))

    # A fresh indexer is created for each requested sample size.
    for n_requested in (20, 5):
        indexer = rl.Pairs(left, right)
        sampled = indexer.random(n_requested)

        # The sampled pairs must not contain duplicates ...
        self.assertTrue(sampled.is_unique)
        # ... and there must be exactly as many as were asked for.
        self.assertEqual(len(sampled), n_requested)
def time_block_index(self):
    """Build a ``Pairs`` indexer on ``self.A`` and block on ``given_name``."""
    indexer = rl.Pairs(self.A)
    indexer.block('given_name')
def test_index_unique_dedup(self):
    """``Pairs`` must raise ``IndexError`` for a frame indexed by ``bad_index_a``."""
    bad_index = pd.Index(self.bad_index_a, name='left')
    frame = pd.DataFrame(self.A, index=bad_index)

    # Callable form of assertRaises: the constructor itself must fail.
    self.assertRaises(IndexError, recordlinkage.Pairs, frame)
def time_full_index(self):
    """Build a ``Pairs`` indexer on ``self.A`` and compute the full index."""
    indexer = rl.Pairs(self.A)
    indexer.full()
def time_sni_index(self):
    """Build a ``Pairs`` indexer on ``self.A`` and run sorted-neighbourhood
    indexing on ``given_name`` with a second argument of 5."""
    indexer = rl.Pairs(self.A)
    indexer.sortedneighbourhood('given_name', 5)
def time_random_index(self):
    """Build a ``Pairs`` indexer on ``self.A`` and request 2500 random pairs."""
    indexer = rl.Pairs(self.A)
    indexer.random(2500)
def test_random_index_errors(self):
    """Check that ``random`` raises ``ValueError`` for invalid sample sizes."""
    left = pd.DataFrame(
        self.data_A, index=pd.Index(self.index, name='index_a'))
    right = pd.DataFrame(
        self.data_B, index=pd.Index(self.index, name='index_b'))
    indexer = rl.Pairs(left, right)

    # Negative, zero, float (fractional and integral) and string arguments
    # are each expected to be rejected, in this order.
    invalid_sizes = (-10, 0, 5.5, 5.0, 'str')
    for bad_size in invalid_sizes:
        with self.assertRaises(ValueError):
            indexer.random(bad_size)
def test_instance_dedup(self, method_to_call, *args, **kwargs):
    """Check that the named indexing method returns a ``pd.MultiIndex``.

    ``method_to_call`` is looked up on a ``Pairs`` object built from
    ``self.A`` alone and invoked with ``*args`` / ``**kwargs``.
    """
    frame = pd.DataFrame(self.A)
    indexer = recordlinkage.Pairs(frame)
    make_pairs = getattr(indexer, method_to_call)
    result = make_pairs(*args, **kwargs)
    self.assertIsInstance(result, pd.MultiIndex)
def test_amount_of_pairs_dedup(self, method_to_call, *args, **kwargs):
    """Check that the number of pairs does not exceed ``n * (n - 1) / 2``.

    ``n`` is the number of records in the single frame built from ``self.A``.
    """
    frame = pd.DataFrame(self.A)
    indexer = recordlinkage.Pairs(frame)
    make_pairs = getattr(indexer, method_to_call)
    found = make_pairs(*args, **kwargs)

    # Upper bound: number of unordered pairs of distinct records.
    n_records = len(frame)
    upper_bound = n_records * (n_records - 1) / 2
    self.assertTrue(len(found) <= upper_bound)
def test_pairs_are_unique_linking(self, method_to_call, *args, **kwargs):
    """Check that the named indexing method yields no duplicate pairs
    when linking ``self.A`` against ``self.B``."""
    left = pd.DataFrame(self.A)
    right = pd.DataFrame(self.B)
    indexer = recordlinkage.Pairs(left, right)
    make_pairs = getattr(indexer, method_to_call)
    result = make_pairs(*args, **kwargs)
    self.assertTrue(result.is_unique)
def test_empty_dataframes_dedup(self, method_to_call, *args, **kwargs):
    """Check that indexing an empty frame still returns a ``pd.MultiIndex``.

    The ``'random'`` method is excluded from this check.
    """
    # A frame with the columns of self.A but no rows.
    empty_frame = pd.DataFrame(columns=self.A.keys())

    if method_to_call == 'random':
        return

    indexer = recordlinkage.Pairs(empty_frame)
    result = getattr(indexer, method_to_call)(*args, **kwargs)
    self.assertIsInstance(result, pd.MultiIndex)
def test_index_unique_linking(self):
    """``Pairs`` must raise ``IndexError`` when the left frame is indexed
    by ``bad_index_a``."""
    left_index = pd.Index(self.bad_index_a, name='left')
    right_index = pd.Index(self.index_b, name='right')
    left = pd.DataFrame(self.A, index=left_index)
    right = pd.DataFrame(self.B, index=right_index)

    # Callable form of assertRaises: the constructor itself must fail.
    self.assertRaises(IndexError, recordlinkage.Pairs, left, right)
def test_reduction_ratio_dedup(self, method_to_call, *args, **kwargs):
    """Check that ``reduction`` lies in ``[0, 1]`` after indexing ``self.A``."""
    indexer = recordlinkage.Pairs(pd.DataFrame(self.A))

    # The pairs themselves are not needed; only the side effect on the
    # indexer's ``reduction`` attribute is inspected.
    getattr(indexer, method_to_call)(*args, **kwargs)

    ratio = indexer.reduction
    not_negative = ratio >= 0.0
    at_most_one = ratio <= 1.0
    self.assertTrue(not_negative & at_most_one)
def find_all_matches(first_df, second_df):
    """Compare all candidate pairs of two frames on first and last name.

    The full index of ``first_df`` x ``second_df`` is generated in chunks of
    500 (``chunks=500``). Each chunk is compared with the Jaro string
    measure on ``first_name`` (threshold 0.72) and ``last_name``
    (threshold 0.83) and handed to ``get_df_from_vector``.

    Previously the function returned after a single chunk, so the remaining
    candidate pairs were never compared. Now every chunk is processed and
    the per-chunk results are concatenated.

    :param first_df: left data frame; needs ``first_name`` and ``last_name``.
    :param second_df: right data frame; needs ``first_name`` and ``last_name``.
    :return: the result of ``get_df_from_vector`` for a single chunk, the
        concatenation of the per-chunk results for several chunks, or an
        empty ``DataFrame`` when the index yields no chunk at all.
    """
    # Local import so this block does not depend on a module-level alias.
    import pandas as pd

    index = recordlinkage.Pairs(first_df, second_df, chunks=500)

    chunk_results = []
    for links in index.full():
        fill = recordlinkage.Compare(links, first_df, second_df)
        fill.string('first_name', 'first_name', method='jaro', threshold=0.72)
        fill.string('last_name', 'last_name', method='jaro', threshold=0.83)
        chunk_results.append(
            get_df_from_vector(fill.vectors, first_df, second_df))

    if not chunk_results:
        return pd.DataFrame()
    if len(chunk_results) == 1:
        # Single chunk: hand back the helper's result untouched.
        return chunk_results[0]
    # NOTE(review): assumes get_df_from_vector returns a DataFrame (as its
    # name suggests) so the chunks can be stacked row-wise -- confirm.
    return pd.concat(chunk_results)
def test_index_names_linking(self, method_to_call, *args, **kwargs):
    """Check that indexing leaves the (unset) index names of both inputs alone."""
    left = pd.DataFrame(self.A)
    right = pd.DataFrame(self.B)
    indexer = recordlinkage.Pairs(left, right)
    getattr(indexer, method_to_call)(*args, **kwargs)

    # Neither input frame may have had a name assigned to its index.
    for frame in (left, right):
        self.assertEqual(frame.index.name, None)
def setup(self):
    """Load the FEBRL4 data sets and prepare a subset of the full index.

    Sets ``self.A`` and ``self.B`` (with ``postcode`` cast to float) and
    ``self.pairs``, the first 50 000 pairs of the full index.
    """
    # download data
    self.A, self.B = load_febrl4()

    # Make the postcode column numeric.
    self.A['postcode'] = self.A['postcode'].astype(float)
    self.B['postcode'] = self.B['postcode'].astype(float)

    # make pairs
    c_pairs = rl.Pairs(self.A, self.B)

    # Slice bounds must be integers: the float literal 5e4 raises a
    # TypeError on current numpy/pandas.
    self.pairs = c_pairs.full()[0:50000]
def test_full_iter_index_dedup(self, method_to_call, *args, **kwargs):
    """Check that chunked indexing yields as many pairs as one-shot indexing.

    One indexer is created with ``chunks=100`` and one without; every chunk
    must be free of duplicates and the chunk sizes must add up to the size
    of the one-shot result.
    """
    frame = pd.DataFrame(self.A)
    chunked_indexer = recordlinkage.Pairs(frame, chunks=100)
    plain_indexer = recordlinkage.Pairs(frame)

    # All pairs in a single call.
    all_at_once = getattr(plain_indexer, method_to_call)(*args, **kwargs)

    # The same pairs, delivered chunk by chunk.
    chunk_sizes = []
    for chunk in getattr(chunked_indexer, method_to_call)(*args, **kwargs):
        print(len(chunk))
        chunk_sizes.append(len(chunk))
        self.assertTrue(chunk.is_unique)

    self.assertEqual(len(all_at_once), sum(chunk_sizes))
def test_index_names_not_none_dedup(self, method_to_call, *args, **kwargs):
    """Check that a named index is used for both levels of the result and
    that the input frame keeps its index name."""
    frame = pd.DataFrame(self.A, index=pd.Index(self.index_a, name='dedup'))

    indexer = recordlinkage.Pairs(frame)
    result = getattr(indexer, method_to_call)(*args, **kwargs)

    self.assertEqual(result.names, ['dedup', 'dedup'])
    self.assertEqual(frame.index.name, 'dedup')
def runSimpleTest2(find_address={'street': ['6 PROSPCT GARDNS EXTER',], 'postcode': ['EX4 6TA',]}):
    """
    Matching a given address against the database. Uses postcode blocking
    and Damerau - Levenshtein distance for the remaining address. Finds the
    best match by identifying the maximum of the sum of the comparison vectors.

    All results are printed; nothing is returned.

    :param find_address: address to look up, with keys ``street`` and
                         ``postcode`` mapping to lists of strings. The
                         default dict is only read, never mutated.
    :type find_address: dict

    :return: None
    """
    print('Matching')
    pprint.pprint(find_address)

    # get data from the database against which we are linking
    df = getData()
    df['postcode'] = df.apply(data._getPostcode, axis=1)
    df['street'] = df.apply(data._removePostcode, axis=1)
    df.drop(['address', ], axis=1, inplace=True)

    # data frame of the one being linked
    find = pd.DataFrame(find_address)

    print('Start parsed matching with postcode blocking...')
    start = datetime.datetime.now()

    # set blocking
    pcl = recordlinkage.Pairs(df, find)
    pairs = pcl.block('postcode')
    print('\nAfter blocking, need to test', len(pairs))

    # compare the two data sets - use different metrics for the comparison
    compare = recordlinkage.Compare(pairs, df, find, batch=True)
    compare.string('postcode', 'postcode', method='jarowinkler', name='postcode_jw')
    compare.string('street', 'street', method='damerau_levenshtein', name='street_dl')
    compare.run()

    # The comparison vectors
    print('\nComparison vectors:')
    print(compare.vectors)

    # find the matches and the best match
    matchmetrics = compare.vectors.sum(axis=1)
    potentialMatches = matchmetrics.index.levels[0].tolist()

    print('\nPotential Matches:')
    print(df.loc[potentialMatches])

    print('\nBest Match:')
    # idxmax() returns the (left, right) index label of the best pair;
    # argmax() returns an integer position on current pandas and cannot be
    # subscripted. Element 0 of the label is the row of ``df``.
    print(df.loc[matchmetrics.idxmax()[0]])

    stop = datetime.datetime.now()
    # total_seconds() covers the whole interval; ``.microseconds`` is only
    # the sub-second component and under-reports runs longer than a second.
    print('\nRun in', round((stop - start).total_seconds(), 2), 'seconds...')
def setup(self):
    """Load the FEBRL1 data set and prepare index subsets of several sizes.

    Sets ``self.A`` and four prefixes of its full index:
    ``pairs_xsmall`` (5 000), ``pairs_small`` (50 000),
    ``pairs_medium`` (500 000) and ``pairs_large`` (5 000 000).
    """
    # download data
    self.A = load_febrl1()

    # make pairs
    c_pairs = rl.Pairs(self.A)
    pairs = c_pairs.full()

    # different sizes of pairs
    # Slice bounds must be integers: float literals such as 5e3 raise a
    # TypeError on current numpy/pandas.
    self.pairs_xsmall = pairs[0:5000]
    self.pairs_small = pairs[0:50000]
    self.pairs_medium = pairs[0:500000]
    self.pairs_large = pairs[0:5000000]
def test_sni_index(self):
    """Check sorted-neighbourhood indexing on ``name``: the pairs are unique
    and there are at most ``len(A) * len(B)`` of them."""
    left = pd.DataFrame(
        self.data_A, index=pd.Index(self.index, name='index_a'))
    right = pd.DataFrame(
        self.data_B, index=pd.Index(self.index, name='index_b'))

    indexer = rl.Pairs(left, right)
    found = indexer.sortedneighbourhood('name')

    self.assertTrue(found.is_unique)

    # The full cross product is the upper bound on the number of pairs.
    max_pairs = len(left) * len(right)
    self.assertTrue(len(found) <= max_pairs)
def test_full_index(self):
    """Check full indexing: the pairs are unique and as numerous as
    ``self.fullindex``."""
    left = pd.DataFrame(
        self.data_A, index=pd.Index(self.index, name='index_a'))
    right = pd.DataFrame(
        self.data_B, index=pd.Index(self.index, name='index_b'))

    indexer = rl.Pairs(left, right)
    found = indexer.full()

    self.assertTrue(found.is_unique)
    self.assertEqual(len(found), len(self.fullindex))
def test_qgram_index(self):
    """Check q-gram indexing on ``name``: the pairs are unique and there are
    at most ``len(A) * len(B)`` of them."""
    left = pd.DataFrame(
        self.data_A, index=pd.Index(self.index, name='index_a'))
    right = pd.DataFrame(
        self.data_B, index=pd.Index(self.index, name='index_b'))

    indexer = rl.Pairs(left, right)
    found = indexer.qgram('name')

    self.assertTrue(found.is_unique)

    # The full cross product is the upper bound on the number of pairs.
    max_pairs = len(left) * len(right)
    self.assertTrue(len(found) <= max_pairs)
def test_index_names_eq_linking(self, method_to_call, *args, **kwargs):
    """Check the behaviour when both input indexes share the same name.

    Both levels of the resulting pairs must carry that name and neither
    input frame may have its index name changed.
    """
    shared_name = 'leftright'
    left_index = pd.Index(self.index_a, name='leftright')
    left = pd.DataFrame(self.A, index=left_index)
    right_index = pd.Index(self.index_b, name='leftright')
    right = pd.DataFrame(self.B, index=right_index)

    indexer = recordlinkage.Pairs(left, right)
    result = getattr(indexer, method_to_call)(*args, **kwargs)

    self.assertEqual(result.names, [shared_name, shared_name])
    for frame in (left, right):
        self.assertEqual(frame.index.name, shared_name)
def test_index_names_none_linking(self, method_to_call, *args, **kwargs):
    """Check the behaviour when neither input index has a name.

    Both levels of the resulting pairs must be unnamed and the input frames
    must still have unnamed indexes afterwards.
    """
    left_index = pd.Index(self.index_a)
    left = pd.DataFrame(self.A, index=left_index)
    right_index = pd.Index(self.index_b)
    right = pd.DataFrame(self.B, index=right_index)

    indexer = recordlinkage.Pairs(left, right)
    result = getattr(indexer, method_to_call)(*args, **kwargs)

    self.assertEqual(result.names, [None, None])
    for frame in (left, right):
        self.assertEqual(frame.index.name, None)
def test_blocking_special_case_of_sorting(self):
    """Check that blocking on ``name`` finds as many pairs as
    sorted-neighbourhood indexing on ``name`` with ``window=1``."""
    left = pd.DataFrame(
        self.data_A, index=pd.Index(self.index, name='index_a'))
    right = pd.DataFrame(
        self.data_B, index=pd.Index(self.index, name='index_b'))

    indexer = rl.Pairs(left, right)
    blocked = indexer.block('name')
    neighbours = indexer.sortedneighbourhood('name', window=1)

    print('The number of record pairs found with blocking', len(blocked))
    print("The number of record pairs found with sorted " +
          "neighbourhood indexing", len(neighbours))

    # Only the sizes of the two results are compared.
    n_blocked = len(blocked)
    n_neighbours = len(neighbours)
    self.assertEqual(n_blocked, n_neighbours)
def test_index_names_one_none_linking(self, method_to_call, *args, **kwargs):
    """Check the behaviour when only the right input index has a name.

    The resulting pairs must be named ``[None, 'right']`` and the input
    frames must keep their index names (``None`` and ``'right'``).
    """
    # Left frame: unnamed index.
    left_base = pd.Index(self.index_a, name=None)
    left = pd.DataFrame(self.A, index=pd.Index(left_base, name=None))

    # Right frame: index named 'right'.
    right_base = pd.Index(self.index_b, name=None)
    right = pd.DataFrame(self.B, index=pd.Index(right_base, name='right'))

    indexer = recordlinkage.Pairs(left, right)
    result = getattr(indexer, method_to_call)(*args, **kwargs)

    print(result.names)
    print(left.index.name)

    self.assertEqual(result.names, [None, 'right'])
    self.assertEqual(left.index.name, None)
    self.assertEqual(right.index.name, 'right')
def test_sni_index_errors(self):
    """Check that ``sortedneighbourhood`` raises ``ValueError`` for each of
    a set of invalid second arguments."""
    left = pd.DataFrame(
        self.data_A, index=pd.Index(self.index, name='index_a'))
    right = pd.DataFrame(
        self.data_B, index=pd.Index(self.index, name='index_b'))
    indexer = rl.Pairs(left, right)

    # A negative value, an even value, a string and a fractional value are
    # each expected to be rejected, in this order.
    invalid_windows = (-3, 2, 'str', 2.5)
    for bad_window in invalid_windows:
        with self.assertRaises(ValueError):
            indexer.sortedneighbourhood('name', bad_window)
def testImpactOfNone(fill=False):
    """
    Show how missing sub-building names affect the string comparison.

    Two small hand-made data sets are blocked on postcode and compared on
    the sub-building name with the Damerau - Levenshtein measure, using
    ``missing_value=0.1``. The comparison results joined to the original
    rows are printed; nothing is returned.

    :param fill: if True, missing sub-building names are replaced by the
                 string ``'NULL'`` in both data sets before comparing
    :type fill: bool

    :return: None
    """
    # create fake data
    toMatch = pd.DataFrame({
        'SubBuildingName': ['FLAT 1', None],
        'Postcode': ['AA', 'AA'],
    })
    AddressBase = pd.DataFrame({
        'SubBuildingNameAB': ['FLAT 1', 'FLAT A', None],
        'PostcodeAB': ['AA', 'AA', 'AA'],
    })

    if fill:
        msk = toMatch['SubBuildingName'].isnull()
        toMatch.loc[msk, 'SubBuildingName'] = 'NULL'
        msk = AddressBase['SubBuildingNameAB'].isnull()
        AddressBase.loc[msk, 'SubBuildingNameAB'] = 'NULL'

    # create pairs
    pcl = recordlinkage.Pairs(toMatch, AddressBase)

    # set blocking
    pairs = pcl.block(left_on=['Postcode'], right_on=['PostcodeAB'])
    print('Need to test', len(pairs), 'pairs for', len(toMatch.index), 'addresses...')

    # compare the two data sets - use different metrics for the comparison
    # The frames are passed in the same order as to Pairs above (toMatch
    # first, AddressBase second) so that each level of the pair index is
    # looked up in the frame it was generated from. They used to be swapped.
    compare = recordlinkage.Compare(pairs, toMatch, AddressBase, batch=True)
    compare.string('SubBuildingName', 'SubBuildingNameAB', missing_value=0.1,
                   method='damerau_levenshtein', name='test')
    compare.run()

    # add sum of the components to the comparison vectors dataframe
    compare.vectors['similarity_sum'] = compare.vectors.sum(axis=1)

    # find all matches where the metrics is above the chosen limit - small impact if choosing the best match
    matches = compare.vectors.loc[compare.vectors['similarity_sum'] > -1.]

    # sort matches by the sum of the vectors
    matches = matches.sort_values(by='similarity_sum', ascending=False)

    # reset indices
    matches = matches.reset_index()
    toMatch = toMatch.reset_index()
    AddressBase = AddressBase.reset_index()

    # join to the original: level_0 refers to toMatch, level_1 to AddressBase
    merged = pd.merge(matches, toMatch, how='left', left_on='level_0', right_on='index')
    merged = pd.merge(merged, AddressBase, how='left', left_on='level_1', right_on='index')

    print(merged)
# Second data set: build the frame and fill in its columns.
B = pd.DataFrame(B)
B['lastname'] = ['Smith', 'Clayton', 'Smith', 'Payne', 'Smith', 'Taylor']
B['age'] = [11, 34, 32, 21, 56, 72]
B['sex'] = ['M', 'M', 'M', 'F', 'F', 'F']
B['lat'] = [0.23, 10.54, 45.98, 23.43, 10.55, 65.21]
B['lon'] = [110.32, -66.98, 10.12, 45.45, -66.96, 32.04]
B.index.name = 'B'  # Author's note: the index is given a name because rl needs one.

# Show both data sets, each preceded by a blank line.
print()
print(A)
print()
print(B)

# -------------------
# Record linkage
A = A  # specify the dataframes to compare (self-assignment, no effect)
B = B  # specify the dataframes to compare (self-assignment, no effect)

# Candidate pairs: block on the 'lastname' column.
index = rl.Pairs(A, B)
candidate_links = index.block('lastname')

compare = rl.Compare(candidate_links, A, B)
# The bare string below is a disabled comparison on 'name'; it is never executed.
''' compare.string('name', 'name', method='jarowinkler', threshold=0.85)'''  # Author's note: this comparison was causing issues.
compare.exact('sex', 'sex')
compare.exact('age', 'age')