def test_single_blocking_key(self):
    """BLOCKING: one blocking key passed in five equivalent ways."""
    frames = (self.a, self.b)

    # (positional args, keyword args) for every supported spelling of a
    # block on 'var_arange': positional, ``on``, explicit left/right keys,
    # and the list forms of the latter two.
    specs = [
        (('var_arange',), {}),
        ((), {'on': 'var_arange'}),
        ((), {'left_on': 'var_arange', 'right_on': 'var_arange'}),
        ((), {'on': ['var_arange']}),
        ((), {'left_on': ['var_arange'], 'right_on': ['var_arange']}),
    ]

    # Build and run the indexers one after another, in the order above.
    results = [
        Block(*args, **kwargs).index(frames) for args, kwargs in specs
    ]

    # Every spelling has to reproduce the pairs of the positional form.
    baseline = results[0]
    for candidate in results[1:]:
        ptm.assert_index_equal(baseline, candidate)
def test_depr_on_argument(self):
    """Check that ``Block(on=...)`` gives the same pairs as a positional key."""
    # Deduplication-style call: a single frame is indexed against itself.
    expected = Block('var_arange').index(self.a)
    actual = Block(on='var_arange').index(self.a)

    ptm.assert_index_equal(expected, actual)
def test_depr_on_argument(self):
    """Check that the ``on`` keyword warns and still matches the positional key."""
    expected = Block('var_arange').index(self.a)

    # Using the ``on`` keyword has to trigger a deprecation warning
    # somewhere inside this context, otherwise the test fails.
    with pytest.deprecated_call():
        legacy_indexer = Block(on='var_arange')
        actual = legacy_indexer.index(self.a)

    pdt.assert_index_equal(expected, actual)
def test_multiple_blocking_keys(self):
    """BLOCKING: two blocking keys, positional versus left/right lists."""
    keys = ('var_arange', 'var_block10')
    frames = (self.a, self.b)

    # Each call receives its own fresh list, exactly as separate literals
    # would, so no list object is shared between the indexers.
    positional_pairs = Block(list(keys)).index(frames)
    explicit_pairs = Block(
        left_on=list(keys), right_on=list(keys)
    ).index(frames)

    # Both spellings must describe the same candidate pairs.
    ptm.assert_index_equal(positional_pairs, explicit_pairs)
def test_blocking_algorithm_link(self):
    """BLOCKING: pair counts and uniqueness when linking two frames."""
    frames = (self.a, self.b)
    n_a = len(self.a)
    n_b = len(self.b)

    # (blocking column, expected number of candidate pairs)
    cases = [
        ('var_arange', n_a),        # situation 1: eye index
        ('var_block10', n_a * 10),  # situation 2: 10 blocks
        ('var_single', n_a * n_b),  # situation 3: full index
    ]

    for column, expected_size in cases:
        candidates = Block(on=column).index(frames)
        assert len(candidates) == expected_size
        assert candidates.is_unique
def test_blocking_algorithm_dedup(self):
    """BLOCKING: pair counts and uniqueness when deduplicating one frame."""
    len_a = len(self.a)

    # (blocking column, expected number of candidate pairs); the expected
    # sizes use true division, so two of them are floats compared to ints.
    cases = [
        ('var_arange', 0),                             # situation 1: eye index
        ('var_block10', (len_a * 10 - len_a) / 2),     # situation 2: 10 blocks
        ('var_single', (len_a * len_a - len_a) / 2),   # situation 3: full index
    ]

    for column, expected_size in cases:
        candidates = Block(on=column).index(self.a)
        assert len(candidates) == expected_size
        assert candidates.is_unique
def test_add_dedup(self):
    """Check that ``Index.add`` with a list equals the union of each result."""
    block_kwargs = {'left_on': 'var_arange', 'right_on': 'var_arange'}

    # Reference result: run both algorithms separately and merge the pairs.
    full_pairs = Full().index(self.a)
    block_pairs = Block(**block_kwargs).index(self.a)
    expected = full_pairs.union(block_pairs)

    # Same two algorithms, registered together on one combined indexer.
    combined = recordlinkage.Index()
    combined.add([Full(), Block(**block_kwargs)])
    result = combined.index(self.a)

    ptm.assert_index_equal(result, expected)
def _keep_matching_rows(cv):
    """Return the rows of *cv* that pass the 0.95 similarity thresholds.

    A row is kept when its address score lies in [0.95, 1] and at least one
    of the business-name or business-name/trade-style scores does too.
    """
    # ``between`` includes both bounds by default, so no ``inclusive``
    # argument is passed (a boolean value there is rejected by newer pandas).
    name_ok = cv.BusinesNameCompare.between(0.95, 1)
    tradestyle_ok = cv.BNTSCompare.between(0.95, 1)
    address_ok = cv.AddressCompare.between(0.95, 1)
    return cv[(name_ok | tradestyle_ok) & address_ok]


def processing(df, sourceid):
    """Compare record pairs of *df* that share a ``PostCodeKey`` value.

    Candidate pairs are produced by blocking on ``PostCodeKey``, compared in
    chunks with Jaro-Winkler string similarity, and filtered chunk by chunk
    so that only high-scoring rows are retained.

    Args:
        df: DataFrame providing the ``PostCodeKey``, ``BusinessNameKey``,
            ``TradestyleKey`` and ``AddressKey`` columns.
        sourceid: Source identifier; ``1`` is the only value handled.

    Returns:
        A tuple ``(df, cv_full)`` where ``cv_full`` holds the comparison
        scores (columns ``BusinesNameCompare``, ``BNTSCompare`` and
        ``AddressCompare``) of the pairs that passed the thresholds.

    Raises:
        ValueError: If *sourceid* is anything other than ``1``.
    """
    if sourceid != 1:
        # No other source is implemented; fail with a clear message instead
        # of reaching the return statement with ``cv_full`` unassigned.
        raise ValueError(f"unsupported sourceid: {sourceid!r}")

    postal_pairs = Block('PostCodeKey').index(df)
    n_pairs = len(postal_pairs)

    # Smallest chunk count that keeps a chunk below one million pairs;
    # 100 is used when none of the candidates is enough.
    intervalparts = next(
        (parts for parts in (20, 40, 60, 80, 100)
         if n_pairs / parts < 1000000),
        100,
    )

    # Chunk boundaries into ``postal_pairs``.
    # NOTE(review): assumes ``intervals`` returns ascending positions whose
    # last entry is the end of the final chunk - confirm against the helper.
    inter = intervals(intervalparts, n_pairs)

    comp_postal = recordlinkage.Compare(n_jobs=20)
    comp_postal.string('BusinessNameKey', 'BusinessNameKey',
                       method='jarowinkler', label='BusinesNameCompare')
    comp_postal.string('TradestyleKey', 'BusinessNameKey',
                       method='jarowinkler', label='BNTSCompare')
    comp_postal.string('AddressKey', 'AddressKey',
                       method='jarowinkler', label='AddressCompare')

    # Filter each chunk right after computing it so that only the retained
    # rows stay in memory.
    first_chunk = postal_pairs[0:inter[1]]
    kept = [_keep_matching_rows(comp_postal.compute(first_chunk, df))]

    for i in range(1, len(inter) - 1):
        # Slices are half-open: the previous chunk stopped just before
        # ``inter[i]``, so this one has to start exactly there. Starting at
        # ``inter[i] + 1`` would silently skip one pair per boundary.
        chunk = postal_pairs[inter[i]:inter[i + 1]]
        kept.append(_keep_matching_rows(comp_postal.compute(chunk, df)))

    # Concatenate once at the end rather than re-copying inside the loop.
    cv_full = kept[0] if len(kept) == 1 else pd.concat(kept)
    return df, cv_full
def test_annotation_link(tmp_path):
    """Write an annotation file for two frames and read it back unlabelled."""
    annotation_path = tmp_path / "febrl_annotation_link.json"

    # The known links are returned as well but are not needed here.
    df_a, df_b, _ = load_febrl4(return_links=True)

    # Candidate record pairs that agree on the given name.
    candidate_pairs = Block("given_name", "given_name").index(df_a, df_b)

    # Only the first ten pairs go into the annotation file.
    rl.write_annotation_file(
        annotation_path, candidate_pairs[0:10], df_a, df_b
    )
    annotation = rl.read_annotation_file(annotation_path)

    # Nothing has been annotated, so neither collection is populated.
    assert annotation.links is None
    assert annotation.distinct is None
# Notebook-style walkthrough: build candidate pairs for `df`, then set up a
# string comparison. `df` and `pd` must already exist in the session.

# Bare expression: requests the first six rows; the value is not stored
# (presumably shown as notebook cell output).
df.head(6)

import recordlinkage as rl
from recordlinkage.index import Full

# Index `df` against itself with the Full indexer and report the sizes.
full_indexer = Full()
pairs = full_indexer.index(df)
print(f"Full index: {len(df)} records, {len(pairs)} pairs")

from recordlinkage.index import Block

# Index again, this time blocking on the 'postal' column; `pairs` is rebound
# to this result and the earlier full index is no longer referenced.
postal_indexer = Block('postal')
pairs = postal_indexer.index(df)
print(f"Postal index: {len(pairs)} pairs")

# Bare expression: first ten rows of the pair index as an array of values.
pairs.to_frame()[:10].values

# Bare expression: a one-row frame with four score columns, indexed by the
# pair (100, 200). NOTE(review): looks like a hand-made illustration of a
# comparison result - confirm intent.
pd.DataFrame([[0.5, 0.8, 0.9, 1]], columns=['name', 'addr', 'postal', 'latlng'], index=pd.MultiIndex.from_arrays([[100], [200]]))

# Register Jaro-Winkler string comparisons for three columns, each compared
# with the same-named column and labelled with the column name.
comp = rl.Compare()
comp.string('name', 'name', method='jarowinkler', label='name')
comp.string('addr', 'addr', method='jarowinkler', label='addr')
comp.string('postal', 'postal', method='jarowinkler', label='postal')