def test_iterative(self):
    """Test the iterative behaviour.

    Indexing ``self.a`` against ``self.b`` in one call must give the same
    candidate pairs as indexing two slices of ``self.a`` separately and
    appending the results.
    """
    # Reference result: a single call over the complete inputs.
    single_step_indexer = Full()
    single_step_pairs = single_step_indexer.index((self.a, self.b))
    expected = pd.DataFrame(index=single_step_pairs).sort_index()

    # Same kind of indexer, fed rows 0-49 and 50-99 of ``self.a`` in turn.
    multi_step_indexer = Full()
    first_chunk = multi_step_indexer.index((self.a[0:50], self.b))
    second_chunk = multi_step_indexer.index((self.a[50:100], self.b))
    combined = first_chunk.append(second_chunk)
    actual = pd.DataFrame(index=combined).sort_index()

    # Both frames are sorted, so the comparison ignores pair order.
    ptm.assert_frame_equal(expected, actual)
def test_basic_link(self):
    """FULL: Test basic characteristics of full indexing (link)."""
    from recordlinkage.index import Full

    # Linking two frames: the indexer receives the (a, b) tuple.
    candidate_pairs = Full().index((self.a, self.b))

    # The result must be a MultiIndex ...
    assert isinstance(candidate_pairs, pd.MultiIndex)
    # ... with one entry per (record in a, record in b) combination ...
    assert len(candidate_pairs) == len(self.a) * len(self.b)
    # ... and no repeated pairs.
    assert candidate_pairs.is_unique
def test_basic_dedup(self):
    """FULL: Test basic characteristics of full indexing (dedup).

    Indexing a single frame against itself must return a unique
    ``pd.MultiIndex`` holding n * (n - 1) / 2 pairs, where n is
    ``len(self.a)``.
    """
    from recordlinkage.index import Full

    # finding duplicates
    index_cl = Full()
    pairs = index_cl.index(self.a)

    n_records = len(self.a)
    # n * (n - 1) is always even, so floor division is exact and keeps the
    # expected count an int instead of comparing an int length to a float.
    expected_pairs = n_records * (n_records - 1) // 2

    self.assertIsInstance(pairs, pd.MultiIndex)
    self.assertEqual(len(pairs), expected_pairs)
    self.assertTrue(pairs.is_unique)
def test_add_dedup(self):
    """Check that ``Index.add`` with a list of indexers matches their union.

    The expected pairs are the union of a ``Full`` index and a ``Block``
    index on ``var_arange``, each computed separately over ``self.a``.
    """
    block_kwargs = {'left_on': 'var_arange', 'right_on': 'var_arange'}

    # Expected result: run each indexer on its own and union the pairs.
    full_indexer = Full()
    block_indexer = Block(**block_kwargs)
    full_pairs = full_indexer.index(self.a)
    block_pairs = block_indexer.index(self.a)
    expected = full_pairs.union(block_pairs)

    # Actual result: hand fresh instances of both indexers to one Index.
    combined_indexer = recordlinkage.Index()
    combined_indexer.add([Full(), Block(**block_kwargs)])
    result = combined_indexer.index(self.a)

    ptm.assert_index_equal(result, expected)
def assign_postal_lat_lng(df): addresses = df['addr'].str.cat(df['city'], sep=', ') addresses_to_postal = [address_to_postal.get(a) for a in addresses] addresses_to_lat = [address_to_latlng[a][0] if a in address_to_latlng else None for a in addresses] addresses_to_lng = [address_to_latlng[a][1] if a in address_to_latlng else None for a in addresses] return df.assign(postal=addresses_to_postal, lat=addresses_to_lat, lng=addresses_to_lng) df = assign_postal_lat_lng(df) df.head(6) import recordlinkage as rl from recordlinkage.index import Full full_indexer = Full() pairs = full_indexer.index(df) print(f"Full index: {len(df)} records, {len(pairs)} pairs") from recordlinkage.index import Block postal_indexer = Block('postal') pairs = postal_indexer.index(df) print(f"Postal index: {len(pairs)} pairs") pairs.to_frame()[:10].values pd.DataFrame([[0.5, 0.8, 0.9, 1]],