def test_datasets_existance(self):
    """Smoke test: every FEBRL dataset loader runs without raising."""
    # Iterate the loaders rather than calling each one by hand; the
    # call order matches the original (febrl1 .. febrl4).
    for load_dataset in (load_febrl1, load_febrl2, load_febrl3, load_febrl4):
        load_dataset()
def test_febrl1(self):
    """FEBRL1 loads as a 1000-row DataFrame, with or without true links."""
    frame = load_febrl1()
    self.assertIsInstance(frame, pandas.DataFrame)
    self.assertEqual(len(frame), 1000)

    # Requesting the ground-truth links returns the frame plus a MultiIndex.
    frame, true_links = load_febrl1(return_links=True)
    self.assertIsInstance(frame, pandas.DataFrame)
    self.assertEqual(len(frame), 1000)
    self.assertIsInstance(true_links, pandas.MultiIndex)
def test_full_iter_index_deduplication(self):
    """Chunked full-index iteration yields the same pairs as a single pass.

    Builds the complete candidate-pair index over FEBRL1 in one call,
    then again in chunks of 100 records, and checks that each chunk's
    index is unique and that the totals agree with n*(n-1)/2.
    """
    dfA = datasets.load_febrl1()

    # Compute all pairs in one iteration.
    index = recordlinkage.Pairs(dfA)
    pairs_single = index.full()

    # Compute pairs chunk by chunk and accumulate the total count.
    # (Removed a leftover debug print; fixed the `index_chucks` typo.)
    n_pairs_iter = 0
    index_chunks = recordlinkage.Pairs(dfA, chunks=100)
    for pairs in index_chunks.full():
        n_pairs_iter += len(pairs)
        # Each chunk's index must be free of duplicate pairs.
        self.assertTrue(pairs.is_unique)

    self.assertEqual(len(pairs_single), n_pairs_iter)

    # A full deduplication index has n*(n-1)/2 pairs; use integer
    # arithmetic so we compare int to int rather than to a float.
    self.assertEqual(n_pairs_iter, (len(dfA) - 1) * len(dfA) // 2)
def setup(self):
    """Benchmark fixture: load FEBRL1 and pre-slice candidate pairs.

    Fix: slice bounds must be ints — float literals such as ``5e3``
    raise ``TypeError`` when used as positional slice indices.
    """
    # download data
    self.A = load_febrl1()

    # make pairs
    c_pairs = rl.FullIndex()
    pairs = c_pairs.index(self.A)

    # different sizes of pairs (int bounds instead of floats like 5e3)
    self.pairs_xsmall = pairs[0:5000]
    self.pairs_small = pairs[0:50000]
    self.pairs_medium = pairs[0:500000]
    self.pairs_large = pairs[0:5000000]
def setup(self):
    """Benchmark fixture: load FEBRL1 and pre-slice candidate pairs.

    Fix: slice bounds must be ints — float literals such as ``5e3``
    raise ``TypeError`` when used as positional slice indices.
    """
    # download data
    self.A = load_febrl1()

    # make pairs
    c_pairs = rl.Pairs(self.A)
    pairs = c_pairs.full()

    # different sizes of pairs (int bounds instead of floats like 5e3)
    self.pairs_xsmall = pairs[0:5000]
    self.pairs_small = pairs[0:50000]
    self.pairs_medium = pairs[0:500000]
    self.pairs_large = pairs[0:5000000]
def test_annotation_dedup(tmp_path):
    """Round-trip an annotation file for a deduplication task.

    A fresh annotation file carries no decisions yet, so both the
    ``links`` and ``distinct`` attributes read back as ``None``.
    """
    annotation_path = tmp_path / "febrl_annotation_dedup.json"

    # Load the FEBRL1 dataset together with its true links.
    df_a, matches = load_febrl1(return_links=True)

    # Block on given_name to obtain candidate record pairs.
    indexer = Block("given_name", "given_name")
    pairs = indexer.index(df_a)

    # Write an annotation file covering the first ten candidate pairs...
    rl.write_annotation_file(annotation_path, pairs[0:10], df_a)

    # ...then read it back: nothing has been annotated yet.
    result = rl.read_annotation_file(annotation_path)
    assert result.links is None
    assert result.distinct is None
def setup(self):
    """Benchmark fixture: load the FEBRL1 dataset onto the instance."""
    # download data
    self.A = load_febrl1()
# Working directory os.chdir("D:/trainings/NLP") # fix random seed for reproducibility seed_value = 123 np.random.seed(seed_value) #%% Entity resolution /Deduplication from single table #Data sets are at link https://recordlinkage.readthedocs.io/en/latest/ref-datasets.html from recordlinkage.datasets import load_febrl1 #This data set contains 1000 records (500 original and 500 duplicates, with exactly #one duplicate per original record. #load data train = load_febrl1() train.columns = map(str.upper, train.columns) # First view train.shape train.dtypes train.info() # Make list of data types and convert to approprite data types list_text_data = [ 'GIVEN_NAME', 'SURNAME', 'STREET_NUMBER', 'ADDRESS_1', 'ADDRESS_2', 'SUBURB', 'STATE' ] list_int_data = ['POSTCODE', 'SOC_SEC_ID'] train[list_text_data] = train[list_text_data].apply(lambda x: x.astype(str)) train[list_int_data] = train[list_int_data].apply(lambda x: x.astype(int))