def test_datasets_existence(self):
    # Load all datasets
    load_febrl1()
    load_febrl2()
    load_febrl3()
    load_febrl4()
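# Hedged companion test (an addition, not in the original): each loader above
# should return a pandas.DataFrame; record counts are deliberately not
# asserted here.
def test_datasets_return_dataframes(self):
    # load_febrl1/2/3 each return a single DataFrame; load_febrl4 returns
    # a pair of DataFrames and is covered by test_febrl4 below.
    for load in (load_febrl1, load_febrl2, load_febrl3):
        df = load()
        self.assertIsInstance(df, pandas.DataFrame)
        self.assertFalse(df.empty)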
def test_febrl4(self):
    dfa, dfb = load_febrl4()
    self.assertIsInstance(dfa, pandas.DataFrame)
    self.assertIsInstance(dfb, pandas.DataFrame)
    self.assertEqual(len(dfa), 5000)
    self.assertEqual(len(dfb), 5000)

    dfa, dfb, links = load_febrl4(return_links=True)
    self.assertIsInstance(dfa, pandas.DataFrame)
    self.assertIsInstance(dfb, pandas.DataFrame)
    self.assertEqual(len(dfa), 5000)
    self.assertEqual(len(dfb), 5000)
    self.assertIsInstance(links, pandas.MultiIndex)
def test_febrl_links():
    dfa, dfb, links = load_febrl4(return_links=True)
    assert isinstance(dfa, pandas.DataFrame)
    assert isinstance(dfb, pandas.DataFrame)
    assert len(dfa) == 5000
    assert len(dfb) == 5000
    assert isinstance(links, pandas.MultiIndex)
def test_febrl4():
    dfa, dfb = load_febrl4()
    assert isinstance(dfa, pandas.DataFrame)
    assert isinstance(dfb, pandas.DataFrame)
    assert len(dfa) == 5000
    assert len(dfb) == 5000
def setup(self):
    # download data
    self.A, self.B = load_febrl4()

    # cast postcodes to numeric values
    self.A['postcode'] = self.A['postcode'].astype(float)
    self.B['postcode'] = self.B['postcode'].astype(float)

    # make pairs (slice bounds must be ints, not floats)
    c_pairs = rl.FullIndex()
    self.pairs = c_pairs.index(self.A, self.B)[0:int(5e4)]
def setup(self):
    # download data
    self.A, self.B = load_febrl4()

    # cast postcodes to numeric values
    self.A['postcode'] = self.A['postcode'].astype(float)
    self.B['postcode'] = self.B['postcode'].astype(float)

    # make pairs (slice bounds must be ints, not floats)
    c_pairs = rl.Pairs(self.A, self.B)
    self.pairs = c_pairs.full()[0:int(5e4)]
def setup(self):
    # download data
    self.A, self.B = load_febrl4()

    # make pairs
    c_pairs = rl.Pairs(self.A, self.B)
    pairs = c_pairs.full()

    # different sizes of pairs (slice bounds must be ints, not floats)
    self.pairs_xsmall = pairs[0:int(5e3)]
    self.pairs_small = pairs[0:int(5e4)]
    self.pairs_medium = pairs[0:int(5e5)]
    self.pairs_large = pairs[0:int(5e6)]
def setup(self):
    # download data
    self.A, self.B = load_febrl4()

    # make pairs
    c_pairs = rl.FullIndex()
    pairs = c_pairs.index(self.A, self.B)

    # different sizes of pairs (slice bounds must be ints, not floats)
    self.pairs_xsmall = pairs[0:int(5e3)]
    self.pairs_small = pairs[0:int(5e4)]
    self.pairs_medium = pairs[0:int(5e5)]
    self.pairs_large = pairs[0:int(5e6)]
def test_annotation_link(tmp_path):
    path = tmp_path / "febrl_annotation_link.json"

    # get the febrl4 data
    df_a, df_b, matches = load_febrl4(return_links=True)

    # get record pairs
    indexer = Block("given_name", "given_name")
    pairs = indexer.index(df_a, df_b)

    # write an annotation file for the febrl4 dataset
    rl.write_annotation_file(path, pairs[0:10], df_a, df_b)

    # read the result
    result = rl.read_annotation_file(path)

    assert result.links is None
    assert result.distinct is None
def test_full_iter_index_linking(self):
    dfA, dfB = datasets.load_febrl4()

    index_chunks = recordlinkage.Pairs(dfA, dfB, chunks=(100, 200))
    index = recordlinkage.Pairs(dfA, dfB)

    # Compute pairs in one iteration
    pairs_single = index.full()

    # Compute pairs in iterations
    n_pairs_iter = 0
    for pairs in index_chunks.full():
        print(len(pairs))
        n_pairs_iter += len(pairs)

        # Check if index is unique
        self.assertTrue(pairs.is_unique)

    self.assertEqual(len(pairs_single), n_pairs_iter)

    # Check if the number of pairs is correct
    self.assertEqual(n_pairs_iter, len(dfA) * len(dfB))
@classmethod
def setUpClass(cls):
    # setUpClass must be a classmethod; unittest calls it on the class
    cls.A, cls.B = datasets.load_febrl4()
def setup(self):
    # download data
    self.A, self.B = load_febrl4()
# IPython session history; the first statement is truncated mid-call.
method='levenshtein', threshold=0.7, label='stocazzo')
features = compare.compute(candidate_pairs, discogs_df, wikidata_df)
features
discogs_df[304]
discogs_df.catalog_id
features.describe()
features.sum(axis=1).value_counts().sort_index(ascending=False)
features[features.sum(axis=1) > 3]
features[features.sum(axis=1) > 3]
features.sum(axis=1)
features[features.sum(axis=1) > 1]
features[features.sum(axis=1) > 0]
from recordlinkage.datasets import load_febrl4
a, b = load_febrl4()
a
b
idx = recordlinkage.Index()
idx.block('given_name')
candidates = idx.index(a, b)
compare = recordlinkage.Compare()
get_ipython().run_line_magic('cpaste', '')
compare_cl = recordlinkage.Compare()
get_ipython().run_line_magic('cpaste', '')
features = compare.compute(candidates, a, b)
features = compare_cl.compute(candidates, a, b)
features
features.describe()
features.sum(axis=1).value_counts().sort_index(ascending=False)
features
'''
from __future__ import print_function

import recordlinkage as rl
from recordlinkage.index import Block
from recordlinkage.compare import Exact, String
from recordlinkage.datasets import load_febrl4

# set logging
rl.logging.set_verbosity(rl.logging.INFO)

# load datasets
print('Loading data...')
dfA, dfB, true_links = load_febrl4(return_links=True)
print(len(dfA), 'records in dataset A')
print(len(dfB), 'records in dataset B')
print(len(true_links), 'links between dataset A and B')

# start indexing
print('Build index...')
indexer = rl.Index()
indexer.add(Block('given_name'))
indexer.add(Block('surname'))
indexer.add(Block('soc_sec_id'))
candidate_links = indexer.index(dfA, dfB)

# start comparing
print('Start comparing...')
comparer = rl.Compare()
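# Hedged continuation sketch: the script above is truncated after creating
# the Compare object. The rules below are illustrative assumptions, not the
# original author's configuration; Exact and String are imported above.
comparer.add(Exact('given_name', 'given_name', label='given_name'))
comparer.add(String('surname', 'surname', method='jarowinkler', threshold=0.85, label='surname'))
comparer.add(Exact('date_of_birth', 'date_of_birth', label='date_of_birth'))
features = comparer.compute(candidate_links, dfA, dfB)
print(features.sum(axis=1).value_counts().sort_index(ascending=False))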
def setup(self):
    # download data
    self.A, self.B = load_febrl4()
# Predict
pred = classifier.predict(df_comparison_results)

# Convert to DataFrame for readability
df = pd.DataFrame([pred]).transpose()
df.head()

del (train, list_text_data, list_int_data, indexer, candidate_links,
     compare_rl, df_comparison_results, list_features, classifier, pred, df)

#%% Entity resolution / deduplication from two tables
# https://recordlinkage.readthedocs.io/en/latest/notebooks/link_two_dataframes.html
from recordlinkage.datasets import load_febrl4

# load data
train_one, train_two = load_febrl4()
train_one.columns = list(map(str.upper, train_one.columns))
train_two.columns = list(map(str.upper, train_two.columns))

# First view
train_one.shape, train_two.shape
train_one.dtypes
train_two.dtypes
train_one.info()
train_two.info()

# Make a list of the text columns and convert them to appropriate data types
list_text_data = [
    'GIVEN_NAME', 'SURNAME', 'STREET_NUMBER', 'ADDRESS_1', 'ADDRESS_2',
    'SUBURB', 'STATE'
]
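# Hedged continuation sketch: the snippet above is truncated after defining
# list_text_data. A plausible next step (an assumption, not the original
# author's code) is casting the text columns to strings on both tables.
for col in list_text_data:
    train_one[col] = train_one[col].astype(str)
    train_two[col] = train_two[col].astype(str)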
import io
import itertools

import numpy as np
import pandas as pd

# import modules necessary for schema definition
import clkhash
from clkhash.field_formats import *
from clkhash.schema import Schema
from clkhash.comparators import NgramComparison

from anonlinkclient.utils import (combine_clks_blocks,
                                  generate_candidate_blocks_from_csv,
                                  generate_clk_from_csv)

import recordlinkage
from recordlinkage.datasets import load_febrl4

dfA, dfB = load_febrl4()

print(dfA.head())
print(dfA.columns)

a_csv = io.StringIO()
dfA.to_csv(a_csv)

# schema definition (the list is truncated in the source)
fields = [
    Ignore('rec_id'),
    StringSpec('given_name',
               FieldHashingProperties(comparator=NgramComparison(2),
                                      strategy=BitsPerFeatureStrategy(300))),
    StringSpec('surname',
               FieldHashingProperties(comparator=NgramComparison(2),
                                      strategy=BitsPerFeatureStrategy(300))),
    IntegerSpec('street_number',
                FieldHashingProperties(comparator=NgramComparison(1, True),
                                       strategy=BitsPerFeatureStrategy(300),
                                       missing_value=MissingValueSpec(sentinel=''))),
    StringSpec('address_1',
               FieldHashingProperties(comparator=NgramComparison(2),
                                      strategy=BitsPerFeatureStrategy(300))),
    StringSpec('address_2',
               FieldHashingProperties(comparator=NgramComparison(2),
                                      strategy=BitsPerFeatureStrategy(300))),
    StringSpec('suburb',
               FieldHashingProperties(comparator=NgramComparison(2),
                                      strategy=BitsPerFeatureStrategy(300))),
    IntegerSpec('postcode',
                FieldHashingProperties(comparator=NgramComparison(1, True),
                                       strategy=BitsPerFeatureStrategy(300))),
    StringSpec('state',
               FieldHashingProperties(comparator=NgramComparison(2),
                                      strategy=BitsPerFeatureStrategy(300))),
def init(self):
    # Read data
    dfA_complete, dfB_complete, true_links_complete = load_febrl4(return_links=True)
    logger.info("Sample record: %s", str(dfA_complete[:1]))
    logger.info("Shape of datasets: A = %s B = %s", str(dfA_complete.shape), str(dfB_complete.shape))
    logger.info("Shape of True Links: %s", str(true_links_complete.shape))

    # Split into train, test and validation datasets by record number
    self.testDataA = []
    self.testDataB = []
    self.trainDataA = []
    self.trainDataB = []
    self.valDataA = []
    self.valDataB = []

    for rec_id, row in dfA_complete.iterrows():
        rec_num = int(rec_id.split('-')[1])
        if rec_num < 3000:
            self.trainDataA.append(row)
        elif rec_num < 4500:
            self.testDataA.append(row)
        else:
            self.valDataA.append(row)

    for rec_id, row in dfB_complete.iterrows():
        rec_num = int(rec_id.split('-')[1])
        if rec_num < 3000:
            self.trainDataB.append(row)
        elif rec_num < 4500:
            self.testDataB.append(row)
        else:
            self.valDataB.append(row)

    self.trainDataA = pd.DataFrame(data=self.trainDataA)
    self.trainDataB = pd.DataFrame(data=self.trainDataB)
    self.testDataA = pd.DataFrame(data=self.testDataA)
    self.testDataB = pd.DataFrame(data=self.testDataB)
    self.valDataA = pd.DataFrame(data=self.valDataA)
    self.valDataB = pd.DataFrame(data=self.valDataB)

    logger.info("Train DataA shape: %s", str(self.trainDataA.shape))
    logger.info("Train DataB shape: %s", str(self.trainDataB.shape))
    logger.info("Test DataA shape: %s", str(self.testDataA.shape))
    logger.info("Test DataB shape: %s", str(self.testDataB.shape))
    logger.info("Val DataA shape: %s", str(self.valDataA.shape))
    logger.info("Val DataB shape: %s", str(self.valDataB.shape))

    # Compute candidate links for training
    indexer = recordlinkage.Index()
    indexer.block('given_name')
    self.candidate_links = indexer.index(self.trainDataA, self.trainDataB)
    logger.info("Training Candidate Pairs: %d", len(self.candidate_links))

    # Extract true links among the training candidates
    true_links_train = []
    for i in self.candidate_links:
        if i in true_links_complete:
            true_links_train.append(i)
    self.true_links = pd.MultiIndex.from_tuples(true_links_train)
    logger.info("True Pairs: %d", len(self.true_links))

    # Compute candidate links for testing
    indexer = recordlinkage.Index()
    indexer.block('given_name')
    self.test_links = indexer.index(self.testDataA, self.testDataB)
    logger.info("Testing Candidate Pairs: %d", len(self.test_links))

    # Extract true links among the test candidates
    true_links_test = []
    for i in self.test_links:
        if i in true_links_complete:
            true_links_test.append(i)
    self.true_test_links = pd.MultiIndex.from_tuples(true_links_test)
    logger.info("True Test Pairs: %d", len(self.true_test_links))

    # Compute candidate links for validation
    indexer = recordlinkage.Index()
    indexer.block('given_name')
    self.val_links = indexer.index(self.valDataA, self.valDataB)
    logger.info("Validation Candidate Pairs: %d", len(self.val_links))

    # Extract true links among the validation candidates
    true_links_val = []
    for i in self.val_links:
        if i in true_links_complete:
            true_links_val.append(i)
    self.true_val_links = pd.MultiIndex.from_tuples(true_links_val)
    logger.info("True Validation Pairs: %d", len(self.true_val_links))
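# Hedged sketch of a plausible next step (not part of the original init):
# turn the training candidate pairs into a feature matrix with a small,
# illustrative set of comparison rules.
compare_cl = recordlinkage.Compare()
compare_cl.exact('given_name', 'given_name', label='given_name')
compare_cl.string('surname', 'surname', method='jarowinkler', threshold=0.85, label='surname')
compare_cl.exact('date_of_birth', 'date_of_birth', label='date_of_birth')
features = compare_cl.compute(self.candidate_links, self.trainDataA, self.trainDataB)
logger.info("Feature matrix shape: %s", str(features.shape))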
def main():
    dfA, dfB = load_febrl4()

    indexer = recordlinkage.Index()
    indexer.full()
    pairs = indexer.index(dfA, dfB)

    return pairs
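# Usage sketch (an addition, not in the original snippet): run the full
# indexing step as a script and report the number of candidate pairs.
if __name__ == '__main__':
    candidate_pairs = main()
    print(len(candidate_pairs), 'candidate pairs')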