Example #1
    def test_datasets_existence(self):

        # Load all datasets
        load_febrl1()
        load_febrl2()
        load_febrl3()
        load_febrl4()
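For reference, the four FEBRL loaders differ in size and shape; the sketch below annotates what each call returns, with record counts per the recordlinkage documentation. Note that load_febrl4 returns two DataFrames, one per file to be linked.

from recordlinkage.datasets import load_febrl1, load_febrl2, load_febrl3, load_febrl4

df1 = load_febrl1()           # 1000 records: 500 originals + 500 duplicates
df2 = load_febrl2()           # 5000 records: 4000 originals + 1000 duplicates
df3 = load_febrl3()           # 5000 records: 2000 originals + 3000 duplicates
df4_a, df4_b = load_febrl4()  # two files of 5000 records each, to be linked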
Example #2
    def test_febrl1(self):

        df = load_febrl1()
        self.assertIsInstance(df, pandas.DataFrame)
        self.assertEqual(len(df), 1000)

        df, links = load_febrl1(return_links=True)
        self.assertIsInstance(df, pandas.DataFrame)
        self.assertEqual(len(df), 1000)
        self.assertIsInstance(links, pandas.MultiIndex)
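Since FEBRL 1 pairs each of the 500 originals with exactly one duplicate, the links index returned by return_links=True should contain 500 true pairs. A minimal sketch of that sanity check:

from recordlinkage.datasets import load_febrl1

df, links = load_febrl1(return_links=True)
assert len(links) == 500   # one true pair per original record
assert links.nlevels == 2  # MultiIndex of (record_a, record_b) labels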
Example #3
    def test_full_iter_index_deduplication(self):

        dfA = datasets.load_febrl1()

        # Compute pairs in one iteration
        index = recordlinkage.Pairs(dfA)
        pairs_single = index.full()

        # Compute pairs in iterations
        n_pairs_iter = 0

        index_chunks = recordlinkage.Pairs(dfA, chunks=100)

        for pairs in index_chunks.full():

            print(len(pairs))
            n_pairs_iter += len(pairs)

            # Check if index is unique
            self.assertTrue(pairs.is_unique)

        self.assertEqual(len(pairs_single), n_pairs_iter)

        # Check if the number of pairs equals n * (n - 1) / 2
        self.assertEqual(n_pairs_iter, len(dfA) * (len(dfA) - 1) // 2)
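A full index over a single DataFrame of n records yields n(n - 1)/2 candidate pairs, so the 1000 FEBRL 1 records produce 1000 * 999 / 2 = 499,500 pairs. The Pairs API above comes from an older release; a minimal sketch of the same check with the newer Full indexer (assuming a recent recordlinkage version):

import recordlinkage
from recordlinkage.datasets import load_febrl1

df = load_febrl1()
indexer = recordlinkage.Full()
pairs = indexer.index(df)

assert len(pairs) == len(df) * (len(df) - 1) // 2  # 499500 for 1000 records
assert pairs.is_unique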
Example #4
    def setup(self):

        # download data
        self.A = load_febrl1()

        # make pairs
        c_pairs = rl.FullIndex()
        pairs = c_pairs.index(self.A)

        # different sizes of pairs
        self.pairs_xsmall = pairs[0:5000]
        self.pairs_small = pairs[0:50000]
        self.pairs_medium = pairs[0:500000]
        self.pairs_large = pairs[0:5000000]
Example #5
    def setup(self):

        # download data
        self.A = load_febrl1()

        # make pairs
        c_pairs = rl.Pairs(self.A)
        pairs = c_pairs.full()

        # different sizes of pairs
        self.pairs_xsmall = pairs[0:5000]
        self.pairs_small = pairs[0:50000]
        self.pairs_medium = pairs[0:500000]
        self.pairs_large = pairs[0:5000000]
Example #6
def test_annotation_dedup(tmp_path):

    path = tmp_path / "febrl_annotation_dedup.json"

    # get the febrl1 dataset and its true links
    df_a, matches = load_febrl1(return_links=True)

    # get record pairs
    indexer = Block("given_name", "given_name")
    pairs = indexer.index(df_a)

    # write an annotation file for the first ten candidate pairs
    rl.write_annotation_file(path, pairs[0:10], df_a)

    # read the result
    result = rl.read_annotation_file(path)

    assert result.links is None
    assert result.distinct is None
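links and distinct are None here because the file has only just been written and carries no labels yet; once the pairs have been labelled (for example in the recordlinkage annotator tool), reading the file back returns the decisions. A hedged sketch of that follow-up:

# after manual labelling, the annotation file carries the decisions
result = rl.read_annotation_file(path)

if result.links is not None:
    print(f"{len(result.links)} pairs labelled as matches")
if result.distinct is not None:
    print(f"{len(result.distinct)} pairs labelled as distinct")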
Example #7
    def setup(self):

        # download data
        self.A = load_febrl1()
Example #9
import os
import numpy as np

# Working directory
os.chdir("D:/trainings/NLP")

# fix random seed for reproducibility
seed_value = 123
np.random.seed(seed_value)

#%% Entity resolution / deduplication from a single table
# Data sets are described at https://recordlinkage.readthedocs.io/en/latest/ref-datasets.html
from recordlinkage.datasets import load_febrl1
# This data set contains 1000 records (500 originals and 500 duplicates, with
# exactly one duplicate per original record).

# load data
train = load_febrl1()
train.columns = [col.upper() for col in train.columns]

# First view
print(train.shape)
print(train.dtypes)
train.info()

# Make a list of columns per data type and convert to appropriate types
list_text_data = [
    'GIVEN_NAME', 'SURNAME', 'STREET_NUMBER', 'ADDRESS_1', 'ADDRESS_2',
    'SUBURB', 'STATE'
]
list_int_data = ['POSTCODE', 'SOC_SEC_ID']
train[list_text_data] = train[list_text_data].astype(str)
train[list_int_data] = train[list_int_data].astype(int)
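The script stops after type conversion; the deduplication itself would follow with indexing, comparing, and classification. A minimal sketch of those next steps (the blocking key, comparison methods, and the two-of-three decision rule are illustrative choices, not part of the original script):

import recordlinkage

# block on SURNAME so only plausible candidate pairs are compared
indexer = recordlinkage.Index()
indexer.block("SURNAME")
candidate_pairs = indexer.index(train)

# compare a few fields of each candidate pair
compare = recordlinkage.Compare()
compare.string("GIVEN_NAME", "GIVEN_NAME", method="jarowinkler",
               threshold=0.85, label="GIVEN_NAME")
compare.string("ADDRESS_1", "ADDRESS_1", method="levenshtein",
               threshold=0.85, label="ADDRESS_1")
compare.exact("POSTCODE", "POSTCODE", label="POSTCODE")
features = compare.compute(candidate_pairs, train)

# simple rule: flag a pair as a duplicate when at least 2 of 3 fields agree
duplicates = features[features.sum(axis=1) >= 2]
print(duplicates.shape)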