Beispiel #1
0
    def test_datasets_existance(self):
        """Smoke test: every FEBRL loader can be called without raising."""
        # Calling each loader is the whole test; detailed shape checks
        # live in the per-dataset tests.
        for load_dataset in (load_febrl1, load_febrl2, load_febrl3, load_febrl4):
            load_dataset()
Beispiel #2
0
    def test_febrl4(self):
        """FEBRL4 loads two 5000-record frames, plus a link index on request."""

        # Without links: a pair of DataFrames of 5000 records each.
        frame_a, frame_b = load_febrl4()
        for frame in (frame_a, frame_b):
            self.assertIsInstance(frame, pandas.DataFrame)
            self.assertEqual(len(frame), 5000)

        # With return_links=True: same frames plus a MultiIndex of true links.
        frame_a, frame_b, links = load_febrl4(return_links=True)
        for frame in (frame_a, frame_b):
            self.assertIsInstance(frame, pandas.DataFrame)
            self.assertEqual(len(frame), 5000)
        self.assertIsInstance(links, pandas.MultiIndex)
Beispiel #3
0
def test_febrl_links():
    """load_febrl4(return_links=True) yields two 5000-row frames and the links."""
    frame_a, frame_b, links = load_febrl4(return_links=True)
    for frame in (frame_a, frame_b):
        assert isinstance(frame, pandas.DataFrame)
        assert len(frame) == 5000
    assert isinstance(links, pandas.MultiIndex)
Beispiel #4
0
def test_febrl4():
    """load_febrl4() returns two DataFrames of 5000 records each."""
    frame_a, frame_b = load_febrl4()
    for frame in (frame_a, frame_b):
        assert isinstance(frame, pandas.DataFrame)
        assert len(frame) == 5000
    def setup(self):
        """Benchmark fixture: load FEBRL4 and keep the first 50 000 full-index pairs."""

        # download data
        self.A, self.B = load_febrl4()

        # Postcodes load as strings; numeric comparisons need floats.
        self.A['postcode'] = self.A['postcode'].astype(float)
        self.B['postcode'] = self.B['postcode'].astype(float)

        # make pairs — slice bounds must be ints: the original `5e4` is a
        # float, and float slice indices raise TypeError when slicing the
        # pair index.
        c_pairs = rl.FullIndex()
        self.pairs = c_pairs.index(self.A, self.B)[0:50000]
    def setup(self):
        """Benchmark fixture (old Pairs API): load FEBRL4, keep 50 000 pairs."""

        # download data
        self.A, self.B = load_febrl4()

        # Postcodes load as strings; numeric comparisons need floats.
        self.A['postcode'] = self.A['postcode'].astype(float)
        self.B['postcode'] = self.B['postcode'].astype(float)

        # make pairs — slice bounds must be ints: the original `5e4` is a
        # float, and float slice indices raise TypeError when slicing the
        # pair index.
        c_pairs = rl.Pairs(self.A, self.B)
        self.pairs = c_pairs.full()[0:50000]
    def setup(self):
        """Benchmark fixture: precompute candidate-pair slices of four sizes."""

        # download data
        self.A, self.B = load_febrl4()

        # make pairs
        c_pairs = rl.Pairs(self.A, self.B)
        pairs = c_pairs.full()

        # Different sizes of pairs. Slice bounds must be ints: the
        # original `5e3` .. `5e6` literals are floats, and float slice
        # indices raise TypeError when slicing the pair index.
        self.pairs_xsmall = pairs[0:5_000]
        self.pairs_small = pairs[0:50_000]
        self.pairs_medium = pairs[0:500_000]
        self.pairs_large = pairs[0:5_000_000]
    def setup(self):
        """Benchmark fixture (FullIndex API): precompute pair slices of four sizes."""

        # download data
        self.A, self.B = load_febrl4()

        # make pairs
        c_pairs = rl.FullIndex()
        pairs = c_pairs.index(self.A, self.B)

        # Different sizes of pairs. Slice bounds must be ints: the
        # original `5e3` .. `5e6` literals are floats, and float slice
        # indices raise TypeError when slicing the pair index.
        self.pairs_xsmall = pairs[0:5_000]
        self.pairs_small = pairs[0:50_000]
        self.pairs_medium = pairs[0:500_000]
        self.pairs_large = pairs[0:5_000_000]
Beispiel #9
0
def test_annotation_link(tmp_path):
    """Round-trip a (not yet annotated) link-annotation file for FEBRL4."""

    annotation_path = tmp_path / "febrl_annotation_link.json"

    # get febrl4 file
    df_a, df_b, matches = load_febrl4(return_links=True)

    # get record pairs blocked on given_name
    pairs = Block("given_name", "given_name").index(df_a, df_b)

    # write an annotation file for the Febrl4 dataset
    rl.write_annotation_file(annotation_path, pairs[0:10], df_a, df_b)

    # read the result back
    result = rl.read_annotation_file(annotation_path)

    # nothing has been annotated yet, so both result sets are absent
    assert result.links is None
    assert result.distinct is None
    def test_full_iter_index_linking(self):
        """Chunked full-index iteration must yield the same pairs as one pass."""

        dfA, dfB = datasets.load_febrl4()

        chunked_index = recordlinkage.Pairs(dfA, dfB, chunks=(100, 200))
        whole_index = recordlinkage.Pairs(dfA, dfB)

        # Compute pairs in one iteration
        pairs_single = whole_index.full()

        # Compute pairs chunk by chunk, counting as we go
        total_pairs = 0
        for chunk in chunked_index.full():
            print(len(chunk))
            total_pairs += len(chunk)

            # every chunk's index must be unique
            self.assertTrue(chunk.is_unique)

        self.assertEqual(len(pairs_single), total_pairs)

        # the full index of a link job has len(A) * len(B) pairs
        self.assertEqual(total_pairs, len(dfA) * len(dfB))
    @classmethod
    def setUpClass(cls):
        """Load the FEBRL4 datasets once for the whole test class.

        unittest invokes ``cls.setUpClass()`` on the class itself, so this
        must be a ``classmethod``; without the decorator that call raises
        TypeError (no instance to bind). Attributes assigned on the class
        remain readable through ``self`` in the test methods.
        """
        cls.A, cls.B = datasets.load_febrl4()
Beispiel #12
0
    def setup(self):
        """Benchmark fixture: fetch the two FEBRL4 dataframes."""
        # download data
        self.A, self.B = load_febrl4()
               method='levenshtein',
               threshold=0.7,
               label='stocazzo')
features = compare.compute(candidate_pairs, discogs_df, wikidata_df)
features
discogs_df[304]
discogs_df.catalog_id
features.describe()
features.sum(axis=1).value_counts().sort_index(ascending=False)
features[features.sum(axis=1) > 3]
features[features.sum(axis=1) > 3]
features.sum(axis=1)
features[features.sum(axis=1) > 1]
features[features.sum(axis=1) > 0]
from recordlinkage.datasets import load_febrl4
a, b = load_febrl4()
a
b
idx = recordlinkage.Index()
idx.block('given_name')
candidates = idx.index(a, b)
compare = recordlinkage.Compare()
get_ipython().run_line_magic('cpaste', '')
compare_cl = recordlinkage.Compare()
get_ipython().run_line_magic('cpaste', '')
features = compare.compute(candidates, a, b)
features = compare_cl.compute(candidates, a, b)
features
features.describe()
features.sum(axis=1).value_counts().sort_index(ascending=False)
features
'''

# Linking script for the FEBRL4 dataset pair: load the data, build a
# blocked candidate index, then set up a comparison step.
# NOTE(review): the snippet appears truncated — `Exact` and `String` are
# imported and `comparer` is created, but no comparisons are added here.
from __future__ import print_function

import recordlinkage as rl
from recordlinkage.index import Block
from recordlinkage.compare import Exact, String
from recordlinkage.datasets import load_febrl4

# set logging
rl.logging.set_verbosity(rl.logging.INFO)

# load datasets (true_links is the ground-truth link index)
print('Loading data...')
dfA, dfB, true_links = load_febrl4(return_links=True)
print(len(dfA), 'records in dataset A')
print(len(dfB), 'records in dataset B')
print(len(true_links), 'links between dataset A and B')

# start indexing: three blocking rules are registered on one Index
# (presumably their candidate sets are combined — verify against the
# recordlinkage Index documentation)
print('Build index...')
indexer = rl.Index()
indexer.add(Block('given_name'))
indexer.add(Block('surname'))
indexer.add(Block('soc_sec_id'))
candidate_links = indexer.index(dfA, dfB)

# start comparing
print('Start comparing...')
comparer = rl.Compare()
    def setup(self):
        # Benchmark fixture: fetch the two FEBRL4 dataframes once per run.

        # download data
        self.A, self.B = load_febrl4()
Beispiel #16
0
# Predict with a classifier trained earlier (``classifier`` and
# ``df_comparison_results`` are defined outside this excerpt).
pred = classifier.predict(df_comparison_results)

# Convert to a DataFrame for readability
df = pd.DataFrame([pred]).transpose()
df.head()

# Free everything from the previous section before starting the next one.
del (train, list_text_data, list_int_data, indexer, candidate_links,
     compare_rl, df_comparison_results, list_features, classifier, pred, df)
#%% Entity resolution /Deduplication from two table
# https://recordlinkage.readthedocs.io/en/latest/notebooks/link_two_dataframes.html
from recordlinkage.datasets import load_febrl4

# load data and upper-case the column names of both frames
train_one, train_two = load_febrl4()
train_one.columns = map(str.upper, train_one.columns)
train_two.columns = map(str.upper, train_two.columns)

# First view — bare expressions like these only display output in a
# notebook/REPL session; as a plain script they are no-ops.
train_one.shape, train_two.shape
train_one.dtypes
train_two.dtypes
train_one.info()
train_two.info()

# Make a list of the text columns, to convert to appropriate data types
list_text_data = [
    'GIVEN_NAME', 'SURNAME', 'STREET_NUMBER', 'ADDRESS_1', 'ADDRESS_2',
    'SUBURB', 'STATE'
]
Beispiel #17
0
# Example: preparing FEBRL4 records for privacy-preserving linkage with
# clkhash / anonlink, alongside recordlinkage's plain-text comparison.
import numpy as np
import pandas as pd
import itertools

# import modules necessary for schema definition
import clkhash
from clkhash.field_formats import *
from clkhash.schema import Schema
from clkhash.comparators import NgramComparison

from anonlinkclient.utils import generate_clk_from_csv, generate_candidate_blocks_from_csv, combine_clks_blocks

import recordlinkage
from recordlinkage.datasets import load_febrl4

# Load dataset A/B and serialise A to an in-memory CSV buffer.
dfA, dfB = load_febrl4()
print(dfA.head())
print(dfA.columns)
# NOTE(review): `io` is used here but not imported in this excerpt —
# confirm `import io` appears earlier in the original file.
a_csv = io.StringIO()
dfA.to_csv(a_csv)
""" fields = [
    Ignore('rec_id'),
    StringSpec('given_name', FieldHashingProperties(comparator=NgramComparison(2), strategy=BitsPerFeatureStrategy(300))),
    StringSpec('surname', FieldHashingProperties(comparator=NgramComparison(2), strategy=BitsPerFeatureStrategy(300))),
    IntegerSpec('street_number', FieldHashingProperties(comparator=NgramComparison(1, True), strategy=BitsPerFeatureStrategy(300), missing_value=MissingValueSpec(sentinel=''))),
    StringSpec('address_1', FieldHashingProperties(comparator=NgramComparison(2), strategy=BitsPerFeatureStrategy(300))),
    StringSpec('address_2', FieldHashingProperties(comparator=NgramComparison(2), strategy=BitsPerFeatureStrategy(300))),
    StringSpec('suburb', FieldHashingProperties(comparator=NgramComparison(2), strategy=BitsPerFeatureStrategy(300))),
    IntegerSpec('postcode', FieldHashingProperties(comparator=NgramComparison(1, True), strategy=BitsPerFeatureStrategy(300))),
    StringSpec('state', FieldHashingProperties(comparator=NgramComparison(2), strategy=BitsPerFeatureStrategy(300))),
Beispiel #18
0
    def init(self):
        """Load FEBRL4 and prepare train/test/validation splits.

        Splits both source frames by the numeric part of each record id
        (<3000 train, <4500 test, rest validation), then for every split
        builds blocked candidate pairs and the subset of candidates that
        are true links.
        """
        # Read data
        dfA_complete, dfB_complete, true_links_complete = load_febrl4(return_links=True)
        logger.info("Sample record: %s", str(dfA_complete[:1]))
        logger.info("Shape of datasets: A = %s  B = %s", str(dfA_complete.shape), str(dfB_complete.shape))
        logger.info("Shape of True Links: %s", str(true_links_complete.shape))

        # Split test & train dataset (identical rule for both frames)
        self.trainDataA, self.testDataA, self.valDataA = self._split_by_record_number(dfA_complete)
        self.trainDataB, self.testDataB, self.valDataB = self._split_by_record_number(dfB_complete)

        logger.info("Train DataA shape: %s", str(self.trainDataA.shape))
        logger.info("Train DataB shape: %s", str(self.trainDataB.shape))
        logger.info("Test DataA shape: %s", str(self.testDataA.shape))
        logger.info("Test DataB shape: %s", str(self.testDataB.shape))
        logger.info("Val DataA shape: %s", str(self.valDataA.shape))
        logger.info("Val DataB shape: %s", str(self.valDataB.shape))

        # Compute candidate links for training
        self.candidate_links, self.true_links = self._block_candidates(
            self.trainDataA, self.trainDataB, true_links_complete)
        logger.info("Training Candidate Pairs: %d", len(self.candidate_links))
        logger.info("True Pairs: %d", len(self.true_links))

        # Compute candidate links for testing
        self.test_links, self.true_test_links = self._block_candidates(
            self.testDataA, self.testDataB, true_links_complete)
        logger.info("Testing Candidate Pairs: %d", len(self.test_links))
        logger.info("True Test Pairs: %d", len(self.true_test_links))

        # Compute candidate links for validation
        self.val_links, self.true_val_links = self._block_candidates(
            self.valDataA, self.valDataB, true_links_complete)
        logger.info("Validation Candidate Pairs: %d", len(self.val_links))
        logger.info("True Validation Pairs: %d", len(self.true_val_links))

    @staticmethod
    def _split_by_record_number(df):
        """Partition *df* rows by record number into (train, test, val) frames.

        Record ids look like 'rec-123-...'; the middle number decides the
        split: <3000 train, <4500 test, otherwise validation.
        """
        train, test, val = [], [], []
        for rec_id, row in df.iterrows():
            # renamed from `id`, which shadowed the builtin
            rec_num = int(rec_id.split('-')[1])
            if rec_num < 3000:
                train.append(row)
            elif rec_num < 4500:
                test.append(row)
            else:
                val.append(row)
        return (pd.DataFrame(data=train),
                pd.DataFrame(data=test),
                pd.DataFrame(data=val))

    @staticmethod
    def _block_candidates(df_a, df_b, true_links_complete):
        """Block on given_name; return (candidate pairs, true candidate pairs)."""
        indexer = recordlinkage.Index()
        indexer.block('given_name')
        candidates = indexer.index(df_a, df_b)
        true_pairs = [pair for pair in candidates if pair in true_links_complete]
        return candidates, pd.MultiIndex.from_tuples(true_pairs)
Beispiel #19
0
def main():
    """Build the full (cross-product) candidate-pair index for FEBRL4."""
    df_a, df_b = load_febrl4()
    indexer = recordlinkage.Index()
    indexer.full()
    pairs = indexer.index(df_a, df_b)
    return