    def test_add_dedup(self):

        indexer1 = Full()
        indexer2 = Block(left_on='var_arange', right_on='var_arange')
        expected = indexer1.index(self.a).union(indexer2.index(self.a))

        indexer = recordlinkage.Index()
        indexer.add(
            [Full(),
             Block(left_on='var_arange', right_on='var_arange')])

        result = indexer.index(self.a)

        ptm.assert_index_equal(result, expected)
    def _link_index(self, df_a, df_b):
        indexer = recordlinkage.Index()

        for blocking_keys in self.block_on:
            indexer.add(Block(blocking_keys))

        return indexer.index(df_a, df_b)
Example #3
    def _dedup_index(self, df_a):

        indexer = rl.Index()

        for blocking_keys in self.block_on:
            indexer.add(Block(blocking_keys))

        return indexer.index(df_a)
Example #4
def processing(df, sourceid):
    if sourceid == 1:
        postal_indexer = Block('PostCodeKey')
        postal_pairs = postal_indexer.index(df)
        for i in [20, 40, 60, 80, 100]:
            if (len(postal_pairs) / i) < 1000000:
                intervalparts = i
                break
            else:
                intervalparts = 100
        # Get interval parts
        inter = intervals(intervalparts, len(postal_pairs))

        comp_postal = recordlinkage.Compare(n_jobs=20)
        comp_postal.string('BusinessNameKey',
                           'BusinessNameKey',
                           method='jarowinkler',
                           label='BusinesNameCompare')
        comp_postal.string('TradestyleKey',
                           'BusinessNameKey',
                           method='jarowinkler',
                           label='BNTSCompare')
        comp_postal.string('AddressKey',
                           'AddressKey',
                           method='jarowinkler',
                           label='AddressCompare')

        cv_full = comp_postal.compute(postal_pairs[0:inter[1]], df)
        cv_full = cv_full[
            ((cv_full.BusinesNameCompare.between(0.95, 1, inclusive=True))
             | (cv_full.BNTSCompare.between(0.95, 1, inclusive=True)))
            & (cv_full.AddressCompare.between(0.95, 1, inclusive=True))]
        for i in range(1, len(inter) - 1):
            cv = comp_postal.compute(postal_pairs[inter[i] + 1:inter[i + 1]],
                                     df)
            cv = cv[((cv.BusinesNameCompare.between(0.95, 1, inclusive=True))
                     | (cv.BNTSCompare.between(0.95, 1, inclusive=True)))
                    & (cv.AddressCompare.between(0.95, 1, inclusive=True))]
            frames = [cv_full, cv]
            cv_full = pd.concat(frames)
            del cv

#        print(df.columns)
#        print(cv_full.columns)
        return df, cv_full
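# The intervals() helper used above is defined elsewhere in the original
# project. A minimal sketch of what it might look like (an assumption, not
# the original implementation): split the total number of candidate pairs
# into roughly equal slice boundaries so the comparison step can run in
# chunks.
def intervals(parts, total):
    step = total // parts
    bounds = [i * step for i in range(parts)]
    bounds.append(total)
    return bounds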
Example #5
    def test_dedup_single_blocking_key_vs_block(self):
        indexers = [
            NeighbourhoodBlock('var_block10', max_nulls=1),
            NeighbourhoodBlock(
                left_on='var_block10', right_on='var_block10', max_nulls=1),
            Block('var_block10'),
        ]
        self.assert_index_comparisons(eq, indexers, self.a)
        self.assert_index_comparisons(gt, indexers[-2:], self.incomplete_a)
def get_test_algorithms():
    """Return list of algorithms"""
    return [
        Full(),
        Block(on='var_arange'),
        SortedNeighbourhood(on='var_arange'),
        Random(10, random_state=100, replace=True),
        Random(10, random_state=100, replace=False)
    ]
    def test_dedup_multiple_blocking_keys_vs_Block(self):
        indexers = [
            NeighbourhoodBlock(['var_single', 'var_block10'], max_nulls=1),
            NeighbourhoodBlock(left_on=['var_single', 'var_block10'],
                               right_on=['var_single', 'var_block10'],
                               max_nulls=1),
            Block(['var_single', 'var_block10']),
        ]
        self.assert_index_comparisons(eq, indexers, self.a)
        self.assert_index_comparisons(gt, indexers[-2:], self.incomplete_a)
Example #8
def test_annotation_link(tmp_path):

    path = tmp_path / "febrl_annotation_link.json"

    # get febrl4 file
    df_a, df_b, matches = load_febrl4(return_links=True)

    # get record pairs
    indexer = Block("given_name", "given_name")
    pairs = indexer.index(df_a, df_b)

    # create annotation file
    # write an annotation file for the Febrl4 dataset.
    rl.write_annotation_file(path, pairs[0:10], df_a, df_b)

    # read the result
    result = rl.read_annotation_file(path)

    assert result.links is None
    assert result.distinct is None
    def test_link_single_blocking_key_vs_Block(self):
        indexers = [
            NeighbourhoodBlock('var_arange', max_nulls=1),
            NeighbourhoodBlock(left_on='var_arange',
                               right_on='var_arange',
                               max_nulls=1),
            Block('var_arange'),
        ]
        self.assert_index_comparisons(eq, indexers, self.a, self.b)
        self.assert_index_comparisons(gt, indexers[-2:], self.incomplete_a,
                                      self.incomplete_b)
Example #10
def ProcessData(patientDataList, fetchedHospitalData):
    # Read from the directory
    filelist = pd.read_csv(
        '/home/bizzzzzzzzzzzzu/Music/MedicalPortal/MedicPortal DataProcessing/FetchedData/'
        + fetchedHospitalData)

    # Indexation step
    indexer = p.Index()
    indexer.add(Block(left_on='fatherName', right_on='fatherName'))
    candidate_links = indexer.index(patientDataList, filelist)

    # print((candidate_links))

    # Comparison step
    compare_cl = p.Compare()

    # compare_cl.exact('_id','_id',label='_id')
    compare_cl.exact('name', 'name', label='name')
    compare_cl.exact('fatherName', 'fatherName', label='fatherName')
    compare_cl.exact('grandFatherName',
                     'grandFatherName',
                     label='grandFatherName')
    compare_cl.exact('gender', 'gender', label='gender')
    compare_cl.exact('dateOfBirth', 'dateOfBirth', label='dateOfBirth')
    compare_cl.exact('dayOfBirth', 'dayOfBirth', label='dayOfBirth')
    compare_cl.exact('monthOfBirth', 'monthOfBirth', label='monthOfBirth')
    compare_cl.exact('yearOfBirth', 'yearOfBirth', label='yearOfBirth')
    compare_cl.exact('age', 'age', label='age')
    # compare_cl.exact('address','address',label='address')
    # compare_cl.exact('phoneNumber','phoneNumber',label='phoneNumber')

    features = compare_cl.compute(candidate_links, patientDataList, filelist)

    if features.empty:
        return None
    else:

        # Classification step
        '''
            Use the Logistic Regression classifier, a supervised record
            linkage approach trained on previously labelled (golden) pairs.
        '''

        # classifier = p.LogisticRegressionClassifier(coefficients=coefficients, intercept=intercept)
        classifier = p.LogisticRegressionClassifier()
        # golden_pairs and golden_matches_index are assumed to be prepared
        # elsewhere in the original project (labelled comparison vectors and
        # a MultiIndex of confirmed matches).
        classifier.fit(golden_pairs, golden_matches_index)

        links = classifier.predict(features)

        return links
Example #11
    def test_depr_on_argument(self):

        index_cl_new = Block('var_arange')
        pairs_new = index_cl_new.index(self.a)

        index_cl_old = Block(on='var_arange')
        pairs_old = index_cl_old.index(self.a)

        ptm.assert_index_equal(pairs_new, pairs_old)
Example #12
    def test_blocking_algorithm_link(self):
        """BLOCKING: test blocking algorithm for linking"""

        # situation 1: eye index
        index_cl1 = Block(on='var_arange')
        pairs1 = index_cl1.index((self.a, self.b))
        assert len(pairs1) == len(self.a)
        assert pairs1.is_unique

        # situation 2: 10 blocks
        index_cl2 = Block(on='var_block10')
        pairs2 = index_cl2.index((self.a, self.b))
        assert len(pairs2) == len(self.a) * 10
        assert pairs2.is_unique

        # situation 3: full index
        index_cl3 = Block(on='var_single')
        pairs3 = index_cl3.index((self.a, self.b))
        assert len(pairs3) == len(self.a) * len(self.b)
        assert pairs3.is_unique
Example #13
    def block(self, *args, **kwargs):
        """Add a block index.

        Shortcut of :class:`recordlinkage.index.Block`::

            from recordlinkage.index import Block

            indexer = recordlinkage.Index()
            indexer.add(Block())

        """
        indexer = Block(*args, **kwargs)
        self.add(indexer)

        return self
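    # Because block() returns self, the shortcut calls can be chained. A
    # small illustrative usage sketch (the column and DataFrame names are
    # assumptions, not part of the original snippet):
    #
    #     indexer = recordlinkage.Index()
    #     indexer.block('given_name').block('surname')
    #     pairs = indexer.index(df_a, df_b)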
Example #14
    def test_depr_on_argument(self):

        index_cl_new = Block('var_arange')
        pairs_new = index_cl_new.index(self.a)

        with pytest.deprecated_call():
            index_cl_old = Block(on='var_arange')
            pairs_old = index_cl_old.index(self.a)

        pdt.assert_index_equal(pairs_new, pairs_old)
Example #15
    def test_blocking_algorithm_dedup(self):
        """BLOCKING: test blocking algorithm for deduplication"""

        len_a = len(self.a)

        # situation 1: eye index
        index_cl1 = Block(on='var_arange')
        pairs1 = index_cl1.index(self.a)
        assert len(pairs1) == 0
        assert pairs1.is_unique

        # situation 2: 10 blocks
        index_cl2 = Block(on='var_block10')
        pairs2 = index_cl2.index(self.a)
        assert len(pairs2) == (len_a * 10 - len_a) / 2
        assert pairs2.is_unique

        # situation 3: full index
        index_cl3 = Block(on='var_single')
        pairs3 = index_cl3.index(self.a)
        assert len(pairs3) == (len_a * len_a - len_a) / 2
        assert pairs3.is_unique
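        # Worked check for the counts above, assuming the 100 records of
        # self.a fall into 10 equal blocks of 10 on var_block10:
        # situation 2: 10 * 9 / 2 == 45 pairs per block, 45 * 10 == 450 in
        # total, which matches (len_a * 10 - len_a) / 2 == 450.
        # situation 3: a single block holding all 100 records gives
        # (100 * 100 - 100) / 2 == 4950 pairs.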
Example #16
    def test_multiple_blocking_keys(self):
        """BLOCKING: test multiple blocking keys"""

        # All the following cases result in the same index.

        # situation 1
        index_cl1 = Block(['var_arange', 'var_block10'])
        pairs1 = index_cl1.index((self.a, self.b))

        # situation 2
        index_cl2 = Block(left_on=['var_arange', 'var_block10'],
                          right_on=['var_arange', 'var_block10'])
        pairs2 = index_cl2.index((self.a, self.b))

        # test
        ptm.assert_index_equal(pairs1, pairs2)
Example #17
def build_indexer(dview: pd.DataFrame, exclude=['gender_pool']):
    # Identify which columns to index and how to do so
    blocking_columns = [
        col for col in dview.columns if compare_in(col, blocked_identifiers)
    ]
    sngb_columns = [
        col for col in dview.columns
        if compare_in(col, sneighbourhood_identifiers)
    ]

    # Build the indexer
    indexer = Index()

    # Add sorted neighbour conditions
    for col in sngb_columns:
        if not compare_in(col, exclude):
            indexer.add(SortedNeighbourhood(col))
    # Add blocking conditions
    for col in blocking_columns:
        indexer.add(Block(col, col))
    indexlogger.info(f'Constructed indexer: \n{indexer.algorithms}')
    return indexer
class TestIndexAlgorithmApi(TestData):
    """General unittest for the indexing API."""
    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_repr(self, index_class):

        index_str = str(index_class)
        index_repr = repr(index_class)
        self.assertEqual(index_str, index_repr)

        start_str = '<{}'.format(index_class.__class__.__name__)
        self.assertTrue(index_str.startswith(start_str))

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_arguments(self, index_class):
        """Test the index method arguments"""

        # The following should work
        index_class.index(self.a)
        index_class.index(self.a, self.b)
        index_class.index((self.a))
        index_class.index([self.a])
        index_class.index((self.a, self.b))
        index_class.index([self.a, self.b])
        index_class.index(x=(self.a, self.b))

    def test_iterative(self):
        """Test the iterative behaviour."""

        # SINGLE STEP
        index_class = Full()
        pairs = index_class.index((self.a, self.b))
        pairs = pd.DataFrame(index=pairs).sort_index()

        # MULTI STEP
        index_class = Full()

        pairs1 = index_class.index((self.a[0:50], self.b))
        pairs2 = index_class.index((self.a[50:100], self.b))

        pairs_split = pairs1.append(pairs2)
        pairs_split = pd.DataFrame(index=pairs_split).sort_index()

        ptm.assert_frame_equal(pairs, pairs_split)
        # note: not possible to sort a MultiIndex, so we made a frame out of it.

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_empty_input_dataframes(self, index_class):
        """Empty DataFrames"""

        # make an empty dataframe with the columns of self.a and self.b
        df_a = pd.DataFrame(columns=self.a.columns.tolist())
        df_b = pd.DataFrame(columns=self.b.columns.tolist())

        from recordlinkage.index import Random

        if not isinstance(index_class, Random):
            # make an index
            pairs = index_class.index((df_a, df_b))

            # check if the MultiIndex has length 0
            self.assertIsInstance(pairs, pd.MultiIndex)
            self.assertEqual(len(pairs), 0)
        else:
            with self.assertRaises(ValueError):
                index_class.index((df_a, df_b))

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_error_handling(self, index_class):
        """Test error handling on non-unique index."""

        # make a non_unique index
        df_a = self.a.rename(index={self.a.index[1]: self.a.index[0]},
                             inplace=False)

        with self.assertRaises(ValueError):
            index_class.index(df_a)

    @parameterized.expand([
        param(Full()),
        param(Block(on='var_arange')),
        param(SortedNeighbourhood(on='var_arange')),
        param(Random(10, random_state=100, replace=True)),
        param(Random(10, random_state=100, replace=False))
    ])
    def test_index_names_dedup(self, index_class):

        index_names = ['dedup', None, 'index', int(1)]
        expected = [
            ['dedup_1', 'dedup_2'],
            [None, None],
            ['index_1', 'index_2'],
            ['1_1', '1_2'],
        ]

        for i, name in enumerate(index_names):

            index_A = pd.Index(self.a.index).rename(name)
            df_A = pd.DataFrame(self.a, index=index_A)

            pairs = index_class.index((df_A))

            self.assertEqual(pairs.names, expected[i])
            self.assertEqual(df_A.index.name, name)

    @parameterized.expand([
        param(Full()),
        param(Block(on='var_arange')),
        param(SortedNeighbourhood(on='var_arange')),
        param(Random(10, random_state=100, replace=True)),
        param(Random(10, random_state=100, replace=False))
    ])
    def test_duplicated_index_names_dedup(self, index_class):

        # make an index for each dataframe with a new index name
        index_a = pd.Index(self.a.index, name='index')
        df_a = pd.DataFrame(self.a, index=index_a)

        # make the index
        pairs = index_class.index(df_a)
        self.assertEqual(pairs.names, ['index_1', 'index_2'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')

        # make the index
        index_class.suffixes = ['_a', '_b']
        pairs = index_class.index(df_a)
        self.assertEqual(pairs.names, ['index_a', 'index_b'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')

    @parameterized.expand([
        param(Full()),
        param(Block(on='var_arange')),
        param(SortedNeighbourhood(on='var_arange')),
        param(Random(10, random_state=100, replace=True)),
        param(Random(10, random_state=100, replace=False))
    ])
    def test_index_names_link(self, index_class):

        # tuples with the name of the first and second index
        index_names = [('index1', 'index2'),
                       ('index1', None), (None, 'index2'), (None, None),
                       (10, 'index2'), (10, 11)]

        for name_a, name_b in index_names:

            # make an index for each dataframe with a new index name
            index_a = pd.Index(self.a.index, name=name_a)
            df_a = pd.DataFrame(self.a, index=index_a)

            index_b = pd.Index(self.b.index, name=name_b)
            df_b = pd.DataFrame(self.b, index=index_b)

            pairs = index_class.index((df_a, df_b))
            self.assertEqual(pairs.names, [name_a, name_b])

            # check for inplace editing (not the intention)
            self.assertEqual(df_a.index.name, name_a)
            self.assertEqual(df_b.index.name, name_b)

    @parameterized.expand([
        param(Full()),
        param(Block(on='var_arange')),
        param(SortedNeighbourhood(on='var_arange')),
        param(Random(10, random_state=100, replace=True)),
        param(Random(10, random_state=100, replace=False))
    ])
    def test_duplicated_index_names_link(self, index_class):

        # make an index for each dataframe with a new index name
        index_a = pd.Index(self.a.index, name='index')
        df_a = pd.DataFrame(self.a, index=index_a)

        index_b = pd.Index(self.b.index, name='index')
        df_b = pd.DataFrame(self.b, index=index_b)

        # make the index
        pairs = index_class.index((df_a, df_b))
        self.assertEqual(pairs.names, ['index_1', 'index_2'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')
        self.assertEqual(df_b.index.name, 'index')

        # make the index
        index_class.suffixes = ['_a', '_b']
        pairs = index_class.index((df_a, df_b))
        self.assertEqual(pairs.names, ['index_a', 'index_b'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')
        self.assertEqual(df_b.index.name, 'index')

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_pickle(self, index_class):
        """Test if it is possible to pickle the class."""

        pickle_path = os.path.join(self.test_dir, 'pickle_compare_obj.pickle')

        # pickle before indexing
        pickle.dump(index_class, open(pickle_path, 'wb'))

        # compute the record pairs
        index_class.index(self.a, self.b)

        # pickle after indexing
        pickle.dump(index_class, open(pickle_path, 'wb'))
Example #19
df = assign_postal_lat_lng(df)
df.head(6)

import recordlinkage as rl
from recordlinkage.index import Full

full_indexer = Full()
pairs = full_indexer.index(df)

print(f"Full index: {len(df)} records, {len(pairs)} pairs")



from recordlinkage.index import Block

postal_indexer = Block('postal')
pairs = postal_indexer.index(df)

print(f"Postal index: {len(pairs)} pairs")

pairs.to_frame()[:10].values

pd.DataFrame([[0.5, 0.8, 0.9, 1]],
             columns=['name', 'addr', 'postal', 'latlng'],
             index=pd.MultiIndex.from_arrays([[100], [200]]))



comp = rl.Compare()
comp.string('name', 'name', method='jarowinkler', label='name')
comp.string('addr', 'addr', method='jarowinkler', label='addr')
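# A hedged sketch of the step that would typically follow in this notebook:
# compute the comparison vectors for the blocked candidate pairs (the column
# names above are assumed to exist in df).
features = comp.compute(pairs, df)
features.head(10)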
Example #20
from recordlinkage.compare import Exact, String
from recordlinkage.datasets import load_febrl3

# set logging
rl.logging.set_verbosity(rl.logging.INFO)

# load dataset
print('Loading data...')
dfA, true_links = load_febrl3(return_links=True)
print(len(dfA), 'records in dataset A')
print(len(true_links), 'links in dataset A')

# start indexing
print('Build index...')
indexer = rl.Index()
indexer.add(Block('given_name'))
indexer.add(Block('surname'))
indexer.add(Block('soc_sec_id'))
candidate_links = indexer.index(dfA)

# start comparing
print('Start comparing...')
comparer = rl.Compare()
comparer.add(Exact('given_name', 'given_name', label='given_name'))
comparer.add(String('surname', 'surname', method='jarowinkler',
                    threshold=0.85, label='surname'))
comparer.add(Exact('date_of_birth', 'date_of_birth', label='date_of_birth'))
comparer.add(Exact('suburb', 'suburb', label='suburb'))
comparer.add(Exact('state', 'state', label='state'))
comparer.add(String('address_1', 'address_1', threshold=0.85,
                    label='address_1'))
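# A hedged sketch of the remaining steps of this script (not part of the
# original snippet): compute the comparison vectors and keep candidate pairs
# that agree on at least four of the six features.
print('Start computing...')
features = comparer.compute(candidate_links, dfA)
matches = features[features.sum(axis=1) >= 4]
print(len(matches), 'pairs classified as matches')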
import unittest
import tempfile
import shutil
import pickle

import numpy as np
import pandas as pd
import pandas.util.testing as ptm
from parameterized import parameterized, param

import recordlinkage
from recordlinkage.index import Full, Block, SortedNeighbourhood, Random

TEST_INDEXATION_OBJECTS = [
    param(Full()),
    param(Block(on='var_arange')),
    param(SortedNeighbourhood(on='var_arange')),
    param(Random(10, random_state=100, replace=True)),
    param(Random(10, random_state=100, replace=False))
]


class TestData(unittest.TestCase):
    """Unittest object to setup test data."""
    @classmethod
    def setUpClass(cls):

        n_a = 100
        n_b = 150

        cls.index_a = ['rec_a_%s' % i for i in range(0, n_a)]
Example #22
    def test_single_blocking_key(self):
        """BLOCKING: Test class arguments."""

        # All the following cases result in the same index.

        # situation 1
        index_cl1 = Block('var_arange')
        pairs1 = index_cl1.index((self.a, self.b))

        # situation 2
        index_cl2 = Block(on='var_arange')
        pairs2 = index_cl2.index((self.a, self.b))

        # situation 3
        index_cl3 = Block(left_on='var_arange', right_on='var_arange')
        pairs3 = index_cl3.index((self.a, self.b))

        # situation 4
        index_cl4 = Block(on=['var_arange'])
        pairs4 = index_cl4.index((self.a, self.b))

        # situation 5
        index_cl5 = Block(left_on=['var_arange'], right_on=['var_arange'])
        pairs5 = index_cl5.index((self.a, self.b))

        # test
        ptm.assert_index_equal(pairs1, pairs2)
        ptm.assert_index_equal(pairs1, pairs3)
        ptm.assert_index_equal(pairs1, pairs4)
        ptm.assert_index_equal(pairs1, pairs5)