def test_single_blocking_key(self):
        """BLOCKING: equivalent single-key arguments yield equal pairs."""

        frames = (self.a, self.b)

        # Every entry specifies the same single blocking key in a different
        # way; the first (positional) form serves as the reference.
        indexers = [
            Block('var_arange'),
            Block(on='var_arange'),
            Block(left_on='var_arange', right_on='var_arange'),
            Block(on=['var_arange']),
            Block(left_on=['var_arange'], right_on=['var_arange']),
        ]
        results = [indexer.index(frames) for indexer in indexers]

        reference = results[0]
        for candidate in results[1:]:
            ptm.assert_index_equal(reference, candidate)
# Example #2
# 0
    def test_depr_on_argument(self):
        """The ``on`` keyword gives the same pairs as the positional key."""

        positional_pairs = Block('var_arange').index(self.a)
        keyword_pairs = Block(on='var_arange').index(self.a)

        ptm.assert_index_equal(positional_pairs, keyword_pairs)
# Example #3
# 0
    def test_depr_on_argument(self):
        """The ``on`` keyword must warn as deprecated yet give equal pairs."""

        expected = Block('var_arange').index(self.a)

        # Construction and indexing both run inside the context, so a
        # deprecation warning from either step satisfies the check.
        with pytest.deprecated_call():
            legacy_indexer = Block(on='var_arange')
            actual = legacy_indexer.index(self.a)

        pdt.assert_index_equal(expected, actual)
    def test_multiple_blocking_keys(self):
        """BLOCKING: a key list equals the same keys as left_on/right_on."""

        keys = ('var_arange', 'var_block10')
        frames = (self.a, self.b)

        # A fresh list is handed to every argument so no list object is
        # shared between the indexers.
        positional_pairs = Block(list(keys)).index(frames)
        explicit_pairs = Block(
            left_on=list(keys), right_on=list(keys)).index(frames)

        ptm.assert_index_equal(positional_pairs, explicit_pairs)
    def test_blocking_algorithm_link(self):
        """BLOCKING: expected pair counts when linking two frames."""

        n_left, n_right = len(self.a), len(self.b)

        # (blocking key, expected number of candidate pairs)
        scenarios = [
            ('var_arange', n_left),            # eye index
            ('var_block10', n_left * 10),      # 10 blocks
            ('var_single', n_left * n_right),  # full index
        ]

        for key, expected_size in scenarios:
            pairs = Block(on=key).index((self.a, self.b))
            assert len(pairs) == expected_size
            assert pairs.is_unique
    def test_blocking_algorithm_dedup(self):
        """BLOCKING: expected pair counts when indexing a single frame."""

        n_records = len(self.a)

        # (blocking key, expected number of candidate pairs)
        scenarios = [
            # eye index
            ('var_arange', 0),
            # 10 blocks
            ('var_block10', (n_records * 10 - n_records) / 2),
            # full index
            ('var_single', (n_records * n_records - n_records) / 2),
        ]

        for key, expected_size in scenarios:
            pairs = Block(on=key).index(self.a)
            assert len(pairs) == expected_size
            assert pairs.is_unique
    def test_add_dedup(self):
        """A combined ``Index`` must equal the union of its parts' pairs."""

        # Reference result: index with each algorithm separately and unite.
        full_pairs = Full().index(self.a)
        block_pairs = Block(
            left_on='var_arange', right_on='var_arange').index(self.a)
        expected = full_pairs.union(block_pairs)

        # Same two algorithms, registered together on one Index object.
        combined = recordlinkage.Index()
        algorithms = [
            Full(),
            Block(left_on='var_arange', right_on='var_arange'),
        ]
        combined.add(algorithms)

        ptm.assert_index_equal(combined.index(self.a), expected)
# Example #8
# 0
def _keep_strong_matches(scores):
    """Keep rows whose address AND (name OR trade-style) score is in [0.95, 1].

    ``Series.between`` is inclusive on both ends by default; the boolean
    ``inclusive=True`` spelling is deliberately not passed because newer
    pandas releases no longer accept it.
    """
    name_ok = scores.BusinesNameCompare.between(0.95, 1)
    tradestyle_ok = scores.BNTSCompare.between(0.95, 1)
    address_ok = scores.AddressCompare.between(0.95, 1)
    return scores[(name_ok | tradestyle_ok) & address_ok]


def processing(df, sourceid):
    """Block ``df`` on ``PostCodeKey`` and keep the strongly matching pairs.

    Candidate pairs are compared chunk by chunk with Jaro-Winkler string
    similarity on business name, trade style vs. business name, and
    address. Only pairs passing ``_keep_strong_matches`` are retained.

    Parameters
    ----------
    df : pandas.DataFrame
        Records with the columns ``PostCodeKey``, ``BusinessNameKey``,
        ``TradestyleKey`` and ``AddressKey``.
    sourceid : int
        Only the value ``1`` is handled.

    Returns
    -------
    tuple or None
        ``(df, cv_full)`` where ``cv_full`` holds the comparison scores of
        the retained pairs, or ``None`` when ``sourceid`` is not ``1``.
    """
    if sourceid != 1:
        return None

    postal_pairs = Block('PostCodeKey').index(df)
    n_pairs = len(postal_pairs)

    # Use the smallest part count that keeps a chunk below one million
    # pairs; fall back to 100 parts when none qualifies.
    intervalparts = next(
        (parts for parts in (20, 40, 60, 80, 100)
         if n_pairs / parts < 1000000),
        100)
    inter = intervals(intervalparts, n_pairs)

    comp_postal = recordlinkage.Compare(n_jobs=20)
    comp_postal.string('BusinessNameKey',
                       'BusinessNameKey',
                       method='jarowinkler',
                       label='BusinesNameCompare')
    comp_postal.string('TradestyleKey',
                       'BusinessNameKey',
                       method='jarowinkler',
                       label='BNTSCompare')
    comp_postal.string('AddressKey',
                       'AddressKey',
                       method='jarowinkler',
                       label='AddressCompare')

    # Compare and filter one chunk at a time so that only the (much
    # smaller) filtered frames are kept around.
    frames = [
        _keep_strong_matches(
            comp_postal.compute(postal_pairs[0:inter[1]], df))
    ]
    for i in range(1, len(inter) - 1):
        # NOTE(review): the first slice ends (exclusively) at inter[1] and
        # this one starts at inter[i] + 1, so the pair at position inter[i]
        # looks like it is never compared - confirm against `intervals`.
        chunk = postal_pairs[inter[i] + 1:inter[i + 1]]
        frames.append(_keep_strong_matches(comp_postal.compute(chunk, df)))

    # A single concat avoids re-copying the accumulated result per chunk.
    cv_full = pd.concat(frames)

    return df, cv_full
# Example #9
# 0
def test_annotation_link(tmp_path):
    """Write and re-read an annotation file for a two-frame Febrl4 task."""

    # Febrl4 is loaded as two frames; the known links are not used here.
    df_a, df_b, _ = load_febrl4(return_links=True)

    # Candidate pairs come from blocking on the given name in both frames.
    candidate_pairs = Block("given_name", "given_name").index(df_a, df_b)

    # Write only the first ten pairs, then read the file straight back.
    annotation_path = tmp_path / "febrl_annotation_link.json"
    rl.write_annotation_file(
        annotation_path, candidate_pairs[0:10], df_a, df_b)
    annotation = rl.read_annotation_file(annotation_path)

    # The file is read back without any labelling step in between, so
    # both result sets are expected to be unset.
    assert annotation.links is None
    assert annotation.distinct is None
# Example #10
# 0
# Notebook-style walkthrough: build candidate record pairs for `df` and set
# up string comparisons. `df` and `pd` are expected to be defined earlier.

# Bare expression: its value is only displayed in an interactive session.
df.head(6)

import recordlinkage as rl
from recordlinkage.index import Full

# Index `df` against itself with the Full indexer
# (presumably every possible record pair - confirm in the library docs).
full_indexer = Full()
pairs = full_indexer.index(df)

print(f"Full index: {len(df)} records, {len(pairs)} pairs")



from recordlinkage.index import Block

# Index again, this time blocking on the 'postal' column; `pairs` is
# rebound to the new result.
postal_indexer = Block('postal')
pairs = postal_indexer.index(df)

print(f"Postal index: {len(pairs)} pairs")

# Bare expression: the first ten pairs as a plain array (display only).
pairs.to_frame()[:10].values

# One-row frame with four score columns, indexed by the pair (100, 200).
# The result is not stored; it looks like an illustration of the shape of
# a comparison result - TODO confirm.
pd.DataFrame([[0.5, 0.8, 0.9, 1]],
             columns=['name', 'addr', 'postal', 'latlng'],
             index=pd.MultiIndex.from_arrays([[100], [200]]))



# Register Jaro-Winkler string comparisons for three columns; each result
# column is labelled with the name of the column it compares.
comp = rl.Compare()
comp.string('name', 'name', method='jarowinkler', label='name')
comp.string('addr', 'addr', method='jarowinkler', label='addr')
comp.string('postal', 'postal', method='jarowinkler', label='postal')