Python Block.index Examples

Programming Language: Python

Namespace/Package Name: recordlinkage.index

Class/Type: Block

Method/Function: index

Examples at hotexamples.com: 10

Python Block.index - 10 examples found. These are the top-rated real-world Python examples of recordlinkage.index.Block.index extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

Block(17)

index(10)

swaplevel(1)

Frequently Used Methods

Block (17)

index (10)

swaplevel (1)

Example #1

Show file

File: test_indexing.py Project: srane163/recordlinkage

    def test_single_blocking_key(self):
        """BLOCKING: one key, however it is passed, gives the same pairs."""

        frames = (self.a, self.b)

        # situation 1: the key as a positional argument is the reference
        reference = Block('var_arange').index(frames)

        # situations 2-5: the same key spelled through keyword arguments,
        # either as a bare string or as a one-element list
        keyword_spellings = (
            {'on': 'var_arange'},
            {'left_on': 'var_arange', 'right_on': 'var_arange'},
            {'on': ['var_arange']},
            {'left_on': ['var_arange'], 'right_on': ['var_arange']},
        )
        candidates = [
            Block(**spelling).index(frames) for spelling in keyword_spellings
        ]

        # every spelling has to reproduce the reference index
        for pairs in candidates:
            ptm.assert_index_equal(reference, pairs)

Example #2

Show file

    def test_depr_on_argument(self):
        """The positional key and the ``on=`` keyword index identically."""

        positional_pairs = Block('var_arange').index(self.a)
        keyword_pairs = Block(on='var_arange').index(self.a)

        ptm.assert_index_equal(positional_pairs, keyword_pairs)

Example #3

Show file

    def test_depr_on_argument(self):
        """``Block(on=...)`` warns as deprecated but indexes the same."""

        expected = Block('var_arange').index(self.a)

        # Both building the indexer and running it stay inside the check,
        # so a deprecation warning from either step satisfies it.
        with pytest.deprecated_call():
            legacy_indexer = Block(on='var_arange')
            actual = legacy_indexer.index(self.a)

        pdt.assert_index_equal(expected, actual)

Example #4

Show file

File: test_indexing.py Project: srane163/recordlinkage

    def test_multiple_blocking_keys(self):
        """BLOCKING: several keys, positional versus left_on/right_on"""

        frames = (self.a, self.b)
        keys = ('var_arange', 'var_block10')

        # situation 1: both keys in one positional list
        positional_pairs = Block(list(keys)).index(frames)

        # situation 2: the same keys named separately for each side; every
        # argument gets its own list object, as in a hand-written call
        sided_indexer = Block(left_on=list(keys), right_on=list(keys))
        sided_pairs = sided_indexer.index(frames)

        # both spellings must give the same index
        ptm.assert_index_equal(positional_pairs, sided_pairs)

Example #5

Show file

File: test_indexing.py Project: srane163/recordlinkage

    def test_blocking_algorithm_link(self):
        """BLOCKING: pair counts when linking two frames"""

        n_a, n_b = len(self.a), len(self.b)

        # blocking key -> number of candidate pairs it has to produce
        expectations = (
            ('var_arange', n_a),        # situation 1: eye index
            ('var_block10', n_a * 10),  # situation 2: 10 blocks
            ('var_single', n_a * n_b),  # situation 3: full index
        )

        for key, n_pairs in expectations:
            pairs = Block(on=key).index((self.a, self.b))
            assert len(pairs) == n_pairs
            assert pairs.is_unique

Example #6

Show file

File: test_indexing.py Project: srane163/recordlinkage

    def test_blocking_algorithm_dedup(self):
        """BLOCKING: pair counts when deduplicating one frame"""

        n = len(self.a)

        # situation 1: eye index -- no pairs are produced
        eye_pairs = Block(on='var_arange').index(self.a)
        assert len(eye_pairs) == 0
        assert eye_pairs.is_unique

        # situation 2: 10 blocks -- (10n - n) / 2 pairs
        block_pairs = Block(on='var_block10').index(self.a)
        assert len(block_pairs) == n * 9 / 2
        assert block_pairs.is_unique

        # situation 3: full index -- (n*n - n) / 2 pairs
        full_pairs = Block(on='var_single').index(self.a)
        assert len(full_pairs) == n * (n - 1) / 2
        assert full_pairs.is_unique

Example #7

Show file

File: test_indexing.py Project: srane163/recordlinkage

    def test_add_dedup(self):
        """Indexers added as a list give the union of their separate pairs."""

        # expected: run each indexer on its own and merge the two indexes
        full_pairs = Full().index(self.a)
        block_pairs = Block(
            left_on='var_arange', right_on='var_arange').index(self.a)
        expected = full_pairs.union(block_pairs)

        # actual: hand fresh instances of both to one combined indexer
        combined = recordlinkage.Index()
        combined.add([
            Full(),
            Block(left_on='var_arange', right_on='var_arange'),
        ])
        result = combined.index(self.a)

        ptm.assert_index_equal(result, expected)

Example #8

Show file

def _keep_strong_matches(cv):
    """Return the rows of *cv* that pass the similarity thresholds.

    A row is kept when ``AddressCompare`` lies in [0.95, 1] and at least one
    of ``BusinesNameCompare`` / ``BNTSCompare`` lies in [0.95, 1].
    """
    # ``between`` is closed on both ends by default. The boolean
    # ``inclusive=True`` form is deprecated since pandas 1.3 and rejected by
    # pandas 2.x, so the argument is left out to work on every version.
    name_hit = cv.BusinesNameCompare.between(0.95, 1)
    tradestyle_hit = cv.BNTSCompare.between(0.95, 1)
    address_hit = cv.AddressCompare.between(0.95, 1)
    return cv[(name_hit | tradestyle_hit) & address_hit]


def processing(df, sourceid):
    """Block *df* on the postal code and keep the strongly matching pairs.

    Candidate pairs are built by blocking on ``PostCodeKey``, compared in
    chunks with Jaro-Winkler on the business name, tradestyle and address
    keys, and filtered with ``_keep_strong_matches``.

    Parameters
    ----------
    df : DataFrame
        Must provide the columns ``PostCodeKey``, ``BusinessNameKey``,
        ``TradestyleKey`` and ``AddressKey``.
    sourceid : int
        Only ``1`` is handled.

    Returns
    -------
    tuple or None
        ``(df, cv_full)`` where ``cv_full`` holds the comparison vectors of
        the retained pairs; ``None`` when ``sourceid`` is not ``1``.
    """
    if sourceid == 1:
        postal_indexer = Block('PostCodeKey')
        postal_pairs = postal_indexer.index(df)

        # Smallest chunk count that keeps a chunk under one million pairs;
        # capped at 100 when even that is not enough.
        intervalparts = next(
            (parts for parts in (20, 40, 60, 80, 100)
             if (len(postal_pairs) / parts) < 1000000),
            100)
# Get Interval Parts
        # ``intervals`` is defined outside this block; presumably it returns
        # the chunk boundary offsets into ``postal_pairs`` -- TODO confirm.
        inter = intervals(intervalparts, len(postal_pairs))

        comp_postal = recordlinkage.Compare(n_jobs=20)
        comp_postal.string('BusinessNameKey',
                           'BusinessNameKey',
                           method='jarowinkler',
                           label='BusinesNameCompare')
        comp_postal.string('TradestyleKey',
                           'BusinessNameKey',
                           method='jarowinkler',
                           label='BNTSCompare')
        comp_postal.string('AddressKey',
                           'AddressKey',
                           method='jarowinkler',
                           label='AddressCompare')

        # The first chunk runs from the start up to the second boundary.
        frames = [
            _keep_strong_matches(
                comp_postal.compute(postal_pairs[0:inter[1]], df))
        ]
        for i in range(1, len(inter) - 1):
            # NOTE(review): each chunk starts at ``inter[i] + 1`` while the
            # previous slice stops before ``inter[i]``, so the pair at
            # position ``inter[i]`` is never compared unless ``intervals``
            # accounts for it -- confirm against that helper.
            cv = comp_postal.compute(postal_pairs[inter[i] + 1:inter[i + 1]],
                                     df)
            frames.append(_keep_strong_matches(cv))
            del cv

        # One concat at the end instead of re-copying the growing result on
        # every iteration.
        cv_full = pd.concat(frames) if len(frames) > 1 else frames[0]

        return df, cv_full

Example #9

Show file

def test_annotation_link(tmp_path):
    """Write an annotation file for Febrl4 candidate pairs and read it back."""

    annotation_path = tmp_path / "febrl_annotation_link.json"

    # Febrl4 is loaded as two frames plus the links (not used below)
    left, right, _known_links = load_febrl4(return_links=True)

    # candidate pairs, blocked on the given name of both frames
    candidate_pairs = Block("given_name", "given_name").index(left, right)

    # only the first ten pairs go into the annotation file
    rl.write_annotation_file(
        annotation_path, candidate_pairs[0:10], left, right)

    # load what was just written
    annotation = rl.read_annotation_file(annotation_path)

    assert annotation.links is None
    assert annotation.distinct is None

Example #10

Show file

# Notebook-style walkthrough: index `df` with a Full and a Block indexer,
# then start configuring string comparisons.
# NOTE(review): `df` and `pd` are not defined in this snippet; presumably a
# pandas DataFrame and the pandas module set up in an earlier cell.

# Display the first six rows of df.
df.head(6)

import recordlinkage as rl
from recordlinkage.index import Full

# Index df against itself with the Full indexer.
full_indexer = Full()
pairs = full_indexer.index(df)

print(f"Full index: {len(df)} records, {len(pairs)} pairs")



from recordlinkage.index import Block

# Index again, this time blocking on the 'postal' column; `pairs` is
# overwritten with the result.
postal_indexer = Block('postal')
pairs = postal_indexer.index(df)

print(f"Postal index: {len(pairs)} pairs")

# Display the first ten pairs as an array of values.
pairs.to_frame()[:10].values

# Hand-built one-row frame with the columns name/addr/postal/latlng, indexed
# by the two-level key (100, 200).
pd.DataFrame([[0.5, 0.8, 0.9, 1]],
             columns=['name', 'addr', 'postal', 'latlng'],
             index=pd.MultiIndex.from_arrays([[100], [200]]))



# Register one Jaro-Winkler string comparison per column; each result is
# labelled with the column name.
comp = rl.Compare()
comp.string('name', 'name', method='jarowinkler', label='name')
comp.string('addr', 'addr', method='jarowinkler', label='addr')
comp.string('postal', 'postal', method='jarowinkler', label='postal')