Example #1
0
    def test_init(self):
        """Index constructed with a single algorithm mirrors that algorithm.

        The pairs returned by ``recordlinkage.Index(Full())`` must equal the
        pairs returned by calling ``Full().index`` directly on the same data.
        """
        wrapped = recordlinkage.Index(Full())
        actual_pairs = wrapped.index(self.a, self.b)

        reference_pairs = Full().index(self.a, self.b)

        ptm.assert_index_equal(actual_pairs, reference_pairs)
Example #2
0
    def test_basic_link(self):
        """FULL: Test basic characteristics of full indexing (link)."""

        from recordlinkage.index import Full

        # link the records of self.a with the records of self.b
        linker = Full()
        candidate_pairs = linker.index((self.a, self.b))

        # the result is a MultiIndex ...
        assert isinstance(candidate_pairs, pd.MultiIndex)

        # ... with one entry per (a, b) combination ...
        expected_size = len(self.a) * len(self.b)
        assert len(candidate_pairs) == expected_size

        # ... and no repeated pair
        assert candidate_pairs.is_unique
    def test_basic_dedup(self):
        """FULL: Test basic characteristics of full indexing (dedup).

        Indexing the single frame ``self.a`` of ``n`` records is expected to
        give a unique ``pd.MultiIndex`` with ``n * (n - 1) / 2`` entries.
        """

        from recordlinkage.index import Full

        # finding duplicates
        index_cl = Full()
        pairs = index_cl.index(self.a)

        # n * (n - 1) is always even, so floor division is exact and keeps
        # the expected count an int (true division would make it a float).
        n_records = len(self.a)
        expected_n_pairs = n_records * (n_records - 1) // 2

        self.assertIsInstance(pairs, pd.MultiIndex)
        self.assertEqual(len(pairs), expected_n_pairs)
        self.assertTrue(pairs.is_unique)
Example #4
0
    def test_add_dedup(self):
        """Index.add with a list of algorithms gives the union of their pairs.

        Dedup variant: only ``self.a`` is indexed.
        """
        # reference: run each algorithm on its own and take the union
        full_pairs = Full().index(self.a)
        block_pairs = Block(
            left_on='var_arange', right_on='var_arange').index(self.a)
        expected = full_pairs.union(block_pairs)

        # same two algorithms, registered together on one Index
        combined = recordlinkage.Index()
        algorithms = [
            Full(),
            Block(left_on='var_arange', right_on='var_arange'),
        ]
        combined.add(algorithms)

        ptm.assert_index_equal(combined.index(self.a), expected)
def match_affil(affiliation: str, k: int = 3):
    """
    Match an affiliation string to the GRID dataset.

    The affiliation is parsed with ``parse_affil`` and compared against every
    row of the module-level ``grid_df``. Three features are computed (exact
    match on ``institution``, Jaro-Winkler similarity on ``location`` and on
    ``country``) and combined into a ``score`` with weights 0.6 / 0.2 / 0.2.

    Parameters
    ----------
    affiliation : str
        Raw affiliation text to look up.
    k : int, default 3
        Number of best-scoring candidates to keep.

    Returns
    -------
    list of dict
        The ``k`` best-scoring candidates joined with their ``grid_df`` rows,
        as produced by ``DataFrame.to_dict(orient="records")``.
    """
    parsed_affil = parse_affil(affiliation)
    # Must be ``pd.DataFrame``: ``pd.Dataframe`` does not exist and raised
    # AttributeError on every call.
    df = pd.DataFrame([parsed_affil])

    # Full index: the single parsed row is paired with every GRID row.
    indexer = recordlinkage.Index()
    indexer.add(Full())
    candidate_links = indexer.index(df, grid_df)

    # recordlinkage comparer
    compare = recordlinkage.Compare()
    compare.exact("institution", "institution")
    compare.string("location", "location", method="jarowinkler")
    compare.string("country", "country", method="jarowinkler")

    features_df = compare.compute(candidate_links, df, grid_df)
    # Weighted row-wise mean over the three feature columns.
    features_df["score"] = np.average(features_df,
                                      axis=1,
                                      weights=[0.6, 0.2, 0.2])

    # reset_index exposes the pair index as the columns level_0 / level_1;
    # level_1 is the grid_df side and is used as the join key below.
    topk_df = (
        features_df[["score"]]
        .reset_index()
        .sort_values("score", ascending=False)
        .head(k)
    )
    topk_df = topk_df.merge(
        grid_df.reset_index(), left_on="level_1", right_on="index"
    ).drop(labels=["level_0", "level_1", "location"], axis=1)

    return topk_df.to_dict(orient="records")
Example #6
0
def get_test_algorithms():
    """Build a new list holding one instance of each indexing algorithm.

    Every call creates fresh instances: Full, Block and SortedNeighbourhood
    on ``var_arange``, and Random (10, seed 100) with and without replacement.
    """
    full = Full()
    block = Block(on='var_arange')
    sorted_neighbourhood = SortedNeighbourhood(on='var_arange')
    random_with_replacement = Random(10, random_state=100, replace=True)
    random_without_replacement = Random(10, random_state=100, replace=False)

    return [
        full,
        block,
        sorted_neighbourhood,
        random_with_replacement,
        random_without_replacement,
    ]
    def submit(self):
        """Match titles between the two selected sheets and report the result.

        Reads the two worksheets chosen in the comboboxes from the selected
        Excel file, compares every ``Title`` of the first sheet with every
        ``Long Title`` of the second sheet, and keeps the pairs flagged by the
        string comparison at the chosen threshold (entered as a percentage).
        The matches are written to ``<basename>-<threshold>.xlsx`` (basename
        of ``self.file_io.name``) and shown in the text output widget.
        """
        selected_threshold = self.threshold.get()
        input_file = self.file_path.get()

        # The first sheet is read with header=2, i.e. its column names are
        # taken from the third row.
        df1 = pd.read_excel(input_file,
                            header=2,
                            sheet_name=self.df1_combobox.get())
        df2 = pd.read_excel(input_file, sheet_name=self.df2_combobox.get())

        # Full index: every row of df1 is paired with every row of df2.
        indexer = rl.Index()
        indexer.add(Full())
        pairs = indexer.index(df1, df2)
        print(len(pairs))

        # The threshold is entered as a percentage; Compare expects 0..1.
        comparer = rl.Compare()
        comparer.string('Title',
                        'Long Title',
                        threshold=float(selected_threshold) / 100,
                        label='Title')

        potential_matches = comparer.compute(pairs, df1, df2)
        # Keep only the pairs with a non-zero comparison result.
        is_match = potential_matches.sum(axis=1) > 0
        matches = potential_matches[is_match].reset_index()

        # level_0 / level_1 are the row labels of df1 / df2 for each pair;
        # use them to pull the descriptive columns back in.
        accumulated = matches.loc[:, ['level_0', 'level_1']].merge(
            df1.loc[:, ['Title', 'ISBN']], left_on='level_0', right_index=True)
        accumulated = accumulated.merge(
            df2.loc[:, ['Long Title', 'Internal ID']],
            left_on='level_1',
            right_index=True)

        # Same columns, in the same order, for both the file and the widget.
        output_columns = ['Internal ID', 'Long Title', 'Title']

        output_name = '{}-{}.xlsx'.format(
            path.basename(self.file_io.name), selected_threshold)
        accumulated.to_excel(output_name, index=False, columns=output_columns)

        # Replace whatever the text widget showed before with the new result.
        self.text_output.delete(1.0, 'end')
        self.text_output.insert(
            END,
            accumulated.to_string(index=False, columns=output_columns))
Example #8
0
    def full(self):
        """Add a 'full' index.

        Shortcut of :class:`recordlinkage.index.Full`::

            from recordlinkage.index import Full

            indexer = recordlinkage.Index()
            indexer.add(Full())

        Returns ``self``.
        """
        self.add(Full())
        return self
Example #9
0
    def test_random_desc(self):
        """RandomDiscrete yields a non-null column containing only 0 and 1."""
        left = pd.DataFrame({'v': list("abcde")})
        right = pd.DataFrame({'v': list("abcde")})

        candidate_pairs = Full().index(left, right)

        comparer = recordlinkage.Compare()
        comparer.exact("v", "v")
        comparer.add(RandomDiscrete(label='random'))
        vectors = comparer.compute(candidate_pairs, left, right)

        assert isinstance(vectors, pd.DataFrame)

        # every value is present and is either 0 or 1
        assert vectors['random'].notnull().all()
        assert vectors['random'].isin([0, 1]).all()
Example #10
0
    def test_random_cont(self):
        """RandomContinuous yields a non-null column with values in [0, 1]."""
        left = pd.DataFrame({'v': list("abcde")})
        right = pd.DataFrame({'v': list("abcde")})

        candidate_pairs = Full().index(left, right)

        comparer = recordlinkage.Compare()
        comparer.exact("v", "v")
        comparer.add(RandomContinuous(label='random'))
        vectors = comparer.compute(candidate_pairs, left, right)

        assert isinstance(vectors, pd.DataFrame)

        # every value is present and lies between 0.0 and 1.0 inclusive
        assert vectors['random'].notnull().all()
        assert vectors['random'].min() >= 0.0
        assert vectors['random'].max() <= 1.0
Example #11
0
    def test_iterative(self):
        """Test the iterative behaviour."""

        # one pass: index all of self.a against self.b
        whole_pairs = Full().index((self.a, self.b))
        all_at_once = pd.DataFrame(index=whole_pairs).sort_index()

        # two passes with one indexer: rows 0-49, then rows 50-99 of self.a
        chunked_indexer = Full()
        first_chunk = chunked_indexer.index((self.a[0:50], self.b))
        second_chunk = chunked_indexer.index((self.a[50:100], self.b))

        stitched = first_chunk.append(second_chunk)
        in_chunks = pd.DataFrame(index=stitched).sort_index()

        ptm.assert_frame_equal(all_at_once, in_chunks)
Example #12
0
 def test_link_vs_full(self):
     """Compare NeighbourhoodBlock against Full with ``eq`` (link).

     The NeighbourhoodBlock is allowed as many non-matches as ``self.a``
     has columns.
     """
     n_columns = len(self.a.columns)
     neighbourhood = NeighbourhoodBlock(max_non_matches=n_columns)
     full_index = Full()

     self.assert_index_comparisons(
         eq, [neighbourhood, full_index], self.a, self.b)
# Restrict new_child to the columns used below, in this order.
new_child = new_child[['id',
                       'publish_date',
                       'title',
                       'content',
                       'related_parents',
                       'title_child_no_stop',
                       'content_child_no_stop',
                       'child_numbers',
                       'cbs_link']]

#---------------------------------------#
# Feature creation and model prediction #
#---------------------------------------#
# Indexation step
# A Full index pairs every row of parents with every row of new_child.
indexer = recordlinkage.Index()
indexer.add(Full())
candidate_links = indexer.index(parents, new_child)

# Comparison step - creation of all possible matches
# Compares parents['link'] with new_child['cbs_link'] using the Jaro-Winkler
# method and a 0.93 threshold; the result column is 'feature_link_score'.
compare_cl = recordlinkage.Compare()
compare_cl.string('link', 'cbs_link', method='jarowinkler', threshold=0.93, label='feature_link_score')
features = compare_cl.compute(candidate_links, parents, new_child)
# Move the pair index into ordinary columns; the code below refers to them as
# 'level_0' (parents side) and 'level_1' (new_child side).
features.reset_index(inplace=True)

# Add extra data of parents and new_child to feature table and rename conflicting columns
# NOTE(review): find_id is defined outside this excerpt; presumably it maps the
# index label stored in the given level_* column to that frame's 'id' - confirm.
features.loc[:, 'child_id'] = features.apply(find_id, args=(new_child, 'level_1'), axis=1)
features.loc[:, 'parent_id'] = features.apply(find_id, args=(parents, 'level_0'), axis=1)
# Left joins keep every candidate pair even when no matching 'id' is found.
features = features.merge(parents, left_on='parent_id', right_on='id', how='left')
features = features.merge(new_child, left_on='child_id', right_on='id', how='left')
# Drop the helper index columns and the two merged-in 'id' columns (suffixed
# _x / _y by the second merge).
features.drop(columns=['level_0', 'level_1', 'id_x', 'id_y'], inplace=True)
features.rename(columns={'title_x': 'title_parent',
Example #14
0
def assign_postal_lat_lng(df):
    """Return ``df`` extended with ``postal``, ``lat`` and ``lng`` columns.

    The lookup key of each row is ``"<addr>, <city>"``. The postal code comes
    from the module-level ``address_to_postal`` mapping and the coordinates
    from ``address_to_latlng`` (element 0 is used as latitude, element 1 as
    longitude). Keys missing from a mapping give ``None``.
    """
    lookup_keys = df['addr'].str.cat(df['city'], sep=', ')

    postal_codes = []
    latitudes = []
    longitudes = []
    for key in lookup_keys:
        postal_codes.append(address_to_postal.get(key))
        if key in address_to_latlng:
            latitudes.append(address_to_latlng[key][0])
            longitudes.append(address_to_latlng[key][1])
        else:
            latitudes.append(None)
            longitudes.append(None)

    return df.assign(postal=postal_codes, lat=latitudes, lng=longitudes)

# Enrich df with the postal / lat / lng columns and show its first six rows
# (the bare expression only displays something in a notebook / REPL).
df = assign_postal_lat_lng(df)
df.head(6)

import recordlinkage as rl
from recordlinkage.index import Full

# Baseline: a Full index over the single frame df (deduplication).
full_indexer = Full()
pairs = full_indexer.index(df)

print(f"Full index: {len(df)} records, {len(pairs)} pairs")



from recordlinkage.index import Block

# Blocking on the 'postal' column; ``pairs`` is overwritten with the result.
postal_indexer = Block('postal')
pairs = postal_indexer.index(df)

print(f"Postal index: {len(pairs)} pairs")

# First ten candidate pairs as a plain array (displayed in a notebook / REPL).
pairs.to_frame()[:10].values
class TestIndexAlgorithmApi(TestData):
    """General unittest for the indexing API.

    Most tests are parameterized over indexing algorithm instances (Full,
    Block, SortedNeighbourhood and two Random variants) and use the frames
    ``self.a`` and ``self.b``.
    """
    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_repr(self, index_class):
        """str() and repr() agree and start with '<ClassName'."""

        index_str = str(index_class)
        index_repr = repr(index_class)
        self.assertEqual(index_str, index_repr)

        start_str = '<{}'.format(index_class.__class__.__name__)
        self.assertTrue(index_str.startswith(start_str))

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_arguments(self, index_class):
        """Test the index method arguments"""

        # The following should work
        index_class.index(self.a)
        index_class.index(self.a, self.b)
        # NOTE: ``(self.a)`` is not a tuple, it is the same as ``self.a``.
        index_class.index((self.a))
        index_class.index([self.a])
        index_class.index((self.a, self.b))
        index_class.index([self.a, self.b])
        index_class.index(x=(self.a, self.b))

    def test_iterative(self):
        """Test the iterative behaviour."""

        # SINGLE STEP
        index_class = Full()
        pairs = index_class.index((self.a, self.b))
        pairs = pd.DataFrame(index=pairs).sort_index()

        # MULTI STEP
        index_class = Full()

        pairs1 = index_class.index((self.a[0:50], self.b))
        pairs2 = index_class.index((self.a[50:100], self.b))

        pairs_split = pairs1.append(pairs2)
        pairs_split = pd.DataFrame(index=pairs_split).sort_index()

        ptm.assert_frame_equal(pairs, pairs_split)
        # not possible to sort MultiIndex, so made a frame out of it.

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_empty_imput_dataframes(self, index_class):
        """Empty DataFrames"""

        # make an empty dataframe with the columns of self.a and self.b
        df_a = pd.DataFrame(columns=self.a.columns.tolist())
        df_b = pd.DataFrame(columns=self.b.columns.tolist())

        from recordlinkage.index import Random

        if not isinstance(index_class, Random):
            # make an index
            pairs = index_class.index((df_a, df_b))

            # check if the MultiIndex has length 0
            self.assertIsInstance(pairs, pd.MultiIndex)
            self.assertEqual(len(pairs), 0)
        else:
            # Random is expected to reject empty input with a ValueError
            with self.assertRaises(ValueError):
                index_class.index((df_a, df_b))

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_error_handling(self, index_class):
        """Test error handling on non-unique index."""

        # make a non_unique index
        df_a = self.a.rename(index={self.a.index[1]: self.a.index[0]},
                             inplace=False)

        with self.assertRaises(ValueError):
            index_class.index(df_a)

    @parameterized.expand([
        param(Full()),
        param(Block(on='var_arange')),
        param(SortedNeighbourhood(on='var_arange')),
        param(Random(10, random_state=100, replace=True)),
        param(Random(10, random_state=100, replace=False))
    ])
    def test_index_names_dedup(self, index_class):
        """Dedup: pair index names are the frame's index name + _1 / _2."""

        index_names = ['dedup', None, 'index', int(1)]
        expected = [
            ['dedup_1', 'dedup_2'],
            [None, None],
            ['index_1', 'index_2'],
            ['1_1', '1_2'],
        ]

        for i, name in enumerate(index_names):

            index_A = pd.Index(self.a.index).rename(name)
            df_A = pd.DataFrame(self.a, index=index_A)

            pairs = index_class.index((df_A))

            self.assertEqual(pairs.names, expected[i])
            # the input frame's index name must be left untouched
            self.assertEqual(df_A.index.name, name)

    @parameterized.expand([
        param(Full()),
        param(Block(on='var_arange')),
        param(SortedNeighbourhood(on='var_arange')),
        param(Random(10, random_state=100, replace=True)),
        param(Random(10, random_state=100, replace=False))
    ])
    def test_duplicated_index_names_dedup(self, index_class):
        """Dedup: default and custom suffixes are applied to the index name."""

        # make an index for each dataframe with a new index name
        index_a = pd.Index(self.a.index, name='index')
        df_a = pd.DataFrame(self.a, index=index_a)

        # make the index
        pairs = index_class.index(df_a)
        self.assertEqual(pairs.names, ['index_1', 'index_2'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')

        # make the index
        index_class.suffixes = ['_a', '_b']
        pairs = index_class.index(df_a)
        self.assertEqual(pairs.names, ['index_a', 'index_b'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')

    @parameterized.expand([
        param(Full()),
        param(Block(on='var_arange')),
        param(SortedNeighbourhood(on='var_arange')),
        param(Random(10, random_state=100, replace=True)),
        param(Random(10, random_state=100, replace=False))
    ])
    def test_index_names_link(self, index_class):
        """Link: distinct index names of both frames are carried over as-is."""

        # tuples with the name of the first and second index
        index_names = [('index1', 'index2'),
                       ('index1', None), (None, 'index2'), (None, None),
                       (10, 'index2'), (10, 11)]

        for name_a, name_b in index_names:

            # make an index for each dataframe with a new index name
            index_a = pd.Index(self.a.index, name=name_a)
            df_a = pd.DataFrame(self.a, index=index_a)

            index_b = pd.Index(self.b.index, name=name_b)
            df_b = pd.DataFrame(self.b, index=index_b)

            pairs = index_class.index((df_a, df_b))
            self.assertEqual(pairs.names, [name_a, name_b])

            # check for inplace editing (not the intention)
            self.assertEqual(df_a.index.name, name_a)
            self.assertEqual(df_b.index.name, name_b)

    @parameterized.expand([
        param(Full()),
        param(Block(on='var_arange')),
        param(SortedNeighbourhood(on='var_arange')),
        param(Random(10, random_state=100, replace=True)),
        param(Random(10, random_state=100, replace=False))
    ])
    def test_duplicated_index_names_link(self, index_class):
        """Link: identical index names are disambiguated with suffixes."""

        # make an index for each dataframe with a new index name
        index_a = pd.Index(self.a.index, name='index')
        df_a = pd.DataFrame(self.a, index=index_a)

        index_b = pd.Index(self.b.index, name='index')
        df_b = pd.DataFrame(self.b, index=index_b)

        # make the index
        pairs = index_class.index((df_a, df_b))
        self.assertEqual(pairs.names, ['index_1', 'index_2'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')
        self.assertEqual(df_b.index.name, 'index')

        # make the index
        index_class.suffixes = ['_a', '_b']
        pairs = index_class.index((df_a, df_b))
        self.assertEqual(pairs.names, ['index_a', 'index_b'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')
        self.assertEqual(df_b.index.name, 'index')

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_pickle(self, index_class):
        """Test if it is possible to pickle the class."""

        pickle_path = os.path.join(self.test_dir, 'pickle_compare_obj.pickle')

        # pickle before indexing
        # (``with`` closes the file; a bare ``open`` leaked the handle)
        with open(pickle_path, 'wb') as pickle_file:
            pickle.dump(index_class, pickle_file)

        # compute the record pairs
        index_class.index(self.a, self.b)

        # pickle after indexing
        with open(pickle_path, 'wb') as pickle_file:
            pickle.dump(index_class, pickle_file)
import os
import unittest
import tempfile
import shutil
import pickle

import numpy as np
import pandas as pd
import pandas.util.testing as ptm
from parameterized import parameterized, param

import recordlinkage
from recordlinkage.index import Full, Block, SortedNeighbourhood, Random

# One ``param`` per indexing algorithm instance, for use with
# ``parameterized.expand``. The instances are created once, at import time.
TEST_INDEXATION_OBJECTS = [
    param(Full()),
    param(Block(on='var_arange')),
    param(SortedNeighbourhood(on='var_arange')),
    param(Random(10, random_state=100, replace=True)),
    param(Random(10, random_state=100, replace=False))
]


class TestData(unittest.TestCase):
    """Unittest object to setup test data."""
    @classmethod
    def setUpClass(cls):

        n_a = 100
        n_b = 150