def time_random_index(self):
    """Run random indexing over ``self.A``.

    A ``RandomIndex`` is created with 2500 as its only argument and
    asked to index ``self.A``; the resulting pairs are discarded.
    """
    # Build the indexer and make the pairs in one go.
    rl.RandomIndex(2500).index(self.A)
def test_random_seed(self):
    """Random: test seeding random algorithm

    Two indexers built with the same ``random_state`` must return equal
    pairs; an indexer built with a different ``random_state`` must not.
    """
    frames = (self.a, self.b)

    # Two indexers share seed 100, the third one is seeded with 101.
    same_seed_first = recordlinkage.RandomIndex(n=1000, random_state=100)
    same_seed_second = recordlinkage.RandomIndex(n=1000, random_state=100)
    other_seed = recordlinkage.RandomIndex(n=1000, random_state=101)

    result_first = same_seed_first.index(frames)
    result_second = same_seed_second.index(frames)
    result_other = other_seed.index(frames)

    # Identical seeds have to give identical pairs.
    ptm.assert_index_equal(result_first, result_second)

    # Different seeds have to give different pairs. The comparison is
    # done on the underlying numpy values (numpy workaround).
    values_match = np.array_equal(result_first.values, result_other.values)
    self.assertFalse(values_match)
def test_random_with_replace(self):
    """Random: test random indexing with replacement

    For both linking (two frames) and deduplication (one frame), the
    test expects exactly ``n`` pairs and at least one repeated pair.
    """
    # Situation 1: linking two frames.
    link_indexer = recordlinkage.RandomIndex(
        n=1000, replace=True, random_state=100)
    link_pairs = link_indexer.index((self.a, self.b))

    self.assertEqual(len(link_pairs), 1000)
    self.assertFalse(link_pairs.is_unique)

    # Situation 2: deduplicating a single frame.
    dedup_indexer = recordlinkage.RandomIndex(
        n=1000, replace=True, random_state=101)
    dedup_pairs = dedup_indexer.index(self.a)

    self.assertEqual(len(dedup_pairs), 1000)
    self.assertFalse(dedup_pairs.is_unique)
import tempfile
import shutil
import pickle

import numpy as np
import pandas as pd
import pandas.util.testing as ptm
from parameterized import parameterized, param

import recordlinkage

# Indexer instances, each wrapped in ``param`` so the list can be handed
# to ``parameterized.expand``. The instances are created once, here, at
# import time.
TEST_INDEXATION_OBJECTS = [
    param(recordlinkage.FullIndex()),
    param(recordlinkage.BlockIndex(on='var_arange')),
    param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
    param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
    param(recordlinkage.RandomIndex(10, random_state=100, replace=False)),
]


class TestData(unittest.TestCase):
    """Unittest object to setup test data."""

    @classmethod
    def setUpClass(cls):
        """Build the record label lists stored on the class."""

        # Number of labels generated for each of the two lists.
        n_a = 100
        n_b = 150

        # Labels 'rec_a_0' .. 'rec_a_99' and 'rec_b_0' .. 'rec_b_149'.
        # NOTE(review): presumably used as the indexes of the test
        # DataFrames -- confirm against the rest of this method.
        cls.index_a = ['rec_a_%s' % i for i in range(0, n_a)]
        cls.index_b = ['rec_b_%s' % i for i in range(0, n_b)]
class TestIndexApi(TestData):
    """General unittest for the indexing API.

    Most tests are expanded with ``parameterized`` so that they run once
    for every indexer instance in the supplied ``param`` list.
    """

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_repr(self, index_class):
        """str() and repr() must agree and start with '<ClassName'."""

        index_str = str(index_class)
        index_repr = repr(index_class)
        self.assertEqual(index_str, index_repr)

        start_str = '<{}'.format(index_class.__class__.__name__)
        self.assertTrue(index_str.startswith(start_str))

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_arguments(self, index_class):
        """Test the index method arguments"""

        # The following should work
        index_class.index(self.a)
        index_class.index(self.a, self.b)
        index_class.index((self.a))
        index_class.index([self.a])
        index_class.index((self.a, self.b))
        index_class.index([self.a, self.b])
        index_class.index(x=(self.a, self.b))

    def test_iterative(self):
        """Test the iterative behaviour."""

        # SINGLE STEP
        index_class = recordlinkage.FullIndex()
        pairs = index_class.index((self.a, self.b))
        pairs = pd.DataFrame(index=pairs).sort_index()

        # MULTI STEP
        index_class = recordlinkage.FullIndex()

        pairs1 = index_class.index((self.a[0:50], self.b))
        pairs2 = index_class.index((self.a[50:100], self.b))

        pairs_split = pairs1.append(pairs2)
        pairs_split = pd.DataFrame(index=pairs_split).sort_index()

        # Not possible to sort the MultiIndex directly, so a frame is
        # made out of each result and the frames are compared instead.
        ptm.assert_frame_equal(pairs, pairs_split)

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_empty_imput_dataframes(self, index_class):
        """Empty DataFrames"""

        # make an empty dataframe with the columns of self.a and self.b
        df_a = pd.DataFrame(columns=self.a.columns.tolist())
        df_b = pd.DataFrame(columns=self.b.columns.tolist())

        if not isinstance(index_class, recordlinkage.RandomIndex):
            # make an index
            pairs = index_class.index((df_a, df_b))

            # check if the MultiIndex has length 0
            self.assertIsInstance(pairs, pd.MultiIndex)
            self.assertEqual(len(pairs), 0)
        else:
            # RandomIndex is expected to reject empty input instead.
            with self.assertRaises(ValueError):
                index_class.index((df_a, df_b))

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_error_handling(self, index_class):
        """Test error handling on non-unique index."""

        # make a non_unique index: the second label is renamed to the
        # first one, so that label occurs twice
        df_a = self.a.rename(
            index={self.a.index[1]: self.a.index[0]}, inplace=False)

        with self.assertRaises(ValueError):
            index_class.index(df_a)

    # NOTE(review): the param lists below repeat the contents of
    # TEST_INDEXATION_OBJECTS. Presumably they are kept separate so each
    # test gets its own indexer instances (some tests assign to
    # ``index_class.suffixes``) -- confirm before deduplicating.
    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=False))
    ])
    def test_index_names_dedup(self, index_class):
        """Pair level names for deduplication with various index names."""

        index_names = ['dedup', None, 'index', 1]
        expected = [
            ['dedup_1', 'dedup_2'],
            [None, None],
            ['index_1', 'index_2'],
            ['1_1', '1_2'],
        ]

        for name, expected_names in zip(index_names, expected):

            index_A = pd.Index(self.a.index).rename(name)
            df_A = pd.DataFrame(self.a, index=index_A)

            pairs = index_class.index((df_A))

            self.assertEqual(pairs.names, expected_names)
            # check for inplace editing (not the intention)
            self.assertEqual(df_A.index.name, name)

    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=False))
    ])
    def test_duplicated_index_names_dedup(self, index_class):
        """Pair level names for deduplication, default and custom suffixes."""

        # make an index for each dataframe with a new index name
        index_a = pd.Index(self.a.index, name='index')
        df_a = pd.DataFrame(self.a, index=index_a)

        # make the index
        pairs = index_class.index(df_a)
        self.assertEqual(pairs.names, ['index_1', 'index_2'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')

        # make the index again, now with custom suffixes
        index_class.suffixes = ['_a', '_b']
        pairs = index_class.index(df_a)
        self.assertEqual(pairs.names, ['index_a', 'index_b'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')

    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=False))
    ])
    def test_index_names_link(self, index_class):
        """Pair level names for linking must equal the input index names."""

        # tuples with the name of the first and second index
        index_names = [
            ('index1', 'index2'),
            ('index1', None),
            (None, 'index2'),
            (None, None),
            (10, 'index2'),
            (10, 11)
        ]

        for name_a, name_b in index_names:

            # make an index for each dataframe with a new index name
            index_a = pd.Index(self.a.index, name=name_a)
            df_a = pd.DataFrame(self.a, index=index_a)

            index_b = pd.Index(self.b.index, name=name_b)
            df_b = pd.DataFrame(self.b, index=index_b)

            pairs = index_class.index((df_a, df_b))
            self.assertEqual(pairs.names, [name_a, name_b])

            # check for inplace editing (not the intention)
            self.assertEqual(df_a.index.name, name_a)
            self.assertEqual(df_b.index.name, name_b)

    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=False))
    ])
    def test_duplicated_index_names_link(self, index_class):
        """Pair level names for linking when both frames share a name."""

        # make an index for each dataframe with a new index name
        index_a = pd.Index(self.a.index, name='index')
        df_a = pd.DataFrame(self.a, index=index_a)

        index_b = pd.Index(self.b.index, name='index')
        df_b = pd.DataFrame(self.b, index=index_b)

        # make the index
        pairs = index_class.index((df_a, df_b))
        self.assertEqual(pairs.names, ['index_1', 'index_2'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')
        self.assertEqual(df_b.index.name, 'index')

        # make the index again, now with custom suffixes
        index_class.suffixes = ['_a', '_b']
        pairs = index_class.index((df_a, df_b))
        self.assertEqual(pairs.names, ['index_a', 'index_b'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')
        self.assertEqual(df_b.index.name, 'index')

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_pickle(self, index_class):
        """Test if it is possible to pickle the class."""

        pickle_path = os.path.join(self.test_dir, 'pickle_compare_obj.pickle')

        # pickle before indexing; the context manager closes the file so
        # the handle is not leaked and the data is flushed before the
        # same path is opened again below
        with open(pickle_path, 'wb') as pickle_file:
            pickle.dump(index_class, pickle_file)

        # compute the record pairs
        index_class.index(self.a, self.b)

        # pickle after indexing
        with open(pickle_path, 'wb') as pickle_file:
            pickle.dump(index_class, pickle_file)