Esempio n. 1
0
    def test_random_index(self):
        """random(n) returns exactly n record pairs without duplicates."""

        frame_a = pd.DataFrame(
            self.data_A, index=pd.Index(self.index, name='index_a'))
        frame_b = pd.DataFrame(
            self.data_B, index=pd.Index(self.index, name='index_b'))

        # The same two checks are run for each requested sample size, each
        # time on a freshly constructed indexer.
        for n_requested in (20, 5):
            sampled = rl.Pairs(frame_a, frame_b).random(n_requested)

            # No record pair may occur twice in the sample.
            self.assertTrue(sampled.is_unique)

            # The sample holds exactly the requested number of pairs.
            self.assertEqual(len(sampled), n_requested)
Esempio n. 2
0
    def time_block_index(self):
        """Build a Pairs indexer on self.A and block on 'given_name'.

        The resulting pairs are discarded.
        """
        rl.Pairs(self.A).block('given_name')
Esempio n. 3
0
    def test_index_unique_dedup(self):
        """Pairs raises IndexError for a frame indexed by self.bad_index_a."""

        # presumably bad_index_a contains duplicate labels -- confirm
        # against the fixture definition.
        frame = pd.DataFrame(
            self.A, index=pd.Index(self.bad_index_a, name='left'))

        self.assertRaises(IndexError, recordlinkage.Pairs, frame)
Esempio n. 4
0
    def time_full_index(self):
        """Build a Pairs indexer on self.A and generate the full index.

        The resulting pairs are discarded.
        """
        rl.Pairs(self.A).full()
Esempio n. 5
0
    def time_sni_index(self):
        """Build a Pairs indexer on self.A and run sorted neighbourhood.

        Indexing is done on 'given_name' with 5 as the second positional
        argument; the resulting pairs are discarded.
        """
        rl.Pairs(self.A).sortedneighbourhood('given_name', 5)
Esempio n. 6
0
    def time_random_index(self):
        """Build a Pairs indexer on self.A and request 2500 random pairs.

        The resulting pairs are discarded.
        """
        rl.Pairs(self.A).random(2500)
Esempio n. 7
0
    def test_random_index_errors(self):
        """random() raises ValueError for every invalid sample size."""

        frame_a = pd.DataFrame(
            self.data_A, index=pd.Index(self.index, name='index_a'))
        frame_b = pd.DataFrame(
            self.data_B, index=pd.Index(self.index, name='index_b'))

        indexer = rl.Pairs(frame_a, frame_b)

        # Negative, zero, fractional float, integral float and non-numeric
        # arguments must all be rejected.
        for bad_size in (-10, 0, 5.5, 5.0, 'str'):
            with self.assertRaises(ValueError):
                indexer.random(bad_size)
Esempio n. 8
0
    def test_instance_dedup(self, method_to_call, *args, **kwargs):
        """The named index method returns a MultiIndex for a single frame.

        :param method_to_call: name of the Pairs method to invoke
        :param args: positional arguments forwarded to that method
        :param kwargs: keyword arguments forwarded to that method
        """
        indexer = recordlinkage.Pairs(pd.DataFrame(self.A))
        make_pairs = getattr(indexer, method_to_call)

        self.assertIsInstance(make_pairs(*args, **kwargs), pd.MultiIndex)
Esempio n. 9
0
    def test_amount_of_pairs_dedup(self, method_to_call, *args, **kwargs):
        """The number of pairs never exceeds n * (n - 1) / 2 for n records.

        :param method_to_call: name of the Pairs method to invoke
        :param args: positional arguments forwarded to that method
        :param kwargs: keyword arguments forwarded to that method
        """
        frame = pd.DataFrame(self.A)
        make_pairs = getattr(recordlinkage.Pairs(frame), method_to_call)
        n_pairs = len(make_pairs(*args, **kwargs))

        # Upper bound: every unordered combination of two distinct records.
        n_records = len(frame)
        upper_bound = n_records * (n_records - 1) / 2
        self.assertTrue(n_pairs <= upper_bound)
Esempio n. 10
0
    def test_pairs_are_unique_linking(self, method_to_call, *args, **kwargs):
        """The named index method yields no duplicate pairs when linking.

        :param method_to_call: name of the Pairs method to invoke
        :param args: positional arguments forwarded to that method
        :param kwargs: keyword arguments forwarded to that method
        """
        frame_a = pd.DataFrame(self.A)
        frame_b = pd.DataFrame(self.B)

        make_pairs = getattr(
            recordlinkage.Pairs(frame_a, frame_b), method_to_call)
        result = make_pairs(*args, **kwargs)

        self.assertTrue(result.is_unique)
Esempio n. 11
0
    def test_empty_dataframes_dedup(self, method_to_call, *args, **kwargs):
        """Index methods still return a MultiIndex for a frame without rows.

        :param method_to_call: name of the Pairs method to invoke
        :param args: positional arguments forwarded to that method
        :param kwargs: keyword arguments forwarded to that method
        """
        # Same columns as the fixture, but no records.
        empty_frame = pd.DataFrame(columns=self.A.keys())

        # 'random' is deliberately not exercised on an empty frame.
        if method_to_call == 'random':
            return

        indexer = recordlinkage.Pairs(empty_frame)
        result = getattr(indexer, method_to_call)(*args, **kwargs)

        self.assertIsInstance(result, pd.MultiIndex)
Esempio n. 12
0
    def test_index_unique_linking(self):
        """Pairs raises IndexError when frame A is indexed by bad_index_a."""

        # presumably bad_index_a contains duplicate labels -- confirm
        # against the fixture definition.
        frame_a = pd.DataFrame(
            self.A, index=pd.Index(self.bad_index_a, name='left'))
        frame_b = pd.DataFrame(
            self.B, index=pd.Index(self.index_b, name='right'))

        self.assertRaises(IndexError, recordlinkage.Pairs, frame_a, frame_b)
Esempio n. 13
0
    def test_reduction_ratio_dedup(self, method_to_call, *args, **kwargs):
        """After indexing, the indexer's reduction lies within [0, 1].

        :param method_to_call: name of the Pairs method to invoke
        :param args: positional arguments forwarded to that method
        :param kwargs: keyword arguments forwarded to that method
        """
        indexer = recordlinkage.Pairs(pd.DataFrame(self.A))

        # Only the side effect on the indexer matters; the pairs themselves
        # are not inspected.
        getattr(indexer, method_to_call)(*args, **kwargs)

        ratio = indexer.reduction
        not_negative = ratio >= 0.0
        at_most_one = ratio <= 1.0

        self.assertTrue(not_negative & at_most_one)
Esempio n. 14
0
def find_all_matches(first_df, second_df):
    """Compare every record pair of two frames on first and last name.

    The full index is produced in chunks (``chunks=500``). Each chunk is
    compared with the 'jaro' string method on 'first_name' (threshold 0.72)
    and 'last_name' (threshold 0.83). The comparison vectors of *all* chunks
    are concatenated before being handed to ``get_df_from_vector``.

    Previously a new ``Compare`` object replaced the old one on every
    iteration, so only the vectors of the last chunk were returned, and an
    empty index left the result variable unbound.

    :param first_df: first frame; must provide 'first_name' and 'last_name'
    :param second_df: second frame; must provide 'first_name' and 'last_name'
    :return: the value returned by ``get_df_from_vector`` for the combined
        comparison vectors
    """
    # Local import keeps this function self-contained.
    import pandas as pd

    index = recordlinkage.Pairs(first_df, second_df, chunks=500)

    chunk_vectors = []
    for links in index.full():

        fill = recordlinkage.Compare(links, first_df, second_df)

        fill.string('first_name', 'first_name', method='jaro', threshold=0.72)
        fill.string('last_name', 'last_name', method='jaro', threshold=0.83)

        # Keep this chunk's result; the next iteration builds a new Compare.
        chunk_vectors.append(fill.vectors)

    if chunk_vectors:
        vectors = pd.concat(chunk_vectors)
    else:
        # No candidate pairs at all: pass on an empty set of vectors.
        vectors = pd.DataFrame()

    return get_df_from_vector(vectors, first_df, second_df)
Esempio n. 15
0
    def test_index_names_linking(self, method_to_call, *args, **kwargs):
        """Indexing must leave the (unset) index names of both frames alone.

        :param method_to_call: name of the Pairs method to invoke
        :param args: positional arguments forwarded to that method
        :param kwargs: keyword arguments forwarded to that method
        """
        frame_a = pd.DataFrame(self.A)
        frame_b = pd.DataFrame(self.B)

        indexer = recordlinkage.Pairs(frame_a, frame_b)
        getattr(indexer, method_to_call)(*args, **kwargs)

        # Neither input frame may have gained an index name.
        for frame in (frame_a, frame_b):
            self.assertEqual(frame.index.name, None)
Esempio n. 16
0
    def setup(self):
        """Load the FEBRL4 frames and keep the first 50000 candidate pairs.

        Sets ``self.A``, ``self.B`` and ``self.pairs``.
        """
        # download data
        self.A, self.B = load_febrl4()

        # Convert the postcode column of both frames to float
        # (presumably to have a numeric column available -- confirm).
        self.A['postcode'] = self.A['postcode'].astype(float)
        self.B['postcode'] = self.B['postcode'].astype(float)

        # make pairs
        c_pairs = rl.Pairs(self.A, self.B)

        # Slice bounds must be integers: the former float bound (5e4) is
        # rejected with a TypeError by NumPy-backed indexes.
        self.pairs = c_pairs.full()[0:50000]
Esempio n. 17
0
    def test_full_iter_index_dedup(self, method_to_call, *args, **kwargs):
        """Chunked indexing yields as many pairs in total as one-shot indexing.

        :param method_to_call: name of the Pairs method to invoke
        :param args: positional arguments forwarded to that method
        :param kwargs: keyword arguments forwarded to that method
        """
        frame = pd.DataFrame(self.A)

        chunked_indexer = recordlinkage.Pairs(frame, chunks=100)
        plain_indexer = recordlinkage.Pairs(frame)

        # Reference result: all pairs computed in a single call.
        all_at_once = getattr(plain_indexer, method_to_call)(*args, **kwargs)

        # Same method on the chunked indexer; iterate over what it returns.
        total_in_chunks = 0
        chunks = getattr(chunked_indexer, method_to_call)(*args, **kwargs)
        for chunk in chunks:

            chunk_size = len(chunk)
            print(chunk_size)
            total_in_chunks += chunk_size

            # Within a chunk no pair may be repeated.
            self.assertTrue(chunk.is_unique)

        self.assertEqual(len(all_at_once), total_in_chunks)
Esempio n. 18
0
    def test_index_names_not_none_dedup(self, method_to_call, *args, **kwargs):
        """A named index gives both pair levels that name and is kept as is.

        :param method_to_call: name of the Pairs method to invoke
        :param args: positional arguments forwarded to that method
        :param kwargs: keyword arguments forwarded to that method
        """
        frame = pd.DataFrame(
            self.A, index=pd.Index(self.index_a, name='dedup'))

        make_pairs = getattr(recordlinkage.Pairs(frame), method_to_call)
        result = make_pairs(*args, **kwargs)

        # Both levels of the pairs carry the name of the single input index.
        self.assertEqual(result.names, ['dedup', 'dedup'])

        # The input frame keeps its index name.
        self.assertEqual(frame.index.name, 'dedup')
Esempio n. 19
0
def runSimpleTest2(find_address=None):
    """
    Matching a given address against the database.
    Uses postcode blocking, Jaro-Winkler for the postcode and Damerau -
    Levenshtein distance for the remaining address.
    Finds the best match by identifying the maximum of the sum of the
    comparison vectors. All results are printed.

    :param find_address: column name -> list of values for the address to
                         look up; needs the keys 'street' and 'postcode'.
                         When omitted, a built-in example address is used.
    :type find_address: dict

    :return: None
    """
    if find_address is None:
        # Built on every call rather than shared as a mutable default value.
        find_address = {'street': ['6 PROSPCT GARDNS EXTER', ],
                        'postcode': ['EX4 6TA', ]}

    print('Matching')
    pprint.pprint(find_address)

    # get data from the database against which we are linking
    df = getData()
    df['postcode'] = df.apply(data._getPostcode, axis=1)
    df['street'] = df.apply(data._removePostcode, axis=1)
    df.drop(['address', ], axis=1, inplace=True)

    # data frame of the one being linked
    find = pd.DataFrame(find_address)

    print('Start parsed matching with postcode blocking...')
    start = datetime.datetime.now()

    # set blocking
    pcl = recordlinkage.Pairs(df, find)
    pairs = pcl.block('postcode')
    print('\nAfter blocking, need to test', len(pairs))

    # compare the two data sets - use different metrics for the comparison;
    # an earlier variant passed threshold=0.95 (postcode) and
    # threshold=0.85 (street), the current calls pass no threshold
    compare = recordlinkage.Compare(pairs, df, find, batch=True)
    compare.string('postcode', 'postcode', method='jarowinkler', name='postcode_jw')
    compare.string('street', 'street', method='damerau_levenshtein', name='street_dl')
    compare.run()

    # The comparison vectors
    print('\nComparison vectors:')
    print(compare.vectors)

    # find the matches and the best match
    matchmetrics = compare.vectors.sum(axis=1)
    potentialMatches = matchmetrics.index.levels[0].tolist()
    print('\nPotential Matches:')
    print(df.loc[potentialMatches])
    print('\nBest Match:')
    if matchmetrics.empty:
        # Nothing survived blocking, so there is no maximum to look up.
        print('No candidate pairs found.')
    else:
        # idxmax returns the (df label, find label) pair with the highest
        # score; argmax is positional in current pandas and cannot be
        # subscripted.
        best_pair = matchmetrics.idxmax()
        print(df.loc[best_pair[0]])

    stop = datetime.datetime.now()
    # total_seconds() covers the whole duration; .microseconds is only the
    # sub-second component and is wrong for runs of one second or longer.
    elapsed = (stop - start).total_seconds()
    print('\nRun in', round(elapsed, 2), 'seconds...')
Esempio n. 20
0
    def setup(self):
        """Load the FEBRL1 frame and prepare pair samples of several sizes.

        Sets ``self.A`` and the attributes ``self.pairs_xsmall``,
        ``self.pairs_small``, ``self.pairs_medium`` and ``self.pairs_large``,
        each a leading slice of the full index.
        """
        # download data
        self.A = load_febrl1()

        # make pairs
        c_pairs = rl.Pairs(self.A)
        pairs = c_pairs.full()

        # different sizes of pairs; the bounds are integers because float
        # slice bounds (5e3, 5e4, ...) are rejected with a TypeError by
        # NumPy-backed indexes
        self.pairs_xsmall = pairs[0:5000]
        self.pairs_small = pairs[0:50000]
        self.pairs_medium = pairs[0:500000]
        self.pairs_large = pairs[0:5000000]
Esempio n. 21
0
    def test_sni_index(self):
        """Sorted neighbourhood pairs are unique and at most len(A) * len(B)."""

        frame_a = pd.DataFrame(
            self.data_A, index=pd.Index(self.index, name='index_a'))
        frame_b = pd.DataFrame(
            self.data_B, index=pd.Index(self.index, name='index_b'))

        # Sorted neighbourhood indexing on the 'name' column.
        found = rl.Pairs(frame_a, frame_b).sortedneighbourhood('name')

        # No record pair may occur twice.
        self.assertTrue(found.is_unique)

        # There cannot be more pairs than the full cross product.
        cross_product_size = len(frame_a) * len(frame_b)
        self.assertTrue(len(found) <= cross_product_size)
Esempio n. 22
0
    def test_full_index(self):
        """Full indexing yields unique pairs, as many as self.fullindex."""

        frame_a = pd.DataFrame(
            self.data_A, index=pd.Index(self.index, name='index_a'))
        frame_b = pd.DataFrame(
            self.data_B, index=pd.Index(self.index, name='index_b'))

        # Full index over both frames.
        found = rl.Pairs(frame_a, frame_b).full()

        # No record pair may occur twice.
        self.assertTrue(found.is_unique)

        # The size must equal that of the reference index on the fixture.
        expected_size = len(self.fullindex)
        self.assertEqual(len(found), expected_size)
Esempio n. 23
0
    def test_qgram_index(self):
        """Q-gram indexing yields unique pairs, at most len(A) * len(B)."""

        frame_a = pd.DataFrame(
            self.data_A, index=pd.Index(self.index, name='index_a'))
        frame_b = pd.DataFrame(
            self.data_B, index=pd.Index(self.index, name='index_b'))

        # Q-gram indexing on the 'name' column.
        found = rl.Pairs(frame_a, frame_b).qgram('name')

        # No record pair may occur twice.
        self.assertTrue(found.is_unique)

        # There cannot be more pairs than the full cross product.
        cross_product_size = len(frame_a) * len(frame_b)
        self.assertTrue(len(found) <= cross_product_size)
Esempio n. 24
0
    def test_index_names_eq_linking(self, method_to_call, *args, **kwargs):
        """Two frames sharing one index name: pair levels both get that name.

        :param method_to_call: name of the Pairs method to invoke
        :param args: positional arguments forwarded to that method
        :param kwargs: keyword arguments forwarded to that method
        """
        shared_name = 'leftright'

        frame_a = pd.DataFrame(
            self.A, index=pd.Index(self.index_a, name='leftright'))
        frame_b = pd.DataFrame(
            self.B, index=pd.Index(self.index_b, name='leftright'))

        make_pairs = getattr(
            recordlinkage.Pairs(frame_a, frame_b), method_to_call)
        result = make_pairs(*args, **kwargs)

        self.assertEqual(result.names, [shared_name, shared_name])

        # The input frames keep their index names.
        self.assertEqual(frame_a.index.name, shared_name)
        self.assertEqual(frame_b.index.name, shared_name)
Esempio n. 25
0
    def test_index_names_none_linking(self, method_to_call, *args, **kwargs):
        """Unnamed input indexes give unnamed pair levels and stay unnamed.

        :param method_to_call: name of the Pairs method to invoke
        :param args: positional arguments forwarded to that method
        :param kwargs: keyword arguments forwarded to that method
        """
        frame_a = pd.DataFrame(self.A, index=pd.Index(self.index_a))
        frame_b = pd.DataFrame(self.B, index=pd.Index(self.index_b))

        make_pairs = getattr(
            recordlinkage.Pairs(frame_a, frame_b), method_to_call)
        result = make_pairs(*args, **kwargs)

        self.assertEqual(result.names, [None, None])

        # Indexing must not have assigned a name to either input index.
        for frame in (frame_a, frame_b):
            self.assertEqual(frame.index.name, None)
    def test_blocking_special_case_of_sorting(self):
        """Sorted neighbourhood with window=1 finds as many pairs as blocking."""

        frame_a = pd.DataFrame(
            self.data_A, index=pd.Index(self.index, name='index_a'))
        frame_b = pd.DataFrame(
            self.data_B, index=pd.Index(self.index, name='index_b'))

        # One indexer serves both indexing strategies.
        indexer = rl.Pairs(frame_a, frame_b)

        blocked = indexer.block('name')
        neighbours = indexer.sortedneighbourhood('name', window=1)

        print('The number of record pairs found with blocking', len(blocked))
        print("The number of record pairs found with sorted "
              "neighbourhood indexing", len(neighbours))

        # Only the sizes of the two results are compared.
        self.assertEqual(len(blocked), len(neighbours))
Esempio n. 27
0
    def test_index_names_one_none_linking(self, method_to_call, *args,
                                          **kwargs):
        """Only B's index is named: pair levels are named [None, 'right'].

        :param method_to_call: name of the Pairs method to invoke
        :param args: positional arguments forwarded to that method
        :param kwargs: keyword arguments forwarded to that method
        """
        # Frame A: index without a name.
        unnamed_a = pd.Index(self.index_a, name=None)
        frame_a = pd.DataFrame(self.A, index=pd.Index(unnamed_a, name=None))

        # Frame B: index built without a name, then named 'right'.
        unnamed_b = pd.Index(self.index_b, name=None)
        frame_b = pd.DataFrame(self.B, index=pd.Index(unnamed_b, name='right'))

        make_pairs = getattr(
            recordlinkage.Pairs(frame_a, frame_b), method_to_call)
        result = make_pairs(*args, **kwargs)

        print(result.names)
        print(frame_a.index.name)

        self.assertEqual(result.names, [None, 'right'])

        # The input frames keep the index names they were given.
        self.assertEqual(frame_a.index.name, None)
        self.assertEqual(frame_b.index.name, 'right')
Esempio n. 28
0
    def test_sni_index_errors(self):
        """sortedneighbourhood raises ValueError for each invalid window."""

        frame_a = pd.DataFrame(
            self.data_A, index=pd.Index(self.index, name='index_a'))
        frame_b = pd.DataFrame(
            self.data_B, index=pd.Index(self.index, name='index_b'))

        indexer = rl.Pairs(frame_a, frame_b)

        # Each of these window arguments is expected to be rejected.
        for bad_window in (-3, 2, 'str', 2.5):
            with self.assertRaises(ValueError):
                indexer.sortedneighbourhood('name', bad_window)
Esempio n. 29
0
def testImpactOfNone(fill=False):
    """
    Show how missing sub-building names influence string comparison.

    Builds two small frames that share a single postcode, blocks on the
    postcode, compares the sub-building name columns with
    'damerau_levenshtein' (missing_value=0.1) and prints the comparison
    vectors joined back onto both input frames.

    :param fill: if True, null sub-building names in both frames are
                 replaced by the string 'NULL' before pairing
    :type fill: bool

    :return: None
    """
    # fake data: the records to match and the reference addresses
    toMatch = pd.DataFrame({
        'SubBuildingName': ['FLAT 1', None],
        'Postcode': ['AA', 'AA']
    })
    AddressBase = pd.DataFrame({
        'SubBuildingNameAB': ['FLAT 1', 'FLAT A', None],
        'PostcodeAB': ['AA', 'AA', 'AA']
    })

    if fill:
        # overwrite nulls in place, first in toMatch and then in AddressBase
        for frame, column in ((toMatch, 'SubBuildingName'),
                              (AddressBase, 'SubBuildingNameAB')):
            is_missing = frame[column].isnull()
            frame.loc[is_missing, column] = 'NULL'

    # candidate pairs: block on the postcode columns of the two frames
    indexer = recordlinkage.Pairs(toMatch, AddressBase)
    pairs = indexer.block(left_on=['Postcode'], right_on=['PostcodeAB'])
    print('Need to test', len(pairs), 'pairs for', len(toMatch.index),
          'addresses...')

    # NOTE(review): the frames are handed to Compare in the opposite order
    # (AddressBase, toMatch) from the one used for Pairs (toMatch,
    # AddressBase) -- confirm that this is intended.
    compare = recordlinkage.Compare(pairs, AddressBase, toMatch, batch=True)
    compare.string('SubBuildingNameAB', 'SubBuildingName',
                   missing_value=0.1, method='damerau_levenshtein',
                   name='test')
    compare.run()

    # append the row-wise sum of all comparison columns
    vectors = compare.vectors
    vectors['similarity_sum'] = vectors.sum(axis=1)

    # keep rows whose sum exceeds the limit, best score first, and turn the
    # pair index into ordinary columns
    above_limit = vectors['similarity_sum'] > -1.
    matches = vectors.loc[above_limit]
    matches = matches.sort_values(by='similarity_sum', ascending=False)
    matches = matches.reset_index()

    # expose the row labels of both inputs as an 'index' column
    toMatch = toMatch.reset_index()
    AddressBase = AddressBase.reset_index()

    # join the inputs back on: first level -> toMatch, second -> AddressBase
    joined = pd.merge(matches, toMatch, how='left',
                      left_on='level_0', right_on='index')
    joined = pd.merge(joined, AddressBase, how='left',
                      left_on='level_1', right_on='index')
    print(joined)
Esempio n. 30
0
# Wrap B (defined earlier in the script, outside this excerpt) in a DataFrame
# and fill in its columns.
B = pd.DataFrame(B) 
B['lastname'] = ['Smith', 'Clayton', 'Smith', 'Payne', 'Smith', 'Taylor']
B['age'] = [11, 34, 32, 21, 56, 72]
B['sex'] = ['M', 'M', 'M', 'F', 'F', 'F']
B['lat'] = [0.23, 10.54, 45.98, 23.43, 10.55, 65.21]
B['lon'] = [110.32, -66.98, 10.12, 45.45, -66.96, 32.04]
B.index.name = 'B'  # name the index; the author notes that rl needs a named index

# Show both frames, each preceded by an empty line.
print()
print(A)
print()
print(B)

# -------------------
# Record linkage

A = A  # no-op self-assignment; marks the first frame to compare
B = B  # no-op self-assignment; marks the second frame to compare

# Candidate pairs: records of A and B blocked on 'lastname'.
index = rl.Pairs(A, B)
candidate_links = index.block('lastname')

compare = rl.Compare(candidate_links, A, B)

# The bare string literal below is an expression statement with no effect; it
# holds a disabled string comparison on 'name'.
'''
compare.string('name', 'name', method='jarowinkler', threshold=0.85)'''  # the author reported issues with this comparison
# Exact comparisons on the 'sex' and 'age' columns of both frames.
compare.exact('sex', 'sex')
compare.exact('age', 'age')