Code Example #1
import numpy as np
import pandas as pd
import pandas.util.testing as ptm

from recordlinkage import index_split


def test_multiindex_split():

    # a 5 x 6 product index: 30 pairs, split into 3 chunks of 10
    index = pd.MultiIndex.from_product([np.arange(5), np.arange(6)])
    result = index_split(index, 3)

    assert len(result) == 3

    for i, result_index_chunk in enumerate(result):
        expected_index_chunk = index[i * 10:(i + 1) * 10]
        ptm.assert_index_equal(result_index_chunk, expected_index_chunk)

        # each chunk keeps both levels of the original MultiIndex
        assert len(result_index_chunk.levels) == 2
        assert len(result_index_chunk.labels) == 2
Code Example #2
File: test_misc.py  Project: cesar530/recordlinkage
import numpy as np
import pandas as pd
import pandas.testing as pdt

from recordlinkage import index_split


def test_multiindex_split():

    index = pd.MultiIndex.from_product([np.arange(5), np.arange(6)])
    result = index_split(index, 3)

    assert len(result) == 3

    for i, result_index_chunk in enumerate(result):
        expected_index_chunk = index[i * 10:(i + 1) * 10]
        pdt.assert_index_equal(result_index_chunk, expected_index_chunk)

        assert len(result_index_chunk.levels) == 2
        # pandas 0.24 renamed MultiIndex.labels to MultiIndex.codes;
        # is_min_pandas_version is a helper defined elsewhere in this test module
        if is_min_pandas_version("0.24.0"):
            assert len(result_index_chunk.codes) == 2
        else:
            assert len(result_index_chunk.labels) == 2
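Both tests pin down the behavior of recordlinkage.index_split: it cuts a pair index into a given number of roughly equal chunks. A minimal sketch of that behavior outside the test harness, using the same 5 x 6 product index as above:

import numpy as np
import pandas as pd

from recordlinkage import index_split

# 30 candidate pairs, split into 3 chunks of 10 pairs each
index = pd.MultiIndex.from_product([np.arange(5), np.arange(6)])

for chunk in index_split(index, 3):
    print(len(chunk))  # prints 10 three times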
Code Example #3
import time

import numpy as np
import pandas as pd
import recordlinkage as rl


def match_data_chunk(df, s):
    count_pass = 0
    start = time.process_time()
    all_matches = []

    # keep rows whose email is missing or whose non-null email is unique
    dup_email = df['email'].notnull() & df['email'].duplicated(keep=False)
    df0 = df[~dup_email]

    # set aside the duplicated (non-null) emails for later review
    dupe_emails = df.loc[dup_email, ['customer_id', 'email']]

    # sample a fraction s of the de-duplicated rows
    df1 = df0.sample(frac=s, replace=False, random_state=42)
    df1.set_index('customer_id', inplace=True)
    dupe_indexer = rl.Index()

    # set up the blocking rules
    dupe_indexer.block(left_on=[
        'first_name_clean', 'last_name_clean', 'zip_clean', 'state_clean'
    ])
    dupe_indexer.block(left_on=['first_name_clean', 'last_name_clean'])
    dupe_indexer.block(left_on=['zip_clean', 'first_name_clean'])
    dupe_indexer.block(left_on=['zip_clean', 'last_name_clean'])
    #dupe_indexer.block(left_on=['zip_clean', 'state_clean'])
    dupe_indexer.block(left_on=['last_name_clean', 'state_clean'])
    #dupe_indexer.block(left_on=['zip_clean', 'phone_clean'])
    dupe_candidate_links = dupe_indexer.index(df1)
    print("total candidate links:", len(dupe_candidate_links))

    # split the pair index into chunks for iteration
    # (renamed from `s` to avoid shadowing the sample-fraction parameter)
    chunks = rl.index_split(dupe_candidate_links, 20)
    for chunk in chunks:
        compare_dupes = rl.Compare(n_jobs=8)
        compare_dupes.string('first_name_clean',
                             'first_name_clean',
                             method='jarowinkler',
                             threshold=0.92,
                             label='first_name_cl')
        compare_dupes.string('last_name_clean',
                             'last_name_clean',
                             method='jarowinkler',
                             threshold=0.85,
                             label='last_name_cl')
        compare_dupes.string('email',
                             'email',
                             method='jarowinkler',
                             threshold=0.90,
                             label='email_cl')
        compare_dupes.string('address_clean',
                             'address_clean',
                             method='jarowinkler',
                             threshold=0.7,
                             label='address_cl')
        compare_dupes.string('city_clean',
                             'city_clean',
                             method='jarowinkler',
                             threshold=0.85,
                             label='city_cl')
        compare_dupes.string('state_clean',
                             'state_clean',
                             method='jarowinkler',
                             threshold=0.85,
                             label='state_cl')
        compare_dupes.exact('zip_clean', 'zip_clean', label='zip_cl')
        compare_dupes.string('phone_clean',
                             'phone_clean',
                             method='jarowinkler',
                             threshold=0.95,
                             label='phone_cl')

        # create the deduped feature set - this takes a while...
        dupe_features = compare_dupes.compute(chunk, df1)

        # select the features that match.
        # Business rule: if any 3 of email, address, city, state, zip, or
        # phone match, code the pair as a "duplicate"

        # keep rows with more than 2 matched comparison columns
        pdm = dupe_features[dupe_features.sum(axis=1) > 2].reset_index()

        # sum the comparison columns into a single "score" column
        pdm['score'] = pdm.loc[:, 'email_cl':'phone_cl'].sum(axis=1)

        # flag a pair as a duplicate when the email matches, or when a first
        # or last name matches together with a high score or a phone + zip match
        pdm['duplicate'] = np.where(
            (pdm['email_cl'] == 1)
            | (((pdm['first_name_cl'] == 1) | (pdm['last_name_cl'] == 1))
               & ((pdm['score'] > 3) | (pdm['phone_cl'] + pdm['zip_cl'] == 2))),
            1, 0)

        ne = pdm[pdm['duplicate'] == 1]  # keep only the flagged duplicates
        ne = ne.sort_values(by=['score'], ascending=False)  # highest scores first

        matches = ne[[
            'customer_id_1', 'customer_id_2', 'email_cl', 'phone_cl',
            'address_cl', 'city_cl', 'state_cl', 'zip_cl', 'score', 'duplicate'
        ]].sort_values(by=['score'], ascending=False)
        all_matches.append(matches)
        count_pass += 1
        elapsed = time.process_time() - start
        print("pass", count_pass, "-", format(elapsed, '.3f'), "seconds")
    all_matches = pd.concat(all_matches, ignore_index=True)

    return df1, all_matches, dupe_emails
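A hypothetical driver for the function above, assuming a frame with the columns the code references. The records here are made up, and a real run would use a much larger dataset; with only a handful of rows, the 20-way chunking yields mostly empty chunks.

import pandas as pd

# toy customer records with the columns match_data_chunk expects
customers = pd.DataFrame({
    'customer_id': [1, 2, 3],
    'first_name_clean': ['ann', 'ann', 'bob'],
    'last_name_clean': ['lee', 'lee', 'ray'],
    'email': ['ann@x.com', None, 'bob@x.com'],
    'address_clean': ['1 main st', '1 main st', '9 oak ave'],
    'city_clean': ['austin', 'austin', 'dallas'],
    'state_clean': ['tx', 'tx', 'tx'],
    'zip_clean': ['78701', '78701', '75201'],
    'phone_clean': ['5551234', '5551234', '5550000'],
})

# keep every row (s=1.0) and look for duplicate pairs
df1, all_matches, dupe_emails = match_data_chunk(customers, s=1.0)
print(all_matches)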