def test_multiindex_split():
    """index_split must partition a MultiIndex into equal, contiguous chunks."""
    index = pd.MultiIndex.from_product([np.arange(5), np.arange(6)])
    result = index_split(index, 3)
    assert len(result) == 3
    for i, result_index_chunk in enumerate(result):
        # 5 * 6 = 30 labels / 3 chunks -> 10 rows per chunk, original order.
        expected_index_chunk = index[i * 10:(i + 1) * 10]
        ptm.assert_index_equal(result_index_chunk, expected_index_chunk)
        assert len(result_index_chunk.levels) == 2
        # FIX: MultiIndex.labels was renamed to .codes in pandas 0.24 and
        # removed in pandas 1.0; fall back so the test runs on either version.
        codes = getattr(result_index_chunk, 'codes',
                        getattr(result_index_chunk, 'labels', None))
        assert len(codes) == 2
def test_multiindex_split():
    """Splitting a 30-element MultiIndex into 3 parts yields 10-row chunks."""
    index = pd.MultiIndex.from_product([np.arange(5), np.arange(6)])
    chunks = index_split(index, 3)
    assert len(chunks) == 3
    chunk_size = 10
    for pos, chunk in enumerate(chunks):
        # Each chunk must equal the corresponding contiguous slice.
        expected = index[pos * chunk_size:(pos + 1) * chunk_size]
        pdt.assert_index_equal(chunk, expected)
        assert len(chunk.levels) == 2
        # pandas 0.24 renamed MultiIndex.labels to .codes.
        if is_min_pandas_version("0.24.0"):
            assert len(chunk.codes) == 2
        else:
            assert len(chunk.labels) == 2
def match_data_chunk(df, s):
    """Find likely duplicate customers via blocking + fuzzy field comparison.

    Parameters
    ----------
    df : pandas.DataFrame
        Customer records; must contain 'customer_id', 'email' and the
        *_clean columns referenced below.
    s : float
        Fraction of the de-duplicated frame to sample (passed to
        ``DataFrame.sample`` as ``frac``).

    Returns
    -------
    tuple of (df1, all_matches, dupe_emails)
        df1 : the sampled frame indexed by customer_id.
        all_matches : concatenated match pairs with per-field flags and score.
        dupe_emails : customer_id/email rows whose non-null email repeats.
    """
    count_pass = 0
    start = time.process_time()
    all_matches = []

    # Keep rows whose email is null OR not shared with another row.
    # (Equivalent to the original mask, but without the index-misaligning
    # sub-selection before .duplicated().)
    dup_nonnull = df['email'].notnull() & df.duplicated(subset='email',
                                                        keep=False)
    df0 = df[~dup_nonnull]

    # Rows sharing an email, reported back to the caller for inspection.
    mask = df.email.duplicated(keep=False)
    d = df[mask]
    dupe_emails = d[['customer_id', 'email']].dropna(subset=['email'])

    df1 = df0.sample(frac=s, replace=False, random_state=42)
    df1.set_index('customer_id', inplace=True)

    # Blocking: only record pairs agreeing on at least one of these key sets
    # become candidate links (keeps the pair count tractable).
    dupe_indexer = rl.Index()
    dupe_indexer.block(left_on=['first_name_clean', 'last_name_clean',
                                'zip_clean', 'state_clean'])
    dupe_indexer.block(left_on=['first_name_clean', 'last_name_clean'])
    dupe_indexer.block(left_on=['zip_clean', 'first_name_clean'])
    dupe_indexer.block(left_on=['zip_clean', 'last_name_clean'])
    dupe_indexer.block(left_on=['last_name_clean', 'state_clean'])
    dupe_candidate_links = dupe_indexer.index(df1)
    print("total candidate links:", len(dupe_candidate_links))

    # Split the pair index so each compute() call stays memory-bounded.
    # FIX: was assigned to `s`, shadowing the sample-fraction parameter.
    chunks = rl.index_split(dupe_candidate_links, 20)

    # FIX: the comparator configuration is loop-invariant — build it once
    # instead of rebuilding it for every chunk.
    compare_dupes = rl.Compare(n_jobs=8)
    compare_dupes.string('first_name_clean', 'first_name_clean',
                         method='jarowinkler', threshold=0.92,
                         label='first_name_cl')
    compare_dupes.string('last_name_clean', 'last_name_clean',
                         method='jarowinkler', threshold=0.85,
                         label='last_name_cl')
    compare_dupes.string('email', 'email',
                         method='jarowinkler', threshold=0.90,
                         label='email_cl')
    compare_dupes.string('address_clean', 'address_clean',
                         method='jarowinkler', threshold=0.7,
                         label='address_cl')
    compare_dupes.string('city_clean', 'city_clean',
                         method='jarowinkler', threshold=0.85,
                         label='city_cl')
    compare_dupes.string('state_clean', 'state_clean',
                         method='jarowinkler', threshold=0.85,
                         label='state_cl')
    compare_dupes.exact('zip_clean', 'zip_clean', label='zip_cl')
    compare_dupes.string('phone_clean', 'phone_clean',
                         method='jarowinkler', threshold=0.95,
                         label='phone_cl')

    for chunk in chunks:
        # Compute per-field similarity flags for this chunk of candidate
        # pairs — this takes a while.
        dupe_features = compare_dupes.compute(chunk, df1)

        # Business rule: more than 2 matched columns -> candidate pair.
        pdm = dupe_features[dupe_features.sum(axis=1) > 2].reset_index()
        # Sum the flag columns into a single "score" per pair.
        pdm['score'] = pdm.loc[:, 'email_cl':'phone_cl'].sum(axis=1)

        # FIX: the original condition had a precedence bug —
        # `pdm['first_name_cl'] == 1 | (...)` evaluates `1 | (...)` first
        # (bitwise | binds tighter than ==), silently dropping the
        # last-name clause. Parenthesized to the intended rule: email
        # match, OR a (first- or last-)name match backed by a strong
        # secondary signal.
        pdm['duplicate'] = np.where(
            (pdm['email_cl'] == 1)
            | (
                ((pdm['first_name_cl'] == 1) | (pdm['last_name_cl'] == 1))
                & ((pdm['score'] > 3)
                   | (pdm['phone_cl'] + pdm['zip_cl'] == 2))
            ),
            1, 0)

        # Keep flagged pairs only, highest score first.  (The original also
        # called ne.sort_values(...) without assigning the result — a no-op
        # removed here; the selection below already sorts.)
        matches = pdm[pdm['duplicate'] == 1][[
            'customer_id_1', 'customer_id_2', 'email_cl', 'phone_cl',
            'address_cl', 'city_cl', 'state_cl', 'zip_cl', 'score',
            'duplicate'
        ]].sort_values(by=['score'], ascending=False)
        all_matches.append(matches)

        count_pass += 1
        elapsed = time.process_time() - start
        print(count_pass, format(elapsed, '.3f'), ": seconds")

    all_matches = pd.concat(all_matches, ignore_index=True)
    return df1, all_matches, dupe_emails