def linkDB(df1, df2, type, classifier):
    # 1 - INDEXING
    indexer = recordlinkage.Index()
    if type == "sortedneighbourhood":
        indexer.sortedneighbourhood(left_on="0_restaurant",
                                    right_on="1_restaurant")
    elif type == "full":
        indexer.full()
    elif type == "block":
        indexer.block(left_on="0_addressGoogle", right_on="1_addressGoogle")
    candidate_links = indexer.index(df1, df2)
    test_pairs = candidate_links[0:100]

    # https://recordlinkage.readthedocs.io/en/latest/annotation.html
    """
    df1.columns = df1.columns.str.replace(r'0_', '')
    df2.columns = df2.columns.str.replace(r'1_', '')
    recordlinkage.write_annotation_file("check_matches.json",
                                        candidate_links[0:100],
                                        df1,
                                        df2,
                                        dataset_a_name="firstDF",
                                        dataset_b_name="secondDF")
    df1 = df1.add_prefix('0_')
    df2 = df2.add_prefix('1_')
    """
    annotations = recordlinkage.read_annotation_file('result.json')

    # 2 - COMPARISON
    comp = recordlinkage.Compare()
    comp.string('0_restaurant', '1_restaurant', threshold=0.95,
                method='jarowinkler', label='ristorante')
    comp.string('0_neighborhood', '1_neighborhood', method='jarowinkler',
                threshold=0.85, label='quartiere')
    comp.exact('0_addressGoogle', '1_addressGoogle', label='indirizzoGoogle')
    features = comp.compute(candidate_links, df1, df2)
    test_features = comp.compute(test_pairs, df1, df2)

    # 3 - CLASSIFICATION
    # https://recordlinkage.readthedocs.io/en/latest/ref-classifiers.html#unsupervised
    matches = []
    drop1 = []
    drop2 = []
    if classifier == "ecm":
        ecm = recordlinkage.ECMClassifier(init='jaro', binarize=None,
                                          max_iter=100, atol=0.0001,
                                          use_col_names=True)
        ecm.fit_predict(features, match_index=None)  # Train the classifier
        e_matches = ecm.predict(features)
        for i, j in e_matches:
            if i not in drop1:
                drop1.append(i)
            if j not in drop2:
                drop2.append(j)
            record_1 = df1.loc[i]
            record_2 = df2.loc[j]
            record = tuple(record_1) + tuple(record_2)
            matches.append(record)
    elif classifier == "kmeans":
        kmeans = recordlinkage.KMeansClassifier()
        kmeans.fit_predict(features)
        k_matches = kmeans.predict(features)
        for i, j in k_matches:
            if i not in drop1:
                drop1.append(i)
            if j not in drop2:
                drop2.append(j)
            record_1 = df1.loc[i]
            record_2 = df2.loc[j]
            record = tuple(record_1) + tuple(record_2)
            matches.append(record)

    head = tuple(df1.head()) + tuple(df2.head())
    matches_result = pd.DataFrame(matches)
    matches_result.columns = head
    df1t = df1.drop(drop1, axis=0)
    df2t = df2.drop(drop2, axis=0)
    result = df1t.append([df2t, matches_result])
    result.index = range(result.shape[0])

    # 4 - EVALUATION
    if classifier == "ecm":
        test_matches = ecm.predict(test_features)
        cm = recordlinkage.confusion_matrix(annotations.links, test_matches,
                                            total=100)
        acc = recordlinkage.accuracy(annotations.links, test_matches,
                                     total=100)
    elif classifier == "kmeans":
        test_matches = kmeans.fit_predict(test_features)
        cm = recordlinkage.confusion_matrix(annotations.links, test_matches,
                                            total=100)
        acc = recordlinkage.accuracy(annotations.links, test_matches,
                                     total=100)
    print(cm, acc)
    return result
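# Hypothetical usage sketch (not part of the original script). linkDB assumes the two frames
# already carry the '0_'/'1_' column prefixes used by the comparison step, and that
# 'result.json' holds previously annotated test pairs for the evaluation block.
#
#   df1 = firstDFgenerator().add_prefix('0_')
#   df2 = secondDFgenerator().add_prefix('1_')
#   integrated = linkDB(df1, df2, type="block", classifier="ecm")
#   integrated = linkDB(df1, df2, type="sortedneighbourhood", classifier="kmeans")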
def firstDFgenerator():
    fb = pd.read_csv("./data/restaurants/gbr_splitted_google/FoodBuzz.csv",
                     sep=";")
    fb_cols = [
        'restaurant', 'neighborhood', 'address', 'addressGoogle', 'country',
        'country_code', 'type', 'cost'
    ]
    fb = fb[fb_cols]
    nym = pd.read_csv("./data/restaurants/gbr_splitted_google/NYMag.csv",
                      sep=";")
    nym.rename(columns={"address": "neighborhood"}, inplace=True)
    nyt = pd.read_csv("./data/restaurants/gbr_splitted_google/NYTimes.csv",
                      sep=";")
    ot = pd.read_csv("./data/restaurants/gbr_splitted_google/OpenTable.csv",
                     sep=";")
    sc = pd.read_csv("./data/restaurants/gbr_splitted_google/SavoryCities.csv",
                     sep=";")
    sc_cols = ['restaurant', 'neighborhood', 'address', 'addressGoogle']
    sc = sc[sc_cols]
    df = fb.append([nym, nyt, ot, sc])
    df.drop_duplicates(subset=["restaurant", "neighborhood"],
                       keep="first",
                       inplace=True)
    df.index = range(df.shape[0])

    # DEDUPLICATION
    indexer = recordlinkage.Index()
    indexer.sortedneighbourhood(left_on=['restaurant'])
    candidate_links = indexer.index(df)
    compare_cl = recordlinkage.Compare()
    compare_cl.string('restaurant', 'restaurant', method='jarowinkler',
                      label='ristorante')
    compare_cl.string('neighborhood', 'neighborhood', method='jarowinkler',
                      label='quartiere')
    features = compare_cl.compute(candidate_links, df)
    potential_matches = features[features.sum(axis=1) > 1.75].reset_index()
    potential_matches['Score'] = potential_matches.loc[:, [
        'ristorante', 'quartiere'
    ]].sum(axis=1)

    # MATCHES
    account_merge = potential_matches.merge(df, left_on="level_0",
                                            right_index=True)
    final_merge = account_merge.merge(df, left_on="level_1", right_index=True)
    final_merge.dropna(axis='columns', inplace=True,
                       how='all')  # drop only columns that are entirely NaN
    final_merge.drop(columns=['ristorante', 'quartiere', 'Score'],
                     inplace=True)
    final_list = []
    all_indexes = []
    for i, row in final_merge.iterrows():
        all_indexes.append(row.level_0)
        all_indexes.append(row.level_1)
        restaurant = row.restaurant_x
        neighborhood = row.neighborhood_x  # to be double-checked
        if row.addressGoogle_x == row.addressGoogle_y:
            addressGoogle = str(row.addressGoogle_x)
        elif (row.addressGoogle_x != row.addressGoogle_x):  # it is NaN
            addressGoogle = str(row.addressGoogle_y)
        else:
            addressGoogle = str(row.addressGoogle_x)
        address = row.address_y
        country = row.country_y
        type_r = row.type_y
        cost = row.cost_y
        country_code = row.country_code_y
        new_row = (restaurant, neighborhood, address, addressGoogle, country,
                   country_code, type_r, cost)
        final_list.append(new_row)
    clean_merge = pd.DataFrame(final_list)
    clean_merge.columns = [
        "restaurant", "neighborhood", "address", "addressGoogle", "country",
        "country_code", "type_r", "cost"
    ]
    df.drop(index=all_indexes, inplace=True, axis=0)
    df = df.append(clean_merge)
    df.index = range(df.shape[0])
    return df
        print("#True Positive: " + str(true_pos) + " #FP " + str(false_pos))
        print("#True Negative: " + str(true_neg) + " #FN " + str(false_neg))
        torch.save(model.state_dict(), "checkpoint.pt")
        epoch += 1


if __name__ == "__main__":
    args = parse_arguments()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    df1 = pd.read_csv(args.source1, args.separator)
    df2 = pd.read_csv(args.source2, args.separator)
    map = pd.read_csv(args.mapping, args.separator)
    indexer = recordlinkage.Index()
    indexer.sortedneighbourhood(args.blocking_attr, window=args.blocking_size)
    candidate_links = shuffle(indexer.index(df1, df2))
    train_set = candidate_links[:int(3 * len(candidate_links) / 4)]
    test_set = candidate_links[int(3 * len(candidate_links) / 4):]
    model = NLP(args.word_embed, args.word_embed_size, args.n_attrs,
                device).to(device)
    print(model)
    if args.load_model != '':
        model.load_state_dict(torch.load(args.load_model))
    train(model, df1, df2, map, train_set, test_set)
# df2 = pd.read_excel('Textbook projectSummerFall2021-2.xlsx',
#                     sheet_name='FallBookstoreList')
df2.head()

# %%
# method 1
matched_results = fuzzymatcher.fuzzy_left_join(df1, df2, 'Title', 'Long Title')

# %%
matched_results[['best_match_score', 'Title', 'Long Title',
                 'Internal ID']].to_excel('fuzzymatcherresults_min_spring.xlsx')
matched_results.to_excel('fuzzymatcherresults_full_spring.xlsx')

# %%
# method 2
import recordlinkage as rl
from recordlinkage.index import Full

# %%
indexer = rl.Index()
indexer.add(Full())
pairs = indexer.index(df1, df2)
print(len(pairs))

comparer = rl.Compare()
comparer.string('Title', 'Long Title', threshold=0.55, label='Title')
potential_matches = comparer.compute(pairs, df1, df2)
matches = potential_matches[potential_matches.sum(axis=1) > 0].reset_index()
# print(matches)
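# %%
# Hypothetical follow-up cell (assumption: df1 and df2 keep their default RangeIndex, so the
# reset_index() above produced 'level_0'/'level_1' pair columns; the output file name is
# illustrative). It pulls the matched rows from both frames side by side for manual review.
#
#   matched_rows = (matches
#                   .merge(df1, left_on='level_0', right_index=True)
#                   .merge(df2, left_on='level_1', right_index=True))
#   matched_rows.to_excel('recordlinkage_results_spring.xlsx')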
def secondDFgenerator():
    to = pd.read_csv("./data/restaurants/gbr_splitted_google/TimeOut.csv",
                     sep=";")
    temp_to = to['in'].map(str) + ' ' + to['address1'].map(str) + '-' + \
        to['address2'].map(str) + to['address3'].map(str)
    temp_to = temp_to.str.replace("nan", "")
    to.drop(columns=['in', 'address1', 'address2', 'address3'],
            inplace=True,
            axis=1)
    to['address'] = temp_to
    to['country'] = 'New York'
    to_cols = [
        'restaurant', 'address', 'addressGoogle', 'country', 'neighborhood',
        'type', 'phone'
    ]
    to = to[to_cols]
    ad = pd.read_csv("./data/restaurants/gbr_splitted_google/ActiveDiner.csv",
                     sep=";")
    ad_cols = ['restaurant', 'address', 'addressGoogle', 'country']
    ad = ad[ad_cols]
    dg = pd.read_csv("./data/restaurants/gbr_splitted_google/DiningGuide.csv",
                     sep=";")
    mp = pd.read_csv("./data/restaurants/gbr_splitted_google/MenuPages.csv",
                     sep=";")
    temp_mp = mp['address1'].map(str) + '-' + mp['address2'].map(str)
    temp_mp = temp_mp.str.replace("nan", "")
    mp.drop(columns=['address1', 'address2'], inplace=True, axis=1)
    mp['address'] = temp_mp
    mp.rename(columns={"addressGoogle1": "addressGoogle"}, inplace=True)
    mp_cols = ['restaurant', 'address', 'addressGoogle']
    mp = mp[mp_cols]
    ts = pd.read_csv("./data/restaurants/gbr_splitted_google/TasteSpace.csv",
                     sep=";")
    ts_cols = ['restaurant', 'address', 'addressGoogle', 'country']
    ts = ts[ts_cols]
    vv = pd.read_csv("./data/restaurants/gbr_splitted_google/VillageVoice.csv",
                     sep=";")
    vv['country'] = 'New York'
    vv_cols = [
        'restaurant', 'address', 'addressGoogle', 'country', 'neighborhood'
    ]
    vv = vv[vv_cols]
    df = to.append([ad, dg, mp, ts, vv])
    df.drop_duplicates(subset=["restaurant", "addressGoogle"],
                       keep="first",
                       inplace=True)
    df.index = range(df.shape[0])

    # DEDUPLICATION
    indexer = recordlinkage.Index()
    indexer.sortedneighbourhood(left_on=['restaurant'])
    candidate_links = indexer.index(df)
    compare_cl = recordlinkage.Compare()
    compare_cl.string('restaurant', 'restaurant', method='jarowinkler',
                      label='ristorante')
    compare_cl.exact('addressGoogle', 'addressGoogle', label='indirizzo')
    features = compare_cl.compute(candidate_links, df)
    potential_matches = features[features.sum(axis=1) > 1.75].reset_index()
    potential_matches['Score'] = potential_matches.loc[:, [
        'ristorante', 'indirizzo'
    ]].sum(axis=1)

    # MATCHES
    account_merge = potential_matches.merge(df, left_on="level_0",
                                            right_index=True)  # , how='outer'
    final_merge = account_merge.merge(df, left_on="level_1", right_index=True)
    final_merge.dropna(axis='columns', inplace=True,
                       how='all')  # drop only columns that are entirely NaN
    final_merge.to_csv(
        './data/restaurants_integrated/output_recordlinkage/final_merge.csv',
        header=True,
        sep=";",
        decimal=',',
        float_format='%.3f',
        index=False)
    final_merge.drop(columns=['ristorante', 'indirizzo', 'Score'],
                     inplace=True)  # I know they are 1 1 2
    final_list = []
    all_indexes = []
    for i, row in final_merge.iterrows():
        all_indexes.append(row.level_0)
        all_indexes.append(row.level_1)
        restaurant = row.restaurant_x
        address = row.address_x
        country = "New York"
        if row.type_x == row.type_y:
            type_r = str(row.type_x)
        elif (row.type_x != row.type_x):
            type_r = str(row.type_y)
        else:
            type_r = str(row.type_x)
        if row.phone_x == row.phone_y:
            phone = str(row.phone_x)
        elif (row.phone_x != row.phone_x):
            phone = str(row.phone_y)
        else:
            phone = str(row.phone_x)
        if row.neighborhood_x == row.neighborhood_y:
            neighborhood = str(row.neighborhood_x)
        elif (row.neighborhood_x != row.neighborhood_x):
            neighborhood = str(row.neighborhood_y)
        else:
            neighborhood = str(row.neighborhood_x)
        if row.addressGoogle_x == row.addressGoogle_y:
            addressGoogle = str(row.addressGoogle_x)
        elif (row.addressGoogle_x != row.addressGoogle_x):  # it is NaN
            addressGoogle = str(row.addressGoogle_y)
        else:
            addressGoogle = str(row.addressGoogle_x)
        new_row = (restaurant, address, addressGoogle, country, neighborhood,
                   type_r, phone)
        final_list.append(new_row)
    clean_merge = pd.DataFrame(final_list)
    clean_merge.columns = [
        "restaurant", "address", "addressGoogle", "country", "neighborhood",
        "type_r", "phone"
    ]
    df.drop(index=all_indexes, inplace=True, axis=0)
    df = df.append(clean_merge)
    df.index = range(df.shape[0])
    return df
def main():
    dfA, dfB = load_febrl4()

    indexer = recordlinkage.Index()
    indexer.full()
    pairs = indexer.index(dfA, dfB)
    return
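# A hedged continuation sketch (not in the original file): it extends the minimal main() above
# with a comparison step and a simple sum-of-similarities decision rule instead of a trained
# classifier. The column names are the standard FEBRL 4 fields shipped with recordlinkage;
# the blocking key and the cut-off of 4 are illustrative choices.
import recordlinkage
from recordlinkage.datasets import load_febrl4


def main_with_comparison():
    dfA, dfB = load_febrl4()

    # Block on given_name to keep the candidate set far smaller than the full index.
    indexer = recordlinkage.Index()
    indexer.block('given_name')
    pairs = indexer.index(dfA, dfB)

    comp = recordlinkage.Compare()
    comp.exact('date_of_birth', 'date_of_birth', label='date_of_birth')
    comp.string('surname', 'surname', method='jarowinkler', threshold=0.85,
                label='surname')
    comp.string('address_1', 'address_1', threshold=0.85, label='address_1')
    comp.exact('suburb', 'suburb', label='suburb')
    comp.exact('state', 'state', label='state')
    features = comp.compute(pairs, dfA, dfB)

    # Keep pairs that agree on at least 4 of the 5 compared fields.
    return features[features.sum(axis=1) >= 4]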
def record_linkage(df_protect, df_init, block_left, block_right, gen_col):
    indexer = recordlinkage.Index()
    if block_left is None:
        indexer.full()
    if block_left is not None:
        indexer.block(left_on=block_left, right_on=block_right)
    candidates = indexer.index(df_protect, df_init)
    # print(len(candidates))

    compare = recordlinkage.Compare()
    if all(df_protect[gen_col].dtypes == object):
        compare.string('height', 'height', threshold=0.9, label='height')
        compare.string('weight', 'weight', threshold=0.9, label='weight')
        # compare.string('born', 'born', threshold=0.9, label='born')
        compare.string('Year', 'Year', threshold=0.9, label='Year')
        compare.string('Age', 'Age', threshold=0.9, label='Age')
        compare.string('NBA_Years', 'NBA_Years', threshold=0.9,
                       label='NBA_Years')
        if block_left == 'birth_city':
            compare.string('born', 'born', threshold=0.9, label='born')
    if not all(df_protect[gen_col].dtypes == object):
        compare.numeric('height', 'height', label='height')
        compare.numeric('weight', 'weight', label='weight')
        compare.numeric('Year', 'Year', label='Year')
        compare.numeric('Age', 'Age', label='Age')
        compare.numeric('NBA_Years', 'NBA_Years', label='NBA_Years')
        if block_left == 'born':
            compare.string('birth_city', 'birth_city', threshold=0.9,
                           label='birth_city')
    if all(df_protect[['year_start', 'year_end']].dtypes == float):
        compare.numeric('year_start', 'year_start', label='year_start')
        compare.numeric('year_end', 'year_end', label='year_end')
    if not all(df_protect[['year_start', 'year_end']].dtypes == float):
        compare.string('year_start', 'year_start', threshold=0.9,
                       label='year_start')
        compare.string('year_end', 'year_end', threshold=0.9,
                       label='year_end')
    compare.string('college', 'college', threshold=0.9, label='college')
    compare.string('birth_state', 'birth_state', threshold=0.9,
                   label='birth_state')
    compare.numeric('PER', 'PER', label='PER')
    compare.numeric('G', 'G', label='G')
    compare.numeric('GS', 'GS', label='GS')
    compare.numeric('MP', 'MP', label='MP')
    compare.numeric('ORB%', 'ORB%', label='ORB%')
    compare.numeric('DRB%', 'DRB%', label='DRB%')
    compare.numeric('TRB%', 'TRB%', label='TRB%')
    compare.numeric('ORB', 'ORB', label='ORB')
    compare.numeric('DRB', 'DRB', label='DRB')
    compare.numeric('TRB', 'TRB', label='TRB')
    compare.numeric('PTS', 'PTS', label='PTS')

    return compare.compute(candidates, df_protect, df_init)
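# Hypothetical usage sketch (assumption: df_protect is a protected/transformed release of the
# players table and df_init the original, sharing the compared column names; the blocking key
# and the acceptance rule are illustrative).
#
#   features = record_linkage(df_protect, df_init,
#                             block_left='born', block_right='born',
#                             gen_col=['height', 'weight', 'Year', 'Age', 'NBA_Years'])
#   linked = features[features.sum(axis=1) >= 0.8 * features.shape[1]]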
def init(self):
    e = xml.etree.ElementTree.parse(config.CORA_XML_PATH).getroot()
    logger.info("Sample Record from CORA dataset: ")
    logger.info(xml.etree.ElementTree.tostring(e.find('NEWREFERENCE')))

    self.data = {}
    dataA = []
    dataB = []
    testA = []
    testB = []
    valA = []
    valB = []
    db_choice = 0
    for record in e.findall('NEWREFERENCE'):
        dni = re.search('[a-z]+[0-9]+[a-z]*', record.text)
        dni = re.search('[a-z]+', record.text) if not dni else dni
        dni = dni.group() if dni else record.text
        if dni in self.data:
            self.data[dni].append(record)
        else:
            self.data[dni] = [record]

        # db_choice = random.randint(0, 3)
        if db_choice == 0 or db_choice == 4 or db_choice == 8:
            dataA.append(record)
        elif db_choice == 1 or db_choice == 5 or db_choice == 9:
            dataB.append(record)
        elif db_choice == 2 or db_choice == 6:
            testA.append(record)
        elif db_choice == 3 or db_choice == 7:
            testB.append(record)
        elif db_choice == 10:
            valA.append(record)
        elif db_choice == 11:
            valB.append(record)
        db_choice = (db_choice + 1) % 12

    logger.info("Size of Training Dataset A %d and B %d", len(dataA), len(dataB))
    logger.info("Size of Testing Dataset A %d and B %d", len(testA), len(testB))
    logger.info("Size of Validation Dataset A %d and B %d", len(valA), len(valB))

    df_a = {'dni': [], 'author': [], 'publisher': [], 'date': [],
            'title': [], 'journal': [], 'volume': [], 'pages': [],
            'address': [], 'id': []}
    df_b = copy.deepcopy(df_a)
    tdf_a = copy.deepcopy(df_a)
    tdf_b = copy.deepcopy(df_a)
    vdf_a = copy.deepcopy(df_a)
    vdf_b = copy.deepcopy(df_a)
    for (df, dataset) in [(df_a, dataA), (df_b, dataB), (tdf_a, testA),
                          (tdf_b, testB), (vdf_a, valA), (vdf_b, valB)]:
        for record in dataset:
            dni = re.search('[a-z]+[0-9]+[a-z]*', record.text)
            dni = re.search('[a-z]+', record.text) if not dni else dni
            df['dni'].append(dni.group() if dni else record.text)
            df['author'].append(unicode(record.find('author').text if record.find('author') is not None else '', 'utf-8'))
            df['title'].append(unicode(record.find('title').text if record.find('title') is not None else u''))
            df['publisher'].append(unicode(record.find('publisher').text if record.find('publisher') is not None else ''))
            df['date'].append(unicode(record.find('date').text if record.find('date') is not None else ''))
            df['journal'].append(unicode(record.find('journal').text if record.find('journal') is not None else ''))
            df['volume'].append(unicode(record.find('volume').text if record.find('volume') is not None else ''))
            df['pages'].append(unicode(record.find('pages').text if record.find('pages') is not None else ''))
            df['address'].append(unicode(record.find('address').text if record.find('address') is not None else ''))
            df['id'].append(record.get('id'))

    self.trainDataA = pd.DataFrame(data=df_a)
    self.trainDataB = pd.DataFrame(data=df_b)
    self.testDataA = pd.DataFrame(data=tdf_a)
    self.testDataB = pd.DataFrame(data=tdf_b)
    self.valDataA = pd.DataFrame(data=vdf_a)
    self.valDataB = pd.DataFrame(data=vdf_b)

    # Extract all possible pairs for training
    indexer = recordlinkage.Index()
    indexer.full()
    self.candidate_links = indexer.index(self.trainDataA, self.trainDataB)
    logger.info("No. of Candidate Pairs %d", (len(self.candidate_links)))

    # Extract true links (takes time...)
    true_links = []
    for indexA, indexB in self.candidate_links:
        if df_a['dni'][indexA] == df_b['dni'][indexB] and df_a['dni'][indexA]:
            true_links.append((indexA, indexB))
    logger.info("Number of true links: %d", len(true_links))
    self.true_links = pd.MultiIndex.from_tuples(true_links)

    # Extract all possible pairs for test
    indexer = recordlinkage.Index()
    indexer.full()
    self.test_links = indexer.index(self.testDataA, self.testDataB)
    logger.info("Number Candidate Pairs for testing: %d", (len(self.test_links)))

    # Extract test true links (takes time...)
    true_test_links = []
    for indexA, indexB in self.test_links:
        if tdf_a['dni'][indexA] == tdf_b['dni'][indexB]:
            true_test_links.append((indexA, indexB))
    logger.info("Number of true links for testing: %d", len(true_test_links))
    self.true_test_links = pd.MultiIndex.from_tuples(true_test_links)

    # Extract all possible pairs for validation
    indexer = recordlinkage.Index()
    indexer.full()
    self.val_links = indexer.index(self.valDataA, self.valDataB)
    logger.info("Number Candidate Pairs for validation: %d", (len(self.val_links)))

    # Extract validation true links (takes time...)
    true_val_links = []
    for indexA, indexB in self.val_links:
        if vdf_a['dni'][indexA] == vdf_b['dni'][indexB]:
            true_val_links.append((indexA, indexB))
    logger.info("Number of true links for validation: %d", len(true_val_links))
    self.true_val_links = pd.MultiIndex.from_tuples(true_val_links)
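# A hedged sketch (not part of the original class) of how these pair sets could be turned into
# supervised training data with recordlinkage's own Compare step; the original code feeds the
# raw records to its own model instead, so the field choices and similarity methods below are
# illustrative.
#
#   comp = recordlinkage.Compare()
#   comp.string('title', 'title', method='jarowinkler', label='title')
#   comp.string('author', 'author', method='jarowinkler', label='author')
#   comp.string('journal', 'journal', method='jarowinkler', label='journal')
#   comp.exact('date', 'date', label='date')
#   train_features = comp.compute(self.candidate_links, self.trainDataA, self.trainDataB)
#   train_labels = train_features.index.isin(self.true_links)  # True for annotated matches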
def incremental_match(df_n, df_o):
    indexer = rl.Index()
    indexer.block(left_on=[
        'first_name_clean', 'last_name_clean', 'zip_clean', 'state_clean'
    ])
    indexer.block(left_on=['first_name_clean', 'last_name_clean'])
    indexer.block(left_on=['zip_clean', 'first_name_clean'])
    indexer.block(left_on=['zip_clean', 'last_name_clean'])
    indexer.block(left_on=['zip_clean', 'state_clean'])
    indexer.block(left_on=['last_name_clean', 'state_clean'])
    indexer.block(left_on=['zip_clean', 'phone_clean'])
    candidate_links = indexer.index(df_n, df_o)

    compare_cl = rl.Compare(n_jobs=8)
    compare_cl.string('first_name_clean', 'first_name_clean',
                      method='jarowinkler', threshold=0.92,
                      label='first_name_cl')
    compare_cl.string('last_name_clean', 'last_name_clean',
                      method='jarowinkler', threshold=0.85,
                      label='last_name_cl')
    compare_cl.string('email', 'email', method='jarowinkler', threshold=0.90,
                      label='email_cl')
    compare_cl.string('address_clean', 'address_clean', method='jarowinkler',
                      threshold=0.70, label='address_cl')
    compare_cl.string('city_clean', 'city_clean', method='jarowinkler',
                      threshold=0.85, label='city_cl')
    compare_cl.string('state_clean', 'state_clean', method='jarowinkler',
                      threshold=0.85, label='state_cl')
    compare_cl.exact('zip_clean', 'zip_clean', label='zip_cl')
    compare_cl.string('phone_clean', 'phone_clean', method='jarowinkler',
                      threshold=0.95, label='phone_cl')

    # create the deduped feature set - this takes a while...
    cl_features = compare_cl.compute(candidate_links, df_n, df_o)
    cl_summary = cl_features.sum(axis=1).value_counts().sort_index(
        ascending=False)

    # limit the dataset to rows that had more than 2 matched columns
    df_match = cl_features[cl_features.sum(axis=1) > 2].reset_index()
    # sum the comparison columns into a "score" column
    df_match['score'] = df_match.loc[:, 'email_cl':'phone_cl'].sum(axis=1)

    # indicator for the duplicate rule (1=yes, 0=no): match if email matches, or if
    # (first or last name matches) and (score > 3 or phone and zip both match)
    df_match['duplicate'] = np.where(
        (df_match['email_cl'] == 1) |
        (((df_match['first_name_cl'] == 1) | (df_match['last_name_cl'] == 1)) &
         ((df_match['score'] > 3) |
          (df_match['phone_cl'] + df_match['zip_cl'] == 2))), 1, 0)

    ne = df_match[df_match['duplicate'] == 1]  # filter out non-matching rows
    ne = ne.sort_values(by=['score'],
                        ascending=False)  # sort the results by the score column
    return ne
def match_data_chunk(df, s):
    count_pass = 0
    start = time.process_time()
    all_matches = []

    # keep rows whose email is null or whose non-null email is not duplicated
    df0 = df[df['email'].isnull()
             | ~df[df['email'].notnull()].duplicated(subset='email',
                                                     keep=False)]
    mask = df.email.duplicated(keep=False)
    d = df[mask]
    dupe_emails = d[['customer_id', 'email']].dropna(subset=['email'])

    df1 = df0.sample(frac=s, replace=False, random_state=42)
    df1.set_index('customer_id', inplace=True)

    dupe_indexer = rl.Index()
    # set the blocking system
    dupe_indexer.block(left_on=[
        'first_name_clean', 'last_name_clean', 'zip_clean', 'state_clean'
    ])
    dupe_indexer.block(left_on=['first_name_clean', 'last_name_clean'])
    dupe_indexer.block(left_on=['zip_clean', 'first_name_clean'])
    dupe_indexer.block(left_on=['zip_clean', 'last_name_clean'])
    # dupe_indexer.block(left_on=['zip_clean', 'state_clean'])
    dupe_indexer.block(left_on=['last_name_clean', 'state_clean'])
    # dupe_indexer.block(left_on=['zip_clean', 'phone_clean'])
    dupe_candidate_links = dupe_indexer.index(df1)
    print("total candidate links:", len(dupe_candidate_links))

    # split the pair index for iteration
    chunks = rl.index_split(dupe_candidate_links, 20)
    for chunk in chunks:
        compare_dupes = rl.Compare(n_jobs=8)
        compare_dupes.string('first_name_clean', 'first_name_clean',
                             method='jarowinkler', threshold=0.92,
                             label='first_name_cl')
        compare_dupes.string('last_name_clean', 'last_name_clean',
                             method='jarowinkler', threshold=0.85,
                             label='last_name_cl')
        compare_dupes.string('email', 'email', method='jarowinkler',
                             threshold=0.90, label='email_cl')
        compare_dupes.string('address_clean', 'address_clean',
                             method='jarowinkler', threshold=0.7,
                             label='address_cl')
        compare_dupes.string('city_clean', 'city_clean', method='jarowinkler',
                             threshold=0.85, label='city_cl')
        compare_dupes.string('state_clean', 'state_clean',
                             method='jarowinkler', threshold=0.85,
                             label='state_cl')
        compare_dupes.exact('zip_clean', 'zip_clean', label='zip_cl')
        compare_dupes.string('phone_clean', 'phone_clean',
                             method='jarowinkler', threshold=0.95,
                             label='phone_cl')

        # create the deduped feature set - this takes a while...
        dupe_features = compare_dupes.compute(chunk, df1)

        # select those features that match
        # Business rule: if any 3 of email, address, city, state, zip, or phone
        # match, then code it as a "duplicate"
        # limit the dataset to rows that had more than 2 matched columns
        pdm = dupe_features[dupe_features.sum(axis=1) > 2].reset_index()
        # sum the comparison columns into a "score" column
        pdm['score'] = pdm.loc[:, 'email_cl':'phone_cl'].sum(axis=1)
        # indicator for the duplicate rule (1=yes, 0=no): match if email matches, or if
        # (first or last name matches) and (score > 3 or phone and zip both match)
        pdm['duplicate'] = np.where(
            (pdm['email_cl'] == 1) |
            (((pdm['first_name_cl'] == 1) | (pdm['last_name_cl'] == 1)) &
             ((pdm['score'] > 3) |
              (pdm['phone_cl'] + pdm['zip_cl'] == 2))), 1, 0)

        ne = pdm[pdm['duplicate'] == 1]  # filter out non-matching rows
        ne = ne.sort_values(by=['score'],
                            ascending=False)  # sort the results by the score column
        matches = ne[[
            'customer_id_1', 'customer_id_2', 'email_cl', 'phone_cl',
            'address_cl', 'city_cl', 'state_cl', 'zip_cl', 'score', 'duplicate'
        ]].sort_values(by=['score'], ascending=False)
        all_matches.append(matches)

        count_pass = count_pass + 1
        elapsed = time.process_time() - start
        print(count_pass, format(elapsed, '.3f'), ": seconds")

    all_matches = pd.concat(all_matches, ignore_index=True)
    return df1, all_matches, dupe_emails
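# Hypothetical consolidation step (not in the original function): for every matched pair keep
# customer_id_1 and treat customer_id_2 as the redundant record. The input name and the
# survivorship rule are assumptions; real pipelines often keep the most recent or most complete
# record instead.
#
#   df1, all_matches, dupe_emails = match_data_chunk(customers, 0.5)
#   to_drop = set(all_matches['customer_id_2']) - set(all_matches['customer_id_1'])
#   deduped = df1.drop(index=[cid for cid in to_drop if cid in df1.index])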
def eval_lightgbm_dedupe(dataset_type: str, dataset_rf, dataset_dedupe):
    # Create deduper model
    with open(
            '../../trained_models/combined_2/trained_{}_settings.json'.format(
                dataset_type), 'rb') as fin:
        deduper = dedupe.StaticDedupe(fin, num_cores=NUM_CORES)

    cols = ['name', 'brand', 'size', 'product_type']
    to_dedupe = dataset_dedupe[cols]
    to_dedupe_dict = to_dedupe.to_dict(orient='index')

    # Cluster (prediction stage)
    clustered_dupes = deduper.partition(to_dedupe_dict,
                                        partition_threshold[dataset_type])
    print('# duplicate sets', len(clustered_dupes))

    # Create the record linkage model
    # Indexer
    indexer = rl.Index()
    indexer.add(rl.index.Block('product_type'))
    indexer.add(rl.index.Block('brand'))
    indexer.add(rl.index.Block('size'))
    candidate_links = indexer.index(dataset_rf)

    # Comparing
    compare_cl = rl.Compare(n_jobs=-1)
    compare_cl.exact('brand', 'brand')
    compare_cl.exact('product_type', 'product_type')
    compare_cl.exact('size', 'size')
    compare_cl.string('name', 'name', method='qgram')
    compare_cl.string('name', 'name', method='damerau_levenshtein')
    compare_cl.string('name', 'name', method='levenshtein')
    compare_cl.string('name', 'name', method='jarowinkler')
    compare_cl.string('name', 'name', method='smith_waterman')
    compare_cl.string('name', 'name', method='lcs')
    # compare_cl.string('price', 'price')

    # Features
    features = compare_cl.compute(candidate_links, dataset_rf)

    # Add dedupe features
    features['xy_same_entity'] = pd.Series(np.zeros(len(features)))
    features.xy_same_entity = 0.0

    # Write the dedupe cluster confidences into the feature matrix
    for el in clustered_dupes:
        for i in range(len(el[0])):
            for j in range(i + 1, len(el[0])):
                k = (el[0][i], el[0][j])
                r_k = (el[0][j], el[0][i])
                p = el[1][i] * el[1][j]
                if k in features.index:
                    features.loc[k, 'xy_same_entity'] = p
                if r_k in features.index:
                    features.loc[r_k, 'xy_same_entity'] = p

    # Now load the lightgbm model
    bst = lgb.Booster(
        model_file='../../trained_models/combined_2/x4_lgb_classifier.txt')

    # Predict
    confs = bst.predict(features)
    features['label'] = confs

    # Export the left and right instance ids of the predicted matches
    res = []
    for i in range(len(confs)):
        record = features.iloc[i]
        label = record.label
        if label > 0.5:
            res.append(record.name)
    res_df = pd.DataFrame(res)
    res_df.columns = ['left_instance_id', 'right_instance_id']
    return res_df
def detect_duplicates(df_in):
    '''
    Detect and drop duplicate rows caused by fuzzy data (e.g. typos) in a
    df_patient dataframe.

    Parameters
    ----------
    df_in : DataFrame
        df_patient dataframe to clean from duplicates.
        We assume that the input dataframe has the following columns:
        given_name, surname, street_number, address_1, suburb, postcode,
        state, date_of_birth, age, phone_number, address_2

    Returns
    -------
    df : DataFrame
        The clean df_patient dataframe with the duplicate rows removed.
    '''
    # Copy the input dataframe
    df = df_in.copy()

    # Indexation step
    # The index is built from some columns to make possible pairs of duplicates.
    # We choose the columns:
    # - address_1: because it has potentially the most characters
    # - phone_number: it has only digits
    # - surname
    dupe_indexer = recordlinkage.Index()
    dupe_indexer.sortedneighbourhood(left_on='address_1', window=3)
    dupe_indexer.sortedneighbourhood(left_on='phone_number', window=3)
    dupe_indexer.sortedneighbourhood(left_on='surname', window=3)
    dupe_candidate_links = dupe_indexer.index(df)

    # Comparison step
    compare_dupes = recordlinkage.Compare()
    # We add below all the columns that we want to compare for duplicates
    # (all of them except patient_id)
    compare_dupes.string('given_name', 'given_name', method='jarowinkler',
                         threshold=0.85, label='given_name')
    compare_dupes.string('surname', 'surname', method='jarowinkler',
                         threshold=0.85, label='surname')
    compare_dupes.exact('street_number', 'street_number',
                        label='street_number')
    compare_dupes.string('address_1', 'address_1', threshold=0.85,
                         label='address_1')
    compare_dupes.string('suburb', 'suburb', threshold=0.85, label='suburb')
    compare_dupes.exact('postcode', 'postcode', label='postcode')
    compare_dupes.exact('state', 'state', label='state')
    compare_dupes.exact('date_of_birth', 'date_of_birth',
                        label='date_of_birth')
    compare_dupes.exact('age', 'age', label='age')
    compare_dupes.string('phone_number', 'phone_number', threshold=0.85,
                         label='phone_number')
    compare_dupes.exact('address_2', 'address_2', label='address_2')
    dupes_features = compare_dupes.compute(dupe_candidate_links, df)

    # dupes_features is a dataframe with a 2-level row index that represents a pair
    # of rows (row1, row2) and one column per compared attribute. Each value is a
    # score of 0. or 1.:
    # 0 for one column means the row1 and row2 values for this column are not duplicates,
    # 1 means they are duplicates.
    # We set the threshold score for potential duplicate rows at 5.0: if 5 or more
    # column values are duplicates, then the full row is a duplicate.
    # Items with score > 4 are duplicates.
    # potential_dupes contains all the row pairs above the threshold plus an
    # additional Score column.
    potential_dupes = dupes_features[dupes_features.sum(
        axis=1) > 4].reset_index()
    potential_dupes['Score'] = potential_dupes.loc[:,
                                                   'given_name':'address_2'].sum(axis=1)

    # As we ran reset_index(), the potential_dupes dataframe now has additional
    # columns level_0 and level_1. The level_1 column contains the index of the
    # duplicates to remove, which we do with the instruction below.
    df = df.drop(axis=0,
                 index=potential_dupes['level_1']).reset_index(drop=True)

    # compute percentage, then print percentage and number of rows removed
    print("Percentage of duplicate rows: ",
          100 * (len(df_in) - len(df)) / len(df_in))
    print("Number of rows removed: ", len(df_in) - len(df))
    return df
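# Minimal usage sketch (hypothetical file name; the real df_patient comes from the surrounding
# pipeline and must contain the columns listed in the docstring above).
#
#   df_patient = pd.read_csv('patients.csv', dtype=str)
#   df_patient_clean = detect_duplicates(df_patient)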