def compute_all(A, B, idA, idB, keyA, keyB, merged, iters):
    """Time each join variant `iters` times and return per-iteration timings.

    Assumes the module-level tokenizers WS (whitespace) and TWO_GRAM (2-gram).
    """
    ke, jkws, jk2g, jws, j2g = [], [], [], [], []
    for _ in range(iters):
        # Edit-distance join on the key attributes.
        start = time.time()
        ssj.edit_distance_join(A, B, idA, idB, keyA, keyB, 30, n_jobs=-1)
        ke.append(time.time() - start)

        # Jaccard join on the key attributes, whitespace tokens.
        start = time.time()
        ssj.jaccard_join(A, B, idA, idB, keyA, keyB, WS, 0.1, n_jobs=-1)
        jkws.append(time.time() - start)

        # Jaccard join on the key attributes, 2-gram tokens.
        start = time.time()
        ssj.jaccard_join(A, B, idA, idB, keyA, keyB, TWO_GRAM, 0.1, n_jobs=-1)
        jk2g.append(time.time() - start)

        # Jaccard join on the merged-attribute column, whitespace tokens.
        start = time.time()
        ssj.jaccard_join(A, B, idA, idB, merged, merged, WS, 0.1, n_jobs=-1)
        jws.append(time.time() - start)

        # Jaccard join on the merged-attribute column, 2-gram tokens.
        start = time.time()
        ssj.jaccard_join(A, B, idA, idB, merged, merged, TWO_GRAM, 0.1,
                         n_jobs=-1)
        j2g.append(time.time() - start)
    return ke, jkws, jk2g, jws, j2g
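# A minimal sketch of the module-level setup the timing functions in this file
# assume. The exact tokenizer configurations are assumptions (return_set=True
# is the usual choice for set-based joins); the original definitions are not
# shown here.
import time
from collections import defaultdict

import numpy as np
import pandas as pd
import py_stringmatching as sm
import py_stringsimjoin as ssj

WS = sm.WhitespaceTokenizer(return_set=True)           # assumed definition
TWO_GRAM = sm.QgramTokenizer(qval=2, return_set=True)  # assumed definition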
def apply_tables(self, ltable, rtable, l_key_attr, r_key_attr,
                 l_match_attr, r_match_attr, n_jobs=1):
    # Dispatch to the py_stringsimjoin join matching the configured
    # similarity measure; edit distance is the only one without a tokenizer.
    if self.sim_measure_type == 'JACCARD':
        return ssj.jaccard_join(ltable, rtable, l_key_attr, r_key_attr,
                                l_match_attr, r_match_attr, self.tokenizer,
                                self.threshold, comp_op=self.comp_op,
                                n_jobs=n_jobs)
    elif self.sim_measure_type == 'COSINE':
        return ssj.cosine_join(ltable, rtable, l_key_attr, r_key_attr,
                               l_match_attr, r_match_attr, self.tokenizer,
                               self.threshold, comp_op=self.comp_op,
                               n_jobs=n_jobs)
    elif self.sim_measure_type == 'DICE':
        return ssj.dice_join(ltable, rtable, l_key_attr, r_key_attr,
                             l_match_attr, r_match_attr, self.tokenizer,
                             self.threshold, comp_op=self.comp_op,
                             n_jobs=n_jobs)
    elif self.sim_measure_type == 'EDIT_DISTANCE':
        return ssj.edit_distance_join(ltable, rtable, l_key_attr, r_key_attr,
                                      l_match_attr, r_match_attr,
                                      self.threshold, comp_op=self.comp_op,
                                      n_jobs=n_jobs)
    elif self.sim_measure_type == 'OVERLAP':
        return ssj.overlap_join(ltable, rtable, l_key_attr, r_key_attr,
                                l_match_attr, r_match_attr, self.tokenizer,
                                self.threshold, comp_op=self.comp_op,
                                n_jobs=n_jobs)
    elif self.sim_measure_type == 'OVERLAP_COEFFICIENT':
        return ssj.overlap_coefficient_join(ltable, rtable,
                                            l_key_attr, r_key_attr,
                                            l_match_attr, r_match_attr,
                                            self.tokenizer, self.threshold,
                                            comp_op=self.comp_op,
                                            n_jobs=n_jobs)
    # Fail loudly instead of silently returning None for unknown measures.
    raise ValueError(
        'unknown sim_measure_type: {}'.format(self.sim_measure_type))
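# A minimal, self-contained usage sketch for apply_tables. The host class is
# not shown in this file, so the _Matcher stub below (its name and
# constructor) is an assumption made only for illustration; the dispatch
# logic is the method defined above.
class _Matcher:
    def __init__(self, sim_measure_type, tokenizer, threshold, comp_op='>='):
        self.sim_measure_type = sim_measure_type
        self.tokenizer = tokenizer
        self.threshold = threshold
        self.comp_op = comp_op

    # Reuse the module-level function above as a bound method.
    apply_tables = apply_tables


def _demo_apply_tables():
    A = pd.DataFrame({'id': [0, 1], 'name': ['apple inc', 'orange llc']})
    B = pd.DataFrame({'id': [0, 1], 'name': ['apple incorporated', 'pear co']})
    m = _Matcher('JACCARD', sm.WhitespaceTokenizer(return_set=True), 0.3)
    return m.apply_tables(A, B, 'id', 'id', 'name', 'name')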
def filter_person(self, df, lim):
    # Explode the ';'-separated ENHANCEDPERSONS field into one lowercased
    # name per row, encoding the original row index in the generated id.
    A = df.ENHANCEDPERSONS.dropna().apply(lambda x: list(
        set([y.split(',')[0].lower() for y in x.split(';')])))
    A = A.apply(pd.Series).stack()
    A.index = A.index.map(lambda i: "{}_{}".format(i[0], i[1]))
    A = A.reset_index()
    A.columns = ['id', 'name']

    B = self.fetch_person()
    # regex=True keeps the '_+' pattern behavior on pandas >= 2.0, where
    # str.replace defaults to literal matching.
    B['name'] = B['name'].str.replace(r'_+', ' ', regex=True).str.lower()

    # Block on 3-gram Jaccard similarity above `lim`.
    qg3_tok = sm.QgramTokenizer(qval=3)
    C = ssj.jaccard_join(A, B, 'id', 'id', 'name', 'name', qg3_tok, lim,
                         l_out_attrs=['name'], r_out_attrs=['name'],
                         show_progress=False)
    # Recover the original df row ids from the synthetic "<row>_<pos>" ids.
    return set(C.l_id.apply(lambda x: int(x.split("_")[0])))
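# A toy sketch of the explode idiom used in filter_person above (data is
# illustrative only): apply(pd.Series) spreads each list across columns, and
# stack() pivots those columns into a MultiIndexed Series of single values.
def _demo_explode():
    s = pd.Series(['Ada Lovelace,x;Alan Turing,y', 'Grace Hopper,z'])
    names = s.apply(lambda x: [y.split(',')[0].lower() for y in x.split(';')])
    flat = names.apply(pd.Series).stack()  # index: (orig_row, position)
    flat.index = flat.index.map(lambda i: "{}_{}".format(i[0], i[1]))
    out = flat.reset_index()
    out.columns = ['id', 'name']
    return out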
def setUp(self):
    self.df = read_data(path_big_ten)
    self.trigramtok = sm.QgramTokenizer(qval=3)
    # Self-join blocking: keep pairs whose 3-gram Jaccard score is >= 0.3.
    self.blocked_pairs = ssj.jaccard_join(self.df, self.df, 'id', 'id',
                                          'name', 'name', self.trigramtok,
                                          0.3)
    self.jaccsim = sm.Jaccard()
    self.sim_scores = get_sim_scores(self.df, self.blocked_pairs,
                                     self.trigramtok, self.jaccsim)
def setUp(self):
    self.df = read_data(path_big_ten)
    self.trigramtok = sm.QgramTokenizer(qval=3)
    # Self-join blocking: keep pairs whose 3-gram Jaccard score is >= 0.3.
    self.blocked_pairs = ssj.jaccard_join(self.df, self.df, 'id', 'id',
                                          'name', 'name', self.trigramtok,
                                          0.3)
    self.jaccsim = sm.Jaccard()
    self.sim_scores = get_sim_scores(self.df, self.blocked_pairs,
                                     self.trigramtok, self.jaccsim)
    self.sim_matrix = get_sim_matrix(self.df, self.sim_scores)
    # Cluster the pairwise matrix into five groups with complete linkage.
    self.aggcl = AgglomerativeClustering(n_clusters=5,
                                         affinity='precomputed',
                                         linkage='complete')
    self.labels = self.aggcl.fit_predict(self.sim_matrix)
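# Note: scikit-learn's AgglomerativeClustering with affinity='precomputed'
# expects a *distance* matrix in fit_predict. If get_sim_matrix (not shown in
# this file) returns raw similarities in [0, 1], a conversion sketch would be:
#
#     dist_matrix = 1.0 - self.sim_matrix
#     self.labels = self.aggcl.fit_predict(dist_matrix)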
def get_similar_strings(table_path):
    """Get the list of strings to be normalized by the value normalizer.

    The current algorithm is as follows:
    1. Deduplicate the given list of strings.
    2. Apply a string similarity join.
    3. Retrieve the top N similar strings returned by the similarity join.

    Arguments:
        table_path: The absolute path of the list of strings.

    Returns:
        similar_strings: Similar strings as returned by the algorithm above.

    Note:
        This logic can be changed to improve the value normalizer part of
        the overall application.
    """
    A = pd.read_csv(table_path)
    B = pd.read_csv(table_path)
    qg3_tok = sm.QgramTokenizer(qval=3)
    # Self-join on the 'foo' column with 3-gram Jaccard >= 0.6.
    output_pairs = ssj.jaccard_join(A, B, 'id', 'id', 'foo', 'foo', qg3_tok,
                                    0.6, l_out_attrs=['foo'],
                                    r_out_attrs=['foo'])
    similar_strings = []
    for index, row in output_pairs.iterrows():
        # Skip exact matches (score 1.0); collect near-duplicates only.
        if 0.6 < row['_sim_score'] < 1.0:
            if row['l_foo'] not in similar_strings:
                similar_strings.append(row['l_foo'])
            if row['r_foo'] not in similar_strings:
                similar_strings.append(row['r_foo'])
        if len(similar_strings) >= 21:
            break
    similar_strings.sort()
    return similar_strings
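# A usage sketch for get_similar_strings, writing a toy CSV that matches the
# hardcoded 'id'/'foo' schema the function expects (the path and values are
# illustrative only).
def _demo_get_similar_strings(tmp_path='/tmp/foo_strings.csv'):
    pd.DataFrame({'id': [0, 1, 2],
                  'foo': ['jonathan', 'jonathon', 'rebecca']}).to_csv(
                      tmp_path, index=False)
    return get_similar_strings(tmp_path)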
def jaccard_similarity(self, dataset, threshold):
    df = add_key_reindex(dataset)
    # Concatenate all columns into one string per row, with '*' as the
    # separator, then remove spaces.
    A = dataset.applymap(str)
    A = A.apply(lambda x: '*'.join(x.values.tolist()), axis=1)
    A = A.astype(str)
    A = A.str.replace(" ", "")
    df['row'] = A

    ssj.profile_table_for_join(df)
    ws = sm.WhitespaceTokenizer(return_set=True)
    # Self-join to find near-duplicate rows.
    output_pairs = ssj.jaccard_join(df, df, 'New_ID', 'New_ID', 'row', 'row',
                                    ws, threshold,
                                    l_out_attrs=['row'], r_out_attrs=['row'],
                                    n_jobs=-1)
    # Any pair with distinct ids marks a duplicate; drop the right-side rows.
    dup = output_pairs[output_pairs['l_New_ID'] != output_pairs['r_New_ID']]
    dataset = df[~df['New_ID'].isin(dup['r_New_ID'])]
    # Reassign instead of dropping inplace to avoid mutating a slice of df.
    dataset = dataset.drop(["New_ID", "row"], axis=1)
    print("Number of duplicate rows removed:", len(set(dup['r_New_ID'])))
    return dataset
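# A standalone sketch of the dedup idea in jaccard_similarity (toy data;
# add_key_reindex is not shown in this file and is assumed to attach a unique
# 'New_ID' column, which the demo does inline).
def _demo_dedup():
    df = pd.DataFrame({'name': ['Ann Lee', 'AnnLee', 'Bob Ray'],
                       'city': ['NYC', 'NYC', 'LA']})
    df['New_ID'] = range(len(df))
    df['row'] = df[['name', 'city']].astype(str).apply(
        lambda x: '*'.join(x), axis=1).str.replace(" ", "")
    ws = sm.WhitespaceTokenizer(return_set=True)
    pairs = ssj.jaccard_join(df, df, 'New_ID', 'New_ID', 'row', 'row',
                             ws, 0.5, n_jobs=1)
    dup = pairs[pairs['l_New_ID'] != pairs['r_New_ID']]
    deduped = df[~df['New_ID'].isin(dup['r_New_ID'])]
    return deduped.drop(['New_ID', 'row'], axis=1)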
def compute_joins(A, B, idA, idB, keyA, keyB, merged, compute=5):
    """Run the join variants selected by `compute` and return their outputs.

    Assumes the module-level WS and TWO_GRAM tokenizers. The original
    compute == 5 and compute == 3 branches were identical, so they are
    merged here; `results` is initialized so unknown values of `compute`
    no longer raise a NameError.
    """
    results = {}
    if compute in (5, 3):
        results = {
            'key_edit': ssj.edit_distance_join(A, B, idA, idB, keyA, keyB,
                                               30, n_jobs=-5),
            'key_jac_ws': ssj.jaccard_join(A, B, idA, idB, keyA, keyB,
                                           WS, 0.1, n_jobs=-5),
            'key_jac_2g': ssj.jaccard_join(A, B, idA, idB, keyA, keyB,
                                           TWO_GRAM, 0.1, n_jobs=-5),
            'mer_jac_ws': ssj.jaccard_join(A, B, idA, idB, merged, merged,
                                           WS, 0.1, n_jobs=-5),
            'mer_jac_2g': ssj.jaccard_join(A, B, idA, idB, merged, merged,
                                           TWO_GRAM, 0.1, n_jobs=-5),
        }
    elif compute == 2:
        results = {
            'key_jac_ws': ssj.jaccard_join(A, B, idA, idB, keyA, keyB,
                                           WS, 0.1, n_jobs=-5),
            'key_jac_2g': ssj.jaccard_join(A, B, idA, idB, keyA, keyB,
                                           TWO_GRAM, 0.1, n_jobs=-5),
        }
    elif compute == 1:
        results = {
            'key_edit': ssj.edit_distance_join(A, B, idA, idB, keyA, keyB,
                                               30, n_jobs=-5),
        }
    elif compute == 0:
        results = {
            'key_jac_2g': ssj.jaccard_join(A, B, idA, idB, keyA, keyB,
                                           TWO_GRAM, 0.1, n_jobs=-5),
        }
    return results
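# A usage sketch for compute_joins on toy tables (all values are illustrative;
# real callers pass the pickled tables used elsewhere in this file).
def _demo_compute_joins():
    A = pd.DataFrame({'id': [0, 1], 'title': ['the matrix', 'heat'],
                      'merged_all': ['the matrix 1999', 'heat 1995']})
    B = pd.DataFrame({'id': [0, 1], 'title': ['matrix, the', 'heat'],
                      'merged_all': ['matrix, the 1999', 'heat 1995']})
    joins = compute_joins(A, B, 'id', 'id', 'title', 'title', 'merged_all',
                          compute=2)
    return {name: len(result) for name, result in joins.items()}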
def time_all(iters=25):
    timings = defaultdict(list)
    join_types = ['key_edit', 'key_jac_ws', 'key_jac_2g',
                  'mer_jac_ws', 'mer_jac_2g']
    data_base = '/lfs/1/sahaana/enrichment/data'

    def record(name, results):
        # Append one timing row per join type and echo it as we go.
        for j in range(len(join_types)):
            timings['data'].append(name)
            timings['type'].append(join_types[j])
            timings['results'].append(results[j])
            timings['avg'].append(np.mean(results[j]))
            print(timings['data'][-1], timings['type'][-1],
                  timings['results'][-1], timings['avg'][-1])
        print()

    # Fuzzy Join
    A = pd.read_pickle(
        data_base + "/main_fuzzy/test_tableA_processed.pkl").reset_index()
    B = pd.read_pickle(
        data_base + "/main_fuzzy/test_tableB_processed.pkl").reset_index()
    # Supervision is loaded for parity with other scripts but unused below.
    supervision = pd.read_pickle(
        data_base + "/main_fuzzy/supervision_test.pkl").set_index('FUZZY_ID')
    results = compute_all(A, B, 'FUZZY_ID', 'tconst', 'primaryTitle',
                          'primaryTitle', 'merged_all', iters)
    print('FJ')
    record('main_fuzzy', results)

    # IMDB_wiki
    A = pd.read_pickle(
        data_base + "/imdb_wiki/dev_tableA_processed.pkl").reset_index()
    B = pd.read_pickle(
        data_base + "/imdb_wiki/dev_tableB_processed.pkl").reset_index()
    supervision = pd.read_pickle(
        data_base + "/imdb_wiki/supervision_test.pkl").set_index('IMDB_ID')
    results = compute_all(A, B, 'IMDB_ID', 'QID', 'primaryTitle', 'title',
                          'merged_all', iters)
    print('imdb_wiki')
    record('imdb_wiki', results)

    # SQuAD
    A = pd.read_pickle(
        data_base + "/SQuAD/dev_tableA_processed.pkl").reset_index()
    B = pd.read_pickle(
        data_base + "/SQuAD/dev_tableB_sent_processed.pkl").reset_index()
    supervision = pd.read_pickle(
        data_base + "/SQuAD/dev_sent_labels.pkl").set_index('QID')
    results = compute_all(A, B, 'QID', 'SID', 'question', 'sentence',
                          'merged_all', iters)
    print('SQUAD')
    record('squad', results)

    # Deepmatcher benchmarks (key 3 is absent in the original mapping).
    dm_data = {0: "joined_abt_buy_exp_data",
               1: "joined_amazon_google_exp_data",
               2: "joined_beer_exp_data",
               4: "joined_dblp_acm_exp_data",
               5: "joined_dblp_scholar_exp_data",
               6: "joined_dirty_dblp_acm_exp_data",
               7: "joined_dirty_dblp_scholar_exp_data",
               8: "joined_dirty_itunes_amazon_exp_data",
               9: "joined_dirty_walmart_amazon_exp_data",
               10: "joined_fodors_zagat_exp_data",
               11: "joined_itunes_amazon_exp_data",
               12: "joined_walmart_amazon_exp_data",
               30: "joined_company_exp_data"}
    dm_keys = {0: "name", 1: "title", 2: "Beer_Name", 4: "title", 5: "title",
               6: "title", 7: "title", 8: "Song_Name", 9: "title",
               10: "name", 11: "Song_Name", 12: "title", 30: "content"}
    for i in dm_data:
        print()
        print(dm_data[i])
        A = pd.read_pickle(
            data_base +
            f"/dm_blocked/{dm_data[i]}/tableA_processed.pkl").reset_index()
        B = pd.read_pickle(
            data_base +
            f"/dm_blocked/{dm_data[i]}/tableB_processed.pkl").reset_index()
        supervision = pd.read_pickle(
            data_base +
            f"/dm_blocked/{dm_data[i]}/supervision_test.pkl"
        ).set_index('ltable_id')
        results = compute_all(A, B, 'id', 'id', dm_keys[i], dm_keys[i],
                              'merged_all', iters)
        print(dm_data[i])
        record(dm_data[i], results)

    # MS MARCO: joins are run per query against its top-1000 passages.
    data_base = '/lfs/1/sahaana/enrichment/data/MSMARCO/'
    A = pd.read_pickle(data_base + 'dev_tableA_processed.pkl')
    B = pd.read_pickle(data_base + 'tableB_processed.pkl')
    supervision = pd.read_pickle(
        data_base + 'supervision_test.pkl').set_index('QID')
    tk = pd.read_csv(data_base + 'top1000.dev', sep='\t',
                     names=['QID', 'PID', 'Query', 'Passage'])
    grouped_tk = tk.groupby('QID').agg(list)
    grouped_tk['Query'] = grouped_tk['Query'].apply(lambda x: x[0])
    grouped_tk = grouped_tk.reset_index()

    ke, jkws, jk2g, jws, j2g = [], [], [], [], []
    for _ in range(iters):
        tt = 0
        for idx, row in grouped_tk.iterrows():
            df_l = pd.DataFrame({'QID': [row.QID], 'Query': [row.Query]})
            df_r = pd.DataFrame({'PID': row.PID, 'Passage': row.Passage})
            start = time.time()
            scores = ssj.edit_distance_join(df_l, df_r, 'QID', 'PID',
                                            'Query', 'Passage', 100,
                                            l_out_attrs=['Query'],
                                            r_out_attrs=['Passage'],
                                            n_jobs=-3)
            tt += time.time() - start
        ke.append(tt)

        tt = 0
        for idx, row in grouped_tk.iterrows():
            df_l = pd.DataFrame({'QID': [row.QID], 'Query': [row.Query]})
            df_r = pd.DataFrame({'PID': row.PID, 'Passage': row.Passage})
            start = time.time()
            scores = ssj.jaccard_join(df_l, df_r, 'QID', 'PID',
                                      'Query', 'Passage', WS, 0.05,
                                      l_out_attrs=['Query'],
                                      r_out_attrs=['Passage'], n_jobs=-3)
            tt += time.time() - start
        jkws.append(tt)
        # MS MARCO has no separate merged attribute, so the key-attribute
        # timings are reused for the merged variants.
        jws.append(jkws[-1])

        tt = 0
        for idx, row in grouped_tk.iterrows():
            df_l = pd.DataFrame({'QID': [row.QID], 'Query': [row.Query]})
            df_r = pd.DataFrame({'PID': row.PID, 'Passage': row.Passage})
            start = time.time()
            scores = ssj.jaccard_join(df_l, df_r, 'QID', 'PID',
                                      'Query', 'Passage', TWO_GRAM, 0.05,
                                      l_out_attrs=['Query'],
                                      r_out_attrs=['Passage'], n_jobs=-3)
            tt += time.time() - start
        jk2g.append(tt)
        j2g.append(jk2g[-1])

    print('marco')
    record('marco', (ke, jkws, jk2g, jws, j2g))

    timings = pd.DataFrame(timings)
    timings.to_pickle(
        f'/lfs/1/sahaana/enrichment/ember/postprocessing/'
        f'simjoin-iters_{iters}.pkl')
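# A usage sketch for reading the pickled timings back for analysis (the path
# mirrors the hardcoded output path above; iters=25 is the default):
#
# timings = pd.read_pickle(
#     '/lfs/1/sahaana/enrichment/ember/postprocessing/simjoin-iters_25.pkl')
# print(timings.groupby(['data', 'type'])['avg'].mean())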