def dump_paper_pairs(self, paper_pairs, wfname):
    paper_pairs_refactor = []
    for pair in paper_pairs:
        cpaper, npaper = pair
        pair_refactor = {'c': cpaper, 'n': npaper}
        paper_pairs_refactor.append(pair_refactor)
    data_utils.dump_json(paper_pairs_refactor, self.pairs_dir, wfname)
def filter_hard_aff_pairs():
    # "mag_aminer_1(人工标注).xls" is the manually labeled MAG-AMiner affiliation file
    df = pd.read_excel(join(settings.AFF_DATA_DIR, "mag_aminer_1(人工标注).xls"))
    # print(df)
    pairs = []
    for index, row in df.iterrows():
        mag_aff2 = row["mag_NormalizedName"].lower()
        aminer_aff2 = row["aminer_main_body"].lower()
        label = row["label"]
        common = set(aminer_aff2.split()).intersection(mag_aff2.split())
        if len(common) > 0:  # keep hard pairs: the two affiliation strings share at least one word
            print(mag_aff2, "---", aminer_aff2, label)
            mag_aff1 = row["mag_DisplayName"].lower()
            aminer_aff1 = row["aminer_org_name"].lower()
            mag_id = row["mag_id"]
            aminer_id = row["aminer_id"]
            cur_dict = {
                "mag_affi": {
                    "id": mag_id,
                    "DisplayName": mag_aff1,
                    "NormalizedName": mag_aff2
                },
                "aminer_affi": {
                    "id": aminer_id,
                    "name": aminer_aff1,
                    "main_body": aminer_aff2
                },
                "label": label,
                "label_zfj": ""
            }
            pairs.append(cur_dict)
    print("n_pairs", len(pairs))
    data_utils.dump_json(pairs, settings.AFF_DATA_DIR, "mag_aminer_hard_correct_zfj.json")
def main():
    names = load_test_names()  # load the list of test author names
    ans = {}
    wf = codecs.open(join(settings.OUT_DIR, 'local_clustering_results.csv'), 'w', encoding='utf-8')  # result file
    wf.write('name,n_pubs,n_clusters,precision,recall,f1\n')  # name, #papers, #clusters, precision, recall, f1
    metrics = np.zeros(3)
    cnt = 0
    for name in names:  # iterate over author names
        cur_metric, num_nodes, n_clusters, ans[name] = gae_for_na(name)  # metrics [prec, rec, f1], #papers, #clusters
        if cur_metric is None:
            continue
        wf.write('{0},{1},{2},{3:.5f},{4:.5f},{5:.5f}\n'.format(  # write per-name results
            name, num_nodes, n_clusters, cur_metric[0], cur_metric[1], cur_metric[2]))
        wf.flush()
        for i, m in enumerate(cur_metric):  # accumulate metrics for macro averaging
            metrics[i] += m
        cnt += 1
        macro_prec = metrics[0] / cnt
        macro_rec = metrics[1] / cnt
        macro_f1 = cal_f1(macro_prec, macro_rec)
        print('average until now', [macro_prec, macro_rec, macro_f1])  # running macro metrics up to the current name
        time_acc = time.time() - start_time
        print(cnt, 'names', time_acc, 'avg time', time_acc / cnt)  # elapsed time and average time per name
    macro_prec = metrics[0] / cnt
    macro_rec = metrics[1] / cnt
    macro_f1 = cal_f1(macro_prec, macro_rec)
    wf.write('average,,,{0:.5f},{1:.5f},{2:.5f}\n'.format(
        macro_prec, macro_rec, macro_f1))  # final macro metrics
    wf.close()
    dump_json(ans, settings.OUT_DIR, 'local_clustering_results.json', True)
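# `cal_f1` above is defined elsewhere in this repo and is assumed to combine the
# macro precision and recall in the standard way. A minimal sketch of such a
# helper (the name `cal_f1_sketch` is illustrative, not part of the original code):
def cal_f1_sketch(prec, rec):
    # guard against division by zero when both precision and recall are 0
    if prec + rec == 0:
        return 0.0
    return 2 * prec * rec / (prec + rec)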
def build_inverted_index(self, fold):
    print('build inverted index for cpapers: fold', fold)
    fname = 'clean-papers-test-{}.dat'.format(fold)
    papers = data_utils.load_json_lines(self.paper_dir, fname)
    word2ids = dd(list)
    for paper in papers:
        pid = str(paper['id'])
        title = paper['title']
        words = data_utils.get_words(title, window=self.ii_window)
        for word in words:
            word2ids[word].append(pid)
    for word in word2ids:
        word2ids[word] = list(set(word2ids[word]))
    data_utils.dump_json(word2ids, self.inverted_index_dir, 'clean-papers-test-ii-{}.json'.format(fold))
    print('complete building II')
    return word2ids
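# Hypothetical usage sketch (names `noisy_title` and `find_candidates_sketch` are
# illustrative, not part of the original code): since the inverted index maps each
# title word to the clean-paper ids containing it, candidate matches for a noisy
# paper title can be gathered by unioning the posting lists of its words.
def find_candidates_sketch(word2ids, noisy_title, window=2):
    candidate_pids = set()
    for word in data_utils.get_words(noisy_title, window=window):
        candidate_pids.update(word2ids.get(word, []))
    return candidate_pids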
def filter_venue_dataset():
    train_data = json.load(
        open(join(settings.VENUE_DATA_DIR, "train_copy_zfj.txt")))
    ddd = {}
    train_data_new = []
    for pair in train_data:
        cur = sorted(pair[1:])
        cur_key = "&&&".join(cur)
        if cur_key in ddd:
            print(cur_key)
        else:
            ddd[cur_key] = 1
            train_data_new.append(pair)
    print("size after filtering", len(train_data_new))
    data_utils.dump_json(train_data_new, settings.VENUE_DATA_DIR, "train_filter.txt")
def filter_aff_neg_pairs():
    neg_pairs = data_utils.load_json(settings.AFF_DATA_DIR, 'train_negative_affi.json')
    neg_pairs_cleaned = []
    for i, pair in enumerate(neg_pairs):
        if i % 100 == 0:
            print("pair", i)
        mag_aff = pair["mag_affi"]
        aminer_aff = pair["aminer_affi"]
        aff1 = mag_aff["NormalizedName"].split()
        aff2 = aminer_aff["main_body"].split()
        common = set(aff1).intersection(aff2)
        if len(common) > 1:  # keep hard negatives: pairs sharing more than one word
            neg_pairs_cleaned.append(pair)
    print("after cleaning", len(neg_pairs_cleaned))
    data_utils.dump_json(neg_pairs_cleaned, settings.AFF_DATA_DIR, "train_negative_affi_clean.json")
def gen_venue_record_linkage_table():
    pairs = load_venue_data()
    train_num = 800
    test_num = 200
    n_pos_set = int((train_num + 2 * test_num) / 2)
    neg_pairs = [p for p in pairs if p[0] == 0]
    pos_pairs = [p for p in pairs if p[0] == 1][-n_pos_set:]
    n_pos = len(pos_pairs)
    neg_pairs = neg_pairs[-n_pos:]
    train_data = pos_pairs + neg_pairs
    train_data = sklearn.utils.shuffle(train_data, random_state=37)
    labels = [x[0] for x in train_data]
    # n = len(pairs)
    n_train = train_num
    n_valid = test_num
    aff_to_aid = {}
    cur_idx = 0
    # table1_aff = []
    # table2_aff = []
    out_dir = join(settings.OUT_DIR, "venue")
    wf1 = open(join(out_dir, "venue_train1.csv"), "w")
    wf2 = open(join(out_dir, "venue_train2.csv"), "w")
    wf1.write("name,main_body,uid\n")
    wf2.write("name,main_body,uid\n")
    test_pairs = []
    valid_pairs = []
    neg_cnt = 0
    # an = addressNormalization()
    for i, p in enumerate(train_data):
        # aff1_short = an.find_inst(p[0]["name"])[1].lower().replace(",", " ")
        aff1 = p[2].lower().replace(",", " ")
        # aff2_short = an.find_inst(p[1]["DisplayName"])[1].lower()
        aff2 = p[1].lower().replace(",", " ")
        label = labels[i]
        # if aff2 in aff_to_aid:
        #     continue
        if label == 1:
            # matching records share one uid
            aff_to_aid[aff2] = cur_idx
            aff_to_aid[aff1] = cur_idx
            cur_idx += 1
        else:
            # non-matching records get two distinct uids
            aff_to_aid[aff2] = cur_idx
            aff_to_aid[aff1] = cur_idx + 1
            cur_idx += 2
        # keep only the words shared by the two venue strings
        cur_v_mag = aff1.split()
        cur_v_aminer = aff2.split()
        overlap = set(cur_v_mag).intersection(cur_v_aminer)
        new_seq_mag = []
        new_seq_aminer = []
        for w in cur_v_mag:
            if w in overlap:
                new_seq_mag.append(w)
        for w in cur_v_aminer:
            if w in overlap:
                new_seq_aminer.append(w)
        if i < n_train:
            wf1.write(aff1 + "," + " ".join(new_seq_mag) + "," + str(aff_to_aid[aff1]) + "\n")
            wf2.write(aff2 + "," + " ".join(new_seq_aminer) + "," + str(aff_to_aid[aff2]) + "\n")
        elif i < n_train + n_valid:
            valid_pairs.append(({
                "name": aff1,
                "main_body": " ".join(new_seq_mag),
                "uid": str(aff_to_aid[aff1])
            }, {
                "name": aff2,
                "main_body": " ".join(new_seq_aminer),
                "uid": str(aff_to_aid[aff2])
            }))
        else:
            test_pairs.append(({
                "name": aff1,
                "main_body": " ".join(new_seq_mag),
                "uid": str(aff_to_aid[aff1])
            }, {
                "name": aff2,
                "main_body": " ".join(new_seq_aminer),
                "uid": str(aff_to_aid[aff2])
            }))
            if aff_to_aid[aff1] != aff_to_aid[aff2]:
                neg_cnt += 1
    wf1.close()
    wf2.close()
    print(len(test_pairs), neg_cnt)
    data_utils.dump_json(valid_pairs, out_dir, "valid_venue_dedupe_pairs.json")
    data_utils.dump_json(test_pairs, out_dir, "test_venue_dedupe_pairs.json")
def gen_aff_record_linkage_table():
    pairs, labels = load_aff_data()
    pairs, labels = sklearn.utils.shuffle(pairs, labels, random_state=42)
    n = len(pairs)
    n_train = int(n * 0.6)
    n_valid = int(n * 0.2)
    aff_to_aid = {}
    cur_idx = 0
    # table1_aff = []
    # table2_aff = []
    out_dir = join(settings.OUT_DIR, "aff")
    wf1 = open(join(out_dir, "aff_train1.csv"), "w")
    wf2 = open(join(out_dir, "aff_train2.csv"), "w")
    wf1.write("name,main_body,uid\n")
    wf2.write("name,main_body,uid\n")
    test_pairs = []
    valid_pairs = []
    neg_cnt = 0
    an = addressNormalization()
    for i, p in enumerate(pairs):
        aff1_short = an.find_inst(p[0]["name"])[1].lower().replace(",", " ")
        aff1 = p[0]["name"].lower().replace(",", " ")
        # aff2_short = an.find_inst(p[1]["DisplayName"])[1].lower()
        aff2 = p[1]["DisplayName"].lower().replace(",", " ")
        label = labels[i]
        # if aff2 in aff_to_aid:
        #     continue
        if label == 1:
            # matching records share one uid
            aff_to_aid[aff2] = cur_idx
            aff_to_aid[aff1] = cur_idx
            cur_idx += 1
        else:
            # non-matching records get two distinct uids
            aff_to_aid[aff2] = cur_idx
            aff_to_aid[aff1] = cur_idx + 1
            cur_idx += 2
        if i < n_train:
            wf1.write(aff1 + "," + aff1_short + "," + str(aff_to_aid[aff1]) + "\n")
            wf2.write(aff2 + "," + aff2 + "," + str(aff_to_aid[aff2]) + "\n")
        elif i < n_train + n_valid:
            valid_pairs.append(({
                "name": aff1,
                "main_body": aff1_short,
                "uid": str(aff_to_aid[aff1])
            }, {
                "name": aff2,
                "main_body": aff2,
                "uid": str(aff_to_aid[aff2])
            }))
        else:
            test_pairs.append(({
                "name": aff1,
                "main_body": aff1_short,
                "uid": str(aff_to_aid[aff1])
            }, {
                "name": aff2,
                "main_body": aff2,
                "uid": str(aff_to_aid[aff2])
            }))
            if aff_to_aid[aff1] != aff_to_aid[aff2]:
                neg_cnt += 1
    wf1.close()
    wf2.close()
    print(len(test_pairs), neg_cnt)
    data_utils.dump_json(valid_pairs, out_dir, "valid_aff_dedupe_pairs.json")
    data_utils.dump_json(test_pairs, out_dir, "test_aff_dedupe_pairs.json")
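# Hypothetical evaluation sketch for the dumped dedupe pairs: two records refer to
# the same entity exactly when their `uid` fields are equal (matching pairs above
# share one uid, non-matching pairs get two distinct uids), so a predicted linkage
# can be scored against that as ground truth. The `predict_match` callable and the
# function name below are illustrative, not part of the original code.
def score_dedupe_pairs_sketch(pairs, predict_match):
    n_correct = 0
    for rec1, rec2 in pairs:
        gold = rec1["uid"] == rec2["uid"]  # ground-truth match derived from shared uid
        if predict_match(rec1, rec2) == gold:
            n_correct += 1
    return n_correct / len(pairs) if pairs else 0.0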
def dump_hash_tables(self, role, fold):
    src_v, dst_v = self.vectors2hash_LSH_macro(role, fold)
    b2i_dict = self.build_binary2indices(dst_v)
    fname = '{}-title-hashtable-{}.json'.format(role, fold)
    data_utils.dump_json(b2i_dict, self.hash_table_dir, fname)