Example #1
def dump_paper_pairs(self, paper_pairs, wfname):
    # Re-shape each (cpaper, npaper) pair into a {'c': ..., 'n': ...} dict and dump to JSON.
    paper_pairs_refactor = []
    for pair in paper_pairs:
        cpaper, npaper = pair
        pair_refactor = {'c': cpaper, 'n': npaper}
        paper_pairs_refactor.append(pair_refactor)
    data_utils.dump_json(paper_pairs_refactor, self.pairs_dir, wfname)
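Every example here calls data_utils.dump_json(obj, directory, file_name) with the directory and file name passed separately, so the helper presumably joins the two and serializes with the standard json module; Example #3 also passes a fourth argument, matched here by indent. A minimal sketch of that assumed signature, not the project's actual implementation:

import json
from os.path import join

def dump_json(obj, wfdir, wfname, indent=None):
    # Assumed behavior: join directory and file name, then write obj as UTF-8 JSON.
    with open(join(wfdir, wfname), 'w', encoding='utf-8') as wf:
        json.dump(obj, wf, indent=4 if indent else None, ensure_ascii=False)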
Example #2
def filter_hard_aff_pairs():
    # "人工标注" in the file name means "manually annotated".
    df = pd.read_excel(join(settings.AFF_DATA_DIR, "mag_aminer_1(人工标注).xls"))
    # print(df)
    pairs = []
    for index, row in df.iterrows():
        mag_aff2 = row["mag_NormalizedName"].lower()
        aminer_aff2 = row["aminer_main_body"].lower()
        label = row["label"]
        # Keep only "hard" pairs whose normalized names share at least one word.
        common = set(aminer_aff2.split()).intersection(mag_aff2.split())
        if len(common) > 0:
            print(mag_aff2, "---", aminer_aff2, label)
            mag_aff1 = row["mag_DisplayName"].lower()
            aminer_aff1 = row["aminer_org_name"].lower()
            mag_id = row["mag_id"]
            aminer_id = row["aminer_id"]
            cur_dict = {
                "mag_affi": {
                    "id": mag_id,
                    "DisplayName": mag_aff1,
                    "NormalizedName": mag_aff2
                },
                "aminer_affi": {
                    "id": aminer_id,
                    "name": aminer_aff1,
                    "main_body": aminer_aff2
                },
                "label": label,
                "label_zfj": ""
            }
            pairs.append(cur_dict)

    print("n_pairs", len(pairs))
    data_utils.dump_json(pairs, settings.AFF_DATA_DIR,
                         "mag_aminer_hard_correct_zfj.json")
Example #3
def main():
    names = load_test_names()  # load the list of test author names
    ans = {}

    # output file for the clustering results
    wf = codecs.open(join(settings.OUT_DIR, 'local_clustering_results.csv'), 'w', encoding='utf-8')
    wf.write('name,n_pubs,n_clusters,precision,recall,f1\n')  # name, #papers, #clusters, precision, recall, f1
    metrics = np.zeros(3)  # running sums of precision, recall, f1
    cnt = 0
    for name in names:  # iterate over author names
        # metrics [prec, rec, f1], number of documents, number of clusters, clustering result
        cur_metric, num_nodes, n_clusters, ans[name] = gae_for_na(name)
        if cur_metric is None:
            continue
        wf.write('{0},{1},{2},{3:.5f},{4:.5f},{5:.5f}\n'.format(  # write one row per name
            name, num_nodes, n_clusters, cur_metric[0], cur_metric[1], cur_metric[2]))
        wf.flush()
        for i, m in enumerate(cur_metric):  # accumulate each metric for macro-averaging
            metrics[i] += m
        cnt += 1
        macro_prec = metrics[0] / cnt
        macro_rec = metrics[1] / cnt
        macro_f1 = cal_f1(macro_prec, macro_rec)
        print('average until now', [macro_prec, macro_rec, macro_f1])  # macro metrics over names processed so far
        time_acc = time.time() - start_time  # start_time is assumed to be set before main() is called
        print(cnt, 'names', time_acc, 'avg time', time_acc / cnt)  # total and average running time
    macro_prec = metrics[0] / cnt
    macro_rec = metrics[1] / cnt
    macro_f1 = cal_f1(macro_prec, macro_rec)
    wf.write('average,,,{0:.5f},{1:.5f},{2:.5f}\n'.format(
        macro_prec, macro_rec, macro_f1))  # final macro-averaged metrics
    wf.close()

    dump_json(ans, settings.OUT_DIR, 'local_clustering_results.json', True)
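cal_f1 is not defined in this listing; it is presumably the usual harmonic mean of precision and recall with a guard against division by zero. A minimal sketch of that assumption:

def cal_f1(prec, rec):
    # Assumed helper: harmonic mean of precision and recall.
    return 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0.0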
Example #4
def build_inverted_index(self, fold):
    print('build inverted index for cpapers: fold', fold)
    fname = 'clean-papers-test-{}.dat'.format(fold)
    papers = data_utils.load_json_lines(self.paper_dir, fname)
    word2ids = dd(list)  # dd is presumably collections.defaultdict
    for paper in papers:
        pid = str(paper['id'])
        title = paper['title']
        words = data_utils.get_words(title, window=self.ii_window)
        for word in words:
            word2ids[word].append(pid)
    for word in word2ids:
        word2ids[word] = list(set(word2ids[word]))  # de-duplicate paper ids per word
    data_utils.dump_json(word2ids, self.inverted_index_dir,
                         'clean-papers-test-ii-{}.json'.format(fold))
    print('complete building II')
    return word2ids
Example #5
def filter_venue_dataset():
    train_data = json.load(
        open(join(settings.VENUE_DATA_DIR, "train_copy_zfj.txt")))
    ddd = {}
    train_data_new = []
    for pair in train_data:
        # build an order-independent key from pair[1:] so duplicates are detected
        cur = sorted(pair[1:])
        cur_key = "&&&".join(cur)

        if cur_key in ddd:
            print(cur_key)
        else:
            ddd[cur_key] = 1
            train_data_new.append(pair)
    print("size after filtering", len(train_data_new))
    data_utils.dump_json(train_data_new, settings.VENUE_DATA_DIR,
                         "train_filter.txt")
Example #6
def filter_aff_neg_pairs():
    neg_pairs = data_utils.load_json(settings.AFF_DATA_DIR,
                                     'train_negative_affi.json')
    neg_pairs_cleaned = []
    for i, pair in enumerate(neg_pairs):
        if i % 100 == 0:
            print("pair", i)
        mag_aff = pair["mag_affi"]
        aminer_aff = pair["aminer_affi"]
        aff1 = mag_aff["NormalizedName"].split()
        aff2 = aminer_aff["main_body"].split()
        common = set(aff1).intersection(aff2)
        if len(common) > 1:  # keep only negative pairs whose names share more than one word
            neg_pairs_cleaned.append(pair)
    print("after cleaned", len(neg_pairs_cleaned))
    data_utils.dump_json(neg_pairs_cleaned, settings.AFF_DATA_DIR,
                         "train_negative_affi_clean.json")
Example #7
def gen_venue_record_linkage_table():
    pairs = load_venue_data()
    train_num = 800
    test_num = 200

    n_pos_set = int((train_num + 2 * test_num) / 2)

    neg_pairs = [p for p in pairs if p[0] == 0]
    pos_pairs = [p for p in pairs if p[0] == 1][-n_pos_set:]
    n_pos = len(pos_pairs)
    neg_pairs = neg_pairs[-n_pos:]
    train_data = pos_pairs + neg_pairs

    train_data = sklearn.utils.shuffle(train_data, random_state=37)

    labels = [x[0] for x in train_data]

    # n = len(pairs)
    n_train = train_num
    n_valid = test_num
    aff_to_aid = {}
    cur_idx = 0
    # table1_aff = []
    # table2_aff = []
    out_dir = join(settings.OUT_DIR, "venue")
    wf1 = open(join(out_dir, "venue_train1.csv"), "w")
    wf2 = open(join(out_dir, "venue_train2.csv"), "w")
    wf1.write("name,main_body,uid\n")
    wf2.write("name,main_body,uid\n")
    test_pairs = []
    valid_pairs = []
    neg_cnt = 0
    # an = addressNormalization()
    for i, p in enumerate(train_data):
        # aff1_short = an.find_inst(p[0]["name"])[1].lower().replace(",", " ")
        aff1 = p[2].lower().replace(",", " ")
        # aff2_short = an.find_inst(p[1]["DisplayName"])[1].lower()
        aff2 = p[1].lower().replace(",", " ")
        label = labels[i]
        # if aff2 in aff_to_aid:
        #     continue
        if label == 1:
            # matching venues share the same uid
            aff_to_aid[aff2] = cur_idx
            aff_to_aid[aff1] = cur_idx
            cur_idx += 1
        else:
            # non-matching venues get distinct uids
            aff_to_aid[aff2] = cur_idx
            aff_to_aid[aff1] = cur_idx + 1
            cur_idx += 2

        # main_body keeps only the words shared by the two venue names
        cur_v_mag = aff1.split()
        cur_v_aminer = aff2.split()
        overlap = set(cur_v_mag).intersection(cur_v_aminer)
        new_seq_mag = []
        new_seq_aminer = []
        for w in cur_v_mag:
            if w in overlap:
                new_seq_mag.append(w)
        for w in cur_v_aminer:
            if w in overlap:
                new_seq_aminer.append(w)

        if i < n_train:
            wf1.write(aff1 + "," + " ".join(new_seq_mag) + "," +
                      str(aff_to_aid[aff1]) + "\n")
            wf2.write(aff2 + "," + " ".join(new_seq_aminer) + "," +
                      str(aff_to_aid[aff2]) + "\n")
        elif i < n_train + n_valid:
            valid_pairs.append(({
                "name": aff1,
                "main_body": " ".join(new_seq_mag),
                "uid": str(aff_to_aid[aff1])
            }, {
                "name": aff2,
                "main_body": " ".join(new_seq_aminer),
                "uid": str(aff_to_aid[aff2])
            }))
        else:
            test_pairs.append(({
                "name": aff1,
                "main_body": " ".join(new_seq_mag),
                "uid": str(aff_to_aid[aff1])
            }, {
                "name": aff2,
                "main_body": " ".join(new_seq_aminer),
                "uid": str(aff_to_aid[aff2])
            }))
            if aff_to_aid[aff1] != aff_to_aid[aff2]:
                neg_cnt += 1
    wf1.close()
    wf2.close()

    print(len(test_pairs), neg_cnt)

    data_utils.dump_json(test_pairs, out_dir, "valid_venue_dedupe_pairs.json")
    data_utils.dump_json(test_pairs, out_dir, "test_venue_dedupe_pairs.json")
Example #8
def gen_aff_record_linkage_table():
    pairs, labels = load_aff_data()
    pairs, labels = sklearn.utils.shuffle(pairs, labels, random_state=42)
    n = len(pairs)
    n_train = int(n * 0.6)
    n_valid = int(n * 0.2)
    aff_to_aid = {}
    cur_idx = 0
    # table1_aff = []
    # table2_aff = []
    out_dir = join(settings.OUT_DIR, "aff")
    wf1 = open(join(out_dir, "aff_train1.csv"), "w")
    wf2 = open(join(out_dir, "aff_train2.csv"), "w")
    wf1.write("name,main_body,uid\n")
    wf2.write("name,main_body,uid\n")
    test_pairs = []
    valid_pairs = []
    neg_cnt = 0
    an = addressNormalization()
    for i, p in enumerate(pairs):
        aff1_short = an.find_inst(p[0]["name"])[1].lower().replace(",", " ")
        aff1 = p[0]["name"].lower().replace(",", " ")
        # aff2_short = an.find_inst(p[1]["DisplayName"])[1].lower()
        aff2 = p[1]["DisplayName"].lower().replace(",", " ")
        label = labels[i]
        # if aff2 in aff_to_aid:
        #     continue
        if label == 1:
            aff_to_aid[aff2] = cur_idx
            aff_to_aid[aff1] = cur_idx
            cur_idx += 1
        else:
            aff_to_aid[aff2] = cur_idx
            aff_to_aid[aff1] = cur_idx + 1
            cur_idx += 2
        if i < n_train:
            wf1.write(aff1 + "," + aff1_short + "," + str(aff_to_aid[aff1]) +
                      "\n")
            wf2.write(aff2 + "," + aff2 + "," + str(aff_to_aid[aff2]) + "\n")
        elif i < n_train + n_valid:
            valid_pairs.append(({
                "name": aff1,
                "main_body": aff1_short,
                "uid": str(aff_to_aid[aff1])
            }, {
                "name": aff2,
                "main_body": aff2,
                "uid": str(aff_to_aid[aff2])
            }))
        else:
            test_pairs.append(({
                "name": aff1,
                "main_body": aff1_short,
                "uid": str(aff_to_aid[aff1])
            }, {
                "name": aff2,
                "main_body": aff2,
                "uid": str(aff_to_aid[aff2])
            }))
            if aff_to_aid[aff1] != aff_to_aid[aff2]:
                neg_cnt += 1
    wf1.close()
    wf2.close()

    print(len(test_pairs), neg_cnt)

    data_utils.dump_json(test_pairs, out_dir, "valid_aff_dedupe_pairs.json")
    data_utils.dump_json(test_pairs, out_dir, "test_aff_dedupe_pairs.json")
Example #9
def dump_hash_tables(self, role, fold):
    src_v, dst_v = self.vectors2hash_LSH_macro(role, fold)
    b2i_dict = self.build_binary2indices(dst_v)
    fname = '{}-title-hashtable-{}.json'.format(role, fold)
    data_utils.dump_json(b2i_dict, self.hash_table_dir, fname)