Esempio n. 1
0
def cal_f1_each_author(model_res, real_res):
    """Report the pairwise F1 of each author's predicted clustering.

    Both arguments map an author name to that author's list of clusters;
    for every author, prints the real cluster count, the predicted cluster
    count and the per-author pairwise F1 score.
    """
    for name, predicted_clusters in model_res.items():
        predicted = {name: predicted_clusters}
        actual = {name: real_res[name]}
        score = pairwise_f1(actual, predicted)
        print(name, '真实聚类数', len(actual[name]), '预测聚类数',
              len(predicted[name]), '%.2f' % score)
Esempio n. 2
0
 def get_score(self, graph):
     """Score the given graph against the ground-truth clustering.

     :param graph: graph whose connected components form the predicted clusters
     :return: tuple (real cluster count, predicted cluster count, pairwise F1)
     """
     name = self.author_name
     real_cluster = {name: self.tag}
     model_cluster = {name: self.get_graph_components(graph)}
     f1 = pairwise_f1(real_cluster, model_cluster)
     n_real = len(real_cluster[name])
     n_pred = len(model_cluster[name])
     print(name, 'RealNum', n_real,
           'PreNum', n_pred, 'F1', '%.2f' % f1)
     return n_real, n_pred, f1
Esempio n. 3
0
def cal_train_f1(eps_cluster, n_component, valid_data, tag):
    """Cluster every author's papers and compute the overall pairwise F1.

    :param eps_cluster: clustering eps threshold, forwarded to cluster_one_author
    :param n_component: component count, forwarded to cluster_one_author
    :param valid_data: dict mapping author name -> list of that author's papers
    :param tag: dict mapping author name -> ground-truth clustering
    :return: tuple (n_component, eps_cluster, overall pairwise F1)
    """
    res_model_all = {}
    res_real_all = {}
    for author_name, author_paper in valid_data.items():
        if not author_paper:
            # Authors without papers cannot be clustered; leave them out
            # of the F1 pool entirely.
            print(author_name, '没有文章')
            continue
        # BUG FIX: the original hard-coded cluster_one_author(author_paper, 10, 0.5),
        # silently ignoring the eps_cluster/n_component search parameters this
        # function receives and reports back -- any parameter sweep over it was
        # a no-op.  Forward the parameters instead.
        # NOTE(review): argument order (n_component, eps_cluster) mirrors the
        # original literals (10, 0.5) -- confirm against cluster_one_author's
        # signature.
        res_model_all[author_name] = cluster_one_author(
            author_paper, n_component, eps_cluster)
        res_real_all[author_name] = tag[author_name]
    f1 = pairwise_f1(res_real_all, res_model_all)
    return n_component, eps_cluster, f1
Esempio n. 4
0
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 14 10:38:05 2019

@author: chizj
"""

from graph import get_train_data
from graph import GraphAuthors
from common import pairwise_f1

if __name__ == '__main__':
    # Load training papers and ground-truth labels for two sample names.
    author_names = ['li_guo', 'qiang_xu']
    train_data, train_tag = get_train_data(author_names)

    # Connected components of the co-author graph are the predicted clusters
    # for a single ambiguous name; score them against the ground truth.
    target = 'li_guo'
    graph = GraphAuthors(train_data[target], target)
    graph.add_edge()
    model_res = {target: graph.get_connected_components()}
    real_res = {target: train_tag[target]}

    pairwise_f1(real_res, model_res)
Esempio n. 5
0
    # NOTE(review): excerpt -- valid_data, tag, get_paper_detail,
    # co_author_cluster and pairwise_f1 are defined above this fragment.
    f1_dict = {}

    # Inspect the papers of each selected ambiguous author name and score
    # a co-author-based clustering against the ground-truth labels.
    author_list_all = list(tag.keys())
    author_list = author_list_all[1:20]
    for author_select in author_list:
        p_list = valid_data[author_select]
        pids, titles, keywords, abstract, authors, venue, year = get_paper_detail(
            p_list)

        real_result = {author_select: tag[author_select]}
        #        model_result={author_select:rand_cluster(p_list)}
        model_result = {
            author_select: co_author_cluster(pids, authors, author_select)
        }
        f1 = pairwise_f1(real_result, model_result)
        print(author_select, f1)
        f1_dict[author_select] = f1

#    org_test=[]
#    for co_au in authors:
#        for au in co_au:
#            if precessname(au['name']) == 'bo_shen':
#                oo=au['org'].split(';')[0]
#                if oo!='':org_test.append(oo)

    author_list_all = list(valid_data.keys())
    author_list = author_list_all[0:20]
    # Build the graph structures retained from shared co-authors.
    graph_dict = {}
    for author_select in author_list:
Esempio n. 6
0
        for plist in train_author_data[author].values():
            p_merge.append(plist)
        res_real[author] = p_merge

    papers = {}
    for author in author_selects:
        p_merge = []
        for key, value in train_author_data[author].items():
            for p in value:
                #                print(p)
                p_merge.append(train_pub_data[p])
        papers[author] = p_merge

    # 根据合作者的图模型进行的聚类(名字预处理的结果会好一些,0.25》0.21)
    res_model2 = disambiguate_by_graph(papers)
    pairwise_f1(res_real, res_model2)

    # 模型测试
    li_guo = papers['li_guo']
    paper_words = get_papar_words(li_guo)
    dictionary = corpora.Dictionary(paper_words)
    bow_corpus = [dictionary.doc2bow(wl) for wl in paper_words]  # 语料向量化
    tfidf = models.TfidfModel(bow_corpus)  # 基于向量化的语料构建tfidf模型

    index = similarities.Similarity('E:\\gensim_test', tfidf[bow_corpus],
                                    len(dictionary))
    sims = index[tfidf[bow_corpus]]  # 计算相似性矩阵
    i_cluster = graph_sim_matrix(sims, 0.15)
    res_modelx = {}
    res_modelx['li_guo'] = [[li_guo[index]['id'] for index in l_inside]
                            for l_inside in i_cluster]
"""
from PaperGraph import get_sample_data,get_valid_data
from common import pairwise_f1,pairwise_f1_new
import json

if __name__=='__main__':
    # Load sample papers and their ground-truth clusterings.
    data,tag=get_sample_data()

    from PaperGraph import PaperGraph
    
    # Three passes over the same authors, one per PaperGraph strategy
    # (get_res1 / get_res2 / get_res3); each prints the author, the F1,
    # the real cluster count and the predicted cluster count.
    authors=data.keys()
    for author in authors:
        p_graph=PaperGraph(name=author)
        p_graph.set_paper_info(data[author])    
        con=p_graph.get_res1()
        f1=pairwise_f1({author:tag[author]},{author:con})
        print(author,'%.2f'%f1,len(tag[author]),len(con))
    
    # Second strategy needs pairwise node statistics computed first.
    for author in authors:
        p_graph=PaperGraph(name=author)
        p_graph.set_paper_info(data[author]) 
        p_graph.cal_node_pair_info()
        con=p_graph.get_res2()
        f1=pairwise_f1({author:tag[author]},{author:con})
        print(author,'%.2f'%f1,len(tag[author]),len(con))
    
    # NOTE(review): excerpt is cut off after get_res3(); the scoring/print
    # lines for this third pass are not visible here.
    for author in authors:
        p_graph=PaperGraph(name=author)
        p_graph.set_paper_info(data[author]) 
        p_graph.cal_node_pair_info()
        con=p_graph.get_res3()
Esempio n. 8
0
    # NOTE(review): excerpt -- author_list, t_tag and graph_author_dict are
    # defined above this fragment.
    # For each ambiguous author name, compare the ground-truth clusters with
    # the connected components of the pre-built co-author graph, printing the
    # real cluster count, then the model's pairwise F1 and cluster count.
    # The commented-out variants below score alternative graph models.
    for author_select in author_list:
        real_res = {author_select: t_tag[author_select]}
        model_author = {
            author_select:
            graph_author_dict[author_select].get_connected_components()
        }
        #        model_author2={author_select: graph_author_dict2[author_select].get_connected_components()}
        #        model_title={author_select:graph_title_dict[author_select].get_connected_components()}
        #        model_at={author_select:graph_at_dict[author_select].get_connected_components()}
        #        model_a2t={author_select:graph_a2t_dict[author_select].get_connected_components()}
        #        model_at2={author_select:graph_at2_dict[author_select]}
        #        model_at21={author_select:graph_at21_dict[author_select].get_cluster2(0.005,100)}
        #        model_a2t21={author_select:graph_a2t21_dict[author_select].get_cluster2(0.1,500)}
        print(author_select, len(real_res[author_select]))
        print('----> model_author',
              '%.2f' % pairwise_f1(real_res, model_author),
              len(model_author[author_select]))
#        print('----> model_author2',
#              '%.2f'%pairwise_f1(real_res,model_author2),len(model_author2[author_select]))
#        print('----> model_title',
#              '%.2f'%pairwise_f1(real_res,model_title),len(model_title[author_select]))
#        print('----> model_at',
#              '%.2f'%pairwise_f1(real_res,model_at),len(model_at[author_select]))
#        print('----> model_a2t',
#              '%.2f'%pairwise_f1(real_res,model_a2t),len(model_a2t[author_select]))
#        print('----> model_at2',
#              '%.2f'%pairwise_f1(real_res,model_at2),len(model_at2[author_select]))
#        print('----> model_at21',
#              '%.2f'%pairwise_f1(real_res,model_at21),len(model_at21[author_select]))
#        print('----> model_a2t21',
#              '%.2f'%pairwise_f1(real_res,model_a2t21),len(model_a2t21[author_select]))
Esempio n. 9
0
    # NOTE(review): excerpt -- author_selects, author_list, train_pub_data,
    # train_author_data, disambiguate_by_graph_model and Pool come from code
    # above this fragment.
    # Flatten each selected author's paper-id groups into a single list of
    # paper records.
    train_data = {}
    for author in author_selects:
        pid_list = [
            train_pub_data[p] for pp in train_author_data[author].values()
            for p in pp
        ]
        train_data[author] = pid_list

    real_result = {
        key: [pid for pid in train_author_data[key].values()]
        for key in author_selects
    }  # ground-truth labels for the training data

    # NOTE(review): 'modle_result' is a typo for 'model_result' (left as-is,
    # the rest of the excerpt may reference it).
    modle_result = disambiguate_by_graph_model(train_data, 0.5, 2)
    pairwise_f1(real_result, modle_result)

    # Use multiple processes to speed things up: shard the first 20 authors
    # across 6 worker dicts (author_list[i:20:6] interleaves the shards).
    train_data_list = []
    for i in range(6):
        train_data = {}
        for author in author_list[i:20:6]:
            pid_list = [
                train_pub_data[p] for pp in train_author_data[author].values()
                for p in pp
            ]
            train_data[author] = pid_list
        train_data_list.append(train_data)
    pool = Pool(processes=6)
    # NOTE(review): pool.map passes each shard dict as the only positional
    # argument, unlike the (train_data, 0.5, 2) call above -- presumably the
    # other parameters have defaults; confirm against
    # disambiguate_by_graph_model's signature.
    map_res = pool.map(disambiguate_by_graph_model, train_data_list)
    result_model = {}