# Ejemplo n.º 1 (0)
#    for index1,coauthor1 in enumerate(authors):
#        if index1==len(authors)-1: break
#        for index2,coauthor2 in enumerate(authors):
#            if index2<=index1:continue
##            num=get_coauthor_num(coauthor1,coauthor2,author_name)
#            num = get_coauthor_num2(coauthor1,coauthor2)-1
#            if num>=1:
#                graph.add_edge(pids[index1],pids[index2])

    # NOTE(review): fragment — the enclosing function's `def` line is outside
    # this view; `graph` is presumably a networkx Graph built above. TODO confirm.
    # Each connected component is one candidate cluster of papers that are
    # assumed to belong to the same real-world author.
    conn_comp = list(nx.connected_components(graph))
    # connected_components yields sets; convert each to a list for callers.
    conn_comp = [list(c) for c in conn_comp]
    return conn_comp

if __name__ == '__main__':

    # Load training papers and ground-truth clusters keyed by author name.
    valid_data, tag = get_train_data()
    #    valid_data = get_valid_data()
    f1_dict = {}

    # Analyze the paper information of authors sharing the same name
    author_list_all = list(tag.keys())
    # Only evaluate a small slice of author names (indices 1..19).
    author_list = author_list_all[1:20]
    for author_select in author_list:
        p_list = valid_data[author_select]
        pids, titles, keywords, abstract, authors, venue, year = get_paper_detail(
            p_list)

        # Ground truth vs. model prediction for this author name.
        real_result = {author_select: tag[author_select]}
        #        model_result={author_select:rand_cluster(p_list)}
        model_result = {
            author_select: co_author_cluster(pids, authors, author_select)
        # NOTE(review): truncated — the closing brace of this dict and the rest
        # of the loop body are missing from this view.
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 14 09:11:35 2019

@author: chizj
"""

from common import get_train_data

if __name__ == '__main__':
    # Fetch training papers and ground-truth clusters restricted to the
    # single author name 'li_guo'.
    requested_authors = ['li_guo']
    train_data, train_tag = get_train_data(requested_authors)
# Ejemplo n.º 3 (0)
"""
Created on Wed Nov 13 11:24:47 2019

@author: chizj
"""
from common import get_train_data
my_stopwords = "a : “ ” , v the its and as on 's ( ) : % . based".split(' ')

if __name__ == '__main__':
    author_list = [
        'li_guo', 'bo_shen', 'di_wang', 'long_wang', 'qiang_xu', 'xiang_wang',
        'changming_liu', 'kenji_kaneko', 'guohua_chen', 'hai_jin', 'jia_li',
        'guoliang_li', 'lan_wang', 'alessandro_giuliani', 'jiang_he',
        'xiang_gao', 'jianping_wu', 'peng_shi', 'feng_wu', 'jing_zhu'
    ]
    train_data, train_tag = get_train_data(author_list)

    #    au='li_guo'

    from base import PaperGraph

    pg_dict = {}
    for au in author_list:
        pg = PaperGraph(au, train_data[au], train_tag[au])
        g1 = pg.get_graph_by_coauthor(pg.origin_graph)
        pg.get_score(g1)
        pg_dict[au] = (pg, g1)

    for au, v in pg_dict.items():
        v[0].get_score(v[1])
# Ejemplo n.º 4 (0)
from common import get_train_data

import nltk
from gensim.utils import tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import pairwise_distances

#nltk.download('stopwords')
#nltk.download('punkt')

# Extra stop words beyond NLTK's English list: single letters, number words
# and ordinals that carry no topical signal in paper titles.
my_stopwords = [
    'a', 'one', 'two', 'three', 'four', 'six', 'first', 'second', 'third', 'i',
    'h', 'l', 'c', 'via', 'iv'
]

train_data, train_tag = get_train_data()

# Build a corpus from all paper titles
paper_t_words = {}
for author, paper_list in train_data.items():
    print(author, len(paper_list))
    titles = {}
    for paper in paper_list:
        title = paper['title']
        # Tokenize, lowercase, then drop NLTK English stop words and the
        # custom stop-word list above.
        words = tokenize(title)
        words = [w.lower() for w in words]
        words = [
            w for w in words if w not in nltk.corpus.stopwords.words('english')
        ]
        words = [w for w in words if w not in my_stopwords]
        # Map paper id -> filtered title tokens.
        titles[paper['id']] = words
# NOTE(review): truncated — presumably `titles` is stored into
# `paper_t_words[author]` after this point; the remainder is outside this view.