#    for index1, coauthor1 in enumerate(authors):
#        if index1 == len(authors) - 1:
#            break
#        for index2, coauthor2 in enumerate(authors):
#            if index2 <= index1:
#                continue
#            # num = get_coauthor_num(coauthor1, coauthor2, author_name)
#            num = get_coauthor_num2(coauthor1, coauthor2) - 1
#            if num >= 1:
#                graph.add_edge(pids[index1], pids[index2])
    conn_comp = list(nx.connected_components(graph))
    conn_comp = [list(c) for c in conn_comp]
    return conn_comp


if __name__ == '__main__':
    valid_data, tag = get_train_data()
    # valid_data = get_valid_data()
    f1_dict = {}
    # Inspect the paper details of each selected same-name author
    author_list_all = list(tag.keys())
    author_list = author_list_all[1:20]
    for author_select in author_list:
        p_list = valid_data[author_select]
        pids, titles, keywords, abstract, authors, venue, year = get_paper_detail(
            p_list)
        real_result = {author_select: tag[author_select]}
        # model_result = {author_select: rand_cluster(p_list)}
        model_result = {
            author_select: co_author_cluster(pids, authors, author_select)
        }
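
# --- Hedged sketch (not part of the original script): a self-contained toy
# run of the connected-component clustering used above. The edge rule mirrors
# the commented-out loop: link two papers when they share at least one
# coauthor other than the target author. All inputs below are made up.
import networkx as nx

def _demo_co_author_cluster(target='li_guo'):
    pids = ['p1', 'p2', 'p3']
    authors = [['li_guo', 'bo_shen'],
               ['li_guo', 'bo_shen', 'di_wang'],
               ['li_guo', 'jia_li']]
    graph = nx.Graph()
    graph.add_nodes_from(pids)
    for i in range(len(pids)):
        for j in range(i + 1, len(pids)):
            shared = (set(authors[i]) & set(authors[j])) - {target}
            if shared:  # at least one shared coauthor besides the target
                graph.add_edge(pids[i], pids[j])
    # p1 and p2 share bo_shen, so they cluster; p3 shares only the target.
    return [list(c) for c in nx.connected_components(graph)]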
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 14 09:11:35 2019

@author: chizj
"""
from common import get_train_data

if __name__ == '__main__':
    train_data, train_tag = get_train_data(['li_guo'])
""" Created on Wed Nov 13 11:24:47 2019 @author: chizj """ from common import get_train_data my_stopwords = "a : “ ” , v the its and as on 's ( ) : % . based".split(' ') if __name__ == '__main__': author_list = [ 'li_guo', 'bo_shen', 'di_wang', 'long_wang', 'qiang_xu', 'xiang_wang', 'changming_liu', 'kenji_kaneko', 'guohua_chen', 'hai_jin', 'jia_li', 'guoliang_li', 'lan_wang', 'alessandro_giuliani', 'jiang_he', 'xiang_gao', 'jianping_wu', 'peng_shi', 'feng_wu', 'jing_zhu' ] train_data, train_tag = get_train_data(author_list) # au='li_guo' from base import PaperGraph pg_dict = {} for au in author_list: pg = PaperGraph(au, train_data[au], train_tag[au]) g1 = pg.get_graph_by_coauthor(pg.origin_graph) pg.get_score(g1) pg_dict[au] = (pg, g1) for au, v in pg_dict.items(): v[0].get_score(v[1])
from common import get_train_data
import nltk
from gensim.utils import tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import pairwise_distances

# nltk.download('stopwords')
# nltk.download('punkt')

my_stopwords = [
    'a', 'one', 'two', 'three', 'four', 'six', 'first', 'second', 'third',
    'i', 'h', 'l', 'c', 'via', 'iv'
]

train_data, train_tag = get_train_data()

# Build a corpus from all paper titles: tokenize, lowercase, and drop both
# NLTK English stopwords and the custom stopwords above.
# Cache the NLTK stopword list as a set once; fetching it per title is slow.
english_stopwords = set(nltk.corpus.stopwords.words('english'))

paper_t_words = {}
for author, paper_list in train_data.items():
    print(author, len(paper_list))
    titles = {}
    for paper in paper_list:
        title = paper['title']
        words = tokenize(title)
        words = [w.lower() for w in words]
        words = [w for w in words if w not in english_stopwords]
        words = [w for w in words if w not in my_stopwords]
        titles[paper['id']] = words
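
# --- Hedged sketch (not in the original file): one plausible continuation
# that actually uses the CountVectorizer / pairwise_distances imports above.
# `title_distance_matrix` is a hypothetical helper, not part of this repo:
# it joins each paper's cleaned tokens back into a string, builds a
# bag-of-words matrix, and returns cosine distances between titles.
def title_distance_matrix(titles):
    """titles: dict mapping paper id -> list of cleaned title tokens."""
    pids = list(titles.keys())
    docs = [' '.join(titles[pid]) for pid in pids]
    bow = CountVectorizer().fit_transform(docs)  # sparse doc-term matrix
    dist = pairwise_distances(bow, metric='cosine')  # 0.0 = identical titles
    return pids, dist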