def train_paper():
    # Collect every cited url_id that is not yet stored as a paper, then crawl those pages.
    things = [p["in_citations"] for p in col_paper.find()]
    things = flatten(things, dtype="s")
    things = list(set(things))
    print(len(things))
    existing = set(y['url_id'] for y in col_paper.find())
    # drop the entries that already exist in the database
    things = [x for x in things if x not in existing]
    # things = [x['_id'] for x in col_page.find({"used": False})]
    au = PaperPageThread(things, 56)
    s = time()
    au.start()
    e = time()
    print("The time spent on this program is %f s" % (e - s))
def ar_train_data():
    """Write ../data_set/ar_train.data: one line per paper, built from the paper's
    authors and the authors of the papers listed in its in_citations."""
    process = 0
    total = col_paper.count()
    papers = set(x['url_id'] for x in col_paper.find())
    with open("../data_set/ar_train.data", "w", encoding='utf-8') as f:
        for p in col_paper.find(no_cursor_timeout=True):
            authors = p['authors']
            r_authors = []
            in_citations = [x for x in p['in_citations'] if x in papers]
            for oc in in_citations:
                oc_paper = col_paper.find_one({'url_id': oc})
                r_authors += oc_paper['authors']
            r_authors = list(set(r_authors))
            sentence = " ".join(enlarge_author(authors, r_authors))
            f.write(sentence + '\n')
            f.flush()
            process += 1
            print("\rprocessing...\t%.2f%%" % (process / total * 100), end="")
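# enlarge_author is defined elsewhere in the project; the sketch below is only a
# guess at its behaviour, inferred from how its output is space-joined into one
# training sentence here and in pr_train_data below. The assumed idea: repeat the
# "source" ids next to every "related" id so a small Word2Vec window always sees
# each source/related pair together.
def enlarge_author_sketch(source_ids, related_ids):  # hypothetical, not the real helper
    tokens = []
    for r in related_ids:
        tokens += source_ids  # keep the source ids adjacent to ...
        tokens.append(r)      # ... each related id
    return tokens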
def author():
    things = []
    for p in col_paper.find():
        things += p['authors']
    things = list(set(things))
    au = AuthorPageThread(things, thread_num=56)
    s = time()
    au.start()
    e = time()
    print("The time spent on this program is %f s" % (e - s))
def idx_paper():
    """
    Re-assign an id value to every document in the paper collection.
    :return:
    """
    i = 1
    for au in col_paper.find():
        col_paper.update_one({'_id': au['_id']}, {"$set": {'id': i}})
        print("\rprocessing:", i, end='')
        i += 1
    print("\nfinished re-assigning ids")
def pr_train_data():
    process = 0
    total = col_paper.count()
    with open("../data_set/pr_train.data", "w", encoding='utf-8') as f:
        for p in col_paper.find():
            url_id = [p['url_id']]
            in_citations = p['in_citations']
            sentence = " ".join(enlarge_author(url_id, in_citations))
            f.write(sentence + '\n')
            f.flush()
            process += 1
            print("\rprocessing...\t%.2f%%" % (process / total * 100), end="")
def get_ar_authors(filter: dict):
    """
    Query the author information related to every document matching the given condition.
    :param filter: MongoDB query condition; {} selects every paper
    :return: dict mapping url_id -> [authors, r_authors, ar_authors]
    """
    rs = {}
    papers = set(x['url_id'] for x in col_paper.find())
    for p in col_paper.find(filter, no_cursor_timeout=True):
        authors = p['authors']
        r_authors = []
        in_citations = [x for x in p['in_citations'] if x in papers]
        for oc in in_citations:
            oc_paper = col_paper.find_one({'url_id': oc})
            r_authors += oc_paper['authors']
        r_authors = list(set(r_authors))
        ar_authors = list(set(r_authors + authors))
        rs[p['url_id']] = [authors, r_authors, ar_authors]
    if filter == {}:
        # cache the full map locally when no filter was applied
        with open("../data_set/ar_author.json", "w", encoding='utf-8') as f:
            json.dump(rs, f)
    return rs
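# Usage sketch for get_ar_authors (assumes the MongoDB collections above are
# reachable). An empty filter walks every paper and also caches the result to
# ../data_set/ar_author.json, so later runs can reload the JSON instead of
# re-querying the database.
if __name__ == "__main__":
    ar_map = get_ar_authors({})  # url_id -> [authors, r_authors, ar_authors]
    # with open("../data_set/ar_author.json", encoding="utf-8") as f:
    #     ar_map = json.load(f)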
class CR:
    data = []
    data_label = []
    paper_count = 0
    model = None
    sims = None  # relevance scores between the target document and the system documents
    topn = 100  # number of results to return
    sort_sim = None  # relevance ranking, document ids only
    sys_docs = [x["url_id"] for x in col_paper.find()]
    sys_docs_cr = []

    def __init__(self, topn=100, doc=None):
        """
        :param topn: number of related results to return
        :param doc: target document, a string made up of its title and abstract
        """
        self.topn = topn
        if doc:
            self.aim_doc = segment(doc)
            self.aim_doc_vec = sent2vec(words=self.aim_doc)

    def init_data_net(self):
        # pull every document's title and abstract from the remote database into the training data
        for p in col_paper.find():
            sentence = segment(p['abstract']) + segment(p['title'])
            if sentence:
                self.data.append(sentence)
                self.data_label.append(p['url_id'])
        with open("CR/CR_data_lable.bin", 'wb') as f:
            # cache the system data locally; reload it later with
            # data, data_label = pickle.load(open("CR/CR_data_lable.bin", 'rb'))
            pickle.dump([self.data, self.data_label], f)
        self.paper_count = len(self.data_label)

    def init_data_local(self):
        """
        Read the data directly when a local cache already exists.
        :return:
        """
        self.data, self.data_label = pickle.load(open("CR/CR_data_lable.bin", 'rb'))

    def get_sys_docs_cr(self):
        if self.sys_docs_cr:
            return self.sys_docs_cr
        for d in self.data:
            self.sys_docs_cr.append(sent2vec(self.model, d))
        return self.sys_docs_cr

    def get_model(self):
        if not self.model:
            self.model = Doc2Vec.load("CR/CR.model")
        return self.model

    def train(self):
        sentences = LabeledLineSentence(self.data, self.data_label)
        # model configured for 20 iterations
        self.model = Doc2Vec(size=100, window=10, min_count=3, workers=10, iter=20)
        self.model.build_vocab(sentences)
        print("training started...")
        # train the model
        start = time.time()
        self.model.train(sentences, total_examples=self.model.corpus_count, epochs=12)
        self.model.save("CR/CR.model")
        self.model.save_word2vec_format("CR/CR.vector")
        print("model saved, total time:", time.time() - start)

    def most_sim(self, doc_vec):
        # compute the relevance of doc_vec against every system document
        sims = sim(doc_vec, self.get_sys_docs_cr())
        # sort by relevance in descending order and keep the topn indices
        call = np.argsort(-sims)
        return call[:self.topn]

    def test(self):
        # read the test papers once so url_ids and in_citations stay aligned
        test_docs = list(col_testPaper.find())
        test_paper = [x['url_id'] for x in test_docs]
        in_citations = [x['in_citations'] for x in test_docs]
        true = 0
        total = 0
        for i in range(len(test_paper)):
            t_vec = sent2vec(self.model, self.data[self.data_label.index(test_paper[i])])
            call = self.most_sim(t_vec)
            inc = [x for x in in_citations[i] if x in self.sys_docs]
            inc_index = [self.data_label.index(x) for x in inc]
            total += len(inc)
            true += len([x for x in inc_index if x in call])
        r_rate = float(true / total)
        t_rate = float(true / (len(test_paper) * self.topn))
        print("recall: %.2f%%" % (r_rate * 100))
        print("precision: %.2f%%" % (t_rate * 100))
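# Usage sketch for CR; the call order is an assumption about how the class is
# meant to be driven, not part of the original source.
if __name__ == "__main__":
    cr = CR(topn=100)
    cr.init_data_net()   # or cr.init_data_local() once CR/CR_data_lable.bin exists
    cr.train()           # saves CR/CR.model and CR/CR.vector
    cr.test()            # prints recall and precision over col_testPaper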
class PR:
    """
    Output model files:
        PR.model: model parameters
        PR.vector: word vectors
    """
    data_path = "../data_set/pr_train.data"
    model_path = "PR/PR.model"  # cache path for the model parameters
    vector_path = "PR/PR.vector"  # cache path for the word vectors
    model = None
    sims = None  # relevance scores between the target document and the system documents
    topn = 100  # number of results to return
    sort_sim = None  # relevance ranking, document ids only
    sys_docs = [x["url_id"] for x in col_paper.find()]

    def __init__(self, topn=100):
        self.topn = topn

    def init_data_net(self):
        """
        Generate the ar.text file.
        :return:
        """
        pass

    def train(self):
        start = time.time()
        # read the data set directly from the text file
        sentences = LineSentence(self.data_path)
        self.model = Word2Vec(sentences, min_count=2, sg=1, hs=1, size=100, window=5, workers=4, iter=20)
        self.model.save(self.model_path)
        self.model.wv.save_word2vec_format(self.vector_path, binary=False)
        print("model saved, training time:", time.time() - start)

    def test(self):
        # read the test papers once so url_ids and in_citations stay aligned
        test_docs = list(col_testPaper.find())
        test_paper = [x['url_id'] for x in test_docs]
        in_citations = [x['in_citations'] for x in test_docs]
        total = 0
        true = 0
        for t in range(len(test_paper)):
            mode_sim = self.model.wv.most_similar(test_paper[t], topn=self.topn)
            call = [x[0] for x in mode_sim]
            inc = [x for x in in_citations[t] if x in self.sys_docs]
            total += len(inc)
            true += len([x for x in call if x in inc])
        r_rate = float(true / total)
        t_rate = float(true / (len(test_paper) * self.topn))
        print("recall: %.2f%%" % (r_rate * 100))
        print("precision: %.2f%%" % (t_rate * 100))

    def get_model(self):
        if not self.model:
            self.model = Word2Vec.load(self.model_path)
        return self.model
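# Usage sketch for PR; assumes ../data_set/pr_train.data has already been written
# by pr_train_data above. The call order is an assumption, not original code.
if __name__ == "__main__":
    pr = PR(topn=100)
    pr.train()   # trains skip-gram Word2Vec on the citation corpus, saves PR/PR.model
    pr.test()    # prints recall and precision over col_testPaper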