def train_paper():
    things = [p["in_citations"] for p in col_paper.find()]
    things = flatten(things, dtype="s")
    things = list(set(things))
    print(len(things))
    p = [y['url_id'] for y in col_paper.find()]
    # Filter out papers that already exist in the database
    things = [x for x in things if x not in p]
    # things = [x['_id'] for x in col_page.find({"used": False})]
    au = PaperPageThread(things, 56)
    s = time()
    au.start()
    e = time()
    print("The time spent on this program is %f s" % (e - s))
def ar_train_data():
    process = 0
    total = col_paper.count_documents({})  # count() is deprecated in newer pymongo
    papers = set(x['url_id'] for x in col_paper.find())  # set for fast membership tests
    with open("../data_set/ar_train.data", "w", encoding='utf-8') as f:
        for p in col_paper.find(no_cursor_timeout=True):
            authors = p['authors']
            r_authors = []
            in_citations = p['in_citations']
            in_citations = [x for x in in_citations if x in papers]
            for oc in in_citations:
                oc_paper = col_paper.find_one({'url_id': oc})
                r_authors += oc_paper['authors']
            r_authors = list(set(r_authors))
            sentence = " ".join(enlarge_author(authors, r_authors))
            f.write(sentence + '\n')
            f.flush()
            process += 1
            print("\rprocessing...\t%.2f%%" % (process / all * 100), end="")
def author():
    things = []
    for p in col_paper.find():
        things += p['authors']
    things = list(set(things))
    au = AuthorPageThread(things, thread_num=56)
    s = time()
    au.start()
    e = time()
    print("The time spent on this program is %f s" % (e - s))
def idx_paper():
    """
        重新对数据库中的所有文档赋予id值
        :return:
        """
    i = 1
    for au in col_paper.find():
        col_author.update_one({'_id': au['_id']}, {"$set": {'id': i}})
        print("\r正在处理:", i, end='')
        i += 1
    print("\n完成id重新赋值")
def pr_train_data():
    process = 0
    total = col_paper.count_documents({})  # count() is deprecated in newer pymongo
    with open("../data_set/pr_train.data", "w", encoding='utf-8') as f:
        for p in col_paper.find():
            url_id = [p['url_id']]
            in_citations = p['in_citations']
            sentence = " ".join(enlarge_author(url_id, in_citations))
            f.write(sentence + '\n')
            f.flush()
            process += 1
            print("\rprocessing...\t%.2f%%" % (process / all * 100), end="")
def get_ar_authors(filter: dict):
    """
    Query the related-author information for every document matching the filter.
    :param filter: MongoDB query document
    :return:
    """
    rs = {}
    papers = set(x['url_id'] for x in col_paper.find())  # set for fast membership tests
    for p in col_paper.find(filter, no_cursor_timeout=True):
        authors = p['authors']
        r_authors = []
        in_citations = p['in_citations']
        in_citations = [x for x in in_citations if x in papers]
        for oc in in_citations:
            oc_paper = col_paper.find_one({'url_id': oc})
            r_authors += oc_paper['authors']
        r_authors = list(set(r_authors))
        ar_authors = list(set(r_authors + authors))
        rs[p['url_id']] = [authors, r_authors, ar_authors]
    if filter == {}:
        with open("../data_set/ar_author.json", "w", encoding='utf-8') as f:
            json.dump(rs, f)
    return rs
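# Example usage: an empty filter processes every paper and also caches the
# result to ../data_set/ar_author.json. The 'year' field below is an
# assumed schema field, for illustration only:
relations = get_ar_authors({})
subset = get_ar_authors({'year': 2018})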
class CR:
    # Note: the following are class-level attributes, shared across instances
    data = []
    data_label = []
    paper_count = 0
    model = None
    sims = None  # similarity of the target document to the system documents
    topn = 100  # number of results to return
    sort_sim = None  # similarity-ranked results (document ids only)
    sys_docs = [x["url_id"] for x in col_paper.find()]
    sys_docs_cr = []

    def __init__(self, topn=100, doc=None):
        """

        :param topn:返回相关结果数
        :param doc: 目标文档,包含标题与摘要组成的字符串
        """
        self.topn = topn
        if doc:
            self.aim_doc = segment(doc)
            self.aim_doc_vec = sent2vec(self.get_model(), words=self.aim_doc)

    def init_data_net(self):
        # Include each document's abstract and title from the remote database in the training data
        for p in col_paper.find():
            sentence = segment(p['abstract']) + segment(p['title'])
            if sentence:
                self.data.append(sentence)
                self.data_label.append(p['url_id'])
        with open("CR/CR_data_lable.bin", 'wb') as f:
            # Cache the data locally; reload later with data, data_label = pickle.load(open("CR_data_lable.bin", 'rb'))
            pickle.dump([self.data, self.data_label], f)
        self.paper_count = len(self.data_label)

    def init_data_local(self):
        """
        当本地已缓存有数据时,直接读取
        :return:
        """
        self.data, self.data_label = pickle.load(
            open("CR/CR_data_lable.bin", 'rb'))

    def get_sys_docs_cr(self):
        if self.sys_docs_cr:
            return self.sys_docs_cr
        for d in self.data:
            self.sys_docs_cr.append(sent2vec(self.model, d))
        return self.sys_docs_cr

    def get_model(self):
        if not self.model:
            self.model = Doc2Vec.load("CR/CR.model")
        return self.model

    def train(self):
        sentences = LabeledLineSentence(self.data, self.data_label)
        # iter=20 sets 20 training passes (note: train() below passes epochs=12)
        self.model = Doc2Vec(size=100,
                             window=10,
                             min_count=3,
                             workers=10,
                             iter=20)
        self.model.build_vocab(sentences)
        print("开始训练...")
        # 训练模型
        start = time.time()
        self.model.train(sentences,
                         total_examples=self.model.corpus_count,
                         epochs=12)

        self.model.save("CR/CR.model")
        self.model.wv.save_word2vec_format("CR/CR.vector")
        print("Model saved; total time:", time.time() - start)

    def most_sim(self, doc_vec):
        # Compute similarity between the document vector and all system documents
        sims = sim(doc_vec, self.get_sys_docs_cr())
        # Sort by similarity, descending
        call = np.argsort(-sims)
        return call[:self.topn]

    def test(self):
        test_paper = [x['url_id'] for x in col_testPaper.find()]
        # Read citations from the test collection so that in_citations[i]
        # lines up with test_paper[i]
        in_citations = [x['in_citations'] for x in col_testPaper.find()]
        true = 0
        total = 0
        for i in range(len(test_paper)):
            t_vec = sent2vec(self.model,
                             self.data[self.data_label.index(test_paper[i])])
            call = self.most_sim(t_vec)
            inc = [x for x in in_citations[i] if x in self.sys_docs]
            inc_index = [self.data_label.index(x) for x in inc]
            total += len(inc)
            true += len([x for x in inc_index if x in call])
        r_rate = true / total
        t_rate = true / (len(test_paper) * self.topn)
        print("Recall: %.2f%%" % (r_rate * 100))
        print("Precision: %.2f%%" % (t_rate * 100))
class PR:
    """
    输出模型文件:
    PR.model:模型参数
    PR.vector:词向量
    """
    data_path = "../data_set/pr_train.data"
    model_path = "PR/PR.model"  # model cache file path
    vector_path = "PR/PR.vector"  # word-vector cache file path
    model = None
    sims = None  # similarity of the target document to the system documents
    topn = 100  # number of results to return
    sort_sim = None  # similarity-ranked results (document ids only)
    sys_docs = [x["url_id"] for x in col_paper.find()]

    def __init__(self, topn=100):
        self.topn = topn

    def init_data_net(self):
        """
        生成ar.text文件
        :return:
        """
        pass

    def train(self):
        start = time.time()
        # Read the dataset directly from the text file
        sentences = LineSentence(self.data_path)
        self.model = Word2Vec(sentences,
                              min_count=2,
                              sg=1,
                              hs=1,
                              size=100,
                              window=5,
                              workers=4,
                              iter=20)

        self.model.save(self.model_path)
        self.model.wv.save_word2vec_format(self.vector_path, binary=False)
        print("模型已保存,训练共花时间:", time.time() - start)

    def test(self):
        test_paper = [x['url_id'] for x in col_testPaper.find()]
        # Read citations from the test collection so that in_citations[t]
        # lines up with test_paper[t]
        in_citations = [x['in_citations'] for x in col_testPaper.find()]
        total = 0
        true = 0
        for t in range(len(test_paper)):
            mode_sim = self.model.wv.most_similar(test_paper[t],
                                                  topn=self.topn)
            call = [x[0] for x in mode_sim]
            inc = [x for x in in_citations[t] if x in self.sys_docs]
            total += len(inc)
            true += len([x for x in call if x in inc])  # accumulate hits across papers
        r_rate = true / total
        t_rate = true / (len(test_paper) * self.topn)
        print("Recall: %.2f%%" % (r_rate * 100))
        print("Precision: %.2f%%" % (t_rate * 100))

    def get_model(self):
        if not self.model:
            self.model = Word2Vec.load(self.model_path)
        return self.model
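# Example end-to-end usage of PR; assumes pr_train_data() has already
# written ../data_set/pr_train.data:
if __name__ == "__main__":
    pr = PR(topn=100)
    pr.train()
    pr.test()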