def __init__(self, vec_file, pap, pat, pro):
    """Load the word vectors and the three Annoy indexes, then build helpers.

    Args:
        vec_file: Word2vec-format vector file, read with ``binary=True``.
        pap: Location handed to ``AnnoyIndexer.load`` for the paper index.
        pat: Location handed to ``AnnoyIndexer.load`` for the patent index.
        pro: Location handed to ``AnnoyIndexer.load`` for the project index.
    """
    def open_index(location):
        # Every index is a separate AnnoyIndexer populated via load().
        indexer = AnnoyIndexer()
        indexer.load(location)
        return indexer

    # A KeyedVectors-based loader
    # (gensim.models.KeyedVectors.load_word2vec_format) was previously
    # left here commented out; the Word2Vec loader below is the active one.
    self.wm = gensim.models.word2vec.Word2Vec.load_word2vec_format(
        vec_file, binary=True)

    self.paper_index = open_index(pap)
    self.patent_index = open_index(pat)
    self.project_index = open_index(pro)

    # Text-to-vector converter built on the loaded word model.
    self.t2v = Convert2Vec(self.wm)
    self.cuttor = FilterCut()
    self.db = DB()
    self.featureIndex = self.buildFeatureIndex()
예제 #2
0
 def get_author_by_sql(self, typee, ids):
     """Return whatever ``DB.getAuthors(typee, ids)`` yields.

     A new ``DB`` object is created for every call. Per the original
     note, the information is fetched from MySQL.
     """
     return DB().getAuthors(typee, ids)
예제 #3
0
    def expertDocsSort(self, expertId, txt, topN):
        """Rank an expert's papers, patents and projects against ``txt``.

        The query text is vectorised with ``self.t2v.text2v``. For each
        category the rows are fetched from a fresh ``DB``, ordered by
        ascending Annoy distance between the query vector and the vector of
        the row's text (columns 1 and 2 joined), and the leading rows are
        converted to dicts.

        Args:
            expertId: Identifier passed to ``DB.getPapers``,
                ``DB.getPatents`` and ``DB.getProjects``.
            txt: Query text to compare the documents with.
            topN: Number of entries to keep per category. Collection stops
                once that many entries exist, so a value that is never
                reached (e.g. a negative number) keeps every row.

        Returns:
            dict: ``{'papers': [...], 'patents': [...], 'projects': [...]}``
            where each list holds dicts of UTF-8 encoded column values
            (``None`` columns are passed through as ``None``).
        """
        # (output key, row column) pairs; the order fixes the key order of
        # each emitted dict.
        paperFields = (('paperId', 0), ('name', 1), ('authors', 4),
                       ('journalName', 5), ('year', 6))
        patentFields = (('patentId', 0), ('publicationNo', 4), ('name', 1),
                        ('inventors', 5), ('applicant', 6), ('year', 7))
        projectFields = (('projectId', 0), ('name', 1), ('member', 4),
                         ('unit', 5), ('year', 6), ('type', 7))

        queryVec = self.t2v.text2v(txt, self.cuttor)
        db = DB()

        papers = self._rankDocsByDistance(queryVec, db.getPapers(expertId))
        patents = self._rankDocsByDistance(queryVec, db.getPatents(expertId))
        projects = self._rankDocsByDistance(queryVec,
                                            db.getProjects(expertId))

        return {
            'papers': self._formatTopDocs(papers, paperFields, topN),
            'patents': self._formatTopDocs(patents, patentFields, topN),
            'projects': self._formatTopDocs(projects, projectFields, topN),
        }

    def _rankDocsByDistance(self, queryVec, docs):
        """Return ``docs`` as a list ordered by distance to ``queryVec``.

        Rows are never modified, so tuple rows work as well as lists. A row
        whose text yields no vector gets an infinite distance and therefore
        sorts after every comparable row instead of breaking the sort. When
        ``queryVec`` itself is ``None`` nothing can be compared and the rows
        keep their incoming order.
        """
        unranked = float('inf')
        index = None
        if queryVec is not None:
            # Size the scratch index from the query vector rather than a
            # fixed 200 so other embedding widths work; 'angular' is
            # Annoy's default metric, spelled out explicitly.
            index = AnnoyIndex(len(queryVec), metric='angular')
            index.add_item(0, queryVec)

        scored = []
        slot = 1  # item 0 is the query; documents take 1, 2, ...
        for doc in docs:
            distance = unranked
            if index is not None:
                # Skip missing columns so a None name/second column does
                # not raise on concatenation.
                text = ''.join(
                    part for part in (doc[1], doc[2]) if part is not None)
                docVec = self.t2v.text2v(text, self.cuttor)
                if docVec is not None:
                    index.add_item(slot, docVec)
                    distance = index.get_distance(0, slot)
                    slot += 1
            scored.append((distance, doc))

        # Sort on the distance only; list.sort is stable, so ties keep the
        # order the rows arrived in.
        scored.sort(key=lambda pair: pair[0])
        return [doc for _, doc in scored]

    @staticmethod
    def _formatTopDocs(docs, fields, topN):
        """Convert the leading rows of ``docs`` into dicts.

        ``fields`` is a sequence of ``(key, column)`` pairs. Each value is
        UTF-8 encoded unless it is ``None``. Collection stops as soon as
        ``topN`` entries have been produced.
        """
        formatted = []
        for doc in docs:
            if len(formatted) == topN:
                break
            entry = {}
            for key, column in fields:
                value = doc[column]
                entry[key] = value if value is None else value.encode('utf8')
            formatted.append(entry)
        return formatted