def __init__(self, vec_file, pap, pat, pro):
    """Load the word-vector model and the three Annoy indexes, then build helpers.

    Args:
        vec_file: Word2vec-format vector file, read with ``binary=True``.
        pap: Argument handed to ``load`` on the paper index.
        pat: Argument handed to ``load`` on the patent index.
        pro: Argument handed to ``load`` on the project index.
    """
    # Alternative loader that was previously left commented out here:
    #   gensim.models.KeyedVectors.load_word2vec_format(vec_file, binary=True)
    load_vectors = gensim.models.word2vec.Word2Vec.load_word2vec_format
    self.wm = load_vectors(vec_file, binary=True)

    # One AnnoyIndexer per document kind; each is attached to the instance
    # first and then loaded from the location supplied by the caller.
    index_sources = (
        ('paper_index', pap),
        ('patent_index', pat),
        ('project_index', pro),
    )
    for attribute, source in index_sources:
        indexer = AnnoyIndexer()
        setattr(self, attribute, indexer)
        indexer.load(source)

    self.t2v = Convert2Vec(self.wm)
    self.cuttor = FilterCut()
    self.db = DB()
    # Built last: buildFeatureIndex is a method of this object and may read
    # any of the attributes assigned above.
    self.featureIndex = self.buildFeatureIndex()
def get_author_by_sql(self, typee, ids):
    """Fetch author information through a newly created ``DB`` object.

    The original inline comment notes that MySQL is used to obtain the
    information.

    Args:
        typee: Forwarded unchanged as the first argument of ``DB.getAuthors``.
        ids: Forwarded unchanged as the second argument of ``DB.getAuthors``.

    Returns:
        Whatever ``DB.getAuthors(typee, ids)`` returns.
    """
    return DB().getAuthors(typee, ids)
def expertDocsSort(self, expertId, txt, topN):
    """Rank one expert's papers, patents and projects against a query text.

    The query text and every document's text (row column 1 followed by row
    column 2) are turned into vectors with ``self.t2v.text2v``.  Documents of
    each kind are ordered by the Annoy distance between their vector and the
    query vector, smallest distance first, and the leading ``topN`` are
    formatted into dictionaries.

    Documents whose text yields no vector are placed after every ranked
    document (ties keep the order returned by the database).  If the query
    text itself yields no vector, nothing can be ranked and the documents
    are returned in database order.

    Args:
        expertId: Forwarded to ``DB.getPapers``, ``DB.getPatents`` and
            ``DB.getProjects``.
        txt: Query text the documents are compared with.
        topN: Maximum number of documents returned per kind.  The cut-off is
            an equality check on the number collected so far, so a value that
            is never reached (for example a negative one) returns everything.

    Returns:
        dict with the keys ``'papers'``, ``'patents'`` and ``'projects'``.
        Each value is a list of dicts whose values are the UTF-8 encoded row
        columns, or ``None`` where the column is ``None``.
    """
    # (output key, row column) pairs for each document kind.  Column 1 is the
    # name; column 2 is only used as ranking text (presumably an abstract or
    # summary -- confirm against the DB queries); column 3 is not read.
    paper_fields = (
        ('paperId', 0),
        ('name', 1),
        ('authors', 4),
        ('journalName', 5),
        ('year', 6),
    )
    patent_fields = (
        ('patentId', 0),
        ('publicationNo', 4),
        ('name', 1),
        ('inventors', 5),
        ('applicant', 6),
        ('year', 7),
    )
    project_fields = (
        ('projectId', 0),
        ('name', 1),
        ('member', 4),
        ('unit', 5),
        ('year', 6),
        ('type', 7),
    )

    # Dimensionality given to AnnoyIndex; assumes text2v produces vectors of
    # this length -- TODO confirm it matches the loaded word-vector model.
    vector_size = 200
    # Sort key for documents that cannot be compared with the query.  Using
    # +inf sends them to the end instead of sorting on None, which ranks
    # them first on Python 2 and raises TypeError on Python 3.
    unranked = float('inf')

    query_vec = self.t2v.text2v(txt, self.cuttor)

    def encode(value):
        # None columns stay None; anything else is UTF-8 encoded.
        return value if value is None else value.encode('utf8')

    def rank(rows, fields):
        # Score every row against the query, sort, and format the top rows.
        scored = []
        if query_vec is None:
            # No query vector: keep database order rather than failing
            # inside AnnoyIndex.add_item.
            scored = [(unranked, row) for row in rows]
        else:
            # A fresh index per document kind; item 0 is always the query
            # and only serves as the reference point for get_distance.
            index = AnnoyIndex(vector_size)
            index.add_item(0, query_vec)
            next_item = 1
            for row in rows:
                # A None column is treated as empty text so one missing
                # field does not abort the whole ranking.
                text = (row[1] or '') + (row[2] or '')
                doc_vec = self.t2v.text2v(text, self.cuttor)
                distance = unranked
                if doc_vec is not None:
                    index.add_item(next_item, doc_vec)
                    distance = index.get_distance(0, next_item)
                    next_item += 1
                scored.append((distance, row))

        # list.sort is stable, so equal distances keep database order.
        scored.sort(key=lambda pair: pair[0])

        top = []
        for _, row in scored:
            if len(top) == topN:
                break
            top.append({key: encode(row[column]) for key, column in fields})
        return top

    db = DB()
    return {
        'papers': rank(db.getPapers(expertId), paper_fields),
        'patents': rank(db.getPatents(expertId), patent_fields),
        'projects': rank(db.getProjects(expertId), project_fields),
    }