def __init__(self, archPath, modelPath):
    """Set up the retriever around a single doc2vec model.

    Loads the Arch dataset, collects the annotation ids and their texts
    into parallel lists, then builds a ``doc2vecModel`` over the
    ``cutConcateText`` corpus.

    Input:
        archPath: location of the architecture dataset annotation file.
        modelPath: location of the trained doc2vec model.
    """
    dataset = Arch(annotationFile=archPath)
    self.archDataset = dataset
    dataset.reverseCharForAllContext()

    # Snapshot the (annotation id, content) pairs once so that the three
    # lists below stay index-aligned with each other.
    entries = list(dataset.anns.items())
    self.annIdList = [ann_id for ann_id, _ in entries]
    self.corporaList = [item["cutConcateText"] for _, item in entries]
    self.notCutCorporaList = [item["concateText"] for _, item in entries]

    # Keep the model path on the instance and build the model from it.
    self.modelPath = modelPath
    self.model = doc2vecModel(self.corporaList, model_path=self.modelPath)
def __init__(self, archPath, model_weight=None):
    """Set up the retriever around a single lmirBm25Model.

    Loads the Arch dataset, collects the annotation ids and their texts
    into parallel lists, then builds an ``lmirBm25Model`` over the
    ``cutConcateText`` corpus.

    Input:
        archPath: location of the architecture dataset annotation file.
        model_weight: list of model weights [BM25, JM, DIR, ABS].
            Defaults to ``[0.25, 0.25, 0.25, 0.25]`` when omitted or None.
    """
    # A fresh list per call: a list literal used as the default value would
    # be shared by every instance that relies on the default, so mutating
    # one instance's weights would silently change the others.
    if model_weight is None:
        model_weight = [0.25, 0.25, 0.25, 0.25]

    self.archDataset = Arch(annotationFile=archPath)
    self.archDataset.reverseCharForAllContext()

    # Generate the annotation id list and the corpora lists; the three
    # lists are index-aligned.
    self.annIdList = []
    self.corporaList = []
    self.notCutCorporaList = []
    for annotation, content in self.archDataset.anns.items():
        self.annIdList.append(annotation)
        self.corporaList.append(content["cutConcateText"])
        self.notCutCorporaList.append(content["concateText"])

    # Model weights [BM25, JM, DIR, ABS], forwarded to the model.
    self.model_weight = model_weight
    self.model = lmirBm25Model(self.corporaList, modelWeight=self.model_weight)
def __init__(self, archPath, modelPath, model_weight=None):
    """Set up the retriever around two models: lmirBM25 and Doc2Vec.

    Loads the Arch dataset, collects the annotation ids and their texts
    into parallel lists, then builds both an ``lmirBm25Model``
    (``staticModel``) and a ``doc2vecModel`` (``featureModel``) over the
    ``cutConcateText`` corpus.

    Input:
        archPath: location of the architecture dataset annotation file.
        modelPath: location of the trained doc2vec model.
        model_weight: list of model weights [staticModel, featureModel].
            Defaults to ``[0.5, 0.5]`` when omitted or None.
    """
    # A fresh list per call: a list literal used as the default value would
    # be shared by every instance that relies on the default, so mutating
    # one instance's weights would silently change the others.
    if model_weight is None:
        model_weight = [0.5, 0.5]

    self.archDataset = Arch(annotationFile=archPath)
    self.archDataset.reverseCharForAllContext()

    # Generate the annotation id list and the corpora lists; the three
    # lists are index-aligned.
    self.annIdList = []
    self.corporaList = []
    self.notCutCorporaList = []
    for annotation, content in self.archDataset.anns.items():
        self.annIdList.append(annotation)
        self.corporaList.append(content["cutConcateText"])
        self.notCutCorporaList.append(content["concateText"])

    # Model weights [staticModel, featureModel]; only stored here, not
    # passed to either model constructor.
    self.model_weight = model_weight
    self.staticModel = lmirBm25Model(self.corporaList)
    self.featureModel = doc2vecModel(self.corporaList, modelPath)
def retrieve(self, test_text, model_dm, corpus):
    """Score every corpus sample against a query text.

    Both the query and each sample are turned into vectors with
    ``model_dm.infer_vector`` and compared with ``cosine_similarity``.

    Input:
        test_text: query text, passed as-is to ``model_dm.infer_vector``.
        model_dm: object exposing ``infer_vector`` (presumably a trained
            gensim Doc2Vec model -- confirm against the caller).
        corpus: sized iterable of samples, each passed as-is to
            ``model_dm.infer_vector``.

    Output:
        1-D numpy array of length ``len(corpus)``; entry ``i`` is the
        similarity between the query and ``corpus[i]``.
    """
    # Add a leading axis so the vectors are 2-D, as the pairwise
    # similarity call is given 2-D inputs.
    test_vec = np.expand_dims(model_dm.infer_vector(test_text), axis=0)
    sim_array = np.zeros(len(corpus))
    for idx, sample in enumerate(corpus):
        sample_vec = np.expand_dims(model_dm.infer_vector(sample), axis=0)
        similarity = cosine_similarity(test_vec, sample_vec)
        # Extract the single value explicitly: assigning a size-1 array with
        # ndim > 0 into a scalar slot relies on an implicit conversion that
        # NumPy has deprecated.
        sim_array[idx] = np.asarray(similarity).item()
    return sim_array


if __name__ == '__main__':
    # load archdataset
    ArchDataset = Arch(
        annotationFile="../../../Dataset/Arch/DemoData_20201228.json",
        imageFolder=None)
    ArchDataset.reverseCharForAllContext()
    # generate annotation and corpora list
    annIdList = []
    corporaList = []
    corporaList_d2v = []
    # document object type that is convenient for gensim
    TaggededDocument = gensim.models.doc2vec.TaggedDocument
    for i, (annotation, content) in enumerate(ArchDataset.anns.items()):
        annIdList.append(annotation)
        corporaList.append(content["cutConcateText"])
        document = TaggededDocument(content["cutConcateText"], tags=[i])
        corporaList_d2v.append(document)