def __init__(self, archPath, modelPath):
        """
        Initialise the retriever with a single doc2vec model.

        Loads the Arch annotations from ``archPath``, gathers the annotation
        ids together with their ``"cutConcateText"`` and ``"concateText"``
        entries, and builds a ``doc2vecModel`` over the ``"cutConcateText"``
        corpus.

        Input:
            archPath: location of the architecture dataset annotation data
            modelPath: location of the trained doc2vec model

        """
        dataset = Arch(annotationFile=archPath)
        self.archDataset = dataset
        dataset.reverseCharForAllContext()

        # Walk the annotations once, keeping the three lists index-aligned.
        ann_ids, cut_texts, raw_texts = [], [], []
        for ann_id, entry in dataset.anns.items():
            ann_ids.append(ann_id)
            cut_texts.append(entry["cutConcateText"])
            raw_texts.append(entry["concateText"])

        self.annIdList = ann_ids
        self.corporaList = cut_texts
        self.notCutCorporaList = raw_texts

        # Remember where the doc2vec model lives, then build the wrapper.
        self.modelPath = modelPath
        self.model = doc2vecModel(cut_texts, model_path=modelPath)
Exemple #2
0
    def __init__(self, archPath, model_weight=None):
        """
        Initialise the retriever with a single lmirBm25Model.

        Loads the Arch annotations from ``archPath``, gathers the annotation
        ids together with their ``"cutConcateText"`` and ``"concateText"``
        entries, and builds an ``lmirBm25Model`` over the
        ``"cutConcateText"`` corpus.

        Input:
            archPath: location of the architecture dataset annotation data
            model_weight: list of model weights [BM25, JM, DIR, ABS];
                defaults to [0.25, 0.25, 0.25, 0.25] when omitted or None

        """
        # Build the default per call: a list literal in the signature would be
        # one shared object stored on every instance created with the default.
        if model_weight is None:
            model_weight = [0.25, 0.25, 0.25, 0.25]

        self.archDataset = Arch(annotationFile=archPath)
        self.archDataset.reverseCharForAllContext()

        # generate annotation and corpora list (the three lists stay
        # index-aligned because they are filled in the same loop)
        self.annIdList = []
        self.corporaList = []
        self.notCutCorporaList = []
        for annotation, content in self.archDataset.anns.items():
            self.annIdList.append(annotation)
            self.corporaList.append(content["cutConcateText"])
            self.notCutCorporaList.append(content["concateText"])

        # model weight [BM25, JM, DIR, ABS]
        self.model_weight = model_weight

        self.model = lmirBm25Model(self.corporaList,
                                   modelWeight=self.model_weight)
Exemple #3
0
    def __init__(self, archPath, modelPath, model_weight=None):
        """
        Initialise the retriever with two models:
            1. lmirBM25 (stored as ``staticModel``)
            2. Doc2Vec (stored as ``featureModel``)

        Loads the Arch annotations from ``archPath``, gathers the annotation
        ids together with their ``"cutConcateText"`` and ``"concateText"``
        entries, and builds both models over the ``"cutConcateText"`` corpus.

        Input:
            archPath: location of the architecture dataset annotation data
            modelPath: location of the trained doc2vec model
            model_weight: list of model weights [staticModel, featureModel];
                defaults to [0.5, 0.5] when omitted or None

        """
        # Build the default per call: a list literal in the signature would be
        # one shared object stored on every instance created with the default.
        if model_weight is None:
            model_weight = [0.5, 0.5]

        self.archDataset = Arch(annotationFile=archPath)
        self.archDataset.reverseCharForAllContext()

        # generate annotation and corpora list (the three lists stay
        # index-aligned because they are filled in the same loop)
        self.annIdList = []
        self.corporaList = []
        self.notCutCorporaList = []
        for annotation, content in self.archDataset.anns.items():
            self.annIdList.append(annotation)
            self.corporaList.append(content["cutConcateText"])
            self.notCutCorporaList.append(content["concateText"])

        # model weight [staticModel, featureModel]; only stored here, it is
        # not passed to either model constructor below
        self.model_weight = model_weight

        self.staticModel = lmirBm25Model(self.corporaList)
        self.featureModel = doc2vecModel(self.corporaList, modelPath)
    def retrieve(self, test_text, model_dm, corpus):
        test_vec = np.expand_dims(model_dm.infer_vector(test_text), axis=0)

        sim_array = np.zeros(len(corpus))
        for idx, sample in enumerate(corpus):
            sample_vec = np.expand_dims(model_dm.infer_vector(sample), axis=0)
            sim_array[idx] = cosine_similarity(test_vec, sample_vec)

        return sim_array


if __name__ == '__main__':
    # Script entry point: load the Arch demo annotations and build the
    # corpora lists below. NOTE(review): the visible part of the script stops
    # after the lists are built; how they are consumed is not shown here.

    # load archdataset (no image folder is supplied)
    ArchDataset = Arch(
        annotationFile="../../../Dataset/Arch/DemoData_20201228.json",
        imageFolder=None)
    ArchDataset.reverseCharForAllContext()

    # generate annotation and corpora list
    annIdList = []
    corporaList = []
    corporaList_d2v = []
    TaggededDocument = gensim.models.doc2vec.TaggedDocument  # document object type that gensim works with

    # The three lists are filled in the same loop, so position i refers to the
    # same annotation in each; the enumeration index i is also the document tag.
    for i, (annotation, content) in enumerate(ArchDataset.anns.items()):
        annIdList.append(annotation)
        corporaList.append(content["cutConcateText"])
        document = TaggededDocument(content["cutConcateText"], tags=[i])
        corporaList_d2v.append(document)