def load(conf: Configuration, force: Optional[bool] = False, persist: Optional[bool] = True) -> "TFIDFRanker":
    """Build a ``TFIDFRanker`` for ``conf`` or restore it from the model cache.

    The cache directory is ``conf.path_models + 'vsm_tfidf/' + conf.get_desc() + '/'``.
    The ranker is restored from that directory only when ``force`` is falsy and
    *every* file the restore path reads is present; otherwise it is rebuilt
    from the dataset returned by ``TFIDFRanker.extractor.load_dataset``.

    Args:
        conf: Configuration supplying ``path_models`` and ``get_desc()``; it is
            also forwarded to the dataset loader, ``Ranker.get_text`` and the
            ranker constructor.
        force: When truthy, rebuild even if a complete cache exists.
        persist: When truthy (the default), call ``ranker.persist(model_path)``
            after a rebuild. ``corpus.mm`` is written regardless, because the
            build itself reads the corpus back through ``MmCorpus``.

    Returns:
        The restored or newly built ``TFIDFRanker``.
    """
    model_path = conf.path_models + 'vsm_tfidf/' + conf.get_desc() + '/'

    # Every artifact the restore branch opens. Checking all of them means a
    # partially written cache triggers a rebuild instead of a crash on load.
    required_files = (
        'dict.dictionary',
        'corpus.mm',
        'tfidf.model',
        'tfidf.index',
        'index_mapping.pickle',
    )

    if not force and all(os.path.isfile(model_path + name) for name in required_files):
        dictionary = corpora.Dictionary.load(model_path + 'dict.dictionary')
        mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm')
        tfidf_model = TfidfModel.load(model_path + 'tfidf.model')
        tfidf_index = SparseMatrixSimilarity.load(model_path + 'tfidf.index')
        # NOTE: pickle executes arbitrary code on load; the model directory
        # must only ever contain files written by this application.
        with open(model_path + 'index_mapping.pickle', mode='rb') as file:
            index_mapping = pickle.load(file)
        logging.info('TFIDFRanker : initialized')
        return TFIDFRanker(dictionary=dictionary, bow_corpus=mm_corpus,
                           model=tfidf_model, index=tfidf_index,
                           index_mapping=index_mapping, conf=conf)

    utils.mk_dir_if_not_exists(model_path)
    dataset = TFIDFRanker.extractor.load_dataset(conf=conf)
    dictionary = corpora.Dictionary(
        [Ranker.get_text(conf, data) for (_, data) in dataset.iterrows()])

    # Collect bag-of-words vectors and file names side by side; two plain
    # lists avoid the zip(*...) unpacking error on an empty dataset.
    bow_corpus = []
    names = []
    for _, data in dataset.iterrows():
        bow_corpus.append(dictionary.doc2bow(Ranker.get_text(conf, data)))
        names.append(data['filename'])
    index_mapping = TFIDFRanker.build_index_mapping(names)

    # Round-trip the corpus through disk so the model and index are built
    # from the same MmCorpus representation that the restore branch uses.
    corpora.MmCorpus.serialize(model_path + 'corpus.mm', bow_corpus)
    mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm')
    tfidf_model = TfidfModel(mm_corpus)
    tfidf_index = SparseMatrixSimilarity(tfidf_model[mm_corpus],
                                         num_features=mm_corpus.num_terms)

    ranker = TFIDFRanker(dictionary=dictionary, bow_corpus=mm_corpus,
                         model=tfidf_model, index=tfidf_index,
                         index_mapping=index_mapping, conf=conf)
    if persist:
        ranker.persist(model_path)
    logging.info('TFIDFRanker : initialized')
    logging.info('TFIDFRanker : model : {}'.format(tfidf_model))
    logging.info('TFIDFRanker : index : {}'.format(tfidf_index))
    return ranker
async def load_index():
    """Return the similarity index cached in ``model['index']``, loading it on first use.

    On a cache miss the index file path is obtained by awaiting
    ``tasks['index']``, the index is loaded with
    ``SparseMatrixSimilarity.load`` and stored in ``model``, and every shard's
    ``dirname`` is set to the directory containing the index file.
    """
    if 'index' in model:
        return model['index']

    index_file = await tasks['index']
    model['index'] = SparseMatrixSimilarity.load(index_file)

    # Repoint each shard at the directory the index file actually lives in
    # (presumably because the saved shards recorded a different location).
    shard_dir = os.path.dirname(index_file)
    for shard in model['index'].shards:
        shard.dirname = shard_dir
    return model['index']
def load(self, dir_path):
    """Restore the vocabulary, TF-IDF model and similarity index from ``dir_path``.

    The file names come from the ``VOCAB_FNAME``, ``TFIDF_FNAME`` and
    ``INDEX_FNAME`` attributes; the loaded objects are stored on ``self`` as
    ``vocab``, ``model`` and ``index``.
    """
    root = Path(dir_path)
    # Resolve all three paths up front, before any file is read.
    vocab_file, tfidf_file, index_file = (
        str(root / fname)
        for fname in (self.VOCAB_FNAME, self.TFIDF_FNAME, self.INDEX_FNAME)
    )

    self.vocab = Dictionary.load(vocab_file)
    self.model = TfidfModel.load(tfidf_file)
    self.index = SparseMatrixSimilarity.load(index_file)
def load_index(self, index_path, url_path):
    """Load the similarity index and its URL data onto this instance.

    ``index_path`` is read with ``SparseMatrixSimilarity.load`` and stored as
    ``self.index``; ``url_path`` is read with ``load_model`` and stored as
    ``self.urls``.
    """
    similarity_index = SparseMatrixSimilarity.load(index_path)
    self.index = similarity_index

    url_data = load_model(url_path)
    self.urls = url_data
def load(conf: Configuration, force: Optional[bool] = False, persist: Optional[bool] = True) -> "BoolIWCSRanker":
    """Build a ``BoolIWCSRanker`` for ``conf`` or restore it from the model cache.

    The cache directory is ``conf.path_models + 'bool_iwcs/' + conf.get_desc() + '/'``.
    The ranker is restored from that directory only when ``force`` is falsy and
    *every* file the restore path reads is present; otherwise the TF-IDF
    model, the inverted index and the document embeddings are rebuilt from the
    dataset returned by ``BoolIWCSRanker.extractor.load_dataset``.

    The word2vec vectors are always read from the fixed relative path
    ``../resources/embeddings/GoogleNews-vectors-negative300.bin``, in both
    the build and the restore branch.

    Args:
        conf: Configuration supplying ``path_models`` and ``get_desc()``; it is
            also forwarded to the dataset loader, ``Ranker.get_text``,
            ``BoolIWCSRanker.inverted_index`` and the ranker constructor.
        force: When truthy, rebuild even if a complete cache exists.
        persist: When truthy (the default), call ``ranker.persist(model_path)``
            after a rebuild. ``corpus.mm`` is written regardless, because the
            build itself reads the corpus back through ``MmCorpus``.

    Returns:
        The restored or newly built ``BoolIWCSRanker``.
    """
    model_path = conf.path_models + 'bool_iwcs/' + conf.get_desc() + '/'
    embedding_path = '../resources/embeddings/GoogleNews-vectors-negative300.bin'

    # Every artifact the restore branch opens. Checking all of them means a
    # partially written cache triggers a rebuild instead of a crash on load.
    required_files = (
        'dict.dictionary',
        'corpus.mm',
        'tfidf.model',
        'tfidf.index',
        'index_mapping.pickle',
        'inverse_index_mapping.pickle',
        'inverted_index.pickle',
        'doc_embedding.npy',
    )

    if not force and all(os.path.isfile(model_path + name) for name in required_files):
        dictionary = corpora.Dictionary.load(model_path + 'dict.dictionary')
        mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm')
        tfidf_model = TfidfModel.load(model_path + 'tfidf.model')
        tfidf_index = SparseMatrixSimilarity.load(model_path + 'tfidf.index')
        # NOTE: pickle executes arbitrary code on load; the model directory
        # must only ever contain files written by this application.
        with open(model_path + 'index_mapping.pickle', mode='rb') as file:
            index_mapping = pickle.load(file)
        logging.info('nBOWRanker : TFIDF indexmap initialized')
        with open(model_path + 'inverse_index_mapping.pickle', mode='rb') as file:
            inverse_index_mapping = pickle.load(file)
        logging.info('nBOWRanker : TFIDF invindexmap initialized')
        with open(model_path + 'inverted_index.pickle', mode='rb') as file:
            inverted_index = pickle.load(file)
        bool_dictionary = inverted_index.keys()
        doc_embedding = np.load(model_path + 'doc_embedding.npy')
        logging.info('nBOWRanker : Doc embeddings loaded')
        word2vec = KeyedVectors.load_word2vec_format(embedding_path, binary=True)
        logging.info('nBOWRanker : Embedding model loaded')
        return BoolIWCSRanker(inverted_index, bool_dictionary, conf,
                              dictionary, mm_corpus, tfidf_model,
                              tfidf_index, index_mapping,
                              inverse_index_mapping,
                              doc_embedding=doc_embedding,
                              model_embedding=word2vec)

    utils.mk_dir_if_not_exists(model_path)

    # Create the TFIDF model and dictionary
    dataset = BoolIWCSRanker.extractor.load_dataset(conf=conf)
    dictionary = corpora.Dictionary(
        [Ranker.get_text(conf, data) for (_, data) in dataset.iterrows()])

    # Collect bag-of-words vectors and file names side by side; two plain
    # lists avoid the zip(*...) unpacking error on an empty dataset.
    bow_corpus = []
    names = []
    for _, data in dataset.iterrows():
        bow_corpus.append(dictionary.doc2bow(Ranker.get_text(conf, data)))
        names.append(data['filename'])
    index_mapping = BoolIWCSRanker.build_index_mapping(names)
    inverse_index_mapping = BoolIWCSRanker.build_inverse_index_mapping(names)

    # Round-trip the corpus through disk so the model and index are built
    # from the same MmCorpus representation that the restore branch uses.
    corpora.MmCorpus.serialize(model_path + 'corpus.mm', bow_corpus)
    mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm')
    tfidf_model = TfidfModel(mm_corpus)
    tfidf_index = SparseMatrixSimilarity(tfidf_model[mm_corpus],
                                         num_features=mm_corpus.num_terms)
    logging.info('nBOWRanker : TFIDF initialized')
    logging.info('nBOWRanker : TFIDF model : {}'.format(tfidf_model))
    logging.info('nBOWRanker : TFIDF index : {}'.format(tfidf_index))

    # Create boolean index
    inverted_index = BoolIWCSRanker.inverted_index(conf, dataset)
    bool_dictionary = inverted_index.keys()

    # Load word2vec embedding and embed the corpus
    word2vec = KeyedVectors.load_word2vec_format(embedding_path, binary=True)
    tfidf_corpus = [tfidf_model[doc] for doc in bow_corpus]
    doc_embedding = BoolIWCSRanker.embed_corpus(tfidf_corpus, word2vec, dictionary)
    logging.info('nBOWRanker : Embedded docs shape : {}'.format(
        doc_embedding.shape))

    # NOTE(review): a fresh build hands the in-memory bow_corpus list to the
    # ranker while the restore branch hands it an MmCorpus; kept as in the
    # original -- confirm the ranker treats both the same way.
    ranker = BoolIWCSRanker(inverted_index, bool_dictionary, conf,
                            dictionary, bow_corpus, tfidf_model,
                            tfidf_index, index_mapping,
                            inverse_index_mapping,
                            doc_embedding=doc_embedding,
                            model_embedding=word2vec)
    if persist:
        ranker.persist(model_path)
    return ranker