Esempio n. 1
0
 def load(conf: Configuration, force: Optional[bool] = False,
          persist: Optional[bool] = True) -> "TFIDFRanker":
     """Load a TFIDFRanker, rebuilding its artifacts from the dataset when needed.

     Model artifacts live under ``<conf.path_models>/vsm_tfidf/<conf.get_desc()>/``.
     When ``force`` is true, or either required artifact (``corpus.mm``,
     ``tfidf.model``) is missing, the whole pipeline is rebuilt; otherwise the
     persisted artifacts are loaded from disk.

     Args:
         conf: Project configuration (model paths, preprocessing description).
         force: Rebuild the model even if persisted artifacts exist.
         persist: Write freshly built artifacts to disk. (Bug fix: this flag
             was previously accepted but ignored — persisting happened
             unconditionally.)

     Returns:
         An initialized TFIDFRanker.
     """
     model_path = conf.path_models + 'vsm_tfidf/' + conf.get_desc() + '/'
     rebuild = force \
         or not os.path.exists(model_path) \
         or not os.path.isfile(model_path + 'corpus.mm') \
         or not os.path.isfile(model_path + 'tfidf.model')
     if rebuild:
         utils.mk_dir_if_not_exists(model_path)
         dataset = TFIDFRanker.extractor.load_dataset(conf=conf)
         # Vocabulary over the tokenized text of every document.
         dictionary = corpora.Dictionary(
             [Ranker.get_text(conf, data) for (_, data) in dataset.iterrows()])
         pairs = [(dictionary.doc2bow(Ranker.get_text(conf, data)), data['filename'])
                  for (_, data) in dataset.iterrows()]
         bow_corpus, names = map(list, zip(*pairs))
         index_mapping = TFIDFRanker.build_index_mapping(names)
         # Serialize, then reopen so the ranker holds a disk-backed corpus.
         corpora.MmCorpus.serialize(model_path + 'corpus.mm', bow_corpus)
         mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm')
         tfidf_model = TfidfModel(mm_corpus)
         tfidf_index = SparseMatrixSimilarity(tfidf_model[mm_corpus],
                                              num_features=mm_corpus.num_terms)
         ranker = TFIDFRanker(dictionary=dictionary, bow_corpus=mm_corpus,
                              model=tfidf_model, index=tfidf_index,
                              index_mapping=index_mapping, conf=conf)
         if persist:  # honour the flag instead of always writing to disk
             ranker.persist(model_path)
         logging.info('TFIDFRanker : initialized')
         # Lazy %-style args: formatting is skipped when INFO is disabled.
         logging.info('TFIDFRanker : model : %s', tfidf_model)
         logging.info('TFIDFRanker : index : %s', tfidf_index)
         return ranker
     else:
         dictionary = corpora.Dictionary.load(model_path + 'dict.dictionary')
         mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm')
         tfidf_model = TfidfModel.load(model_path + 'tfidf.model')
         tfidf_index = SparseMatrixSimilarity.load(model_path + 'tfidf.index')
         with open(model_path + 'index_mapping.pickle', mode='rb') as file:
             index_mapping = pickle.load(file)
         logging.info('TFIDFRanker : initialized')
         return TFIDFRanker(dictionary=dictionary, bow_corpus=mm_corpus,
                            model=tfidf_model, index=tfidf_index,
                            index_mapping=index_mapping, conf=conf)
Esempio n. 2
0
async def load_index():
    """Return the cached similarity index, loading it on first use.

    Awaits the pending ``tasks['index']`` task for the index file path,
    deserializes the index, and repoints every shard's ``dirname`` at the
    directory the file actually lives in before caching it in ``model``.
    """
    if 'index' in model:
        return model['index']
    path = await tasks['index']
    index = SparseMatrixSimilarity.load(path)
    model['index'] = index
    base_dir = os.path.dirname(path)
    for shard in index.shards:
        shard.dirname = base_dir
    return model['index']
Esempio n. 3
0
    def load(self, dir_path):
        """Restore the vocabulary, TF-IDF model, and similarity index.

        Reads the three artifacts from ``dir_path`` using the filenames
        declared on the class (``VOCAB_FNAME``, ``TFIDF_FNAME``,
        ``INDEX_FNAME``) and assigns them to the matching attributes.
        """
        base = Path(dir_path)
        self.vocab = Dictionary.load(str(base / self.VOCAB_FNAME))
        self.model = TfidfModel.load(str(base / self.TFIDF_FNAME))
        self.index = SparseMatrixSimilarity.load(str(base / self.INDEX_FNAME))
Esempio n. 4
0
 def load_index(self, index_path, url_path):
     """Load the similarity index and the URL model from disk.

     Deserializes the index at ``index_path`` and the URL collection at
     ``url_path``, storing them on ``self.index`` and ``self.urls``.
     """
     index = SparseMatrixSimilarity.load(index_path)
     urls = load_model(url_path)
     self.index = index
     self.urls = urls
Esempio n. 5
0
 def load(conf: Configuration,
          force: Optional[bool] = False,
          persist: Optional[bool] = True) -> "BoolIWCSRanker":
     """Load a BoolIWCSRanker, rebuilding every artifact when needed.

     Artifacts live under ``<conf.path_models>/bool_iwcs/<conf.get_desc()>/``.
     When ``force`` is true, or any required artifact (inverted index, corpus,
     TF-IDF model) is missing, the full pipeline is rebuilt: TF-IDF model +
     similarity index, boolean inverted index, and word2vec document
     embeddings. Otherwise all artifacts are loaded from disk.

     Args:
         conf: Project configuration (model paths, preprocessing description).
         force: Rebuild the model even if persisted artifacts exist.
         persist: Write freshly built artifacts to disk. (Bug fix: the flag
             was previously accepted but ignored — persisting happened
             unconditionally.)

     Returns:
         An initialized BoolIWCSRanker.
     """
     model_path = conf.path_models + 'bool_iwcs/' + conf.get_desc() + '/'
     # Single source of truth for the embedding location: both the rebuild
     # and the reload branch need the same pretrained word2vec binary.
     embedding_path = ('../resources/embeddings/'
                       'GoogleNews-vectors-negative300.bin')
     rebuild = force \
         or not os.path.exists(model_path) \
         or not os.path.isfile(model_path + 'inverted_index.pickle') \
         or not os.path.isfile(model_path + 'corpus.mm') \
         or not os.path.isfile(model_path + 'tfidf.model')
     if rebuild:
         utils.mk_dir_if_not_exists(model_path)
         # --- TF-IDF model and dictionary ---------------------------------
         dataset = BoolIWCSRanker.extractor.load_dataset(conf=conf)
         dictionary = corpora.Dictionary([
             Ranker.get_text(conf, data)
             for (_, data) in dataset.iterrows()
         ])
         pairs = [(dictionary.doc2bow(Ranker.get_text(conf, data)),
                   data['filename'])
                  for (_, data) in dataset.iterrows()]
         bow_corpus, names = map(list, zip(*pairs))
         index_mapping = BoolIWCSRanker.build_index_mapping(names)
         inverse_index_mapping = BoolIWCSRanker.build_inverse_index_mapping(
             names)
         # Serialize, then reopen so the ranker holds a disk-backed corpus.
         corpora.MmCorpus.serialize(model_path + 'corpus.mm', bow_corpus)
         mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm')
         tfidf_model = TfidfModel(mm_corpus)
         tfidf_index = SparseMatrixSimilarity(
             tfidf_model[mm_corpus], num_features=mm_corpus.num_terms)
         logging.info('nBOWRanker : TFIDF initialized')
         # Lazy %-style args: formatting is skipped when INFO is disabled.
         logging.info('nBOWRanker : TFIDF model : %s', tfidf_model)
         logging.info('nBOWRanker : TFIDF index : %s', tfidf_index)
         # --- Boolean inverted index --------------------------------------
         inverted_index = BoolIWCSRanker.inverted_index(conf, dataset)
         bool_dictionary = inverted_index.keys()
         # --- word2vec embedding of the TF-IDF-weighted corpus ------------
         word2vec = KeyedVectors.load_word2vec_format(embedding_path,
                                                      binary=True)
         tfidf_corpus = [tfidf_model[doc] for doc in bow_corpus]
         doc_embedding = BoolIWCSRanker.embed_corpus(
             tfidf_corpus, word2vec, dictionary)
         logging.info('nBOWRanker : Embedded docs shape : %s',
                      doc_embedding.shape)
         ranker = BoolIWCSRanker(inverted_index,
                                 bool_dictionary,
                                 conf,
                                 dictionary,
                                 bow_corpus,
                                 tfidf_model,
                                 tfidf_index,
                                 index_mapping,
                                 inverse_index_mapping,
                                 doc_embedding=doc_embedding,
                                 model_embedding=word2vec)
         if persist:  # honour the flag instead of always writing to disk
             ranker.persist(model_path)
         return ranker
     else:
         dictionary = corpora.Dictionary.load(model_path +
                                              'dict.dictionary')
         mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm')
         tfidf_model = TfidfModel.load(model_path + 'tfidf.model')
         tfidf_index = SparseMatrixSimilarity.load(model_path +
                                                   'tfidf.index')
         with open(model_path + 'index_mapping.pickle', mode='rb') as file:
             index_mapping = pickle.load(file)
         logging.info('nBOWRanker : TFIDF indexmap initialized')
         with open(model_path + 'inverse_index_mapping.pickle',
                   mode='rb') as file:
             inverse_index_mapping = pickle.load(file)
         logging.info('nBOWRanker : TFIDF invindexmap initialized')
         with open(model_path + 'inverted_index.pickle', mode='rb') as file:
             inverted_index = pickle.load(file)
         bool_dictionary = inverted_index.keys()
         doc_embedding = np.load(model_path + 'doc_embedding.npy')
         logging.info('nBOWRanker : Doc embeddings loaded')
         word2vec = KeyedVectors.load_word2vec_format(embedding_path,
                                                      binary=True)
         logging.info('nBOWRanker : Embedding model loaded')
         return BoolIWCSRanker(inverted_index,
                               bool_dictionary,
                               conf,
                               dictionary,
                               mm_corpus,
                               tfidf_model,
                               tfidf_index,
                               index_mapping,
                               inverse_index_mapping,
                               doc_embedding=doc_embedding,
                               model_embedding=word2vec)