def load(conf: Configuration, force: Optional[bool] = False,
         persist: Optional[bool] = True) -> "TFIDFRanker":
    model_path = conf.path_models + 'vsm_tfidf/' + conf.get_desc() + '/'
    if force or (not os.path.exists(model_path)) \
            or (not os.path.isfile(model_path + 'corpus.mm')) \
            or (not os.path.isfile(model_path + 'tfidf.model')):
        utils.mk_dir_if_not_exists(model_path)
        # Build the dictionary and bag-of-words corpus from the dataset
        dataset = TFIDFRanker.extractor.load_dataset(conf=conf)
        dictionary = corpora.Dictionary([Ranker.get_text(conf, data) for (index, data) in dataset.iterrows()])
        bow_corpus = [(dictionary.doc2bow(Ranker.get_text(conf, data)), data['filename'])
                      for (index, data) in dataset.iterrows()]
        bow_corpus, names = map(list, zip(*bow_corpus))
        index_mapping = TFIDFRanker.build_index_mapping(names)
        corpora.MmCorpus.serialize(model_path + 'corpus.mm', bow_corpus)
        mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm')
        # Fit the TF-IDF model and build the similarity index
        tfidf_model = TfidfModel(mm_corpus)
        tfidf_index = SparseMatrixSimilarity(tfidf_model[mm_corpus],
                                             num_features=mm_corpus.num_terms)
        ranker = TFIDFRanker(dictionary=dictionary, bow_corpus=mm_corpus,
                             model=tfidf_model, index=tfidf_index, index_mapping=index_mapping, conf=conf)
        ranker.persist(model_path)
        logging.info('TFIDFRanker : initialized')
        logging.info('TFIDFRanker : model : {}'.format(tfidf_model))
        logging.info('TFIDFRanker : index : {}'.format(tfidf_index))
        return ranker
    else:
        # Load the previously persisted artefacts
        dictionary = corpora.Dictionary.load(model_path + 'dict.dictionary')
        mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm')
        tfidf_model = TfidfModel.load(model_path + 'tfidf.model')
        tfidf_index = SparseMatrixSimilarity.load(model_path + 'tfidf.index')
        with open(model_path + 'index_mapping.pickle', mode='rb') as file:
            index_mapping = pickle.load(file)
            logging.info('TFIDFRanker : initialized')
        return TFIDFRanker(dictionary=dictionary, bow_corpus=mm_corpus,
                           model=tfidf_model, index=tfidf_index, index_mapping=index_mapping, conf=conf)
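For illustration only, a loaded TFIDFRanker could be queried roughly as follows. This is a sketch rather than code from the project: query_tokens is assumed to be a token list pre-processed the same way as the corpus text, and index_mapping is assumed to map corpus positions back to filenames.

# Illustrative sketch: query-side use of the artefacts built in load().
query_bow = ranker.dictionary.doc2bow(query_tokens)
query_tfidf = ranker.model[query_bow]                  # re-weight the query with TF-IDF
similarities = ranker.index[query_tfidf]               # cosine similarity against every document
ranked = sorted(enumerate(similarities), key=lambda x: x[1], reverse=True)
top_files = [ranker.index_mapping[doc_id] for doc_id, _ in ranked[:10]]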
def load(conf: Configuration, force: Optional[bool] = False,
         persist: Optional[bool] = True) -> "BM25OkapiRanker":
    model_path = conf.path_models + 'vsm_bm25okapi/' + conf.get_desc() + '/'
    if force or (not os.path.exists(model_path)) \
            or (not os.path.isfile(model_path + 'bm25okapi.pickle')) \
            or (not os.path.isfile(model_path + 'bm25okapi_index_mapping.pickle')):
        utils.mk_dir_if_not_exists(model_path)
        # Tokenise the dataset and fit the BM25 model
        dataset = BM25OkapiRanker.extractor.load_dataset(conf=conf)
        bow_corpus = [(Ranker.get_text(conf, data), data['filename']) for (index, data) in dataset.iterrows()]
        bow_corpus, names = map(list, zip(*bow_corpus))
        index_mapping = BM25OkapiRanker.build_index_mapping(names)
        bm25 = BM25Okapi(bow_corpus)
        logging.info('BM25OkapiRanker : initialized')
        bm25_ranker = BM25OkapiRanker(model=bm25, index_mapping=index_mapping, conf=conf)
        bm25_ranker.persist(model_path)
        return bm25_ranker
    else:
        # Load the previously persisted model and index mapping
        with open(model_path + 'bm25okapi.pickle', mode='rb') as file:
            bm25 = pickle.load(file)
            logging.info('BM25OkapiRanker : loading bm25okapi.pickle from {}'.format(model_path))
        with open(model_path + 'bm25okapi_index_mapping.pickle', mode='rb') as file:
            index_mapping = pickle.load(file)
            logging.info('BM25OkapiRanker : loading bm25okapi_index_mapping.pickle from {}'.format(model_path))
        logging.info('BM25OkapiRanker : initialized')
        return BM25OkapiRanker(model=bm25, index_mapping=index_mapping, conf=conf)
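As a usage sketch only, the rank_bm25 model built above can score a tokenised query with get_scores. The query variable and the top-10 cut-off below are illustrative assumptions.

# Illustrative sketch: ranking documents for a tokenised query with BM25Okapi.
scores = bm25_ranker.model.get_scores(query_tokens)   # one BM25 score per corpus document
ranked = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
top_files = [bm25_ranker.index_mapping[doc_id] for doc_id, _ in ranked[:10]]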
def get_probs_from_top(self, indexes: List[int]):
    # Build a unigram probability distribution over the words of the top-k documents
    top_indexes = indexes[:self.k]
    results = self.dataset.loc[self.dataset['fileindex'].isin(top_indexes)]
    words = []
    for index, data in results.iterrows():
        text = Ranker.get_text(self.conf, data)
        words.extend(text)
    counter = Counter(words)
    total = len(words)
    probs = {word: (count / total) for word, count in counter.items()}
    return probs
def create_counter(conf: Configuration, dataset: pd.DataFrame) -> Tuple[Dict[Text, int], Dict[Text, float]]:
    # Count token occurrences over the whole dataset and derive unigram probabilities
    texts = [Ranker.get_text(conf, data) for (index, data) in dataset.iterrows()]
    counter = {}
    total = 0
    for index, t_list in enumerate(texts):
        total += len(t_list)
        for token in t_list:
            if counter.get(token):
                counter[token] += 1
            else:
                counter[token] = 1
    probs = {token: (count / total) for token, count in counter.items()}
    return counter, probs
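The same counting logic can be reproduced on toy data with collections.Counter. The following self-contained sketch only illustrates the count/probability relationship and is not taken from the project.

# Self-contained illustration of create_counter's output on made-up token lists.
from collections import Counter

docs = [['state', 'machine', 'state'], ['machine', 'event']]   # stand-in for Ranker.get_text output
counter = Counter(token for doc in docs for token in doc)
total = sum(counter.values())
probs = {token: count / total for token, count in counter.items()}
# counter -> {'state': 2, 'machine': 2, 'event': 1}; the probabilities sum to 1.0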
def inverted_index(conf: Configuration,
                   dataset: pd.DataFrame) -> Dict[Text, Set[str]]:
    # Map each token to the set of filenames whose text contains it
    inv_index = {}
    for index, data in dataset.iterrows():
        text = Ranker.get_text(conf, data)
        file_name = data['filename']
        for token in text:
            if not inv_index.get(token):
                inv_index[token] = {file_name}
            else:
                inv_index[token].add(file_name)
    return inv_index
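The index returned above supports boolean AND retrieval via set intersection. A minimal sketch, with made-up query tokens:

# Illustrative boolean AND retrieval over the inverted index built above.
def boolean_and(inv_index, query_tokens):
    postings = [inv_index.get(token, set()) for token in query_tokens]
    return set.intersection(*postings) if postings else set()

# e.g. boolean_and(inv_index, ['state', 'transition']) -> filenames containing both tokens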
def load(conf: Configuration,
         force: Optional[bool] = False,
         persist: Optional[bool] = True) -> "BoolIWCSRanker":
    model_path = conf.path_models + 'bool_iwcs/' + conf.get_desc() + '/'
    if force or (not os.path.exists(model_path)) \
            or (not os.path.isfile(model_path + 'inverted_index.pickle')) \
            or (not os.path.isfile(model_path + 'corpus.mm')) \
            or (not os.path.isfile(model_path + 'tfidf.model')):
        utils.mk_dir_if_not_exists(model_path)
        # Create the TFIDF model and dictionary
        dataset = BoolIWCSRanker.extractor.load_dataset(conf=conf)
        dictionary = corpora.Dictionary(
            [Ranker.get_text(conf, data) for (index, data) in dataset.iterrows()])
        bow_corpus = [(dictionary.doc2bow(Ranker.get_text(conf, data)), data['filename'])
                      for (index, data) in dataset.iterrows()]
        bow_corpus, names = map(list, zip(*bow_corpus))
        index_mapping = BoolIWCSRanker.build_index_mapping(names)
        inverse_index_mapping = BoolIWCSRanker.build_inverse_index_mapping(names)
        corpora.MmCorpus.serialize(model_path + 'corpus.mm', bow_corpus)
        mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm')
        tfidf_model = TfidfModel(mm_corpus)
        tfidf_index = SparseMatrixSimilarity(tfidf_model[mm_corpus],
                                             num_features=mm_corpus.num_terms)
        logging.info('nBOWRanker : TFIDF initialized')
        logging.info('nBOWRanker : TFIDF model : {}'.format(tfidf_model))
        logging.info('nBOWRanker : TFIDF index : {}'.format(tfidf_index))
        # Create boolean index
        inverted_index = BoolIWCSRanker.inverted_index(conf, dataset)
        bool_dictionary = inverted_index.keys()
        # Load word2vec embedding and embed the corpus
        word2vec = KeyedVectors.load_word2vec_format(
            '../resources/embeddings/GoogleNews-vectors-negative300.bin', binary=True)
        tfidf_corpus = [tfidf_model[doc] for doc in bow_corpus]
        doc_embedding = BoolIWCSRanker.embed_corpus(tfidf_corpus, word2vec, dictionary)
        logging.info('nBOWRanker : Embedded docs shape : {}'.format(doc_embedding.shape))
        ranker = BoolIWCSRanker(inverted_index, bool_dictionary, conf, dictionary,
                                bow_corpus, tfidf_model, tfidf_index, index_mapping,
                                inverse_index_mapping, doc_embedding=doc_embedding,
                                model_embedding=word2vec)
        ranker.persist(model_path)
        return ranker
    else:
        # Load the previously persisted TFIDF artefacts, boolean index and embeddings
        dictionary = corpora.Dictionary.load(model_path + 'dict.dictionary')
        mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm')
        tfidf_model = TfidfModel.load(model_path + 'tfidf.model')
        tfidf_index = SparseMatrixSimilarity.load(model_path + 'tfidf.index')
        with open(model_path + 'index_mapping.pickle', mode='rb') as file:
            index_mapping = pickle.load(file)
            logging.info('nBOWRanker : TFIDF indexmap initialized')
        with open(model_path + 'inverse_index_mapping.pickle', mode='rb') as file:
            inverse_index_mapping = pickle.load(file)
            logging.info('nBOWRanker : TFIDF invindexmap initialized')
        with open(model_path + 'inverted_index.pickle', mode='rb') as file:
            inverted_index = pickle.load(file)
            bool_dictionary = inverted_index.keys()
        doc_embedding = np.load(model_path + 'doc_embedding.npy')
        logging.info('nBOWRanker : Doc embeddings loaded')
        word2vec = KeyedVectors.load_word2vec_format(
            '../resources/embeddings/GoogleNews-vectors-negative300.bin', binary=True)
        logging.info('nBOWRanker : Embedding model loaded')
        return BoolIWCSRanker(inverted_index, bool_dictionary, conf, dictionary,
                              mm_corpus, tfidf_model, tfidf_index, index_mapping,
                              inverse_index_mapping, doc_embedding=doc_embedding,
                              model_embedding=word2vec)
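embed_corpus is not shown in this excerpt. The following is only a plausible sketch of a TF-IDF-weighted word2vec document embedding, consistent with the arguments passed above; the weighting scheme is an assumption, not the project's actual implementation.

# Hypothetical sketch of one document embedding; BoolIWCSRanker.embed_corpus may differ.
import numpy as np

def embed_doc(tfidf_doc, word2vec, dictionary, dim=300):
    vec, weight_sum = np.zeros(dim), 0.0
    for term_id, weight in tfidf_doc:              # (term id, TF-IDF weight) pairs
        token = dictionary[term_id]
        if token in word2vec:                      # skip out-of-vocabulary tokens
            vec += weight * word2vec[token]
            weight_sum += weight
    return vec / weight_sum if weight_sum else vec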