def load(conf: Configuration, force: Optional[bool] = False, persist: Optional[bool] = True) -> "TFIDFRanker": model_path = conf.path_models + 'vsm_tfidf/' + conf.get_desc() + '/' if force or (not os.path.exists(model_path)) \ or (not os.path.isfile(model_path + 'corpus.mm')) \ or (not os.path.isfile(model_path + 'tfidf.model')): utils.mk_dir_if_not_exists(model_path) dataset = TFIDFRanker.extractor.load_dataset(conf=conf) dictionary = corpora.Dictionary([Ranker.get_text(conf, data) for (index, data) in dataset.iterrows()]) bow_corpus = [(dictionary.doc2bow(Ranker.get_text(conf, data)), data['filename']) for (index, data) in dataset.iterrows()] bow_corpus, names = map(list, zip(*bow_corpus)) index_mapping = TFIDFRanker.build_index_mapping(names) corpora.MmCorpus.serialize(model_path + 'corpus.mm', bow_corpus) mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm') tfidf_model = TfidfModel(mm_corpus, ) tfidf_index = SparseMatrixSimilarity(tfidf_model[mm_corpus], num_features=mm_corpus.num_terms) ranker = TFIDFRanker(dictionary=dictionary, bow_corpus=mm_corpus, model=tfidf_model, index=tfidf_index, index_mapping=index_mapping, conf=conf) ranker.persist(model_path) logging.info('TFIDFRanker : initialized') logging.info('TFIDFRanker : model : {}'.format(tfidf_model)) logging.info('TFIDFRanker : index : {}'.format(tfidf_index)) return ranker else: dictionary = corpora.Dictionary.load(model_path + 'dict.dictionary') mm_corpus = corpora.MmCorpus(model_path+ 'corpus.mm') tfidf_model = TfidfModel.load(model_path + 'tfidf.model') tfidf_index = SparseMatrixSimilarity.load(model_path + 'tfidf.index') with open(model_path + 'index_mapping.pickle', mode='rb') as file: index_mapping = pickle.load(file) logging.info('TFIDFRanker : initialized') return TFIDFRanker(dictionary=dictionary,bow_corpus=mm_corpus, model=tfidf_model,index=tfidf_index,index_mapping=index_mapping,conf=conf)
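
# A minimal, self-contained sketch of the gensim query flow the loader above
# sets up (Dictionary -> doc2bow -> TfidfModel -> SparseMatrixSimilarity).
# The helper name, toy corpus, and query are illustrative, not project code.
def _tfidf_query_demo():
    docs = [['state', 'machine', 'dsl'],
            ['grammar', 'parser', 'dsl'],
            ['machine', 'learning']]
    dictionary = corpora.Dictionary(docs)
    bow = [dictionary.doc2bow(d) for d in docs]
    model = TfidfModel(bow)
    index = SparseMatrixSimilarity(model[bow], num_features=len(dictionary))
    query = dictionary.doc2bow(['machine', 'dsl'])
    return index[model[query]]  # cosine similarity of the query to each doc
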
def load(conf: Configuration, force: Optional[bool] = False, persist: Optional[bool] = True) -> "BM25OkapiRanker": model_path = conf.path_models + 'vsm_bm25okapi/' + conf.get_desc() + '/' if force or (not os.path.exists(model_path)) \ or (not os.path.isfile(model_path + 'bm25okapi.pickle')) \ or (not os.path.isfile(model_path + 'bm25okapi_index_mapping.pickle')): utils.mk_dir_if_not_exists(model_path) dataset = BM25OkapiRanker.extractor.load_dataset(conf=conf) bow_corpus = [(Ranker.get_text(conf, data), data['filename']) for (index, data) in dataset.iterrows()] bow_corpus, names = map(list, zip(*bow_corpus)) index_mapping = BM25OkapiRanker.build_index_mapping(names) bm25 = BM25Okapi(bow_corpus) logging.info('BM25OkapiRanker : initialized') bm25_ranker = BM25OkapiRanker(model=bm25, index_mapping=index_mapping, conf=conf) bm25_ranker.persist(model_path) return bm25_ranker else: with open(model_path + 'bm25okapi.pickle', mode='rb') as file: bm25 = pickle.load(file) logging.info('BM25OkapiRanker : loading bm25okapi.pickle from {}'.format(model_path)) with open(model_path + 'bm25okapi_index_mapping.pickle', mode='rb') as file: index_mapping = pickle.load(file) logging.info('BM25OkapiRanker : loading bm25_index_mapping.pickle from {}'.format(model_path)) logging.info('BM25OkapiRanker : initialized') return BM25OkapiRanker(model=bm25, index_mapping=index_mapping, conf=conf)
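
# A minimal sketch of the rank_bm25 scoring path the loader above relies on.
# BM25Okapi takes a pre-tokenized corpus and get_scores() returns one score
# per document; the helper name and toy corpus are illustrative only.
def _bm25_query_demo():
    corpus = [['state', 'machine', 'dsl'],
              ['grammar', 'parser'],
              ['machine', 'learning']]
    bm25 = BM25Okapi(corpus)
    return bm25.get_scores(['machine', 'dsl'])  # one BM25 score per document
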
def get_probs_from_top(self, indexes: List[int]) -> Dict[Text, float]:
    # Estimate unigram probabilities from the texts of the top-k ranked documents.
    top_indexes = indexes[:self.k]
    results = self.dataset[self.dataset['fileindex'].isin(top_indexes)]
    words = []
    for index, data in results.iterrows():
        words.extend(Ranker.get_text(self.conf, data))
    counter = Counter(words)
    total = len(words)  # 'total' avoids shadowing the built-in sum()
    return {word: count / total for word, count in counter.items()}
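
# A toy check of the estimator above, independent of the Ranker class: counts
# over the pooled top-k tokens, normalized by the pooled length. The helper
# name and tokens are illustrative.
def _unigram_probs_demo():
    words = ['state', 'machine', 'state']  # pooled tokens from the top-k docs
    counter, total = Counter(words), len(words)
    return {w: c / total for w, c in counter.items()}  # {'state': 2/3, 'machine': 1/3}
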
def create_counter(conf: Configuration, dataset: pd.DataFrame) -> Tuple[Dict[Text, int], Dict[Text, float]]:
    # Token counts and unigram probabilities over the whole dataset.
    texts = [Ranker.get_text(conf, data) for (index, data) in dataset.iterrows()]
    counter: Dict[Text, int] = {}
    total = 0  # total token count; avoids shadowing the built-in sum()
    for t_list in texts:
        total += len(t_list)
        for token in t_list:
            counter[token] = counter.get(token, 0) + 1
    probs = {token: count / total for token, count in counter.items()}
    return counter, probs
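
# create_counter is equivalent to pooling all token lists into a single
# collections.Counter; a compact cross-check on toy input (the helper name
# and tokens are illustrative, assuming get_text returns token lists):
def _create_counter_demo():
    texts = [['a', 'b'], ['a']]
    counter = Counter(t for t_list in texts for t in t_list)   # {'a': 2, 'b': 1}
    total = sum(len(t_list) for t_list in texts)               # 3
    return dict(counter), {t: c / total for t, c in counter.items()}
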
def inverted_index(conf: Configuration, dataset: pd.DataFrame) -> Dict[Text, Set[Text]]:
    # Map each token to the set of filenames it occurs in (a posting list).
    inv_index: Dict[Text, Set[Text]] = {}
    for index, data in dataset.iterrows():
        text = Ranker.get_text(conf, data)
        file_name = data['filename']
        for token in text:
            inv_index.setdefault(token, set()).add(file_name)
    return inv_index
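
# A toy illustration of the posting-list shape built above; the helper name
# and document names are illustrative only.
def _inverted_index_demo():
    docs = {'a.dsl': ['state', 'machine'], 'b.dsl': ['machine']}
    inv: Dict[Text, Set[Text]] = {}
    for name, tokens in docs.items():
        for token in tokens:
            inv.setdefault(token, set()).add(name)
    return inv  # {'state': {'a.dsl'}, 'machine': {'a.dsl', 'b.dsl'}}
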
def load(conf: Configuration, force: Optional[bool] = False, persist: Optional[bool] = True) -> "BoolIWCSRanker": model_path = conf.path_models + 'bool_iwcs/' + conf.get_desc() + '/' if force or (not os.path.exists(model_path)) or \ (not os.path.isfile(model_path + 'inverted_index.pickle')) \ or (not os.path.isfile(model_path + 'corpus.mm')) \ or (not os.path.isfile(model_path + 'tfidf.model')): utils.mk_dir_if_not_exists(model_path) # Create the TFIDF model and dictionary dataset = BoolIWCSRanker.extractor.load_dataset(conf=conf) dictionary = corpora.Dictionary([ Ranker.get_text(conf, data) for (index, data) in dataset.iterrows() ]) bow_corpus = [(dictionary.doc2bow(Ranker.get_text(conf, data)), data['filename']) for (index, data) in dataset.iterrows()] bow_corpus, names = map(list, zip(*bow_corpus)) index_mapping = BoolIWCSRanker.build_index_mapping(names) inverse_index_mapping = BoolIWCSRanker.build_inverse_index_mapping( names) corpora.MmCorpus.serialize(model_path + 'corpus.mm', bow_corpus) mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm') tfidf_model = TfidfModel(mm_corpus, ) tfidf_index = SparseMatrixSimilarity( tfidf_model[mm_corpus], num_features=mm_corpus.num_terms) logging.info('nBOWRanker : TFIDF initialized') logging.info('nBOWRanker : TFIDF model : {}'.format(tfidf_model)) logging.info('nBOWRanker : TFIDF index : {}'.format(tfidf_index)) # Create boolean index inverted_index = BoolIWCSRanker.inverted_index(conf, dataset) bool_dictionary = inverted_index.keys() # Load word2vec embedding and embed the corpus word2vec = KeyedVectors.load_word2vec_format( '../resources/embeddings/GoogleNews-vectors-negative300.bin', binary=True) tfidf_corpus = [tfidf_model[doc] for doc in bow_corpus] doc_embedding = BoolIWCSRanker.embed_corpus( tfidf_corpus, word2vec, dictionary) logging.info('nBOWRanker : Embedded docs shape : {}'.format( doc_embedding.shape)) ranker = BoolIWCSRanker(inverted_index, bool_dictionary, conf, dictionary, bow_corpus, tfidf_model, tfidf_index, index_mapping, inverse_index_mapping, doc_embedding=doc_embedding, model_embedding=word2vec) ranker.persist(model_path) return ranker else: dictionary = corpora.Dictionary.load(model_path + 'dict.dictionary') mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm') tfidf_model = TfidfModel.load(model_path + 'tfidf.model') tfidf_index = SparseMatrixSimilarity.load(model_path + 'tfidf.index') with open(model_path + 'index_mapping.pickle', mode='rb') as file: index_mapping = pickle.load(file) logging.info('nBOWRanker : TFIDF indexmap initialized') with open(model_path + 'inverse_index_mapping.pickle', mode='rb') as file: inverse_index_mapping = pickle.load(file) logging.info('nBOWRanker : TFIDF invindexmap initialized') with open(model_path + 'inverted_index.pickle', mode='rb') as file: inverted_index = pickle.load(file) bool_dictionary = inverted_index.keys() doc_embedding = np.load(model_path + 'doc_embedding.npy') logging.info('nBOWRanker : Doc embeddings loaded') word2vec = KeyedVectors.load_word2vec_format( '../resources/embeddings/GoogleNews-vectors-negative300.bin', binary=True) logging.info('nBOWRanker : Embedding model loaded') return BoolIWCSRanker(inverted_index, bool_dictionary, conf, dictionary, mm_corpus, tfidf_model, tfidf_index, index_mapping, inverse_index_mapping, doc_embedding=doc_embedding, model_embedding=word2vec)
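
# BoolIWCSRanker.embed_corpus is not shown in this excerpt. The sketch below
# is an assumption about its behavior: a tf-idf-weighted average of word2vec
# vectors per document, which is the standard way to combine the tfidf_corpus,
# KeyedVectors model, and dictionary passed to it above. Only the gensim
# KeyedVectors API (vector_size, `in`, item lookup) is taken as given.
def _embed_corpus_sketch(tfidf_corpus, word2vec, dictionary):
    embeddings = np.zeros((len(tfidf_corpus), word2vec.vector_size))
    for i, doc in enumerate(tfidf_corpus):        # doc: [(token_id, weight), ...]
        total_weight = 0.0
        for token_id, weight in doc:
            token = dictionary[token_id]
            if token in word2vec:                 # skip out-of-vocabulary tokens
                embeddings[i] += weight * word2vec[token]
                total_weight += weight
        if total_weight > 0:
            embeddings[i] /= total_weight         # weighted average per document
    return embeddings
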