class TfIdfGloveTransformer():
    """Embed tokenized documents as TF-IDF-weighted sums of word vectors.

    word_embedder is a pretrained gensim.KeyedVectors model (any object
    supporting ``word in embedder`` and ``embedder[word]`` is used the same
    way); dim is the dimension of the vectors returned by word_embedder.
    """

    def __init__(self, word_embedder, dim=300):
        self.word_embedder = word_embedder
        self.dim = dim
        # The attributes below stay None until fit() is called.
        self.word_dict = None
        self.bows = None
        self.tfidf = None
        self.token2id = None

    def fit(self, X, y=None):
        """Fit the dictionary and the TF-IDF model from tokenized documents.

        X is a corpus of tokenized documents; y is accepted but not used.
        Returns self.
        """
        self.word_dict = corpora.Dictionary(X, prune_at=None)
        self.bows = [self.word_dict.doc2bow(doc) for doc in X]
        self.tfidf = TfidfModel(self.bows, normalize=True)
        self.token2id = self.word_dict.token2id
        return self

    def transform(self, X):
        """Return the embedding representation of the documents in X.

        The result is an array of shape (len(X), self.dim). Row i is the sum,
        over every token occurrence in document i, of the token's embedding
        scaled by the token's TF-IDF weight in that document. Tokens that have
        no embedding, or that were not seen by fit() (and therefore have no
        TF-IDF weight), contribute nothing; an empty document yields zeros.
        """
        result = np.zeros((len(X), self.dim))
        for i, doc in enumerate(X):
            bow = self.word_dict.doc2bow(doc)
            # Second argument is the eps threshold; -1 disables thresholding
            # so every term of the bag-of-words keeps its weight.
            weights = dict(self.tfidf.__getitem__(bow, -1))
            for word in doc:
                if word not in self.word_embedder:
                    continue
                # .get() rather than [] so tokens unseen at fit time are
                # skipped instead of raising KeyError.
                weight = weights.get(self.token2id.get(word))
                if weight is None:
                    continue
                # Accumulating in place keeps every contribution at shape
                # (dim,), avoiding a ragged intermediate array.
                result[i] += np.dot(self.word_embedder[word], weight)
        return result
class TFIDF(DocTerm):
    """DocTerm variant whose document vectors are weighted by a gensim TfidfModel.

    Constructor arguments are forwarded to DocTerm; any extra keyword
    arguments are stored verbatim as instance attributes.
    """

    def __init__(self, preprocessor='stopwords~links+hashtags+mentions+lowercase',
                 vocab_size=10000, max_df=0.5, min_df=5, **kwargs):
        super().__init__(preprocessor=preprocessor, vocab_size=vocab_size,
                         max_df=max_df, min_df=min_df)
        # TODO: make tfidf args explicit
        self.__dict__.update(kwargs)
        self.tfidf_model = None

    def fit(self, corpus):
        """Fit the base model on corpus, then the TF-IDF model on its bag-of-words.

        Also stores the TF-IDF matrix of the training corpus in
        ``self.corpus_as_tfidf``. Returns self.
        """
        super().fit(corpus)
        self.tfidf_model = TfidfModel(self.corpus_as_bow, id2word=self.vocab)
        self.corpus_as_tfidf = self.transform(corpus)
        return self

    def is_fit(self):
        """Return True when both the base model and the TF-IDF model are fitted."""
        return super().is_fit() and self.tfidf_model is not None

    def transform(self, corpus):
        """Return the TF-IDF matrix of corpus (transpose of gensim's corpus2csc output).

        If the model is not fitted yet, it is fitted on corpus first.
        """
        if not self.is_fit():
            # fit() already transforms this same corpus and stores the result,
            # so reuse it instead of preprocessing and weighting it twice.
            self.fit(corpus)
            return self.corpus_as_tfidf
        tokens = self.preprocessor.transform(corpus)
        bow = [self.vocab.doc2bow(doc) for doc in tokens]
        docs = [self.tfidf_model[doc] for doc in bow]
        # corpus2csc yields a terms x documents matrix; transpose it so that
        # rows correspond to documents.
        return corpus2csc(docs, num_terms=len(self.vocab)).T

    def _make_main_table_elements(self):
        """Extend the base table data with a 'Weighting' entry set to TF-IDF."""
        data, title = super()._make_main_table_elements()
        data['Weighting'] = ['TF-IDF']
        return data, title
class LMDL_VSM():
    """Vector-space model built on top of an LMDL_Corpus.

    On construction it builds a gensim Dictionary from the processed
    documents, a bag-of-words per file, a term x document sparse matrix of
    raw term frequencies, a TF-IDF model, and the log1p of that matrix.
    """

    def __init__(self):
        self.corpus = LMDL_Corpus()
        self.mapping = Dictionary(self.corpus.get_processed_documents())
        self.bow_list = self._corpus_bow()
        self.inverted_index = self._inverted_index_matrix()
        # id2word must map id -> token; the Dictionary itself provides that
        # (its token2id attribute is the opposite direction).
        self.tfidf_model = TfidfModel(self.bow_list, dictionary=self.mapping,
                                      id2word=self.mapping)
        self.log_inverted_index = log1p(self.inverted_index.tocsc())

    def _corpus_bow(self):
        """Return one bag-of-words per file, in the order of get_filenames()."""
        return [self.mapping.doc2bow(self.corpus.document_terms(filename))
                for filename in self.corpus.get_filenames()]

    def _inverted_index_matrix(self):
        """Return the term-document sparse matrix with TF weight, as CSR."""
        sparse_matrix = corpus2csc(corpus=self.bow_list,
                                   num_terms=self.corpus.vocabulary_size(),
                                   num_docs=self.corpus.number_of_documents())
        return sparse_matrix.tocsr()

    def _term_postings(self, keep_zero):
        """Map each vocabulary term known to the dictionary to (filename, weight) pairs.

        When keep_zero is false, only pairs whose weight is > 0 are kept.
        """
        filenames = self.corpus.get_filenames()
        token2id = self.mapping.token2id
        postings = {}
        for term in self.corpus.get_vocabulary():
            token_id = token2id.get(term)
            if token_id is None:
                # Term is not in the dictionary: it has no row in the matrix.
                continue
            weights = self.inverted_index.getrow(token_id).toarray().tolist()[0]
            postings[term] = [(name, weight)
                              for name, weight in zip(filenames, weights)
                              if keep_zero or weight > 0]
        return postings

    def verbose_inverted_sparse_index(self):
        """Return {term: [(filename, weight), ...]} with one pair per file, zeros included."""
        return self._term_postings(keep_zero=True)

    def verbose_inverted_index(self):
        """Return {term: [(filename, weight), ...]} keeping only weights > 0."""
        return self._term_postings(keep_zero=False)

    def documents_tfidf(self):
        """Export one space-separated TF-IDF vector per file via export_to_txt."""
        term_ids = range(self.corpus.vocabulary_size())
        # enumerate gives each file its own column index directly instead of
        # re-scanning the filename list with .index() on every iteration.
        for column, filename in enumerate(self.corpus.get_filenames()):
            counts = self.inverted_index.getcol(column).toarray().ravel().tolist()
            dense_bow = list(zip(term_ids, counts))
            # eps=-1 disables thresholding so that every term id, including
            # zero-weight ones, keeps its position in the output vector.
            vector = self.tfidf_model.__getitem__(dense_bow, eps=-1)
            # NOTE(review): the output path uses a literal backslash separator
            # and replaces every 'html' substring of the name - confirm intent.
            export_to_txt('tfidf\\' + filename.replace('html', 'txt'),
                          ' '.join(str(weight) for _, weight in vector))

    def documents_log1p(self):
        """Export one space-separated log1p(TF) vector per file via export_to_txt."""
        for column, filename in enumerate(self.corpus.get_filenames()):
            values = self.log_inverted_index.getcol(column).toarray().ravel().tolist()
            # NOTE(review): same path construction caveats as documents_tfidf.
            export_to_txt('locallog\\' + filename.replace('html', 'txt'),
                          ' '.join(str(value) for value in values))