def fit(self, X, y=None):
    """Build the vocabulary and fit the TF-IDF model on the documents in X.

    The documents are tokenized with ``tu.clean_and_tokenize`` (using this
    instance's stoplist and ``keep_all=False``), a ``corpora.Dictionary`` is
    built from the tokens and pruned via ``filter_extremes`` with
    ``self.no_above`` / ``self.no_below``, and a normalized
    ``models.TfidfModel`` is then fitted on the bag-of-words corpus.

    Args:
        X: Raw documents, in whatever form ``tu.clean_and_tokenize`` accepts.
        y: Unused by this method (presumably kept for scikit-learn style
            API compatibility).

    Returns:
        This instance, with ``self.dictionary`` and ``self.tfidf`` set.
    """
    tokenized_docs = tu.clean_and_tokenize(
        X,
        stoplist=self.stoplist,
        keep_all=False,
    )

    # The dictionary is stored on the instance first and pruned in place.
    self.dictionary = corpora.Dictionary(tokenized_docs)
    vocabulary = self.dictionary
    vocabulary.filter_extremes(
        no_above=self.no_above,
        no_below=self.no_below,
    )

    # Bag-of-words vectors are built against the already-pruned vocabulary.
    # NOTE(review): tokenized_docs is iterated a second time here; this
    # assumes clean_and_tokenize returns a re-iterable sequence — confirm.
    bow_corpus = [vocabulary.doc2bow(tokens) for tokens in tokenized_docs]
    self.tfidf = models.TfidfModel(
        bow_corpus,
        id2word=vocabulary,
        normalize=True,
    )
    return self
def transform(self, X):
    """Turn the documents in X into a sparse TF-IDF matrix.

    The documents are tokenized with ``tu.clean_and_tokenize`` (using this
    instance's stoplist), mapped to bag-of-words vectors with
    ``self.dictionary``, weighted by ``self.tfidf`` and finally packed into
    a sparse matrix by ``matutils.corpus2csc``.

    Args:
        X: Raw documents, in whatever form ``tu.clean_and_tokenize`` accepts.

    Returns:
        The transpose of the ``matutils.corpus2csc`` result, built with
        ``num_terms=len(self.dictionary)``.
    """
    # NOTE(review): fit() passes keep_all=False to clean_and_tokenize while
    # this call relies on the helper's default — confirm that is intended.
    tokenized_docs = tu.clean_and_tokenize(X, stoplist=self.stoplist)

    weighting = self.tfidf
    vocabulary = self.dictionary
    bow_corpus = [vocabulary.doc2bow(tokens) for tokens in tokenized_docs]
    weighted_corpus = weighting[bow_corpus]

    sparse_matrix = matutils.corpus2csc(
        weighted_corpus,
        num_terms=len(self.dictionary),
    )
    # Per the original author's note the transposed result is a csr matrix;
    # presumably one row per document and one column per term — confirm.
    return sparse_matrix.T