Ejemplo n.º 1
0
 def fit(self, X, y=None):
     """Learn the vocabulary and the TF-IDF weighting from the documents in X.

     The documents are cleaned and tokenized with ``tu.clean_and_tokenize``
     (using this instance's ``stoplist`` and ``keep_all=False``), a
     ``corpora.Dictionary`` is built from the tokens and pruned with
     ``filter_extremes`` according to ``self.no_above`` / ``self.no_below``,
     and a normalized ``models.TfidfModel`` is trained on the resulting
     bag-of-words corpus.

     Args:
         X: Raw documents, in whatever form ``tu.clean_and_tokenize`` accepts.
         y: Unused; presumably present for scikit-learn style API
             compatibility -- TODO confirm against the enclosing class.

     Returns:
         This instance, with ``self.dictionary`` and ``self.tfidf`` set.
     """
     tokenized_docs = tu.clean_and_tokenize(
         X, stoplist=self.stoplist, keep_all=False
     )

     # Publish the dictionary on the instance first, then prune it in place.
     vocab = corpora.Dictionary(tokenized_docs)
     self.dictionary = vocab
     vocab.filter_extremes(no_above=self.no_above, no_below=self.no_below)

     # Bag-of-words vectors are computed against the pruned vocabulary.
     bow_corpus = list(map(vocab.doc2bow, tokenized_docs))
     self.tfidf = models.TfidfModel(bow_corpus, id2word=vocab, normalize=True)
     return self
Ejemplo n.º 2
0
 def transform(self, X):
     """Turn the documents in X into a TF-IDF weighted sparse matrix.

     The documents are cleaned and tokenized with ``tu.clean_and_tokenize``
     (using this instance's ``stoplist``), converted to bag-of-words vectors
     with ``self.dictionary``, weighted by ``self.tfidf`` and packed into a
     sparse matrix whose term dimension is ``len(self.dictionary)``.

     Args:
         X: Raw documents, in whatever form ``tu.clean_and_tokenize`` accepts.

     Returns:
         The transpose of the matrix built by ``matutils.corpus2csc``
         (per the original author's note, a CSR matrix).
     """
     tokenized_docs = tu.clean_and_tokenize(X, stoplist=self.stoplist)

     tfidf_model = self.tfidf
     bow_corpus = [self.dictionary.doc2bow(tokens) for tokens in tokenized_docs]
     weighted_corpus = tfidf_model[bow_corpus]

     # Build the sparse matrix with a fixed term count, then transpose it
     # before returning (the result stays sparse; no dense conversion).
     term_major = matutils.corpus2csc(
         weighted_corpus, num_terms=len(self.dictionary)
     )
     return term_major.T