def transform(self, corpus: Corpus):
    """
    Ingest a corpus of documents using the features of a previous fit.
    Requires that the embedding has been fitted beforehand.

    * The TF-IDF embedding of documents is computed and stored.
    * The TF-ITF embedding of features is computed and stored.

    Parameters
    ----------
    corpus: :class:`~gismo.corpus.Corpus`
        The corpus to ingest.

    Examples
    --------
    >>> from gismo.common import toy_source_text
    >>> corpus = Corpus(toy_source_text)
    >>> embedding = IdfEmbedding()
    >>> embedding.fit_transform(corpus)
    >>> [embedding.features[i] for i in embedding.x.indices[:8]]
    ['gizmo', 'mogwaï', 'blade', 'sentence', 'sentence', 'shadoks', 'comparing', 'gizmo']
    >>> small_corpus = Corpus(["I only talk about Yoda", "Gizmo forever!"])
    >>> embedding.transform(small_corpus)
    >>> [embedding.features[i] for i in embedding.x.indices]
    ['yoda', 'gizmo']
    """
    # THE FIT PART
    # A non-empty corpus is required (the features themselves come from a previous fit)
    assert corpus
    # Start with a simple CountVectorizer X
    x = self.vectorizer.transform(corpus.iterate_text())
    # Release stop_words_ from vectorizer
    self.vectorizer.stop_words_ = None
    # Extract the number of documents (the number of features is inherited from the fit)
    (self.n, _) = x.shape
    # PART OF TRANSFORM, MUTUALIZED: apply sublinear smoothing to the term counts
    x.data = 1 + np.log(x.data)
    # Compute transposed CountVectorizer Y
    self.y = x.tocsc()
    # THE TRANSFORM PART
    # Reuse the idf vector computed during the fit
    idf_transform(indptr=self.y.indptr, data=self.y.data, idf_vector=self.idf)
    # Back to x
    self.x = self.y.tocsr(copy=True)
    # Transpose y
    self.y = self.y.T
    # Normalize and keep the norms for later use
    self.x_norm = l1_normalize(indptr=self.x.indptr, data=self.x.data)
    self.y_norm = l1_normalize(indptr=self.y.indptr, data=self.y.data)
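# A minimal numpy sketch of the weighting pipeline used in transform above.
# The real internals of idf_transform and l1_normalize live elsewhere in this
# module; the formulas below (sublinear tf, idf scaling, l1 normalization) are
# illustrative assumptions, shown on a single toy count vector.
#
#     import numpy as np
#     counts = np.array([3.0, 1.0, 2.0])   # raw term counts for one document
#     idf = np.array([0.4, 1.1, 0.7])      # one idf weight per feature, from the fit
#     tf = 1 + np.log(counts)              # sublinear smoothing, as applied to x.data
#     tfidf = tf * idf                     # idf scaling (the role of idf_transform)
#     tfidf = tfidf / tfidf.sum()          # l1 normalization (the role of l1_normalize)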
def fit_transform(self, corpus: Corpus):
    """
    Ingest a corpus of documents: fit the vocabulary and idf weights, then
    compute and store the embeddings. Equivalent to a fit followed by a
    transform, with the computations they share performed only once.

    Parameters
    ----------
    corpus: :class:`~gismo.corpus.Corpus`
        The corpus to ingest.

    Examples
    --------
    >>> from gismo.common import toy_source_text
    >>> corpus = Corpus(toy_source_text)
    >>> embedding = IdfEmbedding()
    >>> embedding.fit_transform(corpus)
    >>> embedding.x  # doctest: +NORMALIZE_WHITESPACE
    <5x21 sparse matrix of type '<class 'numpy.float64'>'
        with 25 stored elements in Compressed Sparse Row format>
    >>> embedding.features[:8]
    ['blade', 'chinese', 'comparing', 'demon', 'folklore', 'gizmo', 'gremlins', 'inside']

    The idf embedding behaves like the traditional embedding from documents to
    features, but it does not bias by document length from features to documents.

    >>> from gismo.embedding import Embedding
    >>> idtf_embedding = Embedding()
    >>> idtf_embedding.fit_transform(corpus)

    Observe the heterogeneous distribution on idtf and the uniform one on idf
    on the y side.

    >>> idtf_embedding.y[15, :].data
    array([0.46299901, 0.46299901, 0.07400197])
    >>> embedding.y[15, :].data
    array([0.33333333, 0.33333333, 0.33333333])

    On the x side, the embeddings are the same.

    >>> idtf_embedding.x[-1, :].data
    array([0.27541155, 0.27541155, 0.27541155, 0.17376534])
    >>> embedding.x[-1, :].data
    array([0.27541155, 0.27541155, 0.27541155, 0.17376534])
    """
    if self.vectorizer is None:
        self.vectorizer = auto_vect(corpus)
    # THE FIT PART
    # Start with a simple CountVectorizer X
    x = self.vectorizer.fit_transform(corpus.iterate_text())
    # Release stop_words_ from vectorizer
    self.vectorizer.stop_words_ = None
    # Populate vocabulary
    self.features = self.vectorizer.get_feature_names()
    # Extract number of documents and features
    (self.n, self.m) = x.shape
    # PART OF TRANSFORM, MUTUALIZED: apply sublinear smoothing to the term counts
    x.data = 1 + np.log(x.data)
    # Compute transposed CountVectorizer Y
    self.y = x.tocsc()
    # Compute IDF
    self.idf = idf_fit(self.y.indptr, self.n)
    # THE TRANSFORM PART
    idf_transform(indptr=self.y.indptr, data=self.y.data, idf_vector=self.idf)
    # Back to x
    self.x = self.y.tocsr(copy=True)
    # Transpose y
    self.y = self.y.T
    # Normalize and keep the norms for later use
    self.x_norm = l1_normalize(indptr=self.x.indptr, data=self.x.data)
    self.y_norm = l1_normalize(indptr=self.y.indptr, data=self.y.data)
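# A minimal scipy sketch of what idf_fit consumes in fit_transform above: in
# the CSC matrix y, consecutive indptr differences give each feature's document
# frequency. The actual idf formula is defined by idf_fit; the log(1 + n / df)
# used below is only an assumed, illustrative choice.
#
#     import numpy as np
#     from scipy.sparse import csc_matrix
#     y = csc_matrix(np.array([[1, 0, 2],
#                              [0, 0, 1]]))    # 2 documents x 3 features
#     df = y.indptr[1:] - y.indptr[:-1]        # document frequencies: [1, 0, 2]
#     n = y.shape[0]
#     idf = np.log(1 + n / np.maximum(df, 1))  # hypothetical idf, guarding empty features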