Example #1
    def transform(self, corpus: Corpus):
        """
        Ingest a corpus of documents using existing features.
        Requires that the embedding has been fitted beforehand.

        * TF-IDF embedding of documents is computed and stored.
        * IDF embedding of features is computed and stored.

        Parameters
        ----------
        corpus: :class:`~gismo.corpus.Corpus`
            The corpus to ingest.

        Example
        -------
        >>> from gismo.common import toy_source_text
        >>> corpus = Corpus(toy_source_text)
        >>> embedding = IdfEmbedding()
        >>> embedding.fit_transform(corpus)
        >>> [embedding.features[i] for i in embedding.x.indices[:8]]
        ['gizmo', 'mogwaï', 'blade', 'sentence', 'sentence', 'shadoks', 'comparing', 'gizmo']
        >>> small_corpus = Corpus(["I only talk about Yoda", "Gizmo forever!"])
        >>> embedding.transform(small_corpus)
        >>> [embedding.features[i] for i in embedding.x.indices]
        ['yoda', 'gizmo']
        """
        # Sanity check: a non-empty corpus is required
        assert corpus

        # THE FIT PART
        # Start with a simple CountVectorizer X
        x = self.vectorizer.transform(corpus.iterate_text())
        # Release stop_words_ from vectorizer
        self.vectorizer.stop_words_ = None
        # Extract number of documents and features
        (self.n, _) = x.shape
        # PART OF TRANSFORM (shared step): apply sublinear smoothing
        x.data = 1 + np.log(x.data)
        # Build y in CSC (column-major) format; the actual transpose happens below
        self.y = x.tocsc()

        # THE TRANSFORM PART
        idf_transform(indptr=self.y.indptr,
                      data=self.y.data,
                      idf_vector=self.idf)
        # Convert back to CSR to obtain x
        self.x = self.y.tocsr(copy=True)
        # Transpose y
        self.y = self.y.T
        # Normalize
        self.x_norm = l1_normalize(indptr=self.x.indptr, data=self.x.data)
        self.y_norm = l1_normalize(indptr=self.y.indptr, data=self.y.data)
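
For intuition, here is a minimal, self-contained sketch of the weighting that transform applies: sublinear term-frequency smoothing, IDF scaling, then L1 normalization. It only approximates the effect of the gismo helpers idf_fit, idf_transform and l1_normalize with plain numpy/scipy (the exact IDF smoothing formula below is an assumption and may differ from gismo's); it is not the library implementation.

    import numpy as np
    from scipy.sparse import csr_matrix

    counts = csr_matrix(np.array([[2, 0, 1],
                                  [0, 3, 1]], dtype=float))
    n_docs = counts.shape[0]

    # Sublinear smoothing: replace each stored count c by 1 + log(c).
    counts.data = 1 + np.log(counts.data)

    # Document frequency of each feature. In CSC form this is
    # np.diff(indptr), which is what idf_fit reads off; here we
    # use bincount on the CSR column indices instead.
    df = np.bincount(counts.indices, minlength=counts.shape[1])
    idf = np.log(1 + n_docs / df)  # assumed smoothing formula

    # IDF scaling, then L1 normalization of each document row.
    x = counts.multiply(idf).tocsr()
    row_sums = np.asarray(x.sum(axis=1)).ravel()
    x = x.multiply(1 / row_sums[:, None]).tocsr()
    print(np.round(x.toarray(), 3))

After normalization each row of x sums to 1, so every document becomes a probability distribution over its features.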
Example #2
    def fit_transform(self, corpus: Corpus):
        """


        Parameters
        ----------
        corpus

        Returns
        -------

        Examples
        --------
        >>> from gismo.common import toy_source_text
        >>> corpus = Corpus(toy_source_text)
        >>> embedding = IdfEmbedding()
        >>> embedding.fit_transform(corpus)
        >>> embedding.x  # doctest: +NORMALIZE_WHITESPACE
        <5x21 sparse matrix of type '<class 'numpy.float64'>'
            with 25 stored elements in Compressed Sparse Row format>
        >>> embedding.features[:8]
        ['blade', 'chinese', 'comparing', 'demon', 'folklore', 'gizmo', 'gremlins', 'inside']

        The idf embedding behaves like the traditional embedding in the
        documents-to-features direction, but the features-to-documents
        direction is not biased by document length.

        >>> from gismo.embedding import Embedding
        >>> idtf_embedding = Embedding()
        >>> idtf_embedding.fit_transform(corpus)

        On the y side, observe the heterogeneous distribution of the idtf
        embedding and the uniform distribution of the idf embedding.

        >>> idtf_embedding.y[15, :].data
        array([0.46299901, 0.46299901, 0.07400197])
        >>> embedding.y[15, :].data
        array([0.33333333, 0.33333333, 0.33333333])

        On the x side, the embeddings are the same.

        >>> idtf_embedding.x[-1, :].data
        array([0.27541155, 0.27541155, 0.27541155, 0.17376534])
        >>> embedding.x[-1, :].data
        array([0.27541155, 0.27541155, 0.27541155, 0.17376534])
        """
        if self.vectorizer is None:
            self.vectorizer = auto_vect(corpus)

        # THE FIT PART
        # Start with a simple CountVectorizer X
        x = self.vectorizer.fit_transform(corpus.iterate_text())
        # Release stop_words_ from vectorizer
        self.vectorizer.stop_words_ = None
        # Populate vocabulary
        self.features = self.vectorizer.get_feature_names()
        # Extract number of documents and features
        (self.n, self.m) = x.shape
        # PART OF TRANSFORM (shared step): apply sublinear smoothing
        x.data = 1 + np.log(x.data)
        # Build y in CSC (column-major) format; the actual transpose happens below
        self.y = x.tocsc()
        # Compute IDF
        self.idf = idf_fit(self.y.indptr, self.n)

        # THE TRANSFORM PART
        idf_transform(indptr=self.y.indptr,
                      data=self.y.data,
                      idf_vector=self.idf)
        # Convert back to CSR to obtain x
        self.x = self.y.tocsr(copy=True)
        # Transpose y
        self.y = self.y.T
        # Normalize
        self.x_norm = l1_normalize(indptr=self.x.indptr, data=self.x.data)
        self.y_norm = l1_normalize(indptr=self.y.indptr, data=self.y.data)
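
The uniform y row in the doctest above can be read off the construction. Feature 15 appears to occur exactly once in each of three documents (an assumption inferred from the doctest output): sublinear smoothing then gives 1 + log(1) = 1 for every entry, the IDF transform puts one shared constant down the whole column, and L1 normalization turns three equal entries into three 1/3's. A minimal sketch:

    import numpy as np

    tf = 1 + np.log(np.array([1.0, 1.0, 1.0]))  # one occurrence per document
    weighted = tf * 0.7                         # any shared IDF constant
    print(weighted / weighted.sum())            # -> [1/3, 1/3, 1/3]

The heterogeneous values of the idtf embedding come from the extra per-document weighting of the traditional embedding; that weighting is exactly the document-length bias the idf variant removes on the y side.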