Esempio n. 1
0
    def transform(self, X, copy=True):
        """Convert a matrix of term counts into its tf or tf-idf form.

        Depending on the estimator settings, the stored values are
        log-scaled (``sublinear_tf``), multiplied by the fitted idf
        diagonal (``use_idf``) and row-normalized (``norm``), in that order.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            A matrix of term/token counts
        copy : bool, default=True
            Whether to copy X and operate on the copy or perform in-place
            operations.

        Returns
        -------
        vectors : array-like of shape (n_samples, n_features)

        Raises
        ------
        ValueError
            If ``use_idf`` is set and the number of columns of X differs
            from the size of the fitted idf diagonal.
        """
        matrix = X.copy() if copy else X

        target_dtype = _get_dtype(matrix)
        matrix = self._convert_to_csr(matrix, target_dtype)
        if matrix.dtype != target_dtype:
            matrix = matrix.astype(target_dtype)

        # Only the column count is needed for the idf size check below.
        _, n_features = matrix.shape

        if self.sublinear_tf:
            # Overwrite every stored value v with log(v) + 1; the second
            # positional argument of cp.log is the output buffer.
            cp.log(matrix.data, matrix.data)
            matrix.data += 1

        if self.use_idf:
            self._check_is_idf_fitted()

            fitted_n_features = self._idf_diag.shape[0]
            if n_features != fitted_n_features:
                message = ("Input has n_features=%d while the model"
                           " has been trained with n_features=%d")
                raise ValueError(message % (n_features, fitted_n_features))

            csr_diag_mul(matrix, self._idf_diag, inplace=True)

        # Any other value of ``norm`` (including a falsy one) leaves the
        # rows unnormalized.
        if self.norm == 'l1':
            csr_row_normalize_l1(matrix, inplace=True)
        elif self.norm == 'l2':
            csr_row_normalize_l2(matrix, inplace=True)

        return matrix
Esempio n. 2
0
    def transform(self, X: cupyx.scipy.sparse.csr_matrix, copy=True):
        """Turn a count-based matrix into its c-TF-IDF representation.

        When ``use_idf`` is disabled the input is handed back untouched.
        Otherwise a row-wise L1-normalized copy of X is multiplied by the
        fitted idf diagonal.

        Arguments:
            X (sparse matrix): A matrix of term/token counts.
            copy (bool): Accepted as part of the signature; this
                implementation does not read it.
        Returns:
            X (sparse matrix): A c-TF-IDF matrix
        """
        if not self.use_idf:
            return X

        # inplace=False: the normalization result is a separate matrix.
        row_normalized = csr_row_normalize_l1(X, inplace=False)
        return row_normalized * self._idf_diag
Esempio n. 3
0
    def transform(self, raw_documents):
        """
        Build the document-term matrix for a collection of documents.

        The documents are preprocessed, tokenized and counted through
        ``_count_hash``; the counts are then assembled into a matrix with
        ``self.n_features`` columns. Stored values are optionally
        binarized (``binary``) and row-normalized (``norm``).

        Parameters
        ----------
        raw_documents : cudf.Series
            A Series of string documents

        Returns
        -------
        X : sparse CuPy CSR matrix of shape (n_samples, n_features)
            Document-term matrix.
        """
        # Each intermediate reference is dropped as soon as the next stage
        # has been built from it.
        cleaned_docs = self._preprocess(raw_documents)
        del raw_documents
        n_doc = len(cleaned_docs)

        tokens = self._create_tokenized_df(cleaned_docs)
        del cleaned_docs

        counts = self._count_hash(tokens)
        del tokens

        empty_ids = self._compute_empty_doc_ids(counts, n_doc)
        doc_term = create_csr_matrix_from_count_df(
            counts, empty_ids, n_doc, self.n_features, dtype=self.dtype
        )

        if self.binary:
            # Keep only presence information: every stored entry becomes 1.
            doc_term.data.fill(1)

        # Any other value of ``norm`` (including a falsy one) leaves the
        # rows unnormalized.
        if self.norm == "l1":
            csr_row_normalize_l1(doc_term, inplace=True)
        elif self.norm == "l2":
            csr_row_normalize_l2(doc_term, inplace=True)

        return doc_term