def inverse_transform(self, y, delayed=True):
    """
    Convert the data back to the original representation.
    In case unknown categories are encountered (all zeros in the
    one-hot encoding), ``None`` is used to represent this category.

    Parameters
    ----------
    y : dask_cudf.Series
        The string representation of the categories.
    delayed : bool (default = True)
        Whether to execute as a delayed task or eager.

    Returns
    -------
    X_tr : dask_cudf.Series
        Distributed object containing the inverse transformed array.
    """
    if self._get_internal_model() is not None:
        return self._inverse_transform(y,
                                       delayed=delayed,
                                       output_collection_type='cudf')
    else:
        msg = ("This LabelEncoder instance is not fitted yet. Call 'fit' "
               "with appropriate arguments before using this estimator.")
        raise NotFittedError(msg)
def transform(self, y, delayed=True):
    """
    Transform an input into its categorical keys.

    This is intended for use with small inputs relative to the size of the
    dataset. For fitting and transforming an entire dataset, prefer
    `fit_transform`.

    Parameters
    ----------
    y : dask_cudf.Series
        Input keys to be transformed. Its values should match the
        categories given to `fit`.
    delayed : bool (default = True)
        Whether to execute as a delayed task or eager.

    Returns
    -------
    encoded : dask_cudf.Series
        The ordinally encoded input series.

    Raises
    ------
    KeyError
        If a category appears that was not seen in `fit`.
    """
    if self._get_internal_model() is not None:
        return self._transform(y,
                               delayed=delayed,
                               output_dtype='int32',
                               output_collection_type='cudf')
    else:
        msg = ("This LabelEncoder instance is not fitted yet. Call 'fit' "
               "with appropriate arguments before using this estimator.")
        raise NotFittedError(msg)
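# A minimal round-trip sketch covering `fit`, `transform`, and
# `inverse_transform` above. It assumes a Dask-CUDA cluster/client is already
# running; the import path `cuml.dask.preprocessing.LabelEncoder` and the
# helper name `_example_label_round_trip` are illustrative assumptions, not
# part of the library.
def _example_label_round_trip():
    import cudf
    import dask_cudf
    from cuml.dask.preprocessing import LabelEncoder  # assumed import path

    ddf = dask_cudf.from_cudf(cudf.Series(["a", "b", "a", "c"]),
                              npartitions=2)
    le = LabelEncoder().fit(ddf)
    codes = le.transform(ddf)             # dask_cudf.Series of int32 codes
    labels = le.inverse_transform(codes)  # back to the original strings
    return labels.compute()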
def sample(self, n_samples=1, random_state=None):
    """
    Generate random samples from the model.
    Currently, this is implemented only for gaussian and tophat kernels,
    and the Euclidean metric.

    Parameters
    ----------
    n_samples : int, default=1
        Number of samples to generate.
    random_state : int, cupy RandomState instance or None, default=None
        Seed or generator used for reproducible sampling.

    Returns
    -------
    X : cupy array of shape (n_samples, n_features)
        List of samples.
    """
    if not hasattr(self, "X_"):
        raise NotFittedError()
    supported_kernels = ["gaussian", "tophat"]
    if (self.kernel not in supported_kernels
            or self.metric != "euclidean"):
        raise NotImplementedError(
            "Only {} kernels, and the euclidean"
            " metric are supported.".format(supported_kernels))

    if isinstance(random_state, cp.random.RandomState):
        rng = random_state
    else:
        rng = cp.random.RandomState(random_state)

    u = rng.uniform(0, 1, size=n_samples)
    if self.sample_weight_ is None:
        i = (u * self.X_.shape[0]).astype(np.int64)
    else:
        cumsum_weight = cp.cumsum(self.sample_weight_)
        sum_weight = cumsum_weight[-1]
        i = cp.searchsorted(cumsum_weight, u * sum_weight)
    if self.kernel == "gaussian":
        return cp.atleast_2d(rng.normal(self.X_[i], self.bandwidth))

    elif self.kernel == "tophat":
        # We first draw points from a d-dimensional normal distribution,
        # then use an incomplete gamma function to map them to a uniform
        # d-dimensional tophat distribution.
        has_scipy(raise_if_unavailable=True)
        dim = self.X_.shape[1]
        X = rng.normal(size=(n_samples, dim))
        s_sq = cp.einsum("ij,ij->i", X, X).get()

        # Do this on the CPU because we don't have a gammainc function
        # readily available on the GPU.
        correction = cp.array(
            gammainc(0.5 * dim, 0.5 * s_sq) ** (1.0 / dim)
            * self.bandwidth / np.sqrt(s_sq))
        return self.X_[i] + X * correction[:, np.newaxis]
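# A hedged usage sketch for `sample` above. The import path
# `cuml.neighbors.KernelDensity` is an assumption, and `_example_sample` is a
# hypothetical helper for illustration only.
def _example_sample():
    import cupy as cp
    from cuml.neighbors import KernelDensity  # assumed import path

    X = cp.random.RandomState(0).standard_normal((100, 2))
    kde = KernelDensity(kernel="gaussian", bandwidth=0.5).fit(X)
    # Draw 10 new points; pass an int seed or a cupy RandomState for
    # reproducibility, per the docstring above.
    samples = kde.sample(n_samples=10, random_state=0)
    return samples  # cupy array of shape (10, 2)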
def transform(self, raw_documents):
    """
    Transform documents to document-term matrix.

    Extract token counts out of raw text documents using the vocabulary
    fitted with fit or the one provided to the constructor.

    Parameters
    ----------
    raw_documents : cudf.Series
        A Series of string documents.

    Returns
    -------
    X : cupy csr array of shape (n_samples, n_features)
        Document-term matrix.
    """
    if not hasattr(self, "vocabulary_"):
        if self.vocabulary is not None:
            self.vocabulary_ = self.vocabulary
        else:
            raise NotFittedError()

    docs = self._preprocess(raw_documents)
    n_doc = len(docs)
    tokenized_df = self._create_tokenized_df(docs)
    count_df = self._count_vocab(tokenized_df)
    empty_doc_ids = self._compute_empty_doc_ids(count_df, n_doc)
    X = create_csr_matrix_from_count_df(count_df, empty_doc_ids, n_doc,
                                        len(self.vocabulary_),
                                        dtype=self.dtype)
    if self.binary:
        X.data.fill(1)
    return X
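# A hedged usage sketch for the GPU CountVectorizer transform above. The
# import path `cuml.feature_extraction.text.CountVectorizer` is an
# assumption, and `_example_count_vectorize` is a hypothetical helper.
def _example_count_vectorize():
    import cudf
    from cuml.feature_extraction.text import CountVectorizer  # assumed path

    docs = cudf.Series(["gpu text mining", "text mining on gpu", ""])
    cv = CountVectorizer().fit(docs)
    X = cv.transform(docs)  # cupy CSR matrix, one row per document
    # Empty documents are tracked via _compute_empty_doc_ids above and come
    # back as all-zero rows rather than being dropped.
    return X.shape          # (3, n_features)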
def _check_is_idf_fitted(self):
    if not hasattr(self, 'idf_'):
        msg = ("This TfidfTransformer instance is not fitted or the "
               "value of use_idf is not consistent between "
               ".fit() and .transform().")
        raise NotFittedError(msg)
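# A hedged sketch of how the guard above is satisfied: fitting with
# use_idf=True populates `idf_`, so a subsequent transform passes the check.
# The import path and the acceptance of a cupyx CSR input are assumptions,
# and `_example_idf_guard` is a hypothetical helper.
def _example_idf_guard():
    import cupy as cp
    import cupyx.scipy.sparse as cusparse
    from cuml.feature_extraction.text import TfidfTransformer  # assumed path

    counts = cusparse.csr_matrix(
        cp.asarray([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]], dtype=cp.float32))
    tfidf = TfidfTransformer(use_idf=True).fit(counts)
    # fit() ran with use_idf=True, so `idf_` exists and transform() does not
    # raise NotFittedError.
    return tfidf.transform(counts)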
def _check_is_fitted(self):
    if not self._fitted or self.train is None:
        msg = ("This LabelEncoder instance is not fitted yet. Call 'fit' "
               "with appropriate arguments before using this estimator.")
        raise NotFittedError(msg)
def score_samples(self, X):
    """Compute the log-likelihood of each sample under the model.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        An array of points to query. Last dimension should match dimension
        of training data (n_features).

    Returns
    -------
    density : ndarray of shape (n_samples,)
        Log-likelihood of each sample in `X`. These are normalized to be
        probability densities, so values will be low for high-dimensional
        data.
    """
    if not hasattr(self, "X_"):
        raise NotFittedError()
    X_cuml = input_to_cuml_array(X)
    if self.metric_params:
        if len(self.metric_params) != 1:
            raise ValueError(
                "Cuml only supports metrics with a single arg.")
        metric_arg = list(self.metric_params.values())[0]
        distances = pairwise_distances(X_cuml.array,
                                       self.X_,
                                       metric=self.metric,
                                       metric_arg=metric_arg)
    else:
        distances = pairwise_distances(X_cuml.array,
                                       self.X_,
                                       metric=self.metric)

    distances = cp.asarray(distances)

    h = self.bandwidth
    if self.kernel in log_probability_kernels_:
        distances = log_probability_kernels_[self.kernel](distances, h)
    else:
        raise ValueError("Unsupported kernel.")

    log_probabilities = cp.zeros(distances.shape[0])
    if self.sample_weight_ is not None:
        distances += cp.log(self.sample_weight_)
    logsumexp_kernel.forall(log_probabilities.size)(distances,
                                                    log_probabilities)
    # Note that sklearn's user guide is wrong: it says the (unnormalised)
    # probability output for the kernel density is sum(K(x,h)). In fact
    # what they implement is (1/n)*sum(K(x,h)). Here we divide by n in
    # normal probability space, which becomes -log(n) in log probability
    # space.
    sum_weights = (cp.sum(self.sample_weight_)
                   if self.sample_weight_ is not None
                   else distances.shape[1])
    log_probabilities -= np.log(sum_weights)

    # norm
    if len(X_cuml.array.shape) == 1:
        # if X is one dimensional, we have 1 feature
        dimension = 1
    else:
        dimension = X_cuml.array.shape[1]
    log_probabilities = norm_log_probabilities(log_probabilities,
                                               self.kernel, h, dimension)

    return log_probabilities
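# A hedged scoring sketch for `score_samples` above, pairing with the earlier
# sampling example. The import path is assumed and `_example_score` is a
# hypothetical helper.
def _example_score():
    import cupy as cp
    from cuml.neighbors import KernelDensity  # assumed import path

    rng = cp.random.RandomState(0)
    X_train = rng.standard_normal((200, 3))
    kde = KernelDensity(kernel="gaussian", bandwidth=1.0).fit(X_train)
    # Log-densities of held-out points; they are normalized probability
    # densities, so values shrink as dimensionality grows.
    log_dens = kde.score_samples(rng.standard_normal((5, 3)))
    return cp.exp(log_dens)  # back to plain density values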