def sample(self, n_samples=1, random_state=None):
    """
    Draw random samples from the fitted density model.

    Sampling is only implemented for the gaussian and tophat kernels
    combined with the Euclidean metric.

    Parameters
    ----------
    n_samples : int, default=1
        Number of samples to generate.
    random_state : int, cupy RandomState instance or None, default=None
        A ``cp.random.RandomState`` instance is used as-is; any other
        value is passed to ``cp.random.RandomState`` as its seed.

    Returns
    -------
    X : cupy array of shape (n_samples, n_features)
        List of samples.

    Raises
    ------
    NotFittedError
        If the estimator has no ``X_`` attribute.
    NotImplementedError
        If the kernel or the metric is not supported for sampling.
    """
    if not hasattr(self, "X_"):
        raise NotFittedError()

    supported_kernels = ["gaussian", "tophat"]
    kernel_supported = self.kernel in supported_kernels
    if not (kernel_supported and self.metric == "euclidean"):
        raise NotImplementedError(
            "Only {} kernels, and the euclidean"
            " metric are supported.".format(supported_kernels))

    rng = (random_state
           if isinstance(random_state, cp.random.RandomState)
           else cp.random.RandomState(random_state))

    # One uniform draw per sample selects the training point it is
    # centred on (optionally weighted by the sample weights).
    u = rng.uniform(0, 1, size=n_samples)
    if self.sample_weight_ is None:
        i = (u * self.X_.shape[0]).astype(np.int64)
    else:
        cumulative = cp.cumsum(self.sample_weight_)
        total = cumulative[-1]
        i = cp.searchsorted(cumulative, u * total)

    if self.kernel == "gaussian":
        return cp.atleast_2d(rng.normal(self.X_[i], self.bandwidth))

    # The kernel was validated above, so only "tophat" remains here.
    # Points are first drawn from a d-dimensional normal distribution,
    # then an incomplete gamma function maps them to a uniform
    # d-dimensional tophat distribution.
    has_scipy(raise_if_unavailable=True)
    dim = self.X_.shape[1]
    X = rng.normal(size=(n_samples, dim))
    s_sq = cp.einsum("ij,ij->i", X, X).get()

    # Do this on the CPU because we don't have a gammainc function
    # readily available on the device.
    radial_scale = (gammainc(0.5 * dim, 0.5 * s_sq)**(1.0 / dim)
                    * self.bandwidth / np.sqrt(s_sq))
    correction = cp.array(radial_scale)
    return self.X_[i] + X * correction[:, np.newaxis]
def __init__(self, data=None, convert_to_dtype=False,
             convert_index=np.int32, convert_format=True):
    """
    Build the array from a cupy or scipy sparse matrix.

    Parameters
    ----------
    data : cupyx.scipy.sparse or scipy.sparse matrix
        Input sparse matrix. Anything else raises ``ValueError``.
    convert_to_dtype : dtype or False, default=False
        dtype to convert the values to. A falsy value keeps
        ``data.dtype``.
    convert_index : dtype or False, default=np.int32
        dtype to convert ``indptr`` and ``indices`` to. A falsy value
        keeps ``data.indptr.dtype``.
    convert_format : bool, default=True
        When the input is not CSR, convert it to CSR if True, otherwise
        raise ``ValueError``.

    Raises
    ------
    ValueError
        If ``data`` is not a sparse matrix, or is not CSR while
        ``convert_format`` is False.
    """
    is_cupy_sparse = cpx.scipy.sparse.isspmatrix(data)
    is_scipy_sparse = has_scipy() and scipy.sparse.isspmatrix(data)
    if not is_cupy_sparse and not is_scipy_sparse:
        raise ValueError("A sparse matrix is expected as input. "
                         "Received %s" % type(data))

    csr_classes = [cpx.scipy.sparse.csr_matrix]
    if has_scipy():
        csr_classes.append(scipy.sparse.csr_matrix)

    if not isinstance(data, tuple(csr_classes)):
        if not convert_format:
            raise ValueError("Expected CSR matrix but received %s"
                             % type(data))
        # The message names the actual keyword argument
        # (``convert_format``) so users can act on it.
        debug('Received sparse matrix in %s format but CSR is '
              'expected. Data will be converted to CSR, but this '
              'will require additional memory copies. If this '
              'conversion is not desired, set '
              'convert_format=False to raise an exception '
              'instead.' % type(data))
        data = data.tocsr()  # currently only CSR is supported

    if not convert_to_dtype:
        convert_to_dtype = data.dtype

    if not convert_index:
        convert_index = data.indptr.dtype

    # Note: Only 32-bit indexing is supported currently.
    # In CUDA11, Cusparse provides 64-bit function calls
    # but these are not yet used in RAFT/Cuml
    self.indptr, _, _, _ = cuml.common.input_to_cuml_array(
        data.indptr, check_dtype=convert_index,
        convert_to_dtype=convert_index)

    self.indices, _, _, _ = cuml.common.input_to_cuml_array(
        data.indices, check_dtype=convert_index,
        convert_to_dtype=convert_index)

    self.data, _, _, _ = cuml.common.input_to_cuml_array(
        data.data, check_dtype=data.dtype,
        convert_to_dtype=convert_to_dtype)

    self.shape = data.shape
    self.dtype = self.data.dtype
    self.nnz = data.nnz
def predict(self, X) -> CumlArray:
    """
    Predict a class label for every row of the test vectors X.

    Sparse input (scipy or cupyx) is rebuilt as a cupyx COO matrix;
    any other input goes through ``input_to_cupy_array``. The label
    with the highest joint log-likelihood is returned for each row.
    """
    if has_scipy():
        from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
    else:
        from cuml.common.import_utils import dummy_function_always_false \
            as scipy_sparse_isspmatrix

    # todo: use a sparse CumlArray style approach when ready
    # https://github.com/rapidsai/cuml/issues/2216
    sparse_input = (scipy_sparse_isspmatrix(X)
                    or cupyx.scipy.sparse.isspmatrix(X))
    if sparse_input:
        X = X.tocoo()
        row_idx = cp.asarray(X.row, dtype=X.row.dtype)
        col_idx = cp.asarray(X.col, dtype=X.col.dtype)
        values = cp.asarray(X.data, dtype=X.data.dtype)
        X = cupyx.scipy.sparse.coo_matrix((values, (row_idx, col_idx)),
                                          shape=X.shape)
    else:
        X = input_to_cupy_array(X, order='K').array

    jll = self._joint_log_likelihood(X)
    best_class = cp.argmax(jll, axis=1).astype(self.classes_.dtype)

    return invert_labels(best_class, classes=self.classes_)
def predict(self, X) -> CumlArray:
    """
    Predict a class label for every row of the test vectors X.

    Sparse input (scipy or cupyx) is converted with
    ``_convert_x_sparse``; any other input goes through
    ``input_to_cupy_array`` with float32, float64 and int32 as the
    checked dtypes. The label with the highest joint log-likelihood is
    returned for each row.
    """
    if has_scipy():
        from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
    else:
        from cuml.common.import_utils import dummy_function_always_false \
            as scipy_sparse_isspmatrix

    # todo: use a sparse CumlArray style approach when ready
    # https://github.com/rapidsai/cuml/issues/2216
    sparse_input = (scipy_sparse_isspmatrix(X)
                    or cupyx.scipy.sparse.isspmatrix(X))
    if sparse_input:
        X = _convert_x_sparse(X)
    else:
        dense = input_to_cupy_array(
            X, order='K',
            check_dtype=[cp.float32, cp.float64, cp.int32])
        X = dense.array

    jll = self._joint_log_likelihood(X)
    best_class = cp.argmax(jll, axis=1).astype(self.classes_.dtype)

    return invert_labels(best_class, classes=self.classes_)
def to_output(self, output_type='cupy', output_format=None,
              output_dtype=None):
    """
    Convert array to output format

    Parameters
    ----------
    output_type : string
        Format to convert the array to. Acceptable formats are:

        - 'cupy' - to cupy array
        - 'scipy' - to scipy (host) array
        - 'numpy' - treated the same as 'scipy'

    output_format : string, optional { 'coo', 'csc' }
        Optionally convert the output to the specified format.
    output_dtype : string, optional
        Optionally cast the array to a specified dtype, creating
        a copy if necessary.

    Returns
    -------
    ret : cupyx.scipy.sparse or scipy.sparse matrix
        CSR matrix, or COO / CSC when ``output_format`` requests it.

    Raises
    ------
    ValueError
        If ``output_type`` or ``output_format`` is not supported.
    """
    # Treat numpy and scipy as the same
    if output_type == "numpy":
        output_type = "scipy"

    # Validate the arguments before any (potentially expensive)
    # conversion. The message reports the offending output_type, not
    # the dtype.
    if output_type not in ('cupy', 'scipy'):
        raise ValueError("Unsupported output_type: %s" % output_type)

    if output_format not in (None, 'coo', 'csc'):
        raise ValueError("Output format %s not supported"
                         % output_format)

    if output_dtype is None:
        output_dtype = self.data.dtype

    cuml_arr_output_type = 'numpy' if output_type == 'scipy' else 'cupy'

    data = self.data.to_output(cuml_arr_output_type, output_dtype)
    indices = self.indices.to_output(cuml_arr_output_type)
    indptr = self.indptr.to_output(cuml_arr_output_type)

    if output_type == 'cupy':
        constructor = cpx.scipy.sparse.csr_matrix
    elif output_type == 'scipy' and has_scipy(raise_if_unavailable=True):
        constructor = scipy.sparse.csr_matrix
    else:
        raise ValueError("Unsupported output_type: %s" % output_type)

    ret = constructor((data, indices, indptr),
                      dtype=output_dtype, shape=self.shape)

    if output_format == 'coo':
        ret = ret.tocoo()
    elif output_format == 'csc':
        ret = ret.tocsc()

    return ret
def extract_knn_graph(knn_graph, convert_dtype=True, sparse=False):
    """
    Split a KNN graph in CSR, COO or CSC format into separate index and
    distance arrays.

    The input can be a cupy sparse graph (device) or a numpy sparse
    graph (host).

    Parameters
    ----------
    knn_graph : sparse matrix
        KNN graph in CSR, COO or CSC format.
    convert_dtype : bool, default=True
        When True, indices are converted to int32 (if ``sparse``) or
        int64, and distances to float32.
    sparse : bool, default=False
        Selects the index dtype used when ``convert_dtype`` is True.

    Returns
    -------
    ((indices, indices_ptr), (dists, dists_ptr))
        The converted arrays together with their ``ptr`` attribute, or
        ``((None, None), (None, None))`` when the graph type is not
        recognised.
    """
    if has_scipy():
        from scipy.sparse import csr_matrix, coo_matrix, csc_matrix
    else:
        from cuml.common.import_utils import DummyClass
        csr_matrix = coo_matrix = csc_matrix = DummyClass

    if isinstance(knn_graph, (csc_matrix, cp_csc_matrix)):
        knn_graph = cupyx.scipy.sparse.csr_matrix(knn_graph)
        n_samples = knn_graph.shape[0]
        # Argsort the data within each row, then offset the per-row
        # positions so they index into the flat data/indices arrays.
        order = knn_graph.data.reshape((n_samples, -1)).argsort()
        n_neighbors = order.shape[1]
        order += (cp.arange(n_samples) * n_neighbors)[:, np.newaxis]
        order = order.flatten()
        knn_graph.indices = knn_graph.indices[order]
        knn_graph.data = knn_graph.data[order]

    knn_indices = None
    if isinstance(knn_graph, (csr_matrix, cp_csr_matrix)):
        knn_indices = knn_graph.indices
    elif isinstance(knn_graph, (coo_matrix, cp_coo_matrix)):
        knn_indices = knn_graph.col

    if knn_indices is None:
        return (None, None), (None, None)

    if convert_dtype:
        index_dtype = np.int32 if sparse else np.int64
        dist_dtype = np.float32
    else:
        index_dtype = None
        dist_dtype = None

    knn_dists = knn_graph.data

    knn_indices_m, *_ = input_to_cuml_array(
        knn_indices, order='C', deepcopy=True,
        check_dtype=(np.int64, np.int32),
        convert_to_dtype=index_dtype)

    knn_dists_m, *_ = input_to_cuml_array(
        knn_dists, order='C', deepcopy=True,
        check_dtype=np.float32,
        convert_to_dtype=dist_dtype)

    indices_out = (knn_indices_m, knn_indices_m.ptr)
    dists_out = (knn_dists_m, knn_dists_m.ptr)
    return indices_out, dists_out
def _partial_fit(self, X, y, sample_weight=None,
                 _classes=None) -> "MultinomialNB":
    """
    Update the model counters with one batch of data.

    Sparse X (scipy or cupyx) is rebuilt as a cupyx COO matrix; any
    other X goes through ``input_to_cupy_array``. On the first call the
    classes and counters are initialised (from ``_classes`` when given,
    otherwise from the labels in ``y``); on later calls the labels are
    checked against the stored classes. Returns ``self``.
    """
    if has_scipy():
        from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
    else:
        from cuml.common.import_utils import dummy_function_always_false \
            as scipy_sparse_isspmatrix

    # todo: use a sparse CumlArray style approach when ready
    # https://github.com/rapidsai/cuml/issues/2216
    sparse_input = (scipy_sparse_isspmatrix(X)
                    or cupyx.scipy.sparse.isspmatrix(X))
    if sparse_input:
        X = X.tocoo()
        row_idx = cp.asarray(X.row, dtype=X.row.dtype)
        col_idx = cp.asarray(X.col, dtype=X.col.dtype)
        values = cp.asarray(X.data, dtype=X.data.dtype)
        X = cupyx.scipy.sparse.coo_matrix((values, (row_idx, col_idx)),
                                          shape=X.shape)
    else:
        X = input_to_cupy_array(X, order='K').array

    y = input_to_cupy_array(y).array

    Y, label_classes = make_monotonic(y, copy=True)

    if self.fit_called_:
        check_labels(Y, self.classes_)
    else:
        self.fit_called_ = True
        if _classes is None:
            self.classes_ = label_classes
        else:
            _classes, *_ = input_to_cuml_array(_classes, order='K')
            check_labels(Y, _classes)
            self.classes_ = _classes

        self._n_classes_ = self.classes_.shape[0]
        self._n_features_ = X.shape[1]
        self._init_counters(self._n_classes_, self._n_features_,
                            X.dtype)

    self._count(X, Y)

    self._update_feature_log_prob(self.alpha)
    self._update_class_log_prior(class_prior=self._class_prior_)

    return self
def _sparsify_and_convert(data, input_type, sparsity_ratio=0.3):
    """
    Randomly set values to 0 and produce a sparse array.

    Parameters
    ----------
    data : numpy.ndarray
        Dense array. It is modified in place: a random subset of its
        elements is set to 0.
    input_type : {'csr', 'csc'}
        Sparse format of the returned matrix.
    sparsity_ratio : float, default=0.3
        Fraction of the elements of ``data`` that is set to 0.

    Returns
    -------
    scipy.sparse.csr_matrix or scipy.sparse.csc_matrix

    Raises
    ------
    RuntimeError
        If scipy is not installed.
    TypeError
        If ``input_type`` is neither 'csr' nor 'csc'.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.sparse`` is loaded.
    try:
        import scipy.sparse
    except ImportError as err:
        raise RuntimeError("Scipy is required") from err

    # Reject an unknown format before touching ``data``.
    if input_type not in ('csr', 'csc'):
        raise TypeError('Wrong sparse input type {}'.format(input_type))

    random_loc = np.random.choice(data.size,
                                  int(data.size * sparsity_ratio),
                                  replace=False)
    # ``flat`` always writes through to ``data``; ``ravel()`` returns a
    # copy for non-contiguous arrays, which would silently skip the
    # zeroing.
    data.flat[random_loc] = 0

    if input_type == 'csr':
        return scipy.sparse.csr_matrix(data)
    return scipy.sparse.csc_matrix(data)
def is_sparse(X):
    """
    Tell whether X is a sparse matrix.

    Both cupyx sparse matrices and, when scipy is available, scipy
    sparse matrices count as sparse.

    Parameters
    ----------
    X : array-like, sparse-matrix

    Returns
    -------
    is_sparse : boolean
        is the input sparse?
    """
    if cupyx.scipy.sparse.isspmatrix(X):
        return True
    return has_scipy() and scipy.sparse.isspmatrix(X)
def predict_log_proba(self, X):
    """
    Return log-probability estimates for the test vector X.

    The joint log-likelihood of every row is normalised by
    log P(X), computed with a max-shifted log-sum-exp. The result is
    converted to the output type determined from the original X.
    """
    out_type = self._get_output_type(X)

    if has_scipy():
        from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
    else:
        from cuml.common.import_utils import dummy_function_always_false \
            as scipy_sparse_isspmatrix

    # todo: use a sparse CumlArray style approach when ready
    # https://github.com/rapidsai/cuml/issues/2216
    sparse_input = (scipy_sparse_isspmatrix(X)
                    or cupyx.scipy.sparse.isspmatrix(X))
    if sparse_input:
        X = X.tocoo()
        row_idx = cp.asarray(X.row, dtype=X.row.dtype)
        col_idx = cp.asarray(X.col, dtype=X.col.dtype)
        values = cp.asarray(X.data, dtype=X.data.dtype)
        X = cupyx.scipy.sparse.coo_matrix((values, (row_idx, col_idx)),
                                          shape=X.shape)
    else:
        X = input_to_cuml_array(X, order='K').array.to_output('cupy')

    jll = self._joint_log_likelihood(X)

    # normalize by P(X) = P(f_1, ..., f_n)

    # Compute log(sum(exp())), subtracting the row maximum inside the
    # exponential to prevent inf.
    row_max = cp.amax(jll, axis=1, keepdims=True)
    shifted = cp.exp(jll - row_max)
    log_sum = cp.log(cp.sum(shifted, axis=1))

    log_prob_x = cp.squeeze(row_max, axis=1) + log_sum

    if log_prob_x.ndim < 2:
        log_prob_x = log_prob_x.reshape((1, log_prob_x.shape[0]))

    normalized = jll - log_prob_x.T
    return CumlArray(normalized).to_output(out_type)
def predict_log_proba(self, X) -> CumlArray:
    """
    Return log-probability estimates for the test vector X.

    The joint log-likelihood of every row is normalised by
    log P(X), computed with a max-shifted log-sum-exp.
    """
    if has_scipy():
        from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
    else:
        from cuml.common.import_utils import dummy_function_always_false \
            as scipy_sparse_isspmatrix

    # todo: use a sparse CumlArray style approach when ready
    # https://github.com/rapidsai/cuml/issues/2216
    sparse_input = (scipy_sparse_isspmatrix(X)
                    or cupyx.scipy.sparse.isspmatrix(X))
    if sparse_input:
        X = _convert_x_sparse(X)
    else:
        dense = input_to_cupy_array(
            X, order='K',
            check_dtype=[cp.float32, cp.float64, cp.int32])
        X = dense.array

    jll = self._joint_log_likelihood(X)

    # normalize by P(X) = P(f_1, ..., f_n)

    # Compute log(sum(exp())), subtracting the row maximum inside the
    # exponential to prevent inf.
    row_max = cp.amax(jll, axis=1, keepdims=True)
    shifted = cp.exp(jll - row_max)
    log_sum = cp.log(cp.sum(shifted, axis=1))

    log_prob_x = cp.squeeze(row_max, axis=1) + log_sum

    if log_prob_x.ndim < 2:
        log_prob_x = log_prob_x.reshape((1, log_prob_x.shape[0]))

    return jll - log_prob_x.T
def test_binom_coef():
    """
    Check ``_binomCoef(100, i)`` for i in 1..100.

    Each value is compared against ``scipy.special.binom`` when scipy
    is available; without scipy the function is only called.
    """
    scipy_available = has_scipy()
    if scipy_available:
        from scipy.special import binom

    for k in range(1, 101):
        val = cuml.explainer.kernel_shap._binomCoef(100, k)
        if scipy_available:
            assert math.isclose(val, binom(100, k), rel_tol=1e-15)
# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import cupyx as cpx import numpy as np from cuml.common.import_utils import has_scipy from cuml.common.memory_utils import class_with_cupy_rmm from cuml.common.logger import debug import cuml.common if has_scipy(): import scipy.sparse @class_with_cupy_rmm() class SparseCumlArray(): """ SparseCumlArray abstracts sparse array allocations. This will accept either a Scipy or Cupy sparse array and construct CumlArrays out of the underlying index and data arrays. Currently, this class only supports the CSR array format and input in any other sparse format will be converted to CSR by default. Set `convert_format=False` to disable automatic conversion to CSR. Parameters ----------
def get_supported_input_type(X):
    """
    Report which supported array-like type, if any, the input object is.

    Parameters
    ----------
    X : object
        Input object to test

    Notes
    -----
    To closely match the functionality of
    :func:`~cuml.common.input_utils.input_to_cuml_array`, this method will
    return ``cupy.ndarray`` for any object supporting
    `__cuda_array_interface__` and ``numpy.ndarray`` for any object
    supporting `__array_interface__`.

    Returns
    -------
    array-like type or None
        If the array-like object is supported, the type is returned.
        Otherwise, `None` is returned.
    """
    # Check CumlArray first to shorten search time
    for cuml_type in (CumlArray, SparseCumlArray):
        if isinstance(X, cuml_type):
            return cuml_type

    # A cudf Series is only supported when it holds no nulls
    if isinstance(X, cudf.Series):
        return cudf.Series if X.null_count == 0 else None

    # converting pandas to numpy before sending it to CumlArray
    for frame_type in (pd.DataFrame, pd.Series, cudf.DataFrame):
        if isinstance(X, frame_type):
            return frame_type

    if numba.cuda.devicearray.is_cuda_ndarray(X):
        return numba.cuda.devicearray.DeviceNDArrayBase

    if hasattr(X, "__cuda_array_interface__"):
        return cp.ndarray

    # For some reason, numpy scalar types also implement
    # `__array_interface__`. See numpy.generic.__doc__. Exclude those
    # types as well as np.dtypes; excluded objects fall through to the
    # sparse checks below.
    if (hasattr(X, "__array_interface__")
            and not isinstance(X, (np.generic, type))):
        return np.ndarray

    if cupyx.scipy.sparse.isspmatrix(X):
        return cupyx.scipy.sparse.spmatrix

    if has_scipy() and scipy.sparse.isspmatrix(X):
        return scipy.sparse.spmatrix

    # Return None if this type isn't supported
    return None
def _partial_fit(self, X, y, sample_weight=None, _classes=None,
                 convert_dtype=True) -> "MultinomialNB":
    """
    Update the model counters with one batch of data.

    Sparse X (scipy or cupyx) is converted with ``_convert_x_sparse``;
    any other X goes through ``input_to_cupy_array`` with float32,
    float64 and int32 as the checked dtypes. The expected label dtype
    is int32 when X is float32 or int32, and int64 otherwise. On the
    first call the classes and counters are initialised (from
    ``_classes`` when given, otherwise from the labels in ``y``); on
    later calls the labels are checked against the stored classes.

    Parameters
    ----------
    X : array-like or sparse matrix
        Training vectors.
    y : array-like
        Training labels.
    sample_weight : optional
        Not used by this method.
    _classes : array-like, optional
        Classes to use on the first call instead of the ones found
        in ``y``.
    convert_dtype : bool, default=True
        When True, ``y`` and ``_classes`` are converted to the expected
        label dtype; when False no conversion is requested.

    Returns
    -------
    self
    """
    if has_scipy():
        from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
    else:
        from cuml.common.import_utils import dummy_function_always_false \
            as scipy_sparse_isspmatrix

    # todo: use a sparse CumlArray style approach when ready
    # https://github.com/rapidsai/cuml/issues/2216
    if scipy_sparse_isspmatrix(X) or cupyx.scipy.sparse.isspmatrix(X):
        X = _convert_x_sparse(X)
        # TODO: Expanded this since sparse kernel doesn't
        # actually require the scipy sparse container format.
    else:
        X = input_to_cupy_array(
            X, order='K',
            check_dtype=[cp.float32, cp.float64, cp.int32]).array

    expected_y_dtype = (cp.int32
                        if X.dtype in [cp.float32, cp.int32]
                        else cp.int64)
    label_convert_dtype = expected_y_dtype if convert_dtype else False

    y = input_to_cupy_array(
        y, convert_to_dtype=label_convert_dtype,
        check_dtype=expected_y_dtype).array

    Y, label_classes = make_monotonic(y, copy=True)

    if not self.fit_called_:
        self.fit_called_ = True
        if _classes is not None:
            _classes, *_ = input_to_cuml_array(
                _classes, order='K',
                convert_to_dtype=label_convert_dtype)
            check_labels(Y, _classes)
            self.classes_ = _classes
        else:
            self.classes_ = label_classes

        self._n_classes_ = self.classes_.shape[0]
        self._n_features_ = X.shape[1]
        self._init_counters(self._n_classes_, self._n_features_,
                            X.dtype)
    else:
        check_labels(Y, self.classes_)

    # Use the cupyx namespace, as the input check above does; the
    # legacy ``cupy.sparse`` alias is not available in newer CuPy
    # releases.
    if cupyx.scipy.sparse.isspmatrix(X):
        self._count_sparse(X.row, X.col, X.data, X.shape, Y)
    else:
        self._count(X, Y)

    self._update_feature_log_prob(self.alpha)
    self._update_class_log_prior(class_prior=self._class_prior_)

    return self