Example 1
    def sample(self, n_samples=1, random_state=None):
        """
        Generate random samples from the model.
        Currently, this is implemented only for gaussian and tophat kernels,
        and the Euclidean metric.

        Parameters
        ----------
        n_samples : int, default=1
            Number of samples to generate.
        random_state : int, cupy RandomState instance or None, default=None
            Controls the randomness of the generated samples.

        Returns
        -------
        X : cupy array of shape (n_samples, n_features)
            List of samples.
        """
        if not hasattr(self, "X_"):
            raise NotFittedError()

        supported_kernels = ["gaussian", "tophat"]
        if (self.kernel not in supported_kernels
                or self.metric != "euclidean"):
            raise NotImplementedError(
                "Only {} kernels, and the euclidean"
                " metric are supported.".format(supported_kernels))

        if isinstance(random_state, cp.random.RandomState):
            rng = random_state
        else:
            rng = cp.random.RandomState(random_state)

        u = rng.uniform(0, 1, size=n_samples)
        if self.sample_weight_ is None:
            i = (u * self.X_.shape[0]).astype(np.int64)
        else:
            cumsum_weight = cp.cumsum(self.sample_weight_)
            sum_weight = cumsum_weight[-1]
            i = cp.searchsorted(cumsum_weight, u * sum_weight)
        if self.kernel == "gaussian":
            return cp.atleast_2d(rng.normal(self.X_[i], self.bandwidth))

        elif self.kernel == "tophat":
            # we first draw points from a d-dimensional normal distribution,
            # then use an incomplete gamma function to map them to a uniform
            # d-dimensional tophat distribution.
            has_scipy(raise_if_unavailable=True)
            dim = self.X_.shape[1]
            X = rng.normal(size=(n_samples, dim))
            s_sq = cp.einsum("ij,ij->i", X, X).get()

            # do this on the CPU because we don't have
            # a gammainc function readily available
            correction = cp.array(
                gammainc(0.5 * dim, 0.5 * s_sq)**(1.0 / dim) * self.bandwidth /
                np.sqrt(s_sq))
            return self.X_[i] + X * correction[:, np.newaxis]
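
A quick way to see why the incomplete gamma function shows up: for X ~ N(0, I_d), ||X||^2 follows a chi-squared distribution with d degrees of freedom, so gammainc(d/2, ||X||^2 / 2) is uniform on [0, 1], and taking its d-th root gives the radius law of a uniform ball. A minimal NumPy/SciPy sketch of the same mapping on the CPU (a standalone illustration, not the cuML code path):

import numpy as np
from scipy.special import gammainc

rng = np.random.RandomState(0)
n, dim, bandwidth = 1000, 3, 1.0

# Draw from a d-dimensional standard normal
X = rng.normal(size=(n, dim))
s_sq = np.einsum("ij,ij->i", X, X)

# Rescale the radii so the points are uniform inside a ball of radius `bandwidth`
correction = gammainc(0.5 * dim, 0.5 * s_sq) ** (1.0 / dim) * bandwidth / np.sqrt(s_sq)
samples = X * correction[:, np.newaxis]

# Every point lies inside the ball
assert np.all(np.linalg.norm(samples, axis=1) <= bandwidth + 1e-12)
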
Example 2
    def __init__(self,
                 data=None,
                 convert_to_dtype=False,
                 convert_index=np.int32,
                 convert_format=True):
        if not cpx.scipy.sparse.isspmatrix(data) and \
                not (has_scipy() and scipy.sparse.isspmatrix(data)):
            raise ValueError("A sparse matrix is expected as input. "
                             "Received %s" % type(data))

        check_classes = [cpx.scipy.sparse.csr_matrix]
        if has_scipy():
            check_classes.append(scipy.sparse.csr_matrix)

        if not isinstance(data, tuple(check_classes)):
            if convert_format:
                debug('Received sparse matrix in %s format but CSR is '
                      'expected. Data will be converted to CSR, but this '
                      'will require additional memory copies. If this '
                      'conversion is not desired, set '
                      'convert_format=False to raise an exception '
                      'instead.' % type(data))
                data = data.tocsr()  # currently only CSR is supported
            else:
                raise ValueError("Expected CSR matrix but received %s" %
                                 type(data))

        if not convert_to_dtype:
            convert_to_dtype = data.dtype

        if not convert_index:
            convert_index = data.indptr.dtype

        # Note: Only 32-bit indexing is supported currently.
        # In CUDA 11, cuSPARSE provides 64-bit function calls
        # but these are not yet used in RAFT/cuML
        self.indptr, _, _, _ = cuml.common.input_to_cuml_array(
            data.indptr,
            check_dtype=convert_index,
            convert_to_dtype=convert_index)

        self.indices, _, _, _ = cuml.common.input_to_cuml_array(
            data.indices,
            check_dtype=convert_index,
            convert_to_dtype=convert_index)

        self.data, _, _, _ = cuml.common.input_to_cuml_array(
            data.data,
            check_dtype=data.dtype,
            convert_to_dtype=convert_to_dtype)

        self.shape = data.shape
        self.dtype = self.data.dtype
        self.nnz = data.nnz
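
A hedged usage sketch for the constructor above, assuming SciPy is installed and that SparseCumlArray is importable from cuml.common (the import path is an assumption). A COO input is converted to CSR automatically unless convert_format=False:

import numpy as np
import scipy.sparse
from cuml.common import SparseCumlArray  # import path assumed

# Host-side COO matrix; it is converted to CSR on ingest
coo = scipy.sparse.random(10, 5, density=0.3, format='coo', dtype=np.float32)
arr = SparseCumlArray(coo, convert_to_dtype=np.float32)

print(arr.shape, arr.nnz, arr.dtype)
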
Example 3
    def predict(self, X) -> CumlArray:
        """
        Perform classification on an array of test vectors X.

        """
        if has_scipy():
            from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
        else:
            from cuml.common.import_utils import dummy_function_always_false \
                as scipy_sparse_isspmatrix

        # todo: use a sparse CumlArray style approach when ready
        # https://github.com/rapidsai/cuml/issues/2216
        if scipy_sparse_isspmatrix(X) or cupyx.scipy.sparse.isspmatrix(X):
            X = X.tocoo()
            rows = cp.asarray(X.row, dtype=X.row.dtype)
            cols = cp.asarray(X.col, dtype=X.col.dtype)
            data = cp.asarray(X.data, dtype=X.data.dtype)
            X = cupyx.scipy.sparse.coo_matrix((data, (rows, cols)),
                                              shape=X.shape)
        else:
            X = input_to_cupy_array(X, order='K').array

        jll = self._joint_log_likelihood(X)
        indices = cp.argmax(jll, axis=1).astype(self.classes_.dtype)

        y_hat = invert_labels(indices, classes=self.classes_)
        return y_hat
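
The last two lines follow a common pattern: argmax over the joint log-likelihoods picks a monotonic class index, which is then mapped back to the original label values. A standalone CuPy sketch of that mapping, using plain indexing in place of the invert_labels helper:

import cupy as cp

# Joint log-likelihoods for 4 samples over 3 classes (toy values)
jll = cp.array([[-1.0, -0.2, -3.0],
                [-0.1, -2.0, -0.5],
                [-2.0, -0.3, -0.1],
                [-0.4, -0.6, -0.9]])
classes = cp.array([3, 7, 9])      # original, non-contiguous labels

indices = cp.argmax(jll, axis=1)   # monotonic class indices: 1, 0, 2, 0
y_hat = classes[indices]           # back to original labels: 7, 3, 9, 3
print(y_hat)
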
Example 4
    def predict(self, X) -> CumlArray:
        """
        Perform classification on an array of test vectors X.

        """
        if has_scipy():
            from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
        else:
            from cuml.common.import_utils import dummy_function_always_false \
                as scipy_sparse_isspmatrix

        # todo: use a sparse CumlArray style approach when ready
        # https://github.com/rapidsai/cuml/issues/2216
        if scipy_sparse_isspmatrix(X) or cupyx.scipy.sparse.isspmatrix(X):
            X = _convert_x_sparse(X)
        else:
            X = input_to_cupy_array(
                X, order='K', check_dtype=[cp.float32, cp.float64,
                                           cp.int32]).array

        jll = self._joint_log_likelihood(X)
        indices = cp.argmax(jll, axis=1).astype(self.classes_.dtype)

        y_hat = invert_labels(indices, classes=self.classes_)
        return y_hat
Example 5
    def to_output(self,
                  output_type='cupy',
                  output_format=None,
                  output_dtype=None):
        """
        Convert array to output format

        Parameters
        ----------
        output_type : string
            Format to convert the array to. Acceptable formats are:

            - 'cupy' - to cupy array
            - 'scipy' - to scipy (host) array

        output_format : string, optional { 'coo', 'csc' }
            Optionally convert the output to the specified format.
        output_dtype : string, optional
            Optionally cast the array to a specified dtype, creating
            a copy if necessary.
        """
        # Treat numpy and scipy as the same
        if (output_type == "numpy"):
            output_type = "scipy"

        output_dtype = self.data.dtype \
            if output_dtype is None else output_dtype

        if output_type not in ['cupy', 'scipy']:
            raise ValueError("Unsupported output_type: %s" % output_dtype)

        cuml_arr_output_type = 'numpy' if output_type == 'scipy' else 'cupy'

        data = self.data.to_output(cuml_arr_output_type, output_dtype)
        indices = self.indices.to_output(cuml_arr_output_type)
        indptr = self.indptr.to_output(cuml_arr_output_type)

        if output_type == 'cupy':
            constructor = cpx.scipy.sparse.csr_matrix
        elif output_type == 'scipy' and has_scipy(raise_if_unavailable=True):
            constructor = scipy.sparse.csr_matrix
        else:
            raise ValueError("Unsupported output_type: %s" % output_type)

        ret = constructor((data, indices, indptr),
                          dtype=output_dtype,
                          shape=self.shape)

        if output_format is not None:
            if output_format == 'coo':
                ret = ret.tocoo()
            elif output_format == 'csc':
                ret = ret.tocsc()
            else:
                raise ValueError("Output format %s not supported" %
                                 output_format)

        return ret
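
A hedged round-trip sketch for to_output, again assuming SparseCumlArray is importable from cuml.common: 'cupy' keeps the result on the device, 'scipy' copies it to the host, and output_format optionally converts the CSR result to COO or CSC:

import numpy as np
import scipy.sparse
from cuml.common import SparseCumlArray  # import path assumed

csr = scipy.sparse.random(10, 5, density=0.3, format='csr', dtype=np.float32)
arr = SparseCumlArray(csr)

dev = arr.to_output('cupy')                          # cupyx CSR matrix on device
host = arr.to_output('scipy', output_format='coo')   # scipy COO matrix on host
print(type(dev), type(host))
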
Example 6
def extract_knn_graph(knn_graph, convert_dtype=True, sparse=False):
    """
    Converts KNN graph from CSR, COO and CSC formats into separate
    distance and index arrays. Input can be a cupy sparse graph (device)
    or a numpy sparse graph (host).
    """
    if has_scipy():
        from scipy.sparse import csr_matrix, coo_matrix, csc_matrix
    else:
        from cuml.common.import_utils import DummyClass
        csr_matrix = DummyClass
        coo_matrix = DummyClass
        csc_matrix = DummyClass

    if isinstance(knn_graph, (csc_matrix, cp_csc_matrix)):
        knn_graph = cupyx.scipy.sparse.csr_matrix(knn_graph)
        n_samples = knn_graph.shape[0]
        reordering = knn_graph.data.reshape((n_samples, -1))
        reordering = reordering.argsort()
        n_neighbors = reordering.shape[1]
        reordering += (cp.arange(n_samples) * n_neighbors)[:, np.newaxis]
        reordering = reordering.flatten()
        knn_graph.indices = knn_graph.indices[reordering]
        knn_graph.data = knn_graph.data[reordering]

    knn_indices = None
    if isinstance(knn_graph, (csr_matrix, cp_csr_matrix)):
        knn_indices = knn_graph.indices
    elif isinstance(knn_graph, (coo_matrix, cp_coo_matrix)):
        knn_indices = knn_graph.col

    if knn_indices is not None:
        convert_to_dtype = None
        if convert_dtype:
            convert_to_dtype = np.int32 if sparse else np.int64

        knn_dists = knn_graph.data
        knn_indices_m, _, _, _ = \
            input_to_cuml_array(knn_indices, order='C',
                                deepcopy=True,
                                check_dtype=(np.int64, np.int32),
                                convert_to_dtype=convert_to_dtype)

        knn_dists_m, _, _, _ = \
            input_to_cuml_array(knn_dists, order='C',
                                deepcopy=True,
                                check_dtype=np.float32,
                                convert_to_dtype=(np.float32
                                                  if convert_dtype
                                                  else None))

        return (knn_indices_m, knn_indices_m.ptr),\
            (knn_dists_m, knn_dists_m.ptr)
    return (None, None), (None, None)
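
The CSC branch above round-trips through CSR and then re-sorts each row's neighbors by distance, since the format conversion does not guarantee per-row ordering. A small NumPy/SciPy sketch of the same per-row reordering on toy data (not the cuML code path):

import numpy as np
from scipy.sparse import csr_matrix

n_samples, n_neighbors = 3, 2
# Toy CSR "KNN graph": two neighbors per row, distances unsorted within rows
indptr = np.array([0, 2, 4, 6])
indices = np.array([2, 1, 0, 2, 1, 0])
data = np.array([0.9, 0.1, 0.5, 0.2, 0.7, 0.3])
graph = csr_matrix((data, indices, indptr), shape=(n_samples, n_samples))

# Argsort each row's distances, then add row offsets to get flat positions
order = graph.data.reshape((n_samples, n_neighbors)).argsort()
order += (np.arange(n_samples) * n_neighbors)[:, np.newaxis]
order = order.flatten()

graph.indices = graph.indices[order]
graph.data = graph.data[order]
print(graph.data.reshape((n_samples, n_neighbors)))  # each row now ascending
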
Example 7
    def _partial_fit(self,
                     X,
                     y,
                     sample_weight=None,
                     _classes=None) -> "MultinomialNB":

        if has_scipy():
            from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
        else:
            from cuml.common.import_utils import dummy_function_always_false \
                as scipy_sparse_isspmatrix

        # todo: use a sparse CumlArray style approach when ready
        # https://github.com/rapidsai/cuml/issues/2216
        if scipy_sparse_isspmatrix(X) or cupyx.scipy.sparse.isspmatrix(X):
            X = X.tocoo()
            rows = cp.asarray(X.row, dtype=X.row.dtype)
            cols = cp.asarray(X.col, dtype=X.col.dtype)
            data = cp.asarray(X.data, dtype=X.data.dtype)
            X = cupyx.scipy.sparse.coo_matrix((data, (rows, cols)),
                                              shape=X.shape)
        else:
            X = input_to_cupy_array(X, order='K').array

        y = input_to_cupy_array(y).array

        Y, label_classes = make_monotonic(y, copy=True)

        if not self.fit_called_:
            self.fit_called_ = True
            if _classes is not None:
                _classes, *_ = input_to_cuml_array(_classes, order='K')
                check_labels(Y, _classes)
                self.classes_ = _classes
            else:
                self.classes_ = label_classes

            self._n_classes_ = self.classes_.shape[0]
            self._n_features_ = X.shape[1]
            self._init_counters(self._n_classes_, self._n_features_, X.dtype)
        else:
            check_labels(Y, self.classes_)

        self._count(X, Y)

        self._update_feature_log_prob(self.alpha)
        self._update_class_log_prior(class_prior=self._class_prior_)

        return self
Example 8
def _sparsify_and_convert(data, input_type, sparsity_ratio=0.3):
    """Randomly set values to 0 and produce a sparse array."""
    if not has_scipy():
        raise RuntimeError("Scipy is required")
    import scipy
    random_loc = np.random.choice(data.size,
                                  int(data.size * sparsity_ratio),
                                  replace=False)
    data.ravel()[random_loc] = 0
    if input_type == 'csr':
        return scipy.sparse.csr_matrix(data)
    elif input_type == 'csc':
        return scipy.sparse.csc_matrix(data)
    else:
        raise TypeError('Wrong sparse input type {}'.format(input_type))
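
A small usage sketch for the helper above, assuming SciPy is installed; note that it zeroes entries of `data` in place before wrapping it in a sparse container:

import numpy as np

dense = np.random.rand(6, 4).astype(np.float32)
sp_csr = _sparsify_and_convert(dense, 'csr', sparsity_ratio=0.5)
print(sp_csr.shape, sp_csr.nnz)  # roughly half of the entries were zeroed
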
Example 9
def is_sparse(X):
    """
    Return True if X is sparse, False otherwise.

    Parameters
    ----------
    X : array-like or sparse matrix

    Returns
    -------
    is_sparse : boolean
        Whether the input is sparse.
    """
    is_scipy_sparse = (has_scipy() and scipy.sparse.isspmatrix(X))
    return cupyx.scipy.sparse.isspmatrix(X) or is_scipy_sparse
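
A quick sanity check of the helper above (SciPy inputs only count as sparse when SciPy is available):

import numpy as np
import cupyx.scipy.sparse

print(is_sparse(cupyx.scipy.sparse.eye(3, format='csr')))  # True
print(is_sparse(np.eye(3)))                                # False
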
Example 10
    def predict_log_proba(self, X):
        """
        Return log-probability estimates for the test vector X.

        """
        out_type = self._get_output_type(X)

        if has_scipy():
            from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
        else:
            from cuml.common.import_utils import dummy_function_always_false \
                as scipy_sparse_isspmatrix

        # todo: use a sparse CumlArray style approach when ready
        # https://github.com/rapidsai/cuml/issues/2216
        if scipy_sparse_isspmatrix(X) or cupyx.scipy.sparse.isspmatrix(X):
            X = X.tocoo()
            rows = cp.asarray(X.row, dtype=X.row.dtype)
            cols = cp.asarray(X.col, dtype=X.col.dtype)
            data = cp.asarray(X.data, dtype=X.data.dtype)
            X = cupyx.scipy.sparse.coo_matrix((data, (rows, cols)),
                                              shape=X.shape)
        else:
            X = input_to_cuml_array(X, order='K').array.to_output('cupy')

        jll = self._joint_log_likelihood(X)

        # normalize by P(X) = P(f_1, ..., f_n)

        # Compute log(sum(exp()))

        # Subtract max in exp to prevent inf
        a_max = cp.amax(jll, axis=1, keepdims=True)

        exp = cp.exp(jll - a_max)
        logsumexp = cp.log(cp.sum(exp, axis=1))

        a_max = cp.squeeze(a_max, axis=1)

        log_prob_x = a_max + logsumexp

        if log_prob_x.ndim < 2:
            log_prob_x = log_prob_x.reshape((1, log_prob_x.shape[0]))
        result = jll - log_prob_x.T
        return CumlArray(result).to_output(out_type)
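
The max subtraction above is the standard stable log-sum-exp trick: log sum_i exp(x_i) = a + log sum_i exp(x_i - a) with a = max_i x_i, which keeps exp() from overflowing on large log-likelihoods. A standalone CuPy sketch comparing the naive and stable versions:

import cupy as cp

jll = cp.array([[1000.0, 999.0, 998.0]])      # large values overflow exp()

naive = cp.log(cp.sum(cp.exp(jll), axis=1))   # -> [inf]

a_max = cp.amax(jll, axis=1, keepdims=True)
stable = cp.squeeze(a_max, axis=1) + cp.log(cp.sum(cp.exp(jll - a_max), axis=1))
print(naive, stable)                          # [inf] vs. [~1000.408]
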
Example 11
    def predict_log_proba(self, X) -> CumlArray:
        """
        Return log-probability estimates for the test vector X.

        """
        if has_scipy():
            from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
        else:
            from cuml.common.import_utils import dummy_function_always_false \
                as scipy_sparse_isspmatrix

        # todo: use a sparse CumlArray style approach when ready
        # https://github.com/rapidsai/cuml/issues/2216
        if scipy_sparse_isspmatrix(X) or cupyx.scipy.sparse.isspmatrix(X):
            X = _convert_x_sparse(X)
        else:
            X = input_to_cupy_array(
                X, order='K', check_dtype=[cp.float32, cp.float64,
                                           cp.int32]).array

        jll = self._joint_log_likelihood(X)

        # normalize by P(X) = P(f_1, ..., f_n)

        # Compute log(sum(exp()))

        # Subtract max in exp to prevent inf
        a_max = cp.amax(jll, axis=1, keepdims=True)

        exp = cp.exp(jll - a_max)
        logsumexp = cp.log(cp.sum(exp, axis=1))

        a_max = cp.squeeze(a_max, axis=1)

        log_prob_x = a_max + logsumexp

        if log_prob_x.ndim < 2:
            log_prob_x = log_prob_x.reshape((1, log_prob_x.shape[0]))
        result = jll - log_prob_x.T
        return result
Example 12
def test_binom_coef():
    for i in range(1, 101):
        val = cuml.explainer.kernel_shap._binomCoef(100, i)
        if has_scipy():
            from scipy.special import binom
            assert math.isclose(val, binom(100, i), rel_tol=1e-15)
Example 13
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import cupyx as cpx
import numpy as np
from cuml.common.import_utils import has_scipy
from cuml.common.memory_utils import class_with_cupy_rmm
from cuml.common.logger import debug

import cuml.common

if has_scipy():
    import scipy.sparse


@class_with_cupy_rmm()
class SparseCumlArray():
    """
    SparseCumlArray abstracts sparse array allocations. It accepts
    either a SciPy or CuPy sparse array and constructs CumlArrays
    out of the underlying index and data arrays. Currently, this class
    only supports the CSR format; input in any other sparse format
    will be converted to CSR by default. Set `convert_format=False`
    to disable automatic conversion to CSR.

    Parameters
    ----------
Example 14
def get_supported_input_type(X):
    """
    Determines if the input object is a supported input array-like object or
    not. If supported, the type is returned. Otherwise, `None` is returned.

    Parameters
    ----------
    X : object
        Input object to test

    Notes
    -----
    To closely match the functionality of
    :func:`~cuml.common.input_utils.input_to_cuml_array`, this method will
    return ``cupy.ndarray`` for any object supporting
    `__cuda_array_interface__` and ``numpy.ndarray`` for any object supporting
    `__array_interface__`.

    Returns
    -------
    array-like type or None
        If the array-like object is supported, the type is returned.
        Otherwise, `None` is returned.
    """
    # Check CumlArray first to shorten search time
    if isinstance(X, CumlArray):
        return CumlArray

    if isinstance(X, SparseCumlArray):
        return SparseCumlArray

    if (isinstance(X, cudf.Series)):
        if X.null_count != 0:
            return None
        else:
            return cudf.Series

    # converting pandas to numpy before sending it to CumlArray
    if isinstance(X, pd.DataFrame):
        return pd.DataFrame

    if isinstance(X, pd.Series):
        return pd.Series

    if isinstance(X, cudf.DataFrame):
        return cudf.DataFrame

    if numba.cuda.devicearray.is_cuda_ndarray(X):
        return numba.cuda.devicearray.DeviceNDArrayBase

    if hasattr(X, "__cuda_array_interface__"):
        return cp.ndarray

    if hasattr(X, "__array_interface__"):
        # For some reason, numpy scalar types also implement
        # `__array_interface__`. See numpy.generic.__doc__. Exclude those types
        # as well as np.dtypes
        if (not isinstance(X, np.generic) and not isinstance(X, type)):
            return np.ndarray

    if cupyx.scipy.sparse.isspmatrix(X):
        return cupyx.scipy.sparse.spmatrix

    if has_scipy():
        if (scipy.sparse.isspmatrix(X)):
            return scipy.sparse.spmatrix

    # Return None if this type isn't supported
    return None
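
A short illustration of the dispatch above, following the docstring: CuPy for objects exposing `__cuda_array_interface__`, NumPy for objects exposing `__array_interface__`, and None for unsupported inputs:

import numpy as np
import cupy as cp

print(get_supported_input_type(np.zeros(3)))   # <class 'numpy.ndarray'>
print(get_supported_input_type(cp.zeros(3)))   # <class 'cupy.ndarray'>
print(get_supported_input_type("not array"))   # None
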
Example 15
    def _partial_fit(self,
                     X,
                     y,
                     sample_weight=None,
                     _classes=None,
                     convert_dtype=True) -> "MultinomialNB":

        if has_scipy():
            from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
        else:
            from cuml.common.import_utils import dummy_function_always_false \
                as scipy_sparse_isspmatrix

        # todo: use a sparse CumlArray style approach when ready
        # https://github.com/rapidsai/cuml/issues/2216
        if scipy_sparse_isspmatrix(X) or cupyx.scipy.sparse.isspmatrix(X):
            X = _convert_x_sparse(X)
            # TODO: Expanded this since sparse kernel doesn't
            # actually require the scipy sparse container format.
        else:
            X = input_to_cupy_array(
                X, order='K', check_dtype=[cp.float32, cp.float64,
                                           cp.int32]).array

        expected_y_dtype = cp.int32 if X.dtype in [cp.float32, cp.int32
                                                   ] else cp.int64
        y = input_to_cupy_array(
            y,
            convert_to_dtype=(expected_y_dtype if convert_dtype else False),
            check_dtype=expected_y_dtype).array

        Y, label_classes = make_monotonic(y, copy=True)

        if not self.fit_called_:
            self.fit_called_ = True
            if _classes is not None:
                _classes, *_ = input_to_cuml_array(
                    _classes,
                    order='K',
                    convert_to_dtype=(expected_y_dtype
                                      if convert_dtype else False))
                check_labels(Y, _classes)
                self.classes_ = _classes
            else:
                self.classes_ = label_classes

            self._n_classes_ = self.classes_.shape[0]
            self._n_features_ = X.shape[1]
            self._init_counters(self._n_classes_, self._n_features_, X.dtype)
        else:
            check_labels(Y, self.classes_)

        if cp.sparse.isspmatrix(X):
            self._count_sparse(X.row, X.col, X.data, X.shape, Y)
        else:
            self._count(X, Y)

        self._update_feature_log_prob(self.alpha)
        self._update_class_log_prior(class_prior=self._class_prior_)

        return self
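
make_monotonic above remaps arbitrary label values onto a contiguous 0..n_classes-1 range before counting, and check_labels then verifies that every label in y belongs to the known classes. The remapping idea can be sketched with plain CuPy (the concept only, not the cuML implementation):

import cupy as cp

y = cp.array([7, 3, 7, 9, 3])             # arbitrary, non-contiguous labels
classes, Y = cp.unique(y, return_inverse=True)

print(classes)   # [3 7 9]      -> learned class values
print(Y)         # [1 0 1 2 0]  -> monotonic labels used for counting
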