Example #1
0
def test_merge_series(batch_size, n_obs, n_sub, dtype):
    """Check that ``auto_arima._merge_series`` rebuilds a batch from its
    sub-batches, given the maps that give the sub-batch and the position
    of every member
    """
    # Randomly assign the series ids to the sub-batches and derive the
    # reference maps (id -> sub-batch and id -> position in the sub-batch)
    tracker_np = np.array_split(np.random.permutation(batch_size), n_sub)
    id_to_sub_np, id_to_pos_np = _build_division_map_ref(
        tracker_np, batch_size, n_sub)
    id_to_sub = input_to_cuml_array(id_to_sub_np,
                                    convert_to_dtype=np.int32)[0]
    id_to_pos = input_to_cuml_array(id_to_pos_np,
                                    convert_to_dtype=np.int32)[0]

    # The merged dataset we expect to get back (one column per series)
    expected_np = np.random.uniform(-1.0, 1.0, (batch_size, n_obs))
    expected_np = expected_np.astype(dtype).transpose()

    # Scatter the columns into the sub-batches described by the tracker
    data_div = []
    for ids in tracker_np:
        piece = np.zeros((n_obs, len(ids)), dtype=dtype, order='F')
        for col, series_id in enumerate(ids):
            piece[:, col] = expected_np[:, series_id]
        data_div.append(input_to_cuml_array(piece)[0])

    # Merge the pieces back with the function under test
    merged = auto_arima._merge_series(data_div, id_to_sub, id_to_pos,
                                      batch_size)

    # The merged batch must match the dataset we started from
    np.testing.assert_allclose(merged.to_output("numpy"), expected_np)
Example #2
0
def extract_knn_graph(knn_graph, convert_dtype=True, sparse=False):
    """
    Split a KNN graph given in CSR, COO or CSC format into separate
    indices and distances arrays. The graph may be a cupy sparse matrix
    (device) or a scipy sparse matrix (host).

    Returns ``(indices, indices.ptr), (distances, distances.ptr)``, or
    ``(None, None), (None, None)`` when the input is not one of the
    supported sparse formats.
    """
    if has_scipy():
        from scipy.sparse import csr_matrix, coo_matrix, csc_matrix
    else:
        # Placeholder class so the isinstance checks below still work
        from cuml.common.import_utils import DummyClass
        csr_matrix = coo_matrix = csc_matrix = DummyClass

    if isinstance(knn_graph, (csc_matrix, cp_csc_matrix)):
        # Convert to CSR on device, then sort the entries of every row by
        # distance. The reshape relies on every row holding the same number
        # of entries.
        knn_graph = cupyx.scipy.sparse.csr_matrix(knn_graph)
        n_rows = knn_graph.shape[0]
        order = knn_graph.data.reshape((n_rows, -1)).argsort()
        row_width = order.shape[1]
        # Turn the per-row ranks into offsets in the flat data array
        order += (cp.arange(n_rows) * row_width)[:, np.newaxis]
        order = order.flatten()
        knn_graph.indices = knn_graph.indices[order]
        knn_graph.data = knn_graph.data[order]

    raw_indices = None
    if isinstance(knn_graph, (csr_matrix, cp_csr_matrix)):
        raw_indices = knn_graph.indices
    elif isinstance(knn_graph, (coo_matrix, cp_coo_matrix)):
        raw_indices = knn_graph.col

    if raw_indices is None:
        return (None, None), (None, None)

    index_dtype = None
    if convert_dtype:
        index_dtype = np.int32 if sparse else np.int64
    dist_dtype = np.float32 if convert_dtype else None

    raw_dists = knn_graph.data
    knn_indices_m = input_to_cuml_array(raw_indices, order='C',
                                        deepcopy=True,
                                        check_dtype=(np.int64, np.int32),
                                        convert_to_dtype=index_dtype)[0]

    knn_dists_m = input_to_cuml_array(raw_dists, order='C',
                                      deepcopy=True,
                                      check_dtype=np.float32,
                                      convert_to_dtype=dist_dtype)[0]

    return (knn_indices_m, knn_indices_m.ptr),\
        (knn_dists_m, knn_dists_m.ptr)
Example #3
0
    def __init__(self,
                 data=None,
                 convert_to_dtype=False,
                 convert_index=np.int32,
                 convert_format=True):
        """
        Build the sparse array from a cupy or scipy sparse matrix.

        Parameters
        ----------
        data : cupyx.scipy.sparse or scipy.sparse matrix
            Input matrix. Anything that is not a sparse matrix is rejected.
        convert_to_dtype : dtype or False
            Dtype the values are converted to. A falsy value keeps
            ``data.dtype``.
        convert_index : dtype or False
            Dtype used for ``indptr`` and ``indices``. A falsy value keeps
            ``data.indptr.dtype``.
        convert_format : bool
            When True, a non-CSR input is converted with ``tocsr()``;
            when False, a non-CSR input raises instead.

        Raises
        ------
        ValueError
            If ``data`` is not a sparse matrix, or if it is not CSR and
            ``convert_format`` is False.
        """
        if not cpx.scipy.sparse.isspmatrix(data) and \
                not (has_scipy() and scipy.sparse.isspmatrix(data)):
            raise ValueError("A sparse matrix is expected as input. "
                             "Received %s" % type(data))

        check_classes = [cpx.scipy.sparse.csr_matrix]
        if has_scipy():
            check_classes.append(scipy.sparse.csr_matrix)

        if not isinstance(data, tuple(check_classes)):
            if convert_format:
                # The message names the actual keyword argument
                # (``convert_format``) so users can act on it.
                debug('Received sparse matrix in %s format but CSR is '
                      'expected. Data will be converted to CSR, but this '
                      'will require additional memory copies. If this '
                      'conversion is not desired, set '
                      'convert_format=False to raise an exception '
                      'instead.' % type(data))
                data = data.tocsr()  # currently only CSR is supported
            else:
                raise ValueError("Expected CSR matrix but received %s" %
                                 type(data))

        if not convert_to_dtype:
            convert_to_dtype = data.dtype

        if not convert_index:
            convert_index = data.indptr.dtype

        # Note: Only 32-bit indexing is supported currently.
        # In CUDA11, Cusparse provides 64-bit function calls
        # but these are not yet used in RAFT/Cuml
        self.indptr, _, _, _ = input_to_cuml_array(
            data.indptr,
            check_dtype=convert_index,
            convert_to_dtype=convert_index)

        self.indices, _, _, _ = input_to_cuml_array(
            data.indices,
            check_dtype=convert_index,
            convert_to_dtype=convert_index)

        self.data, _, _, _ = input_to_cuml_array(
            data.data,
            check_dtype=data.dtype,
            convert_to_dtype=convert_to_dtype)

        self.shape = data.shape
        # Report the dtype of the stored values, i.e. after any conversion
        self.dtype = self.data.dtype
        self.nnz = data.nnz
Example #4
0
def assert_array_identical(a, b):
    """Assert that ``a`` and ``b`` convert to arrays with the same shape,
    dtype, memory order and element values.

    Two empty inputs are accepted without further checks (returns True).
    """
    lhs = input_to_cuml_array(a, order="K").array
    rhs = input_to_cuml_array(b, order="K").array

    if len(a) == 0 and len(b) == 0:
        return True

    # Metadata first, then the element-wise comparison
    for attr in ("shape", "dtype", "order"):
        assert getattr(lhs, attr) == getattr(rhs, attr)
    assert cp.all(cp.asarray(lhs) == cp.asarray(rhs)).item()
Example #5
0
    def predict(self, X):
        """
        Predict the class of each test vector in X.

        X may be a scipy/cupy sparse matrix or any input accepted by
        ``input_to_cuml_array``. The predicted labels are returned in the
        output type resolved from X.
        """
        out_type = self._get_output_type(X)

        if has_scipy():
            from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
        else:
            from cuml.common.import_utils import dummy_function_always_false \
                as scipy_sparse_isspmatrix

        # todo: use a sparse CumlArray style approach when ready
        # https://github.com/rapidsai/cuml/issues/2216
        is_sparse = (scipy_sparse_isspmatrix(X)
                     or cupyx.scipy.sparse.isspmatrix(X))
        if not is_sparse:
            X = input_to_cuml_array(X, order='K').array.to_output('cupy')
        else:
            # Rebuild the matrix as a cupy COO matrix backed by cupy arrays
            coo = X.tocoo()
            row_idx = cp.asarray(coo.row, dtype=coo.row.dtype)
            col_idx = cp.asarray(coo.col, dtype=coo.col.dtype)
            values = cp.asarray(coo.data, dtype=coo.data.dtype)
            X = cupyx.scipy.sparse.coo_matrix((values, (row_idx, col_idx)),
                                              shape=coo.shape)

        # Pick the class with the highest joint log-likelihood per row
        scores = self._joint_log_likelihood(X)
        best = cp.argmax(scores, axis=1).astype(self.classes_.dtype)

        labels = invert_labels(best, classes=self.classes_)
        return CumlArray(data=labels).to_output(out_type)
Example #6
0
    def _to_output(self, instance, to_output_type, to_output_dtype=None):
        """Return the value stored for ``instance`` converted to
        ``to_output_type``, caching every conversion that is performed.

        Non-array values (stored with ``input_type`` None) are returned
        as-is and are never cached under another type.
        """
        meta = self._get_meta(instance, throw_on_missing=True)

        # input_type None marks a non-array object: hand it back untouched
        if meta.input_type is None:
            return meta.values[meta.input_type]

        # Reuse a previous conversion when there is one
        if to_output_type in meta.values:
            return meta.values[to_output_type]

        # Every conversion goes through a CumlArray; build it on first use
        if "cuml" not in meta.values:
            source = meta.get_input_value()
            meta.values["cuml"] = input_to_cuml_array(source,
                                                      order="K").array

        converted = meta.values["cuml"].to_output(
            output_type=to_output_type, output_dtype=to_output_dtype)

        # Remember the result for the next request of the same type
        meta.values[to_output_type] = converted
        return converted
Example #7
0
def test_build_division_map(batch_size, n_sub):
    """Check ``auto_arima._build_division_map``, which maps every series
    of a divided batch to its new sub-batch and to its position there
    """
    # Random id tracker.
    # Note: in the real use case each id array is sorted, but the helper
    # does not rely on that.
    tracker_np = np.array_split(np.random.permutation(batch_size), n_sub)
    tracker = []
    for ids in tracker_np:
        ids_cuml, *_ = input_to_cuml_array(ids, convert_to_dtype=np.int32)
        tracker.append(ids_cuml)

    # Function under test
    id_to_model, id_to_pos = auto_arima._build_division_map(
        tracker, batch_size)

    # Reference maps computed on the host
    expected_model, expected_pos = _build_division_map_ref(
        tracker_np, batch_size, n_sub)

    # Both maps must match the reference exactly
    np.testing.assert_array_equal(id_to_model.to_output("numpy"),
                                  expected_model)
    np.testing.assert_array_equal(id_to_pos.to_output("numpy"), expected_pos)
Example #8
0
    def __init__(self,
                 *,
                 alpha=1.0,
                 fit_prior=True,
                 class_prior=None,
                 output_type=None,
                 handle=None,
                 verbose=False):
        """
        Store the hyper-parameters and reset the fitting state.

        Parameters
        ----------
        alpha : float
            Stored as ``self.alpha``; it is the value passed to
            ``_update_feature_log_prob`` during fitting.
        fit_prior : boolean
            Stored as ``self.fit_prior``.
        class_prior : array-like or None
            Optional class priors. When given, they are converted with
            ``input_to_cuml_array`` and stored as ``self._class_prior_``.
        output_type, handle, verbose
            Forwarded to the base class constructor.
        """
        super().__init__(handle=handle,
                         verbose=verbose,
                         output_type=output_type)
        self.alpha = alpha
        self.fit_prior = fit_prior

        if class_prior is not None:
            prior, *_ = input_to_cuml_array(class_prior)
            # Fitting reads ``self._class_prior_``; the prior used to be
            # stored only under ``_class_prior``, which left
            # ``_class_prior_`` undefined when a prior was supplied.
            self._class_prior_ = prior
            # Legacy spelling kept for any code that still reads it
            self._class_prior = prior
        else:
            self._class_prior_ = None

        self.fit_called_ = False
        self._n_classes_ = 0
        self._n_features_ = None

        # Needed until Base no longer assumed cumlHandle
        self.handle = None
Example #9
0
    def _partial_fit(self, X, y, sample_weight=None, _classes=None):
        """
        Update the model counters with one batch (X, y).

        The first call fixes the set of classes (from ``_classes`` when
        given, otherwise from the labels seen in ``y``) and initialises
        the counters; later calls only validate the labels against the
        stored classes. ``sample_weight`` is accepted but not used here.
        Returns ``self``.
        """
        self._set_output_type(X)

        if has_scipy():
            from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
        else:
            from cuml.common.import_utils import dummy_function_always_false \
                as scipy_sparse_isspmatrix

        # todo: use a sparse CumlArray style approach when ready
        # https://github.com/rapidsai/cuml/issues/2216
        is_sparse = (scipy_sparse_isspmatrix(X)
                     or cupyx.scipy.sparse.isspmatrix(X))
        if not is_sparse:
            X = input_to_cuml_array(X, order='K').array.to_output('cupy')
        else:
            # Rebuild the matrix as a cupy COO matrix backed by cupy arrays
            coo = X.tocoo()
            row_idx = cp.asarray(coo.row, dtype=coo.row.dtype)
            col_idx = cp.asarray(coo.col, dtype=coo.col.dtype)
            values = cp.asarray(coo.data, dtype=coo.data.dtype)
            X = cupyx.scipy.sparse.coo_matrix((values, (row_idx, col_idx)),
                                              shape=coo.shape)

        y = input_to_cuml_array(y).array.to_output('cupy')

        Y, label_classes = make_monotonic(y, copy=True)

        if self.fit_called_:
            check_labels(Y, self._classes_)
        else:
            self.fit_called_ = True
            if _classes is None:
                self._classes_ = CumlArray(data=label_classes)
            else:
                _classes, *_ = input_to_cuml_array(_classes, order='K')
                check_labels(Y, _classes.to_output('cupy'))
                self._classes_ = _classes

            # The counters are sized once, on the first batch
            self._n_classes_ = self.classes_.shape[0]
            self._n_features_ = X.shape[1]
            self._init_counters(self._n_classes_, self._n_features_, X.dtype)

        self._count(X, Y)

        self._update_feature_log_prob(self.alpha)
        self._update_class_log_prior(class_prior=self._class_prior_)

        return self
Example #10
0
def array_identical(a, b):
    """Return whether ``a`` and ``b`` convert to arrays with the same
    shape, dtype, memory order and element values.

    Two empty inputs are always reported as identical.
    """
    lhs = input_to_cuml_array(a, order="K").array
    rhs = input_to_cuml_array(b, order="K").array

    if len(a) == 0 and len(b) == 0:
        return True

    # Any metadata mismatch settles the answer without comparing the data
    if (lhs.shape != rhs.shape or lhs.dtype != rhs.dtype
            or lhs.order != rhs.order):
        return False

    return cp.all(cp.asarray(lhs) == cp.asarray(rhs)).item()
Example #11
0
def as_type(type, *args):
    """Convert every array argument to the requested output type.

    ``type`` is any type name accepted by ``CumlArray.to_output``
    ('numpy', 'cudf', 'cupy', ...). ``None`` and scalars are passed
    through untouched. With 'cudf', inputs with more than one dimension
    are converted to a dataframe so that they are not collapsed to one
    dimension.

    Returns the single converted value when exactly one argument is
    given, otherwise a tuple with one entry per argument.
    """
    converted = []
    for arg in args:
        if arg is None or np.isscalar(arg):
            converted.append(arg)
            continue
        # make sure X with a single feature remains 2 dimensional
        keep_2d = type == 'cudf' and len(arg.shape) > 1
        target = 'dataframe' if keep_2d else type
        converted.append(input_to_cuml_array(arg).array.to_output(target))
    return converted[0] if len(converted) == 1 else tuple(converted)
Example #12
0
def test_divide_by_min(batch_size, n_obs, n_sub, dtype):
    """Check ``auto_arima._divide_by_min``, which splits a dataset by
    sending each series to the sub-batch with the smallest criterion
    """
    # Random data, criteria and batch indices
    data_np = np.random.uniform(-1.0, 1.0, (batch_size, n_obs))
    data_np = data_np.astype(dtype).transpose()
    crit_np = np.random.uniform(-1.0, 1.0, (n_sub, batch_size))
    crit_np = crit_np.astype(dtype).transpose()
    b_id_np = np.array(range(batch_size), dtype=np.int32)
    data = input_to_cuml_array(data_np)[0]
    crit = input_to_cuml_array(crit_np)[0]
    b_id = input_to_cuml_array(b_id_np)[0]

    # Function under test
    sub_batches, sub_id = auto_arima._divide_by_min(data, crit, b_id)

    # Reference split computed on the host
    which_sub = crit_np.argmin(axis=1)
    sub_batches_ref = [data_np[:, which_sub == k] for k in range(n_sub)]
    sub_id_ref = [b_id_np[which_sub == k] for k in range(n_sub)]

    # Compare each sub-batch with its reference
    for k, (ref_batch, ref_ids) in enumerate(zip(sub_batches_ref,
                                                 sub_id_ref)):
        if sub_batches[k] is None:
            # An empty sub-batch is reported as None: the reference must be
            # empty and the id array must be None as well
            assert ref_batch.shape[1] == 0
            assert sub_id[k] is None
        else:
            np.testing.assert_allclose(sub_batches[k].to_output("numpy"),
                                       ref_batch)
            np.testing.assert_array_equal(sub_id[k].to_output("numpy"),
                                          ref_ids)
Example #13
0
def _convert_to_gpuarray(data, order='F'):
    """Convert ``data`` to a numba device array.

    ``None`` is returned unchanged, tuples are converted element by
    element, pandas objects go through their cudf counterpart, and
    anything else goes through ``input_to_cuml_array`` with the requested
    memory ``order``.
    """
    if data is None:
        return None
    if isinstance(data, tuple):
        return tuple(_convert_to_gpuarray(item, order=order)
                     for item in data)
    if isinstance(data, pd.DataFrame):
        gdf = cudf.DataFrame.from_pandas(data)
        return _convert_to_gpuarray(gdf, order=order)
    if isinstance(data, pd.Series):
        return cuda.as_cuda_array(cudf.Series.from_pandas(data))
    converted = input_utils.input_to_cuml_array(data, order=order)
    return converted[0].to_output("numba")
Example #14
0
def test_divide_by_mask(batch_size, n_obs, prop_true, dtype):
    """Check ``auto_arima._divide_by_mask``, which splits a dataset in two
    according to a boolean mask
    """
    # Random data, mask and batch indices
    data_np = np.random.uniform(-1.0, 1.0, (batch_size, n_obs))
    data_np = data_np.astype(dtype).transpose()
    nb_true = int(prop_true * batch_size)
    mask_np = np.random.permutation([False] * (batch_size - nb_true) +
                                    [True] * nb_true)
    b_id_np = np.array(range(batch_size), dtype=np.int32)
    data = input_to_cuml_array(data_np)[0]
    mask = input_to_cuml_array(mask_np)[0]
    b_id = input_to_cuml_array(b_id_np)[0]

    # Function under test: returns the "False" part first, then "True"
    data_false, id_false, data_true, id_true = \
        auto_arima._divide_by_mask(data, mask, b_id)
    sub_data = [data_false, data_true]
    sub_id = [id_false, id_true]

    # Reference split computed on the host
    inv_mask_np = np.logical_not(mask_np)
    sub_data_ref = [data_np[:, inv_mask_np], data_np[:, mask_np]]
    sub_id_ref = [b_id_np[inv_mask_np], b_id_np[mask_np]]

    # Compare each half with its reference
    for k, (ref_data, ref_ids) in enumerate(zip(sub_data_ref, sub_id_ref)):
        if sub_data[k] is None:
            # An empty sub-batch is reported as None: the reference must be
            # empty and the id array must be None as well
            assert ref_data.shape[1] == 0
            assert sub_id[k] is None
        else:
            np.testing.assert_allclose(sub_data[k].to_output("numpy"),
                                       ref_data)
            np.testing.assert_array_equal(sub_id[k].to_output("numpy"),
                                          ref_ids)
Example #15
0
    def predict_log_proba(self, X):
        """
        Return log-probability estimates for the test vectors in X.

        The joint log-likelihood of every row is normalised by log P(X),
        computed with a numerically stable log-sum-exp. The result is
        returned in the output type resolved from X.
        """
        out_type = self._get_output_type(X)

        if has_scipy():
            from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
        else:
            from cuml.common.import_utils import dummy_function_always_false \
                as scipy_sparse_isspmatrix

        # todo: use a sparse CumlArray style approach when ready
        # https://github.com/rapidsai/cuml/issues/2216
        is_sparse = (scipy_sparse_isspmatrix(X)
                     or cupyx.scipy.sparse.isspmatrix(X))
        if not is_sparse:
            X = input_to_cuml_array(X, order='K').array.to_output('cupy')
        else:
            # Rebuild the matrix as a cupy COO matrix backed by cupy arrays
            coo = X.tocoo()
            row_idx = cp.asarray(coo.row, dtype=coo.row.dtype)
            col_idx = cp.asarray(coo.col, dtype=coo.col.dtype)
            values = cp.asarray(coo.data, dtype=coo.data.dtype)
            X = cupyx.scipy.sparse.coo_matrix((values, (row_idx, col_idx)),
                                              shape=coo.shape)

        jll = self._joint_log_likelihood(X)

        # normalize by P(X) = P(f_1, ..., f_n), i.e. subtract
        # log(sum(exp(jll))) from every row. The row maximum is removed
        # inside the exponential to prevent inf.
        row_max = cp.amax(jll, axis=1, keepdims=True)
        shifted_sum = cp.sum(cp.exp(jll - row_max), axis=1)
        log_prob_x = cp.squeeze(row_max, axis=1) + cp.log(shifted_sum)

        # Make it a row vector so its transpose broadcasts over the classes
        if log_prob_x.ndim < 2:
            log_prob_x = log_prob_x.reshape((1, log_prob_x.shape[0]))
        return CumlArray(jll - log_prob_x.T).to_output(out_type)
Example #16
0
    def __init__(self,
                 alpha=1.0,
                 fit_prior=True,
                 class_prior=None,
                 output_type=None,
                 handle=None):
        """
        Create new multinomial Naive Bayes instance

        Parameters
        ----------

        alpha : float Additive (Laplace/Lidstone) smoothing parameter (0 for
                no smoothing).
        fit_prior : boolean Whether to learn class prior probabilities or no.
                    If false, a uniform prior will be used.
        class_prior : array-like, size (n_classes) Prior probabilities of the
                      classes. If specified, the priors are not adjusted
                      according to the data.
        output_type, handle : forwarded to the base class constructor.
        """
        super(MultinomialNB, self).__init__(handle=handle,
                                            output_type=output_type)
        self.alpha = alpha
        self.fit_prior = fit_prior

        if class_prior is not None:
            prior, *_ = input_to_cuml_array(class_prior)
            # Fitting reads ``self._class_prior_``; the prior used to be
            # stored only under ``_class_prior``, which left
            # ``_class_prior_`` undefined when a prior was supplied.
            self._class_prior_ = prior
            # Legacy spelling kept for any code that still reads it
            self._class_prior = prior
        else:
            self._class_prior_ = None

        self.fit_called_ = False
        self._n_classes_ = 0
        self._n_features_ = None

        # Needed until Base no longer assumed cumlHandle
        self.handle = None
Example #17
0
    def score_samples(self, X):
        """Compute the log-likelihood of each sample under the model.

        Parameters
        ----------

        X : array-like of shape (n_samples, n_features)
            An array of points to query.  Last dimension should match dimension
            of training data (n_features).

        Returns
        -------

        density : ndarray of shape (n_samples,)
            Log-likelihood of each sample in `X`. These are normalized to be
            probability densities, so values will be low for high-dimensional
            data.

        Raises
        ------

        NotFittedError
            If the estimator has no ``X_`` attribute yet.
        ValueError
            If more than one metric parameter is set, or if the kernel is
            not supported.
        """
        if not hasattr(self, "X_"):
            raise NotFittedError()
        X_cuml = input_to_cuml_array(X)

        # At most one extra metric argument can be forwarded
        dist_kwargs = {"metric": self.metric}
        if self.metric_params:
            if len(self.metric_params) != 1:
                raise ValueError(
                    "Cuml only supports metrics with a single arg.")
            dist_kwargs["metric_arg"] = list(self.metric_params.values())[0]
        distances = cp.asarray(
            pairwise_distances(X_cuml.array, self.X_, **dist_kwargs))

        h = self.bandwidth
        if self.kernel not in log_probability_kernels_:
            raise ValueError("Unsupported kernel.")
        distances = log_probability_kernels_[self.kernel](distances, h)

        log_probabilities = cp.zeros(distances.shape[0])
        if self.sample_weight_ is not None:
            distances += cp.log(self.sample_weight_)

        # One kernel thread per query point fills log_probabilities
        logsumexp_kernel.forall(log_probabilities.size)(distances,
                                                        log_probabilities)

        # Note that sklearn's user guide is wrong.
        # It says the (unnormalised) probability output for
        #  the kernel density is sum(K(x,h)).
        # In fact what they implement is (1/n)*sum(K(x,h)).
        # Here we divide by n in normal probability space,
        # which becomes -log(n) in log probability space.
        if self.sample_weight_ is not None:
            sum_weights = cp.sum(self.sample_weight_)
        else:
            sum_weights = distances.shape[1]
        log_probabilities -= np.log(sum_weights)

        # norm: a one dimensional X means a single feature
        if len(X_cuml.array.shape) == 1:
            dimension = 1
        else:
            dimension = X_cuml.array.shape[1]

        return norm_log_probabilities(log_probabilities,
                                      self.kernel, h, dimension)
Example #18
0
def create_output(X_in, output_type):
    """Convert ``X_in`` to ``output_type``, keeping its memory order."""
    return input_to_cuml_array(X_in,
                               order="K").array.to_output(output_type)
Example #19
0
    def _partial_fit(self,
                     X,
                     y,
                     sample_weight=None,
                     _classes=None,
                     convert_dtype=True) -> "MultinomialNB":
        """
        Update the model counters with one batch (X, y).

        Parameters
        ----------
        X : sparse matrix (scipy or cupy) or dense array-like
            Training vectors. Dense input must be float32, float64 or int32.
        y : array-like
            Labels. The expected dtype is int32 when X is float32/int32 and
            int64 otherwise.
        sample_weight : ignored
            Accepted but not used by this method.
        _classes : array-like, optional
            Full set of classes; only used on the first call.
        convert_dtype : bool
            Whether ``y`` and ``_classes`` are converted to the expected
            label dtype instead of only being checked against it.

        Returns
        -------
        self
        """
        if has_scipy():
            from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
        else:
            from cuml.common.import_utils import dummy_function_always_false \
                as scipy_sparse_isspmatrix

        # todo: use a sparse CumlArray style approach when ready
        # https://github.com/rapidsai/cuml/issues/2216
        if scipy_sparse_isspmatrix(X) or cupyx.scipy.sparse.isspmatrix(X):
            X = _convert_x_sparse(X)
            # TODO: Expanded this since sparse kernel doesn't
            # actually require the scipy sparse container format.
        else:
            X = input_to_cupy_array(
                X, order='K', check_dtype=[cp.float32, cp.float64,
                                           cp.int32]).array

        expected_y_dtype = cp.int32 if X.dtype in [cp.float32, cp.int32
                                                   ] else cp.int64
        y = input_to_cupy_array(
            y,
            convert_to_dtype=(expected_y_dtype if convert_dtype else False),
            check_dtype=expected_y_dtype).array

        Y, label_classes = make_monotonic(y, copy=True)

        if not self.fit_called_:
            # First batch: fix the classes and size the counters
            self.fit_called_ = True
            if _classes is not None:
                _classes, *_ = input_to_cuml_array(
                    _classes,
                    order='K',
                    convert_to_dtype=(expected_y_dtype
                                      if convert_dtype else False))
                check_labels(Y, _classes)
                self.classes_ = _classes
            else:
                self.classes_ = label_classes

            self._n_classes_ = self.classes_.shape[0]
            self._n_features_ = X.shape[1]
            self._init_counters(self._n_classes_, self._n_features_, X.dtype)
        else:
            check_labels(Y, self.classes_)

        # Use the same cupyx namespace as the check above; ``cp.sparse``
        # is a deprecated alias of ``cupyx.scipy.sparse``.
        if cupyx.scipy.sparse.isspmatrix(X):
            self._count_sparse(X.row, X.col, X.data, X.shape, Y)
        else:
            self._count(X, Y)

        self._update_feature_log_prob(self.alpha)
        self._update_class_log_prior(class_prior=self._class_prior_)

        return self