def test_merge_series(batch_size, n_obs, n_sub, dtype):
    """Test the helper that merges a divided batch based on division maps
    that track the sub-batch and position of each member
    """
    # Generate an id tracker and compute id_to_sub and id_to_pos
    tracker_np = np.array_split(np.random.permutation(batch_size), n_sub)
    id_to_sub_np, id_to_pos_np = _build_division_map_ref(
        tracker_np, batch_size, n_sub)
    id_to_sub, *_ = input_to_cuml_array(id_to_sub_np,
                                        convert_to_dtype=np.int32)
    id_to_pos, *_ = input_to_cuml_array(id_to_pos_np,
                                        convert_to_dtype=np.int32)

    # Generate the final dataset (expected result)
    data_np = (np.random.uniform(
        -1.0, 1.0, (batch_size, n_obs))).astype(dtype).transpose()

    # Divide the dataset according to the id tracker
    data_div = []
    for i in range(n_sub):
        data_piece = np.zeros((n_obs, len(tracker_np[i])),
                              dtype=dtype, order='F')
        for j in range(len(tracker_np[i])):
            data_piece[:, j] = data_np[:, tracker_np[i][j]]
        data_div.append(input_to_cuml_array(data_piece)[0])

    # Call the tested function
    data = auto_arima._merge_series(data_div, id_to_sub, id_to_pos,
                                    batch_size)

    # Compare the results
    np.testing.assert_allclose(data.to_output("numpy"), data_np)
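# Note: `_build_division_map_ref` is exercised by the tests in this
# section but its definition is not shown here. Based on how the tests
# use it, a minimal pure-NumPy sketch of such a reference implementation
# could look like the following (illustrative only, not the actual cuML
# test helper):
def _build_division_map_ref_sketch(tracker, batch_size, n_sub):
    import numpy as np
    id_to_sub = np.zeros(batch_size, dtype=np.int32)
    id_to_pos = np.zeros(batch_size, dtype=np.int32)
    for sub, ids in enumerate(tracker):
        for pos, series_id in enumerate(ids):
            id_to_sub[series_id] = sub  # which sub-batch holds this id
            id_to_pos[series_id] = pos  # its column in that sub-batch
    return id_to_sub, id_to_pos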
def extract_knn_graph(knn_graph, convert_dtype=True, sparse=False):
    """
    Converts a KNN graph from CSR, COO or CSC format into separate
    distance and index arrays. Input can be a cupy sparse graph (device)
    or a numpy sparse graph (host).
    """
    if has_scipy():
        from scipy.sparse import csr_matrix, coo_matrix, csc_matrix
    else:
        from cuml.common.import_utils import DummyClass
        csr_matrix = DummyClass
        coo_matrix = DummyClass
        csc_matrix = DummyClass

    if isinstance(knn_graph, (csc_matrix, cp_csc_matrix)):
        knn_graph = cupyx.scipy.sparse.csr_matrix(knn_graph)
        n_samples = knn_graph.shape[0]
        reordering = knn_graph.data.reshape((n_samples, -1))
        reordering = reordering.argsort()
        n_neighbors = reordering.shape[1]
        reordering += (cp.arange(n_samples) * n_neighbors)[:, np.newaxis]
        reordering = reordering.flatten()
        knn_graph.indices = knn_graph.indices[reordering]
        knn_graph.data = knn_graph.data[reordering]

    knn_indices = None
    if isinstance(knn_graph, (csr_matrix, cp_csr_matrix)):
        knn_indices = knn_graph.indices
    elif isinstance(knn_graph, (coo_matrix, cp_coo_matrix)):
        knn_indices = knn_graph.col

    if knn_indices is not None:
        convert_to_dtype = None
        if convert_dtype:
            convert_to_dtype = np.int32 if sparse else np.int64

        knn_dists = knn_graph.data
        knn_indices_m, _, _, _ = \
            input_to_cuml_array(knn_indices, order='C',
                                deepcopy=True,
                                check_dtype=(np.int64, np.int32),
                                convert_to_dtype=convert_to_dtype)

        knn_dists_m, _, _, _ = \
            input_to_cuml_array(knn_dists, order='C',
                                deepcopy=True,
                                check_dtype=np.float32,
                                convert_to_dtype=(np.float32
                                                  if convert_dtype
                                                  else None))

        return (knn_indices_m, knn_indices_m.ptr), \
            (knn_dists_m, knn_dists_m.ptr)
    return (None, None), (None, None)
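# A minimal host-side sketch of the reordering trick used above: each CSR
# row holds exactly `n_neighbors` entries, so a row-wise argsort of the
# distances plus a per-row offset of `row * n_neighbors` yields flat
# indices that sort every row's neighbors by distance in one gather.
# Pure numpy/scipy illustration, not part of the cuML API:
import numpy as np
from scipy.sparse import csr_matrix

dists = np.array([[0.9, 0.1, 0.5],
                  [0.3, 0.8, 0.2]])                # 2 samples, k=3
graph = csr_matrix(dists)                          # dense -> CSR for demo
n_samples, n_neighbors = dists.shape
order = graph.data.reshape(n_samples, -1).argsort()
order += (np.arange(n_samples) * n_neighbors)[:, np.newaxis]
order = order.flatten()
graph.indices = graph.indices[order]               # neighbor ids, sorted
graph.data = graph.data[order]                     # distances, ascending
assert (graph.data.reshape(n_samples, -1)[:, 0]
        == dists.min(axis=1)).all()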
def __init__(self, data=None, convert_to_dtype=False,
             convert_index=np.int32, convert_format=True):
    if not cpx.scipy.sparse.isspmatrix(data) and \
            not (has_scipy() and scipy.sparse.isspmatrix(data)):
        raise ValueError("A sparse matrix is expected as input. "
                         "Received %s" % type(data))

    check_classes = [cpx.scipy.sparse.csr_matrix]
    if has_scipy():
        check_classes.append(scipy.sparse.csr_matrix)

    if not isinstance(data, tuple(check_classes)):
        if convert_format:
            debug('Received sparse matrix in %s format but CSR is '
                  'expected. Data will be converted to CSR, but this '
                  'will require additional memory copies. If this '
                  'conversion is not desired, set '
                  'convert_format=False to raise an exception '
                  'instead.' % type(data))
            data = data.tocsr()  # currently only CSR is supported
        else:
            raise ValueError("Expected CSR matrix but received %s"
                             % type(data))

    if not convert_to_dtype:
        convert_to_dtype = data.dtype

    if not convert_index:
        convert_index = data.indptr.dtype

    # Note: only 32-bit indexing is supported currently.
    # In CUDA 11, cuSPARSE provides 64-bit function calls,
    # but these are not yet used in RAFT/cuML.
    self.indptr, _, _, _ = input_to_cuml_array(
        data.indptr, check_dtype=convert_index,
        convert_to_dtype=convert_index)

    self.indices, _, _, _ = input_to_cuml_array(
        data.indices, check_dtype=convert_index,
        convert_to_dtype=convert_index)

    self.data, _, _, _ = input_to_cuml_array(
        data.data, check_dtype=data.dtype,
        convert_to_dtype=convert_to_dtype)

    self.shape = data.shape
    self.dtype = self.data.dtype
    self.nnz = data.nnz
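# A small host-side illustration of the conversion behavior documented in
# the constructor above: non-CSR inputs are converted to CSR (at the cost
# of an extra copy), and index arrays end up as 32-bit integers. Pure
# scipy sketch, independent of cuML:
import numpy as np
from scipy.sparse import coo_matrix

coo = coo_matrix((np.array([1.0, 2.0]),
                  (np.array([0, 1]), np.array([1, 0]))), shape=(2, 2))
csr = coo.tocsr()                       # what convert_format=True does
indptr = csr.indptr.astype(np.int32)    # 32-bit indexing, as in the note
indices = csr.indices.astype(np.int32)
assert indptr.dtype == np.int32 and indices.dtype == np.int32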
def assert_array_identical(a, b):
    cuml_a = input_to_cuml_array(a, order="K").array
    cuml_b = input_to_cuml_array(b, order="K").array

    if len(a) == 0 and len(b) == 0:
        return True

    assert cuml_a.shape == cuml_b.shape
    assert cuml_a.dtype == cuml_b.dtype
    assert cuml_a.order == cuml_b.order
    assert cp.all(cp.asarray(cuml_a) == cp.asarray(cuml_b)).item()
def predict(self, X):
    """
    Perform classification on an array of test vectors X.
    """
    out_type = self._get_output_type(X)

    if has_scipy():
        from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
    else:
        from cuml.common.import_utils import dummy_function_always_false \
            as scipy_sparse_isspmatrix

    # todo: use a sparse CumlArray style approach when ready
    # https://github.com/rapidsai/cuml/issues/2216
    if scipy_sparse_isspmatrix(X) or cupyx.scipy.sparse.isspmatrix(X):
        X = X.tocoo()
        rows = cp.asarray(X.row, dtype=X.row.dtype)
        cols = cp.asarray(X.col, dtype=X.col.dtype)
        data = cp.asarray(X.data, dtype=X.data.dtype)
        X = cupyx.scipy.sparse.coo_matrix((data, (rows, cols)),
                                          shape=X.shape)
    else:
        X = input_to_cuml_array(X, order='K').array.to_output('cupy')

    jll = self._joint_log_likelihood(X)
    indices = cp.argmax(jll, axis=1).astype(self.classes_.dtype)

    y_hat = invert_labels(indices, classes=self.classes_)
    return CumlArray(data=y_hat).to_output(out_type)
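# `invert_labels` (used above) maps monotonic class indices back onto the
# original class values; conceptually it is just fancy indexing. A tiny
# numpy illustration of that mapping, for context only:
import numpy as np

classes = np.array([3, 5, 9])            # original class values
indices = np.array([1, 0, 2])            # argmax output, 0..n_classes-1
assert (classes[indices] == np.array([5, 3, 9])).all()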
def _to_output(self, instance, to_output_type, to_output_dtype=None):
    existing = self._get_meta(instance, throw_on_missing=True)

    # Handle input_type == None, which means we have a non-array object
    # stored
    if existing.input_type is None:
        # Don't save in the cache. Just return the value
        return existing.values[existing.input_type]

    # Return a cached value if it exists
    if to_output_type in existing.values:
        return existing.values[to_output_type]

    # If the input type was anything but CumlArray, need to create one now
    if "cuml" not in existing.values:
        existing.values["cuml"] = input_to_cuml_array(
            existing.get_input_value(), order="K").array

    cuml_arr: CumlArray = existing.values["cuml"]

    # Do the conversion
    output = cuml_arr.to_output(output_type=to_output_type,
                                output_dtype=to_output_dtype)

    # Cache the value
    existing.values[to_output_type] = output

    return output
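# The conversion-caching pattern above in miniature: convert to a
# canonical representation once, then memoize every requested output
# format. A framework-free sketch (names here are illustrative, not part
# of the cuML API):
class ConversionCacheSketch:
    def __init__(self, value):
        self._values = {"input": value}

    def to_output(self, kind, convert):
        # Cache hit: reuse the previous conversion
        if kind in self._values:
            return self._values[kind]
        # Cache miss: convert from the canonical value and store
        out = convert(self._values["input"])
        self._values[kind] = out
        return out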
def test_build_division_map(batch_size, n_sub):
    """Test the helper that builds a map of the new sub-batch and the
    position in that sub-batch for each series of a divided batch
    """
    # Generate the id tracker
    # Note: in the real use case the individual id arrays are sorted, but
    # the helper function doesn't require that
    tracker_np = np.array_split(np.random.permutation(batch_size), n_sub)
    tracker = [
        input_to_cuml_array(tr, convert_to_dtype=np.int32)[0]
        for tr in tracker_np
    ]

    # Call the tested function
    id_to_model, id_to_pos = auto_arima._build_division_map(
        tracker, batch_size)

    # Compute the expected results in pure Python
    id_to_model_ref, id_to_pos_ref = _build_division_map_ref(
        tracker_np, batch_size, n_sub)

    # Compare the results
    np.testing.assert_array_equal(id_to_model.to_output("numpy"),
                                  id_to_model_ref)
    np.testing.assert_array_equal(id_to_pos.to_output("numpy"),
                                  id_to_pos_ref)
def __init__(self, *, alpha=1.0, fit_prior=True, class_prior=None,
             output_type=None, handle=None, verbose=False):
    super().__init__(handle=handle, verbose=verbose,
                     output_type=output_type)
    self.alpha = alpha
    self.fit_prior = fit_prior

    if class_prior is not None:
        self._class_prior_, *_ = input_to_cuml_array(class_prior)
    else:
        self._class_prior_ = None

    self.fit_called_ = False
    self._n_classes_ = 0
    self._n_features_ = None

    # Needed until Base no longer assumes cumlHandle
    self.handle = None
def _partial_fit(self, X, y, sample_weight=None, _classes=None):
    self._set_output_type(X)

    if has_scipy():
        from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
    else:
        from cuml.common.import_utils import dummy_function_always_false \
            as scipy_sparse_isspmatrix

    # todo: use a sparse CumlArray style approach when ready
    # https://github.com/rapidsai/cuml/issues/2216
    if scipy_sparse_isspmatrix(X) or cupyx.scipy.sparse.isspmatrix(X):
        X = X.tocoo()
        rows = cp.asarray(X.row, dtype=X.row.dtype)
        cols = cp.asarray(X.col, dtype=X.col.dtype)
        data = cp.asarray(X.data, dtype=X.data.dtype)
        X = cupyx.scipy.sparse.coo_matrix((data, (rows, cols)),
                                          shape=X.shape)
    else:
        X = input_to_cuml_array(X, order='K').array.to_output('cupy')

    y = input_to_cuml_array(y).array.to_output('cupy')

    Y, label_classes = make_monotonic(y, copy=True)

    if not self.fit_called_:
        self.fit_called_ = True
        if _classes is not None:
            _classes, *_ = input_to_cuml_array(_classes, order='K')
            check_labels(Y, _classes.to_output('cupy'))
            self._classes_ = _classes
        else:
            self._classes_ = CumlArray(data=label_classes)

        self._n_classes_ = self.classes_.shape[0]
        self._n_features_ = X.shape[1]
        self._init_counters(self._n_classes_, self._n_features_,
                            X.dtype)
    else:
        check_labels(Y, self._classes_)

    self._count(X, Y)

    self._update_feature_log_prob(self.alpha)
    self._update_class_log_prior(class_prior=self._class_prior_)
    return self
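# `make_monotonic` (used above) remaps arbitrary label values onto the
# contiguous range 0..n_classes-1 so they can index the count arrays
# directly. A minimal numpy equivalent of that remapping, for
# illustration only:
import numpy as np

y = np.array([5, 3, 5, 9, 3])
label_classes, Y = np.unique(y, return_inverse=True)
# label_classes -> array([3, 5, 9]); Y -> array([1, 0, 1, 2, 0])
assert (label_classes[Y] == y).all()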
def array_identical(a, b):
    cuml_a = input_to_cuml_array(a, order="K").array
    cuml_b = input_to_cuml_array(b, order="K").array

    if len(a) == 0 and len(b) == 0:
        return True

    if cuml_a.shape != cuml_b.shape:
        return False
    if cuml_a.dtype != cuml_b.dtype:
        return False
    if cuml_a.order != cuml_b.order:
        return False
    return cp.all(cp.asarray(cuml_a) == cp.asarray(cuml_b)).item()
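# Example usage of `array_identical` (a sketch; assumes a CUDA device and
# that `input_to_cuml_array` is importable as above):
import cupy as cp

assert array_identical(cp.arange(4), cp.arange(4))
# Same values but a different dtype -> not identical:
assert not array_identical(cp.arange(4), cp.arange(4, dtype=cp.float32))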
def as_type(type, *args):
    # Convert array args to a type supported by CumlArray.to_output
    # ('numpy', 'cudf', 'cupy', ...).
    # Ensure 2-dimensional inputs are not converted to 1 dimension.
    # None remains None; a scalar remains a scalar.
    result = []
    for arg in args:
        if arg is None or np.isscalar(arg):
            result.append(arg)
        else:
            # make sure X with a single feature remains 2-dimensional
            if type == 'cudf' and len(arg.shape) > 1:
                result.append(
                    input_to_cuml_array(arg).array.to_output('dataframe'))
            else:
                result.append(
                    input_to_cuml_array(arg).array.to_output(type))
    if len(result) == 1:
        return result[0]
    return tuple(result)
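# Example round-trip through `as_type` (assumes a CUDA device; 'cupy' and
# 'numpy' are output types accepted by CumlArray.to_output):
import numpy as np

X = np.ones((4, 2), dtype=np.float32)
y = np.zeros(4, dtype=np.float32)
X_cp, y_cp = as_type('cupy', X, y)      # both args converted to cupy
assert as_type('numpy', None) is None   # None passes through untouched
assert as_type('numpy', 3.5) == 3.5     # scalars pass through untouched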
def test_divide_by_min(batch_size, n_obs, n_sub, dtype):
    """Test the helper that splits a dataset by selecting the minimum
    of a given criterion
    """
    # Generate random data, metrics and batch indices
    data_np = (np.random.uniform(
        -1.0, 1.0, (batch_size, n_obs))).astype(dtype).transpose()
    crit_np = (np.random.uniform(
        -1.0, 1.0, (n_sub, batch_size))).astype(dtype).transpose()
    b_id_np = np.array(range(batch_size), dtype=np.int32)
    data, *_ = input_to_cuml_array(data_np)
    crit, *_ = input_to_cuml_array(crit_np)
    b_id, *_ = input_to_cuml_array(b_id_np)

    # Call the tested function
    sub_batches, sub_id = auto_arima._divide_by_min(data, crit, b_id)

    # Compute the expected results in pure Python
    which_sub = crit_np.argmin(axis=1)
    sub_batches_ref = []
    sub_id_ref = []
    for i in range(n_sub):
        sub_batches_ref.append(data_np[:, which_sub == i])
        sub_id_ref.append(b_id_np[which_sub == i])

    # Compare the results
    for i in range(n_sub):
        # First check the cases of empty sub-batches
        if sub_batches[i] is None:
            # The reference must be empty
            assert sub_batches_ref[i].shape[1] == 0
            # And the id array must be None too
            assert sub_id[i] is None
        # When the sub-batch is not empty, compare to the reference
        else:
            np.testing.assert_allclose(sub_batches[i].to_output("numpy"),
                                       sub_batches_ref[i])
            np.testing.assert_array_equal(sub_id[i].to_output("numpy"),
                                          sub_id_ref[i])
def _convert_to_gpuarray(data, order='F'):
    if data is None:
        return None
    elif isinstance(data, tuple):
        return tuple(_convert_to_gpuarray(d, order=order) for d in data)
    elif isinstance(data, pd.DataFrame):
        return _convert_to_gpuarray(cudf.DataFrame.from_pandas(data),
                                    order=order)
    elif isinstance(data, pd.Series):
        gs = cudf.Series.from_pandas(data)
        return cuda.as_cuda_array(gs)
    else:
        return input_utils.input_to_cuml_array(
            data, order=order)[0].to_output("numba")
def test_divide_by_mask(batch_size, n_obs, prop_true, dtype):
    """Test the helper that splits a dataset in 2 based on a boolean mask
    """
    # Generate random data, mask and batch indices
    data_np = (np.random.uniform(
        -1.0, 1.0, (batch_size, n_obs))).astype(dtype).transpose()
    nb_true = int(prop_true * batch_size)
    mask_np = np.random.permutation([False] * (batch_size - nb_true)
                                    + [True] * nb_true)
    b_id_np = np.array(range(batch_size), dtype=np.int32)
    data, *_ = input_to_cuml_array(data_np)
    mask, *_ = input_to_cuml_array(mask_np)
    b_id, *_ = input_to_cuml_array(b_id_np)

    # Call the tested function
    sub_data, sub_id = [None, None], [None, None]
    sub_data[0], sub_id[0], sub_data[1], sub_id[1] = \
        auto_arima._divide_by_mask(data, mask, b_id)

    # Compute the expected results in pure Python
    sub_data_ref = [data_np[:, np.logical_not(mask_np)],
                    data_np[:, mask_np]]
    sub_id_ref = [b_id_np[np.logical_not(mask_np)], b_id_np[mask_np]]

    # Compare the results
    for i in range(2):
        # First check the cases of empty sub-batches
        if sub_data[i] is None:
            # The reference must be empty
            assert sub_data_ref[i].shape[1] == 0
            # And the id array must be None too
            assert sub_id[i] is None
        # When the sub-batch is not empty, compare to the reference
        else:
            np.testing.assert_allclose(sub_data[i].to_output("numpy"),
                                       sub_data_ref[i])
            np.testing.assert_array_equal(sub_id[i].to_output("numpy"),
                                          sub_id_ref[i])
def predict_log_proba(self, X):
    """
    Return log-probability estimates for the test vector X.
    """
    out_type = self._get_output_type(X)

    if has_scipy():
        from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
    else:
        from cuml.common.import_utils import dummy_function_always_false \
            as scipy_sparse_isspmatrix

    # todo: use a sparse CumlArray style approach when ready
    # https://github.com/rapidsai/cuml/issues/2216
    if scipy_sparse_isspmatrix(X) or cupyx.scipy.sparse.isspmatrix(X):
        X = X.tocoo()
        rows = cp.asarray(X.row, dtype=X.row.dtype)
        cols = cp.asarray(X.col, dtype=X.col.dtype)
        data = cp.asarray(X.data, dtype=X.data.dtype)
        X = cupyx.scipy.sparse.coo_matrix((data, (rows, cols)),
                                          shape=X.shape)
    else:
        X = input_to_cuml_array(X, order='K').array.to_output('cupy')

    jll = self._joint_log_likelihood(X)

    # normalize by P(X) = P(f_1, ..., f_n)
    # Compute log(sum(exp())), subtracting the max inside the exp to
    # prevent overflow to inf
    a_max = cp.amax(jll, axis=1, keepdims=True)

    exp = cp.exp(jll - a_max)
    logsumexp = cp.log(cp.sum(exp, axis=1))

    a_max = cp.squeeze(a_max, axis=1)

    log_prob_x = a_max + logsumexp

    if log_prob_x.ndim < 2:
        log_prob_x = log_prob_x.reshape((1, log_prob_x.shape[0]))
    result = jll - log_prob_x.T
    return CumlArray(result).to_output(out_type)
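# A pure-numpy sketch of the normalization step above: the max-shifted
# log-sum-exp computes log P(X) stably, and subtracting it row-wise turns
# joint log-likelihoods into log posteriors that sum to 1 per row after
# exponentiation. Illustration of the math only:
import numpy as np

jll = np.array([[-1.0, -2.0],
                [-0.5, -3.0]])                     # log P(x, c)
a_max = jll.max(axis=1, keepdims=True)
log_prob_x = a_max.squeeze(1) + np.log(np.exp(jll - a_max).sum(axis=1))
log_proba = jll - log_prob_x[:, np.newaxis]        # log P(c | x)
assert np.allclose(np.exp(log_proba).sum(axis=1), 1.0)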
def __init__(self, alpha=1.0, fit_prior=True, class_prior=None,
             output_type=None, handle=None):
    """
    Create a new multinomial Naive Bayes instance

    Parameters
    ----------
    alpha : float
        Additive (Laplace/Lidstone) smoothing parameter (0 for no
        smoothing).
    fit_prior : boolean
        Whether to learn class prior probabilities or not. If false, a
        uniform prior will be used.
    class_prior : array-like, size (n_classes,)
        Prior probabilities of the classes. If specified, the priors are
        not adjusted according to the data.
    """
    super(MultinomialNB, self).__init__(handle=handle,
                                        output_type=output_type)
    self.alpha = alpha
    self.fit_prior = fit_prior

    if class_prior is not None:
        self._class_prior_, *_ = input_to_cuml_array(class_prior)
    else:
        self._class_prior_ = None

    self.fit_called_ = False
    self._n_classes_ = 0
    self._n_features_ = None

    # Needed until Base no longer assumes cumlHandle
    self.handle = None
def score_samples(self, X):
    """Compute the log-likelihood of each sample under the model.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        An array of points to query. The last dimension should match the
        dimension of the training data (n_features).

    Returns
    -------
    density : ndarray of shape (n_samples,)
        Log-likelihood of each sample in `X`. These are normalized to be
        probability densities, so values will be low for high-dimensional
        data.
    """
    if not hasattr(self, "X_"):
        raise NotFittedError()
    X_cuml = input_to_cuml_array(X)
    if self.metric_params:
        if len(self.metric_params) != 1:
            raise ValueError(
                "cuML only supports metrics with a single arg.")
        metric_arg = list(self.metric_params.values())[0]
        distances = pairwise_distances(X_cuml.array, self.X_,
                                       metric=self.metric,
                                       metric_arg=metric_arg)
    else:
        distances = pairwise_distances(X_cuml.array, self.X_,
                                       metric=self.metric)

    distances = cp.asarray(distances)

    h = self.bandwidth
    if self.kernel in log_probability_kernels_:
        distances = log_probability_kernels_[self.kernel](distances, h)
    else:
        raise ValueError("Unsupported kernel.")

    log_probabilities = cp.zeros(distances.shape[0])
    if self.sample_weight_ is not None:
        distances += cp.log(self.sample_weight_)
    logsumexp_kernel.forall(log_probabilities.size)(distances,
                                                    log_probabilities)
    # Note that sklearn's user guide is wrong: it says the
    # (unnormalised) probability output for the kernel density is
    # sum(K(x,h)). What it actually implements is (1/n)*sum(K(x,h)).
    # Here we divide by n in normal probability space, which becomes
    # -log(n) in log probability space.
    sum_weights = (cp.sum(self.sample_weight_)
                   if self.sample_weight_ is not None
                   else distances.shape[1])
    log_probabilities -= np.log(sum_weights)

    # norm
    if len(X_cuml.array.shape) == 1:
        # if X is one dimensional, we have 1 feature
        dimension = 1
    else:
        dimension = X_cuml.array.shape[1]
    log_probabilities = norm_log_probabilities(log_probabilities,
                                               self.kernel, h, dimension)

    return log_probabilities
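# A compact numpy sketch of the log-density computation performed above,
# written out for the Gaussian kernel: per-pair log kernel values
# -d**2 / (2*h**2), a log-sum-exp over the training points, then the
# -log(n) term and the kernel normalization constant. This illustrates
# the math only, not the CUDA code path:
import numpy as np

rng = np.random.default_rng(0)
train = rng.normal(size=(100, 2))
query = rng.normal(size=(5, 2))
h = 0.5

d = np.linalg.norm(query[:, None, :] - train[None, :, :], axis=-1)
log_k = -d**2 / (2 * h**2)                         # Gaussian log kernel
a = log_k.max(axis=1, keepdims=True)
log_p = a.squeeze(1) + np.log(np.exp(log_k - a).sum(axis=1))
log_p -= np.log(train.shape[0])                    # the -log(n) term
dim = train.shape[1]
log_p -= dim * np.log(h) + 0.5 * dim * np.log(2 * np.pi)  # normalization
assert np.all(log_p < 0)       # densities are well below 1 for this data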
def create_output(X_in, output_type):
    cuml_ary_tuple = input_to_cuml_array(X_in, order="K")
    return cuml_ary_tuple.array.to_output(output_type)
def _partial_fit(self, X, y, sample_weight=None, _classes=None,
                 convert_dtype=True) -> "MultinomialNB":
    if has_scipy():
        from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
    else:
        from cuml.common.import_utils import dummy_function_always_false \
            as scipy_sparse_isspmatrix

    # todo: use a sparse CumlArray style approach when ready
    # https://github.com/rapidsai/cuml/issues/2216
    if scipy_sparse_isspmatrix(X) or cupyx.scipy.sparse.isspmatrix(X):
        X = _convert_x_sparse(X)
        # TODO: expand this, since the sparse kernel doesn't actually
        # require the scipy sparse container format.
    else:
        X = input_to_cupy_array(
            X, order='K',
            check_dtype=[cp.float32, cp.float64, cp.int32]).array

    expected_y_dtype = cp.int32 if X.dtype in [cp.float32, cp.int32] \
        else cp.int64
    y = input_to_cupy_array(
        y,
        convert_to_dtype=(expected_y_dtype if convert_dtype else False),
        check_dtype=expected_y_dtype).array

    Y, label_classes = make_monotonic(y, copy=True)

    if not self.fit_called_:
        self.fit_called_ = True
        if _classes is not None:
            _classes, *_ = input_to_cuml_array(
                _classes, order='K',
                convert_to_dtype=(expected_y_dtype if convert_dtype
                                  else False))
            check_labels(Y, _classes)
            self.classes_ = _classes
        else:
            self.classes_ = label_classes

        self._n_classes_ = self.classes_.shape[0]
        self._n_features_ = X.shape[1]
        self._init_counters(self._n_classes_, self._n_features_,
                            X.dtype)
    else:
        check_labels(Y, self.classes_)

    if cp.sparse.isspmatrix(X):
        self._count_sparse(X.row, X.col, X.data, X.shape, Y)
    else:
        self._count(X, Y)

    self._update_feature_log_prob(self.alpha)
    self._update_class_log_prior(class_prior=self._class_prior_)

    return self