def fit(self, y):
    """
    Fit label binarizer

    Parameters
    ----------
    y : array of shape [n_samples,] or [n_samples, n_classes]
        Target values. The 2-d matrix should only contain 0 and 1,
        represents multilabel classification.

    Returns
    -------
    self : returns an instance of self.

    Raises
    ------
    ValueError
        If ``y`` has more than 2 dimensions, or if a 2-d ``y`` contains
        any value other than 0 and 1.
    """
    if y.ndim > 2:
        raise ValueError("labels cannot be greater than 2 dimensions")

    if y.ndim == 2:
        unique_classes = rmm_cupy_ary(cp.unique, y)

        # Comparing a device array against a Python list inside an `if`
        # does not produce a usable boolean, so reduce explicitly to a
        # single host-side bool instead.
        is_binary = bool(
            cp.all((unique_classes == 0) | (unique_classes == 1)))
        if not is_binary:
            raise ValueError("2-d array must be binary")

        # For an indicator matrix the classes are the column positions.
        self.classes_ = rmm_cupy_ary(cp.arange, 0, y.shape[1])
    else:
        self.classes_ = rmm_cupy_ary(cp.unique, y).astype(y.dtype)

    cp.cuda.Stream.null.synchronize()

    return self
def inverse_transform(self, y, threshold=None):
    """
    Map binarized labels back onto the fitted classes

    Parameters
    ----------
    y : array of shape [n_samples, n_classes]
        Dense array, CuPy sparse matrix or SciPy sparse matrix.
    threshold : float
        this value is currently ignored

    Returns
    -------
    arr : array with original labels
    """
    # Pick the SciPy sparse predicate when SciPy is installed; otherwise
    # fall back to the project stub (by its name, it always answers False).
    if has_scipy():
        from scipy.sparse import isspmatrix as is_host_sparse
    else:
        from cuml.common.import_utils import dummy_function_always_false \
            as is_host_sparse

    if cupyx.scipy.sparse.isspmatrix(y):
        # Device sparse input: the CSR column indices are the class indices.
        class_idx = y.tocsr().indices.astype(self._classes_.dtype)
        return invert_labels(class_idx, self._classes_)

    if is_host_sparse(y):
        # Host sparse input: move the CSR column indices to the device.
        host_csr = y.tocsr()
        class_idx = rmm_cupy_ary(cp.array, host_csr.indices,
                                 dtype=host_csr.indices.dtype)
        return invert_labels(class_idx, self._classes_)

    # Dense input: the position of the row-wise maximum is the class index.
    dense = rmm_cupy_ary(cp.asarray, y, dtype=y.dtype)
    class_idx = rmm_cupy_ary(cp.argmax, dense, axis=1).astype(y.dtype)
    return invert_labels(class_idx, self._classes_)
def fit(self, y):
    """Fit label binarizer

    Parameters
    ----------
    y : Dask.Array of shape [n_samples,] or [n_samples, n_classes]
        chunked by row.
        Target values. The 2-d matrix should only contain 0 and 1,
        represents multilabel classification.

    Returns
    -------
    self : returns an instance of self.
    """
    # Take the unique classes of every partition and gather them locally.
    parts = self.client.sync(_extract_partitions, y)

    unique = [
        self.client.submit(LabelBinarizer._func_unique_classes, part)
        for _, part in parts
    ]

    # Second positional argument is passed through to `client.compute`
    # unchanged from the original call.
    classes = self.client.compute(unique, True)

    # Partitions may hold different numbers of distinct labels, so their
    # unique arrays can differ in length. `concatenate` accepts that,
    # whereas `stack` requires identical shapes.
    classes = rmm_cupy_ary(cp.unique,
                           rmm_cupy_ary(cp.concatenate, classes, axis=0))

    self._set_internal_model(LB(**self.kwargs).fit(classes))

    return self
def convert_dtype(X, to_dtype=np.float32, legacy=True):
    """
    Convert X to be of dtype `to_dtype`

    Overflow checking is only performed for NumPy inputs converted to a
    floating dtype.
    Todo: support other dtypes if needed.

    Parameters
    ----------
    X : numpy.ndarray, cudf/pandas Series or DataFrame, or CUDA array
    to_dtype : dtype (default: np.float32)
        Target dtype.
    legacy : bool (default: True)
        Only used for CUDA array inputs: when True the result is returned
        through ``cuda.as_cuda_array``, otherwise wrapped in a CumlArray.

    Returns
    -------
    The converted object. A NumPy array that already has `to_dtype` is
    returned unchanged.

    Raises
    ------
    TypeError
        If the conversion turned a finite value into an infinity, or if the
        input type is unsupported.
    """
    if isinstance(X, np.ndarray):
        if X.dtype != to_dtype:
            # The overflow is reported below as an exception, so silence
            # NumPy's own overflow warning for the cast.
            with np.errstate(over="ignore"):
                X_m = X.astype(to_dtype)

            # Data loss means an infinity that was NOT in the input:
            # inspect the converted array and ignore infinities that were
            # already present (those compare equal to the source value).
            if np.issubdtype(X_m.dtype, np.floating) and \
                    np.any(np.isinf(X_m) & (X != X_m)):
                raise TypeError("Data type conversion resulted "
                                "in data loss.")
            return X_m

    elif isinstance(X, (cudf.Series, cudf.DataFrame,
                        pd.Series, pd.DataFrame)):
        return X.astype(to_dtype)

    elif cuda.is_cuda_array(X):
        X_m = rmm_cupy_ary(cp.asarray, X)
        X_m = X_m.astype(to_dtype)
        if legacy:
            return cuda.as_cuda_array(X_m)
        else:
            return CumlArray(data=X_m)

    else:
        raise TypeError("Received unsupported input type: %s" % type(X))

    # Only reached for a NumPy array that already has the requested dtype.
    return X
def _conv_array_to_sparse(arr):
    """
    Converts an array (or cudf.DataFrame) to a sparse array

    :param arr: scipy or cupy sparse matrix, cudf DataFrame,
                dense numpy or cupy array
    :return: cupy sparse CSR matrix; an input that is already a cupy sparse
             matrix is returned unchanged, whatever its format
    :raises ValueError: if the input type is not one of the above
    """
    # Pick the SciPy sparse predicate when SciPy is installed; otherwise
    # fall back to the project stub (by its name, it always answers False).
    if has_scipy():
        from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
    else:
        from cuml.common.import_utils import dummy_function_always_false \
            as scipy_sparse_isspmatrix

    if scipy_sparse_isspmatrix(arr):
        ret = cupyx.scipy.sparse.csr_matrix(arr.tocsr())
    elif cupyx.scipy.sparse.isspmatrix(arr):
        ret = arr
    elif isinstance(arr, cudf.DataFrame):
        ret = _conv_df_to_sparse(arr)
    elif isinstance(arr, np.ndarray):
        cupy_ary = rmm_cupy_ary(cp.asarray, arr, dtype=arr.dtype)
        ret = cupyx.scipy.sparse.csr_matrix(cupy_ary)
    # Use the public `cp.ndarray` name: the private `cp.core.core` path is
    # not available in every CuPy release.
    elif isinstance(arr, cp.ndarray):
        ret = cupyx.scipy.sparse.csr_matrix(arr)
    else:
        raise ValueError("Unexpected input type %s" % type(arr))
    return ret
def inverse_transform(self, y, threshold=None):
    """
    Recover the original labels from a set of encoded labels

    Parameters
    ----------
    y : Dask.Array of shape [n_samples, n_classes] containing encoded
        labels
    threshold : float
        This value is currently ignored

    Returns
    -------
    arr : Dask.Array backed by CuPy arrays containing original labels
    """
    partitions = self.client.sync(_extract_partitions, y)

    delayed_inverse = dask.delayed(LabelBinarizer._func_inv_xform)

    label_dtype = self.classes_.dtype
    array_meta = rmm_cupy_ary(cp.zeros, 1, dtype=label_dtype)

    local_model = self._get_internal_model()

    # NOTE(review): every partition is declared with the full length of
    # `y`, and the final reshape drops the stacking axis -- this looks like
    # it assumes a single partition; confirm against callers.
    blocks = []
    for _worker, part in partitions:
        task = delayed_inverse(local_model, part, threshold)
        blocks.append(
            dask.array.from_delayed(task,
                                    dtype=label_dtype,
                                    shape=(y.shape[0], ),
                                    meta=array_meta))

    stacked = dask.array.stack(blocks, axis=0)
    return stacked.reshape(stacked.shape[1:])
def transform(self, y):
    """
    Encode labels and return the result

    Parameters
    ----------
    y : Dask.Array of shape [n_samples,] or [n_samples, n_classes]

    Returns
    -------
    arr : Dask.Array backed by CuPy arrays containing encoded labels
    """
    partitions = self.client.sync(_extract_partitions, y)

    local_model = self._get_internal_model()

    delayed_xform = dask.delayed(LabelBinarizer._func_xform)

    # The meta object tells Dask what kind of block backs the array.
    array_meta = rmm_cupy_ary(cp.zeros, 1)
    if local_model.sparse_output:
        array_meta = cupyx.scipy.sparse.csr_matrix(array_meta)

    blocks = []
    for _worker, part in partitions:
        task = delayed_xform(local_model, part)
        blocks.append(
            dask.array.from_delayed(task,
                                    meta=array_meta,
                                    dtype=cp.float32,
                                    shape=(len(y), len(self.classes_))))

    combined = dask.array.asarray(blocks)
    # Drop the leading per-partition axis.
    return combined.reshape(combined.shape[1:])
def label_binarize(y, classes, neg_label=0, pos_label=1,
                   sparse_output=False):
    """
    A stateless helper function to dummy encode multi-class labels.

    Parameters
    ----------
    y : array-like of size [n_samples,] or [n_samples, n_classes]
    classes : the set of unique classes in the input
    neg_label : integer the negative value for transformed output.
        Only applied to dense output; it is not written into a sparse
        result.
    pos_label : integer the positive value for transformed output
    sparse_output : bool whether to return sparse array

    Returns
    -------
    CuPy sparse CSR matrix when `sparse_output` is True, otherwise a dense
    array cast to the dtype of `y`.

    Raises
    ------
    ValueError
        If `y` contains a label that is not in `classes`.
    """
    classes = rmm_cupy_ary(cp.asarray, classes, dtype=classes.dtype)
    labels = rmm_cupy_ary(cp.asarray, y, dtype=y.dtype)

    if not check_labels(labels, classes):
        raise ValueError("Unseen classes encountered in input")

    # Row indices are sample positions, not labels: build them with an
    # index dtype so that narrow label dtypes (e.g. int8) cannot wrap
    # around once there are more rows than the label dtype can count.
    row_ind = rmm_cupy_ary(cp.arange, 0, labels.shape[0], 1,
                           dtype=cp.int32)
    col_ind, _ = make_monotonic(labels, classes, copy=True)

    val = rmm_cupy_ary(cp.full, row_ind.shape[0], pos_label, dtype=y.dtype)

    sp = cupyx.scipy.sparse.coo_matrix(
        (val, (row_ind, col_ind)),
        shape=(col_ind.shape[0], classes.shape[0]),
        dtype=cp.float32)

    cp.cuda.Stream.null.synchronize()

    if sparse_output:
        sp = sp.tocsr()
        return sp
    else:
        arr = sp.toarray().astype(y.dtype)
        # Every entry that did not receive `pos_label` becomes `neg_label`.
        arr[arr == 0] = neg_label

        return arr
def _func_inv_xform(model, y, threshold):
    """
    Run ``model.inverse_transform`` on one partition.

    ``y`` is first passed through ``cp.asarray`` (via ``rmm_cupy_ary``)
    keeping its own dtype; ``threshold`` is forwarded unchanged.
    """
    encoded = rmm_cupy_ary(cp.asarray, y, dtype=y.dtype)
    return model.inverse_transform(encoded, threshold)
def _func_xform(model, y):
    """
    Run ``model.transform`` on one partition.

    ``y`` is first passed through ``cp.asarray`` (via ``rmm_cupy_ary``)
    keeping its own dtype.
    """
    labels = rmm_cupy_ary(cp.asarray, y, dtype=y.dtype)
    encoded = model.transform(labels)
    return encoded
def _func_unique_classes(y):
    """Return the result of ``cp.unique`` on ``y`` (via ``rmm_cupy_ary``)."""
    distinct = rmm_cupy_ary(cp.unique, y)
    return distinct
def _unique(x):
    """Return the result of ``cp.unique`` on ``x`` (via ``rmm_cupy_ary``)."""
    distinct = rmm_cupy_ary(cp.unique, x)
    return distinct
def _count_accurate_predictions(y_hat, y):
    """
    Count the positions at which ``y_hat`` and ``y`` agree.

    Both inputs are passed through ``cp.asarray`` keeping their own dtype;
    the result is the length of ``y`` minus the number of non-zero entries
    of ``y - y_hat``.
    """
    predicted = rmm_cupy_ary(cp.asarray, y_hat, dtype=y_hat.dtype)
    actual = rmm_cupy_ary(cp.asarray, y, dtype=y.dtype)
    n_samples = actual.shape[0]
    n_wrong = cp.count_nonzero(actual - predicted)
    return n_samples - n_wrong
def _conv_df_to_sparse(x):
    """
    Convert a cuDF DataFrame to a CuPy sparse CSR matrix.

    The frame is materialised with ``to_cupy`` and cast to the dtype of its
    first column before the CSR matrix is built.
    """
    # `dtypes` is indexed by column label; `.iloc[0]` asks for the first
    # column by position, so it also works when no column is labelled 0.
    first_dtype = x.dtypes.iloc[0]
    cupy_ary = rmm_cupy_ary(cp.asarray, x.to_cupy(), dtype=first_dtype)

    return cupyx.scipy.sparse.csr_matrix(cupy_ary)
def _conv_df_to_sparse(x):
    """
    Convert a cuDF DataFrame to a CuPy sparse CSR matrix.

    The frame is materialised with ``as_gpu_matrix`` and cast to the dtype
    of its first column before the CSR matrix is built.
    """
    # `dtypes` is indexed by column label; `.iloc[0]` asks for the first
    # column by position, so it also works when no column is labelled 0.
    first_dtype = x.dtypes.iloc[0]
    cupy_ary = rmm_cupy_ary(cp.asarray, x.as_gpu_matrix(),
                            dtype=first_dtype)

    # Use the `cupyx.scipy.sparse` namespace, as the other sparse helpers
    # here do, instead of the `cp.sparse` alias.
    return cupyx.scipy.sparse.csr_matrix(cupy_ary)
def _conv_np_to_df(x):
    """
    Build a cuDF DataFrame from an array.

    ``x`` is passed through ``cp.asarray`` (via ``rmm_cupy_ary``) keeping
    its own dtype, then handed to ``cudf.DataFrame.from_gpu_matrix``.
    """
    device_matrix = rmm_cupy_ary(cp.asarray, x, dtype=x.dtype)
    frame = cudf.DataFrame.from_gpu_matrix(device_matrix)
    return frame
def input_to_cuml_array(X, order='F', deepcopy=False,
                        check_dtype=False, convert_to_dtype=False,
                        check_cols=False, check_rows=False,
                        fail_on_order=False):
    """
    Convert input X to CumlArray.

    Acceptable input formats:

    * cuDF Dataframe - returns a deep copy always.
    * cuDF Series - returns by reference or a deep copy depending on
      `deepcopy`.
    * Numpy array - returns a copy in device always
    * cuda array interface compliant array (like Cupy) - returns a
      reference unless `deepcopy`=True.
    * numba device array - returns a reference unless deepcopy=True
    * CumlArray - returns the same object unless `deepcopy`=True.

    Parameters
    ----------

    X : cuDF.DataFrame, cuDF.Series, numba array, NumPy array or any
        cuda_array_interface compliant array like CuPy or pytorch.

    order: 'F', 'C' or 'K' (default: 'F')
        Whether to return a F-major ('F'), C-major ('C') array or Keep ('K')
        the order of X. Used to check the order of the input. If
        fail_on_order=True, the method will raise ValueError,
        otherwise it will convert X to be of order `order` if needed.

    deepcopy: boolean (default: False)
        Set to True to always return a deep copy of X.

    check_dtype: np.dtype (default: False)
        Set to a np.dtype (or a list of them) to throw an error if X is not
        of dtype `check_dtype`. Ignored when `convert_to_dtype` is set.

    convert_to_dtype: np.dtype (default: False)
        Set to a dtype if you want X to be converted to that dtype if it is
        not that dtype already.

    check_cols: int (default: False)
        Set to an int `i` to check that input X has `i` columns. Set to False
        (default) to not check at all.

    check_rows: int (default: False)
        Set to an int `i` to check that input X has `i` rows. Set to False
        (default) to not check at all.

    fail_on_order: boolean (default: False)
        Set to True if you want the method to raise a ValueError if X is not
        of order `order`.

    Returns
    -------
    `cuml_array`: namedtuple('cuml_array', 'array n_rows n_cols dtype')

        A new CumlArray and associated data.

    Raises
    ------
    ValueError
        If a cuDF Series contains nulls, if the column/row count check
        fails, or if the order differs and `fail_on_order` is True.
    TypeError
        If the input format is unsupported or the dtype check fails.
    """

    # dtype conversion

    if convert_to_dtype:
        X = convert_dtype(X, to_dtype=convert_to_dtype)
        # The data now has the requested dtype, so the check is moot.
        check_dtype = False

    # format conversion

    if isinstance(X, cudf.Series):
        if X.null_count != 0:
            raise ValueError("Error: cuDF Series has missing/null values, " +
                             " which are not supported by cuML.")

    # converting pandas to numpy before sending it to CumlArray
    if isinstance(X, (pd.DataFrame, pd.Series)):
        # pandas doesn't support custom order in to_numpy
        X = cp.asarray(X.to_numpy(copy=False), order=order)

    if isinstance(X, cudf.DataFrame):
        if order == 'K':
            X_m = CumlArray(data=X.as_gpu_matrix(order='F'))
        else:
            X_m = CumlArray(data=X.as_gpu_matrix(order=order))

    elif isinstance(X, CumlArray):
        # Honour `deepcopy` here as well, so that the documented "always
        # return a deep copy" holds for CumlArray inputs too.
        X_m = copy.deepcopy(X) if deepcopy else X

    elif hasattr(X, "__array_interface__") or \
            hasattr(X, "__cuda_array_interface__"):
        X_m = CumlArray(data=X)

        if deepcopy:
            X_m = copy.deepcopy(X_m)

    else:
        msg = "X matrix format " + str(X.__class__) + " not supported"
        raise TypeError(msg)

    if check_dtype:
        if not isinstance(check_dtype, list):
            check_dtype = [check_dtype]

        check_dtype = [np.dtype(dtype) for dtype in check_dtype]

        if X_m.dtype not in check_dtype:
            type_str = X_m.dtype
            # Drop the reference to the converted array before raising.
            del X_m
            raise TypeError("Expected input to be of type in " +
                            str(check_dtype) + " but got " + str(type_str))

    # Checks based on parameters

    n_rows = X_m.shape[0]

    if len(X_m.shape) > 1:
        n_cols = X_m.shape[1]
    else:
        n_cols = 1

    # A single row or column is treated as order-agnostic: skip the order
    # check and conversion below.
    if n_cols == 1 or n_rows == 1:
        order = 'K'

    if check_cols:
        if n_cols != check_cols:
            raise ValueError("Expected " + str(check_cols) +
                             " columns but got " + str(n_cols) +
                             " columns.")

    if check_rows:
        if n_rows != check_rows:
            raise ValueError("Expected " + str(check_rows) +
                             " rows but got " + str(n_rows) +
                             " rows.")

    if order != 'K' and X_m.order != order:
        if fail_on_order:
            raise ValueError("Expected " + order_to_str(order) +
                             " major order, but got the opposite.")
        else:
            warnings.warn("Expected " + order_to_str(order) + " major order, "
                          "but got the opposite. Converting data, this will "
                          "result in additional memory utilization.")
            X_m = rmm_cupy_ary(cp.array, X_m, copy=False, order=order)
            X_m = CumlArray(data=X_m)

    return cuml_array(array=X_m, n_rows=n_rows, n_cols=n_cols,
                      dtype=X_m.dtype)
def to_sparse_dask_array(cudf_or_array, client=None):
    """
    Converts an array or cuDF to a sparse Dask array backed by sparse CuPy
    CSR matrices. Unfortunately, due to current limitations in Dask, there is
    no direct path to convert a cupy.sparse.spmatrix into a CuPy backed
    dask.Array without copying to host.

    NOTE: Until https://github.com/cupy/cupy/issues/2655 and
    https://github.com/dask/dask/issues/5604 are implemented, compute()
    will not be able to be called on a Dask.array that is backed with
    sparse CuPy arrays because they lack the necessary functionality
    to be stacked into a single array. The array returned from this
    utility will, however, still be able to be passed into functions
    that can make use of sparse CuPy-backed Dask.Array (eg. Distributed
    Naive Bayes).

    Relevant cuML issue: https://github.com/rapidsai/cuml/issues/1387

    Parameters
    ----------
    cudf_or_array : cuDF Dataframe, array-like sparse / dense array, or
                    Dask DataFrame/Array
    client : dask.distributed.Client (optional) Dask client; when None the
             result of `default_client()` is used

    Returns
    -------
    dask_array : dask.Array backed by cupy.sparse.csr_matrix
    """
    client = default_client() if client is None else client

    # Makes sure the MatDescriptor workaround for CuPy sparse arrays
    # is loaded (since Dask lazy-loaded serialization in cuML is only
    # executed when object from the cuML package needs serialization.
    # This can go away once the MatDescriptor pickling bug is fixed
    # in CuPy.
    # Ref: https://github.com/cupy/cupy/issues/3061
    from cuml.comm import serialize  # NOQA

    # Captured before any conversion; used for the non-Dask result below.
    shape = cudf_or_array.shape

    # Meta object describing the block type of the resulting Dask array.
    meta = cupyx.scipy.sparse.csr_matrix(rmm_cupy_ary(cp.zeros, 1))

    ret = cudf_or_array

    # If we have a Dask array, convert it to a Dask DataFrame
    if isinstance(ret, dask.array.Array):
        # At the time of developing this, using map_blocks will not work
        # to convert a Dask.Array to CuPy sparse arrays underneath.

        def _conv_np_to_df(x):
            # Move one partition to the device and wrap it in a DataFrame.
            cupy_ary = rmm_cupy_ary(cp.asarray, x, dtype=x.dtype)
            return cudf.DataFrame.from_gpu_matrix(cupy_ary)

        parts = client.sync(_extract_partitions, ret)
        # Each conversion is submitted to the worker `w` paired with the
        # partition by `_extract_partitions`.
        futures = [client.submit(_conv_np_to_df, part, workers=[w], pure=False)
                   for w, part in parts]

        ret = df_to_dask_cudf(futures)

    # If we have a Dask Dataframe, use `map_partitions` to convert it
    # to a Sparse Cupy-backed Dask Array. This will also convert the dense
    # Dask array above to a Sparse Cupy-backed Dask Array, since we cannot
    # use map_blocks on the array, but we can use `map_partitions` on the
    # Dataframe.
    if isinstance(ret, dask.dataframe.DataFrame):
        ret = ret.map_partitions(
            _conv_df_to_sparse,
            meta=dask.array.from_array(meta))

        # This will also handle the input of dask.array.Array
        return ret

    else:
        # Non-Dask input: convert locally into a single sparse matrix.
        ret = _conv_array_to_sparse(ret)

        # Push to worker
        final_result = client.scatter(ret)

        return dask.array.from_delayed(final_result, shape=shape,
                                       meta=meta)
def train_test_split(X,
                     y,
                     test_size: Union[float, int] = None,
                     train_size: Union[float, int] = None,
                     shuffle: bool = True,
                     random_state: Union[int, cp.random.RandomState,
                                         np.random.RandomState] = None,
                     seed: Union[int, cp.random.RandomState,
                                 np.random.RandomState] = None):
    """
    Partitions device data into four collated objects, mimicking
    Scikit-learn's `train_test_split`

    Parameters
    ----------
    X : cudf.DataFrame or cuda_array_interface compliant device array
        Data to split, has shape (n_samples, n_features)
    y : str, cudf.Series or cuda_array_interface compliant device array
        Set of labels for the data, either a series of shape (n_samples) or
        the string label of a column in X (if it is a cuDF DataFrame)
        containing the labels
    test_size : float or int, optional
        If float, represents the proportion [0, 1] of the data
        to be assigned to the test set. If an int, represents the number
        of instances to be assigned to the test set. When omitted, the
        test set is whatever the training set does not take.
    train_size : float or int, optional
        If float, represents the proportion [0, 1] of the data
        to be assigned to the training set. If an int, represents the number
        of instances to be assigned to the training set. Defaults to 0.75
        when neither `train_size` nor `test_size` is given.
    shuffle : bool, optional
        Whether or not to shuffle inputs before splitting
    random_state : int, CuPy RandomState or NumPy RandomState optional
        If shuffle is true, seeds the generator. Unseeded by default
    seed: random_state : int, CuPy RandomState or NumPy RandomState optional
        Deprecated in favor of `random_state`.
        If shuffle is true, seeds the generator. Unseeded by default

    Examples
    --------
    .. code-block:: python

        import cudf
        from cuml.preprocessing.model_selection import train_test_split

        # Generate some sample data
        df = cudf.DataFrame({'x': range(10),
                             'y': [0, 1] * 5})
        print(f'Original data: {df.shape[0]} elements')

        # Suppose we want an 80/20 split
        X_train, X_test, y_train, y_test = train_test_split(df, 'y',
                                                            train_size=0.8)
        print(f'X_train: {X_train.shape[0]} elements')
        print(f'X_test: {X_test.shape[0]} elements')
        print(f'y_train: {y_train.shape[0]} elements')
        print(f'y_test: {y_test.shape[0]} elements')

        # Alternatively, if our labels are stored separately
        labels = df['y']
        df = df.drop(['y'])

        # we can also do
        X_train, X_test, y_train, y_test = train_test_split(df, labels,
                                                            train_size=0.8)

    Output:

    .. code-block:: python

        Original data: 10 elements
        X_train: 8 elements
        X_test: 2 elements
        y_train: 8 elements
        y_test: 2 elements

    Returns
    -------
    X_train, X_test, y_train, y_test : cudf.DataFrame
        Partitioned dataframes. If `y` was provided as a column name, the
        column was dropped from the `X`s

    Raises
    ------
    TypeError
        If `y` is a string but `X` is not a cuDF DataFrame, if the input
        type check fails, or if `random_state` has an unsupported type.
    ValueError
        If `X` and `y` differ in their first dimension, or if `train_size`
        / `test_size` are out of range.
    """
    if isinstance(y, str):
        # Use the column with name `str` as y
        if isinstance(X, cudf.DataFrame):
            name = y
            y = X[name]
            X = X.drop(name)
        else:
            raise TypeError("X needs to be a cuDF Dataframe when y is a "
                            "string")

    # todo: this check will be replaced with upcoming improvements
    # to input_utils with PR #1379
    # NOTE(review): the trailing `isinstance(y, cudf.Series)` term in both
    # checks looks unintended given the error messages, but tightening it
    # would start rejecting inputs (e.g. sparse matrices) that the slicing
    # code below handles -- conditions are left unchanged; confirm intent.
    if not cuda.is_cuda_array(X) and not isinstance(X, cudf.DataFrame) \
            and isinstance(y, cudf.Series):
        raise TypeError("X needs to be either a cuDF DataFrame, Series or "
                        "a cuda_array_interface compliant array.")

    if not cuda.is_cuda_array(y) and not isinstance(y, cudf.DataFrame) \
            and isinstance(y, cudf.Series):
        raise TypeError("y needs to be either a cuDF DataFrame, Series or "
                        "a cuda_array_interface compliant array.")

    if X.shape[0] != y.shape[0]:
        raise ValueError("X and y must have the same first dimension "
                         "(found {} and {})".format(X.shape[0], y.shape[0]))

    if isinstance(train_size, float):
        if not 0 <= train_size <= 1:
            raise ValueError("proportion train_size should be between "
                             "0 and 1 (found {})".format(train_size))

    if isinstance(train_size, int):
        if not 0 <= train_size <= X.shape[0]:
            raise ValueError(
                "Number of instances train_size should be between 0 and the "
                "first dimension of X (found {})".format(train_size))

    if isinstance(test_size, float):
        if not 0 <= test_size <= 1:
            # Report the offending test_size (not train_size).
            raise ValueError("proportion test_size should be between "
                             "0 and 1 (found {})".format(test_size))

    if isinstance(test_size, int):
        if not 0 <= test_size <= X.shape[0]:
            raise ValueError(
                "Number of instances test_size should be between 0 and the "
                "first dimension of X (found {})".format(test_size))

    # Remember whether inputs were numba device arrays so that the outputs
    # can be converted back at the end.
    x_numba = False
    y_numba = False

    if seed is not None:
        if random_state is None:
            warnings.warn("Parameter 'seed' is deprecated, please use "
                          "'random_state' instead.")
            random_state = seed
        else:
            warnings.warn("Both 'seed' and 'random_state' parameters were "
                          "set, using 'random_state' since 'seed' is "
                          "deprecated.")

    if shuffle:
        if random_state is None or isinstance(random_state, int):
            idxs = rmm_cupy_ary(cp.arange, X.shape[0])
            random_state = cp.random.RandomState(seed=random_state)

        elif isinstance(random_state, cp.random.RandomState):
            idxs = rmm_cupy_ary(cp.arange, X.shape[0])

        elif isinstance(random_state, np.random.RandomState):
            idxs = np.arange(X.shape[0])

        else:
            raise TypeError("`random_state` must be an int, NumPy "
                            "RandomState or CuPy RandomState.")

        random_state.shuffle(idxs)

        if isinstance(X, (cudf.DataFrame, cudf.Series)):
            X = X.iloc[idxs].reset_index(drop=True)

        elif cuda.is_cuda_array(X):
            # numba (and therefore rmm device_array) does not support
            # fancy indexing
            if cuda.devicearray.is_cuda_ndarray(X):
                x_numba = True
            X = cp.asarray(X)[idxs]

        if isinstance(y, (cudf.DataFrame, cudf.Series)):
            y = y.iloc[idxs]

        elif cuda.is_cuda_array(y):
            if cuda.devicearray.is_cuda_ndarray(y):
                y_numba = True
            y = cp.asarray(y)[idxs]

    # Determining sizes of splits
    if isinstance(train_size, float):
        train_size = int(X.shape[0] * train_size)

    if test_size is None:
        if train_size is None:
            train_size = int(X.shape[0] * 0.75)

        test_size = X.shape[0] - train_size

    if isinstance(test_size, float):
        test_size = int(X.shape[0] * test_size)
        if train_size is None:
            train_size = X.shape[0] - test_size

    elif isinstance(test_size, int):
        if train_size is None:
            train_size = X.shape[0] - test_size

    # Slice the test set from an explicit start offset: a negative-index
    # slice `[-test_size:]` would return the WHOLE input when test_size
    # is 0 instead of an empty split.
    test_start = X.shape[0] - test_size

    if cuda.is_cuda_array(X) or isinstance(X, cp.sparse.csr_matrix):
        X_train = X[0:train_size]
        y_train = y[0:train_size]
    elif isinstance(X, cudf.DataFrame):
        X_train = X.iloc[0:train_size]
        y_train = y.iloc[0:train_size]

    if cuda.is_cuda_array(y) or isinstance(X, cp.sparse.csr_matrix):
        X_test = X[test_start:]
        y_test = y[test_start:]
    elif isinstance(y, cudf.DataFrame):
        X_test = X.iloc[test_start:]
        y_test = y.iloc[test_start:]

    if x_numba:
        X_train = cuda.as_cuda_array(X_train)
        X_test = cuda.as_cuda_array(X_test)

    if y_numba:
        y_train = cuda.as_cuda_array(y_train)
        y_test = cuda.as_cuda_array(y_test)

    return X_train, X_test, y_train, y_test