Ejemplo n.º 1
    def fit(self, y):
        """Fit label binarizer

        Parameters
        ----------
        y : array of shape [n_samples,] or [n_samples, n_classes]
            Target values. The 2-d matrix should only contain 0 and 1,
            represents multilabel classification.

        Returns
        -------
        self : returns an instance of self.

        Raises
        ------
        ValueError
            If `y` has more than 2 dimensions, or if it is 2-d and holds a
            value other than 0 or 1.
        """

        if y.ndim > 2:
            raise ValueError("labels cannot be greater than 2 dimensions")

        if y.ndim == 2:

            unique_classes = rmm_cupy_ary(cp.unique, y)
            # `unique_classes != [0, 1]` compares element-wise and cannot be
            # used as a truth value, so reduce the check explicitly. A matrix
            # holding only zeros (or only ones) is still binary.
            is_binary = ((unique_classes == 0) | (unique_classes == 1)).all()
            if not bool(is_binary):
                raise ValueError("2-d array can must be binary")

            # One class per column of the indicator matrix.
            self.classes_ = rmm_cupy_ary(cp.arange, 0, y.shape[1])
        else:
            self.classes_ = rmm_cupy_ary(cp.unique, y).astype(y.dtype)

        return self
Ejemplo n.º 2
    def inverse_transform(self, y, threshold=None):
        """
        Transform binary labels back to original multi-class labels

        Parameters
        ----------

        y : array of shape [n_samples, n_classes]
        threshold : float this value is currently ignored

        Returns
        -------

        arr : array with original labels
        """

        # SciPy is optional: without it, fall back to a predicate that never
        # reports a SciPy sparse matrix.
        if has_scipy():
            from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
        else:
            from cuml.common.import_utils import dummy_function_always_false \
                    as scipy_sparse_isspmatrix

        # Sparse inputs: the CSR column indices are used as the class
        # positions. Dense inputs: take the argmax of every row.
        if cupyx.scipy.sparse.isspmatrix(y):
            y_mapped = y.tocsr().indices.astype(self._classes_.dtype)
        elif scipy_sparse_isspmatrix(y):
            y = y.tocsr()
            y_mapped = rmm_cupy_ary(cp.array, y.indices, dtype=y.indices.dtype)
        else:
            y_mapped = rmm_cupy_ary(cp.argmax,
                                    rmm_cupy_ary(cp.asarray, y, dtype=y.dtype),
                                    axis=1).astype(y.dtype)

        return invert_labels(y_mapped, self._classes_)
Ejemplo n.º 3
    def fit(self, y):
        """Fit label binarizer

        Parameters
        ----------
        y : Dask.Array of shape [n_samples,] or [n_samples, n_classes]
            chunked by row.
            Target values. The 2-d matrix should only contain 0 and 1,
            represents multilabel classification.

        Returns
        -------
        self : returns an instance of self.
        """

        # Take the unique classes and broadcast them all around the cluster.
        futures = self.client.sync(_extract_partitions, y)

        # One task per partition; the first tuple element is not needed here.
        unique = [
            self.client.submit(LabelBinarizer._func_unique_classes, part)
            for _, part in futures
        ]

        classes = self.client.compute(unique, True)
        # Merge the per-partition results into a single de-duplicated array.
        classes = rmm_cupy_ary(cp.unique,
                               rmm_cupy_ary(cp.stack, classes, axis=0))

        # NOTE(review): `classes` is computed but never stored or used in the
        # code visible here; the step that fits/stores the local model with
        # these classes appears to be missing - confirm against upstream.

        return self
Ejemplo n.º 4
def convert_dtype(X, to_dtype=np.float32, legacy=True):
    """
    Convert X to be of dtype `to_dtype`

    Supported float dtypes for overflow checking.
    Todo: support other dtypes if needed.

    Parameters
    ----------
    X : numpy.ndarray, cudf/pandas Series or DataFrame, or CUDA array
        Data to convert.
    to_dtype : dtype (default: np.float32)
        Target dtype.
    legacy : bool (default: True)
        Only used for CUDA array inputs: if True the result is wrapped with
        `cuda.as_cuda_array`, otherwise it is returned as a `CumlArray`.

    Returns
    -------
    The converted data. A NumPy array that already has dtype `to_dtype` is
    returned unchanged.

    Raises
    ------
    TypeError
        If a NumPy conversion to a floating dtype turns a finite value into
        infinity, or if the type of `X` is not supported.
    """

    if isinstance(X, np.ndarray):
        if X.dtype != to_dtype:
            # Overflow is detected explicitly below, so silence the warning.
            with np.errstate(over="ignore"):
                X_m = X.astype(to_dtype)

            if np.issubdtype(X_m.dtype, np.floating):
                # Inspect the *converted* array: a value that only became
                # infinite through the cast has been lost. Infinities that
                # were already present in X are not a conversion error.
                overflowed = np.isinf(X_m)
                if X.dtype.kind in "fc":
                    overflowed = overflowed & ~np.isinf(X)
                if overflowed.any():
                    raise TypeError("Data type conversion resulted "
                                    "in data loss.")
            return X_m

    elif isinstance(X, (cudf.Series, cudf.DataFrame, pd.Series, pd.DataFrame)):
        return X.astype(to_dtype)

    elif cuda.is_cuda_array(X):
        X_m = rmm_cupy_ary(cp.asarray, X)
        X_m = X_m.astype(to_dtype)
        if legacy:
            return cuda.as_cuda_array(X_m)
        else:
            return CumlArray(data=X_m)

    else:
        raise TypeError("Received unsupported input type: %s" % type(X))

    # Reached only for a NumPy array that already has the requested dtype.
    return X
Ejemplo n.º 5
def _conv_array_to_sparse(arr):
    """
    Converts an array (or cudf.DataFrame) to a sparse array
    :param arr: scipy or cupy sparse matrix, cudf DataFrame,
                dense numpy or cupy array
    :return: cupy sparse matrix (CSR for every input except a cupy sparse
             matrix, which is returned unchanged)
    :raises ValueError: if the type of `arr` is not one of the above
    """
    # SciPy is optional: without it, fall back to a predicate that never
    # reports a SciPy sparse matrix.
    if has_scipy():
        from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
    else:
        from cuml.common.import_utils import dummy_function_always_false \
            as scipy_sparse_isspmatrix

    if scipy_sparse_isspmatrix(arr):
        ret = cupyx.scipy.sparse.csr_matrix(arr.tocsr())
    elif cupyx.scipy.sparse.isspmatrix(arr):
        ret = arr
    elif isinstance(arr, cudf.DataFrame):
        ret = _conv_df_to_sparse(arr)
    elif isinstance(arr, np.ndarray):
        cupy_ary = rmm_cupy_ary(cp.asarray, arr, dtype=arr.dtype)
        ret = cupyx.scipy.sparse.csr_matrix(cupy_ary)
    # `cp.ndarray` is the public name of the array class; the private
    # `cp.core.core` module path is not available in every CuPy release.
    elif isinstance(arr, cp.ndarray):
        ret = cupyx.scipy.sparse.csr_matrix(arr)
    else:
        raise ValueError("Unexpected input type %s" % type(arr))
    return ret
Ejemplo n.º 6
    def inverse_transform(self, y, threshold=None):
        """
        Invert a set of encoded labels back to original labels

        Parameters
        ----------

        y : Dask.Array of shape [n_samples, n_classes] containing encoded
            labels

        threshold : float This value is currently ignored

        Returns
        -------

        arr : Dask.Array backed by CuPy arrays containing original labels
        """

        parts = self.client.sync(_extract_partitions, y)
        inv_func = dask.delayed(LabelBinarizer._func_inv_xform)

        # The output takes the dtype of the fitted classes; `meta` tells Dask
        # what kind of array backs every block.
        dtype = self.classes_.dtype
        meta = rmm_cupy_ary(cp.zeros, 1, dtype=dtype)

        internal_model = self._get_internal_model()

        f = [
            dask.array.from_delayed(inv_func(internal_model, part, threshold),
                                    shape=(y.shape[0], ),
                                    meta=meta) for _, part in parts
        ]

        # Stacking adds a leading axis that is dropped again by the reshape.
        ret = dask.array.stack(f, axis=0)
        return ret.reshape(ret.shape[1:])
Ejemplo n.º 7
    def transform(self, y):
        """
        Transform and return encoded labels

        Parameters
        ----------

        y : Dask.Array of shape [n_samples,] or [n_samples, n_classes]

        Returns
        -------

        arr : Dask.Array backed by CuPy arrays containing encoded labels
        """

        parts = self.client.sync(_extract_partitions, y)

        internal_model = self._get_internal_model()

        xform_func = dask.delayed(LabelBinarizer._func_xform)

        # `meta` describes the array type backing each block: dense CuPy, or
        # CuPy CSR when the local model produces sparse output.
        meta = rmm_cupy_ary(cp.zeros, 1)
        if internal_model.sparse_output:
            meta = cupyx.scipy.sparse.csr_matrix(meta)

        f = [
            dask.array.from_delayed(xform_func(internal_model, part),
                                    shape=(len(y), len(self.classes_)),
                                    # `meta` was built above but never handed
                                    # to Dask; pass it so the block type is
                                    # known.
                                    meta=meta)
            for _, part in parts
        ]

        # `asarray` adds a leading axis that is dropped again by the reshape.
        arr = dask.array.asarray(f)
        return arr.reshape(arr.shape[1:])
Ejemplo n.º 8
def label_binarize(y, classes, neg_label=0, pos_label=1, sparse_output=False):
    """
    A stateless helper function to dummy encode multi-class labels.

    Parameters
    ----------

    y : array-like of size [n_samples,] or [n_samples, n_classes]
    classes : the set of unique classes in the input
    neg_label : integer the negative value for transformed output
    pos_label : integer the positive value for transformed output
    sparse_output : bool whether to return sparse array

    Returns
    -------

    arr : CSR sparse matrix if `sparse_output` is True, otherwise a dense
          array of dtype `y.dtype`, with one row per label and one column
          per class

    Raises
    ------
    ValueError
        If `check_labels` reports labels that are not in `classes`.
    """

    classes = rmm_cupy_ary(cp.asarray, classes, dtype=classes.dtype)
    labels = rmm_cupy_ary(cp.asarray, y, dtype=y.dtype)

    if not check_labels(labels, classes):
        raise ValueError("Unseen classes encountered in input")

    # One entry per sample: row i gets `pos_label` in the column returned by
    # `make_monotonic` for label i.
    row_ind = rmm_cupy_ary(cp.arange, 0, labels.shape[0], 1, dtype=y.dtype)
    col_ind, _ = make_monotonic(labels, classes, copy=True)

    val = rmm_cupy_ary(cp.full, row_ind.shape[0], pos_label, dtype=y.dtype)

    # NOTE(review): the end of this call was missing from the source. The
    # explicit float dtype assumes CuPy sparse matrices need floating-point
    # data - confirm against upstream.
    sp = cupyx.scipy.sparse.coo_matrix(
        (val, (row_ind, col_ind)),
        shape=(col_ind.shape[0], classes.shape[0]),
        dtype=cp.float32)

    if sparse_output:
        sp = sp.tocsr()
        return sp
    else:
        arr = sp.toarray().astype(y.dtype)
        # Positions without an entry are zero after densifying.
        arr[arr == 0] = neg_label

        return arr
Ejemplo n.º 9
 def _func_inv_xform(model, y, threshold):
     """Pass `y` through `rmm_cupy_ary(cp.asarray, ...)`, keeping its dtype,
     and return `model.inverse_transform` of the result."""
     encoded = rmm_cupy_ary(cp.asarray, y, dtype=y.dtype)
     decoded = model.inverse_transform(encoded, threshold)
     return decoded
Ejemplo n.º 10
 def _func_xform(model, y):
     """Pass `y` through `rmm_cupy_ary(cp.asarray, ...)`, keeping its dtype,
     and return `model.transform` of the result."""
     return model.transform(rmm_cupy_ary(cp.asarray, y, dtype=y.dtype))
Ejemplo n.º 11
 def _func_unique_classes(y):
     """Return the result of `cp.unique` on `y`, invoked through
     `rmm_cupy_ary`."""
     unique_classes = rmm_cupy_ary(cp.unique, y)
     return unique_classes
Ejemplo n.º 12
 def _unique(x):
     """Return the result of `cp.unique` on `x`, invoked through
     `rmm_cupy_ary`."""
     distinct = rmm_cupy_ary(cp.unique, x)
     return distinct
Ejemplo n.º 13
 def _count_accurate_predictions(y_hat, y):
     """Count the positions where `y_hat` equals `y`.

     Both inputs go through `rmm_cupy_ary(cp.asarray, ...)` with their own
     dtype; the result is the length of `y` minus the number of non-zero
     entries of `y - y_hat`.
     """
     predicted = rmm_cupy_ary(cp.asarray, y_hat, dtype=y_hat.dtype)
     expected = rmm_cupy_ary(cp.asarray, y, dtype=y.dtype)
     n_mismatches = cp.count_nonzero(expected - predicted)
     return expected.shape[0] - n_mismatches
Ejemplo n.º 14
def _conv_df_to_sparse(x):
    """Convert the DataFrame `x` to a CuPy CSR sparse matrix.

    The dense data comes from `x.to_cupy()` and is cast to the dtype of the
    first column of `x`.
    """
    # `.iloc[0]` always selects the first column's dtype by position.
    # `x.dtypes[0]` is a label lookup when the column names are integers
    # (failing if no column is named 0) and a deprecated positional fallback
    # otherwise.
    first_dtype = x.dtypes.iloc[0]
    cupy_ary = rmm_cupy_ary(cp.asarray, x.to_cupy(), dtype=first_dtype)

    return cupyx.scipy.sparse.csr_matrix(cupy_ary)
Ejemplo n.º 15
def _conv_df_to_sparse(x):
    """Convert the DataFrame `x` to a CuPy CSR sparse matrix.

    The dense data comes from `x.to_cupy()` and is cast to the dtype of the
    first column of `x`.
    """
    # `.iloc[0]` always selects the first column's dtype by position.
    # `x.dtypes[0]` is a label lookup when the column names are integers
    # (failing if no column is named 0) and a deprecated positional fallback
    # otherwise.
    first_dtype = x.dtypes.iloc[0]

    # Kept in line with the other `_conv_df_to_sparse` variant in this file:
    # `to_cupy()` and `cupyx.scipy.sparse` replace the deprecated
    # `as_gpu_matrix()` and `cp.sparse` spellings.
    cupy_ary = rmm_cupy_ary(cp.asarray, x.to_cupy(), dtype=first_dtype)

    return cupyx.scipy.sparse.csr_matrix(cupy_ary)
Ejemplo n.º 16
 def _conv_np_to_df(x):
     """Pass `x` through `rmm_cupy_ary(cp.asarray, ...)`, keeping its dtype,
     and build a cuDF DataFrame from the result with `from_gpu_matrix`."""
     return cudf.DataFrame.from_gpu_matrix(
         rmm_cupy_ary(cp.asarray, x, dtype=x.dtype))
Ejemplo n.º 17
def input_to_cuml_array(X,
                        order='F',
                        deepcopy=False,
                        check_dtype=False,
                        convert_to_dtype=False,
                        check_cols=False,
                        check_rows=False,
                        fail_on_order=False):
    """
    Convert input X to CumlArray.

    Acceptable input formats:

    * cuDF Dataframe - returns a deep copy always.
    * cuDF Series - returns by reference or a deep copy depending on
        `deepcopy`.
    * Numpy array - returns a copy in device always
    * cuda array interface compliant array (like Cupy) - returns a
        reference unless `deepcopy`=True.
    * numba device array - returns a reference unless deepcopy=True

    Parameters
    ----------

    X : cuDF.DataFrame, cuDF.Series, numba array, NumPy array or any
        cuda_array_interface compliant array like CuPy or pytorch.

    order: 'F', 'C' or 'K' (default: 'F')
        Whether to return a F-major ('F'),  C-major ('C') array or Keep ('K')
        the order of X. Used to check the order of the input. If
        fail_on_order=True, the method will raise ValueError,
        otherwise it will convert X to be of order `order` if needed.

    deepcopy: boolean (default: False)
        Set to True to always return a deep copy of X.

    check_dtype: np.dtype (default: False)
        Set to a np.dtype (or a list of them) to throw an error if X is not
        of dtype `check_dtype`. Ignored when `convert_to_dtype` is set.

    convert_to_dtype: np.dtype (default: False)
        Set to a dtype if you want X to be converted to that dtype if it is
        not that dtype already.

    check_cols: int (default: False)
        Set to an int `i` to check that input X has `i` columns. Set to False
        (default) to not check at all.

    check_rows: int (default: False)
        Set to an int `i` to check that input X has `i` rows. Set to False
        (default) to not check at all.

    fail_on_order: boolean (default: False)
        Set to True if you want the method to raise a ValueError if X is not
        of order `order`.

    Returns
    -------
    `cuml_array`: namedtuple('cuml_array', 'array n_rows n_cols dtype')

        A new CumlArray and associated data.

    Raises
    ------
    ValueError
        If a cuDF Series holds nulls, if the row/column count does not match
        `check_rows`/`check_cols`, or if the order differs from `order`
        while `fail_on_order` is True.
    TypeError
        If the type of X is not supported or its dtype is not in
        `check_dtype`.
    """

    # dtype conversion

    if convert_to_dtype:
        X = convert_dtype(X, to_dtype=convert_to_dtype)
        # The dtype was just forced, so a later dtype check is redundant.
        check_dtype = False

    # format conversion

    if (isinstance(X, cudf.Series)):
        if X.null_count != 0:
            raise ValueError("Error: cuDF Series has missing/null values, " +
                             " which are not supported by cuML.")

    # converting pandas to numpy before sending it to CumlArray
    if isinstance(X, pd.DataFrame) or isinstance(X, pd.Series):
        # pandas doesn't support custom order in to_numpy
        X = cp.asarray(X.to_numpy(copy=False), order=order)

    if isinstance(X, cudf.DataFrame):
        # 'K' (keep) is mapped to 'F' for DataFrames.
        if order == 'K':
            X_m = CumlArray(data=X.as_gpu_matrix(order='F'))
        else:
            X_m = CumlArray(data=X.as_gpu_matrix(order=order))

    elif isinstance(X, CumlArray):
        X_m = X

    elif hasattr(X, "__array_interface__") or \
            hasattr(X, "__cuda_array_interface__"):
        X_m = CumlArray(data=X)

        if deepcopy:
            X_m = copy.deepcopy(X_m)

    else:
        msg = "X matrix format " + str(X.__class__) + " not supported"
        raise TypeError(msg)

    if check_dtype:
        if not isinstance(check_dtype, list):
            check_dtype = [check_dtype]

        check_dtype = [np.dtype(dtype) for dtype in check_dtype]

        if X_m.dtype not in check_dtype:
            type_str = X_m.dtype
            # Drop the reference to the converted array before raising.
            del X_m
            raise TypeError("Expected input to be of type in " +
                            str(check_dtype) + " but got " + str(type_str))

    # Checks based on parameters

    n_rows = X_m.shape[0]

    if len(X_m.shape) > 1:
        n_cols = X_m.shape[1]
    else:
        n_cols = 1

    # A single row or column is not order-checked or converted.
    if n_cols == 1 or n_rows == 1:
        order = 'K'

    if check_cols:
        if n_cols != check_cols:
            raise ValueError("Expected " + str(check_cols) +
                             " columns but got " + str(n_cols) + " columns.")

    if check_rows:
        if n_rows != check_rows:
            raise ValueError("Expected " + str(check_rows) + " rows but got " +
                             str(n_rows) + " rows.")

    if order != 'K' and X_m.order != order:
        if fail_on_order:
            raise ValueError("Expected " + order_to_str(order) +
                             " major order, but got the opposite.")
        else:
            warnings.warn("Expected " + order_to_str(order) + " major order, "
                          "but got the opposite. Converting data, this will "
                          "result in additional memory utilization.")
            X_m = rmm_cupy_ary(cp.array, X_m, copy=False, order=order)
            X_m = CumlArray(data=X_m)

    return cuml_array(array=X_m, n_rows=n_rows, n_cols=n_cols, dtype=X_m.dtype)
Ejemplo n.º 18
def to_sparse_dask_array(cudf_or_array, client=None):
    """
    Converts an array or cuDF to a sparse Dask array backed by sparse CuPy
    CSR matrices. Unfortunately, due to current limitations in Dask, there is
    no direct path to convert a cupy.sparse.spmatrix into a CuPy backed
    dask.Array without copying to host.

    NOTE: Until https://github.com/cupy/cupy/issues/2655 and
    https://github.com/dask/dask/issues/5604 are implemented, compute()
    will not be able to be called on a Dask.array that is backed with
    sparse CuPy arrays because they lack the necessary functionality
    to be stacked into a single array. The array returned from this
    utility will, however, still be able to be passed into functions
    that can make use of sparse CuPy-backed Dask.Array (eg. Distributed
    Naive Bayes).

    Relevant cuML issue: https://github.com/rapidsai/cuml/issues/1387

    Parameters
    ----------
    cudf_or_array : cuDF Dataframe, array-like sparse / dense array, or
                    Dask DataFrame/Array
    client : dask.distributed.Client (optional) Dask client; the default
             client is used when None

    Returns
    -------
    dask_array : dask.Array backed by cupy.sparse.csr_matrix
    """
    client = default_client() if client is None else client

    # Makes sure the MatDescriptor workaround for CuPy sparse arrays
    # is loaded (since Dask lazy-loaded serialization in cuML is only
    # executed when object from the cuML package needs serialization.
    # This can go away once the MatDescriptor pickling bug is fixed
    # in CuPy.
    # Ref: https://github.com/cupy/cupy/issues/3061
    from cuml.comm import serialize  # NOQA

    shape = cudf_or_array.shape

    meta = cupyx.scipy.sparse.csr_matrix(rmm_cupy_ary(cp.zeros, 1))

    ret = cudf_or_array

    # If we have a Dask array, convert it to a Dask DataFrame
    if isinstance(ret, dask.array.Array):
        # At the time of developing this, using map_blocks will not work
        # to convert a Dask.Array to CuPy sparse arrays underneath.

        def _conv_np_to_df(x):
            cupy_ary = rmm_cupy_ary(cp.asarray, x, dtype=x.dtype)
            return cudf.DataFrame.from_gpu_matrix(cupy_ary)

        parts = client.sync(_extract_partitions, ret)
        # Submit each conversion to the worker listed with the partition.
        futures = [
            client.submit(_conv_np_to_df, part, workers=[w], pure=False)
            for w, part in parts
        ]

        ret = df_to_dask_cudf(futures)

    # If we have a Dask Dataframe, use `map_partitions` to convert it
    # to a Sparse Cupy-backed Dask Array. This will also convert the dense
    # Dask array above to a Sparse Cupy-backed Dask Array, since we cannot
    # use map_blocks on the array, but we can use `map_partitions` on the
    # Dataframe.
    if isinstance(ret, dask.dataframe.DataFrame):
        # NOTE(review): the end of this call was missing from the source; the
        # `meta` argument below is reconstructed - confirm against upstream.
        ret = ret.map_partitions(_conv_df_to_sparse,
                                 meta=dask.array.from_array(meta))

        # This will also handle the input of dask.array.Array
        return ret

    else:

        ret = _conv_array_to_sparse(ret)

        # Push to worker
        final_result = client.scatter(ret)

        return dask.array.from_delayed(final_result, shape=shape, meta=meta)
Ejemplo n.º 19
def train_test_split(X,
                     y,
                     test_size: Union[float, int] = None,
                     train_size: Union[float, int] = None,
                     shuffle: bool = True,
                     random_state: Union[int, cp.random.RandomState,
                                         np.random.RandomState] = None,
                     seed: Union[int, cp.random.RandomState,
                                 np.random.RandomState] = None):
    """
    Partitions device data into four collated objects, mimicking
    Scikit-learn's `train_test_split`

    Parameters
    ----------
    X : cudf.DataFrame or cuda_array_interface compliant device array
        Data to split, has shape (n_samples, n_features)
    y : str, cudf.Series or cuda_array_interface compliant device array
        Set of labels for the data, either a series of shape (n_samples) or
        the string label of a column in X (if it is a cuDF DataFrame)
        containing the labels
    test_size : float or int, optional
        If float, represents the proportion [0, 1] of the data
        to be assigned to the test set. If an int, represents the number
        of instances to be assigned to the test set. Defaults to whatever
        is left after the training set has been taken.
    train_size : float or int, optional
        If float, represents the proportion [0, 1] of the data
        to be assigned to the training set. If an int, represents the number
        of instances to be assigned to the training set. Defaults to 0.75
        when `test_size` is not given either.
    shuffle : bool, optional
        Whether or not to shuffle inputs before splitting
    random_state : int, CuPy RandomState or NumPy RandomState optional
        If shuffle is true, seeds the generator. Unseeded by default
    seed: random_state : int, CuPy RandomState or NumPy RandomState optional
        Deprecated in favor of `random_state`.
        If shuffle is true, seeds the generator. Unseeded by default

    Examples
    --------
    .. code-block:: python

        import cudf
        from cuml.preprocessing.model_selection import train_test_split

        # Generate some sample data
        df = cudf.DataFrame({'x': range(10),
                             'y': [0, 1] * 5})
        print(f'Original data: {df.shape[0]} elements')

        # Suppose we want an 80/20 split
        X_train, X_test, y_train, y_test = train_test_split(df, 'y',
                                                            train_size=0.8)
        print(f'X_train: {X_train.shape[0]} elements')
        print(f'X_test: {X_test.shape[0]} elements')
        print(f'y_train: {y_train.shape[0]} elements')
        print(f'y_test: {y_test.shape[0]} elements')

        # Alternatively, if our labels are stored separately
        labels = df['y']
        df = df.drop(['y'])

        # we can also do
        X_train, X_test, y_train, y_test = train_test_split(df, labels,
                                                            train_size=0.8)

    Output:

    .. code-block:: python

        Original data: 10 elements
        X_train: 8 elements
        X_test: 2 elements
        y_train: 8 elements
        y_test: 2 elements

    Returns
    -------
    X_train, X_test, y_train, y_test : cudf.DataFrame
        Partitioned dataframes. If `y` was provided as a column name, the
        column was dropped from the `X`s

    Raises
    ------
    TypeError
        If `y` is a string but `X` is not a cuDF DataFrame, if
        `random_state` has an unsupported type, or if the inputs cannot be
        sliced by this function.
    ValueError
        If `X` and `y` differ in their first dimension, or if `train_size`
        or `test_size` is out of range.
    """
    if isinstance(y, str):
        # Use the column with name `str` as y
        if isinstance(X, cudf.DataFrame):
            name = y
            y = X[name]
            X = X.drop(name)
        else:
            raise TypeError("X needs to be a cuDF Dataframe when y is a "
                            "string")

    # todo: this check will be replaced with upcoming improvements
    # to input_utils with PR #1379
    # NOTE(review): both conditions below end with `isinstance(y,
    # cudf.Series)`, which looks unintended; left unchanged because a
    # stricter check would reject inputs that are handled further down
    # (e.g. sparse X).
    if not cuda.is_cuda_array(X) and not isinstance(X, cudf.DataFrame) \
            and isinstance(y, cudf.Series):
        raise TypeError("X needs to be either a cuDF DataFrame, Series or "
                        "a cuda_array_interface compliant array.")

    if not cuda.is_cuda_array(y) and not isinstance(y, cudf.DataFrame) \
            and isinstance(y, cudf.Series):
        raise TypeError("y needs to be either a cuDF DataFrame, Series or "
                        "a cuda_array_interface compliant array.")

    if X.shape[0] != y.shape[0]:
        raise ValueError("X and y must have the same first dimension "
                         "(found {} and {})".format(X.shape[0], y.shape[0]))

    if isinstance(train_size, float):
        if not 0 <= train_size <= 1:
            raise ValueError("proportion train_size should be between "
                             "0 and 1 (found {})".format(train_size))

    if isinstance(train_size, int):
        if not 0 <= train_size <= X.shape[0]:
            raise ValueError(
                "Number of instances train_size should be between 0 and the "
                "first dimension of X (found {})".format(train_size))

    if isinstance(test_size, float):
        if not 0 <= test_size <= 1:
            # Report the offending `test_size`, not `train_size`.
            raise ValueError("proportion test_size should be between "
                             "0 and 1 (found {})".format(test_size))

    if isinstance(test_size, int):
        if not 0 <= test_size <= X.shape[0]:
            raise ValueError(
                "Number of instances test_size should be between 0 and the "
                "first dimension of X (found {})".format(test_size))

    # Remember numba inputs so the outputs can be handed back as numba arrays.
    x_numba = False
    y_numba = False

    if seed is not None:
        if random_state is None:
            warnings.warn("Parameter 'seed' is deprecated, please use "
                          "'random_state' instead.")
            random_state = seed
        else:
            warnings.warn("Both 'seed' and 'random_state' parameters were "
                          "set, using 'random_state' since 'seed' is "
                          "deprecated.")

    if shuffle:
        if random_state is None or isinstance(random_state, int):
            idxs = rmm_cupy_ary(cp.arange, X.shape[0])
            random_state = cp.random.RandomState(seed=random_state)

        elif isinstance(random_state, cp.random.RandomState):
            idxs = rmm_cupy_ary(cp.arange, X.shape[0])

        elif isinstance(random_state, np.random.RandomState):
            idxs = np.arange(X.shape[0])

        else:
            raise TypeError("`random_state` must be an int, NumPy RandomState "
                            "or CuPy RandomState.")

        # Permute the row indices in place; without this step the generator
        # is never used and the data would not be shuffled at all.
        random_state.shuffle(idxs)

        # NOTE(review): an X that is neither a cuDF object nor a CUDA array
        # (e.g. a sparse matrix) is not reordered here while y may be -
        # confirm whether that combination is expected with shuffle=True.
        if isinstance(X, (cudf.DataFrame, cudf.Series)):
            X = X.iloc[idxs].reset_index(drop=True)

        elif cuda.is_cuda_array(X):
            # numba (and therefore rmm device_array) does not support
            # fancy indexing
            if cuda.devicearray.is_cuda_ndarray(X):
                x_numba = True
            X = cp.asarray(X)[idxs]

        if isinstance(y, (cudf.DataFrame, cudf.Series)):
            y = y.iloc[idxs]

        elif cuda.is_cuda_array(y):
            if cuda.devicearray.is_cuda_ndarray(y):
                y_numba = True
            y = cp.asarray(y)[idxs]

    # Determining sizes of splits
    if isinstance(train_size, float):
        train_size = int(X.shape[0] * train_size)

    if test_size is None:
        if train_size is None:
            train_size = int(X.shape[0] * 0.75)

        test_size = X.shape[0] - train_size

    if isinstance(test_size, float):
        test_size = int(X.shape[0] * test_size)
        if train_size is None:
            train_size = X.shape[0] - test_size

    elif isinstance(test_size, int):
        if train_size is None:
            train_size = X.shape[0] - test_size

    # Slice the test rows from an explicit start index: `X[-test_size:]`
    # would return *every* row when test_size is 0.
    test_start = X.shape[0] - test_size

    if cuda.is_cuda_array(X) or isinstance(X, cp.sparse.csr_matrix):
        X_train = X[0:train_size]
        y_train = y[0:train_size]
    elif isinstance(X, cudf.DataFrame):
        X_train = X.iloc[0:train_size]
        y_train = y.iloc[0:train_size]
    else:
        # Previously fell through and failed later with an unbound local.
        raise TypeError("Unsupported type for X: %s" % type(X))

    if cuda.is_cuda_array(y) or isinstance(X, cp.sparse.csr_matrix):
        X_test = X[test_start:]
        y_test = y[test_start:]
    elif isinstance(y, cudf.DataFrame):
        X_test = X.iloc[test_start:]
        y_test = y.iloc[test_start:]
    else:
        # Previously fell through and failed later with an unbound local.
        raise TypeError("Unsupported type for y: %s" % type(y))

    if x_numba:
        X_train = cuda.as_cuda_array(X_train)
        X_test = cuda.as_cuda_array(X_test)

    if y_numba:
        y_train = cuda.as_cuda_array(y_train)
        y_test = cuda.as_cuda_array(y_test)

    return X_train, X_test, y_train, y_test