Ejemplo n.º 1
0
    def fit(self, y):
        """Fit the distributed label binarizer.

        Parameters
        ----------
        y : Dask.Array of shape [n_samples,] or [n_samples, n_classes]
            chunked by row.
            Target values. The 2-d matrix should only contain 0 and 1,
            represents multilabel classification.

        Returns
        -------
        self : returns an instance of self.
        """

        # Resolve the partitions of y, then schedule one task per partition
        # running LabelBinarizer._func_unique_classes (presumably the
        # per-partition unique classes -- confirm against that helper).
        partitions = self.client_.sync(_extract_partitions, y)

        per_partition = []
        for _, part in partitions:
            per_partition.append(
                self.client_.submit(LabelBinarizer._func_unique_classes,
                                    part))

        # Bring the per-partition results back, stack them and reduce them
        # to a single set of unique classes.
        gathered = self.client_.compute(per_partition, True)
        stacked = rmm_cupy_ary(cp.stack, gathered, axis=0)
        self.classes_ = rmm_cupy_ary(cp.unique, stacked)

        # The local model is fitted on the class set itself.
        self.model = LB(**self.kwargs).fit(self.classes_)

        return self
Ejemplo n.º 2
0
    def fit(self, y):
        """
        Fit label binarizer

        Parameters
        ----------
        y : array of shape [n_samples,] or [n_samples, n_classes]
            Target values. The 2-d matrix should only contain 0 and 1,
            represents multilabel classification.

        Returns
        -------
        self : returns an instance of self.

        Raises
        ------
        ValueError
            If `y` has more than 2 dimensions, or if `y` is 2-d and holds
            any value other than 0 or 1.
        """

        if y.ndim > 2:
            raise ValueError("labels cannot be greater than 2 dimensions")

        if y.ndim == 2:

            unique_classes = rmm_cupy_ary(cp.unique, y)

            # Comparing the array against the list [0, 1] is elementwise and
            # cannot be used directly as an `if` condition, so reduce the
            # membership test to a single boolean explicitly.
            only_binary = cp.all((unique_classes == 0) |
                                 (unique_classes == 1))
            if not bool(only_binary):
                raise ValueError("2-d array must be binary")

            # One class per column of the indicator matrix.
            self.classes_ = rmm_cupy_ary(cp.arange, 0, y.shape[1])
        else:
            self.classes_ = rmm_cupy_ary(cp.unique, y).astype(y.dtype)

        cp.cuda.Stream.null.synchronize()

        return self
Ejemplo n.º 3
0
    def inverse_transform(self, y, threshold=None):
        """
        Transform binary labels back to original multi-class labels

        Parameters
        ----------

        y : array of shape [n_samples, n_classes]
            CuPy sparse matrix, SciPy sparse matrix, or dense array.
        threshold : float this value is currently ignored

        Returns
        -------

        arr : array with original labels
        """

        # invert_labels rejects labels whose dtype differs from the dtype of
        # the classes, so every branch produces its column indices directly
        # in the dtype of self.classes_.
        index_dtype = self.classes_.dtype

        if cp.sparse.isspmatrix(y):
            # CSR column indices give the position of each stored entry.
            y_mapped = y.tocsr().indices.astype(index_dtype)
        elif scipy.sparse.isspmatrix(y):
            y = y.tocsr()
            y_mapped = rmm_cupy_ary(cp.array, y.indices, dtype=index_dtype)
        else:
            # Dense input: the column holding the row maximum is the class.
            y_mapped = rmm_cupy_ary(cp.argmax,
                                    rmm_cupy_ary(cp.asarray, y, dtype=y.dtype),
                                    axis=1).astype(index_dtype)

        return invert_labels(y_mapped, self.classes_)
Ejemplo n.º 4
0
def create_local_data(m,
                      n,
                      centers,
                      cluster_std,
                      random_state,
                      dtype,
                      type,
                      order='F'):
    """Generate a blob dataset with skl_make_blobs and move it to device.

    Parameters
    ----------
    m, n : forwarded positionally to skl_make_blobs
    centers, cluster_std, random_state : forwarded to skl_make_blobs
    dtype : dtype the generated data is cast to (for 'dataframe' only X is
        cast)
    type : 'array' to get CuPy arrays, 'dataframe' to get cuDF DataFrames
    order : memory order used for the CuPy arrays (default 'F')

    Returns
    -------
    X, y : the generated features and labels. For 'array', y is reshaped
        to (m, 1).

    Raises
    ------
    ValueError
        If `type` is neither 'array' nor 'dataframe'.
    """
    X, y = skl_make_blobs(m,
                          n,
                          centers=centers,
                          cluster_std=cluster_std,
                          random_state=random_state)

    if type == 'array':
        features = rmm_cupy_ary(cp.asarray, X.astype(dtype), order=order)
        labels = rmm_cupy_ary(cp.asarray, y.astype(dtype), order=order)
        return features, labels.reshape(m, 1)

    if type == 'dataframe':
        features = cudf.DataFrame.from_pandas(pd.DataFrame(X.astype(dtype)))
        labels = cudf.DataFrame.from_pandas(pd.DataFrame(y))
        return features, labels

    raise ValueError('type must be array or dataframe')
Ejemplo n.º 5
0
    def transform(self, y):
        """
        Encode labels with the fitted model, one delayed task per partition

        Parameters
        ----------
        y : Dask.Array of shape [n_samples,] or [n_samples, n_classes]

        Returns
        -------

        arr : Dask.Array backed by CuPy arrays containing encoded labels
        """

        partitions = self.client_.sync(_extract_partitions, y)
        delayed_xform = dask.delayed(LabelBinarizer._func_xform)

        # The meta object tells Dask what kind of block to expect; it is
        # wrapped as a sparse CSR matrix when the model emits sparse output.
        meta = rmm_cupy_ary(cp.zeros, 1)
        meta = cp.sparse.csr_matrix(meta) if self.model.sparse_output \
            else meta

        # NOTE(review): each block is declared with len(y) rows (the whole
        # input, not the partition) and the result below drops the leading
        # stacked axis -- this looks like it only holds for a single
        # partition; confirm.
        blocks = []
        for _, part in partitions:
            blocks.append(
                dask.array.from_delayed(delayed_xform(self.model, part),
                                        meta=meta,
                                        dtype=cp.float32,
                                        shape=(len(y), len(self.classes_))))

        stacked = dask.array.asarray(blocks)
        return stacked.reshape(stacked.shape[1:])
Ejemplo n.º 6
0
    def inverse_transform(self, y, threshold=None):
        """
        Map encoded labels back to the original labels, one delayed task
        per partition

        Parameters
        ----------

        y : Dask.Array of shape [n_samples, n_classes] containing encoded
            labels

        threshold : float forwarded to the local model's inverse_transform
            (the original documentation states it is currently ignored)

        Returns
        -------

        arr : Dask.Array backed by CuPy arrays containing original labels
        """

        partitions = self.client_.sync(_extract_partitions, y)
        delayed_inverse = dask.delayed(LabelBinarizer._func_inv_xform)

        # Output blocks carry the dtype of the fitted classes.
        out_dtype = self.classes_.dtype
        meta = rmm_cupy_ary(cp.zeros, 1, dtype=out_dtype)

        # NOTE(review): each block is declared with y.shape[0] entries (the
        # whole input, not the partition) and the result below drops the
        # leading stacked axis -- this looks like it only holds for a
        # single partition; confirm.
        blocks = []
        for _, part in partitions:
            block = dask.array.from_delayed(
                delayed_inverse(self.model, part, threshold),
                dtype=out_dtype,
                shape=(y.shape[0], ),
                meta=meta)
            blocks.append(block)

        stacked = dask.array.stack(blocks, axis=0)
        return stacked.reshape(stacked.shape[1:])
Ejemplo n.º 7
0
def make_monotonic(labels, classes=None, copy=False):
    """
    Renumber a set of labels, which need not be drawn from
    [0, n-1], so that they are drawn from that interval.

    Parameters
    ----------

    labels : array-like of size (n,) labels to convert
    classes : array-like of size (n_classes,) the unique
              set of classes in the set of labels. Computed
              with cp.unique when not given.
    copy : boolean if true, the labels are copied before the
           mapping kernel runs on them.

    Returns
    -------

    mapped_labels : array-like of size (n,)
    classes : array-like of size (n_classes,)

    Raises
    ------
    ValueError
        If `labels` is not one-dimensional.
    """

    labels = rmm_cupy_ary(cp.asarray, labels, dtype=labels.dtype)
    if copy:
        labels = labels.copy()

    if labels.ndim != 1:
        raise ValueError("Labels array must be 1D")

    if classes is None:
        classes = rmm_cupy_ary(cp.unique, labels)

    n_labels = labels.shape[0]
    n_classes = classes.shape[0]

    # Dynamic shared memory sized for one label-typed slot per class.
    smem = labels.dtype.itemsize * int(n_classes)

    # Launch with 32 threads per block and enough blocks to cover every
    # label; the kernel works on `labels` directly (presumably rewriting
    # it in place -- confirm against _map_kernel).
    grid = (math.ceil(n_labels / 32), )
    block = (32, )
    kernel = _map_kernel(labels.dtype)
    kernel(grid, block,
           (labels, n_labels, classes, n_classes),
           shared_mem=smem)

    return labels, classes
Ejemplo n.º 8
0
def invert_labels(labels, classes, copy=False):
    """
    Map labels drawn from a monotonically increasing set back
    to the original set of classes.

    Parameters
    ----------

    labels : array-like of size (n,) labels to invert
    classes : array-like of size (n_classes,) the unique set
              of classes for inversion. It is assumed that
              the classes are ordered by their corresponding
              monotonically increasing label.
    copy : boolean if true, the labels are copied before the
           inversion kernel runs on them.

    Returns
    -------

    inverted labels : array-like of size (n,)

    Raises
    ------
    ValueError
        If `labels` and `classes` do not share the same dtype.
    """

    # The dtype check runs on the inputs as given, before any conversion.
    if labels.dtype != classes.dtype:
        raise ValueError("Labels and classes must have same dtype (%s != %s" %
                         (labels.dtype, classes.dtype))

    labels = rmm_cupy_ary(cp.asarray, labels, dtype=labels.dtype)
    classes = rmm_cupy_ary(cp.asarray, classes, dtype=classes.dtype)
    if copy:
        labels = labels.copy()

    n_labels = len(labels)
    n_classes = len(classes)

    # Dynamic shared memory sized for one label-typed slot per class.
    smem = labels.dtype.itemsize * n_classes

    # 32 threads per block, enough blocks to cover every label.
    grid = (math.ceil(n_labels / 32), )
    block = (32, )
    kernel = _inverse_map_kernel(labels.dtype)
    kernel(grid, block,
           (classes, n_classes, labels, n_labels),
           shared_mem=smem)

    return labels
Ejemplo n.º 9
0
def label_binarize(y, classes, neg_label=0, pos_label=1, sparse_output=False):
    """
    Stateless helper that dummy (one-hot) encodes multi-class labels.

    Parameters
    ----------

    y : array-like of size [n_samples,] or [n_samples, n_classes]
    classes : the set of unique classes in the input
    neg_label : integer the negative value for transformed output
                (only applied to the dense output)
    pos_label : integer the positive value for transformed output
    sparse_output : bool whether to return sparse array

    Returns
    -------

    A CuPy sparse CSR matrix when `sparse_output` is true, otherwise a
    dense array cast to the dtype of `y`.

    Raises
    ------
    ValueError
        If check_labels reports labels that are not in `classes`.
    """

    classes = rmm_cupy_ary(cp.asarray, classes, dtype=classes.dtype)
    labels = rmm_cupy_ary(cp.asarray, y, dtype=y.dtype)

    if not check_labels(labels, classes):
        raise ValueError("Unseen classes encountered in input")

    # One entry per sample: row i, column = index of its class.
    rows = rmm_cupy_ary(cp.arange, 0, labels.shape[0], 1, dtype=y.dtype)
    cols, _ = make_monotonic(labels, classes, copy=True)
    values = rmm_cupy_ary(cp.full, rows.shape[0], pos_label, dtype=y.dtype)

    encoded = cp.sparse.coo_matrix((values, (rows, cols)),
                                   shape=(cols.shape[0], classes.shape[0]),
                                   dtype=cp.float32)

    cp.cuda.Stream.null.synchronize()

    if sparse_output:
        return encoded.tocsr()

    # Dense path: every zero entry is replaced by neg_label.
    dense = encoded.toarray().astype(y.dtype)
    dense[dense == 0] = neg_label
    return dense
Ejemplo n.º 10
0
def row_matrix(df):
    """Compute the C (row major) version gpu matrix of df

    :param df: object exposing ``as_gpu_matrix`` (presumably a cuDF
        DataFrame -- confirm against callers). Its column-major ('F')
        GPU matrix is rebuilt as a C-ordered CuPy array.
    :return: the C-ordered data wrapped with ``cuda.as_cuda_array``.

    """
    return cuda.as_cuda_array(
        rmm_cupy_ary(cp.array, df.as_gpu_matrix(order='F'), order='C'))
Ejemplo n.º 11
0
def check_labels(labels, classes):
    """
    Check that every label is drawn from the given unique
    set of classes.

    Parameters
    ----------

    labels : array-like of size (n,) labels to validate
    classes : array-like of size (n_classes,) the unique
              set of classes to verify

    Returns
    -------

    result : boolean-valued comparison of the validity flag with 1

    Raises
    ------
    ValueError
        If the dtypes of `labels` and `classes` differ, or if `labels`
        is not one-dimensional.
    """

    # The dtype check runs on the inputs as given, before any conversion.
    if labels.dtype != classes.dtype:
        raise ValueError("Labels and classes must have same dtype (%s != %s" %
                         (labels.dtype, classes.dtype))

    labels = rmm_cupy_ary(cp.asarray, labels, dtype=labels.dtype)
    classes = rmm_cupy_ary(cp.asarray, classes, dtype=classes.dtype)

    if labels.ndim != 1:
        raise ValueError("Labels array must be 1D")

    # Flag starts at 1 and is handed to the kernel (presumably cleared
    # there when an unknown label is found -- confirm against
    # _validate_kernel).
    valid = cp.array([1])

    n_labels = labels.shape[0]
    n_classes = classes.shape[0]

    # Dynamic shared memory sized for one label-typed slot per class.
    smem = labels.dtype.itemsize * int(n_classes)

    # 32 threads per block, enough blocks to cover every label.
    grid = (math.ceil(n_labels / 32), )
    block = (32, )
    kernel = _validate_kernel(labels.dtype)
    kernel(grid, block,
           (labels, n_labels, classes, n_classes, valid),
           shared_mem=smem)

    return valid[0] == 1
Ejemplo n.º 12
0
def _conv_array_to_sparse(arr):
    """
    Convert the input to a CuPy sparse matrix.

    :param arr: scipy or cupy sparse matrix, cudf DataFrame,
                dense numpy or cupy array
    :return: cupy sparse CSR matrix (a cupy sparse input is returned
             unchanged; a cudf DataFrame goes through _conv_df_to_sparse)
    :raises ValueError: if the input type is none of the above
    """
    make_csr = cupyx.scipy.sparse.csr_matrix

    # Host sparse matrix: normalise to CSR, then move to device.
    if scipy.sparse.isspmatrix(arr):
        return make_csr(arr.tocsr())

    # Already a device sparse matrix: nothing to do.
    if cupyx.scipy.sparse.isspmatrix(arr):
        return arr

    if isinstance(arr, cudf.DataFrame):
        return _conv_df_to_sparse(arr)

    # Host dense array: copy to device first.
    if isinstance(arr, np.ndarray):
        device_ary = rmm_cupy_ary(cp.asarray, arr, dtype=arr.dtype)
        return make_csr(device_ary)

    if isinstance(arr, cp.core.core.ndarray):
        return make_csr(arr)

    raise ValueError("Unexpected input type %s" % type(arr))
Ejemplo n.º 13
0
def convert_dtype(X, to_dtype=np.float32, legacy=True):
    """
    Convert X to be of dtype `to_dtype`

    Supported float dtypes for overflow checking.
    Todo: support other dtypes if needed.

    Parameters
    ----------
    X : NumPy array, cuDF Series/DataFrame, or CUDA array interface object
    to_dtype : target dtype (default np.float32)
    legacy : for CUDA array inputs, return `cuda.as_cuda_array(...)` when
        True, otherwise a CumlArray

    Returns
    -------
    The converted data. A NumPy array that already has `to_dtype` is
    returned unchanged.

    Raises
    ------
    TypeError
        If a NumPy input contains +inf, if converting a NumPy input to a
        floating dtype produces infinite values, or if the type of `X` is
        not supported.
    """

    if isinstance(X, np.ndarray):
        if X.dtype != to_dtype:
            X_m = X.astype(to_dtype)

            # Inputs already holding +inf are rejected, as before.
            had_inf = bool(np.any(X == np.inf))

            # The overflow check has to look at the converted array: values
            # too large for the target float dtype become infinite there,
            # while the input itself stays finite.
            overflowed = (np.issubdtype(X_m.dtype, np.floating)
                          and bool(np.any(np.isinf(X_m))))

            if had_inf or overflowed:
                raise TypeError("Data type conversion resulted "
                                "in data loss.")
            return X_m

    elif isinstance(X, (cudf.Series, cudf.DataFrame)):
        return X.astype(to_dtype)

    elif cuda.is_cuda_array(X):
        X_m = rmm_cupy_ary(cp.asarray, X)
        X_m = X_m.astype(to_dtype)
        if legacy:
            return cuda.as_cuda_array(X_m)

        # temporarily importing here, until github issue #1681 reorganizing
        # utils is dealt with. Otherwise circular import causes issues.
        # Only this branch needs CumlArray.
        from cuml.common import CumlArray
        return CumlArray(data=X_m)

    else:
        raise TypeError("Received unsupported input type %s" % type(X))

    # NumPy input that already had the requested dtype.
    return X
Ejemplo n.º 14
0
def to_sp_dask_array(cudf_or_array, client=None):
    """
    Converts an array or cuDF to a sparse Dask array backed by sparse CuPy
    CSR matrices. Unfortunately, due to current limitations in Dask, there is
    no direct path to convert a cupy.sparse.spmatrix into a CuPy backed
    dask.Array without copying to host.


    NOTE: Until https://github.com/cupy/cupy/issues/2655 and
    https://github.com/dask/dask/issues/5604 are implemented, compute()
    will not be able to be called on a Dask.array that is backed with
    sparse CuPy arrays because they lack the necessary functionality
    to be stacked into a single array. The array returned from this
    utility will, however, still be able to be passed into functions
    that can make use of sparse CuPy-backed Dask.Array (eg. Distributed
    Naive Bayes).

    Relevant cuML issue: https://github.com/rapidsai/cuml/issues/1387

    Parameters
    ----------
    cudf_or_array : cuDF Dataframe, array-like sparse / dense array, or
                    Dask DataFrame/Array
    client : dask.distributed.Client (optional) Dask client. When None,
             the result of default_client() is used.

    Returns
    -------
    dask_array : dask.Array backed by cupy.sparse.csr_matrix

    Raises
    ------
    ValueError
        If a DataFrame input holds more than one dtype, or if the input
        type is not one of the supported ones.
    """
    client = default_client() if client is None else client

    # Makes sure the MatDescriptor workaround for CuPy sparse arrays
    # is loaded (since Dask lazy-loaded serialization in cuML is only
    # executed when an object from the cuML package needs serialization).
    # This can go away once the MatDescriptor pickling bug is fixed
    # in CuPy.
    # Ref: https://github.com/cupy/cupy/issues/3061
    from cuml.comm import serialize  # NOQA

    # The shape is captured from the original input, before any conversion.
    shape = cudf_or_array.shape
    if isinstance(cudf_or_array, dask.dataframe.DataFrame) or \
       isinstance(cudf_or_array, cudf.DataFrame):
        # DataFrames expose one dtype per column; require a single one.
        dtypes = np.unique(cudf_or_array.dtypes)

        if len(dtypes) > 1:
            raise ValueError("DataFrame should contain only a single dtype")

        dtype = dtypes[0]
    else:
        dtype = cudf_or_array.dtype

    # Small sparse placeholder passed to Dask as the `meta` of the result.
    meta = cupyx.scipy.sparse.csr_matrix(rmm_cupy_ary(cp.zeros, 1))

    if isinstance(cudf_or_array, dask.array.Array):
        # At the time of developing this, using map_blocks will not work
        # to convert a Dask.Array to CuPy sparse arrays underneath.

        # Instead, each partition is converted to a cuDF DataFrame on the
        # worker paired with it, and the futures are assembled with
        # to_dask_cudf so that the DataFrame branch below handles them.
        parts = client.sync(_extract_partitions, cudf_or_array)
        cudf_or_array = [
            client.submit(_conv_np_to_df, part, workers=[w])
            for w, part in parts
        ]

        cudf_or_array = to_dask_cudf(cudf_or_array)

    if isinstance(cudf_or_array, dask.dataframe.DataFrame):
        """
        Dask.Dataframe needs special attention since it has multiple dtypes.
        Just use the first (and assume all the rest are the same)
        """
        cudf_or_array = cudf_or_array.map_partitions(
            _conv_df_to_sp, meta=dask.array.from_array(meta))

        # This will also handle the input of dask.array.Array
        return cudf_or_array

    else:
        # Single-object inputs: convert to one CuPy CSR matrix locally.
        if scipy.sparse.isspmatrix(cudf_or_array):
            cudf_or_array = \
                cupyx.scipy.sparse.csr_matrix(cudf_or_array.tocsr())
        elif cupyx.scipy.sparse.isspmatrix(cudf_or_array):
            # Already a CuPy sparse matrix; used as is.
            pass
        elif isinstance(cudf_or_array, cudf.DataFrame):
            cupy_ary = cp.asarray(cudf_or_array.as_gpu_matrix(), dtype)
            cudf_or_array = cupyx.scipy.sparse.csr_matrix(cupy_ary)
        elif isinstance(cudf_or_array, np.ndarray):
            cupy_ary = rmm_cupy_ary(cp.asarray,
                                    cudf_or_array,
                                    dtype=cudf_or_array.dtype)
            cudf_or_array = cupyx.scipy.sparse.csr_matrix(cupy_ary)

        elif isinstance(cudf_or_array, cp.core.core.ndarray):
            cudf_or_array = cupyx.scipy.sparse.csr_matrix(cudf_or_array)
        else:
            raise ValueError("Unexpected input type %s" % type(cudf_or_array))

        # Push to worker
        cudf_or_array = client.scatter(cudf_or_array)

    # Wrap the scattered object as a Dask array with the original shape.
    return dask.array.from_delayed(cudf_or_array, shape=shape, meta=meta)
Ejemplo n.º 15
0
def _conv_np_to_df(x):
    """Build a cuDF DataFrame from `x` after converting it to a CuPy array
    of the same dtype."""
    return cudf.DataFrame.from_gpu_matrix(
        rmm_cupy_ary(cp.asarray, x, dtype=x.dtype))
Ejemplo n.º 16
0
def _conv_df_to_sp(x):
    """Convert DataFrame `x` to a CuPy sparse CSR matrix.

    The GPU matrix of `x` is viewed as a CuPy array using the dtype of the
    first column.
    """
    first_dtype = x.dtypes[0]
    dense = rmm_cupy_ary(cp.asarray, x.as_gpu_matrix(), dtype=first_dtype)
    return cp.sparse.csr_matrix(dense)
Ejemplo n.º 17
0
def make_low_rank_matrix(n_samples=100,
                         n_features=100,
                         effective_rank=10,
                         tail_strength=0.5,
                         random_state=None,
                         n_parts=1,
                         n_samples_per_part=None):
    """ Generate a mostly low rank matrix with bell-shaped singular values

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The number of samples.
    n_features : int, optional (default=100)
        The number of features.
    effective_rank : int, optional (default=10)
        The approximate number of singular vectors required to explain most of
        the data by linear combinations.
    tail_strength : float between 0.0 and 1.0, optional (default=0.5)
        The relative importance of the fat noisy tail of the singular values
        profile.
    random_state : int, CuPy RandomState instance, Dask RandomState instance
                   or None (default)
        Determines random number generation for dataset creation. Pass an int
        for reproducible output across multiple function calls.
    n_parts : int, optional (default=1)
        The number of parts of work.
    n_samples_per_part : int, optional (default=None)
        Chunk size used along the first axis when rechunking for the final
        product. When None, it is max(1, int(n_samples / n_parts)).

    Returns
    -------
    X : Dask-CuPy array of shape [n_samples, n_features]
        The matrix.
    """
    rs = create_rs_generator(random_state)
    n = min(n_samples, n_features)

    def _qr_chunks(total_size, min_size, n_parts):
        # Chunk sizes along one axis: every chunk has at least `min_size`
        # entries and the last chunk absorbs the remainder.
        per_part = max(1, int(total_size / n_parts))
        min_size = max(min_size, per_part)

        n_partitions = int(max(1, total_size / min_size))
        rest = total_size % (n_partitions * min_size)
        return tuple([min_size] * (n_partitions - 1) + [min_size + rest])

    # Random (ortho normal) vectors: left factor first, then right factor,
    # so the random stream is consumed in a fixed order.
    left_chunks = (_qr_chunks(n_samples, n, n_parts), -1)
    u, _ = da.linalg.qr(rs.standard_normal((n_samples, n),
                                           chunks=left_chunks))

    right_chunks = (-1, _qr_chunks(n_features, n, n_parts))
    v, _ = da.linalg.qr(rs.standard_normal((n, n_features),
                                           chunks=right_chunks))

    # For final multiplication
    if n_samples_per_part is None:
        n_samples_per_part = max(1, int(n_samples / n_parts))
    final_chunks = {0: n_samples_per_part, 1: -1}
    u = u.rechunk(final_chunks)
    v = v.rechunk(final_chunks)

    # Index of the singular values
    sing_ind = rmm_cupy_ary(cp.arange, n, dtype=cp.float64)

    # Build the singular profile by assembling signal and noise components
    scaled = sing_ind / effective_rank
    signal = (1 - tail_strength) * rmm_cupy_ary(cp.exp, -1.0 * scaled**2)
    noise = tail_strength * rmm_cupy_ary(cp.exp, -0.1 * scaled)
    s = da.from_array(signal + noise, chunks=(int(n_samples_per_part), ))

    # Scale the columns of u by the singular profile, then form the product.
    u *= s
    return da.dot(u, v)
Ejemplo n.º 18
0
def input_to_cuml_array(X,
                        order='F',
                        deepcopy=False,
                        check_dtype=False,
                        convert_to_dtype=False,
                        check_cols=False,
                        check_rows=False,
                        fail_on_order=False):
    """
    Convert input X to CumlArray.

    Acceptable input formats:

    * cuDF Dataframe - returns a deep copy always.
    * cuDF Series - returns by reference or a deep copy depending on
        `deepcopy`.
    * Numpy array - returns a copy in device always
    * cuda array interface compliant array (like Cupy) - returns a
        reference unless `deepcopy`=True.
    * numba device array - returns a reference unless deepcopy=True

    Parameters
    ----------

    X : cuDF.DataFrame, cuDF.Series, numba array, NumPy array or any
        cuda_array_interface compliant array like CuPy or pytorch.

    order: string (default: 'F')
        Whether to return a F-major or C-major array. Used to check the order
        of the input. If fail_on_order=True method will raise ValueError,
        otherwise it will convert X to be of order `order`.

    deepcopy: boolean (default: False)
        Set to True to always return a deep copy of X.

    check_dtype: np.dtype or list of np.dtype (default: False)
        Set to a np.dtype (or a list of them) to throw an error if X is not
        of one of those dtypes. Ignored when `convert_to_dtype` is set.

    convert_to_dtype: np.dtype (default: False)
        Set to a dtype if you want X to be converted to that dtype if it is
        not that dtype already.

    check_cols: int (default: False)
        Set to an int `i` to check that input X has `i` columns. Set to False
        (default) to not check at all.

    check_rows: int (default: False)
        Set to an int `i` to check that input X has `i` rows. Set to False
        (default) to not check at all.

    fail_on_order: boolean (default: False)
        Set to True if you want the method to raise a ValueError if X is not
        of order `order`.

    Returns
    -------
    `cuml_array`: namedtuple('cuml_array', 'array n_rows n_cols dtype')

        A new CumlArray and associated data.

    Raises
    ------
    ValueError
        If a cuDF Series has null values, if the column or row count check
        fails, or if the order differs and `fail_on_order` is True.
    TypeError
        If the type of X is not supported, or if the dtype check fails.

    """

    # temporarily importing here, until github issue #1681 reorganizing utils
    # is dealt with. Otherwise circular import causes issues
    from cuml.common import CumlArray

    # dtype conversion

    if convert_to_dtype:
        X = convert_dtype(X, to_dtype=convert_to_dtype)
        # After an explicit conversion the dtype check is skipped.
        check_dtype = False

    # format conversion

    # A Series is only validated here; it is wrapped by the generic
    # branches below.
    if (isinstance(X, cudf.Series)):
        if X.null_count != 0:
            raise ValueError("Error: cuDF Series has missing/null values, " +
                             " which are not supported by cuML.")

    # NOTE(review): for a DataFrame with an `order` other than 'F' or 'C',
    # neither branch assigns X_m, so the code further down would fail on an
    # unbound name -- confirm callers never pass another value.
    if isinstance(X, cudf.DataFrame):
        if order == 'F':
            X_m = CumlArray(data=X.as_gpu_matrix(order='F'))
        elif order == 'C':
            X_m = CumlArray(data=cuml.utils.numba_utils.row_matrix(X))

    elif cuda.is_cuda_array(X) or isinstance(X, np.ndarray):
        X_m = CumlArray(data=X)

        if deepcopy:
            X_m = copy.deepcopy(X_m)

    else:
        msg = "X matrix format " + str(X.__class__) + " not supported"
        raise TypeError(msg)

    if check_dtype:
        # Accept a single dtype or a list of acceptable dtypes.
        if not isinstance(check_dtype, list):
            check_dtype = [check_dtype]

        check_dtype = [np.dtype(dtype) for dtype in check_dtype]

        if X_m.dtype not in check_dtype:
            # Keep the dtype for the message, then drop the array before
            # raising.
            type_str = X_m.dtype
            del X_m
            raise TypeError("Expected input to be of type in " +
                            str(check_dtype) + " but got " + str(type_str))

    # Checks based on parameters

    # One-dimensional inputs are reported as having a single column.
    n_rows = X_m.shape[0]
    if len(X_m.shape) > 1:
        n_cols = X_m.shape[1]
    else:
        n_cols = 1

    if check_cols:
        if n_cols != check_cols:
            raise ValueError("Expected " + str(check_cols) +
                             " columns but got " + str(n_cols) + " columns.")

    if check_rows:
        if n_rows != check_rows:
            raise ValueError("Expected " + str(check_rows) + " rows but got " +
                             str(n_rows) + " rows.")

    if X_m.order != order:
        if fail_on_order:
            raise ValueError("Expected " + order_to_str(order) +
                             " major order, but got the opposite.")
        else:
            # Order mismatch is tolerated: warn, then rebuild the data in
            # the requested order and wrap it again.
            warnings.warn("Expected " + order_to_str(order) + " major order, "
                          "but got the opposite. Converting data, this will "
                          "result in additional memory utilization.")
            X_m = rmm_cupy_ary(cp.array, X_m, copy=False, order=order)
            X_m = CumlArray(data=X_m)

    return cuml_array(array=X_m, n_rows=n_rows, n_cols=n_cols, dtype=X_m.dtype)
Ejemplo n.º 19
0
 def _unique(x):
     """Return the result of cp.unique on `x`, invoked through
     rmm_cupy_ary."""
     unique_values = rmm_cupy_ary(cp.unique, x)
     return unique_values
Ejemplo n.º 20
0
 def _func_inv_xform(model, y, threshold):
     """Convert `y` to a CuPy array of the same dtype and return
     model.inverse_transform for it, forwarding `threshold`."""
     device_y = rmm_cupy_ary(cp.asarray, y, dtype=y.dtype)
     return model.inverse_transform(device_y, threshold)
Ejemplo n.º 21
0
 def _func_xform(model, y):
     """Convert `y` to a CuPy array of the same dtype and return
     model.transform for it."""
     return model.transform(rmm_cupy_ary(cp.asarray, y, dtype=y.dtype))
Ejemplo n.º 22
0
def to_sparse_dask_array(cudf_or_array, client=None):
    """
    Converts an array or cuDF to a sparse Dask array backed by sparse CuPy
    CSR matrices. Unfortunately, due to current limitations in Dask, there is
    no direct path to convert a cupy.sparse.spmatrix into a CuPy backed
    dask.Array without copying to host.


    NOTE: Until https://github.com/cupy/cupy/issues/2655 and
    https://github.com/dask/dask/issues/5604 are implemented, compute()
    will not be able to be called on a Dask.array that is backed with
    sparse CuPy arrays because they lack the necessary functionality
    to be stacked into a single array. The array returned from this
    utility will, however, still be able to be passed into functions
    that can make use of sparse CuPy-backed Dask.Array (eg. Distributed
    Naive Bayes).

    Relevant cuML issue: https://github.com/rapidsai/cuml/issues/1387

    Parameters
    ----------
    cudf_or_array : cuDF Dataframe, array-like sparse / dense array, or
                    Dask DataFrame/Array
    client : dask.distributed.Client (optional) Dask client. When None,
             the result of default_client() is used.

    Returns
    -------
    dask_array : dask.Array backed by cupy.sparse.csr_matrix
    """
    if client is None:
        client = default_client()

    # Makes sure the MatDescriptor workaround for CuPy sparse arrays
    # is loaded (since Dask lazy-loaded serialization in cuML is only
    # executed when an object from the cuML package needs serialization).
    # This can go away once the MatDescriptor pickling bug is fixed
    # in CuPy.
    # Ref: https://github.com/cupy/cupy/issues/3061
    from cuml.comm import serialize  # NOQA

    # Shape and meta are taken up front, from the input as it was given.
    shape = cudf_or_array.shape
    meta = cupyx.scipy.sparse.csr_matrix(rmm_cupy_ary(cp.zeros, 1))

    data = cudf_or_array

    # A Dask array is first turned into a Dask DataFrame. At the time of
    # developing this, using map_blocks will not work to convert a
    # Dask.Array to CuPy sparse arrays underneath.
    if isinstance(data, dask.array.Array):

        def _conv_np_to_df(x):
            return cudf.DataFrame.from_gpu_matrix(
                rmm_cupy_ary(cp.asarray, x, dtype=x.dtype))

        # One conversion task per partition, pinned to its paired worker.
        futures = []
        for worker, part in client.sync(_extract_partitions, data):
            futures.append(
                client.submit(_conv_np_to_df,
                              part,
                              workers=[worker],
                              pure=False))

        data = to_dask_cudf(futures)

    # A Dask DataFrame (given directly, or produced just above from a
    # dense Dask array) is converted partition by partition with
    # `map_partitions`, which works where map_blocks on the array does not.
    if isinstance(data, dask.dataframe.DataFrame):
        return data.map_partitions(_conv_df_to_sparse,
                                   meta=dask.array.from_array(meta))

    # Anything else is a single local object: convert it, push it to a
    # worker and wrap it as a Dask array with the original shape.
    scattered = client.scatter(_conv_array_to_sparse(data))
    return dask.array.from_delayed(scattered, shape=shape, meta=meta)
Ejemplo n.º 23
0
def _check_split_size(size, name, n_samples):
    """Validate a `train_size`/`test_size` argument of `train_test_split`.

    Floats must lie in [0, 1] and ints in [0, n_samples]; any other value
    (including None) is accepted unchanged.
    """
    if isinstance(size, float):
        if not 0 <= size <= 1:
            raise ValueError("proportion {} should be between "
                             "0 and 1 (found {})".format(name, size))
    elif isinstance(size, int):
        if not 0 <= size <= n_samples:
            raise ValueError(
                "Number of instances {} should be between 0 and the "
                "first dimension of X (found {})".format(name, size))


def _slice_rows(data, start, stop):
    """Return rows [start, stop) of `data`, using `.iloc` for cuDF objects."""
    if isinstance(data, (cudf.DataFrame, cudf.Series)):
        return data.iloc[start:stop]
    return data[start:stop]


def train_test_split(X,
                     y,
                     test_size: Union[float, int] = None,
                     train_size: Union[float, int] = None,
                     shuffle: bool = True,
                     random_state: Union[int, cp.random.RandomState,
                                         np.random.RandomState] = None,
                     seed: Union[int, cp.random.RandomState,
                                 np.random.RandomState] = None):
    """
    Partitions device data into four collated objects, mimicking
    Scikit-learn's `train_test_split`

    Parameters
    ----------
    X : cudf.DataFrame or cuda_array_interface compliant device array
        Data to split, has shape (n_samples, n_features)
    y : str, cudf.Series or cuda_array_interface compliant device array
        Set of labels for the data, either a series of shape (n_samples) or
        the string label of a column in X (if it is a cuDF DataFrame)
        containing the labels
    test_size : float or int, optional
        If float, represents the proportion [0, 1] of the data
        to be assigned to the test set. If an int, represents the number
        of instances to be assigned to the test set. When omitted, the test
        set receives every row not counted in `train_size`.
    train_size : float or int, optional
        If float, represents the proportion [0, 1] of the data
        to be assigned to the training set. If an int, represents the number
        of instances to be assigned to the training set. Defaults to 0.75
        when neither `train_size` nor `test_size` is given.
    shuffle : bool, optional
        Whether or not to shuffle inputs before splitting
    random_state : int, CuPy RandomState or NumPy RandomState optional
        If shuffle is true, seeds the generator. Unseeded by default
    seed: random_state : int, CuPy RandomState or NumPy RandomState optional
        Deprecated in favor of `random_state`.
        If shuffle is true, seeds the generator. Unseeded by default

    Examples
    --------
    .. code-block:: python

        import cudf
        from cuml.preprocessing.model_selection import train_test_split

        # Generate some sample data
        df = cudf.DataFrame({'x': range(10),
                             'y': [0, 1] * 5})
        print(f'Original data: {df.shape[0]} elements')

        # Suppose we want an 80/20 split
        X_train, X_test, y_train, y_test = train_test_split(df, 'y',
                                                            train_size=0.8)
        print(f'X_train: {X_train.shape[0]} elements')
        print(f'X_test: {X_test.shape[0]} elements')
        print(f'y_train: {y_train.shape[0]} elements')
        print(f'y_test: {y_test.shape[0]} elements')

        # Alternatively, if our labels are stored separately
        labels = df['y']
        df = df.drop(['y'])

        # we can also do
        X_train, X_test, y_train, y_test = train_test_split(df, labels,
                                                            train_size=0.8)

    Output:

    .. code-block:: python

        Original data: 10 elements
        X_train: 8 elements
        X_test: 2 elements
        y_train: 8 elements
        y_test: 2 elements

    Returns
    -------
    X_train, X_test, y_train, y_test : cudf.DataFrame
        Partitioned dataframes. If `y` was provided as a column name, the
        column was dropped from the `X`s

    Raises
    ------
    TypeError
        If `X` or `y` has an unsupported type, if `y` is a column name but
        `X` is not a cuDF DataFrame, or if `random_state` has an
        unsupported type.
    ValueError
        If `X` and `y` differ in their first dimension, or if `train_size`
        or `test_size` is out of range.
    """
    if isinstance(y, str):
        # Use the column with name `str` as y
        if isinstance(X, cudf.DataFrame):
            name = y
            y = X[name]
            X = X.drop(name)
        else:
            raise TypeError("X needs to be a cuDF Dataframe when y is a "
                            "string")

    # todo: this check will be replaced with upcoming improvements
    # to input_utils with PR #1379
    # Sparse CSR input is let through because the splitting step below
    # can slice it by rows.
    if not (cuda.is_cuda_array(X) or
            isinstance(X, (cudf.DataFrame, cudf.Series,
                           cp.sparse.csr_matrix))):
        raise TypeError("X needs to be either a cuDF DataFrame, Series or "
                        "a cuda_array_interface compliant array.")

    if not (cuda.is_cuda_array(y) or
            isinstance(y, (cudf.DataFrame, cudf.Series))):
        raise TypeError("y needs to be either a cuDF DataFrame, Series or "
                        "a cuda_array_interface compliant array.")

    n_samples = X.shape[0]

    if n_samples != y.shape[0]:
        raise ValueError("X and y must have the same first dimension "
                         "(found {} and {})".format(n_samples, y.shape[0]))

    _check_split_size(train_size, "train_size", n_samples)
    _check_split_size(test_size, "test_size", n_samples)

    # Remember whether the caller handed us numba device arrays so the
    # results can be converted back to that type before returning.
    x_numba = False
    y_numba = False

    if seed is not None:
        if random_state is None:
            warnings.warn("Parameter 'seed' is deprecated, please use "
                          "'random_state' instead.")
            random_state = seed
        else:
            warnings.warn("Both 'seed' and 'random_state' parameters were "
                          "set, using 'random_state' since 'seed' is "
                          "deprecated. ")

    if shuffle:
        if random_state is None or isinstance(random_state, int):
            idxs = rmm_cupy_ary(cp.arange, n_samples)
            random_state = cp.random.RandomState(seed=random_state)

        elif isinstance(random_state, cp.random.RandomState):
            idxs = rmm_cupy_ary(cp.arange, n_samples)

        elif isinstance(random_state, np.random.RandomState):
            # A NumPy generator shuffles host memory, so build a host index
            idxs = np.arange(n_samples)

        else:
            raise TypeError("`random_state` must be an int, NumPy "
                            "RandomState or CuPy RandomState.")

        random_state.shuffle(idxs)

        # NOTE(review): a sparse X matches neither branch below and is left
        # in its original order while y is permuted; confirm whether sparse
        # input is expected together with shuffle=True.
        if isinstance(X, (cudf.DataFrame, cudf.Series)):
            X = X.iloc[idxs].reset_index(drop=True)

        elif cuda.is_cuda_array(X):
            # numba (and therefore rmm device_array) does not support
            # fancy indexing
            if cuda.devicearray.is_cuda_ndarray(X):
                x_numba = True
            X = cp.asarray(X)[idxs]

        if isinstance(y, (cudf.DataFrame, cudf.Series)):
            y = y.iloc[idxs]

        elif cuda.is_cuda_array(y):
            if cuda.devicearray.is_cuda_ndarray(y):
                y_numba = True
            y = cp.asarray(y)[idxs]

    # Determining sizes of splits
    if isinstance(train_size, float):
        train_size = int(n_samples * train_size)

    if test_size is None:
        if train_size is None:
            train_size = int(n_samples * 0.75)

        test_size = n_samples - train_size

    if isinstance(test_size, float):
        test_size = int(n_samples * test_size)

    if train_size is None:
        train_size = n_samples - test_size

    # X and y are sliced independently according to their own types, so
    # mixed inputs (e.g. DataFrame X with a device-array y) work.
    X_train = _slice_rows(X, 0, train_size)
    y_train = _slice_rows(y, 0, train_size)

    # An explicit start index is used instead of a negative one because
    # `data[-0:]` would select every row when test_size == 0.
    test_start = n_samples - test_size
    X_test = _slice_rows(X, test_start, n_samples)
    y_test = _slice_rows(y, test_start, n_samples)

    if x_numba:
        X_train = cuda.as_cuda_array(X_train)
        X_test = cuda.as_cuda_array(X_test)

    if y_numba:
        y_train = cuda.as_cuda_array(y_train)
        y_test = cuda.as_cuda_array(y_test)

    return X_train, X_test, y_train, y_test
Ejemplo n.º 24
0
 def _count_accurate_predictions(y_hat, y):
     """Count the positions where the prediction equals the label.

     Both inputs are converted with `cp.asarray` (through `rmm_cupy_ary`),
     keeping their own dtypes. A position counts as accurate when
     `y - y_hat` is zero there.
     """
     predicted = rmm_cupy_ary(cp.asarray, y_hat, dtype=y_hat.dtype)
     actual = rmm_cupy_ary(cp.asarray, y, dtype=y.dtype)
     n_mismatched = cp.count_nonzero(actual - predicted)
     return actual.shape[0] - n_mismatched
Ejemplo n.º 25
0
def input_to_dev_array(X,
                       order='F',
                       deepcopy=False,
                       check_dtype=False,
                       convert_to_dtype=False,
                       check_cols=False,
                       check_rows=False,
                       fail_on_order=False):
    """
    Convert input X to device array suitable for C++ methods.

    Acceptable input formats:

    * cuDF Dataframe - returns a deep copy always.
    * cuDF Series - returns by reference or a deep copy depending on
        `deepcopy`.
    * Numpy array - returns a copy in device always
    * cuda array interface compliant array (like Cupy) - returns a
        reference unless `deepcopy`=True.
    * numba device array - returns a reference unless deepcopy=True

    Parameters
    ----------

    X : cuDF.DataFrame, cuDF.Series, numba array, NumPy array or any
        cuda_array_interface compliant array like CuPy or pytorch.

    order: string (default: 'F')
        Whether to return a F-major or C-major array. Used to check the order
        of the input. If fail_on_order=True method will raise ValueError,
        otherwise it will convert X to be of order `order`.

    deepcopy: boolean (default: False)
        Set to True to always return a deep copy of X.

    check_dtype: np.dtype (default: False)
        Set to a np.dtype (or a collection of dtypes) to throw an error if
        X is not of dtype `check_dtype`. Ignored when `convert_to_dtype`
        is set.

    convert_to_dtype: np.dtype (default: False)
        Set to a dtype if you want X to be converted to that dtype if it is
        not that dtype already.

    check_cols: int (default: False)
        Set to an int `i` to check that input X has `i` columns. Set to False
        (default) to not check at all.

    check_rows: int (default: False)
        Set to an int `i` to check that input X has `i` rows. Set to False
        (default) to not check at all.

    fail_on_order: boolean (default: False)
        Set to True if you want the method to raise a ValueError if X is not
        of order `order`.


    Returns
    -------
    `inp_array`: namedtuple('inp_array', 'array pointer n_rows n_cols dtype')

        A new device array if the input was not a numba device
        array. It is a reference to the input X if it was a numba device array
        or cuda array interface compliant (like cupy)

    Raises
    ------
    TypeError
        If X has an unsupported type or fails the `check_dtype` test.
    ValueError
        If a cuDF DataFrame is passed with an `order` other than 'F' or 'C',
        a cuDF Series contains nulls, `check_dtype` is not a type or
        collection of types, the row/column counts do not match
        `check_rows`/`check_cols`, or the memory order differs from `order`
        while `fail_on_order` is True.

    """

    if convert_to_dtype:
        X = convert_dtype(X, to_dtype=convert_to_dtype)
        # The data now has the requested dtype, so the check is redundant
        check_dtype = False

    if isinstance(X, cudf.DataFrame):
        if order == 'F':
            X_m = X.as_gpu_matrix(order='F')
        elif order == 'C':
            X_m = cuml.utils.numba_utils.row_matrix(X)
        else:
            # Without this guard X_m would be left unbound and the failure
            # would surface later as an UnboundLocalError.
            raise ValueError("Expected order to be 'F' or 'C' but got " +
                             repr(order))

    elif isinstance(X, cudf.Series):
        if deepcopy:
            X_m = X.to_gpu_array()
        else:
            if X.null_count == 0:
                # using __cuda_array_interface__ support of cudf.Series for
                # this temporarily while switching from rmm device_array to
                # rmm deviceBuffer https://github.com/rapidsai/cuml/issues/1379
                X_m = cuda.as_cuda_array(X._column)
            else:
                raise ValueError("Error: cuDF Series has missing/null values")

    elif isinstance(X, np.ndarray):
        X_m = rmm.to_device(np.array(X, order=order, copy=False))

    elif cuda.is_cuda_array(X):
        # Use cuda array interface to create a device array by reference
        X_m = cuda.as_cuda_array(X)

        if deepcopy:
            out_dev_array = rmm.device_array_like(X_m)
            out_dev_array.copy_to_device(X_m)
            X_m = out_dev_array

    elif cuda.devicearray.is_cuda_ndarray(X):
        # NOTE(review): numba device arrays presumably already satisfy
        # `cuda.is_cuda_array` above, which would make this branch
        # unreachable; kept as a fallback until that is confirmed.
        if deepcopy:
            out_dev_array = rmm.device_array_like(X)
            out_dev_array.copy_to_device(X)
            X_m = out_dev_array
        else:
            X_m = X

    else:
        msg = "X matrix format " + str(X.__class__) + " not supported"
        raise TypeError(msg)

    dtype = X_m.dtype

    if check_dtype:
        if isinstance(check_dtype, (type, np.dtype)):
            if dtype != check_dtype:
                del X_m
                raise TypeError("Expected " + str(check_dtype) +
                                " input but got " + str(dtype) + " instead.")
        elif isinstance(check_dtype, Collection) and \
                not isinstance(check_dtype, str):
            # The 'not isinstance(check_dtype, string)' condition is needed,
            # because the 'float32' string is a Collection, but in this
            # branch we only want to process collections like
            # [np.float32, np.float64].
            if dtype not in check_dtype:
                del X_m
                raise TypeError("Expected input to be of type in " +
                                str(check_dtype) + " but got " + str(dtype))
        else:
            raise ValueError("Expected a type as check_dtype arg, but got " +
                             str(check_dtype))

    n_rows = X_m.shape[0]
    # One-dimensional inputs are reported as a single column
    n_cols = X_m.shape[1] if len(X_m.shape) > 1 else 1

    if check_cols:
        if n_cols != check_cols:
            raise ValueError("Expected " + str(check_cols) +
                             " columns but got " + str(n_cols) + " columns.")

    if check_rows:
        if n_rows != check_rows:
            raise ValueError("Expected " + str(check_rows) + " rows but got " +
                             str(n_rows) + " rows.")

    if not check_numba_order(X_m, order):
        if fail_on_order:
            raise ValueError("Expected " + order_to_str(order) +
                             " major order, but got the opposite.")
        else:
            warnings.warn("Expected " + order_to_str(order) + " major order, "
                          "but got the opposite. Converting data, this will "
                          "result in additional memory utilization.")
            X_m = rmm_cupy_ary(cp.array, X_m, copy=False, order=order)
            X_m = cuda.as_cuda_array(X_m)

    X_ptr = get_dev_array_ptr(X_m)

    return inp_array(array=X_m,
                     pointer=X_ptr,
                     n_rows=n_rows,
                     n_cols=n_cols,
                     dtype=dtype)
Ejemplo n.º 26
0
 def _func_unique_classes(y):
     """Return the result of `cp.unique` applied to `y`.

     The call is routed through `rmm_cupy_ary` rather than made directly.
     """
     unique_labels = rmm_cupy_ary(cp.unique, y)
     return unique_labels