Exemple #1
0
def to_sp_dask_array(cudf_or_array, client=None):
    """
    Converts an array or cuDF to a sparse Dask array backed by sparse CuPy
    CSR matrices. Unfortunately, due to current limitations in Dask, there is
    no direct path to convert a cupy.sparse.spmatrix into a CuPy backed
    dask.Array without copying to host.


    NOTE: Until https://github.com/cupy/cupy/issues/2655 and
    https://github.com/dask/dask/issues/5604 are implemented, compute()
    will not be able to be called on a Dask.array that is backed with
    sparse CuPy arrays because they lack the necessary functionality
    to be stacked into a single array. The array returned from this
    utility will, however, still be able to be passed into functions
    that can make use of sparse CuPy-backed Dask.Array (eg. Distributed
    Naive Bayes).

    Relevant cuML issue: https://github.com/rapidsai/cuml/issues/1387

    Parameters
    ----------
    cudf_or_array : cuDF Dataframe, array-like sparse / dense array, or
                    Dask DataFrame/Array
        Accepted non-Dask inputs are SciPy sparse matrices, CuPy sparse
        matrices, cudf.DataFrame, numpy.ndarray and cupy.ndarray.
    client : dask.distributed.Client (optional) Dask client
        When None, ``default_client()`` is used.

    Returns
    -------
    dask_array : dask.Array backed by cupy.sparse.csr_matrix

    Raises
    ------
    ValueError
        If a (Dask or cuDF) DataFrame input contains more than one dtype,
        or if the input is of an unsupported type.
    """
    client = default_client() if client is None else client

    # Makes sure the MatDescriptor workaround for CuPy sparse arrays
    # is loaded (since Dask lazy-loaded serialization in cuML is only
    # executed when object from the cuML package needs serialization.
    # This can go away once the MatDescriptor pickling bug is fixed
    # in CuPy.
    # Ref: https://github.com/cupy/cupy/issues/3061
    from cuml.comm import serialize  # NOQA

    # Captured up front: the input name is rebound further down, and the
    # original shape is what the final dask array is declared with.
    shape = cudf_or_array.shape

    if isinstance(cudf_or_array, (dask.dataframe.DataFrame, cudf.DataFrame)):
        # DataFrames may carry one dtype per column; only homogeneous
        # frames are accepted.
        dtypes = np.unique(cudf_or_array.dtypes)

        if len(dtypes) > 1:
            raise ValueError("DataFrame should contain only a single dtype")

        dtype = dtypes[0]
    else:
        dtype = cudf_or_array.dtype

    meta = cupyx.scipy.sparse.csr_matrix(rmm_cupy_ary(cp.zeros, 1))

    if isinstance(cudf_or_array, dask.array.Array):
        # At the time of developing this, using map_blocks will not work
        # to convert a Dask.Array to CuPy sparse arrays underneath.
        # Instead each partition is converted to a DataFrame on the worker
        # that holds it, and the Dask DataFrame path below does the rest.
        parts = client.sync(_extract_partitions, cudf_or_array)
        cudf_or_array = [
            client.submit(_conv_np_to_df, part, workers=[w])
            for w, part in parts
        ]

        cudf_or_array = to_dask_cudf(cudf_or_array)

    if isinstance(cudf_or_array, dask.dataframe.DataFrame):
        # Dask.Dataframe needs special attention since it has multiple
        # dtypes. Just use the first (and assume all the rest are the same).
        # This will also handle the input of dask.array.Array.
        return cudf_or_array.map_partitions(
            _conv_df_to_sp, meta=dask.array.from_array(meta))

    # Single (non-Dask) object: normalise to a CuPy sparse CSR matrix.
    if scipy.sparse.isspmatrix(cudf_or_array):
        cudf_or_array = \
            cupyx.scipy.sparse.csr_matrix(cudf_or_array.tocsr())
    elif cupyx.scipy.sparse.isspmatrix(cudf_or_array):
        # Already a CuPy sparse matrix; used as-is.
        pass
    elif isinstance(cudf_or_array, cudf.DataFrame):
        cupy_ary = cp.asarray(cudf_or_array.as_gpu_matrix(), dtype)
        cudf_or_array = cupyx.scipy.sparse.csr_matrix(cupy_ary)
    elif isinstance(cudf_or_array, np.ndarray):
        cupy_ary = rmm_cupy_ary(cp.asarray,
                                cudf_or_array,
                                dtype=cudf_or_array.dtype)
        cudf_or_array = cupyx.scipy.sparse.csr_matrix(cupy_ary)
    elif isinstance(cudf_or_array, cp.ndarray):
        # cp.ndarray is the public alias of the class formerly reached via
        # the private cp.core.core path, which newer CuPy no longer exposes.
        cudf_or_array = cupyx.scipy.sparse.csr_matrix(cudf_or_array)
    else:
        raise ValueError("Unexpected input type %s" % type(cudf_or_array))

    # Push to worker
    cudf_or_array = client.scatter(cudf_or_array)

    return dask.array.from_delayed(cudf_or_array, shape=shape, meta=meta)
Exemple #2
0
def to_output(futures, type, client=None):
    """
    Build a Dask collection from ``futures`` in the requested output format.

    Parameters
    ----------
    futures : passed through unchanged to the selected converter
    type : output selector; ``'cupy'`` picks ``to_dask_cupy``, any other
           value picks ``to_dask_cudf``
    client : forwarded to the converter as the ``client`` keyword

    Returns
    -------
    Whatever the selected converter returns.
    """
    # NOTE: `type` shadows the builtin, but it is part of the public
    # signature and therefore kept as-is.
    converter = to_dask_cupy if type == 'cupy' else to_dask_cudf
    return converter(futures, client=client)
Exemple #3
0
def to_sparse_dask_array(cudf_or_array, client=None):
    """
    Build a sparse Dask array, backed by sparse CuPy CSR matrices, from an
    array or cuDF input. Because of current limitations in Dask there is no
    direct path from a cupy.sparse.spmatrix to a CuPy backed dask.Array
    that avoids copying to host.


    NOTE: compute() cannot be called on a Dask.array backed by sparse CuPy
    arrays until https://github.com/cupy/cupy/issues/2655 and
    https://github.com/dask/dask/issues/5604 are implemented, since such
    arrays lack the functionality needed to be stacked into a single
    array. The returned array can nevertheless be handed to functions
    that know how to consume a sparse CuPy-backed Dask.Array (eg.
    Distributed Naive Bayes).

    Relevant cuML issue: https://github.com/rapidsai/cuml/issues/1387

    Parameters
    ----------
    cudf_or_array : cuDF Dataframe, array-like sparse / dense array, or
                    Dask DataFrame/Array
    client : dask.distributed.Client (optional) Dask client; when None,
             ``default_client()`` is used

    Returns
    -------
    dask_array : dask.Array backed by cupy.sparse.csr_matrix
    """
    if client is None:
        client = default_client()

    # Importing this module makes sure the MatDescriptor workaround for
    # CuPy sparse arrays is loaded; Dask's lazily-loaded serialization in
    # cuML otherwise only runs once an object from the cuML package needs
    # serializing. Can be dropped when the MatDescriptor pickling bug is
    # fixed in CuPy.
    # Ref: https://github.com/cupy/cupy/issues/3061
    from cuml.comm import serialize  # NOQA

    # Remember the input's shape before the data is converted below.
    shape = cudf_or_array.shape

    meta = cupyx.scipy.sparse.csr_matrix(rmm_cupy_ary(cp.zeros, 1))

    data = cudf_or_array

    # Step 1: a Dask array is first turned into a Dask DataFrame, because
    # (at the time of developing this) map_blocks cannot be used to swap
    # the underlying blocks for CuPy sparse arrays.
    if isinstance(data, dask.array.Array):

        def _conv_np_to_df(x):
            gpu_block = rmm_cupy_ary(cp.asarray, x, dtype=x.dtype)
            return cudf.DataFrame.from_gpu_matrix(gpu_block)

        worker_parts = client.sync(_extract_partitions, data)
        df_futures = [
            client.submit(_conv_np_to_df, block, workers=[worker],
                          pure=False)
            for worker, block in worker_parts
        ]

        data = to_dask_cudf(df_futures)

    # Step 2: a Dask DataFrame (either given directly or produced by the
    # step above) is converted partition-wise with `map_partitions`, which
    # does work where `map_blocks` on the array does not.
    if isinstance(data, dask.dataframe.DataFrame):
        return data.map_partitions(_conv_df_to_sparse,
                                   meta=dask.array.from_array(meta))

    # Step 3: anything else is converted locally, pushed to a worker, and
    # wrapped in a Dask array declared with the original shape.
    sparse_local = _conv_array_to_sparse(data)
    scattered = client.scatter(sparse_local)

    return dask.array.from_delayed(scattered, shape=shape, meta=meta)