def to_sp_dask_array(cudf_or_array, client=None):
    """
    Converts an array or cuDF to a sparse Dask array backed by sparse CuPy
    CSR matrices.

    Unfortunately, due to current limitations in Dask, there is no direct
    path to convert a cupy.sparse.spmatrix into a CuPy backed dask.Array
    without copying to host.

    NOTE: Until https://github.com/cupy/cupy/issues/2655 and
    https://github.com/dask/dask/issues/5604 are implemented, compute()
    will not be able to be called on a Dask.array that is backed with
    sparse CuPy arrays because they lack the necessary functionality
    to be stacked into a single array. The array returned from this
    utility will, however, still be able to be passed into functions
    that can make use of sparse CuPy-backed Dask.Array (eg. Distributed
    Naive Bayes).

    Relevant cuML issue: https://github.com/rapidsai/cuml/issues/1387

    Parameters
    ----------
    cudf_or_array : cuDF Dataframe, array-like sparse / dense array, or
                    Dask DataFrame/Array
        Accepted concrete types are: dask.array.Array,
        dask.dataframe.DataFrame, scipy sparse matrix, CuPy sparse matrix,
        cudf.DataFrame, numpy.ndarray and cupy.ndarray.
    client : dask.distributed.Client (optional)
        Dask client. When None, the result of default_client() is used.

    Returns
    -------
    dask_array : dask.Array backed by cupy.sparse.csr_matrix

    Raises
    ------
    ValueError
        If a (Dask or cuDF) DataFrame contains more than one dtype, or if
        the input type is not one of the supported types.
    """
    client = default_client() if client is None else client

    # Makes sure the MatDescriptor workaround for CuPy sparse arrays
    # is loaded (since Dask lazy-loaded serialization in cuML is only
    # executed when object from the cuML package needs serialization.
    # This can go away once the MatDescriptor pickling bug is fixed
    # in CuPy.
    # Ref: https://github.com/cupy/cupy/issues/3061
    from cuml.comm import serialize  # NOQA

    # Captured before any conversion; only used by the non-Dask path below.
    shape = cudf_or_array.shape

    if isinstance(cudf_or_array, (dask.dataframe.DataFrame, cudf.DataFrame)):
        dtypes = np.unique(cudf_or_array.dtypes)

        if len(dtypes) > 1:
            raise ValueError("DataFrame should contain only a single dtype")

        dtype = dtypes[0]
    else:
        dtype = cudf_or_array.dtype

    # Tiny sparse matrix used only as the `meta` for the resulting array.
    meta = cupyx.scipy.sparse.csr_matrix(rmm_cupy_ary(cp.zeros, 1))

    if isinstance(cudf_or_array, dask.array.Array):
        # At the time of developing this, using map_blocks will not work
        # to convert a Dask.Array to CuPy sparse arrays underneath.
        parts = client.sync(_extract_partitions, cudf_or_array)
        cudf_or_array = [
            client.submit(_conv_np_to_df, part, workers=[w])
            for w, part in parts
        ]

        cudf_or_array = to_dask_cudf(cudf_or_array)

    if isinstance(cudf_or_array, dask.dataframe.DataFrame):
        # Dask.Dataframe needs special attention since it has multiple
        # dtypes. Just use the first (and assume all the rest are the same).
        # This will also handle the input of dask.array.Array, which was
        # converted to a Dask DataFrame above.
        return cudf_or_array.map_partitions(
            _conv_df_to_sp, meta=dask.array.from_array(meta))

    # Single-object inputs: build one CuPy CSR matrix locally.
    if scipy.sparse.isspmatrix(cudf_or_array):
        cudf_or_array = \
            cupyx.scipy.sparse.csr_matrix(cudf_or_array.tocsr())
    elif cupyx.scipy.sparse.isspmatrix(cudf_or_array):
        # Already a CuPy sparse matrix; used as-is.
        pass
    elif isinstance(cudf_or_array, cudf.DataFrame):
        cupy_ary = cp.asarray(cudf_or_array.as_gpu_matrix(), dtype)
        cudf_or_array = cupyx.scipy.sparse.csr_matrix(cupy_ary)
    elif isinstance(cudf_or_array, np.ndarray):
        cupy_ary = rmm_cupy_ary(cp.asarray, cudf_or_array,
                                dtype=cudf_or_array.dtype)
        cudf_or_array = cupyx.scipy.sparse.csr_matrix(cupy_ary)
    # Use the public `cp.ndarray` name: `cp.core` is a private module path
    # that is not available in newer CuPy releases, whereas `cp.ndarray`
    # refers to the same class in every release.
    elif isinstance(cudf_or_array, cp.ndarray):
        cudf_or_array = cupyx.scipy.sparse.csr_matrix(cudf_or_array)
    else:
        raise ValueError("Unexpected input type %s" % type(cudf_or_array))

    # Push to worker
    cudf_or_array = client.scatter(cudf_or_array)

    return dask.array.from_delayed(cudf_or_array, shape=shape, meta=meta)
def to_output(futures, type, client=None):
    """
    Dispatch a collection of futures to the matching Dask converter.

    Parameters
    ----------
    futures : object forwarded unchanged to the selected converter
    type : str
        'cupy' selects `to_dask_cupy`; any other value selects
        `to_dask_cudf`.
    client : dask.distributed.Client (optional)
        Forwarded to the converter as the `client` keyword.

    Returns
    -------
    Whatever the selected converter returns.
    """
    # Only the selected converter name is looked up, as in an if/else.
    converter = to_dask_cupy if type == 'cupy' else to_dask_cudf
    return converter(futures, client=client)
def to_sparse_dask_array(cudf_or_array, client=None):
    """
    Build a sparse Dask array, backed by sparse CuPy CSR matrices, from an
    array or cuDF input.

    Due to current limitations in Dask, there is no direct path to convert
    a cupy.sparse.spmatrix into a CuPy backed dask.Array without copying
    to host.

    NOTE: Until https://github.com/cupy/cupy/issues/2655 and
    https://github.com/dask/dask/issues/5604 are implemented, compute()
    cannot be called on a Dask.array that is backed with sparse CuPy
    arrays, because they lack the functionality needed to be stacked into
    a single array. The returned array can still be passed into functions
    that make use of sparse CuPy-backed Dask.Array (eg. Distributed
    Naive Bayes).

    Relevant cuML issue: https://github.com/rapidsai/cuml/issues/1387

    Parameters
    ----------
    cudf_or_array : cuDF Dataframe, array-like sparse / dense array, or
                    Dask DataFrame/Array
    client : dask.distributed.Client (optional)
        Dask client. When None, the result of default_client() is used.

    Returns
    -------
    dask_array : dask.Array backed by cupy.sparse.csr_matrix
    """
    if client is None:
        client = default_client()

    # Importing this module makes sure the MatDescriptor workaround for
    # CuPy sparse arrays is loaded: Dask's lazily-loaded serialization in
    # cuML only runs once an object from the cuML package needs to be
    # serialized. Removable once the MatDescriptor pickling bug is fixed
    # in CuPy. Ref: https://github.com/cupy/cupy/issues/3061
    from cuml.comm import serialize  # NOQA

    # Taken from the original input; only the non-Dask path uses it.
    shape = cudf_or_array.shape

    # Tiny sparse matrix used only as the `meta` of the result.
    meta = cupyx.scipy.sparse.csr_matrix(rmm_cupy_ary(cp.zeros, 1))

    data = cudf_or_array

    if isinstance(data, dask.array.Array):
        # map_blocks could not convert a Dask.Array to CuPy sparse arrays
        # underneath when this was written, so each partition is first
        # turned into a cuDF DataFrame and the Dask DataFrame path below
        # does the sparse conversion.
        def _conv_np_to_df(x):
            gpu_block = rmm_cupy_ary(cp.asarray, x, dtype=x.dtype)
            return cudf.DataFrame.from_gpu_matrix(gpu_block)

        worker_parts = client.sync(_extract_partitions, data)
        df_futures = [
            client.submit(_conv_np_to_df, part,
                          workers=[worker], pure=False)
            for worker, part in worker_parts
        ]
        data = to_dask_cudf(df_futures)

    if not isinstance(data, dask.dataframe.DataFrame):
        # Single-object input: convert it locally, then push to a worker.
        scattered = client.scatter(_conv_array_to_sparse(data))
        return dask.array.from_delayed(scattered, shape=shape, meta=meta)

    # Dask DataFrame (given directly, or produced from a Dask array above):
    # `map_partitions` is usable here even though `map_blocks` on the
    # array is not.
    return data.map_partitions(_conv_df_to_sparse,
                               meta=dask.array.from_array(meta))