def make_mask(size):
    """Allocate a device mask array able to hold at least ``size`` bits.

    The requested bit count is converted to a number of mask elements with
    ``calc_chunk_size`` (using ``mask_bitsize``), and an array of that many
    ``mask_dtype`` elements is allocated.  This function does not write
    anything into the returned array.
    """
    n_mask_elements = calc_chunk_size(size, mask_bitsize)
    mask = rmm.device_array(shape=n_mask_elements, dtype=mask_dtype)
    return mask
Notes ----- The parameters `case`, `flags`, and `na` are not yet supported and will raise a NotImplementedError if anything other than the default value is set. """ if case is not True: raise NotImplementedError("`case` parameter is not yet supported") elif flags != 0: raise NotImplementedError("`flags` parameter is not yet supported") elif na is not np.nan: raise NotImplementedError("`na` parameter is not yet supported") from cudf.core import Series out_dev_arr = rmm.device_array(len(self._parent), dtype="bool") ptr = libcudf.cudf.get_ctype_ptr(out_dev_arr) self._parent.nvstrings.contains(pat, regex=regex, devptr=ptr) mask = None if self._parent.has_nulls: mask = self._parent.mask col = column.build_column( Buffer(out_dev_arr), dtype=np.dtype("bool"), mask=mask ) return Series(col, index=self._index, name=self._name) def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): """
def _agg_groups(self, functors):
    """Aggregate the groups.

    Parameters
    ----------
    functors: dict
        Maps a value-column name to the list of functors to apply to it.

    Returns
    -------
    DataFrame
        The grouping-key columns followed by the aggregated value columns.
        A column with exactly one functor keeps its own name; otherwise each
        result is named ``"<column>_<functor.__name__>"``.  A column mapped
        to an empty functor list contributes no output column.

    Raises
    ------
    NameError
        If a requested column is not present in the grouped frame.
    """
    # Functors that have a dedicated device kernel, keyed by ``__name__``.
    # The second item is the output dtype; ``None`` means "use the dtype of
    # the series being aggregated".
    device_kernels = {
        "mean": (group_mean, np.float64),
        "max": (group_max, None),
        "min": (group_min, None),
    }

    # The "value" columns: column name -> {output name: functor}
    functors_mapping = OrderedDict()
    for k, vs in functors.items():
        if k not in self._df.columns:
            raise NameError("column {} not found".format(k))
        if len(vs) == 1:
            [only_functor] = vs
            functors_mapping[k] = {k: only_functor}
        else:
            # Built without a leftover loop variable, so an empty ``vs``
            # simply yields an empty mapping instead of failing on a
            # ``del`` of a name that was never bound.
            functors_mapping[k] = OrderedDict(
                ("{}_{}".format(k, functor.__name__), functor)
                for functor in vs
            )

    # Grouping
    grouped_df, sr_segs = self._group_dataframe(self._df, self._by)

    # Grouped values
    outdf = cudf.DataFrame()
    segs = sr_segs.to_array()
    for k in self._by:
        outdf[k] = grouped_df[k].take(sr_segs).reset_index(drop=True)
    size = len(outdf)

    # Append value columns
    for k, infos in functors_mapping.items():
        # Host-side fallback results default to a float64 buffer per output.
        values = defaultdict(lambda: np.zeros(size, dtype=np.float64))
        sr = grouped_df[k].reset_index(drop=True)
        for newk, functor in infos.items():
            kernel_spec = device_kernels.get(functor.__name__)
            if kernel_spec is not None:
                kernel, out_dtype = kernel_spec
                if out_dtype is None:
                    out_dtype = sr.dtype
                dev_begins = rmm.to_device(np.asarray(segs))
                dev_out = rmm.device_array(size, dtype=out_dtype)
                # Nothing to launch when there are no groups.
                if size > 0:
                    kernel.forall(size)(
                        sr._column.data_array_view, dev_begins, dev_out
                    )
                values[newk] = dev_out
            else:
                # Generic functor: apply it to each [begin, end) segment.
                end = chain(segs[1:], [len(grouped_df)])
                for i, (s, e) in enumerate(zip(segs, end)):
                    values[newk][i] = functor(sr[s:e])
        # Store
        for out_name, buf in values.items():
            outdf[out_name] = buf
    return outdf
def cuda_array(size):
    """Return a new RMM device array of ``size`` elements with dtype uint8."""
    byte_buffer = rmm.device_array(size, dtype=np.uint8)
    return byte_buffer
def null(cls, dtype):
    """Build an empty instance of ``cls`` for the given ``dtype``.

    The instance wraps a zero-length device array and is constructed with
    ``size=0`` and ``capacity=0``.
    """
    empty_mem = rmm.device_array(0, dtype=dtype)
    return cls(empty_mem, size=0, capacity=0)
os.environ.setdefault("UCX_RNDV_SCHEME", "put_zcopy") os.environ.setdefault("UCX_MEMTYPE_CACHE", "n") os.environ.setdefault("UCX_TLS", "tcp,rc,cuda_copy,cuda_ipc") logger = logging.getLogger(__name__) MAX_MSG_LOG = 23 # ---------------------------------------------------------------------------- # Comm Interface # ---------------------------------------------------------------------------- # Let's find the function, `cuda_array`, to use when allocating new CUDA arrays try: import rmm cuda_array = lambda n: rmm.device_array(n, dtype=np.uint8) except ImportError: try: import numba.cuda cuda_array = lambda n: numba.cuda.device_array((n, ), dtype=np.uint8) except ImportError: def cuda_array(n): raise RuntimeError( "In order to send/recv CUDA arrays, Numba or RMM is required") class UCX(Comm): """Comm object using UCP.
def gpu_major_converter(original, nrows, ncols, dtype, to_order='C'):
    """Copy ``original`` into a new device array laid out in ``to_order``.

    Parameters
    ----------
    original : device array
        Source 2-D array passed to the copy kernel.
        NOTE(review): presumably of shape (nrows, ncols) -- confirm at caller.
    nrows, ncols : int
        Shape of the array that is allocated and returned.
    dtype : dtype or str
        Element type of the output.  Only the exact value ``'float32'``
        selects a float32 shared-memory tile; anything else uses float64.
    to_order : str, default 'C'
        Memory order handed to ``rmm.device_array`` for the output.

    Returns
    -------
    device array
        The newly allocated ``(nrows, ncols)`` array.  The local is named
        ``row_major`` even when ``to_order`` is not ``'C'``.
    """
    row_major = rmm.device_array((nrows, ncols), dtype=dtype, order=to_order)

    tpb = driver.get_device().MAX_THREADS_PER_BLOCK

    # Tile edge lengths: the width is the integer part of sqrt(tpb), the
    # height is chosen so that width * height does not exceed tpb.
    tile_width = int(math.pow(2, math.log(tpb, 2) / 2))
    tile_height = int(tpb / tile_width)

    # One extra column in the shared tile.
    # NOTE(review): presumably padding against shared-memory bank conflicts,
    # as in the article linked below -- confirm.
    tile_shape = (tile_height, tile_width + 1)

    # blocks and threads for the shared memory/tiled algorithm
    # see http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/ # noqa
    blocks = int((row_major.shape[1]) / tile_height + 1), \
        int((row_major.shape[0]) / tile_width + 1)
    threads = tile_height, tile_width

    # blocks per gpu for the general kernel
    # (ceiling division: one thread per row)
    bpg = (nrows + tpb - 1) // tpb

    # Element type of the shared tile; any dtype other than 'float32'
    # falls through to float64.
    if dtype == 'float32':
        dev_dtype = numba.float32
    else:
        dev_dtype = numba.float64

    @cuda.jit
    def general_kernel(input, output):
        # Fallback kernel: each thread handles one row and copies it
        # column by column.
        tid = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x

        # Threads past the last row have nothing to do.
        if tid >= nrows:
            return

        _col_offset = 0
        while _col_offset < input.shape[1]:
            col_idx = _col_offset
            output[tid, col_idx] = input[tid, col_idx]
            _col_offset += 1

    @cuda.jit
    def shared_kernel(input, output):
        # Tiled kernel: every thread stages one element in the shared tile,
        # the block synchronises, then every thread writes out the element
        # stored at the swapped tile position.
        tile = cuda.shared.array(shape=tile_shape, dtype=dev_dtype)

        tx = cuda.threadIdx.x
        ty = cuda.threadIdx.y
        bx = cuda.blockIdx.x * cuda.blockDim.x
        by = cuda.blockIdx.y * cuda.blockDim.y
        # Output coordinates use the thread indices swapped relative to the
        # load below.
        y = by + tx
        x = bx + ty

        # Load phase, guarded against reading outside the input.
        if by + ty < input.shape[0] and bx + tx < input.shape[1]:
            tile[ty, tx] = input[by + ty, bx + tx]
        cuda.syncthreads()
        # Store phase: tile[tx, ty] is the slot filled from
        # input[by + tx, bx + ty], i.e. the same (y, x) position.
        if y < output.shape[0] and x < output.shape[1]:
            output[y, x] = tile[tx, ty]

    # check if we cannot call the shared memory kernel
    # block limits: 2**31-1 for x, 65535 for y dim of blocks
    if blocks[0] > 2147483647 or blocks[1] > 65535:
        general_kernel[bpg, tpb](original, row_major)
    else:
        shared_kernel[blocks, threads](original, row_major)

    return row_major
def window_sizes_from_offset(arr, offset):
    """Run ``gpu_window_sizes_from_offset`` over ``arr``.

    Allocates an int32 device array with the same shape as ``arr`` and, when
    ``arr`` is non-empty, launches the kernel with ``(arr, output, offset)``.
    Returns the output array; no kernel is launched for an empty input.
    """
    sizes_out = rmm.device_array(shape=(arr.shape), dtype="int32")
    if arr.size <= 0:
        return sizes_out
    launch = gpu_window_sizes_from_offset.forall(arr.size)
    launch(arr, sizes_out, offset)
    return sizes_out
def __setitem__(self, key, value):
    """
    Write ``value`` into the positions of this column selected by ``key``.

    ``key`` is either a slice (only a step of 1 is supported) or something
    ``column.as_column`` can convert; a boolean key must have the same length
    as the column and is turned into the positions where it is true.
    ``value`` may be a scalar or column-like and is cast to ``self.dtype``
    before being written.

    Raises
    ------
    NotImplementedError
        If a slice key has a step other than 1.
    ValueError
        If a boolean mask has the wrong length, or ``value`` does not have
        as many elements as ``key`` selects.
    IndexError
        If the scatter reports an out-of-bounds index.
    """
    from cudf.core import column

    key_is_slice = isinstance(key, slice)

    # Work out how many elements the key addresses.
    if key_is_slice:
        start, stop, step = key.indices(len(self))
        if step != 1:
            raise NotImplementedError("Stride not supported in slice")
        nelem = abs(stop - start)
    else:
        key = column.as_column(key)
        if pd.api.types.is_bool_dtype(key.dtype):
            if len(key) != len(self):
                raise ValueError(
                    "Boolean mask must be of same length as column")
            # Replace the mask by the positions it selects.
            key = column.as_column(cudautils.arange(len(self)))[key]
        nelem = len(key)

    # Expand a scalar into ``nelem`` elements.
    if is_scalar(value):
        if is_categorical_dtype(self.dtype):
            from cudf.utils.cudautils import fill_value

            codes_buf = rmm.device_array(nelem, dtype=self.codes.dtype)
            fill_value(codes_buf, self._encode(value))
            value = build_categorical_column(
                categories=self.dtype.categories,
                codes=as_column(codes_buf),
                ordered=self.dtype.ordered,
            )
        elif value is None:
            value = column.column_empty(nelem, self.dtype, masked=True)
        else:
            target_dtype = pd.api.types.pandas_dtype(self.dtype)
            value = utils.scalar_broadcast_to(value, nelem, target_dtype)

    value = column.as_column(value).astype(self.dtype)

    if len(value) != nelem:
        raise ValueError(
            f"Size mismatch: cannot set value "
            f"of size {len(value)} to indexing result of size "
            f"{nelem}"
        )

    if is_categorical_dtype(value.dtype):
        value = value.cat().set_categories(self.categories)._column
        assert self.dtype == value.dtype

    if key_is_slice:
        out = libcudf.copying.copy_range(self, value, start, stop, 0)
    else:
        try:
            out = libcudf.copying.scatter(value, key, self)
        except RuntimeError as e:
            # Only the out-of-bounds case is translated; any other runtime
            # error propagates unchanged.
            if "out of bounds" in str(e):
                raise IndexError(
                    f"index out of bounds for column of size {len(self)}")
            raise

    self._mimic_inplace(out, inplace=True)
def modulo(arr, d):
    """Array element modulo operator.

    Allocates an output with the shape and dtype of ``arr`` and, for a
    non-empty input, launches ``gpu_modulo`` with ``(arr, output, d)``.
    """
    result = rmm.device_array(shape=arr.shape, dtype=arr.dtype)
    if arr.size <= 0:
        return result
    launch = gpu_modulo.forall(arr.size)
    launch(arr, result, d)
    return result
def zeros(size, dtype):
    """Allocate a device array of ``size`` elements and run ``gpu_zeros`` on it.

    The kernel is skipped when ``size`` is not positive; the freshly
    allocated array is returned either way.
    """
    buf = rmm.device_array(size, dtype=dtype)
    if not size > 0:
        return buf
    launch = gpu_zeros.forall(size)
    launch(size, buf)
    return buf
def arange_reversed(size, dtype=np.int32):
    """Allocate ``size`` elements and fill them via ``gpu_arange_reversed``.

    The kernel is launched with ``(size, output)`` only when ``size`` is
    positive; the output array is returned in both cases.
    """
    buf = rmm.device_array(size, dtype=dtype)
    if not size > 0:
        return buf
    launch = gpu_arange_reversed.forall(size)
    launch(size, buf)
    return buf
def compute_scale(arr, vmin, vmax):
    """Run ``gpu_scale`` on ``arr`` with the bounds ``vmin`` and ``vmax``.

    The result is a float64 device array with ``arr.size`` elements.  No
    kernel is launched when that array is empty.
    """
    scaled = rmm.device_array(shape=arr.size, dtype=np.float64)
    if scaled.size > 0:
        gpu_scale.forall(scaled.size)(arr, vmin, vmax, scaled)
    return scaled
def full(size, value, dtype):
    """Return a device array of ``size`` elements passed through ``fill_value``.

    The array is allocated with the given ``dtype`` and then handed to
    ``fill_value`` together with ``value``.
    """
    filled = rmm.device_array(size, dtype=dtype)
    fill_value(filled, value)
    return filled
def rmm_device_array(n):
    """Allocate ``n`` unsigned bytes (dtype ``"u1"``) with RMM.

    A finalizer is attached so that ``numba.cuda.current_context`` is called
    once the array has been garbage collected.
    """
    buf = rmm.device_array(n, dtype="u1")
    # NOTE(review): presumably this makes sure a CUDA context is current
    # around the time the memory is released -- confirm the intent.
    weakref.finalize(buf, numba.cuda.current_context)
    return buf
def as_column(self):
    """Materialise this object as a column.

    A non-empty object is expanded with ``cudautils.arange`` from
    ``self._start`` to ``self._stop``; an empty one becomes a zero-length
    device array.  Both use ``self.dtype``.
    """
    if len(self) > 0:
        values = cudautils.arange(self._start, self._stop, dtype=self.dtype)
    else:
        values = rmm.device_array(0, dtype=self.dtype)
    data_buffer = Buffer(values)
    return column.build_column(data=data_buffer, dtype=values.dtype)
def as_contiguous(arr):
    """Copy the 1-D array ``arr`` into a newly allocated device array.

    The destination has the same shape and dtype as ``arr``; the return
    value is whatever ``copy_array`` returns.
    """
    assert arr.ndim == 1
    destination = rmm.device_array(shape=arr.shape, dtype=arr.dtype)
    copied = copy_array(arr, out=destination)
    return copied
def apply_binarize(in_col, width):
    """Run the ``binarize`` kernel on ``in_col``.

    Allocates an int8 device array of shape ``(in_col.size, width)``.  When
    it is non-empty, it is set to zero and ``binarize`` is launched with
    ``(in_col, output, width)``.  Returns the output array.
    """
    n_rows = in_col.size
    encoded = rmm.device_array((n_rows, width), dtype="int8")
    if not encoded.size > 0:
        return encoded
    # Clear the buffer before the kernel writes into it.
    encoded[:] = 0
    launch = binarize.forall(encoded.size)
    launch(in_col, encoded, width)
    return encoded