def test_product(dtype, nelem):
    """Compare ``libgdf.gdf_product_generic`` with NumPy's product.

    Parameters
    ----------
    dtype : numpy dtype like
        Element type of the generated input.
    nelem : int
        Number of input elements.

    For integer dtypes the input is an array of ones in which at most 30
    randomly chosen positions are overwritten, so that the product stays
    small; other dtypes use ``gen_rand``.
    """
    if np.dtype(dtype).kind == 'i':
        data = np.ones(nelem, dtype=dtype)
        # Set at most 30 items to [0..2) to keep the value within 2^32
        for _ in range(30):
            data[random.randrange(nelem)] = random.random() * 2
    else:
        data = gen_rand(dtype, nelem)

    print('max', data.max(), 'min', data.min())
    d_data = rmm.to_device(data)
    # Output buffer sized as the reduction API requests
    d_result = rmm.device_array(libgdf.gdf_reduce_optimal_output_size(),
                                dtype=d_data.dtype)

    col_data = new_column()
    gdf_dtype = get_dtype(dtype)
    libgdf.gdf_column_view(col_data, unwrap_devary(d_data), ffi.NULL, nelem,
                           gdf_dtype)

    libgdf.gdf_product_generic(col_data, unwrap_devary(d_result),
                               d_result.size)
    # Only the first slot of the output buffer is compared
    got = d_result.copy_to_host()[0]
    # ``np.product`` was a deprecated alias (removed in NumPy 2.0);
    # ``np.prod`` is the supported spelling on every NumPy version.
    expect = np.prod(data)

    print('expect:', expect)
    print('got:', got)

    np.testing.assert_array_almost_equal(expect, got)
def prefixsum(vals):
    """Full prefix sum of *vals* with a leading zero.

    For an input of N elements the returned device array holds N + 1
    elements: slot 0 is filled with 0 and slots 1..N receive the inclusive
    "sum" scan of *vals*, so the last slot is the sum of *vals*.
    """
    import cudf.bindings.reduce as cpp_reduce
    from cudf.dataframe.numerical import NumericalColumn
    from cudf.dataframe.buffer import Buffer

    def _wrap(devary):
        # Mask-less column view over a device array
        return NumericalColumn(data=Buffer(devary), mask=None,
                               null_count=0, dtype=vals.dtype)

    # One extra slot in front of the scan output for the leading zero
    result = rmm.device_array(shape=vals.size + 1, dtype=vals.dtype)
    gpu_fill_value[1, 1](result[:1], 0)

    # The inclusive scan writes straight into result[1:]
    source_col = _wrap(vals)
    scan_col = _wrap(result[1:])
    cpp_reduce.apply_scan(source_col, scan_col, "sum", inclusive=True)
    return result
def sort(self, segments, col_keys, col_vals):
    """Run the segmented radix sort of ``self.plan`` on the given columns.

    Parameters
    ----------
    segments : device array
        Beginning offset of each segment.
    col_keys, col_vals : columns
        Objects exposing ``cffi_view``; passed to
        ``gdf_segmented_radixsort_generic`` as keys and values.
    """
    offset_dtype = np.uint32
    max_segs_per_call = 2 ** 16 - 1
    nseg = segments.size

    # A single buffer of nseg + 1 offsets: begins and ends are overlapping
    # views shifted by one element.
    d_offsets = rmm.device_array(nseg + 1, dtype=offset_dtype)
    d_begins, d_ends = d_offsets[:-1], d_offsets[1:]

    # Note: .astype is required below because .copy_to_device
    #       is just a plain memcpy
    d_begins.copy_to_device(cudautils.astype(segments, dtype=offset_dtype))
    # The last end offset is the total element count
    d_ends[-1:].copy_to_device(np.require([self.nelem], dtype=offset_dtype))

    # The following is to handle the segment size limit due to
    # max CUDA grid size.
    chunk_starts = range(0, nseg, max_segs_per_call)
    chunk_stops = itertools.chain(chunk_starts[1:], [nseg])
    for lo, hi in zip(chunk_starts, chunk_stops):
        libgdf.gdf_segmented_radixsort_generic(
            self.plan,
            col_keys.cffi_view,
            col_vals.cffi_view,
            hi - lo,
            unwrap_devary(d_begins[lo:]),
            unwrap_devary(d_ends[lo:]),
        )
def test_sum_of_squares(dtype, nelem):
    """Compare ``libgdf.gdf_sum_of_squares`` with ``(data**2).sum()``.

    Integer results are compared only when the expected value lies in
    ``[0, iinfo(dtype).max]``; otherwise the case is reported as overflow
    and passes.  Other dtypes are compared to
    ``accuracy_for_dtype[dtype]`` significant digits.
    """
    host = gen_rand(dtype, nelem)
    d_data = rmm.to_device(host)
    out_size = libgdf.gdf_reduction_get_intermediate_output_size()
    d_result = rmm.device_array(out_size, dtype=d_data.dtype)

    col_data = new_column()
    gdf_dtype = get_dtype(dtype)
    libgdf.gdf_column_view(col_data, unwrap_devary(d_data), ffi.NULL, nelem,
                           gdf_dtype)

    libgdf.gdf_sum_of_squares(col_data, unwrap_devary(d_result),
                              d_result.size)
    # Only the first slot of the output buffer is compared
    got = d_result.copy_to_host()[0]
    expect = (host**2).sum()

    print('expect:', expect)
    print('got:', got)

    if np.dtype(dtype).kind != 'i':
        np.testing.assert_approx_equal(
            expect, got, significant=accuracy_for_dtype[dtype])
        return

    if 0 <= expect <= np.iinfo(dtype).max:
        np.testing.assert_array_almost_equal(expect, got)
    else:
        print('overflow, passing')
def row_matrix(cols, nrow, ncol, dtype):
    """Pack *cols* into a newly allocated C-ordered device matrix.

    Parameters
    ----------
    cols : iterable of columns
        Each item must provide ``to_gpu_array()``; item ``i`` is written to
        matrix column ``i``.
    nrow, ncol : int
        Shape of the output matrix.
    dtype : np.dtype like
        Element type of the output matrix.

    Returns
    -------
    matrix : device array of shape ``(nrow, ncol)``
    """
    matrix = rmm.device_array(shape=(nrow, ncol), dtype=dtype, order='C')
    for colidx, col in enumerate(cols):
        target = matrix[:, colidx]
        # Skip the launch for empty columns, matching the size > 0 guard
        # used by the other kernel launchers in this code base.
        if target.size > 0:
            gpu_row_matrix.forall(target.size)(target, col.to_gpu_array(),
                                               nrow, ncol)
    return matrix
def apply_equal_constant(arr, mask, val, dtype):
    """Compute ``arr[mask] == val``

    Parameters
    ----------
    arr : device array
        data
    mask : device array
        validity mask; ``None`` selects the unmasked kernel
    val : scalar
        value to compare against
    dtype : np.dtype
        output array dtype

    Returns
    -------
    result : device array
        One element per element of *arr*.
    """
    result = rmm.device_array(shape=arr.size, dtype=dtype)
    # Nothing to launch for an empty input
    if result.size <= 0:
        return result

    if mask is None:
        gpu_equal_constant.forall(result.size)(arr, val, result)
    else:
        gpu_equal_constant_masked.forall(result.size)(arr, mask, val, result)
    return result
def test_output_dtype_mismatch():
    """Binary ops must reject a result column of the wrong dtype.

    Two int32 inputs are paired with a float32 result column; the generic
    add, eq and bitwise-and entry points are each expected to raise
    ``GDFError`` matching ``GDF_UNSUPPORTED_DTYPE``.
    """
    input_dtype = np.int32
    nelem = 5

    d_lhs = rmm.to_device(np.arange(nelem, dtype=input_dtype))
    d_rhs = rmm.to_device(np.arange(nelem, dtype=input_dtype))
    d_result = rmm.device_array(d_lhs.size, dtype=np.float32)

    # Build one column view per device buffer: lhs, rhs, result
    views = []
    for devary, np_dtype in ((d_lhs, input_dtype),
                             (d_rhs, input_dtype),
                             (d_result, d_result.dtype)):
        column = new_column()
        libgdf.gdf_column_view(column, unwrap_devary(devary), ffi.NULL,
                               nelem, get_dtype(np_dtype))
        views.append(column)
    col_lhs, col_rhs, col_result = views

    for op_name in ('gdf_add_generic',
                    'gdf_eq_generic',
                    'gdf_bitwise_and_generic'):
        with pytest.raises(GDFError) as raises:
            getattr(libgdf, op_name)(col_lhs, col_rhs, col_result)
        raises.match("GDF_UNSUPPORTED_DTYPE")
def gather(data, index, out=None):
    """Perform ``out = data[index]`` on the GPU

    When *out* is ``None`` a new device array with ``index.size`` elements
    of ``data.dtype`` is allocated.  The (possibly new) output array is
    returned.
    """
    count = index.size
    result = out
    if result is None:
        result = rmm.device_array(shape=count, dtype=data.dtype)
    gpu_gather.forall(count)(data, index, result)
    return result
def __init__(self, data, null_count=None, **kwargs):
    """
    Parameters
    ----------
    data : nvstrings.nvstrings
        The nvstrings object.  A host ``Sequence`` is also accepted and is
        converted with ``nvstrings.to_device``.
    null_count : int; optional
        The number of null values in the mask.  Taken from
        ``data.null_count()`` when omitted.
    **kwargs
        Accepted but not used by this constructor.
    """
    from collections.abc import Sequence

    # Host sequences are moved to the device first
    if isinstance(data, Sequence):
        data = nvstrings.to_device(data)
    assert isinstance(data, nvstrings.nvstrings)

    self._data = data
    self._dtype = np.dtype("object")
    self._null_count = (
        data.null_count() if null_count is None else null_count
    )
    self._mask = None

    # A null bitmask is only materialised when there are nulls
    if self._null_count > 0:
        mask_nbytes = utils.calc_chunk_size(len(self.data),
                                            utils.mask_bitsize)
        bitmask = rmm.device_array(mask_nbytes, dtype='int8')
        self.data.set_null_bitmask(get_ctype_ptr(bitmask), bdevmem=True)
        self._mask = Buffer(bitmask)

    self._nvcategory = None
    self._indices = None
def find_segments(arr, segs=None, markers=None):
    """Find beginning indices of runs of equal values.

    Parameters
    ----------
    arr : device array
        The operand.
    segs : optional; device array
        Segment offsets that must exist in the output.  Only applied when
        *markers* is not supplied.
    markers : optional; device array
        int32 marker array of the same size as *arr*.  A zero-filled one is
        allocated when omitted.

    Returns
    -------
    starting_indices : device array
        The starting indices of start of segments.
        Total segment count will be equal to the length of this.
    markers : device array
        The marker array that was used (given or newly allocated).
    """
    own_markers = markers is None
    if own_markers:
        markers = zeros(arr.size, dtype=np.int32)
    else:
        assert markers.size == arr.size
        assert markers.dtype == np.dtype(np.int32), markers.dtype

    # Mark where segments begin
    gpu_mark_segment_begins.forall(markers.size)(arr, markers)
    if own_markers and segs is not None:
        gpu_mark_seg_segments.forall(segs.size)(segs, markers)

    # Prefix sum of the markers: the last slot is the number of segments,
    # the leading slots are the output position of each marked location.
    scan = prefixsum(markers)
    nsegments = int(scan[scan.size - 1])
    positions = scan[:-1]

    # Compact segments
    begins = rmm.device_array(shape=nsegments, dtype=np.intp)
    gpu_scatter_segment_begins.forall(markers.size)(markers, positions,
                                                    begins)
    return begins, markers
def logical_op_test(dtype, expect_fn, test_fn, nelem=128, gdf_dtype=None):
    """Check a binary logical/comparison op against a host reference.

    Parameters
    ----------
    dtype : numpy dtype like
        Element type of both random inputs.
    expect_fn : callable
        ``expect_fn(h_lhs, h_rhs)`` returns the expected host result.
    test_fn : callable
        ``test_fn(col_lhs, col_rhs, col_result)`` runs the op under test.
    nelem : int
        Number of elements per input.
    gdf_dtype : optional
        GDF dtype for the input columns; derived with ``get_dtype(dtype)``
        when ``None``.
    """
    h_lhs = gen_rand(dtype, nelem)
    h_rhs = gen_rand(dtype, nelem)
    d_lhs = rmm.to_device(h_lhs)
    d_rhs = rmm.to_device(h_rhs)
    # ``np.bool`` was a deprecated alias of the builtin ``bool`` and was
    # removed in NumPy 1.24; ``np.bool_`` names the same dtype everywhere.
    d_result = rmm.device_array(d_lhs.size, dtype=np.bool_)

    col_lhs = new_column()
    col_rhs = new_column()
    col_result = new_column()
    gdf_dtype = get_dtype(dtype) if gdf_dtype is None else gdf_dtype

    libgdf.gdf_column_view(col_lhs, unwrap_devary(d_lhs), ffi.NULL, nelem,
                           gdf_dtype)
    libgdf.gdf_column_view(col_rhs, unwrap_devary(d_rhs), ffi.NULL, nelem,
                           gdf_dtype)
    # The result buffer is exposed to libgdf as an INT8 column
    libgdf.gdf_column_view(col_result, unwrap_devary(d_result), ffi.NULL,
                           nelem, libgdf.GDF_INT8)

    expect = expect_fn(h_lhs, h_rhs)
    test_fn(col_lhs, col_rhs, col_result)

    got = d_result.copy_to_host()
    print(expect, got)
    np.testing.assert_equal(expect, got)
def __setitem__(self, key, value):
    """
    Set the value of self[key] to value.

    *key* may be a unit-stride slice, a boolean mask of the same length as
    the column, or anything ``columnops.as_column`` accepts as indices.
    If value and self are of different types,
    value is coerced to self.dtype

    Raises
    ------
    NotImplementedError
        For a slice whose stride is not 1.
    ValueError
        For a boolean mask of the wrong length, or when *value* does not
        have as many elements as the indexing result.
    """
    import cudf.bindings.copying as cpp_copying
    from cudf.dataframe import columnops

    key_is_slice = isinstance(key, slice)
    if key_is_slice:
        first, last, stride = key.indices(len(self))
        if stride != 1:
            raise NotImplementedError("Stride not supported in slice")
        nelem = abs(last - first)
    else:
        key = columnops.as_column(key)
        if pd.api.types.is_bool_dtype(key.dtype):
            if len(key) != len(self):
                raise ValueError(
                    "Boolean mask must be of same length as column")
            # Turn the boolean mask into the positions it selects
            key = columnops.as_column(cudautils.arange(len(self)))[key]
        nelem = len(key)

    # A scalar is expanded to nelem elements before being written
    if utils.is_scalar(value):
        if is_categorical_dtype(self.dtype):
            from cudf.dataframe.categorical import CategoricalColumn
            from cudf.dataframe.buffer import Buffer
            from cudf.utils.cudautils import fill_value

            codes = rmm.device_array(nelem, dtype="int8")
            fill_value(codes, self._encode(value))
            value = CategoricalColumn(
                data=Buffer(codes),
                categories=self._categories,
                ordered=False,
            )
        elif value is None:
            value = columnops.column_empty(nelem, self.dtype, masked=True)
        else:
            target_dtype = pd.api.types.pandas_dtype(self.dtype)
            value = utils.scalar_broadcast_to(value, nelem, target_dtype)

    value = columnops.as_column(value).astype(self.dtype)

    if len(value) != nelem:
        msg = (f"Size mismatch: cannot set value "
               f"of size {len(value)} to indexing result of size "
               f"{nelem}")
        raise ValueError(msg)

    if key_is_slice:
        updated = cpp_copying.apply_copy_range(self, value, first, last, 0)
    else:
        updated = cpp_copying.apply_scatter(value, key, self)

    # Adopt the buffers of the freshly produced column
    self._data = updated.data
    self._mask = updated.mask
    self._update_null_count()
def test_prefixsum(dtype, nelem):
    """Compare ``libgdf.gdf_prefixsum`` (inclusive) with ``np.cumsum``.

    float32 results are compared to 4 decimals, everything else to 6.
    """
    # int8 input is restricted to [-2, 2) to keep data in range
    rand_kwargs = dict(low=-2, high=2) if dtype == np.int8 else {}
    data = gen_rand(dtype, nelem, **rand_kwargs)

    d_data = rmm.to_device(data)
    d_result = rmm.device_array(d_data.size, dtype=d_data.dtype)

    col_data = new_column()
    gdf_dtype = get_dtype(dtype)
    libgdf.gdf_column_view(col_data, unwrap_devary(d_data), ffi.NULL, nelem,
                           gdf_dtype)

    col_result = new_column()
    libgdf.gdf_column_view(col_result, unwrap_devary(d_result), ffi.NULL,
                           nelem, gdf_dtype)

    inclusive = True
    libgdf.gdf_prefixsum(col_data, col_result, inclusive)

    expect = np.cumsum(d_data.copy_to_host())
    got = d_result.copy_to_host()
    if not inclusive:
        # Exclusive scan: leading zero, and the total is dropped
        expect = expect[:-1]
        assert got[0] == 0
        got = got[1:]

    if dtype == np.float32:
        decimal = 4
    else:
        decimal = 6
    np.testing.assert_array_almost_equal(expect, got, decimal=decimal)
def test_strings_counts():
    """``nvtext.strings_counts`` on host output and on a device buffer."""
    strs = nvstrings.to_device(
        ["apples are green", "apples are a fruit", None, ""]
    )
    query_strings = nvstrings.to_device(['pl', 're'])

    # One row per string, one column per query
    expected = [[1, 2], [1, 1], [0, 0], [0, 0]]

    # host results
    host_counts = nvtext.strings_counts(strs, query_strings)
    assert host_counts == expected

    # device results
    out_shape = (strs.size(), query_strings.size())
    d_counts = rmm.device_array(out_shape, dtype=np.int32)
    nvtext.strings_counts(strs, query_strings,
                          devptr=d_counts.device_ctypes_pointer.value)
    assert np.array_equal(d_counts.copy_to_host(), expected)
def test_contains_strings():
    """``nvtext.contains_strings`` on host output and on a device buffer."""
    strs = nvstrings.to_device(
        ["apples are green", "apples are a fruit", None, ""]
    )
    query_strings = nvstrings.to_device(['apple', 'fruit'])

    # host results
    contains_outcome = nvtext.contains_strings(strs, query_strings)
    # One row per string, one column per query
    expected = [
        [True, False],
        [True, True],
        [False, False],
        [False, False]
    ]
    assert contains_outcome == expected

    # device results
    # ``np.bool`` was a deprecated alias of the builtin ``bool`` and was
    # removed in NumPy 1.24; ``np.bool_`` names the same dtype everywhere.
    outcome_darray = rmm.device_array((strs.size(), query_strings.size()),
                                      dtype=np.bool_)
    nvtext.contains_strings(strs, query_strings,
                            devptr=outcome_darray.device_ctypes_pointer.value)
    assert np.array_equal(outcome_darray.copy_to_host(), expected)
def on_gpu(words, func, arg=None, dtype=np.int32):
    """Call method *func* of *words*, writing into a new device array.

    Parameters
    ----------
    words : object
        Must provide ``size()`` and a method named *func*.
    func : str
        Name of the method to invoke on *words*.
    arg : optional
        When not ``None`` it is passed as the first positional argument,
        ahead of the output pointer.
    dtype : np.dtype like
        Element type of the output array.

    Returns
    -------
    res : device array with ``words.size()`` elements
    """
    res = librmm.device_array(words.size(), dtype=dtype)
    # Resolve the method by name instead of eval()-ing a built string:
    # same call, but no arbitrary code execution through *func*.
    method = getattr(words, func)
    out_ptr = res.device_ctypes_pointer.value
    if arg is None:
        method(out_ptr)
    else:
        method(arg, out_ptr)
    return res
def as_column(self):
    """Materialise this range as a ``NumericalColumn``.

    A non-empty range is generated with ``cudautils.arange`` from
    ``self._start`` to ``self._stop``; an empty one gets a zero-length
    device array of ``self.dtype``.
    """
    if len(self) == 0:
        values = rmm.device_array(0, dtype=self.dtype)
    else:
        values = cudautils.arange(self._start, self._stop, dtype=self.dtype)
    return NumericalColumn(data=Buffer(values), dtype=values.dtype,
                           name=self.name)
def expand_mask_bits(size, bits):
    """Expand bit-mask into byte-mask

    Allocates an int32 device array of *size* elements and, unless it is
    empty, fills it from *bits* with ``gpu_expand_mask_bits`` using at most
    1024 tasks.
    """
    expanded = rmm.device_array(size, dtype=np.int32)
    task_count = min(1024, expanded.size)
    if task_count <= 0:
        return expanded
    gpu_expand_mask_bits.forall(task_count)(bits, expanded)
    return expanded
def row_matrix(cols, nrow, ncol, dtype):
    """Pack *cols* into a newly allocated C-ordered device matrix.

    Item ``i`` of *cols* (via ``to_gpu_array()``) is written to matrix
    column ``i``; empty columns are skipped.  Returns the
    ``(nrow, ncol)`` device array.
    """
    matrix = rmm.device_array(shape=(nrow, ncol), dtype=dtype, order="C")
    for position, column in enumerate(cols):
        target = matrix[:, position]
        if target.size <= 0:
            # Nothing to launch for an empty column
            continue
        launcher = gpu_row_matrix.forall(target.size)
        launcher(target, column.to_gpu_array(), nrow, ncol)
    return matrix
def indices(self):
    """Return the category values as a ``Buffer``, computing them once.

    On first use an int32 device array of ``self.nvcategory.size()``
    elements is filled by ``self.nvcategory.values`` and cached in
    ``self._indices``.
    """
    if self._indices is not None:
        return self._indices

    values = rmm.device_array(self.nvcategory.size(), dtype='int32')
    self.nvcategory.values(devptr=get_ctype_ptr(values))
    self._indices = Buffer(values)
    return self._indices
def apply_reduce(fn, inp):
    """Run reduction *fn* on *inp* and return the first output element.

    *fn* is called as ``fn(inp.cffi_view, output_pointer, output_size)``
    where the output is a device array of ``inp.dtype`` sized by
    ``libgdf.gdf_reduce_optimal_output_size()``.
    """
    # The buffer doubles as output and temporary storage
    nslots = libgdf.gdf_reduce_optimal_output_size()
    buf = rmm.device_array(nslots, dtype=inp.dtype)
    fn(inp.cffi_view, unwrap_devary(buf), nslots)
    return buf[0]
def row_matrix(df):
    """Compute the C (row major) version gpu matrix of df

    The column-major (``order='F'``) GPU matrix of *df* is copied into a
    newly allocated row-major array of the same shape, staging the data
    through a shared-memory tile as in the algorithm documented in
    http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/

    :param df: a dataframe providing ``_cols``, ``__len__`` and
        ``as_gpu_matrix``.  The output dtype is that of its first column.
    :return: device array of shape ``(len(df), number of columns)`` in
        C order.

    Adapted from numba:
    https://github.com/numba/numba/blob/master/numba/cuda/kernels/transpose.py

    To be replaced by CUDA ml-prim in upcoming version
    """
    cols = [df._cols[k] for k in df._cols]
    ncol = len(cols)
    nrow = len(df)
    dtype = cols[0].dtype

    a = df.as_gpu_matrix(order='F')
    b = rmm.device_array((nrow, ncol), dtype=dtype, order='C')

    # Element type of the input, used for the shared tile.  The tile was
    # previously hard-coded to float32, which pushed every value through
    # float32 regardless of the matrix dtype.
    tile_dtype = numba.typeof(a).dtype

    tpb = driver.get_device().MAX_THREADS_PER_BLOCK

    tile_width = int(math.pow(2, math.log(tpb, 2) / 2))
    tile_height = int(tpb / tile_width)

    # One extra column of padding on the tile
    tile_shape = (tile_height, tile_width + 1)

    @cuda.jit
    def kernel(src, dst):
        tile = cuda.shared.array(shape=tile_shape, dtype=tile_dtype)

        tx = cuda.threadIdx.x
        ty = cuda.threadIdx.y
        bx = cuda.blockIdx.x * cuda.blockDim.x
        by = cuda.blockIdx.y * cuda.blockDim.y
        # Writes use the thread indices swapped relative to the reads
        y = by + tx
        x = bx + ty

        if by + ty < src.shape[0] and bx + tx < src.shape[1]:
            tile[ty, tx] = src[by + ty, bx + tx]
        cuda.syncthreads()
        if y < dst.shape[0] and x < dst.shape[1]:
            dst[y, x] = tile[tx, ty]

    # one block per tile, plus one for remainders
    blocks = (int((b.shape[1]) / tile_height + 1),
              int((b.shape[0]) / tile_width + 1))
    # one thread per tile element
    threads = tile_height, tile_width
    kernel[blocks, threads](a, b)

    return b
def column_empty_like(column, dtype, masked):
    """Allocate a new column like the given *column*

    The data buffer has ``len(column)`` elements of *dtype*.  When *masked*
    is true a mask from ``utils.make_mask`` is attached and the null count
    is set to the number of elements.
    """
    data = rmm.device_array(shape=len(column), dtype=dtype)
    kwargs = {"data": Buffer(data)}
    if masked:
        kwargs["mask"] = Buffer(utils.make_mask(data.size))
        kwargs["null_count"] = data.size
    return Column(**kwargs)
def scalar_broadcast_to(scalar, shape, dtype):
    """Return a device array of *shape* and *dtype* filled with *scalar*.

    A non-tuple *shape* is treated as a one-dimensional length.  An empty
    array is returned without being filled.
    """
    from .cudautils import fill_value

    dims = shape if isinstance(shape, tuple) else (shape, )
    filled = rmm.device_array(dims, dtype=dtype)
    if filled.size != 0:
        fill_value(filled, scalar)
    return filled
def column_hash_values(column0, *other_columns):
    """Hash all values in the given columns.
    Returns a new NumericalColumn[int32]

    The result has ``len(column0)`` elements and is filled by
    ``_gdf.hash_columns``.
    """
    to_hash = [column0, *other_columns]
    hashes = Buffer(rmm.device_array(len(column0), dtype=np.int32))
    out_col = NumericalColumn(data=hashes, dtype=hashes.dtype)
    _gdf.hash_columns(to_hash, out_col)
    return out_col
def allocate_mask(self, all_valid=True):
    """Return a new Column with a newly allocated mask buffer.
    If ``all_valid`` is True, the new mask is set to all valid.
    If ``all_valid`` is False, the new mask is set to all null.
    """
    nelem = len(self)
    if all_valid:
        fill_byte, nulls = 0xff, 0
    else:
        fill_byte, nulls = 0, nelem

    mask_nbytes = utils.calc_chunk_size(nelem, utils.mask_bitsize)
    mask = rmm.device_array(mask_nbytes, dtype=utils.mask_dtype)
    # An empty column has nothing to fill
    if nelem > 0:
        cudautils.fill_value(mask, fill_byte)
    return self.set_mask(mask=mask, null_count=nulls)
def column_hash_values(column0, *other_columns, initial_hash_values=None):
    """Hash all values in the given columns.
    Returns a new NumericalColumn[int32]

    A truthy *initial_hash_values* is copied to the device with
    ``rmm.to_device`` before being handed to ``cpp_hash.hash_columns``;
    otherwise it is passed through unchanged.
    """
    to_hash = [column0, *other_columns]
    hashes = Buffer(rmm.device_array(len(column0), dtype=np.int32))
    out_col = NumericalColumn(data=hashes, dtype=hashes.dtype)

    seeds = initial_hash_values
    if seeds:
        seeds = rmm.to_device(seeds)
    cpp_hash.hash_columns(to_hash, out_col, seeds)
    return out_col
def arange(start, stop=None, step=1, dtype=np.int64):
    """Create a device array holding an integer range.

    Parameters
    ----------
    start : int
        First value; when *stop* is omitted this is the stop value and the
        range starts at 0.
    stop : int, optional
        End of the range (exclusive).
    step : int
        Spacing between values; may be negative.
    dtype : np.dtype like
        Element type of the output.

    Returns
    -------
    out : device array
        Filled by ``gpu_arange(start, size, step, out)``.

    Raises
    ------
    ValueError
        If the range would contain no elements.
    """
    if stop is None:
        start, stop = 0, start

    # Ceiling division of (stop - start) by step that is valid for either
    # sign of step.  The former ``(stop - start + (step - 1)) // step`` is
    # only correct for positive steps and over-counts for negative ones
    # (e.g. arange(5, 0, -1) yielded 7 elements instead of 5).
    size = -((start - stop) // step)
    if size <= 0:
        msgfmt = "size={size} in arange({start}, {stop}, {step}, {dtype})"
        raise ValueError(msgfmt.format(size=size, start=start, stop=stop,
                                       step=step, dtype=dtype))
    out = rmm.device_array(size, dtype=dtype)
    gpu_arange.forall(size)(start, size, step, out)
    return out
def mask_assign_slot(size, mask):
    """Expand *mask* bits and compute their prefix sum.

    Returns ``(slots, sz)`` where *slots* is the ``prefixsum`` of the
    expanded mask and *sz* is its last element as an ``int``.
    """
    # expand bits into bytes; int64 is used once size reaches 2**31
    if size < 2 ** 31:
        slot_dtype = np.int32
    else:
        slot_dtype = np.int64
    expanded = rmm.device_array(size, dtype=slot_dtype)

    task_count = min(64 * 128, expanded.size)
    if task_count > 0:
        gpu_expand_mask_bits.forall(task_count)(mask, expanded)

    # compute prefixsum
    slots = prefixsum(expanded)
    total = int(slots[slots.size - 1])
    return slots, total
def column_empty_like_same_mask(column, dtype):
    """Create a new empty Column with the same length and the same mask.

    Parameters
    ----------
    column : Column
        Source of the length and, if ``column.has_null_mask``, of the mask
        (``column.nullmask`` is reused as is).
    dtype : np.dtype like
        The dtype of the data buffer.
    """
    fresh = rmm.device_array(shape=len(column), dtype=dtype)
    kwargs = {"data": Buffer(fresh)}
    if column.has_null_mask:
        kwargs["mask"] = column.nullmask
    return Column(**kwargs)