Example #1
def test_product(dtype, nelem):
    if np.dtype(dtype).kind == 'i':
        data = np.ones(nelem, dtype=dtype)
        # Set at most 30 items to [0..2) to keep the value within 2^32
        for _ in range(30):
            data[random.randrange(nelem)] = random.random() * 2
    else:
        data = gen_rand(dtype, nelem)

    print('max', data.max(), 'min', data.min())
    d_data = rmm.to_device(data)
    d_result = rmm.device_array(libgdf.gdf_reduce_optimal_output_size(),
                                dtype=d_data.dtype)

    col_data = new_column()
    gdf_dtype = get_dtype(dtype)

    libgdf.gdf_column_view(col_data, unwrap_devary(d_data), ffi.NULL, nelem,
                           gdf_dtype)

    libgdf.gdf_product_generic(col_data, unwrap_devary(d_result),
                               d_result.size)
    got = d_result.copy_to_host()[0]
    expect = np.prod(data)

    print('expect:', expect)
    print('got:', got)

    np.testing.assert_array_almost_equal(expect, got)
Example #2
def prefixsum(vals):
    """Compute the full prefixsum.

    Given an input of size N, the output size is N + 1.
    The first value is always 0.  The last value is the sum of *vals*.
    """

    import cudf.bindings.reduce as cpp_reduce
    from cudf.dataframe.numerical import NumericalColumn
    from cudf.dataframe.buffer import Buffer

    # Allocate output
    slots = rmm.device_array(shape=vals.size + 1, dtype=vals.dtype)
    # Fill 0 to slot[0]
    gpu_fill_value[1, 1](slots[:1], 0)

    # Compute the inclusive prefix sum of *vals* into slots[1:]
    in_col = NumericalColumn(data=Buffer(vals),
                             mask=None,
                             null_count=0,
                             dtype=vals.dtype)
    out_col = NumericalColumn(data=Buffer(slots[1:]),
                              mask=None,
                              null_count=0,
                              dtype=vals.dtype)
    cpp_reduce.apply_scan(in_col, out_col, "sum", inclusive=True)
    return slots
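
For intuition, a host-side NumPy sketch of the same contract (a hypothetical reference, not the GPU scan path): the output has one extra slot, starts at 0, and ends with the total sum.

import numpy as np

def prefixsum_host(vals):
    # Same contract as above, on the host: output size is N + 1,
    # out[0] == 0 and out[-1] == vals.sum().
    out = np.zeros(vals.size + 1, dtype=vals.dtype)
    np.cumsum(vals, out=out[1:])
    return out

print(prefixsum_host(np.array([3, 1, 4, 1, 5])))  # [ 0  3  4  8  9 14]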
Example #3
    def sort(self, segments, col_keys, col_vals):
        seg_dtype = np.uint32
        segsize_limit = 2 ** 16 - 1

        d_fullsegs = rmm.device_array(segments.size + 1, dtype=seg_dtype)
        d_begins = d_fullsegs[:-1]
        d_ends = d_fullsegs[1:]

        # Note: .astype is required below because .copy_to_device
        #       is just a plain memcpy
        d_begins.copy_to_device(cudautils.astype(segments, dtype=seg_dtype))
        d_ends[-1:].copy_to_device(np.require([self.nelem], dtype=seg_dtype))

        # The following handles the segment size limit imposed by the
        # maximum CUDA grid size.
        range0 = range(0, segments.size, segsize_limit)
        range1 = itertools.chain(range0[1:], [segments.size])
        for s, e in zip(range0, range1):
            segsize = e - s
            libgdf.gdf_segmented_radixsort_generic(self.plan,
                                                   col_keys.cffi_view,
                                                   col_vals.cffi_view,
                                                   segsize,
                                                   unwrap_devary(d_begins[s:]),
                                                   unwrap_devary(d_ends[s:]))
Example #4
def test_sum_of_squares(dtype, nelem):
    data = gen_rand(dtype, nelem)
    d_data = rmm.to_device(data)
    d_result = rmm.device_array(
        libgdf.gdf_reduction_get_intermediate_output_size(),
        dtype=d_data.dtype)

    col_data = new_column()
    gdf_dtype = get_dtype(dtype)

    libgdf.gdf_column_view(col_data, unwrap_devary(d_data), ffi.NULL, nelem,
                           gdf_dtype)

    libgdf.gdf_sum_of_squares(col_data, unwrap_devary(d_result), d_result.size)
    got = d_result.copy_to_host()[0]
    expect = (data**2).sum()

    print('expect:', expect)
    print('got:', got)

    if np.dtype(dtype).kind == 'i':
        if 0 <= expect <= np.iinfo(dtype).max:
            np.testing.assert_array_almost_equal(expect, got)
        else:
            print('overflow, passing')
    else:
        np.testing.assert_approx_equal(expect,
                                       got,
                                       significant=accuracy_for_dtype[dtype])
Example #5
def row_matrix(cols, nrow, ncol, dtype):
    matrix = rmm.device_array(shape=(nrow, ncol), dtype=dtype, order='C')
    for colidx, col in enumerate(cols):
        gpu_row_matrix.forall(matrix[:, colidx].size)(matrix[:, colidx],
                                                      col.to_gpu_array(),
                                                      nrow, ncol)
    return matrix
Example #6
def apply_equal_constant(arr, mask, val, dtype):
    """Compute ``arr[mask] == val``

    Parameters
    ----------
    arr : device array
        data
    mask : device array
        validity mask
    val : scalar
        value to compare against
    dtype : np.dtype
        output array dtype

    Returns
    -------
    result : device array
    """
    out = rmm.device_array(shape=arr.size, dtype=dtype)
    if out.size > 0:
        if mask is not None:
            configured = gpu_equal_constant_masked.forall(out.size)
            configured(arr, mask, val, out)
        else:
            configured = gpu_equal_constant.forall(out.size)
            configured(arr, val, out)
    return out
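
A host-side sketch of the comparison semantics (a hypothetical NumPy reference that ignores the validity-mask path): every element is compared against the scalar and the result is materialised with the requested output dtype.

import numpy as np

arr = np.array([1, 2, 2, 5])
out = (arr == 2).astype(np.int8)  # requested output dtype
print(out)  # [0 1 1 0]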
Example #7
def test_output_dtype_mismatch():
    lhs_dtype = np.int32
    rhs_dtype = np.int32
    nelem = 5
    h_lhs = np.arange(nelem, dtype=lhs_dtype)
    h_rhs = np.arange(nelem, dtype=rhs_dtype)

    d_lhs = rmm.to_device(h_lhs)
    d_rhs = rmm.to_device(h_rhs)
    d_result = rmm.device_array(d_lhs.size, dtype=np.float32)

    col_lhs = new_column()
    col_rhs = new_column()
    col_result = new_column()

    libgdf.gdf_column_view(col_lhs, unwrap_devary(d_lhs), ffi.NULL, nelem,
                           get_dtype(lhs_dtype))
    libgdf.gdf_column_view(col_rhs, unwrap_devary(d_rhs), ffi.NULL, nelem,
                           get_dtype(rhs_dtype))
    libgdf.gdf_column_view(col_result, unwrap_devary(d_result), ffi.NULL,
                           nelem, get_dtype(d_result.dtype))

    with pytest.raises(GDFError) as raises:
        libgdf.gdf_add_generic(col_lhs, col_rhs, col_result)
    raises.match("GDF_UNSUPPORTED_DTYPE")

    with pytest.raises(GDFError) as raises:
        libgdf.gdf_eq_generic(col_lhs, col_rhs, col_result)
    raises.match("GDF_UNSUPPORTED_DTYPE")

    with pytest.raises(GDFError) as raises:
        libgdf.gdf_bitwise_and_generic(col_lhs, col_rhs, col_result)
    raises.match("GDF_UNSUPPORTED_DTYPE")
Example #8
def gather(data, index, out=None):
    """Perform ``out = data[index]`` on the GPU
    """
    if out is None:
        out = rmm.device_array(shape=index.size, dtype=data.dtype)
    gpu_gather.forall(index.size)(data, index, out)
    return out
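
The GPU kernel reproduces plain fancy indexing; a host-side NumPy equivalent (a sketch, not the device path):

import numpy as np

data = np.array([10.0, 20.0, 30.0, 40.0])
index = np.array([3, 0, 0, 2])
print(data[index])  # out = data[index] -> [40. 10. 10. 30.]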
Example #9
    def __init__(self, data, null_count=None, **kwargs):
        """
        Parameters
        ----------
        data : nvstrings.nvstrings
            The nvstrings object
        null_count : int, optional
            The number of null values in the mask.
        """
        from collections.abc import Sequence
        if isinstance(data, Sequence):
            data = nvstrings.to_device(data)
        assert isinstance(data, nvstrings.nvstrings)
        self._data = data
        self._dtype = np.dtype("object")

        if null_count is None:
            null_count = data.null_count()
        self._null_count = null_count
        self._mask = None
        if self._null_count > 0:
            mask_size = utils.calc_chunk_size(len(self.data),
                                              utils.mask_bitsize)
            out_mask_arr = rmm.device_array(mask_size, dtype='int8')
            out_mask_ptr = get_ctype_ptr(out_mask_arr)
            self.data.set_null_bitmask(out_mask_ptr, bdevmem=True)
            self._mask = Buffer(out_mask_arr)
        self._nvcategory = None
        self._indices = None
Example #10
def find_segments(arr, segs=None, markers=None):
    """Find beginning indices of runs of equal values.

    Parameters
    ----------
    arr : device array
        The operand.
    segs : device array, optional
        Segment offsets that must exist in the output.
    markers : device array, optional
        Pre-allocated int32 array of length arr.size used to mark segment
        beginnings.  Allocated internally when omitted.

    Returns
    -------
    starting_indices : device array
        The starting index of each segment.  The total segment count
        equals the length of this array.
    markers : device array
        The int32 marker array that was used (or allocated).

    """
    # Compute diffs of consecutive elements
    null_markers = markers is None
    if null_markers:
        markers = zeros(arr.size, dtype=np.int32)
    else:
        assert markers.size == arr.size
        assert markers.dtype == np.dtype(np.int32), markers.dtype
    gpu_mark_segment_begins.forall(markers.size)(arr, markers)
    if segs is not None and null_markers:
        gpu_mark_seg_segments.forall(segs.size)(segs, markers)
    # Compute index of marked locations
    slots = prefixsum(markers)
    ct = slots[slots.size - 1]
    scanned = slots[:-1]
    # Compact segments
    begins = rmm.device_array(shape=int(ct), dtype=np.intp)
    gpu_scatter_segment_begins.forall(markers.size)(markers, scanned, begins)
    return begins, markers
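
A host-side sketch of the marking step (a hypothetical NumPy reference): position 0 and every position whose value differs from its predecessor begin a segment.

import numpy as np

def find_segments_host(arr):
    # Mark segment starts, then compact their indices.
    markers = np.ones(arr.size, dtype=np.int32)
    markers[1:] = arr[1:] != arr[:-1]
    return np.flatnonzero(markers)

print(find_segments_host(np.array([1, 1, 2, 2, 2, 7])))  # [0 2 5]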
Example #11
def logical_op_test(dtype, expect_fn, test_fn, nelem=128, gdf_dtype=None):
    h_lhs = gen_rand(dtype, nelem)
    h_rhs = gen_rand(dtype, nelem)
    d_lhs = rmm.to_device(h_lhs)
    d_rhs = rmm.to_device(h_rhs)
    d_result = rmm.device_array(d_lhs.size, dtype=np.bool_)

    col_lhs = new_column()
    col_rhs = new_column()
    col_result = new_column()
    gdf_dtype = get_dtype(dtype) if gdf_dtype is None else gdf_dtype

    libgdf.gdf_column_view(col_lhs, unwrap_devary(d_lhs), ffi.NULL, nelem,
                           gdf_dtype)
    libgdf.gdf_column_view(col_rhs, unwrap_devary(d_rhs), ffi.NULL, nelem,
                           gdf_dtype)
    libgdf.gdf_column_view(col_result, unwrap_devary(d_result), ffi.NULL,
                           nelem, libgdf.GDF_INT8)

    expect = expect_fn(h_lhs, h_rhs)
    test_fn(col_lhs, col_rhs, col_result)

    got = d_result.copy_to_host()
    print(expect, got)
    np.testing.assert_equal(expect, got)
Example #12
    def __setitem__(self, key, value):
        """
        Set the value of self[key] to value.

        If value and self are of different types,
        value is coerced to self.dtype
        """
        import cudf.bindings.copying as cpp_copying
        from cudf.dataframe import columnops

        if isinstance(key, slice):
            key_start, key_stop, key_stride = key.indices(len(self))
            if key_stride != 1:
                raise NotImplementedError("Stride not supported in slice")
            nelem = abs(key_stop - key_start)
        else:
            key = columnops.as_column(key)
            if pd.api.types.is_bool_dtype(key.dtype):
                if not len(key) == len(self):
                    raise ValueError(
                        "Boolean mask must be of same length as column")
                key = columnops.as_column(cudautils.arange(len(self)))[key]
            nelem = len(key)

        if utils.is_scalar(value):
            if is_categorical_dtype(self.dtype):
                from cudf.dataframe.categorical import CategoricalColumn
                from cudf.dataframe.buffer import Buffer
                from cudf.utils.cudautils import fill_value

                data = rmm.device_array(nelem, dtype="int8")
                fill_value(data, self._encode(value))
                value = CategoricalColumn(
                    data=Buffer(data),
                    categories=self._categories,
                    ordered=False,
                )
            elif value is None:
                value = columnops.column_empty(nelem, self.dtype, masked=True)
            else:
                to_dtype = pd.api.types.pandas_dtype(self.dtype)
                value = utils.scalar_broadcast_to(value, nelem, to_dtype)

        value = columnops.as_column(value).astype(self.dtype)

        if len(value) != nelem:
            msg = (f"Size mismatch: cannot set value "
                   f"of size {len(value)} to indexing result of size "
                   f"{nelem}")
            raise ValueError(msg)

        if isinstance(key, slice):
            out = cpp_copying.apply_copy_range(self, value, key_start,
                                               key_stop, 0)
        else:
            out = cpp_copying.apply_scatter(value, key, self)

        self._data = out.data
        self._mask = out.mask
        self._update_null_count()
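
A host-side sketch of the coercion rule described in the docstring (hypothetical, using a NumPy array in place of a cudf column): the assigned value is cast to the column's dtype before the scatter or copy is applied.

import numpy as np

col = np.array([1, 2, 3, 4], dtype=np.int32)
value = np.asarray(2.9).astype(col.dtype)  # coerced to int32 -> 2
col[1:3] = value
print(col)  # [1 2 2 4]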
Example #13
def test_prefixsum(dtype, nelem):
    if dtype == np.int8:
        # to keep data in range
        data = gen_rand(dtype, nelem, low=-2, high=2)
    else:
        data = gen_rand(dtype, nelem)
    d_data = rmm.to_device(data)
    d_result = rmm.device_array(d_data.size, dtype=d_data.dtype)

    col_data = new_column()
    gdf_dtype = get_dtype(dtype)
    libgdf.gdf_column_view(col_data, unwrap_devary(d_data), ffi.NULL, nelem,
                           gdf_dtype)

    col_result = new_column()
    libgdf.gdf_column_view(col_result, unwrap_devary(d_result), ffi.NULL,
                           nelem, gdf_dtype)

    inclusive = True
    libgdf.gdf_prefixsum(col_data, col_result, inclusive)

    expect = np.cumsum(d_data.copy_to_host())
    got = d_result.copy_to_host()
    if not inclusive:
        expect = expect[:-1]
        assert got[0] == 0
        got = got[1:]

    decimal = 4 if dtype == np.float32 else 6
    np.testing.assert_array_almost_equal(expect, got, decimal=decimal)
Example #14
def test_strings_counts():
    strs = nvstrings.to_device(
        ["apples are green",
         "apples are a fruit",
         None,
         ""]
    )

    query_strings = nvstrings.to_device(['pl', 're'])

    # host results
    contains_outcome = nvtext.strings_counts(strs, query_strings)
    expected = [
        [1, 2],
        [1, 1],
        [0, 0],
        [0, 0]
    ]
    assert contains_outcome == expected

    # device results
    outcome_darray = rmm.device_array((strs.size(), query_strings.size()),
                                      dtype=np.int32)
    nvtext.strings_counts(strs, query_strings,
                          devptr=outcome_darray.device_ctypes_pointer.value)
    assert np.array_equal(outcome_darray.copy_to_host(), expected)
Example #15
def test_contains_strings():
    strs = nvstrings.to_device(
        ["apples are green",
         "apples are a fruit",
         None,
         ""]
    )

    query_strings = nvstrings.to_device(['apple', 'fruit'])

    # host results
    contains_outcome = nvtext.contains_strings(strs, query_strings)
    expected = [
        [True, False],
        [True, True],
        [False, False],
        [False, False]
    ]
    assert contains_outcome == expected

    # device results
    outcome_darray = rmm.device_array((strs.size(), query_strings.size()),
                                      dtype=np.bool_)
    nvtext.contains_strings(strs, query_strings,
                            devptr=outcome_darray.device_ctypes_pointer.value)
    assert np.array_equal(outcome_darray.copy_to_host(), expected)
Example #16
def on_gpu(words, func, arg=None, dtype=np.int32):
    res = librmm.device_array(words.size(), dtype=dtype)
    # Resolve the requested method on *words* by name and call it with the
    # output device pointer (and the optional extra argument).
    method = getattr(words, func)
    if arg is None:
        method(res.device_ctypes_pointer.value)
    else:
        method(arg, res.device_ctypes_pointer.value)
    return res
Example #17
    def as_column(self):
        if len(self) > 0:
            vals = cudautils.arange(self._start, self._stop, dtype=self.dtype)
        else:
            vals = rmm.device_array(0, dtype=self.dtype)
        return NumericalColumn(data=Buffer(vals),
                               dtype=vals.dtype,
                               name=self.name)
Example #18
def expand_mask_bits(size, bits):
    """Expand bit-mask into byte-mask
    """
    expanded_mask = rmm.device_array(size, dtype=np.int32)
    numtasks = min(1024, expanded_mask.size)
    if numtasks > 0:
        gpu_expand_mask_bits.forall(numtasks)(bits, expanded_mask)
    return expanded_mask
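
A host-side sketch of the expansion (a hypothetical reference, assuming an LSB-first bitmask as used on the device): bit i of the packed mask becomes element i of the output.

import numpy as np

def expand_mask_bits_host(size, bits):
    expanded = np.empty(size, dtype=np.int32)
    for i in range(size):
        expanded[i] = (int(bits[i // 8]) >> (i % 8)) & 1
    return expanded

print(expand_mask_bits_host(4, np.array([0b1011], dtype=np.uint8)))  # [1 1 0 1]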
Example #19
def row_matrix(cols, nrow, ncol, dtype):
    matrix = rmm.device_array(shape=(nrow, ncol), dtype=dtype, order="C")
    for colidx, col in enumerate(cols):
        data = matrix[:, colidx]
        if data.size > 0:
            gpu_row_matrix.forall(data.size)(data, col.to_gpu_array(), nrow,
                                             ncol)
    return matrix
Example #20
    def indices(self):
        if self._indices is None:
            out_dev_arr = rmm.device_array(self.nvcategory.size(),
                                           dtype='int32')
            ptr = get_ctype_ptr(out_dev_arr)
            self.nvcategory.values(devptr=ptr)
            self._indices = Buffer(out_dev_arr)
        return self._indices
Example #21
def apply_reduce(fn, inp):
    # allocate output+temp array
    outsz = libgdf.gdf_reduce_optimal_output_size()
    out = rmm.device_array(outsz, dtype=inp.dtype)
    # call reduction
    fn(inp.cffi_view, unwrap_devary(out), outsz)
    # return 1st element
    return out[0]
Example #22
def row_matrix(df):
    """Compute the C (row major) version gpu matrix of df

    This implements the algorithm documented in
    http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/

    :param df: a cudf DataFrame whose columns share a single dtype.  The frame
        is first copied to a column-major (Fortran order) device matrix and
        then transposed into row-major order on the GPU.

    Adapted from numba:
    https://github.com/numba/numba/blob/master/numba/cuda/kernels/transpose.py

    To be replaced by CUDA ml-prim in upcoming version
    """

    cols = [df._cols[k] for k in df._cols]
    ncol = len(cols)
    nrow = len(df)
    dtype = cols[0].dtype

    a = df.as_gpu_matrix(order='F')
    b = rmm.device_array((nrow, ncol), dtype=dtype, order='C')
    dtype = numba.typeof(a)

    tpb = driver.get_device().MAX_THREADS_PER_BLOCK

    tile_width = int(math.pow(2, math.log(tpb, 2) / 2))
    tile_height = int(tpb / tile_width)

    tile_shape = (tile_height, tile_width + 1)

    @cuda.jit
    def kernel(input, output):

        tile = cuda.shared.array(shape=tile_shape, dtype=numba.float32)

        tx = cuda.threadIdx.x
        ty = cuda.threadIdx.y
        bx = cuda.blockIdx.x * cuda.blockDim.x
        by = cuda.blockIdx.y * cuda.blockDim.y
        y = by + tx
        x = bx + ty

        if by + ty < input.shape[0] and bx + tx < input.shape[1]:
            tile[ty, tx] = input[by + ty, bx + tx]
        cuda.syncthreads()
        if y < output.shape[0] and x < output.shape[1]:
            output[y, x] = tile[tx, ty]

    # one block per tile, plus one for remainders
    blocks = int((b.shape[1]) / tile_height +
                 1), int((b.shape[0]) / tile_width + 1)
    # one thread per tile element
    threads = tile_height, tile_width
    kernel[blocks, threads](a, b)

    return b
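
What the kernel produces, sketched on the host with NumPy (a hypothetical reference; the real work happens on the device): the column-major matrix built from the DataFrame is rewritten in row-major (C) order, so the values are unchanged and only the memory layout differs. The extra shared-memory column (tile_width + 1) is the padding trick from the referenced transpose post that avoids shared-memory bank conflicts.

import numpy as np

a = np.asfortranarray(np.arange(12, dtype=np.float32).reshape(3, 4))  # like order='F'
b = np.ascontiguousarray(a)                                           # like order='C'
assert np.array_equal(a, b) and b.flags["C_CONTIGUOUS"]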
Example #23
def column_empty_like(column, dtype, masked):
    """Allocate a new column like the given *column*
    """
    data = rmm.device_array(shape=len(column), dtype=dtype)
    params = dict(data=Buffer(data))
    if masked:
        mask = utils.make_mask(data.size)
        params.update(dict(mask=Buffer(mask), null_count=data.size))
    return Column(**params)
Example #24
def scalar_broadcast_to(scalar, shape, dtype):
    from .cudautils import fill_value

    if not isinstance(shape, tuple):
        shape = (shape, )
    da = rmm.device_array(shape, dtype=dtype)
    if da.size != 0:
        fill_value(da, scalar)
    return da
Example #25
def column_hash_values(column0, *other_columns):
    """Hash all values in the given columns.
    Returns a new NumericalColumn[int32]
    """
    columns = [column0] + list(other_columns)
    buf = Buffer(rmm.device_array(len(column0), dtype=np.int32))
    result = NumericalColumn(data=buf, dtype=buf.dtype)
    _gdf.hash_columns(columns, result)
    return result
Example #26
    def allocate_mask(self, all_valid=True):
        """Return a new Column with a newly allocated mask buffer.
        If ``all_valid`` is True, the new mask is set to all valid.
        If ``all_valid`` is False, the new mask is set to all null.
        """
        nelem = len(self)
        mask_sz = utils.calc_chunk_size(nelem, utils.mask_bitsize)
        mask = rmm.device_array(mask_sz, dtype=utils.mask_dtype)
        if nelem > 0:
            cudautils.fill_value(mask, 0xff if all_valid else 0)
        return self.set_mask(mask=mask, null_count=0 if all_valid else nelem)
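
A host-side sketch of the mask sizing (hypothetical values; assumes utils.mask_bitsize is 8 and that calc_chunk_size is a ceiling division): 10 elements need ceil(10 / 8) = 2 mask bytes, filled with 0xff when every value is valid.

import numpy as np

nelem, mask_bitsize = 10, 8
mask_sz = (nelem + mask_bitsize - 1) // mask_bitsize  # ceiling division -> 2
mask = np.full(mask_sz, 0xff, dtype=np.uint8)
print(mask_sz, mask)  # 2 [255 255]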
Example #27
def column_hash_values(column0, *other_columns, initial_hash_values=None):
    """Hash all values in the given columns.
    Returns a new NumericalColumn[int32]
    """
    columns = [column0] + list(other_columns)
    buf = Buffer(rmm.device_array(len(column0), dtype=np.int32))
    result = NumericalColumn(data=buf, dtype=buf.dtype)
    if initial_hash_values:
        initial_hash_values = rmm.to_device(initial_hash_values)
    cpp_hash.hash_columns(columns, result, initial_hash_values)
    return result
Example #28
def arange(start, stop=None, step=1, dtype=np.int64):
    if stop is None:
        start, stop = 0, start
    size = (stop - start + (step - 1)) // step
    if size <= 0:
        msgfmt = "size={size} in arange({start}, {stop}, {step}, {dtype})"
        raise ValueError(msgfmt.format(size=size, start=start, stop=stop,
                                       step=step, dtype=dtype))
    out = rmm.device_array(size, dtype=dtype)
    gpu_arange.forall(size)(start, size, step, out)
    return out
Example #29
def mask_assign_slot(size, mask):
    # expand bits into bytes
    dtype = (np.int32 if size < 2 ** 31 else np.int64)
    expanded_mask = rmm.device_array(size, dtype=dtype)
    numtasks = min(64 * 128, expanded_mask.size)
    if numtasks > 0:
        gpu_expand_mask_bits.forall(numtasks)(mask, expanded_mask)

    # compute prefixsum
    slots = prefixsum(expanded_mask)
    sz = int(slots[slots.size - 1])
    return slots, sz
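
A host-side sketch of the slot assignment (a hypothetical NumPy reference): after the prefix sum, slots[i] is the destination index of valid element i in the compacted output, and the final entry is the number of valid elements.

import numpy as np

valid = np.array([1, 0, 1, 1, 0], dtype=np.int32)  # already-expanded mask bits
slots = np.zeros(valid.size + 1, dtype=np.int64)
np.cumsum(valid, out=slots[1:])
print(slots, int(slots[-1]))  # [0 1 1 2 3 3] 3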
Example #30
def column_empty_like_same_mask(column, dtype):
    """Create a new empty Column with the same length and the same mask.

    Parameters
    ----------
    dtype : np.dtype like
        The dtype of the data buffer.
    """
    data = rmm.device_array(shape=len(column), dtype=dtype)
    params = dict(data=Buffer(data))
    if column.has_null_mask:
        params.update(mask=column.nullmask)
    return Column(**params)