Example #1
0
def buffers_from_pyarrow(pa_arr, dtype=None):
    """Build the (mask, data) ``Buffer`` pair for a pyarrow array.

    Parameters
    ----------
    pa_arr : pyarrow array
        Source array.  ``pa_arr.buffers()[0]`` is used as the validity
        buffer and ``pa_arr.buffers()[1]`` as the data buffer.
    dtype : optional
        Dtype used to reinterpret the data buffer.  When ``None`` the dtype
        is taken from the arrow type (from the indices type when *pa_arr*
        is a ``pa.DictionaryArray``).

    Returns
    -------
    tuple
        ``(pamask, padata)``.  ``pamask`` is ``None`` when the first arrow
        buffer is missing/empty; ``padata`` is an empty ``Buffer`` of the
        resolved dtype when the second arrow buffer is missing/empty.
    """
    from cudf.dataframe.buffer import Buffer
    from cudf.utils.cudautils import copy_array

    buffers = pa_arr.buffers()

    if buffers[0]:
        # Stage the arrow validity bytes on the device and copy them into
        # the array returned by make_mask.
        # NOTE(review): pa_arr.offset is applied to the data below but not
        # to the validity buffer here -- confirm this is intended.
        mask_dev_array = make_mask(len(pa_arr))
        arrow_dev_array = rmm.to_device(np.array(buffers[0]).view('int8'))
        copy_array(arrow_dev_array, mask_dev_array)
        pamask = Buffer(mask_dev_array)
    else:
        pamask = None

    # Compare against None explicitly: an np.dtype without fields has a
    # length of 0 and can therefore evaluate as falsy, which would silently
    # discard a dtype the caller passed in.
    if dtype is not None:
        new_dtype = dtype
    elif isinstance(pa_arr, pa.DictionaryArray):
        new_dtype = pa_arr.indices.type.to_pandas_dtype()
    else:
        new_dtype = pa_arr.type.to_pandas_dtype()

    if buffers[1]:
        # Reinterpret the raw bytes, then keep only this array's window
        start = pa_arr.offset
        host_data = np.array(buffers[1]).view(new_dtype)
        padata = Buffer(host_data[start:start + len(pa_arr)])
    else:
        padata = Buffer(np.empty(0, dtype=new_dtype))
    return (pamask, padata)
Example #2
0
def column_empty(row_count, dtype, masked, categories=None):
    """Allocate an uninitialised column of *row_count* rows for *dtype*.

    When *masked* is true a mask from ``cudautils.make_empty_mask`` is
    attached.  Categorical columns (explicit *categories* or a categorical
    *dtype*) get a codes buffer sized by ``min_scalar_type``; object/unicode
    dtypes are built through nvstrings; everything else gets a plain device
    array of *dtype*.
    """
    from cudf.dataframe.columnops import build_column

    dtype = pd.api.types.pandas_dtype(dtype)
    raw_mask = cudautils.make_empty_mask(row_count) if masked else None

    # Fall back to the categories carried by a categorical dtype
    if categories is None and is_categorical_dtype(dtype):
        categories = dtype.categories if dtype.categories is not None else []

    shape = (row_count, )
    if categories is not None:
        codes_dtype = min_scalar_type(len(categories))
        data = Buffer(rmm.device_array(shape, dtype=codes_dtype))
        dtype = "category"
    elif dtype.kind in "OU":
        if row_count == 0:
            data = nvstrings.to_device([])
        else:
            scratch = rmm.device_array(shape, dtype="float64")
            data = nvstrings.dtos(scratch,
                                  len(scratch),
                                  nulls=raw_mask,
                                  bdevmem=True)
    else:
        data = Buffer(rmm.device_array(shape, dtype=dtype))

    mask = None if raw_mask is None else Buffer(raw_mask)
    return build_column(data, dtype, mask, categories)
Example #3
0
    def from_mem_views(data_mem, mask_mem=None, null_count=None, name=None):
        """Build a Column from a data device array (or an nvstrings object)
           plus an optional mask device array.

           nvstrings input is forwarded as-is with an ``object`` dtype and
           *mask_mem* is not used; any other input is wrapped in ``Buffer``.
        """
        from cudf.dataframe import columnops

        # String data: nvstrings is passed straight through
        if isinstance(data_mem, nvstrings.nvstrings):
            return columnops.build_column(
                name=name,
                buffer=data_mem,
                dtype=np.dtype("object"),
                null_count=null_count,
            )

        data_buf = Buffer(data_mem)
        mask_buf = Buffer(mask_mem) if mask_mem is not None else None
        return columnops.build_column(
            name=name,
            buffer=data_buf,
            dtype=data_mem.dtype,
            mask=mask_buf,
            null_count=null_count,
        )
Example #4
0
    def take(self, indices, ignore_index=False):
        """Return a Series made of the values found at *indices*.

        With *ignore_index* the result gets a fresh ``RangeIndex``;
        otherwise the index is taken at the same *indices*.
        """
        gather_map = Buffer(indices).to_gpu_array()

        # Nothing requested: return zero-length slices of data and index
        if gather_map.size == 0:
            return self._copy_construct(data=self.data[:0],
                                        index=self.index[:0])

        # Object dtype is delegated to __getitem__
        if self.dtype == np.dtype("object"):
            return self[gather_map]

        gathered = cudautils.gather(data=self.data.to_gpu_array(),
                                    index=gather_map)

        taken_mask = None
        if self._column.mask:
            mask_series = self._get_mask_as_series()
            taken_mask = Buffer(mask_series.take(gather_map).as_mask())

        if ignore_index:
            new_index = RangeIndex(gather_map.size)
        else:
            new_index = self.index.take(gather_map)

        new_col = self._column.replace(data=Buffer(gathered), mask=taken_mask)
        return self._copy_construct(data=new_col, index=new_index)
Example #5
0
def _mask_from_cuda_array_interface_desc(desc):
    """Turn the optional ``"mask"`` entry of *desc* into a ``Buffer``.

    Returns ``None`` when *desc* has no mask.  The mask object's own
    ``__cuda_array_interface__`` is inspected: typecode ``"t"`` is wrapped
    directly with ``calc_chunk_size(nelem, mask_bitsize)`` elements of
    ``mask_dtype``; typecode ``"b"`` is wrapped with its own dtype and then
    passed through ``compact_mask_bytes``.

    Raises ``NotImplementedError`` for any other typecode.
    """
    from cudf.utils.utils import calc_chunk_size, mask_dtype, mask_bitsize
    from cudf.utils.cudautils import compact_mask_bytes

    mask = desc.get("mask", None)
    if mask is None:
        return None

    mask_desc = mask.__cuda_array_interface__
    ptr = mask_desc["data"][0]
    nelem = mask_desc["shape"][0]
    typestr = mask_desc["typestr"]
    typecode = typestr[1]

    if typecode == "t":
        bits = rmm.device_array_from_ptr(
            ptr,
            nelem=calc_chunk_size(nelem, mask_bitsize),
            dtype=mask_dtype,
            finalizer=None,
        )
        return Buffer(bits)

    if typecode == "b":
        bools = rmm.device_array_from_ptr(
            ptr,
            nelem=nelem,
            dtype=np.dtype(typestr),
            finalizer=None,
        )
        return Buffer(compact_mask_bytes(bools))

    raise NotImplementedError(f"Cannot infer mask from typestr {typestr}")
Example #6
0
def prefixsum(vals):
    """Compute the full prefixsum of *vals*.

    For an input of N elements the output holds N + 1 elements: slot 0 is
    filled with 0 and the remaining N slots receive the inclusive ``'sum'``
    scan of *vals*, so the last slot is the sum of *vals*.
    """

    import cudf.bindings.reduce as cpp_reduce
    from cudf.dataframe.numerical import NumericalColumn
    from cudf.dataframe.buffer import Buffer

    out_dtype = vals.dtype

    def _as_column(mem):
        # Null-free NumericalColumn wrapper around a device array
        return NumericalColumn(data=Buffer(mem),
                               mask=None,
                               null_count=0,
                               dtype=out_dtype)

    # One extra leading slot, set to 0
    slots = rmm.device_array(shape=vals.size + 1, dtype=out_dtype)
    gpu_fill_value[1, 1](slots[:1], 0)

    # The scan writes into slots[1:], leaving the leading 0 in place
    cpp_reduce.apply_scan(_as_column(vals),
                          _as_column(slots[1:]),
                          'sum',
                          inclusive=True)
    return slots
Example #7
0
def buffers_from_pyarrow(pa_arr, dtype=None):
    """Build the (mask, data) ``Buffer`` pair for a pyarrow array.

    Parameters
    ----------
    pa_arr : pyarrow array
        Source array.  ``pa_arr.buffers()[0]`` is used as the validity
        buffer and ``pa_arr.buffers()[1]`` as the data buffer.
    dtype : optional
        Dtype used to reinterpret the data buffer.  When ``None`` the dtype
        is taken from the arrow type (from the indices type when *pa_arr*
        is a ``pa.DictionaryArray``).

    Returns
    -------
    tuple
        ``(pamask, padata)``.  ``pamask`` is ``None`` when the first arrow
        buffer is missing/empty; ``padata`` is an empty ``Buffer`` of the
        resolved dtype when the second arrow buffer is missing/empty.
    """
    from cudf.dataframe.buffer import Buffer

    buffers = pa_arr.buffers()

    if buffers[0]:
        # NOTE(review): pa_arr.offset is applied to the data below but not
        # to the validity buffer here -- confirm this is intended.
        pamask = Buffer(np.array(buffers[0]).view('int8'))
    else:
        pamask = None

    # Compare against None explicitly: an np.dtype without fields has a
    # length of 0 and can therefore evaluate as falsy, which would silently
    # discard a dtype the caller passed in.
    if dtype is not None:
        new_dtype = dtype
    elif isinstance(pa_arr, pa.DictionaryArray):
        new_dtype = pa_arr.indices.type.to_pandas_dtype()
    else:
        new_dtype = pa_arr.type.to_pandas_dtype()

    if buffers[1]:
        # Reinterpret the raw bytes, then keep only this array's window
        start = pa_arr.offset
        host_data = np.array(buffers[1]).view(new_dtype)
        padata = Buffer(host_data[start:start + len(pa_arr)])
    else:
        padata = Buffer(np.empty(0, dtype=new_dtype))
    return (pamask, padata)
Example #8
0
def column_empty_like(column, dtype, masked):
    """Allocate a new Column with the length of *column* and the given
    *dtype*.

    When *masked* is true a mask from ``utils.make_mask`` is attached and
    ``null_count`` is set to the column length.
    """
    mem = rmm.device_array(shape=len(column), dtype=dtype)
    kwargs = {"data": Buffer(mem)}
    if masked:
        kwargs["mask"] = Buffer(utils.make_mask(mem.size))
        kwargs["null_count"] = mem.size
    return Column(**kwargs)
Example #9
0
 def value_counts(self, method='sort'):
     """Return ``(values, counts)`` columns for this column.

     Only ``method='sort'`` is supported; anything else raises
     ``NotImplementedError``.
     """
     if method != 'sort':
         raise NotImplementedError(
             'non sort based value_count() not implemented yet')

     segments, ordered = self._unique_segments()
     # Gather the value at each segment position, then count per segment
     uniques = cudautils.gather(data=ordered, index=segments)
     counts = cudautils.value_count(segments, len(ordered))

     values_col = self.replace(data=Buffer(uniques), mask=None)
     counts_col = NumericalColumn(data=Buffer(counts), dtype=np.intp)
     return values_col, counts_col
Example #10
0
        def from_cffi_view(cffi_view):
            """Build a Column from a cffi ``gdf_column*`` struct.

            The mask is attached only when the view provides one.
            """
            data_mem, mask_mem = _gdf.cffi_view_to_column_mem(cffi_view)
            data_buf = Buffer(data_mem)
            mask_buf = None if mask_mem is None else Buffer(mask_mem)
            return Column(data=data_buf, mask=mask_buf)
Example #11
0
    def _concat(cls, objs, dtype=None):
        """Concatenate the columns in *objs* into a single column.

        With no inputs, an empty column is built from *dtype*.  String
        columns are joined through nvstrings.  Categorical columns are first
        re-coded against the combined set of their values.  All inputs must
        be type-equivalent to the first one, otherwise ``ValueError`` is
        raised.
        """
        from cudf.dataframe.string import StringColumn
        from cudf.dataframe.categorical import CategoricalColumn

        # Nothing to concatenate: build an empty column for *dtype*
        if len(objs) == 0:
            if pd.api.types.is_categorical_dtype(dtype):
                empty_codes = Column(Buffer.null(np.dtype('int8')))
                return CategoricalColumn(data=empty_codes,
                                         null_count=0,
                                         ordered=False)
            if dtype == np.dtype('object'):
                return StringColumn(data=nvstrings.to_device([]), null_count=0)
            return Column(Buffer.null(np.dtype(dtype)))

        # Strings take their own path
        if all(isinstance(o, StringColumn) for o in objs):
            string_parts = [o._data for o in objs]
            return StringColumn(data=nvstrings.from_strings(*string_parts))

        # Categoricals share one category set before concatenation
        if all(isinstance(o, CategoricalColumn) for o in objs):
            new_cats = tuple({val for o in objs for val in o})
            objs = [o.cat()._set_categories(new_cats) for o in objs]

        head = objs[0]
        if not all(o.is_type_equivalent(head) for o in objs):
            raise ValueError("All series must be of same type")

        # Zero-length inputs contribute nothing
        nonempty = [o for o in objs if len(o) > 0]
        nulls = sum(o.null_count for o in nonempty)
        newsize = sum(len(o) for o in nonempty)
        mem = rmm.device_array(shape=newsize, dtype=head.data.dtype)
        data = Buffer.from_empty(mem, size=newsize)

        # An output mask is only allocated when some input has nulls
        mask = Buffer(utils.make_mask(newsize)) if nulls else None

        col = head.replace(data=data, mask=mask, null_count=nulls)

        # Perform the actual concatenation
        if newsize > 0:
            col = _gdf._column_concat(nonempty, col)

        return col
Example #12
0
    def __init__(self, values, name=None):
        """Store *values* and *name*.

        A categorical ``pd.Series``, ``pd.Categorical`` or
        ``pd.CategoricalIndex`` is converted to a ``CategoricalColumn``
        built from its codes, categories and ordered flag; any other input
        is stored unchanged.
        """
        is_categorical_series = (
            isinstance(values, pd.Series)
            and pd.api.types.is_categorical_dtype(values.dtype))

        if is_categorical_series:
            accessor = values.cat
            values = CategoricalColumn(
                data=Buffer(accessor.codes.values),
                categories=accessor.categories.tolist(),
                ordered=accessor.ordered)
        elif isinstance(values, (pd.Categorical, pd.CategoricalIndex)):
            values = CategoricalColumn(
                data=Buffer(values.codes),
                categories=values.categories.tolist(),
                ordered=values.ordered)

        self._values = values
        self.name = name
        self.names = [name]
Example #13
0
    def from_cffi_view(cffi_view):
        """Build a Column from a cffi ``gdf_column*`` struct.

        nvstrings data is handed to ``build_column`` directly; other data is
        wrapped in ``Buffer``, together with the mask when one is present.
        """
        from cudf.dataframe import columnops

        data_mem, mask_mem = _gdf.cffi_view_to_column_mem(cffi_view)
        dtype = _gdf.gdf_to_np_dtype(cffi_view.dtype)

        if isinstance(data_mem, nvstrings.nvstrings):
            return columnops.build_column(data_mem, dtype)

        data_buf = Buffer(data_mem)
        mask_buf = Buffer(mask_mem) if mask_mem is not None else None
        return columnops.build_column(data_buf, dtype, mask=mask_buf)
Example #14
0
def test_buffer_basic():
    """A Buffer built from a host array reports matching size and capacity
    and copies back to the same values."""
    n = 10
    expected = np.arange(n, dtype=np.float64)
    buf = Buffer(np.arange(n, dtype=np.float64))
    assert buf.size == n
    assert buf.capacity == n
    np.testing.assert_equal(buf.mem.copy_to_host(), expected)
Example #15
0
    def as_numerical_column(self, dtype, **kwargs):
        """Convert this string column to a column of *dtype*.

        The conversion writes into a scratch device array and the result is
        cast to *dtype* at the end.  int8/int16 targets are converted as
        int32; datetime64 targets are written as int64 with the datetime
        unit passed on as ``units``.  When the column has nulls, the null
        bitmask of the string data is attached to the result.
        """
        out_dtype = np.dtype(dtype)

        # str_dtype picks the conversion routine, mem_dtype the scratch
        # array's dtype
        if out_dtype.type in (np.int8, np.int16):
            mem_dtype = str_dtype = np.dtype(np.int32)
        elif out_dtype.type is np.datetime64:
            kwargs.update(units=np.datetime_data(out_dtype)[0])
            mem_dtype = np.dtype(np.int64)
            str_dtype = out_dtype
        else:
            mem_dtype = str_dtype = out_dtype

        out_arr = rmm.device_array(shape=len(self), dtype=mem_dtype)
        kwargs["devptr"] = get_ctype_ptr(out_arr)

        convert = _str_to_numeric_typecast_functions[str_dtype]
        convert(self.str(), **kwargs)

        out_col = columnops.as_column(out_arr)

        if self.null_count > 0:
            mask_size = utils.calc_chunk_size(len(self.data),
                                              utils.mask_bitsize)
            mask_arr = rmm.device_array(mask_size, dtype="int8")
            self.data.set_null_bitmask(get_ctype_ptr(mask_arr), bdevmem=True)
            out_col = out_col.set_mask(Buffer(mask_arr))

        return out_col.astype(out_dtype)
Example #16
0
    def __setitem__(self, key, value):
        """
        Set the value of self[key] to value.

        If value and self are of different types,
        value is coerced to self.dtype

        *key* may be a slice (stride 1 only) or anything accepted by
        ``columnops.as_column``; a boolean key must have the same length as
        the column.  A scalar *value* is broadcast to the number of selected
        elements.

        Raises
        ------
        NotImplementedError
            If *key* is a slice with a stride other than 1.
        ValueError
            If a boolean key's length differs from the column's, or if the
            length of *value* does not match the number of selected
            elements.
        """
        import cudf.bindings.copying as cpp_copying
        from cudf.dataframe import columnops

        # Step 1: work out how many elements the key selects
        if isinstance(key, slice):
            key_start, key_stop, key_stride = key.indices(len(self))
            if key_stride != 1:
                raise NotImplementedError("Stride not supported in slice")
            # NOTE(review): abs() yields a positive count even when the
            # normalised stop precedes start -- confirm this is intended.
            nelem = abs(key_stop - key_start)
        else:
            key = columnops.as_column(key)
            if pd.api.types.is_bool_dtype(key.dtype):
                if not len(key) == len(self):
                    raise ValueError(
                        "Boolean mask must be of same length as column")
                # Convert the boolean key into the selected positions
                key = columnops.as_column(cudautils.arange(len(self)))[key]
            nelem = len(key)

        # Step 2: expand a scalar value to a column of nelem elements
        if utils.is_scalar(value):
            if is_categorical_dtype(self.dtype):
                from cudf.dataframe.categorical import CategoricalColumn
                from cudf.dataframe.buffer import Buffer
                from cudf.utils.cudautils import fill_value

                # Fill an int8 array with the result of self._encode(value)
                data = rmm.device_array(nelem, dtype="int8")
                fill_value(data, self._encode(value))
                value = CategoricalColumn(
                    data=Buffer(data),
                    categories=self._categories,
                    ordered=False,
                )
            elif value is None:
                # None is represented by a masked column from column_empty
                value = columnops.column_empty(nelem, self.dtype, masked=True)
            else:
                to_dtype = pd.api.types.pandas_dtype(self.dtype)
                value = utils.scalar_broadcast_to(value, nelem, to_dtype)

        # Step 3: coerce the value to this column's dtype and check its size
        value = columnops.as_column(value).astype(self.dtype)

        if len(value) != nelem:
            msg = (f"Size mismatch: cannot set value "
                   f"of size {len(value)} to indexing result of size "
                   f"{nelem}")
            raise ValueError(msg)

        # Step 4: slice keys use copy_range, all other keys use scatter
        if isinstance(key, slice):
            out = cpp_copying.apply_copy_range(self, value, key_start,
                                               key_stop, 0)
        else:
            out = cpp_copying.apply_scatter(value, key, self)

        # Adopt the result's buffers in place and refresh the null count
        self._data = out.data
        self._mask = out.mask
        self._update_null_count()
Example #17
0
 def reverse(self):
     """Reverse the Series.

     Both the values and the index are passed through
     ``cudautils.reverse_array``.
     """
     flipped_values = cudautils.reverse_array(self.to_gpu_array())
     flipped_index = as_index(cudautils.reverse_array(self.index.gpu_values))
     new_col = self._column.replace(data=Buffer(flipped_values))
     return self._copy_construct(data=new_col, index=flipped_index)
Example #18
0
    def __getitem__(self, arg):
        """Index the column by a number, a slice, or an array of positions.

        Numbers go to ``element_indexing``; slices return a new column
        (step other than 1 is not supported when the column has nulls);
        lists and numpy arrays are moved to the device and handled by
        ``take`` like device arrays.  Anything else raises
        ``NotImplementedError``.
        """
        if isinstance(arg, Number):
            return self.element_indexing(int(arg))

        if isinstance(arg, slice):
            if not self.null_count > 0:
                # No nulls: slicing the data buffer is enough
                return self.replace(data=self.data[arg])

            if arg.step is not None and arg.step != 1:
                raise NotImplementedError(arg)

            subdata = self.data[arg]
            # Expand the bitmask, slice it, then compact it again
            bytemask = cudautils.expand_mask_bits(
                self.data.size,
                self.mask.to_gpu_array(),
            )
            submask = Buffer(cudautils.compact_mask_bytes(bytemask[arg]))
            return self.replace(data=subdata, mask=submask)

        if isinstance(arg, (list, np.ndarray)):
            arg = rmm.to_device(np.array(arg))

        if isinstance(arg, DeviceNDArray):
            return self.take(arg)
        raise NotImplementedError(type(arg))
Example #19
0
    def as_numerical(self):
        """View this column as a ``NumericalColumn`` whose data is the
        underlying memory reinterpreted as ``int64``."""
        from cudf.dataframe import numerical

        int_buf = Buffer(self.data.mem.view(np.int64))
        return self.view(
            numerical.NumericalColumn,
            data=int_buf,
            dtype=int_buf.dtype,
        )
Example #20
0
    def __init__(self, data, null_count=None, **kwargs):
        """
        Parameters
        ----------
        data : nvstrings.nvstrings
            The nvstrings object.  A ``Sequence`` is first converted with
            ``nvstrings.to_device``.
        null_count : int; optional
            The number of null values in the mask.  When omitted it is
            read from ``data.null_count()``.
        """
        from collections.abc import Sequence
        if isinstance(data, Sequence):
            data = nvstrings.to_device(data)
        assert isinstance(data, nvstrings.nvstrings)

        self._data = data
        self._dtype = np.dtype("object")
        self._null_count = (data.null_count()
                            if null_count is None else null_count)

        # A mask is only materialised when there are nulls
        self._mask = None
        if self._null_count > 0:
            mask_len = utils.calc_chunk_size(len(self.data),
                                             utils.mask_bitsize)
            bitmask = rmm.device_array(mask_len, dtype='int8')
            self.data.set_null_bitmask(get_ctype_ptr(bitmask), bdevmem=True)
            self._mask = Buffer(bitmask)

        self._nvcategory = None
        self._indices = None
Example #21
0
def column_select_by_position(column, positions):
    """Select the elements of *column* found at *positions*.

    *column* must have no nulls (asserted).

    Returns (selected_column, selected_positions)
    """
    from cudf.dataframe.numerical import NumericalColumn
    assert column.null_count == 0

    gathered = cudautils.gather(column.data.to_gpu_array(),
                                positions.data.to_gpu_array())
    selected_values = column.replace(data=Buffer(gathered))

    index_buf = Buffer(positions.data.to_gpu_array())
    selected_positions = NumericalColumn(data=index_buf,
                                         dtype=index_buf.dtype)
    return selected_values, selected_positions
Example #22
0
 def sort_by_values(self, ascending=True, na_position="last"):
     """Return ``(sorted_keys, sort_indices)`` columns.

     The order comes from ``get_sorted_inds``; the data, and the mask when
     there is one, are gathered in that order.
     """
     sort_inds = get_sorted_inds(self, ascending, na_position)
     order = sort_inds.data

     sorted_mem = cudautils.gather(data=self.data.mem, index=order.mem)

     sorted_mask = None
     if self.mask:
         mask_col = self._get_mask_as_column()
         sorted_mask = Buffer(mask_col.take(order.to_gpu_array()).as_mask())

     col_keys = self.replace(data=Buffer(sorted_mem),
                             mask=sorted_mask,
                             null_count=self.null_count,
                             dtype=self.dtype)
     col_inds = self.replace(data=order,
                             mask=sort_inds.mask,
                             dtype=order.dtype)
     return col_keys, col_inds
Example #23
0
 def as_column(self):
     """Materialise this index as a ``NumericalColumn``.

     A zero-length index yields an empty device array of ``self.dtype``.
     """
     target_dtype = self.dtype
     if len(self) == 0:
         values = rmm.device_array(0, dtype=target_dtype)
     else:
         values = cudautils.arange(self._start, self._stop,
                                   dtype=target_dtype)
     return NumericalColumn(data=Buffer(values),
                            dtype=values.dtype,
                            name=self.name)
Example #24
0
def column_select_by_boolmask(column, boolmask):
    """Select the elements of *column* picked by a boolean mask.

    *column* must have no nulls (asserted).

    Returns (selected_column, selected_positions)
    """
    from cudf.dataframe.numerical import NumericalColumn
    assert column.null_count == 0  # We don't properly handle the boolmask yet

    bits = cudautils.compact_mask_bytes(boolmask.to_gpu_array())
    all_positions = cudautils.arange(len(boolmask))

    # The same bitmask filters both the positions and the values
    _, kept_positions = cudautils.copy_to_dense(all_positions, mask=bits)
    _, kept_values = cudautils.copy_to_dense(column.data.to_gpu_array(),
                                             mask=bits)

    selected_values = column.replace(data=Buffer(kept_values))
    index_buf = Buffer(kept_positions)
    selected_positions = NumericalColumn(data=index_buf,
                                         dtype=index_buf.dtype)
    return selected_values, selected_positions
Example #25
0
 def indices(self):
     """Return the nvcategory values as an int32 ``Buffer``.

     The buffer is built on first use and cached in ``self._indices``.
     """
     if self._indices is not None:
         return self._indices

     codes = rmm.device_array(self.nvcategory.size(), dtype='int32')
     self.nvcategory.values(devptr=get_ctype_ptr(codes))
     self._indices = Buffer(codes)
     return self._indices
Example #26
0
    def _sortjoin(self, other, how='left', return_indexers=False):
        """Join with another column.

        When the column is a index, set *return_indexers* to obtain
        the indices for shuffling the remaining columns.

        Both columns are sorted by value first and then joined with
        ``_gdf.apply_join`` using ``method='sort'``.

        Returns the joined column, or ``(joined_column, (left_indexer,
        right_indexer))`` when *return_indexers* is true.

        Raises ``TypeError`` when *other* is not type-equivalent to this
        column.
        """
        from cudf.dataframe.series import Series

        if not self.is_type_equivalent(other):
            raise TypeError('*other* is not compatible')

        # Sort both sides ascending, keeping the argsort of each
        lkey, largsort = self.sort_by_values(True)
        rkey, rargsort = other.sort_by_values(True)
        with _gdf.apply_join(
                [lkey], [rkey], how=how, method='sort') as (lidx, ridx):
            if lidx.size > 0:
                raw_index = cudautils.gather_joined_index(
                        lkey.to_gpu_array(),
                        rkey.to_gpu_array(),
                        lidx,
                        ridx,
                        )
                buf_index = Buffer(raw_index)
            else:
                # Empty join result: use a null buffer of this dtype
                buf_index = Buffer.null(dtype=self.dtype)

            joined_index = lkey.replace(data=buf_index)

            if return_indexers:
                def gather(idxrange, idx):
                    # Entries where idx is -1 are masked out and then
                    # filled back in as -1
                    mask = (Series(idx) != -1).as_mask()
                    return idxrange.take(idx).set_mask(mask).fillna(-1)

                if len(joined_index) > 0:
                    # Map join positions back through each side's argsort
                    indexers = (
                            gather(Series(largsort), lidx),
                            gather(Series(rargsort), ridx),
                            )
                else:
                    indexers = (
                            Series(Buffer.null(dtype=np.intp)),
                            Series(Buffer.null(dtype=np.intp))
                            )
                return joined_index, indexers
            else:
                return joined_index
Example #27
0
    def round(self, decimals=0):
        """Return a ``NumericalColumn`` holding the result of
        ``cudautils.apply_round`` on the data with *decimals*.

        The existing null mask, if any, is carried over.
        """
        mask = self.nullmask if self.has_null_mask else None
        rounded = cudautils.apply_round(self.data.mem, decimals)
        return NumericalColumn(
            data=Buffer(rounded),
            mask=mask,
            dtype=self.dtype,
        )
Example #28
0
    def take(self, indices, ignore_index=False):
        """Return a Column made of the values found at *indices*.

        An empty *indices* returns a copy of this column.
        """
        gather_map = Buffer(indices).to_gpu_array()

        # Zero-size request
        if gather_map.size == 0:
            return self.copy()

        gathered = cudautils.gather(data=self._data.to_gpu_array(),
                                    index=gather_map)

        taken_mask = None
        if self._mask:
            mask_col = self._get_mask_as_column()
            taken_mask = Buffer(mask_col.take(gather_map).as_mask())

        return self.replace(data=Buffer(gathered), mask=taken_mask)
Example #29
0
def from_dlpack(pycapsule_obj):
    """Converts from a DLPack tensor to a cuDF object.

    DLPack is an open-source memory tensor structure:
    `dmlc/dlpack <https://github.com/dmlc/dlpack>`_.

    This function takes a PyCapsule object which contains a pointer to
    a DLPack tensor as input, and returns a cuDF object. This function deep
    copies the data in the DLPack tensor into a cuDF object.

    Parameters
    ----------
    pycapsule_obj : PyCapsule
        Input DLPack tensor pointer which is encapsulated in a PyCapsule
        object.

    Returns
    -------
    A cuDF Series when exactly one column is produced, otherwise a cuDF
    DataFrame whose columns are keyed by their position.

    Raises
    ------
    ValueError
        If the underlying conversion reports an empty dataset.
    """
    try:
        res, valids = cpp_dlpack.from_dlpack(pycapsule_obj)
    except GDFError as err:
        # Only the "empty dataset" error is translated; others propagate
        if str(err) != "b'GDF_DATASET_EMPTY'":
            raise err
        raise ValueError(
            "Cannot create a cuDF Object from a DLPack tensor of 0 size")

    cols = []
    for idx, valid in enumerate(valids):
        mask = Buffer(valid) if valid else None
        column_mem = res[idx]
        cols.append(
            columnops.build_column(Buffer(column_mem),
                                   dtype=column_mem.dtype,
                                   mask=mask))

    if len(cols) == 1:
        return Series(cols[0])

    df = DataFrame()
    for idx, col in enumerate(cols):
        df[idx] = col
    return df
Example #30
0
 def normalize_binop_value(self, other):
     """Encode *other* with ``self._encode`` and broadcast it to a column
     of this column's length, keeping the same categories and ordering."""
     encoded = self._encode(other)
     broadcast = utils.scalar_broadcast_to(encoded,
                                           shape=len(self),
                                           dtype=self.data.dtype)
     return self.replace(data=Buffer(broadcast),
                         dtype=self.dtype,
                         categories=self._categories,
                         ordered=self._ordered)