Beispiel #1
0
def _mask_from_cuda_array_interface_desc(obj):
    """Extract a validity-mask ``Buffer`` from *obj*'s ``__cuda_array_interface__``.

    Returns None when the interface carries no "mask" entry.  Bit masks
    (typestr code "t") are wrapped directly; byte masks (code "b") are
    compacted to bits on the device first.
    """
    from cudf.utils.utils import calc_chunk_size, mask_dtype, mask_bitsize
    from cudf.utils.cudautils import compact_mask_bytes

    desc = obj.__cuda_array_interface__
    mask = desc.get("mask", None)

    if mask is not None:
        # The mask is itself an object exposing the CUDA array interface.
        desc = mask.__cuda_array_interface__
        ptr = desc["data"][0]
        nelem = desc["shape"][0]
        typestr = desc["typestr"]
        typecode = typestr[1]  # second character encodes the element kind
        if typecode == "t":
            # Bit mask: round the element count up to whole mask chunks and
            # wrap the raw pointer; `obj` is kept as owner so the memory
            # outlives the Buffer.
            nelem = calc_chunk_size(nelem, mask_bitsize)
            mask = Buffer(
                data=ptr, size=nelem * mask_dtype.itemsize, owner=obj
            )
        elif typecode == "b":
            # Byte mask (one byte per element): compact to a bit mask.
            dtype = np.dtype(typestr)
            mask = compact_mask_bytes(
                rmm.device_array_from_ptr(
                    ptr, nelem=nelem, dtype=dtype, finalizer=None
                )
            )
            mask = Buffer(mask)
        else:
            raise NotImplementedError(
                f"Cannot infer mask from typestr {typestr}"
            )
    return mask
Beispiel #2
0
def test_buffer_from_cuda_iface_contiguous(data):
    """Buffer construction succeeds only for contiguous device arrays."""
    arr, should_succeed = data
    byte_view = arr.view("|u1")
    if not should_succeed:
        with pytest.raises(ValueError):
            Buffer(data=byte_view, size=arr.size)  # noqa: F841
    else:
        Buffer(data=byte_view, size=arr.size)  # noqa: F841
Beispiel #3
0
 def deserialize(cls, header, frames):
     """Reconstruct a column from its serialization header and frames.

     Frame 0 holds the data buffer; frame 1 (when a "mask" key is present
     in the header) holds the validity mask.
     """
     mask = None
     if "mask" in header:
         mask = Buffer.deserialize(header["mask"], [frames[1]])
     data = Buffer.deserialize(header["data"], [frames[0]])
     return build_column(data=data, dtype=header["dtype"], mask=mask)
Beispiel #4
0
def pyarrow_buffer_to_cudf_buffer(arrow_buf, mask_size=0):
    """
    Given a PyArrow Buffer backed by either host or device memory, convert it
    to a cuDF Buffer

    Parameters
    ----------
    arrow_buf : pa.Buffer
        Source buffer (host- or device-backed).
    mask_size : int, optional
        Minimum required size in bytes; when the source is smaller, its
        contents are copied into a freshly allocated device buffer of this
        size.

    Raises
    ------
    TypeError
        If *arrow_buf* is not a ``pa.Buffer``.
    """
    from cudf._lib.arrow._cuda import CudaBuffer as arrowCudaBuffer

    if not isinstance(arrow_buf, pa.Buffer):
        raise TypeError(
            "Expected type: {}, got type: {}".format(
                pa.Buffer.__name__, type(arrow_buf).__name__
            )
        )

    # Try creating a PyArrow CudaBuffer from the PyArrow Buffer object; it
    # fails with an ArrowTypeError if it's a host based Buffer.  Keep only
    # that call inside the try so an ArrowTypeError raised by the device
    # conversion path below cannot be mistaken for "host buffer".
    try:
        arrow_cuda_buf = arrowCudaBuffer.from_buffer(arrow_buf)
    except pa.ArrowTypeError:
        # Host-backed buffer: copy to device only if it is too small.
        if arrow_buf.size < mask_size:
            dbuf = rmm.DeviceBuffer(size=mask_size)
            dbuf.copy_from_host(np.asarray(arrow_buf).view("u1"))
            return Buffer(dbuf)
        return Buffer(arrow_buf)

    # Device-backed buffer: wrap in place, keeping the CudaBuffer alive as
    # the owner of the memory.
    buf = Buffer(
        data=arrow_cuda_buf.address,
        size=arrow_cuda_buf.size,
        owner=arrow_cuda_buf,
    )
    if buf.size < mask_size:
        dbuf = rmm.DeviceBuffer(size=mask_size)
        dbuf.copy_from_device(buf)
        return Buffer(dbuf)
    return buf
Beispiel #5
0
def buffers_from_pyarrow(pa_arr, dtype=None):
    """Return a ``(mask_buffer, data_buffer)`` pair for a pyarrow array."""
    from cudf.core.buffer import Buffer
    from cudf.utils.cudautils import copy_array

    buffers = pa_arr.buffers()

    pamask = None
    if buffers[0]:
        # Copy the arrow validity bitmap onto the device mask.
        mask_dev_array = make_mask(len(pa_arr))
        arrow_dev_array = rmm.to_device(np.array(buffers[0]).view("int8"))
        copy_array(arrow_dev_array, mask_dev_array)
        pamask = Buffer(mask_dev_array)

    if dtype:
        new_dtype = dtype
    elif isinstance(pa_arr, pa.DictionaryArray):
        new_dtype = pa_arr.indices.type.to_pandas_dtype()
    else:
        new_dtype = pa_arr.type.to_pandas_dtype()

    if buffers[1]:
        start = pa_arr.offset
        stop = start + len(pa_arr)
        padata = Buffer(np.array(buffers[1]).view(new_dtype)[start:stop])
    else:
        padata = Buffer(np.empty(0, dtype=new_dtype))
    return (pamask, padata)
Beispiel #6
0
    def from_numpy(cls, array):
        """Build a datetime column from a host numpy array.

        Accepts datetime64 arrays (units coarser than seconds are cast to
        second resolution) and int64 arrays; NaT entries become nulls in
        the resulting column's mask.
        """
        # int64 input is reinterpreted as epoch seconds, so it needs a cast.
        cast_dtype = array.dtype.type == np.int64
        if array.dtype.kind == "M":
            time_unit, _ = np.datetime_data(array.dtype)
            # Coarse units (day/week/month/year) or string/datetime payloads
            # must be normalized to satisfy the 8-byte assertion below.
            cast_dtype = time_unit in ("D", "W", "M", "Y") or (
                len(array) > 0 and (isinstance(array[0], str)
                                    or isinstance(array[0], dt.datetime)))
        elif not cast_dtype:
            raise ValueError(
                ("Cannot infer datetime dtype " + "from np.array dtype `%s`") %
                (array.dtype))

        if cast_dtype:
            array = array.astype(np.dtype("datetime64[s]"))
        # Every supported datetime unit is 8 bytes wide.
        assert array.dtype.itemsize == 8

        mask = None
        if np.any(np.isnat(array)):
            # Convert NaT sentinels to nulls: replace every NaT with a
            # 1-element masked (null) column and take the resulting mask.
            null = cudf.core.column.column_empty_like(array,
                                                      masked=True,
                                                      newsize=1)
            col = libcudf.replace.replace(
                as_column(Buffer(array), dtype=array.dtype),
                as_column(
                    Buffer(np.array([np.datetime64("NaT")],
                                    dtype=array.dtype)),
                    dtype=array.dtype,
                ),
                null,
            )
            mask = col.mask

        return cls(data=Buffer(array), mask=mask, dtype=array.dtype)
Beispiel #7
0
 def deserialize(cls, header, frames):
     """Rebuild a column from serialized frames: frame 0 is the data
     buffer, frame 1 (when present) the validity mask."""
     data_buf = Buffer(frames[0])
     mask_buf = Buffer(frames[1]) if header["frame_count"] > 1 else None
     return build_column(data=data_buf, dtype=header["dtype"], mask=mask_buf)
Beispiel #8
0
def column_empty(row_count, dtype="object", masked=False):
    """Allocate a new column like the given row_count and dtype.
    """
    dtype = pd.api.types.pandas_dtype(dtype)
    children = ()
    int32_nbytes = np.dtype("int32").itemsize

    if is_categorical_dtype(dtype):
        # Categorical: no data of its own, a single int32 codes child.
        data = None
        codes = build_column(
            data=Buffer.empty(row_count * int32_nbytes),
            dtype="int32",
        )
        children = (codes,)
    elif dtype.kind in "OU":
        # Strings: int32 offsets (row_count + 1) plus an int8 char buffer.
        data = None
        offsets = build_column(
            data=Buffer.empty((row_count + 1) * int32_nbytes),
            dtype="int32",
        )
        chars = build_column(
            data=Buffer.empty(row_count * np.dtype("int8").itemsize),
            dtype="int8",
        )
        children = (offsets, chars)
    else:
        data = Buffer.empty(row_count * dtype.itemsize)

    mask = Buffer(cudautils.make_empty_mask(row_count)) if masked else None

    return build_column(data, dtype, mask=mask, children=children)
Beispiel #9
0
def prefixsum(vals):
    """Compute the full prefixsum.

    Given the input of N.  The output size is N + 1.
    The first value is always 0.  The last value is the sum of *vals*.
    """
    import cudf._lib as libcudf

    from cudf.core.column import NumericalColumn
    from cudf.core.buffer import Buffer

    # Allocate output
    slots = rmm.device_array(shape=vals.size + 1, dtype=vals.dtype)
    # Fill 0 to slot[0]
    gpu_fill_value[1, 1](slots[:1], 0)

    # Compute prefixsum on the mask.  The inclusive scan writes into
    # slots[1:], so combined with the leading 0 the result is an
    # exclusive scan of length N + 1.
    in_col = NumericalColumn(data=Buffer(vals),
                             mask=None,
                             null_count=0,
                             dtype=vals.dtype)
    out_col = NumericalColumn(data=Buffer(slots[1:]),
                              mask=None,
                              null_count=0,
                              dtype=vals.dtype)
    libcudf.reduce.scan(in_col, out_col, "sum", inclusive=True)
    return slots
Beispiel #10
0
def column_empty(row_count, dtype="object", masked=False):
    """Allocate a new column like the given row_count and dtype.
    """
    dtype = pd.api.types.pandas_dtype(dtype)
    children = ()
    data = None

    if is_categorical_dtype(dtype):
        # Categorical: single int32 codes child, no data buffer.
        children = (
            build_column(
                data=Buffer.empty(row_count * np.dtype("int32").itemsize),
                dtype="int32",
            ),
        )
    elif dtype.kind in "OU":
        # Strings: zero-initialized int32 offsets plus an empty char buffer.
        offsets_col = build_column(
            data=Buffer(cupy.zeros(row_count + 1, dtype="int32")),
            dtype="int32",
        )
        chars_col = build_column(
            data=Buffer.empty(row_count * np.dtype("int8").itemsize),
            dtype="int8",
        )
        children = (offsets_col, chars_col)
    else:
        data = Buffer.empty(row_count * dtype.itemsize)

    mask = (
        create_null_mask(row_count, state=MaskState.ALL_NULL)
        if masked
        else None
    )

    return build_column(data, dtype, mask=mask, children=children)
Beispiel #11
0
    def _initialize_read(self):
        """Prepare this object for reading: wrap each stored buffer for
        transfer and record its kind/size in the headers."""
        from cudf.core.buffer import Buffer
        from cupy.cuda.memory import UnownedMemory

        self._offset = 0
        self._has_read_headers = False
        self._buffers = []
        headers, buffers = _id_to_buffers[self._object_id]
        # Copy the headers so the mutation below doesn't leak back into
        # the shared _id_to_buffers cache.
        self._headers = headers = headers.copy()
        buffer_types = []
        for buf in buffers:
            if isinstance(buf, cupy.ndarray):
                ptr, size = buf.data.ptr, buf.size
                # Buffer(ptr, size) is passed as the owner so the device
                # memory stays alive for the UnownedMemory view.
                self._buffers.append(
                    UnownedMemory(ptr, size, Buffer(ptr, size)))
                buffer_types.append(["cuda", size])
            elif isinstance(buf, Buffer):
                ptr, size = buf.ptr, buf.size
                if size == 0:
                    # empty buffer cannot construct a UnownedMemory
                    self._buffers.append(None)
                else:
                    self._buffers.append(
                        UnownedMemory(ptr, size, Buffer(ptr, size)))
                buffer_types.append(["cuda", size])
            else:
                # Host memory: fall back to len() when the object exposes
                # no `size` attribute.
                size = getattr(buf, "size", len(buf))
                self._buffers.append(buf)
                buffer_types.append(["memory", size])
        headers["buffer_types"] = buffer_types
Beispiel #12
0
def test_buffer_copy(size):
    """A copied Buffer has equal contents but distinct device memory."""
    src = cp.random.randint(low=0, high=100, size=size, dtype="u1")
    original = Buffer(data=src)
    duplicate = original.copy()
    assert duplicate.size == original.size
    if size > 0:
        assert duplicate.ptr != original.ptr
    assert_array_equal(cp.asarray(original), cp.asarray(duplicate))
Beispiel #13
0
def pa_mask_buffer_to_mask(mask_buf, size):
    """
    Convert PyArrow mask buffer to cuDF mask buffer
    """
    required = cudf._lib.null_mask.bitmask_allocation_size_bytes(size)
    if mask_buf.size >= required:
        return Buffer(mask_buf)
    # Too small: copy the host bytes into a device buffer of the
    # required allocation size.
    device_buf = rmm.DeviceBuffer(size=required)
    device_buf.copy_from_host(np.asarray(mask_buf).view("u1"))
    return Buffer(device_buf)
Beispiel #14
0
 def _write_cuda_buffer(ptr):  # pragma: no cover
     """Stream the device buffer at *ptr* to *writer* in host-sized chunks."""
     total = buffer.nbytes
     offset = 0
     while offset < total:
         size = min(CUDA_CHUNK_SIZE, total - offset)
         chunk = CPBuffer(ptr + offset, size=size)
         # copy chunk to host memoryview and write it out
         writer.write(chunk.host_serialize()[1][0])
         offset += size
Beispiel #15
0
    def children(self):
        """Lazily build and cache the codes column sliced to this
        column's offset and size."""
        if self._children is None:
            base = self.base_children[0]
            itemsize = base.dtype.itemsize

            # Narrow the base codes buffer to this column's window.
            codes_buf = Buffer(base.base_data)
            codes_buf.ptr += self.offset * itemsize
            codes_buf.size = self.size * itemsize

            self._children = (
                column.build_column(
                    data=codes_buf, dtype=base.dtype, size=self.size,
                ),
            )
        return self._children
Beispiel #16
0
    def __init__(self, data, null_count=None, name=None, **kwargs):
        """
        Parameters
        ----------
        data : nvstrings.nvstrings
            The nvstrings object
        null_count : int; optional
            The number of null values in the mask.
        name : optional
            Column name.
        """
        from collections.abc import Sequence

        # Host sequences are copied to the device first.
        if isinstance(data, Sequence):
            data = nvstrings.to_device(data)
        assert isinstance(data, nvstrings.nvstrings)
        self._data = data
        self._dtype = np.dtype("object")
        self._name = name

        if null_count is None:
            null_count = data.null_count()
        self._null_count = null_count
        self._mask = None
        if self._null_count > 0:
            # Materialize the validity bitmask from nvstrings into a
            # device int8 array and wrap it in a Buffer.
            mask_size = utils.calc_chunk_size(len(self.data),
                                              utils.mask_bitsize)
            out_mask_arr = rmm.device_array(mask_size, dtype="int8")
            out_mask_ptr = libcudf.cudf.get_ctype_ptr(out_mask_arr)
            self.data.set_null_bitmask(out_mask_ptr, bdevmem=True)
            self._mask = Buffer(out_mask_arr)
        # Lazily-populated caches.
        self._nvcategory = None
        self._indices = None
Beispiel #17
0
 def __init__(self, dtype, mask=None, offset=0, children=()):
     """
     Parameters
     ----------
     dtype : CategoricalDtype
     mask : Buffer
         The validity mask
     offset : int
         Data offset
     children : Tuple[Column]
         Two non-null columns containing the categories and codes
         respectively

     Raises
     ------
     ValueError
         If *dtype* is not (convertible to) a CategoricalDtype.
     """
     # Categorical columns hold no data of their own; the payload lives
     # in the children, so the data buffer is empty.
     data = Buffer.empty(0)
     size = children[0].size
     # Accept a pandas CategoricalDtype by converting it to cuDF's.
     if isinstance(dtype, pd.api.types.CategoricalDtype):
         dtype = CategoricalDtype.from_pandas(dtype)
     if not isinstance(dtype, CategoricalDtype):
         raise ValueError("dtype must be instance of CategoricalDtype")
     super().__init__(
         data,
         size=size,
         dtype=dtype,
         mask=mask,
         offset=offset,
         children=children,
     )
     # Lazily-computed codes cache.
     self._codes = None
Beispiel #18
0
    def children(self) -> Tuple[NumericalColumn]:
        """Lazily construct and cache the codes column sliced to this
        column's offset and size."""
        if self._children is None:
            base = self.base_children[0]
            itemsize = base.dtype.itemsize

            # Narrow the base codes buffer to this column's window.
            sliced = Buffer(base.base_data)
            sliced.ptr += self.offset * itemsize
            sliced.size = self.size * itemsize

            codes = cast(
                cudf.core.column.NumericalColumn,
                column.build_column(
                    data=sliced, dtype=base.dtype, size=self.size,
                ),
            )
            self._children = (codes,)
        return self._children
Beispiel #19
0
    def deserialize(cls, header, frames):
        """Reassemble a list column from its header and frame list."""
        # Get null mask (always transported as the final frame)
        mask = Buffer(frames[-1]) if header["null_count"] > 0 else None

        # Deserialize child columns, consuming each child's frames in order
        children = []
        start = 0
        for child_header in header["subheaders"]:
            nframes = child_header["frame_count"]
            child_type = pickle.loads(child_header["type-serialized"])
            children.append(
                child_type.deserialize(
                    child_header, frames[start:start + nframes]
                )
            )
            start += nframes

        # Materialize list column
        return column.build_column(
            data=None,
            dtype=pickle.loads(header["dtype"]),
            mask=mask,
            children=tuple(children),
            size=header["size"],
        )
Beispiel #20
0
    def __init__(self, mask=None, offset=0, children=()):
        """
        Parameters
        ----------
        mask : Buffer
            The validity mask
        offset : int
            Data offset
        children : Tuple[Column]
            Two non-null columns containing the string data and offsets
            respectively
        """
        offsets_col = children[0]
        # one less because the last element of offsets is the number of
        # bytes in the data buffer
        size = 0 if offsets_col.size == 0 else offsets_col.size - 1

        # String columns have no data buffer of their own.
        super().__init__(
            Buffer.empty(0),
            size,
            np.dtype("object"),
            mask=mask,
            children=children,
        )

        # Lazily-populated caches.
        self._nvstrings = None
        self._nvcategory = None
        self._indices = None
Beispiel #21
0
def buffers_from_pyarrow(pa_arr, dtype=None):
    """
    Given a pyarrow array returns a 5 length tuple of:
        - size
        - offset
        - cudf.Buffer --> mask
        - cudf.Buffer --> data
        - cudf.Buffer --> string characters
    """
    from cudf._libxx.null_mask import bitmask_allocation_size_bytes

    buffers = pa_arr.buffers()
    size = len(pa_arr)
    offset = pa_arr.offset

    pamask = None
    if pa_arr.null_count:
        pamask = pyarrow_buffer_to_cudf_buffer(
            buffers[0], mask_size=bitmask_allocation_size_bytes(size)
        )

    padata = (
        pyarrow_buffer_to_cudf_buffer(buffers[1])
        if buffers[1]
        else Buffer.empty(0)
    )

    pastrs = None
    if isinstance(pa_arr, pa.StringArray):
        # Third buffer holds the character data for string arrays.
        pastrs = pyarrow_buffer_to_cudf_buffer(buffers[2])
    return (size, offset, pamask, padata, pastrs)
Beispiel #22
0
    def as_numerical(self):
        """Reinterpret this column's underlying memory as int64 values."""
        from cudf.core.column import numerical

        int64_buf = Buffer(self.data.mem.view(np.int64))
        return self.view(
            numerical.NumericalColumn,
            data=int64_buf,
            dtype=int64_buf.dtype,
        )
Beispiel #23
0
 def normalize_binop_value(
     self, other: ScalarLike
 ) -> Union[ColumnBase, ScalarLike]:
     """Coerce *other* to a dtype/shape compatible with this column for a
     binary operation, returning either a scalar or a broadcast column.

     Raises
     ------
     TypeError
         If *other*'s dtype kind is not bool/int/uint/float.
     """
     if other is None:
         return other
     if isinstance(other, cudf.Scalar):
         if self.dtype == other.dtype:
             return other
         # expensive device-host transfer just to
         # adjust the dtype
         other = other.value
     elif isinstance(other, np.ndarray) and other.ndim == 0:
         other = other.item()
     other_dtype = np.min_scalar_type(other)
     if other_dtype.kind in {"b", "i", "u", "f"}:
         # NOTE(review): `other` was already unwrapped via `.value` above,
         # so this branch looks unreachable unless `.value` can itself
         # return a cudf.Scalar — confirm before removing.
         if isinstance(other, cudf.Scalar):
             return other
         other_dtype = np.promote_types(self.dtype, other_dtype)
         if other_dtype == np.dtype("float16"):
             # float16 is not supported; widen to float32.
             other_dtype = np.dtype("float32")
             other = other_dtype.type(other)
         if self.dtype.kind == "b":
             other_dtype = min_signed_type(other)
         if np.isscalar(other):
             other = np.dtype(other_dtype).type(other)
             return other
         else:
             # Non-scalar: broadcast to a column of this column's length,
             # reusing our validity mask.
             ary = utils.scalar_broadcast_to(
                 other, size=len(self), dtype=other_dtype
             )
             return column.build_column(
                 data=Buffer(ary), dtype=ary.dtype, mask=self.mask,
             )
     else:
         raise TypeError(f"cannot broadcast {type(other)}")
Beispiel #24
0
 def normalize_binop_value(self, other):
     """Coerce *other* into a scalar or broadcast column compatible with
     this column for a binary operation.

     Raises
     ------
     TypeError
         If *other*'s dtype kind is not bool/int/uint/float.
     """
     if other is None:
         return other
     other_dtype = np.min_scalar_type(other)
     if other_dtype.kind in {"b", "i", "u", "f"}:
         other_dtype = np.promote_types(self.dtype, other_dtype)
         if other_dtype == np.dtype("float16"):
             # float16 is not supported; widen to float32.
             other = np.dtype("float32").type(other)
             other_dtype = other.dtype
         if self.dtype.kind == "b":
             other_dtype = min_signed_type(other)
         if np.isscalar(other):
             other = np.dtype(other_dtype).type(other)
             return other
         else:
             # Non-scalar: broadcast to a column of this column's length,
             # reusing our validity mask.
             ary = utils.scalar_broadcast_to(other,
                                             size=len(self),
                                             dtype=other_dtype)
             # BUGFIX: `Buffer.from_array_lik` does not exist (typo) and
             # would raise AttributeError here; construct the Buffer
             # directly, matching the sibling implementation of this
             # method elsewhere in the codebase.
             return column.build_column(
                 data=Buffer(ary),
                 dtype=ary.dtype,
                 mask=self.mask,
             )
     else:
         raise TypeError("cannot broadcast {}".format(type(other)))
Beispiel #25
0
def test_buffer_from_cuda_iface_dtype(data, dtype):
    """Buffer rejects device arrays whose dtype is not (u)int8."""
    arr = data.astype(dtype)
    if dtype in ("uint8", "int8"):
        return
    with pytest.raises(
        TypeError, match="Buffer data must be of uint8 type"
    ):
        Buffer(data=arr, size=arr.size)  # noqa: F841
Beispiel #26
0
    def deserialize(cls, header: dict, frames: list) -> CategoricalColumn:
        """Rebuild a CategoricalColumn from its header and frames.

        Frame layout: the dtype frames come first, then the codes-data
        frames, then (when the header has a "mask" key) one mask frame.
        """
        n_dtype_frames = header["dtype_frames_count"]
        dtype = CategoricalDtype.deserialize(header["dtype"],
                                             frames[:n_dtype_frames])
        n_data_frames = header["data_frames_count"]

        # The concrete codes-column type is pickled into the header.
        column_type = pickle.loads(header["data"]["type-serialized"])
        data = column_type.deserialize(
            header["data"],
            frames[n_dtype_frames:n_dtype_frames + n_data_frames],
        )
        mask = None
        if "mask" in header:
            # The mask is the single frame following dtype + data frames.
            mask = Buffer.deserialize(header["mask"],
                                      [frames[n_dtype_frames + n_data_frames]])
        return cast(
            CategoricalColumn,
            column.build_column(
                data=None,
                dtype=dtype,
                mask=mask,
                children=(column.as_column(data.base_data,
                                           dtype=data.dtype), ),
            ),
        )
Beispiel #27
0
    def as_numerical_column(self, dtype, **kwargs):
        """Cast this string column to a numeric (or datetime) column.

        The string-to-number conversion is done by nvstrings kernels
        writing directly into a preallocated device array.
        """
        mem_dtype = np.dtype(dtype)
        str_dtype = mem_dtype
        out_dtype = mem_dtype

        if mem_dtype.type in (np.int8, np.int16):
            # Narrow ints are parsed as int32, then downcast at the end.
            mem_dtype = np.dtype(np.int32)
            str_dtype = mem_dtype
        elif mem_dtype.type is np.datetime64:
            # Datetimes are parsed into int64 ticks; the unit string is
            # forwarded to the parser via kwargs.
            kwargs.update(units=np.datetime_data(mem_dtype)[0])
            mem_dtype = np.dtype(np.int64)

        out_arr = rmm.device_array(shape=len(self), dtype=mem_dtype)
        out_ptr = libcudf.cudf.get_ctype_ptr(out_arr)
        kwargs.update({"devptr": out_ptr})

        # Dispatch to the typecast kernel registered for the parse dtype.
        _str_to_numeric_typecast_functions[str_dtype](self.str(), **kwargs)

        out_col = column.as_column(out_arr)

        if self.null_count > 0:
            # Propagate nulls: extract the validity bitmask from nvstrings
            # and attach it to the result column.
            mask_size = utils.calc_chunk_size(len(self.data),
                                              utils.mask_bitsize)
            out_mask_arr = rmm.device_array(mask_size, dtype="int8")
            out_mask_ptr = libcudf.cudf.get_ctype_ptr(out_mask_arr)
            self.data.set_null_bitmask(out_mask_ptr, bdevmem=True)
            mask = Buffer(out_mask_arr)
            out_col = out_col.set_mask(mask)

        # Final astype restores the originally requested dtype
        # (e.g. int8/int16 or datetime64).
        return out_col.astype(out_dtype)
Beispiel #28
0
def from_dlpack(pycapsule_obj):
    """Converts from a DLPack tensor to a cuDF object.

    DLPack is an open-source memory tensor structure:
    `dmlc/dlpack <https://github.com/dmlc/dlpack>`_.

    This function takes a PyCapsule object which contains a pointer to
    a DLPack tensor as input, and returns a cuDF object. This function deep
    copies the data in the DLPack tensor into a cuDF object.

    Parameters
    ----------
    pycapsule_obj : PyCapsule
        Input DLPack tensor pointer which is encapsulated in a PyCapsule
        object.

    Returns
    -------
    A cuDF DataFrame or Series depending on if the input DLPack tensor is 1D
    or 2D.

    Raises
    ------
    ValueError
        If the DLPack tensor has zero size.
    """
    try:
        res, valids = cpp_dlpack.from_dlpack(pycapsule_obj)
    except GDFError as err:
        # Surface the empty-tensor case as a clearer ValueError, chaining
        # the original error; re-raise everything else unchanged (bare
        # `raise` preserves the original traceback).
        if str(err) == "b'GDF_DATASET_EMPTY'":
            raise ValueError(
                "Cannot create a cuDF Object from a DLPack tensor of 0 size"
            ) from err
        raise
    cols = []
    # Pair each data column with its (optional) validity mask.
    for idx in range(len(valids)):
        mask = Buffer(valids[idx]) if valids[idx] else None
        cols.append(
            column.build_column(
                Buffer(res[idx]), dtype=res[idx].dtype, mask=mask
            )
        )
    if len(cols) == 1:
        return Series(cols[0])
    df = DataFrame()
    for idx, col in enumerate(cols):
        df[idx] = col
    return df
Beispiel #29
0
def test_pickle_buffer():
    """Buffer survives a pickle round-trip with its size intact."""
    arr = np.arange(10)
    original = Buffer(arr)
    assert original.size == arr.nbytes
    restored = pickle.loads(pickle.dumps(original))
    # Check that unpacked capacity equals buf.size
    assert restored.size == arr.nbytes
Beispiel #30
0
def _data_from_cuda_array_interface_desc(obj):
    """Wrap *obj*'s device data pointer in a Buffer, keeping *obj* alive
    as the owner of the memory."""
    iface = obj.__cuda_array_interface__
    ptr = iface["data"][0]
    nelem = iface["shape"][0]
    itemsize = np.dtype(iface["typestr"]).itemsize

    return Buffer(data=ptr, size=nelem * itemsize, owner=obj)