Ejemplo n.º 1
0
    def serialize(self):
        """Pack this string column into a header dict plus device frames.

        Three device arrays are allocated and filled by
        ``self.data.to_offsets``: an int8 buffer sized by
        ``byte_count()``, an int32 buffer with one more entry than there
        are strings, and an int8 buffer sized like a null bitmask.

        Returns
        -------
        header : dict
            Holds ``"null_count"``, the pickled column ``"type"``, the
            number of strings under ``"nvstrings"`` and one sub-header per
            frame under ``"subheaders"``.
        frames : list
            The device arrays, ordered ``[nbuf, sbuf, obuf]``.
        """
        strings = self._data
        n_strings = len(strings)

        # Destination buffers for the flattened representation.
        sbuf = rmm.device_array(strings.byte_count(), dtype="int8")
        obuf = rmm.device_array(n_strings + 1, dtype="int32")
        nbuf = rmm.device_array(
            utils.calc_chunk_size(n_strings, utils.mask_bitsize),
            dtype="int8",
        )
        self.data.to_offsets(
            get_ctype_ptr(sbuf),
            get_ctype_ptr(obuf),
            nbuf=get_ctype_ptr(nbuf),
            bdevmem=True,
        )

        frames = [nbuf, sbuf, obuf]
        sub_headers = []
        for buf in frames:
            # Start from the CUDA array interface and overwrite the dtype
            # entry with the dtype string.
            sheader = dict(buf.__cuda_array_interface__)
            sheader["dtype"] = buf.dtype.str
            sub_headers.append(sheader)

        header = {
            "null_count": self._null_count,
            "type": pickle.dumps(type(self)),
            "nvstrings": n_strings,
            "subheaders": sub_headers,
        }
        return header, frames
Ejemplo n.º 2
0
    def as_numerical_column(self, dtype, **kwargs):
        """Parse the strings of this column into a numerical column.

        Parameters
        ----------
        dtype : dtype-like
            Requested result dtype; normalised with ``np.dtype``.
        **kwargs
            Forwarded to the string-to-number conversion routine. A
            ``devptr`` entry (and ``units`` for datetime targets) is added
            here.

        Returns
        -------
        The converted column cast to ``dtype``; it carries a null mask when
        this column has nulls.
        """
        out_dtype = np.dtype(dtype)
        mem_dtype = out_dtype   # dtype of the device buffer written into
        str_dtype = out_dtype   # key into the conversion-function table

        if out_dtype.type in (np.int8, np.int16):
            # Narrow integers are parsed as int32 and cast down at the end.
            mem_dtype = str_dtype = np.dtype(np.int32)
        elif out_dtype.type is np.datetime64:
            # Datetimes are written into int64 storage; the converter is
            # still looked up by the datetime dtype and told the unit.
            kwargs.update(units=np.datetime_data(out_dtype)[0])
            mem_dtype = np.dtype(np.int64)

        result_arr = rmm.device_array(shape=len(self), dtype=mem_dtype)
        kwargs["devptr"] = get_ctype_ptr(result_arr)

        converter = _str_to_numeric_typecast_functions[str_dtype]
        converter(self.str(), **kwargs)

        result = columnops.as_column(result_arr)

        if self.null_count > 0:
            # Build a null bitmask from the string data and attach it.
            nbytes = utils.calc_chunk_size(len(self.data),
                                           utils.mask_bitsize)
            mask_arr = rmm.device_array(nbytes, dtype="int8")
            self.data.set_null_bitmask(get_ctype_ptr(mask_arr), bdevmem=True)
            result = result.set_mask(Buffer(mask_arr))

        return result.astype(out_dtype)
Ejemplo n.º 3
0
    def astype(self, dtype):
        """Cast this datetime column to ``dtype``.

        Parameters
        ----------
        dtype : dtype-like
            Target dtype.

        Returns
        -------
        ``self`` when ``dtype`` equals the current dtype; a
        ``string.StringColumn`` when ``dtype`` is ``object`` or a unicode
        string dtype; otherwise the result of casting the ``as_numerical``
        view of this column.
        """
        # Compare by equality, not identity: two ``np.dtype`` objects for the
        # same datetime64 unit (or a dtype string such as 'datetime64[ms]')
        # are equal without being the same object.
        if self.dtype == dtype:
            return self
        elif (dtype == np.dtype('object')
              or np.issubdtype(dtype,
                               np.dtype('U').type)):
            if len(self) > 0:
                dev_array = self.data.mem
                dev_ptr = get_ctype_ptr(dev_array)
                null_ptr = None
                if self.mask is not None:
                    null_ptr = get_ctype_ptr(self.mask.mem)
                kwargs = {
                    'count': len(self),
                    'nulls': null_ptr,
                    'bdevmem': True,
                    'units': 'ms'
                }
                # The converter is selected by this column's own dtype.
                data = string._numeric_to_str_typecast_functions[np.dtype(
                    self.dtype)](dev_ptr, **kwargs)

            else:
                data = []

            return string.StringColumn(data=data)

        return self.as_numerical.astype(dtype)
Ejemplo n.º 4
0
 def astype(self, dtype):
     """Cast this numerical column to ``dtype``.

     Parameters
     ----------
     dtype : dtype-like
         Target dtype.

     Returns
     -------
     ``self`` when the dtype is unchanged; a ``string.StringColumn`` for
     ``object``/unicode targets; a ``datetime.DatetimeColumn`` view for
     datetime64 targets; otherwise a copy of this column with its data
     cast to ``dtype``.

     Raises
     ------
     NotImplementedError
         When a string cast is requested and this column's dtype is not a
         signed integer.
     """
     if self.dtype == dtype:
         return self
     elif (dtype == np.dtype('object') or
           np.issubdtype(dtype, np.dtype('U').type)):
         if not np.issubdtype(self.dtype, np.signedinteger):
             # Only signed integers can be converted to strings here.
             # Float and bool already raised; every other dtype (e.g.
             # unsigned integers) used to fall through and return None.
             raise NotImplementedError(
                 f"Casting object of {self.dtype} dtype "
                 "to str dtype is not yet supported"
             )
         # Imported only once a conversion is actually going to happen.
         import nvstrings
         if len(self) > 0:
             # The integer data is cast to int32 before being handed to
             # ``nvstrings.itos``.
             dev_array = self.astype('int32').data.mem
             dev_ptr = get_ctype_ptr(dev_array)
             null_ptr = None
             if self.mask is not None:
                 null_ptr = get_ctype_ptr(self.mask.mem)
             return string.StringColumn(
                 data=nvstrings.itos(
                     dev_ptr,
                     count=len(self),
                     nulls=null_ptr,
                     bdevmem=True
                 )
             )
         return string.StringColumn(data=nvstrings.to_device([]))
     elif np.issubdtype(dtype, np.datetime64):
         return self.astype('int64').view(
             datetime.DatetimeColumn,
             dtype=dtype,
             data=self.data.astype(dtype)
         )
     else:
         col = self.replace(data=self.data.astype(dtype),
                            dtype=np.dtype(dtype))
         return col
Ejemplo n.º 5
0
 def astype(self, dtype):
     """Parse this string column into a numerical column of ``dtype``.

     Integer targets are parsed into an int32 device buffer with ``stoi``
     and float targets into a float32 buffer with ``stof``; the resulting
     column is then cast to ``dtype``.

     Parameters
     ----------
     dtype : dtype-like
         Target dtype.

     Returns
     -------
     ``self`` when the dtype is unchanged, otherwise the converted column.

     Raises
     ------
     NotImplementedError
         When ``dtype`` is neither one of the supported integer nor float
         dtypes.
     """
     if self.dtype == dtype:
         return self
     elif dtype in (np.dtype('int8'), np.dtype('int16'), np.dtype('int32'),
                    np.dtype('int64')):
         out_arr = rmm.device_array(shape=len(self), dtype='int32')
         out_ptr = get_ctype_ptr(out_arr)
         self.str().stoi(devptr=out_ptr)
     elif dtype in (np.dtype('float32'), np.dtype('float64')):
         out_arr = rmm.device_array(shape=len(self), dtype='float32')
         out_ptr = get_ctype_ptr(out_arr)
         self.str().stof(devptr=out_ptr)
     else:
         # Without this branch ``out_arr`` would be unbound below and the
         # caller would get an unrelated UnboundLocalError.
         raise NotImplementedError(
             f"Casting str dtype to {dtype} dtype is not yet supported"
         )
     out_col = columnops.as_column(out_arr)
     return out_col.astype(dtype)
Ejemplo n.º 6
0
    def element_indexing(self, arg):
        """Select one or more elements of this string column.

        Parameters
        ----------
        arg : Number, slice, list, numpy.ndarray or DeviceNDArray
            A single position (negative values count from the end), a
            slice, a list, or an array of positions. Host arrays are copied
            to the device first.

        Returns
        -------
        The host value when the selection has exactly one element,
        otherwise a column built from the selection.

        Raises
        ------
        IndexError
            When a scalar position lies outside ``[-len(self), len(self))``.
        NotImplementedError
            When ``arg`` is of an unsupported type.
        """
        if isinstance(arg, Number):
            arg = int(arg)
            size = len(self)
            if arg < 0:
                arg += size
            # Check both bounds: a position below -len(self) is still
            # negative after the adjustment and must not wrap around.
            if not 0 <= arg < size:
                raise IndexError(
                    "index out of bounds for column of length "
                    "{}".format(size)
                )
            out = self._data[arg]
        elif isinstance(arg, (slice, list)):
            out = self._data[arg]
        elif isinstance(arg, np.ndarray):
            gpu_arr = rmm.to_device(arg)
            return self.element_indexing(gpu_arr)
        elif isinstance(arg, DeviceNDArray):
            # NVStrings gather call expects an array of int32s
            arg = cudautils.astype(arg, np.dtype('int32'))
            if len(arg) > 0:
                gpu_ptr = get_ctype_ptr(arg)
                out = self._data.gather(gpu_ptr, len(arg))
            else:
                out = self._data.gather([])
        else:
            raise NotImplementedError(type(arg))

        # A one-element selection is unwrapped to its host value.
        if len(out) == 1:
            return out.to_host()[0]
        else:
            return columnops.as_column(out)
Ejemplo n.º 7
0
    def __init__(self, data, null_count=None, **kwargs):
        """
        Build a string column around an nvstrings object.

        Parameters
        ----------
        data : nvstrings.nvstrings or Sequence
            The nvstrings object. Any ``collections.abc.Sequence`` is first
            converted with ``nvstrings.to_device``.
        null_count : int; optional
            The number of null values in the mask. When omitted it is taken
            from ``data.null_count()``.
        """
        from collections.abc import Sequence

        strings = (
            nvstrings.to_device(data) if isinstance(data, Sequence) else data
        )
        assert isinstance(strings, nvstrings.nvstrings)

        self._data = strings
        self._dtype = np.dtype("object")
        self._null_count = (
            strings.null_count() if null_count is None else null_count
        )

        # A null bitmask is only materialised when there are nulls.
        self._mask = None
        if self._null_count > 0:
            nbytes = utils.calc_chunk_size(len(self.data),
                                           utils.mask_bitsize)
            bitmask = rmm.device_array(nbytes, dtype='int8')
            self.data.set_null_bitmask(get_ctype_ptr(bitmask), bdevmem=True)
            self._mask = Buffer(bitmask)

        # Caches filled in elsewhere.
        self._nvcategory = None
        self._indices = None
Ejemplo n.º 8
0
 def indices(self):
     """Return the cached index buffer, computing it on first use.

     On the first call an int32 device array with ``self.nvcategory.size()``
     entries is filled through ``self.nvcategory.values`` and wrapped in a
     ``Buffer``; later calls return that same object.
     """
     cached = self._indices
     if cached is not None:
         return cached

     codes = rmm.device_array(self.nvcategory.size(), dtype='int32')
     self.nvcategory.values(devptr=get_ctype_ptr(codes))
     self._indices = Buffer(codes)
     return self._indices
Ejemplo n.º 9
0
    def as_string_column(self, dtype, **kwargs):
        """Render the values of this numerical column as strings.

        Parameters
        ----------
        dtype
            Not used by this method.
        **kwargs
            Forwarded only to the intermediate ``astype("int32", ...)`` cast
            performed for int8/int16 columns.

        Returns
        -------
        string.StringColumn
            Built from an empty list when the column has no rows.
        """
        from cudf.dataframe import string

        n_rows = len(self)
        if n_rows <= 0:
            return string.StringColumn(data=[])

        # int8/int16 data is widened to int32 before conversion.
        if self.dtype in (np.dtype("int8"), np.dtype("int16")):
            values = self.astype("int32", **kwargs).data.mem
        else:
            values = self.data.mem
        values_ptr = get_ctype_ptr(values)

        mask = self.mask
        nulls_ptr = get_ctype_ptr(mask.mem) if mask is not None else None

        # The converter is selected by the dtype of the buffer actually
        # passed in, not by the column's original dtype.
        converter = string._numeric_to_str_typecast_functions[
            np.dtype(values.dtype)
        ]
        converted = converter(
            values_ptr, count=n_rows, nulls=nulls_ptr, bdevmem=True
        )
        return string.StringColumn(data=converted)
Ejemplo n.º 10
0
    def astype(self, dtype):
        """Cast this numerical column to ``dtype``.

        Returns ``self`` when the dtype is unchanged, a
        ``string.StringColumn`` for ``object``/unicode targets, a
        ``datetime.DatetimeColumn`` view for datetime64 targets, and
        otherwise a copy of this column with its data cast to ``dtype``.
        """
        if self.dtype == dtype:
            return self

        wants_string = (dtype == np.dtype('object') or
                        np.issubdtype(dtype, np.dtype('U').type))
        if wants_string:
            if len(self) > 0:
                # int8/int16 data is widened to int32 before conversion.
                narrow = self.dtype in (np.dtype('int8'), np.dtype('int16'))
                if narrow:
                    values = self.astype('int32').data.mem
                else:
                    values = self.data.mem
                values_ptr = get_ctype_ptr(values)

                nulls_ptr = None
                if self.mask is not None:
                    nulls_ptr = get_ctype_ptr(self.mask.mem)

                # The converter is selected by the dtype of the buffer
                # actually passed in.
                converter = string._numeric_to_str_typecast_functions[
                    np.dtype(values.dtype)
                ]
                converted = converter(
                    values_ptr,
                    count=len(self),
                    nulls=nulls_ptr,
                    bdevmem=True,
                )
            else:
                converted = []

            return string.StringColumn(data=converted)

        if np.issubdtype(dtype, np.datetime64):
            as_int64 = self.astype('int64')
            return as_int64.view(
                datetime.DatetimeColumn,
                dtype=dtype,
                data=self.data.astype(dtype)
            )

        return self.replace(data=self.data.astype(dtype),
                            dtype=np.dtype(dtype))
Ejemplo n.º 11
0
    def as_string_column(self, dtype, **kwargs):
        """Render the values of this column as strings.

        Parameters
        ----------
        dtype
            Not used by this method.
        **kwargs
            Extra options for the conversion routine; ``count``, ``nulls``,
            ``bdevmem`` and ``units`` are set (or overwritten) here, with
            ``units`` taken from ``self.time_unit``.

        Returns
        -------
        string.StringColumn
            Built from an empty list when the column has no rows.
        """
        from cudf.dataframe import string

        n_rows = len(self)
        if n_rows <= 0:
            return string.StringColumn(data=[])

        values_ptr = get_ctype_ptr(self.data.mem)

        mask = self.mask
        nulls_ptr = get_ctype_ptr(mask.mem) if mask is not None else None

        kwargs.update(
            count=n_rows,
            nulls=nulls_ptr,
            bdevmem=True,
            units=self.time_unit,
        )

        # The converter is selected by this column's own dtype.
        converter = string._numeric_to_str_typecast_functions[
            np.dtype(self.dtype)
        ]
        return string.StringColumn(data=converter(values_ptr, **kwargs))
Ejemplo n.º 12
0
    def sort_by_values(self, ascending=True, na_position="last"):
        """Sort the column and return both the sorted keys and the ordering.

        Parameters
        ----------
        ascending : bool, default True
            Passed to the underlying ``order`` call as ``asc``.
        na_position : {"last", "first"}, default "last"
            Where nulls are placed in the result.

        Returns
        -------
        col_keys, col_inds
            The column indexed by the ordering, and an int32 column holding
            the ordering itself.

        Raises
        ------
        ValueError
            When ``na_position`` is neither ``"first"`` nor ``"last"``.
        """
        if na_position not in ("first", "last"):
            # Reject bad values up front; otherwise ``nullfirst`` would be
            # unbound and surface as an unrelated UnboundLocalError.
            raise ValueError(
                "invalid na_position: {!r}; expected 'first' or "
                "'last'".format(na_position)
            )
        nullfirst = na_position == "first"

        idx_dev_arr = rmm.device_array(len(self), dtype="int32")
        dev_ptr = get_ctype_ptr(idx_dev_arr)
        self.data.order(2, asc=ascending, nullfirst=nullfirst, devptr=dev_ptr)

        col_inds = columnops.build_column(Buffer(idx_dev_arr),
                                          idx_dev_arr.dtype,
                                          mask=None)

        col_keys = self[col_inds.data.mem]

        return col_keys, col_inds
Ejemplo n.º 13
0
    def sort_by_values(self, ascending=True, na_position="last"):
        """Sort the column and return both the sorted keys and the ordering.

        Parameters
        ----------
        ascending : bool, default True
            Passed to the underlying ``order`` call as ``asc``.
        na_position : {"last", "first"}, default "last"
            Where nulls are placed in the result.

        Returns
        -------
        col_keys, col_inds
            The column indexed by the ordering, and an int32
            ``NumericalColumn`` (no mask, zero nulls) holding the ordering.

        Raises
        ------
        ValueError
            When ``na_position`` is neither ``"first"`` nor ``"last"``.
        """
        if na_position not in ("first", "last"):
            # Reject bad values up front; otherwise ``nullfirst`` would be
            # unbound and surface as an unrelated UnboundLocalError.
            raise ValueError(
                "invalid na_position: {!r}; expected 'first' or "
                "'last'".format(na_position)
            )
        nullfirst = na_position == "first"

        idx_dev_arr = rmm.device_array(len(self), dtype='int32')
        dev_ptr = get_ctype_ptr(idx_dev_arr)
        self.data.order(2, asc=ascending, nullfirst=nullfirst, devptr=dev_ptr)

        col_inds = numerical.NumericalColumn(data=Buffer(idx_dev_arr),
                                             mask=None,
                                             null_count=0,
                                             dtype=idx_dev_arr.dtype)

        col_keys = self[col_inds.data.mem]

        return col_keys, col_inds
Ejemplo n.º 14
0
    def astype(self, dtype):
        """Parse this string column into a column of ``dtype``.

        Returns ``self`` when the dtype is unchanged. int8/int16 targets are
        parsed into an int32 buffer first; the result is always cast to the
        requested dtype at the end.
        """
        if self.dtype == dtype:
            return self

        is_narrow_int = dtype in (np.dtype('int8'), np.dtype('int16'))
        out_dtype = np.dtype(dtype)
        # dtype of the device buffer the parser writes into.
        mem_dtype = np.dtype('int32') if is_narrow_int else dtype

        parsed = rmm.device_array(shape=len(self), dtype=mem_dtype)
        options = {'devptr': get_ctype_ptr(parsed)}
        if mem_dtype == np.dtype('datetime64[ms]'):
            options['units'] = 'ms'

        parser = _str_to_numeric_typecast_functions[np.dtype(mem_dtype)]
        parser(self.str(), **options)

        return columnops.as_column(parsed).astype(out_dtype)
Ejemplo n.º 15
0
    def len(self):
        """
        Compute the length of every element in the Series/Index.

        Returns
        -------
          Series or Index of int: one integer per element giving that
            element's length. The parent's null mask is carried over when
            the parent has nulls.
        """
        from cudf.dataframe.series import Series

        parent = self._parent
        lengths = rmm.device_array(len(parent), dtype='int32')
        # The parent's data writes the per-element lengths into the buffer.
        parent.data.len(get_ctype_ptr(lengths))

        mask = parent.mask if parent.null_count > 0 else None

        column = columnops.build_column(
            Buffer(lengths), np.dtype('int32'), mask=mask
        )
        return Series(column, index=self._index)
Ejemplo n.º 16
0
    def deserialize(cls, header, frames):
        """Rebuild nvstrings data from a serialized header and frames.

        Parameters
        ----------
        header : dict
            Must provide ``"subheaders"`` (one entry per frame),
            ``"nvstrings"`` (the string count) and ``"null_count"``.
        frames : sequence
            The mask, value and offset frames, in that order. Each may be a
            ``memoryview``, a numpy array, device memory, or another
            buffer-like object described by its sub-header.

        Returns
        -------
        The object returned by ``nvstrings.from_offsets``.
        """
        # Deserialize the mask, value, and offset frames
        arrays = []

        for i, frame in enumerate(frames):
            if isinstance(frame, memoryview):
                sheader = header["subheaders"][i]
                dtype = sheader["dtype"]
                frame = np.frombuffer(frame, dtype=dtype)
                frame = cudautils.to_device(frame)
            elif not (isinstance(frame, np.ndarray)
                      or numba.cuda.driver.is_device_memory(frame)):
                # this is probably a ucp_py.BufferRegion memory object
                # check the header for info -- this should be encoded from
                # serialization process.  Lastly, `typestr` and `shape` *must*
                # manually set *before* consuming the buffer as a DeviceNDArray
                sheader = header["subheaders"][i]
                # Take the dtype from this frame's own sub-header; it was
                # previously unbound (or left over from an earlier frame)
                # when this branch ran.
                dtype = sheader.get("dtype", "B")
                frame.typestr = dtype
                frame.shape = sheader.get("shape", len(frame))
                frame = np.frombuffer(frame, dtype=dtype)
                frame = cudautils.to_device(frame)

            arrays.append(get_ctype_ptr(frame))

        # Use from_offsets to get nvstring data.
        # Note: array items = [nbuf, sbuf, obuf]
        scount = header["nvstrings"]
        data = nvstrings.from_offsets(
            arrays[1],
            arrays[2],
            scount,
            nbuf=arrays[0],
            ncount=header["null_count"],
            bdevmem=True,
        )
        return data
Ejemplo n.º 17
0
        Notes
        -----
        The parameters `case`, `flags`, and `na` are not yet supported and
        will raise a NotImplementedError if anything other than the default
        value is set.
        """
        if case is not True:
            raise NotImplementedError("`case` parameter is not yet supported")
        elif flags != 0:
            raise NotImplementedError("`flags` parameter is not yet supported")
        elif na is not np.nan:
            raise NotImplementedError("`na` parameter is not yet supported")

        from cudf.dataframe import Series
        out_dev_arr = rmm.device_array(len(self._parent), dtype='bool')
        ptr = get_ctype_ptr(out_dev_arr)
        self._parent.data.contains(pat, regex=regex, devptr=ptr)

        mask = None
        if self._parent.null_count > 0:
            mask = self._parent.mask

        column = columnops.build_column(Buffer(out_dev_arr),
                                        np.dtype('bool'),
                                        mask=mask)

        return Series(column, index=self._index)

    def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True):
        """
        Replace occurences of pattern/regex in the Series/Index with some other