def serialize(self):
    """
    Pack this column into a ``(header, frames)`` pair.

    Returns
    -------
    header : dict
        Holds ``"null_count"``, the pickled column class under ``"type"``,
        the number of strings under ``"nvstrings"`` and, under
        ``"subheaders"``, one ``__cuda_array_interface__`` copy per frame
        with its ``"dtype"`` entry replaced by the array's dtype string.
    frames : list
        The three device arrays filled by ``to_offsets``, in the order
        ``[nbuf, sbuf, obuf]``.
    """
    header = {"null_count": self._null_count}
    header["type"] = pickle.dumps(type(self))

    # Allocate the device buffers handed to ``to_offsets``: one sized by the
    # string byte count, one with an int32 slot per string plus one, and one
    # sized like a null bitmask for this many rows.
    char_buf = rmm.device_array(self._data.byte_count(), dtype="int8")
    offset_buf = rmm.device_array(len(self._data) + 1, dtype="int32")
    null_buf = rmm.device_array(
        utils.calc_chunk_size(len(self._data), utils.mask_bitsize),
        dtype="int8",
    )
    self.data.to_offsets(
        get_ctype_ptr(char_buf),
        get_ctype_ptr(offset_buf),
        nbuf=get_ctype_ptr(null_buf),
        bdevmem=True,
    )

    # The frame order is part of the wire format and must stay as is.
    frames = [null_buf, char_buf, offset_buf]
    sub_headers = []
    for buf in frames:
        meta = buf.__cuda_array_interface__.copy()
        meta["dtype"] = buf.dtype.str
        sub_headers.append(meta)

    header["nvstrings"] = len(self._data)
    header["subheaders"] = sub_headers
    return header, frames
def as_numerical_column(self, dtype, **kwargs):
    """
    Convert this string column to a numeric (or datetime) column.

    Parameters
    ----------
    dtype : dtype-like
        Requested output dtype.
    **kwargs
        Forwarded to the string-to-numeric conversion routine. ``devptr``
        is always set here, and ``units`` is set for datetime targets.

    Returns
    -------
    The converted column, cast to ``dtype``, carrying this column's null
    mask when it has nulls.
    """
    out_dtype = np.dtype(dtype)
    mem_dtype = out_dtype
    str_dtype = out_dtype

    if out_dtype.type in (np.int8, np.int16):
        # Narrow ints are parsed as int32; the final astype narrows them.
        mem_dtype = str_dtype = np.dtype(np.int32)
    elif out_dtype.type is np.datetime64:
        # Datetimes are written into an int64 buffer; the converter is told
        # the unit taken from the requested dtype.
        kwargs.update(units=np.datetime_data(out_dtype)[0])
        mem_dtype = np.dtype(np.int64)

    out_arr = rmm.device_array(shape=len(self), dtype=mem_dtype)
    kwargs.update({"devptr": get_ctype_ptr(out_arr)})
    convert = _str_to_numeric_typecast_functions[str_dtype]
    convert(self.str(), **kwargs)

    result = columnops.as_column(out_arr)
    if self.null_count > 0:
        # Materialise the string column's null bitmask and attach it.
        bitmask = rmm.device_array(
            utils.calc_chunk_size(len(self.data), utils.mask_bitsize),
            dtype="int8",
        )
        self.data.set_null_bitmask(get_ctype_ptr(bitmask), bdevmem=True)
        result = result.set_mask(Buffer(bitmask))
    return result.astype(out_dtype)
def astype(self, dtype):
    """
    Cast this column to ``dtype``.

    Parameters
    ----------
    dtype : dtype-like
        Target dtype.

    Returns
    -------
    ``self`` when ``dtype`` equals the current dtype; a ``StringColumn``
    when ``dtype`` is object or a unicode string type; otherwise the result
    of casting the column's numerical view.
    """
    # Compare by value, not identity: equal dtypes are not guaranteed to be
    # the same object, and callers may pass a string spec such as
    # "datetime64[ms]".
    if self.dtype == dtype:
        return self

    if dtype == np.dtype('object') or np.issubdtype(
        dtype, np.dtype('U').type
    ):
        if len(self) > 0:
            dev_ptr = get_ctype_ptr(self.data.mem)
            null_ptr = None
            if self.mask is not None:
                null_ptr = get_ctype_ptr(self.mask.mem)
            kwargs = {
                'count': len(self),
                'nulls': null_ptr,
                'bdevmem': True,
                # NOTE(review): the unit is hard-coded to milliseconds;
                # confirm this matches every dtype this column can hold.
                'units': 'ms',
            }
            to_str = string._numeric_to_str_typecast_functions[
                np.dtype(self.dtype)
            ]
            data = to_str(dev_ptr, **kwargs)
        else:
            data = []
        return string.StringColumn(data=data)

    return self.as_numerical.astype(dtype)
def astype(self, dtype):
    """
    Cast this numeric column to ``dtype``.

    Parameters
    ----------
    dtype : dtype-like
        Target dtype.

    Returns
    -------
    ``self`` when the dtype is unchanged; a ``StringColumn`` for object or
    unicode targets (signed-integer sources only); a ``DatetimeColumn``
    view for datetime64 targets; otherwise a copy of this column with its
    data cast to ``dtype``.

    Raises
    ------
    NotImplementedError
        When casting to a string dtype from anything other than a signed
        integer column.
    """
    if self.dtype == dtype:
        return self

    if dtype == np.dtype('object') or np.issubdtype(
        dtype, np.dtype('U').type
    ):
        # Only signed integers can be stringified here. Every other source
        # dtype is rejected explicitly instead of falling through and
        # returning None.
        if not np.issubdtype(self.dtype, np.signedinteger):
            raise NotImplementedError(
                f"Casting object of {self.dtype} dtype "
                "to str dtype is not yet supported"
            )

        import nvstrings

        if len(self) == 0:
            return string.StringColumn(data=nvstrings.to_device([]))

        # NOTE(review): values are narrowed to int32 before conversion, so
        # int64 values outside the int32 range would not survive - confirm
        # whether a 64-bit conversion routine is available.
        dev_ptr = get_ctype_ptr(self.astype('int32').data.mem)
        null_ptr = None
        if self.mask is not None:
            null_ptr = get_ctype_ptr(self.mask.mem)
        return string.StringColumn(
            data=nvstrings.itos(
                dev_ptr,
                count=len(self),
                nulls=null_ptr,
                bdevmem=True,
            )
        )

    if np.issubdtype(dtype, np.datetime64):
        return self.astype('int64').view(
            datetime.DatetimeColumn,
            dtype=dtype,
            data=self.data.astype(dtype),
        )

    return self.replace(data=self.data.astype(dtype), dtype=np.dtype(dtype))
def astype(self, dtype):
    """
    Cast this string column to an integer or floating-point dtype.

    Parameters
    ----------
    dtype : dtype-like
        Target dtype. int8/int16/int32/int64 and float32/float64 are
        supported.

    Returns
    -------
    ``self`` when the dtype is unchanged, otherwise a new column cast to
    ``dtype``.

    Raises
    ------
    NotImplementedError
        When ``dtype`` is neither an integer nor a floating-point dtype
        listed above.
    """
    if self.dtype == dtype:
        return self

    int_dtypes = (
        np.dtype('int8'),
        np.dtype('int16'),
        np.dtype('int32'),
        np.dtype('int64'),
    )
    float_dtypes = (np.dtype('float32'), np.dtype('float64'))

    if dtype in int_dtypes:
        # Strings are parsed as int32; the final astype widens or narrows.
        out_arr = rmm.device_array(shape=len(self), dtype='int32')
        self.str().stoi(devptr=get_ctype_ptr(out_arr))
    elif dtype in float_dtypes:
        # Strings are parsed as float32; the final astype widens if needed.
        out_arr = rmm.device_array(shape=len(self), dtype='float32')
        self.str().stof(devptr=get_ctype_ptr(out_arr))
    else:
        # Without this branch out_arr would be unbound and the method would
        # fail with an unrelated UnboundLocalError.
        raise NotImplementedError(
            f"Casting string column to {dtype} dtype is not yet supported"
        )

    return columnops.as_column(out_arr).astype(dtype)
def element_indexing(self, arg):
    """
    Select element(s) of this string column.

    Parameters
    ----------
    arg : Number, slice, list, numpy.ndarray or DeviceNDArray
        A single position (negative values count from the end), a slice,
        a list, or an array of positions. Host arrays are copied to the
        device first.

    Returns
    -------
    A host value when exactly one string is selected, otherwise a column
    built from the selected strings.

    Raises
    ------
    IndexError
        When a scalar position lies outside ``[-len(self), len(self))``.
    NotImplementedError
        When ``arg`` is of an unsupported type.
    """
    if isinstance(arg, Number):
        arg = int(arg)
        size = len(self)
        if arg < 0:
            arg += size
        # Check both bounds: a negative index below -size is still negative
        # after normalisation and must not reach the underlying data.
        if not 0 <= arg < size:
            raise IndexError("index out of bounds")
        out = self._data[arg]
    elif isinstance(arg, (slice, list)):
        out = self._data[arg]
    elif isinstance(arg, np.ndarray):
        return self.element_indexing(rmm.to_device(arg))
    elif isinstance(arg, DeviceNDArray):
        # NVStrings gather call expects an array of int32s
        arg = cudautils.astype(arg, np.dtype('int32'))
        if len(arg) > 0:
            out = self._data.gather(get_ctype_ptr(arg), len(arg))
        else:
            out = self._data.gather([])
    else:
        raise NotImplementedError(type(arg))

    if len(out) == 1:
        return out.to_host()[0]
    return columnops.as_column(out)
def __init__(self, data, null_count=None, **kwargs):
    """
    Build a string column around an nvstrings object.

    Parameters
    ----------
    data : nvstrings.nvstrings or Sequence
        The strings backing the column. A host sequence is first moved to
        the device with ``nvstrings.to_device``.
    null_count : int; optional
        Number of null entries. Obtained from ``data.null_count()`` when
        not given.
    """
    from collections.abc import Sequence

    if isinstance(data, Sequence):
        data = nvstrings.to_device(data)
    assert isinstance(data, nvstrings.nvstrings)

    self._data = data
    self._dtype = np.dtype("object")
    self._null_count = (
        data.null_count() if null_count is None else null_count
    )

    # A bitmask is only materialised when the column actually has nulls.
    self._mask = None
    if self._null_count > 0:
        bitmask = rmm.device_array(
            utils.calc_chunk_size(len(self.data), utils.mask_bitsize),
            dtype='int8',
        )
        self.data.set_null_bitmask(get_ctype_ptr(bitmask), bdevmem=True)
        self._mask = Buffer(bitmask)

    # Populated on demand elsewhere.
    self._nvcategory = None
    self._indices = None
def indices(self):
    """
    Return the cached category-values buffer, building it on first use.

    On the first call an int32 device array of ``self.nvcategory.size()``
    elements is filled by ``self.nvcategory.values`` and wrapped in a
    ``Buffer``; later calls return that same buffer.
    """
    if self._indices is not None:
        return self._indices

    values_arr = rmm.device_array(self.nvcategory.size(), dtype='int32')
    self.nvcategory.values(devptr=get_ctype_ptr(values_arr))
    self._indices = Buffer(values_arr)
    return self._indices
def as_string_column(self, dtype, **kwargs):
    """
    Convert this numeric column to a ``StringColumn``.

    Parameters
    ----------
    dtype : dtype-like
        Accepted for interface compatibility; not read by this method.
    **kwargs
        Forwarded only to the intermediate ``astype("int32")`` call used
        for int8/int16 columns. The conversion routine itself receives a
        fresh argument set.

    Returns
    -------
    A ``StringColumn`` holding the string form of each value, or an empty
    one when the column has no rows.
    """
    from cudf.dataframe import string

    if len(self) == 0:
        return string.StringColumn(data=[])

    # int8/int16 are widened to int32 before conversion.
    narrow_ints = (np.dtype("int8"), np.dtype("int16"))
    if self.dtype in narrow_ints:
        source = self.astype("int32", **kwargs).data.mem
    else:
        source = self.data.mem
    source_ptr = get_ctype_ptr(source)

    mask_ptr = get_ctype_ptr(self.mask.mem) if self.mask is not None else None

    # Caller kwargs are deliberately replaced here, as in the original code.
    kwargs = {"count": len(self), "nulls": mask_ptr, "bdevmem": True}
    to_str = string._numeric_to_str_typecast_functions[np.dtype(source.dtype)]
    return string.StringColumn(data=to_str(source_ptr, **kwargs))
def astype(self, dtype):
    """
    Cast this numeric column to ``dtype``.

    Returns ``self`` when the dtype is unchanged, a ``StringColumn`` for
    object or unicode targets, a ``DatetimeColumn`` view for datetime64
    targets, and otherwise a copy of this column with its data cast to
    ``dtype``.
    """
    if self.dtype == dtype:
        return self

    wants_string = dtype == np.dtype('object') or np.issubdtype(
        dtype, np.dtype('U').type
    )
    if wants_string:
        if len(self) > 0:
            # int8/int16 are widened to int32 before conversion.
            if self.dtype in (np.dtype('int8'), np.dtype('int16')):
                source = self.astype('int32').data.mem
            else:
                source = self.data.mem
            source_ptr = get_ctype_ptr(source)

            mask_ptr = None
            if self.mask is not None:
                mask_ptr = get_ctype_ptr(self.mask.mem)

            to_str = string._numeric_to_str_typecast_functions[
                np.dtype(source.dtype)
            ]
            data = to_str(
                source_ptr, count=len(self), nulls=mask_ptr, bdevmem=True
            )
        else:
            data = []
        return string.StringColumn(data=data)

    if np.issubdtype(dtype, np.datetime64):
        return self.astype('int64').view(
            datetime.DatetimeColumn,
            dtype=dtype,
            data=self.data.astype(dtype),
        )

    return self.replace(data=self.data.astype(dtype), dtype=np.dtype(dtype))
def as_string_column(self, dtype, **kwargs):
    """
    Convert this column to a ``StringColumn``.

    Parameters
    ----------
    dtype : dtype-like
        Accepted for interface compatibility; not read by this method.
    **kwargs
        Extra options forwarded to the conversion routine. ``count``,
        ``nulls``, ``bdevmem`` and ``units`` are always overwritten here,
        with ``units`` taken from ``self.time_unit``.

    Returns
    -------
    A ``StringColumn`` with one string per row, or an empty one when the
    column has no rows.
    """
    from cudf.dataframe import string

    if len(self) == 0:
        return string.StringColumn(data=[])

    values_ptr = get_ctype_ptr(self.data.mem)
    mask_ptr = get_ctype_ptr(self.mask.mem) if self.mask is not None else None

    kwargs.update(
        {
            "count": len(self),
            "nulls": mask_ptr,
            "bdevmem": True,
            "units": self.time_unit,
        }
    )
    to_str = string._numeric_to_str_typecast_functions[np.dtype(self.dtype)]
    return string.StringColumn(data=to_str(values_ptr, **kwargs))
def sort_by_values(self, ascending=True, na_position="last"):
    """
    Sort the column and return both the sorted values and the ordering.

    Parameters
    ----------
    ascending : bool, default True
        Sort direction, forwarded to the underlying ``order`` call.
    na_position : {"last", "first"}, default "last"
        Where nulls are placed in the result.

    Returns
    -------
    col_keys
        This column gathered in sorted order.
    col_inds
        An int32 column of the positions used for the gather.

    Raises
    ------
    ValueError
        When ``na_position`` is neither ``"first"`` nor ``"last"``.
    """
    # Validate up front; otherwise an unexpected value would leave
    # nullfirst unbound and surface later as an UnboundLocalError.
    if na_position == "last":
        nullfirst = False
    elif na_position == "first":
        nullfirst = True
    else:
        raise ValueError(f"invalid na_position: {na_position!r}")

    idx_dev_arr = rmm.device_array(len(self), dtype="int32")
    dev_ptr = get_ctype_ptr(idx_dev_arr)
    # NOTE(review): the leading 2 is passed through unchanged; its meaning
    # is defined by the nvstrings ``order`` API - confirm there.
    self.data.order(2, asc=ascending, nullfirst=nullfirst, devptr=dev_ptr)

    col_inds = columnops.build_column(
        Buffer(idx_dev_arr), idx_dev_arr.dtype, mask=None
    )
    col_keys = self[col_inds.data.mem]
    return col_keys, col_inds
def sort_by_values(self, ascending=True, na_position="last"):
    """
    Sort the column and return both the sorted values and the ordering.

    Parameters
    ----------
    ascending : bool, default True
        Sort direction, forwarded to the underlying ``order`` call.
    na_position : {"last", "first"}, default "last"
        Where nulls are placed in the result.

    Returns
    -------
    col_keys
        This column gathered in sorted order.
    col_inds
        An int32 ``NumericalColumn`` of the positions used for the gather.

    Raises
    ------
    ValueError
        When ``na_position`` is neither ``"first"`` nor ``"last"``.
    """
    # Validate up front; otherwise an unexpected value would leave
    # nullfirst unbound and surface later as an UnboundLocalError.
    if na_position == "last":
        nullfirst = False
    elif na_position == "first":
        nullfirst = True
    else:
        raise ValueError(f"invalid na_position: {na_position!r}")

    idx_dev_arr = rmm.device_array(len(self), dtype='int32')
    dev_ptr = get_ctype_ptr(idx_dev_arr)
    # NOTE(review): the leading 2 is passed through unchanged; its meaning
    # is defined by the nvstrings ``order`` API - confirm there.
    self.data.order(2, asc=ascending, nullfirst=nullfirst, devptr=dev_ptr)

    col_inds = numerical.NumericalColumn(
        data=Buffer(idx_dev_arr),
        mask=None,
        null_count=0,
        dtype=idx_dev_arr.dtype,
    )
    col_keys = self[col_inds.data.mem]
    return col_keys, col_inds
def astype(self, dtype):
    """
    Cast this string column to a numeric or datetime dtype.

    Returns ``self`` when the dtype is unchanged. Otherwise the strings are
    parsed into a freshly allocated device array by the routine registered
    for the working dtype, and the result is cast to the requested dtype.
    int8/int16 targets are parsed as int32 first.
    """
    if self.dtype == dtype:
        return self

    is_narrow_int = dtype in (np.dtype('int8'), np.dtype('int16'))
    out_dtype = np.dtype(dtype)
    if is_narrow_int:
        # Parse at int32 width; the final astype narrows the result.
        dtype = np.dtype('int32')

    out_arr = rmm.device_array(shape=len(self), dtype=dtype)
    kwargs = {'devptr': get_ctype_ptr(out_arr)}
    if dtype == np.dtype('datetime64[ms]'):
        kwargs['units'] = 'ms'

    convert = _str_to_numeric_typecast_functions[np.dtype(dtype)]
    convert(self.str(), **kwargs)

    return columnops.as_column(out_arr).astype(out_dtype)
def len(self):
    """
    Compute the length of every element in the Series/Index.

    Returns
    -------
    Series or Index of int
        Integer values giving the length of each element. The parent
        column's mask is reused when the parent has nulls.
    """
    from cudf.dataframe.series import Series

    parent = self._parent
    lengths = rmm.device_array(len(parent), dtype='int32')
    parent.data.len(get_ctype_ptr(lengths))

    # Only carry a mask over when there is something to mask.
    mask = parent.mask if parent.null_count > 0 else None

    column = columnops.build_column(
        Buffer(lengths), np.dtype('int32'), mask=mask
    )
    return Series(column, index=self._index)
def deserialize(cls, header, frames):
    """
    Rebuild nvstrings data from a serialized ``(header, frames)`` pair.

    Parameters
    ----------
    header : dict
        Must provide ``"subheaders"`` (one entry per frame), ``"nvstrings"``
        (the string count) and ``"null_count"``.
    frames : sequence
        The mask, value and offset frames, in the order
        ``[nbuf, sbuf, obuf]``. Each may be a ``memoryview``, a NumPy
        array, device memory, or another buffer-like object.

    Returns
    -------
    The object produced by ``nvstrings.from_offsets``.
    """
    arrays = []
    # Keep every array referenced until from_offsets has consumed the
    # pointers. NOTE(review): this assumes get_ctype_ptr does not itself
    # keep the array alive - harmless if it does.
    keepalive = []

    for i, frame in enumerate(frames):
        if isinstance(frame, memoryview):
            dtype = header["subheaders"][i]["dtype"]
            frame = cudautils.to_device(np.frombuffer(frame, dtype=dtype))
        elif not (
            isinstance(frame, np.ndarray)
            or numba.cuda.driver.is_device_memory(frame)
        ):
            # this is probably a ucp_py.BufferRegion memory object
            # check the header for info -- this should be encoded from
            # serialization process. Lastly, `typestr` and `shape` *must*
            # manually set *before* consuming the buffer as a DeviceNDArray
            sheader = header["subheaders"][i]
            # Bind dtype in this branch too: previously it was only set in
            # the memoryview branch, so this path hit an unbound (or stale)
            # name.
            dtype = sheader.get("dtype", "B")
            frame.typestr = dtype
            frame.shape = sheader.get("shape", len(frame))
            frame = cudautils.to_device(np.frombuffer(frame, dtype=dtype))

        keepalive.append(frame)
        arrays.append(get_ctype_ptr(frame))

    # Use from_offsets to get nvstring data.
    # Note: array items = [nbuf, sbuf, obuf]
    scount = header["nvstrings"]
    data = nvstrings.from_offsets(
        arrays[1],
        arrays[2],
        scount,
        nbuf=arrays[0],
        ncount=header["null_count"],
        bdevmem=True,
    )
    return data
Notes ----- The parameters `case`, `flags`, and `na` are not yet supported and will raise a NotImplementedError if anything other than the default value is set. """ if case is not True: raise NotImplementedError("`case` parameter is not yet supported") elif flags != 0: raise NotImplementedError("`flags` parameter is not yet supported") elif na is not np.nan: raise NotImplementedError("`na` parameter is not yet supported") from cudf.dataframe import Series out_dev_arr = rmm.device_array(len(self._parent), dtype='bool') ptr = get_ctype_ptr(out_dev_arr) self._parent.data.contains(pat, regex=regex, devptr=ptr) mask = None if self._parent.null_count > 0: mask = self._parent.mask column = columnops.build_column(Buffer(out_dev_arr), np.dtype('bool'), mask=mask) return Series(column, index=self._index) def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): """ Replace occurences of pattern/regex in the Series/Index with some other