def buffers_from_pyarrow(pa_arr, dtype=None):
    """Split a pyarrow array into a ``(mask, data)`` pair of Buffers.

    Parameters
    ----------
    pa_arr : pyarrow array
        ``pa_arr.buffers()[0]`` is read as the validity buffer and
        ``pa_arr.buffers()[1]`` as the values buffer.
    dtype : optional
        dtype used to reinterpret the values buffer.  When ``None``, the
        dtype comes from ``pa_arr.type`` (or from ``pa_arr.indices.type``
        for a ``pa.DictionaryArray``).

    Returns
    -------
    tuple
        ``(pamask, padata)``.  ``pamask`` is ``None`` when the validity
        buffer is missing/empty; ``padata`` is an empty Buffer of the
        resolved dtype when the values buffer is missing/empty.
    """
    from cudf.dataframe.buffer import Buffer
    from cudf.utils.cudautils import copy_array

    buffers = pa_arr.buffers()

    if buffers[0]:
        # Copy the arrow validity bytes into a mask array sized by
        # make_mask(len(pa_arr)).
        # NOTE(review): the validity bytes are taken from the start of the
        # buffer while the values below are sliced by pa_arr.offset --
        # confirm this is correct for sliced arrays.
        mask_dev_array = make_mask(len(pa_arr))
        arrow_dev_array = rmm.to_device(np.array(buffers[0]).view('int8'))
        copy_array(arrow_dev_array, mask_dev_array)
        pamask = Buffer(mask_dev_array)
    else:
        pamask = None

    # Compare against None explicitly: relying on the truthiness of a dtype
    # object is fragile (older NumPy evaluates bool(np.dtype('int64')) as
    # False), which would silently discard a caller-supplied dtype.
    if dtype is not None:
        new_dtype = dtype
    elif isinstance(pa_arr, pa.DictionaryArray):
        new_dtype = pa_arr.indices.type.to_pandas_dtype()
    else:
        new_dtype = pa_arr.type.to_pandas_dtype()

    if buffers[1]:
        start = pa_arr.offset
        stop = start + len(pa_arr)
        padata = Buffer(np.array(buffers[1]).view(new_dtype)[start:stop])
    else:
        padata = Buffer(np.empty(0, dtype=new_dtype))

    return (pamask, padata)
def column_empty(row_count, dtype, masked, categories=None):
    """Allocate a new column with ``row_count`` rows of the given dtype.

    Parameters
    ----------
    row_count : int
        Number of rows to allocate.
    dtype : dtype-like
        Normalised through ``pd.api.types.pandas_dtype``.
    masked : bool
        When true, a mask from ``cudautils.make_empty_mask`` is attached.
    categories : optional
        Explicit categories.  When omitted and ``dtype`` is categorical,
        the dtype's own categories are used (an empty list if it has none).

    Returns
    -------
    The result of ``columnops.build_column``.
    """
    from cudf.dataframe.columnops import build_column

    dtype = pd.api.types.pandas_dtype(dtype)
    mask = cudautils.make_empty_mask(row_count) if masked else None

    if categories is None and is_categorical_dtype(dtype):
        categories = dtype.categories if dtype.categories is not None else []

    if categories is not None:
        # Codes are stored with the smallest dtype able to hold the
        # number of categories.
        codes_dtype = min_scalar_type(len(categories))
        data = Buffer(rmm.device_array((row_count, ), dtype=codes_dtype))
        dtype = "category"
    elif dtype.kind in "OU":
        if row_count == 0:
            data = nvstrings.to_device([])
        else:
            # String columns are produced by converting a float64 array
            # through nvstrings.dtos, passing the raw mask as ``nulls``.
            scratch = rmm.device_array((row_count, ), dtype="float64")
            data = nvstrings.dtos(
                scratch, len(scratch), nulls=mask, bdevmem=True)
    else:
        data = Buffer(rmm.device_array((row_count, ), dtype=dtype))

    mask_buf = None if mask is None else Buffer(mask)
    return build_column(data, dtype, mask_buf, categories)
def from_mem_views(data_mem, mask_mem=None, null_count=None, name=None):
    """Build a Column from a data array (or nvstrings object) and an
    optional mask array.

    An ``nvstrings.nvstrings`` input is forwarded unchanged with dtype
    ``object`` and no mask argument; anything else is wrapped in a
    ``Buffer`` and keeps ``data_mem.dtype``.
    """
    from cudf.dataframe import columnops

    if isinstance(data_mem, nvstrings.nvstrings):
        return columnops.build_column(
            name=name,
            buffer=data_mem,
            dtype=np.dtype("object"),
            null_count=null_count,
        )

    data_buf = Buffer(data_mem)
    mask_buf = Buffer(mask_mem) if mask_mem is not None else None
    return columnops.build_column(
        name=name,
        buffer=data_buf,
        dtype=data_mem.dtype,
        mask=mask_buf,
        null_count=null_count,
    )
def take(self, indices, ignore_index=False):
    """Return a Series made of the values found at *indices*.

    Parameters
    ----------
    indices : array-like
        Positions to gather; wrapped in a ``Buffer`` and converted with
        ``to_gpu_array``.
    ignore_index : bool, default False
        When true the result gets a fresh ``RangeIndex``; otherwise the
        current index is gathered with the same positions.
    """
    gather_map = Buffer(indices).to_gpu_array()

    # Nothing selected: return empty slices of both data and index.
    if gather_map.size == 0:
        return self._copy_construct(data=self.data[:0],
                                    index=self.index[:0])

    # Object-dtype series are delegated to __getitem__.
    if self.dtype == np.dtype("object"):
        return self[gather_map]

    gathered = cudautils.gather(data=self.data.to_gpu_array(),
                                index=gather_map)

    if self._column.mask:
        taken_mask = self._get_mask_as_series().take(gather_map).as_mask()
        mask = Buffer(taken_mask)
    else:
        mask = None

    if ignore_index:
        index = RangeIndex(gather_map.size)
    else:
        index = self.index.take(gather_map)

    col = self._column.replace(data=Buffer(gathered), mask=mask)
    return self._copy_construct(data=col, index=index)
def _mask_from_cuda_array_interface_desc(desc):
    """Extract a mask Buffer from a CUDA array interface description.

    Returns ``None`` when ``desc`` carries no ``"mask"`` entry.  The mask
    object's own ``__cuda_array_interface__`` is inspected and the second
    character of its ``typestr`` selects the handling:

    * ``"t"`` -- the pointer is wrapped directly as ``mask_dtype`` with
      ``calc_chunk_size(nelem, mask_bitsize)`` elements.
    * ``"b"`` -- the pointer is wrapped with the described dtype and then
      passed through ``compact_mask_bytes``.

    Raises
    ------
    NotImplementedError
        For any other type code.
    """
    from cudf.utils.utils import calc_chunk_size, mask_dtype, mask_bitsize
    from cudf.utils.cudautils import compact_mask_bytes

    mask = desc.get("mask", None)
    if mask is None:
        return mask

    mask_desc = mask.__cuda_array_interface__
    ptr = mask_desc["data"][0]
    nelem = mask_desc["shape"][0]
    typestr = mask_desc["typestr"]
    typecode = typestr[1]

    if typecode == "t":
        wrapped = rmm.device_array_from_ptr(
            ptr,
            nelem=calc_chunk_size(nelem, mask_bitsize),
            dtype=mask_dtype,
            finalizer=None,
        )
        return Buffer(wrapped)

    if typecode == "b":
        bytemask = rmm.device_array_from_ptr(
            ptr, nelem=nelem, dtype=np.dtype(typestr), finalizer=None)
        return Buffer(compact_mask_bytes(bytemask))

    raise NotImplementedError(
        f"Cannot infer mask from typestr {typestr}")
def prefixsum(vals):
    """Compute the full prefix sum of *vals*.

    For an input of N elements the output has N + 1 elements: element 0
    is always 0 and the last element is the sum of *vals*.
    """
    import cudf.bindings.reduce as cpp_reduce
    from cudf.dataframe.numerical import NumericalColumn
    from cudf.dataframe.buffer import Buffer

    def _as_column(arr):
        # Wrap an array as a mask-less numerical column of vals' dtype.
        return NumericalColumn(data=Buffer(arr), mask=None, null_count=0,
                               dtype=vals.dtype)

    # Output has one extra leading slot, which is filled with 0.
    result = rmm.device_array(shape=vals.size + 1, dtype=vals.dtype)
    gpu_fill_value[1, 1](result[:1], 0)

    # The inclusive scan writes into result[1:], right after that zero.
    source_col = _as_column(vals)
    target_col = _as_column(result[1:])
    cpp_reduce.apply_scan(source_col, target_col, 'sum', inclusive=True)
    return result
def buffers_from_pyarrow(pa_arr, dtype=None):
    """Split a pyarrow array into a ``(mask, data)`` pair of Buffers.

    Parameters
    ----------
    pa_arr : pyarrow array
        ``pa_arr.buffers()[0]`` is read as the validity buffer and
        ``pa_arr.buffers()[1]`` as the values buffer.
    dtype : optional
        dtype used to reinterpret the values buffer.  When ``None``, the
        dtype comes from ``pa_arr.type`` (or from ``pa_arr.indices.type``
        for a ``pa.DictionaryArray``).

    Returns
    -------
    tuple
        ``(pamask, padata)``.  ``pamask`` is ``None`` when the validity
        buffer is missing/empty; ``padata`` is an empty Buffer of the
        resolved dtype when the values buffer is missing/empty.
    """
    from cudf.dataframe.buffer import Buffer

    buffers = pa_arr.buffers()

    # NOTE(review): the validity bytes are wrapped from the start of the
    # buffer while the values below are sliced by pa_arr.offset -- confirm
    # this is correct for sliced arrays.
    if buffers[0]:
        pamask = Buffer(np.array(buffers[0]).view('int8'))
    else:
        pamask = None

    # Compare against None explicitly: relying on the truthiness of a dtype
    # object is fragile (older NumPy evaluates bool(np.dtype('int64')) as
    # False), which would silently discard a caller-supplied dtype.
    if dtype is not None:
        new_dtype = dtype
    elif isinstance(pa_arr, pa.DictionaryArray):
        new_dtype = pa_arr.indices.type.to_pandas_dtype()
    else:
        new_dtype = pa_arr.type.to_pandas_dtype()

    if buffers[1]:
        start = pa_arr.offset
        stop = start + len(pa_arr)
        padata = Buffer(np.array(buffers[1]).view(new_dtype)[start:stop])
    else:
        padata = Buffer(np.empty(0, dtype=new_dtype))

    return (pamask, padata)
def column_empty_like(column, dtype, masked):
    """Allocate a new column with the same length as *column*.

    The data array has ``len(column)`` elements of ``dtype``.  When
    ``masked`` is true, a mask from ``utils.make_mask`` is attached and
    ``null_count`` is set to the number of elements.
    """
    data = rmm.device_array(shape=len(column), dtype=dtype)
    data_buf = Buffer(data)

    if not masked:
        return Column(data=data_buf)

    mask_buf = Buffer(utils.make_mask(data.size))
    return Column(data=data_buf, mask=mask_buf, null_count=data.size)
def value_counts(self, method='sort'):
    """Return ``(values, counts)`` columns for this column.

    Only ``method='sort'`` is supported; anything else raises
    ``NotImplementedError``.  Values are gathered from the sorted values
    at the segment positions returned by ``_unique_segments``; counts come
    from ``cudautils.value_count`` and are typed ``np.intp``.
    """
    if method != 'sort':
        raise NotImplementedError(
            'non sort based value_count() not implemented yet')

    segs, sortedvals = self._unique_segments()

    uniques = cudautils.gather(data=sortedvals, index=segs)
    counts = cudautils.value_count(segs, len(sortedvals))

    values_col = self.replace(data=Buffer(uniques), mask=None)
    counts_col = NumericalColumn(data=Buffer(counts), dtype=np.intp)
    return values_col, counts_col
def from_cffi_view(cffi_view):
    """Create a Column object from a cffi struct gdf_column*.

    The data and mask memory come from ``_gdf.cffi_view_to_column_mem``;
    the mask is only wrapped when one is present.
    """
    data_mem, mask_mem = _gdf.cffi_view_to_column_mem(cffi_view)
    data_buf = Buffer(data_mem)
    mask_buf = None if mask_mem is None else Buffer(mask_mem)
    return Column(data=data_buf, mask=mask_buf)
def _concat(cls, objs, dtype=None):
    """Concatenate the columns in *objs* into a single column.

    Parameters
    ----------
    objs : sequence of columns
        Columns to concatenate.
    dtype : optional
        Only consulted when *objs* is empty, to choose which kind of
        empty column to return.

    Raises
    ------
    ValueError
        If the columns are not all type-equivalent to the first one.
    """
    from cudf.dataframe.string import StringColumn
    from cudf.dataframe.categorical import CategoricalColumn

    # No inputs: return an empty column matching the requested dtype.
    if len(objs) == 0:
        if pd.api.types.is_categorical_dtype(dtype):
            empty_codes = Column(Buffer.null(np.dtype('int8')))
            return CategoricalColumn(data=empty_codes, null_count=0,
                                     ordered=False)
        if dtype == np.dtype('object'):
            return StringColumn(data=nvstrings.to_device([]), null_count=0)
        return Column(Buffer.null(np.dtype(dtype)))

    # String columns are joined through nvstrings directly.
    if all(isinstance(o, StringColumn) for o in objs):
        string_data = [o._data for o in objs]
        return StringColumn(data=nvstrings.from_strings(*string_data))

    # Categoricals are first re-coded against the union of their values.
    if all(isinstance(o, CategoricalColumn) for o in objs):
        new_cats = tuple({val for o in objs for val in o})
        objs = [o.cat()._set_categories(new_cats) for o in objs]

    head = objs[0]
    if any(not o.is_type_equivalent(head) for o in objs):
        raise ValueError("All series must be of same type")

    # Zero-length inputs contribute nothing; drop them.
    objs = [o for o in objs if len(o) > 0]

    nulls = sum(o.null_count for o in objs)
    newsize = sum(len(o) for o in objs)
    mem = rmm.device_array(shape=newsize, dtype=head.data.dtype)
    data = Buffer.from_empty(mem, size=newsize)

    # An output mask is only allocated when some input has nulls.
    mask = Buffer(utils.make_mask(newsize)) if nulls else None
    col = head.replace(data=data, mask=mask, null_count=nulls)

    # Perform the actual concatenation.
    if newsize > 0:
        col = _gdf._column_concat(objs, col)
    return col
def __init__(self, values, name=None):
    """Store *values* and *name* on the instance.

    A categorical ``pd.Series``, ``pd.Categorical`` or
    ``pd.CategoricalIndex`` is converted to a ``CategoricalColumn`` built
    from its codes, categories and ordered flag; any other input is
    stored unchanged.
    """
    is_categorical_series = (
        isinstance(values, pd.Series)
        and pd.api.types.is_categorical_dtype(values.dtype)
    )

    if is_categorical_series:
        accessor = values.cat
        values = CategoricalColumn(
            data=Buffer(accessor.codes.values),
            categories=accessor.categories.tolist(),
            ordered=accessor.ordered,
        )
    elif isinstance(values, (pd.Categorical, pd.CategoricalIndex)):
        values = CategoricalColumn(
            data=Buffer(values.codes),
            categories=values.categories.tolist(),
            ordered=values.ordered,
        )

    self._values = values
    self.name = name
    self.names = [name]
def from_cffi_view(cffi_view):
    """Create a Column object from a cffi struct gdf_column*.

    The dtype is translated with ``_gdf.gdf_to_np_dtype``.  An nvstrings
    payload is passed straight to ``build_column`` without a mask; other
    payloads are wrapped in ``Buffer`` objects first.
    """
    from cudf.dataframe import columnops

    data_mem, mask_mem = _gdf.cffi_view_to_column_mem(cffi_view)
    dtype = _gdf.gdf_to_np_dtype(cffi_view.dtype)

    if isinstance(data_mem, nvstrings.nvstrings):
        return columnops.build_column(data_mem, dtype)

    data_buf = Buffer(data_mem)
    mask_buf = Buffer(mask_mem) if mask_mem is not None else None
    return columnops.build_column(data_buf, dtype, mask=mask_buf)
def test_buffer_basic():
    """A Buffer built from a host array reports matching size/capacity
    and round-trips its contents through ``mem.copy_to_host()``."""
    n = 10
    host_values = np.arange(n, dtype=np.float64)
    buf = Buffer(host_values)

    assert buf.size == n
    assert buf.capacity == n

    # Compare against a freshly built reference array.
    expected = np.arange(n, dtype=np.float64)
    np.testing.assert_equal(buf.mem.copy_to_host(), expected)
def as_numerical_column(self, dtype, **kwargs):
    """Convert this string column to a numerical column of *dtype*.

    The conversion writes into an array whose dtype may differ from the
    requested one: int8/int16 requests are converted as int32, and
    datetime64 requests are written as int64 with the datetime unit
    forwarded as ``units``.  The result is cast to *dtype* at the end.
    Extra keyword arguments are forwarded to the typecast function.
    """
    out_dtype = np.dtype(dtype)
    mem_dtype = out_dtype   # dtype of the array the converter fills
    str_dtype = out_dtype   # key used to pick the converter

    if out_dtype.type in (np.int8, np.int16):
        mem_dtype = np.dtype(np.int32)
        str_dtype = mem_dtype
    elif out_dtype.type is np.datetime64:
        kwargs.update(units=np.datetime_data(out_dtype)[0])
        mem_dtype = np.dtype(np.int64)

    out_arr = rmm.device_array(shape=len(self), dtype=mem_dtype)
    kwargs.update({"devptr": get_ctype_ptr(out_arr)})

    converter = _str_to_numeric_typecast_functions[str_dtype]
    converter(self.str(), **kwargs)

    out_col = columnops.as_column(out_arr)

    if self.null_count > 0:
        # Carry the nulls over by asking the string data to write its
        # null bitmask into a new int8 array.
        mask_size = utils.calc_chunk_size(len(self.data),
                                          utils.mask_bitsize)
        out_mask_arr = rmm.device_array(mask_size, dtype="int8")
        self.data.set_null_bitmask(get_ctype_ptr(out_mask_arr),
                                   bdevmem=True)
        out_col = out_col.set_mask(Buffer(out_mask_arr))

    return out_col.astype(out_dtype)
def __setitem__(self, key, value):
    """Set ``self[key]`` to *value*.

    *key* may be a unit-stride slice or anything ``columnops.as_column``
    accepts; a boolean key must have the same length as the column.
    *value* is coerced to ``self.dtype``; a scalar is broadcast to the
    number of selected elements.

    Raises
    ------
    NotImplementedError
        If a slice key has a stride other than 1.
    ValueError
        If a boolean key has the wrong length, or *value* does not match
        the number of selected elements.
    """
    import cudf.bindings.copying as cpp_copying
    from cudf.dataframe import columnops

    key_is_slice = isinstance(key, slice)

    if key_is_slice:
        key_start, key_stop, key_stride = key.indices(len(self))
        if key_stride != 1:
            raise NotImplementedError("Stride not supported in slice")
        # NOTE(review): abs() reports a positive length even when
        # key_stop < key_start -- confirm this is intended.
        nelem = abs(key_stop - key_start)
    else:
        key = columnops.as_column(key)
        if pd.api.types.is_bool_dtype(key.dtype):
            if len(key) != len(self):
                raise ValueError(
                    "Boolean mask must be of same length as column")
            # Turn the boolean key into the positions it selects.
            positions = columnops.as_column(cudautils.arange(len(self)))
            key = positions[key]
        nelem = len(key)

    if utils.is_scalar(value):
        if is_categorical_dtype(self.dtype):
            from cudf.dataframe.categorical import CategoricalColumn
            from cudf.dataframe.buffer import Buffer
            from cudf.utils.cudautils import fill_value

            codes = rmm.device_array(nelem, dtype="int8")
            fill_value(codes, self._encode(value))
            value = CategoricalColumn(
                data=Buffer(codes),
                categories=self._categories,
                ordered=False,
            )
        elif value is None:
            value = columnops.column_empty(nelem, self.dtype, masked=True)
        else:
            to_dtype = pd.api.types.pandas_dtype(self.dtype)
            value = utils.scalar_broadcast_to(value, nelem, to_dtype)

    value = columnops.as_column(value).astype(self.dtype)

    if len(value) != nelem:
        msg = (f"Size mismatch: cannot set value "
               f"of size {len(value)} to indexing result of size "
               f"{nelem}")
        raise ValueError(msg)

    if key_is_slice:
        out = cpp_copying.apply_copy_range(self, value, key_start,
                                           key_stop, 0)
    else:
        out = cpp_copying.apply_scatter(value, key, self)

    # Adopt the result's buffers and refresh the cached null count.
    self._data = out.data
    self._mask = out.mask
    self._update_null_count()
def reverse(self):
    """Return a new Series with both the values and the index reversed."""
    reversed_values = cudautils.reverse_array(self.to_gpu_array())
    reversed_index = as_index(
        cudautils.reverse_array(self.index.gpu_values))
    col = self._column.replace(data=Buffer(reversed_values))
    return self._copy_construct(data=col, index=reversed_index)
def __getitem__(self, arg):
    """Index the column by a number, a slice, or an array of positions.

    * A ``Number`` is converted to ``int`` and handed to
      ``element_indexing``.
    * A slice returns a column over the sliced data; when the column has
      nulls the mask is sliced as well, and only unit steps are supported.
    * A list or ``np.ndarray`` is copied with ``rmm.to_device`` and then
      handled like a ``DeviceNDArray``, which is passed to ``take``.

    Raises
    ------
    NotImplementedError
        For a non-unit step on a column with nulls, or for any other
        argument type.
    """
    if isinstance(arg, Number):
        return self.element_indexing(int(arg))

    if isinstance(arg, slice):
        if not self.null_count > 0:
            return self.replace(data=self.data[arg])

        if arg.step is not None and arg.step != 1:
            raise NotImplementedError(arg)

        subdata = self.data[arg]
        # Expand the mask, slice it like the data, then compact it again.
        bytemask = cudautils.expand_mask_bits(
            self.data.size,
            self.mask.to_gpu_array(),
        )
        submask = Buffer(cudautils.compact_mask_bytes(bytemask[arg]))
        return self.replace(data=subdata, mask=submask)

    if isinstance(arg, (list, np.ndarray)):
        arg = rmm.to_device(np.array(arg))

    if not isinstance(arg, DeviceNDArray):
        raise NotImplementedError(type(arg))
    return self.take(arg)
def as_numerical(self):
    """View this column as a ``NumericalColumn`` whose data is the same
    memory reinterpreted as ``np.int64``."""
    from cudf.dataframe import numerical

    int_view = self.data.mem.view(np.int64)
    data = Buffer(int_view)
    return self.view(numerical.NumericalColumn, data=data,
                     dtype=data.dtype)
def __init__(self, data, null_count=None, **kwargs):
    """
    Parameters
    ----------
    data : nvstrings.nvstrings
        The nvstrings object.  A ``collections.abc.Sequence`` is also
        accepted and converted with ``nvstrings.to_device``.
    null_count : int; optional
        The number of null values in the mask.  Taken from
        ``data.null_count()`` when omitted.
    **kwargs
        Accepted but not used by this method.
    """
    from collections.abc import Sequence

    if isinstance(data, Sequence):
        data = nvstrings.to_device(data)
    assert isinstance(data, nvstrings.nvstrings)

    self._data = data
    self._dtype = np.dtype("object")
    self._null_count = (
        data.null_count() if null_count is None else null_count
    )

    self._mask = None
    if self._null_count > 0:
        # Have the nvstrings object write its null bitmask into a new
        # int8 array and keep it as the column mask.
        mask_size = utils.calc_chunk_size(len(self.data),
                                          utils.mask_bitsize)
        out_mask_arr = rmm.device_array(mask_size, dtype='int8')
        self.data.set_null_bitmask(get_ctype_ptr(out_mask_arr),
                                   bdevmem=True)
        self._mask = Buffer(out_mask_arr)

    # Start unset; see the accessors that populate these.
    self._nvcategory = None
    self._indices = None
def column_select_by_position(column, positions):
    """Select from *column* by a series of dtype int64 giving positions.

    The column must have no nulls (asserted).

    Returns (selected_column, selected_positions)
    """
    from cudf.dataframe.numerical import NumericalColumn

    assert column.null_count == 0

    gathered = cudautils.gather(column.data.to_gpu_array(),
                                positions.data.to_gpu_array())
    selected_values = column.replace(data=Buffer(gathered))

    position_buf = Buffer(positions.data.to_gpu_array())
    selected_positions = NumericalColumn(data=position_buf,
                                         dtype=position_buf.dtype)
    return selected_values, selected_positions
def sort_by_values(self, ascending=True, na_position="last"):
    """Sort the column by value.

    Returns
    -------
    tuple
        ``(col_keys, col_inds)``: the sorted values (with the mask
        reordered the same way when one exists) and a column holding the
        sorted indices from ``get_sorted_inds``.
    """
    sort_inds = get_sorted_inds(self, ascending, na_position)
    sorted_data = cudautils.gather(data=self.data.mem,
                                   index=sort_inds.data.mem)

    if self.mask:
        mask_col = self._get_mask_as_column()
        reordered = mask_col.take(sort_inds.data.to_gpu_array())
        sorted_mask = Buffer(reordered.as_mask())
    else:
        sorted_mask = None

    col_keys = self.replace(data=Buffer(sorted_data),
                            mask=sorted_mask,
                            null_count=self.null_count,
                            dtype=self.dtype)
    col_inds = self.replace(data=sort_inds.data,
                            mask=sort_inds.mask,
                            dtype=sort_inds.data.dtype)
    return col_keys, col_inds
def as_column(self):
    """Return the values of this index as a ``NumericalColumn``.

    A non-empty index is built with ``cudautils.arange`` from ``_start``
    to ``_stop``; an empty one gets a zero-length array of the same dtype.
    """
    if len(self) > 0:
        values = cudautils.arange(self._start, self._stop,
                                  dtype=self.dtype)
    else:
        values = rmm.device_array(0, dtype=self.dtype)

    return NumericalColumn(data=Buffer(values), dtype=values.dtype,
                           name=self.name)
def column_select_by_boolmask(column, boolmask):
    """Select from *column* by a boolean mask.

    The column must have no nulls (asserted).

    Returns (selected_column, selected_positions)
    """
    from cudf.dataframe.numerical import NumericalColumn

    # We don't properly handle the boolmask yet
    assert column.null_count == 0

    packed_mask = cudautils.compact_mask_bytes(boolmask.to_gpu_array())
    all_positions = cudautils.arange(len(boolmask))

    # Only the second element of each copy_to_dense result is used.
    _, kept_positions = cudautils.copy_to_dense(all_positions,
                                                mask=packed_mask)
    _, kept_values = cudautils.copy_to_dense(column.data.to_gpu_array(),
                                             mask=packed_mask)

    selected_values = column.replace(data=Buffer(kept_values))
    position_buf = Buffer(kept_positions)
    selected_positions = NumericalColumn(data=position_buf,
                                         dtype=position_buf.dtype)
    return selected_values, selected_positions
def indices(self):
    """Return the indices Buffer, computing and caching it on first use.

    The values are written by ``self.nvcategory.values`` into a new int32
    array with ``self.nvcategory.size()`` elements.
    """
    if self._indices is not None:
        return self._indices

    category = self.nvcategory
    out_dev_arr = rmm.device_array(category.size(), dtype='int32')
    self.nvcategory.values(devptr=get_ctype_ptr(out_dev_arr))
    self._indices = Buffer(out_dev_arr)
    return self._indices
def _sortjoin(self, other, how='left', return_indexers=False):
    """Join with another column.

    When the column is an index, set *return_indexers* to obtain the
    indices for shuffling the remaining columns.

    Returns
    -------
    ``joined_index``, or ``(joined_index, indexers)`` when
    *return_indexers* is true, where ``indexers`` is a ``(left, right)``
    pair of Series.

    Raises
    ------
    TypeError
        If *other* is not type-equivalent to this column.
    """
    from cudf.dataframe.series import Series

    if not self.is_type_equivalent(other):
        raise TypeError('*other* is not compatible')

    lkey, largsort = self.sort_by_values(True)
    rkey, rargsort = other.sort_by_values(True)

    join_ctx = _gdf.apply_join([lkey], [rkey], how=how, method='sort')
    with join_ctx as (lidx, ridx):
        if lidx.size > 0:
            raw_index = cudautils.gather_joined_index(
                lkey.to_gpu_array(),
                rkey.to_gpu_array(),
                lidx,
                ridx,
            )
            buf_index = Buffer(raw_index)
        else:
            buf_index = Buffer.null(dtype=self.dtype)

        joined_index = lkey.replace(data=buf_index)

        if not return_indexers:
            return joined_index

        def _reorder(idxrange, idx):
            # Positions equal to -1 are masked out of the take result and
            # then filled back in as -1.
            mask = (Series(idx) != -1).as_mask()
            return idxrange.take(idx).set_mask(mask).fillna(-1)

        if len(joined_index) > 0:
            indexers = (
                _reorder(Series(largsort), lidx),
                _reorder(Series(rargsort), ridx),
            )
        else:
            indexers = (
                Series(Buffer.null(dtype=np.intp)),
                Series(Buffer.null(dtype=np.intp)),
            )
        return joined_index, indexers
def round(self, decimals=0):
    """Return a new ``NumericalColumn`` with the data passed through
    ``cudautils.apply_round(..., decimals)``.

    The existing null mask, if any, is reused and the dtype is kept.
    """
    mask = self.nullmask if self.has_null_mask else None
    rounded = cudautils.apply_round(self.data.mem, decimals)
    return NumericalColumn(data=Buffer(rounded), mask=mask,
                           dtype=self.dtype)
def take(self, indices, ignore_index=False):
    """Return a Column made of the values found at *indices*.

    An empty *indices* returns ``self.copy()``.  ``ignore_index`` is
    accepted but not used by this method.
    """
    gather_map = Buffer(indices).to_gpu_array()

    # Handle zero size
    if gather_map.size == 0:
        return self.copy()

    gathered = cudautils.gather(data=self._data.to_gpu_array(),
                                index=gather_map)

    if self._mask:
        # Reorder the mask with the same gather map as the data.
        taken_mask = self._get_mask_as_column().take(gather_map).as_mask()
        mask = Buffer(taken_mask)
    else:
        mask = None

    return self.replace(data=Buffer(gathered), mask=mask)
def from_dlpack(pycapsule_obj):
    """Converts from a DLPack tensor to a cuDF object.

    DLPack is an open-source memory tensor structure:
    `dmlc/dlpack <https://github.com/dmlc/dlpack>`_.

    This function takes a PyCapsule object which contains a pointer to
    a DLPack tensor as input, and returns a cuDF object. This function deep
    copies the data in the DLPack tensor into a cuDF object.

    Parameters
    ----------
    pycapsule_obj : PyCapsule
        Input DLPack tensor pointer which is encapsulated in a PyCapsule
        object.

    Returns
    -------
    A cuDF DataFrame or Series depending on if the input DLPack tensor is 1D
    or 2D.

    Raises
    ------
    ValueError
        If the binding reports ``GDF_DATASET_EMPTY`` (a tensor of 0 size).
    GDFError
        Any other error from the binding is re-raised unchanged.
    """
    try:
        res, valids = cpp_dlpack.from_dlpack(pycapsule_obj)
    except GDFError as err:
        if str(err) == "b'GDF_DATASET_EMPTY'":
            # Chain explicitly so the original GDFError stays attached
            # as the cause of the friendlier ValueError.
            raise ValueError(
                "Cannot create a cuDF Object from a DLPack tensor of 0 size"
            ) from err
        # Bare raise keeps the original traceback intact.
        raise

    cols = []
    for idx, valid in enumerate(valids):
        values = res[idx]
        mask = Buffer(valid) if valid else None
        cols.append(
            columnops.build_column(Buffer(values),
                                   dtype=values.dtype,
                                   mask=mask))

    # One column becomes a Series; otherwise build a DataFrame keyed by
    # column position.
    if len(cols) == 1:
        return Series(cols[0])

    df = DataFrame()
    for idx, col in enumerate(cols):
        df[idx] = col
    return df
def normalize_binop_value(self, other):
    """Turn the scalar *other* into a column comparable with this one.

    *other* is encoded with ``self._encode`` and broadcast to the length
    of this column using the data's dtype; the result keeps this column's
    dtype, categories and ordered flag.
    """
    encoded = self._encode(other)
    broadcast = utils.scalar_broadcast_to(encoded,
                                          shape=len(self),
                                          dtype=self.data.dtype)
    return self.replace(data=Buffer(broadcast),
                        dtype=self.dtype,
                        categories=self._categories,
                        ordered=self._ordered)