def scalar_broadcast_to(scalar, size, dtype=None):
    """Repeat ``scalar`` ``size`` times and return the resulting container.

    Parameters
    ----------
    scalar
        Value to repeat. ``None``, ``pd.Categorical`` and ``str`` inputs
        each take a dedicated path (see Returns).
    size
        Number of elements. When a tuple or list is given only its first
        entry is used.
    dtype : optional
        Requested dtype. When omitted it is ``"object"`` for ``None`` and
        ``str`` scalars; otherwise it is taken from the scalar returned by
        ``to_cudf_compatible_scalar``.

    Returns
    -------
    * ``scalar is None``: the result of
      ``column_empty(size, dtype=dtype, masked=True)``.
    * ``pd.Categorical``: the first category is broadcast (without a dtype)
      and ``.astype(dtype)`` is applied to that result.
    * object dtype: a one-element string column indexed by an all-zero
      ``int32`` gather map of length ``size``.
    * anything else: an ``rmm.device_array`` of shape ``(size,)`` filled
      with the scalar.
    """
    from cudf.utils.cudautils import fill_value
    from cudf.utils.dtypes import to_cudf_compatible_scalar, is_string_dtype
    from cudf.core.column import column_empty

    # Only a one-dimensional length is honoured.
    length = size[0] if isinstance(size, (tuple, list)) else size

    if scalar is None:
        null_dtype = "object" if dtype is None else dtype
        return column_empty(length, dtype=null_dtype, masked=True)

    if isinstance(scalar, pd.Categorical):
        first_category = scalar.categories[0]
        return scalar_broadcast_to(first_category, length).astype(dtype)

    treat_as_string = isinstance(scalar, str) and (
        is_string_dtype(dtype) or dtype is None
    )
    if treat_as_string:
        dtype = "object"
    else:
        scalar = to_cudf_compatible_scalar(scalar, dtype=dtype)
        dtype = scalar.dtype

    if np.dtype(dtype) != np.dtype("object"):
        # Fixed-width path: allocate, then fill unless the buffer is empty.
        device_buf = rmm.device_array((length,), dtype=dtype)
        if device_buf.size != 0:
            fill_value(device_buf, scalar)
        return device_buf

    # String path: imports are deferred so they only run when needed.
    import nvstrings
    from cudf.core.column import as_column
    from cudf.utils.cudautils import zeros

    zero_indices = zeros(length, dtype="int32")
    single_string = as_column(nvstrings.to_device([scalar]))
    return single_string[zero_indices]
def __setitem__(self, key, value):
    """
    Set the value of self[key] to value.

    If value and self are of different types,
    value is coerced to self.dtype

    Parameters
    ----------
    key : slice or column-like
        A unit-stride slice, or anything ``columnops.as_column`` accepts.
        A boolean key is treated as a mask and converted to the positions
        it selects.
    value : scalar, None or column-like
        Scalars (per ``utils.is_scalar``) are expanded to the number of
        selected elements; anything else must already have that length.

    Raises
    ------
    NotImplementedError
        If a slice has a stride other than 1.
    ValueError
        If a boolean mask does not match the column length, or the value
        length differs from the number of selected elements.
    """
    import cudf.bindings.copying as cpp_copying
    from cudf.dataframe import columnops

    if isinstance(key, slice):
        key_start, key_stop, key_stride = key.indices(len(self))
        if key_stride != 1:
            raise NotImplementedError("Stride not supported in slice")
        # slice.indices() may report stop < start for an empty selection
        # (e.g. [5:2]). Clamp so such a slice counts as zero elements
        # instead of abs(stop - start), and so the range handed to the
        # copy routine is never reversed.
        key_stop = max(key_stop, key_start)
        nelem = key_stop - key_start
    else:
        key = columnops.as_column(key)
        if pd.api.types.is_bool_dtype(key.dtype):
            if len(key) != len(self):
                raise ValueError(
                    "Boolean mask must be of same length as column")
            # Turn the mask into the integer positions it selects.
            key = columnops.as_column(cudautils.arange(len(self)))[key]
        nelem = len(key)

    if utils.is_scalar(value):
        if is_categorical_dtype(self.dtype):
            from cudf.dataframe.categorical import CategoricalColumn
            from cudf.dataframe.buffer import Buffer
            from cudf.utils.cudautils import fill_value

            # Broadcast the encoded value rather than the value itself.
            data = rmm.device_array(nelem, dtype="int8")
            fill_value(data, self._encode(value))
            value = CategoricalColumn(
                data=Buffer(data),
                categories=self._categories,
                ordered=False,
            )
        elif value is None:
            value = columnops.column_empty(nelem, self.dtype, masked=True)
        else:
            to_dtype = pd.api.types.pandas_dtype(self.dtype)
            value = utils.scalar_broadcast_to(value, nelem, to_dtype)

    value = columnops.as_column(value).astype(self.dtype)

    if len(value) != nelem:
        msg = (f"Size mismatch: cannot set value "
               f"of size {len(value)} to indexing result of size "
               f"{nelem}")
        raise ValueError(msg)

    if isinstance(key, slice):
        out = cpp_copying.apply_copy_range(
            self, value, key_start, key_stop, 0)
    else:
        out = cpp_copying.apply_scatter(value, key, self)

    # Adopt the buffers of the result so the update is visible on self.
    self._data = out.data
    self._mask = out.mask
    self._update_null_count()
def scalar_broadcast_to(scalar, shape, dtype):
    """Allocate a device array of ``shape`` and fill it with ``scalar``.

    Parameters
    ----------
    scalar
        Value written into every element.
    shape
        A tuple, or any other value, which is wrapped in a 1-tuple.
    dtype
        Element dtype passed to ``rmm.device_array``.

    Returns
    -------
    The array produced by ``rmm.device_array``; it is left unfilled when
    its ``size`` is zero.
    """
    from cudf.utils.cudautils import fill_value

    dims = shape if isinstance(shape, tuple) else (shape, )
    device_buf = rmm.device_array(dims, dtype=dtype)
    if device_buf.size == 0:
        # Nothing to fill for an empty allocation.
        return device_buf
    fill_value(device_buf, scalar)
    return device_buf
def allocate_mask(self, all_valid=True):
    """Attach a freshly allocated mask buffer via ``set_mask``.

    Parameters
    ----------
    all_valid : bool, default True
        When true, every mask byte is filled with ``0xff`` and the null
        count passed on is 0. Otherwise the mask is filled with 0 and the
        null count is ``len(self)``.

    Returns
    -------
    Whatever ``self.set_mask`` returns for the new mask and null count.
    """
    row_total = len(self)
    if all_valid:
        fill_byte, nulls = 0xff, 0
    else:
        fill_byte, nulls = 0, row_total

    # The buffer length comes from calc_chunk_size and the mask bit width.
    mask_len = utils.calc_chunk_size(row_total, utils.mask_bitsize)
    mask_buf = cuda.device_array(mask_len, dtype=utils.mask_dtype)
    cudautils.fill_value(mask_buf, fill_byte)
    return self.set_mask(mask=mask_buf, null_count=nulls)
def scalar_broadcast_to(scalar, shape, dtype):
    """Repeat ``scalar`` across ``shape`` for the given ``dtype``.

    Parameters
    ----------
    scalar
        Value to repeat.
    shape
        A tuple, or any other value, which is wrapped in a 1-tuple. The
        object-dtype path only uses the first entry.
    dtype
        Anything ``np.dtype`` accepts.

    Returns
    -------
    For object dtype, a one-element ``StringColumn`` indexed by an all-zero
    ``int32`` gather map. Otherwise an ``rmm.device_array`` of ``shape``
    filled with ``scalar`` (left unfilled when its ``size`` is zero).
    """
    from cudf.utils.cudautils import fill_value

    dims = shape if isinstance(shape, tuple) else (shape, )

    if np.dtype(dtype) != np.dtype("object"):
        # Fixed-width path: allocate, then fill unless the buffer is empty.
        device_buf = rmm.device_array(dims, dtype=dtype)
        if device_buf.size != 0:
            fill_value(device_buf, scalar)
        return device_buf

    # String path: imports are deferred so they only run when needed.
    import nvstrings
    from cudf.dataframe.string import StringColumn
    from cudf.utils.cudautils import zeros

    zero_indices = zeros(dims[0], dtype='int32')
    single_string = StringColumn(nvstrings.to_device([scalar]))
    return single_string[zero_indices]
def column_empty(row_count, dtype, masked, categories=None):
    """Build a column of ``row_count`` rows with newly allocated storage.

    Parameters
    ----------
    row_count
        Number of rows to allocate.
    dtype
        Anything ``pd.api.types.pandas_dtype`` accepts.
    masked
        When truthy, a mask from ``cudautils.make_mask`` is created and
        filled with 0.
    categories : optional
        When given (or when ``dtype`` is categorical) the column is built
        with dtype ``'category'``.

    Returns
    -------
    The result of ``build_column(data, dtype, mask, categories)``.
    """
    dtype = pd.api.types.pandas_dtype(dtype)

    mask = None
    if masked:
        mask = cudautils.make_mask(row_count)
        cudautils.fill_value(mask, 0)

    is_categorical = (
        categories is not None
        or pd.api.types.is_categorical_dtype(dtype)
    )

    if not is_categorical and dtype.kind in 'OU':
        # String data is held by nvstrings instead of a Buffer.
        if row_count == 0:
            data = nvstrings.to_device([])
        else:
            scratch = rmm.device_array((row_count,), dtype='float64')
            data = nvstrings.dtos(
                scratch, len(scratch), nulls=mask, bdevmem=True
            )
    else:
        data = Buffer(rmm.device_array((row_count,), dtype=dtype))
        if is_categorical:
            dtype = 'category'

    # Wrap the mask only now: the string path above needs the raw mask.
    if mask is not None:
        mask = Buffer(mask)

    from cudf.dataframe.columnops import build_column

    return build_column(data, dtype, mask, categories)
def __setitem__(self, key, value):
    """
    Set the value of self[key] to value.

    If value and self are of different types,
    value is coerced to self.dtype

    Parameters
    ----------
    key : slice or column-like
        A unit-stride slice, or anything ``column.as_column`` accepts.
        A boolean key is treated as a mask and converted to the positions
        it selects.
    value : scalar, None or column-like
        Scalars (per ``is_scalar``) are expanded to the number of selected
        elements; anything else must already have that length.

    Raises
    ------
    NotImplementedError
        If a slice has a stride other than 1.
    ValueError
        If a boolean mask does not match the column length, or the value
        length differs from the number of selected elements.
    IndexError
        If the scatter fails with a ``RuntimeError`` whose message
        contains "out of bounds".
    """
    from cudf.core import column

    if isinstance(key, slice):
        key_start, key_stop, key_stride = key.indices(len(self))
        if key_stride != 1:
            raise NotImplementedError("Stride not supported in slice")
        # slice.indices() may report stop < start for an empty selection
        # (e.g. [5:2]). Clamp so such a slice counts as zero elements
        # instead of abs(stop - start), and so the range handed to
        # copy_range is never reversed.
        key_stop = max(key_stop, key_start)
        nelem = key_stop - key_start
    else:
        key = column.as_column(key)
        if pd.api.types.is_bool_dtype(key.dtype):
            if len(key) != len(self):
                raise ValueError(
                    "Boolean mask must be of same length as column"
                )
            # Turn the mask into the integer positions it selects.
            key = column.as_column(cudautils.arange(len(self)))[key]
        nelem = len(key)

    if is_scalar(value):
        if is_categorical_dtype(self.dtype):
            from cudf.utils.cudautils import fill_value

            # Broadcast the encoded value, using the dtype of the codes.
            data = rmm.device_array(nelem, dtype=self.codes.dtype)
            fill_value(data, self._encode(value))
            value = build_categorical_column(
                categories=self.dtype.categories,
                codes=as_column(data),
                ordered=self.dtype.ordered,
            )
        elif value is None:
            value = column.column_empty(nelem, self.dtype, masked=True)
        else:
            to_dtype = pd.api.types.pandas_dtype(self.dtype)
            value = utils.scalar_broadcast_to(value, nelem, to_dtype)

    value = column.as_column(value).astype(self.dtype)

    if len(value) != nelem:
        msg = (
            f"Size mismatch: cannot set value "
            f"of size {len(value)} to indexing result of size "
            f"{nelem}"
        )
        raise ValueError(msg)

    if is_categorical_dtype(value.dtype):
        # Apply this column's categories to the incoming value.
        value = value.cat().set_categories(self.categories)
        assert self.dtype == value.dtype

    if isinstance(key, slice):
        out = libcudf.copying.copy_range(
            self, value, key_start, key_stop, 0
        )
    else:
        try:
            out = libcudf.copying.scatter(value, key, self)
        except RuntimeError as e:
            if "out of bounds" in str(e):
                # Re-raise as IndexError, keeping the original as the cause.
                raise IndexError(
                    f"index out of bounds for column of size {len(self)}"
                ) from e
            raise

    self._mimic_inplace(out, inplace=True)