def normalize_binop_value(
    self, other: ScalarLike
) -> Union[ColumnBase, ScalarLike]:
    if other is None:
        return other
    if isinstance(other, cudf.Scalar):
        if self.dtype == other.dtype:
            return other
        # expensive device-host transfer just to
        # adjust the dtype
        other = other.value
    elif isinstance(other, np.ndarray) and other.ndim == 0:
        other = other.item()
    other_dtype = np.min_scalar_type(other)
    if other_dtype.kind in {"b", "i", "u", "f"}:
        if isinstance(other, cudf.Scalar):
            return other
        other_dtype = np.promote_types(self.dtype, other_dtype)
        if other_dtype == np.dtype("float16"):
            other_dtype = np.dtype("float32")
            other = other_dtype.type(other)
        if self.dtype.kind == "b":
            other_dtype = min_signed_type(other)
        if np.isscalar(other):
            other = np.dtype(other_dtype).type(other)
            return other
        else:
            ary = utils.scalar_broadcast_to(
                other, size=len(self), dtype=other_dtype
            )
            return column.build_column(
                data=Buffer(ary), dtype=ary.dtype, mask=self.mask,
            )
    else:
        raise TypeError(f"cannot broadcast {type(other)}")

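# A standalone, numpy-only sketch (promote_scalar_dtype is an illustrative
# helper, not cudf API) of the promotion rules the method above relies on:
# np.min_scalar_type picks the smallest dtype that can hold the scalar,
# np.promote_types merges it with the column dtype, and float16 is bumped
# to float32 because libcudf has no half-precision kernels.
import numpy as np


def promote_scalar_dtype(col_dtype, scalar):
    # Reject non-numeric scalars, mirroring the TypeError branch above.
    scalar_dtype = np.min_scalar_type(scalar)
    if scalar_dtype.kind not in {"b", "i", "u", "f"}:
        raise TypeError(f"cannot broadcast {type(scalar)}")
    promoted = np.promote_types(col_dtype, scalar_dtype)
    if promoted == np.dtype("float16"):
        promoted = np.dtype("float32")
    return promoted


assert promote_scalar_dtype(np.dtype("int32"), 5) == np.dtype("int32")
assert promote_scalar_dtype(np.dtype("int8"), 300) == np.dtype("int32")
assert promote_scalar_dtype(np.dtype("float16"), 2) == np.dtype("float32")
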
def find_and_replace(self, to_replace, replacement, all_nan):
    """
    Return col with *to_replace* replaced with *replacement*.
    """
    to_replace_col = _normalize_find_and_replace_input(
        self.dtype, to_replace
    )
    if all_nan:
        replacement_col = column.as_column(replacement, dtype=self.dtype)
    else:
        replacement_col = _normalize_find_and_replace_input(
            self.dtype, replacement
        )
    if len(replacement_col) == 1 and len(to_replace_col) > 1:
        replacement_col = column.as_column(
            utils.scalar_broadcast_to(
                replacement[0], (len(to_replace_col),), self.dtype
            )
        )
    replaced = self.copy()
    to_replace_col, replacement_col, replaced = numeric_normalize_types(
        to_replace_col, replacement_col, replaced
    )
    return libcudf.replace.replace(
        replaced, to_replace_col, replacement_col
    )

def __setitem__(self, key, value):
    """
    Set the value of self[key] to value.

    If value and self are of different types,
    value is coerced to self.dtype
    """
    import cudf.bindings.copying as cpp_copying
    from cudf.dataframe import columnops

    if isinstance(key, slice):
        key_start, key_stop, key_stride = key.indices(len(self))
        if key_stride != 1:
            raise NotImplementedError("Stride not supported in slice")
        nelem = abs(key_stop - key_start)
    else:
        key = columnops.as_column(key)
        if pd.api.types.is_bool_dtype(key.dtype):
            if not len(key) == len(self):
                raise ValueError(
                    "Boolean mask must be of same length as column"
                )
            key = columnops.as_column(cudautils.arange(len(self)))[key]
        nelem = len(key)

    if utils.is_scalar(value):
        if is_categorical_dtype(self.dtype):
            from cudf.dataframe.categorical import CategoricalColumn
            from cudf.dataframe.buffer import Buffer
            from cudf.utils.cudautils import fill_value

            data = rmm.device_array(nelem, dtype="int8")
            fill_value(data, self._encode(value))
            value = CategoricalColumn(
                data=Buffer(data),
                categories=self._categories,
                ordered=False,
            )
        elif value is None:
            value = columnops.column_empty(nelem, self.dtype, masked=True)
        else:
            to_dtype = pd.api.types.pandas_dtype(self.dtype)
            value = utils.scalar_broadcast_to(value, nelem, to_dtype)

    value = columnops.as_column(value).astype(self.dtype)

    if len(value) != nelem:
        msg = (
            f"Size mismatch: cannot set value "
            f"of size {len(value)} to indexing result of size "
            f"{nelem}"
        )
        raise ValueError(msg)

    if isinstance(key, slice):
        out = cpp_copying.apply_copy_range(
            self, value, key_start, key_stop, 0
        )
    else:
        out = cpp_copying.apply_scatter(value, key, self)

    self._data = out.data
    self._mask = out.mask
    self._update_null_count()

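# Quick, pure-Python illustration of the slice normalization used above:
# slice.indices(n) clamps start/stop to the column length and returns
# (start, stop, step), so nelem = abs(stop - start) when the step is 1.
s = slice(2, 100)
start, stop, step = s.indices(10)
assert (start, stop, step) == (2, 10, 1)
assert abs(stop - start) == 8  # number of elements that will be written
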
def find_and_replace(
    self,
    to_replace: ColumnLike,
    replacement: ColumnLike,
    all_nan: bool = False,
) -> NumericalColumn:
    """
    Return col with *to_replace* replaced with *replacement*.
    """
    # If all of `to_replace`/`replacement` are `None`,
    # dtype of `to_replace_col`/`replacement_col`
    # is inferred as `string`, but this is a valid
    # float64 column too, hence we will need to type-cast
    # to self.dtype.
    to_replace_col = column.as_column(to_replace)
    if to_replace_col.null_count == len(to_replace_col):
        to_replace_col = to_replace_col.astype(self.dtype)

    replacement_col = column.as_column(replacement)
    if replacement_col.null_count == len(replacement_col):
        replacement_col = replacement_col.astype(self.dtype)

    if type(to_replace_col) != type(replacement_col):
        raise TypeError(
            f"to_replace and value should be of same types, "
            f"got to_replace dtype: {to_replace_col.dtype} and "
            f"value dtype: {replacement_col.dtype}"
        )

    if not isinstance(to_replace_col, NumericalColumn) and not isinstance(
        replacement_col, NumericalColumn
    ):
        return self.copy()

    to_replace_col = _normalize_find_and_replace_input(
        self.dtype, to_replace
    )
    if all_nan:
        replacement_col = column.as_column(replacement, dtype=self.dtype)
    else:
        replacement_col = _normalize_find_and_replace_input(
            self.dtype, replacement
        )
    if len(replacement_col) == 1 and len(to_replace_col) > 1:
        replacement_col = column.as_column(
            utils.scalar_broadcast_to(
                replacement[0], (len(to_replace_col),), self.dtype
            )
        )
    elif len(replacement_col) == 1 and len(to_replace_col) == 0:
        return self.copy()

    to_replace_col, replacement_col, replaced = numeric_normalize_types(
        to_replace_col, replacement_col, self
    )
    df = cudf.DataFrame._from_data(
        {"old": to_replace_col, "new": replacement_col}
    )
    df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True)
    if df._data["old"].null_count == 1:
        replaced = replaced.fillna(
            df._data["new"][df._data["old"].isnull()][0]
        )
        df = df.dropna(subset=["old"])

    return libcudf.replace.replace(
        replaced, df._data["old"], df._data["new"]
    )

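# Hedged sketch of why drop_duplicates(keep="last") is applied to the
# old/new mapping above: when to_replace lists the same value twice, the
# last pairing wins. Shown with pandas as a stand-in for the cudf
# DataFrame used in the real code.
import pandas as pd

mapping = pd.DataFrame({"old": [1, 2, 1], "new": [10, 20, 99]})
mapping = mapping.drop_duplicates(
    subset=["old"], keep="last", ignore_index=True
)
# 1 now maps to 99, not 10
assert mapping.set_index("old")["new"].to_dict() == {2: 20, 1: 99}
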
def normalize_binop_value(self, other):
    if other is None:
        return other
    other_dtype = np.min_scalar_type(other)
    if other_dtype.kind in {"b", "i", "u", "f"}:
        other_dtype = np.promote_types(self.dtype, other_dtype)
        if other_dtype == np.dtype("float16"):
            other = np.dtype("float32").type(other)
            other_dtype = other.dtype
        if self.dtype.kind == "b":
            other_dtype = min_signed_type(other)
        if np.isscalar(other):
            other = np.dtype(other_dtype).type(other)
            return other
        else:
            ary = utils.scalar_broadcast_to(
                other, size=len(self), dtype=other_dtype
            )
            return column.build_column(
                data=Buffer(ary), dtype=ary.dtype, mask=self.mask,
            )
    else:
        raise TypeError("cannot broadcast {}".format(type(other)))

def replace(self, to_replace, value):
    """
    Replace values given in *to_replace* with *value*.

    Parameters
    ----------
    to_replace : numeric, str or list-like
        Value(s) to replace.

        * numeric or str:

            - values equal to *to_replace* will be replaced
              with *value*

        * list of numeric or str:

            - If *value* is also list-like, *to_replace* and
              *value* must be of same length.

    value : numeric, str, list-like, or dict
        Value(s) to replace `to_replace` with.

    See also
    --------
    Series.fillna

    Returns
    -------
    result : Series
        Series after replacement. The mask and index are preserved.
    """
    if not is_scalar(to_replace):
        if is_scalar(value):
            value = utils.scalar_broadcast_to(
                value, (len(to_replace),), np.dtype(type(value))
            )
    else:
        if not is_scalar(value):
            raise TypeError(
                "Incompatible types '{}' and '{}' "
                "for *to_replace* and *value*.".format(
                    type(to_replace).__name__, type(value).__name__
                )
            )
        to_replace = [to_replace]
        value = [value]

    if len(to_replace) != len(value):
        raise ValueError(
            "Replacement lists must be of same length. "
            "Expected {}, got {}.".format(len(to_replace), len(value))
        )

    if is_dict_like(to_replace) or is_dict_like(value):
        raise TypeError("Dict-like args not supported in Series.replace()")

    result = self._column.find_and_replace(to_replace, value)

    return self._copy_construct(data=result)

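# A minimal sketch of the argument validation performed above, runnable
# without a GPU. np.isscalar stands in for cudf's is_scalar utility, and
# the list multiply stands in for scalar_broadcast_to; check_replace_args
# is an illustrative helper, not cudf API.
import numpy as np


def check_replace_args(to_replace, value):
    if not np.isscalar(to_replace):
        if np.isscalar(value):
            # Broadcast a scalar value to the length of to_replace.
            value = [value] * len(to_replace)
    else:
        to_replace, value = [to_replace], [value]
    if len(to_replace) != len(value):
        raise ValueError(
            "Replacement lists must be of same length. "
            "Expected {}, got {}.".format(len(to_replace), len(value))
        )
    return to_replace, value


assert check_replace_args([1, 2], 0) == ([1, 2], [0, 0])
assert check_replace_args(5, 0) == ([5], [0])
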
def normalize_binop_value(self, other):
    ary = utils.scalar_broadcast_to(
        self._encode(other), shape=len(self), dtype=self.data.dtype
    )
    col = self.replace(
        data=Buffer(ary),
        dtype=self.dtype,
        categories=self._categories,
        ordered=self._ordered,
    )
    return col

def get_str_replacement_series(replacement, bool_mask):
    """
    Return a replacement series holding *replacement* at the positions
    marked True in *bool_mask* and empty strings elsewhere.
    """
    word_ser = cudf.Series(scalar_broadcast_to("", size=len(bool_mask)))
    word_ser.iloc[bool_mask] = replacement
    return word_ser

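# Illustrative equivalent with pandas as a stand-in for cudf: build a
# series of empty strings, then write the replacement only where the
# boolean mask is True.
import numpy as np
import pandas as pd

mask = np.array([False, True, False, True])
word_ser = pd.Series([""] * len(mask))
word_ser.iloc[mask] = "xyz"
assert word_ser.tolist() == ["", "xyz", "", "xyz"]
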
def normalize_binop_value(self, other):
    if isinstance(other, dt.datetime):
        other = np.datetime64(other)

    if isinstance(other, pd.Timestamp):
        m = _numpy_to_pandas_conversion[self.time_unit]
        ary = utils.scalar_broadcast_to(
            other.value * m, size=len(self), dtype=self.dtype
        )
    elif isinstance(other, np.datetime64):
        other = other.astype(self.dtype)
        ary = utils.scalar_broadcast_to(
            other, size=len(self), dtype=self.dtype
        )
    else:
        raise TypeError("cannot broadcast {}".format(type(other)))

    return column.build_column(data=Buffer(ary), dtype=self.dtype)

def normalize_binop_value(self, other):
    if isinstance(other, column.Column):
        return other.astype(self.dtype)
    elif isinstance(other, str) or other is None:
        col = utils.scalar_broadcast_to(
            other, shape=len(self), dtype="object"
        )
        return self.replace(data=col.data)
    else:
        raise TypeError('cannot broadcast {}'.format(type(other)))

def normalize_binop_value(self, other):
    other_dtype = np.min_scalar_type(other)
    if other_dtype.kind in 'biuf':
        other_dtype = np.promote_types(self.dtype, other_dtype)
        ary = utils.scalar_broadcast_to(
            other, shape=len(self), dtype=other_dtype
        )
        return self.replace(data=Buffer(ary), dtype=ary.dtype)
    else:
        raise TypeError('cannot broadcast {}'.format(type(other)))

def normalize_binop_value(self, other):
    ary = utils.scalar_broadcast_to(
        self._encode(other), size=len(self), dtype=self.codes.dtype
    )
    col = column.build_categorical_column(
        categories=self.dtype.categories,
        codes=column.as_column(ary),
        mask=self.mask,
        ordered=self.dtype.ordered,
    )
    return col

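# Standalone sketch of the encode-then-broadcast idea above: the scalar is
# translated to its integer category code, and that code is repeated to
# the length of the column. Pure numpy; self._encode is modeled with a
# simple list lookup.
import numpy as np

categories = ["a", "b", "c"]
encode = categories.index  # stand-in for self._encode
codes = np.full(5, encode("b"), dtype=np.int8)
assert codes.tolist() == [1, 1, 1, 1, 1]
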
def normalize_binop_value(self, other):
    if isinstance(other, dt.datetime):
        other = np.datetime64(other)

    if isinstance(other, pd.Timestamp):
        ary = utils.scalar_broadcast_to(
            other.value * self._pandas_conversion_factor,
            shape=len(self),
            dtype=self._npdatetime64_dtype,
        )
    elif isinstance(other, np.datetime64):
        other = other.astype(self._npdatetime64_dtype)
        ary = utils.scalar_broadcast_to(
            other, shape=len(self), dtype=self._npdatetime64_dtype
        )
    else:
        raise TypeError('cannot broadcast {}'.format(type(other)))

    buf = Buffer(ary)
    result = self.replace(data=buf, dtype=self.dtype)
    return result

def normalize_binop_value(self, other):
    other_dtype = np.min_scalar_type(other)
    if other_dtype.kind in 'biuf':
        other_dtype = np.promote_types(self.dtype, other_dtype)
        # Temporary workaround since libcudf doesn't support int16 ops
        if other_dtype == np.dtype('int16'):
            other_dtype = np.dtype('int32')
        ary = utils.scalar_broadcast_to(
            other, shape=len(self), dtype=other_dtype
        )
        return self.replace(data=Buffer(ary), dtype=ary.dtype)
    else:
        raise TypeError('cannot broadcast {}'.format(type(other)))

def find_and_replace(
    self,
    to_replace: ColumnLike,
    replacement: ColumnLike,
    all_nan: bool = False,
) -> NumericalColumn:
    """
    Return col with *to_replace* replaced with *replacement*.
    """
    to_replace_col = column.as_column(to_replace)
    replacement_col = column.as_column(replacement)

    if type(to_replace_col) != type(replacement_col):
        raise TypeError(
            f"to_replace and value should be of same types, "
            f"got to_replace dtype: {to_replace_col.dtype} and "
            f"value dtype: {replacement_col.dtype}"
        )

    if not isinstance(to_replace_col, NumericalColumn) and not isinstance(
        replacement_col, NumericalColumn
    ):
        return self.copy()

    to_replace_col = _normalize_find_and_replace_input(
        self.dtype, to_replace
    )
    if all_nan:
        replacement_col = column.as_column(replacement, dtype=self.dtype)
    else:
        replacement_col = _normalize_find_and_replace_input(
            self.dtype, replacement
        )
    replaced = self.copy()
    if len(replacement_col) == 1 and len(to_replace_col) > 1:
        replacement_col = column.as_column(
            utils.scalar_broadcast_to(
                replacement[0], (len(to_replace_col),), self.dtype
            )
        )
    elif len(replacement_col) == 1 and len(to_replace_col) == 0:
        return replaced

    to_replace_col, replacement_col, replaced = numeric_normalize_types(
        to_replace_col, replacement_col, replaced
    )
    return libcudf.replace.replace(
        replaced, to_replace_col, replacement_col
    )

def find_and_replace(
    self,
    to_replace: ColumnLike,
    replacement: ColumnLike,
    all_nan: bool = False,
) -> NumericalColumn:
    """
    Return col with *to_replace* replaced with *replacement*.
    """
    to_replace_col = column.as_column(to_replace)
    replacement_col = column.as_column(replacement)

    if type(to_replace_col) != type(replacement_col):
        raise TypeError(
            f"to_replace and value should be of same types, "
            f"got to_replace dtype: {to_replace_col.dtype} and "
            f"value dtype: {replacement_col.dtype}"
        )

    if not isinstance(to_replace_col, NumericalColumn) and not isinstance(
        replacement_col, NumericalColumn
    ):
        return self.copy()

    to_replace_col = _normalize_find_and_replace_input(
        self.dtype, to_replace
    )
    if all_nan:
        replacement_col = column.as_column(replacement, dtype=self.dtype)
    else:
        replacement_col = _normalize_find_and_replace_input(
            self.dtype, replacement
        )
    replaced = self.copy()
    if len(replacement_col) == 1 and len(to_replace_col) > 1:
        replacement_col = column.as_column(
            utils.scalar_broadcast_to(
                replacement[0], (len(to_replace_col),), self.dtype
            )
        )
    elif len(replacement_col) == 1 and len(to_replace_col) == 0:
        return replaced

    to_replace_col, replacement_col, replaced = numeric_normalize_types(
        to_replace_col, replacement_col, replaced
    )
    df = cudf.DataFrame({"old": to_replace_col, "new": replacement_col})
    df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True)
    if df._data["old"].null_count == 1:
        replaced = replaced.fillna(
            df._data["new"][df._data["old"].isna()][0]
        )
        df = df.dropna(subset=["old"])

    return libcudf.replace.replace(
        replaced, df["old"]._column, df["new"]._column
    )

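# pandas stand-in for the null handling above: a null entry in the
# "old" column cannot be matched by the replace kernel, so that single
# null -> value pair is peeled off and applied through fillna before the
# remaining mapping is handed over.
import pandas as pd

mapping = pd.DataFrame({"old": [1.0, None], "new": [10.0, -1.0]})
ser = pd.Series([1.0, None, 3.0])
null_rows = mapping["old"].isna()
if null_rows.sum() == 1:
    ser = ser.fillna(mapping.loc[null_rows, "new"].iloc[0])
    mapping = mapping.dropna(subset=["old"])
assert ser.tolist() == [1.0, -1.0, 3.0]
assert mapping["old"].tolist() == [1.0]
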
def normalize_binop_value(self, other):
    other_dtype = np.min_scalar_type(other)
    if other_dtype.kind in "biuf":
        other_dtype = np.promote_types(self.dtype, other_dtype)
        if other_dtype == np.dtype("float16"):
            other = np.dtype("float32").type(other)
            other_dtype = other.dtype
        if other_dtype.kind in "u":
            other_dtype = min_signed_type(other)
        if np.isscalar(other):
            other = np.dtype(other_dtype).type(other)
            return other
        else:
            ary = utils.scalar_broadcast_to(
                other, shape=len(self), dtype=other_dtype
            )
            return self.replace(data=Buffer(ary), dtype=ary.dtype)
    else:
        raise TypeError("cannot broadcast {}".format(type(other)))

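# Hedged sketch of the min_signed_type step above: when promotion yields
# an unsigned dtype, the value is re-typed to the smallest *signed* dtype
# that can represent it. min_signed_type is a cudf utility; a simplified
# stand-in is shown here.
import numpy as np


def min_signed_type_sketch(value):
    for dtype in (np.int8, np.int16, np.int32, np.int64):
        if np.iinfo(dtype).min <= value <= np.iinfo(dtype).max:
            return np.dtype(dtype)
    raise OverflowError(value)


assert min_signed_type_sketch(100) == np.dtype("int8")
assert min_signed_type_sketch(200) == np.dtype("int16")
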
def as_column(arbitrary, nan_as_null=True, dtype=None):
    """Create a Column from an arbitrary object

    Currently supported inputs are:

    * ``Column``
    * ``Buffer``
    * ``Series``
    * ``Index``
    * numba device array
    * cuda array interface
    * numpy array
    * pyarrow array
    * pandas.Categorical

    Returns
    -------
    result : subclass of TypedColumnBase
        - CategoricalColumn for pandas.Categorical input.
        - DatetimeColumn for datetime input.
        - NumericalColumn for all other inputs.
    """
    from cudf.dataframe import numerical, categorical, datetime, string
    from cudf.dataframe.series import Series
    from cudf.dataframe.index import Index

    if isinstance(arbitrary, Column):
        categories = None
        if hasattr(arbitrary, "categories"):
            categories = arbitrary.categories
        data = build_column(
            arbitrary.data,
            arbitrary.dtype,
            mask=arbitrary.mask,
            categories=categories,
        )
    elif isinstance(arbitrary, Series):
        data = arbitrary._column
    elif isinstance(arbitrary, Index):
        data = arbitrary._values
    elif isinstance(arbitrary, Buffer):
        data = numerical.NumericalColumn(data=arbitrary, dtype=arbitrary.dtype)
    elif isinstance(arbitrary, nvstrings.nvstrings):
        data = string.StringColumn(data=arbitrary)
    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary))
        if (data.dtype in [np.float16, np.float32, np.float64]
                and arbitrary.size > 0):
            if nan_as_null:
                mask = cudautils.mask_from_devary(arbitrary)
                data = data.set_mask(mask)
    elif cuda.is_cuda_array(arbitrary):
        # Use cuda array interface to create a numba device array by
        # reference
        new_dev_array = cuda.as_cuda_array(arbitrary)

        # Allocate new output array using rmm and copy the numba device
        # array to an rmm owned device array
        out_dev_array = rmm.device_array_like(new_dev_array)
        out_dev_array.copy_to_device(new_dev_array)

        data = as_column(out_dev_array)
    elif isinstance(arbitrary, np.ndarray):
        # CUDF assumes values are always contiguous
        if not arbitrary.flags['C_CONTIGUOUS']:
            arbitrary = np.ascontiguousarray(arbitrary)
        if arbitrary.dtype.kind == 'M':
            data = datetime.DatetimeColumn.from_numpy(arbitrary)
        elif arbitrary.dtype.kind in ('O', 'U'):
            data = as_column(pa.Array.from_pandas(arbitrary))
        else:
            data = as_column(rmm.to_device(arbitrary),
                             nan_as_null=nan_as_null)
    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            count = len(arbitrary)
            null_count = arbitrary.null_count

            buffers = arbitrary.buffers()
            # Buffer of actual strings values
            if buffers[2] is not None:
                sbuf = np.frombuffer(buffers[2], dtype='int8')
            else:
                sbuf = np.empty(0, dtype='int8')
            # Buffer of offsets values
            obuf = np.frombuffer(buffers[1], dtype='int32')
            # Buffer of null bitmask
            nbuf = None
            if null_count > 0:
                nbuf = np.frombuffer(buffers[0], dtype='int8')

            data = as_column(
                nvstrings.from_offsets(sbuf, obuf, count, nbuf=nbuf,
                                       ncount=null_count))
        elif isinstance(arbitrary, pa.NullArray):
            new_dtype = dtype
            if (type(dtype) == str and dtype == 'empty') or dtype is None:
                new_dtype = np.dtype(arbitrary.type.to_pandas_dtype())

            if pd.api.types.is_categorical_dtype(new_dtype):
                arbitrary = arbitrary.dictionary_encode()
            else:
                if nan_as_null:
                    arbitrary = arbitrary.cast(_gdf.np_to_pa_dtype(new_dtype))
                else:
                    # casting a null array doesn't make nans valid
                    # so we create one with valid nans from scratch:
                    if new_dtype == np.dtype("object"):
                        arbitrary = utils.scalar_broadcast_to(
                            None, (len(arbitrary),), dtype=new_dtype)
                    else:
                        arbitrary = utils.scalar_broadcast_to(
                            np.nan, (len(arbitrary),), dtype=new_dtype)
            data = as_column(arbitrary, nan_as_null=nan_as_null)
        elif isinstance(arbitrary, pa.DictionaryArray):
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = categorical.CategoricalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                categories=arbitrary.dictionary.to_pylist(),
                ordered=arbitrary.type.ordered,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            arbitrary = arbitrary.cast(pa.timestamp('ms'))
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype='M8[ms]')
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date64Array):
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype='M8[ms]')
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value", UserWarning)
            arbitrary = arbitrary.cast(pa.date64())
            data = as_column(arbitrary)
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            dtype = np.dtype(np.bool)
            # Needed because of bug in PyArrow
            # https://issues.apache.org/jira/browse/ARROW-4766
            if len(arbitrary) > 0:
                arbitrary = arbitrary.cast(pa.int8())
            else:
                arbitrary = pa.array([], type=pa.int8())
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype=dtype)
            data = numerical.NumericalColumn(data=padata,
                                             mask=pamask,
                                             null_count=arbitrary.null_count,
                                             dtype=dtype)
        else:
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()))
    elif isinstance(arbitrary, pa.ChunkedArray):
        gpu_cols = [
            as_column(chunk, dtype=dtype) for chunk in arbitrary.chunks
        ]
        if dtype and dtype != 'empty':
            new_dtype = dtype
        else:
            pa_type = arbitrary.type
            if pa.types.is_dictionary(pa_type):
                new_dtype = 'category'
            else:
                new_dtype = np.dtype(pa_type.to_pandas_dtype())
        data = Column._concat(gpu_cols, dtype=new_dtype)
    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if pd.api.types.is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        elif arbitrary.dtype == np.bool:
            # Bug in PyArrow or HDF that requires us to do this
            data = as_column(pa.array(np.array(arbitrary), from_pandas=True))
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))
    elif isinstance(arbitrary, pd.Timestamp):
        # This will always treat NaTs as nulls since it's not technically a
        # discrete value like NaN
        data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True))
    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        if hasattr(arbitrary, 'dtype'):
            data_type = _gdf.np_to_pa_dtype(arbitrary.dtype)
            if data_type in (pa.date64(), pa.date32()):
                # PyArrow can't construct date64 or date32 arrays from np
                # datetime types
                arbitrary = arbitrary.astype('int64')
            data = as_column(pa.array([arbitrary], type=data_type))
        else:
            data = as_column(pa.array([arbitrary]), nan_as_null=nan_as_null)
    elif isinstance(arbitrary, memoryview):
        data = as_column(np.array(arbitrary), dtype=dtype,
                         nan_as_null=nan_as_null)
    else:
        try:
            data = as_column(memoryview(arbitrary))
        except TypeError:
            try:
                pa_type = None
                if dtype is not None:
                    if pd.api.types.is_categorical_dtype(dtype):
                        raise TypeError
                    else:
                        np_type = np.dtype(dtype).type
                        if np_type == np.bool_:
                            pa_type = pa.bool_()
                        else:
                            pa_type = _gdf.np_to_pa_dtype(
                                np.dtype(dtype).type)
                data = as_column(
                    pa.array(arbitrary, type=pa_type,
                             from_pandas=nan_as_null),
                    nan_as_null=nan_as_null)
            except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                np_type = None
                if pd.api.types.is_categorical_dtype(dtype):
                    data = as_column(pd.Series(arbitrary, dtype='category'),
                                     nan_as_null=nan_as_null)
                else:
                    if dtype is None:
                        np_type = None
                    else:
                        np_type = np.dtype(dtype)
                    data = as_column(np.array(arbitrary, dtype=np_type),
                                     nan_as_null=nan_as_null)
    return data

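# Hedged illustration of the three pyarrow string buffers unpacked in the
# pa.StringArray branch above: buffers() returns (null bitmask, int32
# offsets, character data). The defensive slicing is ours, since Arrow
# may pad allocations past the logical size.
import numpy as np
import pyarrow as pa

arr = pa.array(["ab", None, "cd"])
nbuf, obuf, sbuf = arr.buffers()
offsets = np.frombuffer(obuf, dtype="int32")[: len(arr) + 1]
chars = np.frombuffer(sbuf, dtype="int8")[: offsets[-1]]
assert offsets.tolist() == [0, 2, 2, 4]  # null slot has zero length
assert chars.tobytes() == b"abcd"
assert nbuf is not None  # present because the array holds a null
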
def as_column(arbitrary, nan_as_null=True, dtype=None, length=None):
    """Create a Column from an arbitrary object

    Parameters
    ----------
    arbitrary : object
        Object to construct the Column from. See *Notes*.
    nan_as_null : bool, optional
        If True (default), treat NaN values in arbitrary as null.
    dtype : optional
        Optionally typecast the constructed Column to the given dtype.
    length : int, optional
        If `arbitrary` is a scalar, broadcast into a Column of
        the given length.

    Returns
    -------
    A Column of the appropriate type and size.

    Notes
    -----
    Currently supported inputs are:

    * ``Column``
    * ``Series``
    * ``Index``
    * Scalars (can be broadcasted to a specified `length`)
    * Objects exposing ``__cuda_array_interface__`` (e.g., numba device
      arrays)
    * Objects exposing ``__array_interface__`` (e.g., numpy arrays)
    * pyarrow array
    * pandas.Categorical objects
    """
    from cudf.core.column import numerical, categorical, datetime, string
    from cudf.core.series import Series
    from cudf.core.index import Index

    if isinstance(arbitrary, ColumnBase):
        if dtype is not None:
            return arbitrary.astype(dtype)
        else:
            return arbitrary

    elif isinstance(arbitrary, Series):
        data = arbitrary._column
        if dtype is not None:
            data = data.astype(dtype)
    elif isinstance(arbitrary, Index):
        data = arbitrary._values
        if dtype is not None:
            data = data.astype(dtype)
    elif isinstance(arbitrary, nvstrings.nvstrings):
        byte_count = arbitrary.byte_count()
        if byte_count > libcudfxx.MAX_STRING_COLUMN_BYTES:
            raise MemoryError(
                "Cannot construct string columns "
                "containing > {} bytes. "
                "Consider using dask_cudf to partition "
                "your data.".format(libcudfxx.MAX_STRING_COLUMN_BYTES_STR)
            )
        sbuf = Buffer.empty(arbitrary.byte_count())
        obuf = Buffer.empty(
            (arbitrary.size() + 1) * np.dtype("int32").itemsize
        )

        nbuf = None
        if arbitrary.null_count() > 0:
            mask_size = calc_chunk_size(arbitrary.size(), mask_bitsize)
            nbuf = Buffer.empty(mask_size)
            arbitrary.set_null_bitmask(nbuf.ptr, bdevmem=True)

        arbitrary.to_offsets(sbuf.ptr, obuf.ptr, None, bdevmem=True)
        children = (
            build_column(obuf, dtype="int32"),
            build_column(sbuf, dtype="int8"),
        )
        data = build_column(
            data=None, dtype="object", mask=nbuf, children=children
        )
        data._nvstrings = arbitrary

    elif isinstance(arbitrary, Buffer):
        if dtype is None:
            raise TypeError("dtype cannot be None if 'arbitrary' is a Buffer")
        data = build_column(arbitrary, dtype=dtype)

    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary), dtype=arbitrary.dtype)
        if (
            data.dtype in [np.float16, np.float32, np.float64]
            and arbitrary.size > 0
        ):
            if nan_as_null:
                mask = libcudf.unaryops.nans_to_nulls(data)
                data = data.set_mask(mask)

        elif data.dtype.kind == "M":
            null = column_empty_like(data, masked=True, newsize=1)
            col = libcudf.replace.replace(
                as_column(Buffer(arbitrary), dtype=arbitrary.dtype),
                as_column(
                    Buffer(
                        np.array([np.datetime64("NaT")], dtype=data.dtype)
                    ),
                    dtype=arbitrary.dtype,
                ),
                null,
            )
            data = datetime.DatetimeColumn(
                data=Buffer(arbitrary), dtype=data.dtype, mask=col.mask
            )

    elif hasattr(arbitrary, "__cuda_array_interface__"):
        desc = arbitrary.__cuda_array_interface__
        data = _data_from_cuda_array_interface_desc(arbitrary)
        mask = _mask_from_cuda_array_interface_desc(arbitrary)
        dtype = np.dtype(desc["typestr"])
        col = build_column(data, dtype=dtype, mask=mask)
        return col

    elif isinstance(arbitrary, np.ndarray):
        # CUDF assumes values are always contiguous
        if not arbitrary.flags["C_CONTIGUOUS"]:
            arbitrary = np.ascontiguousarray(arbitrary)

        if dtype is not None:
            arbitrary = arbitrary.astype(dtype)

        if arbitrary.dtype.kind == "M":
            data = datetime.DatetimeColumn.from_numpy(arbitrary)
        elif arbitrary.dtype.kind in ("O", "U"):
            data = as_column(pa.Array.from_pandas(arbitrary))
        else:
            data = as_column(rmm.to_device(arbitrary), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            pa_size, pa_offset, nbuf, obuf, sbuf = buffers_from_pyarrow(
                arbitrary
            )
            children = (
                build_column(data=obuf, dtype="int32"),
                build_column(data=sbuf, dtype="int8"),
            )

            data = string.StringColumn(
                mask=nbuf, children=children, size=pa_size, offset=pa_offset
            )

        elif isinstance(arbitrary, pa.NullArray):
            new_dtype = pd.api.types.pandas_dtype(dtype)
            if (type(dtype) == str and dtype == "empty") or dtype is None:
                new_dtype = pd.api.types.pandas_dtype(
                    arbitrary.type.to_pandas_dtype()
                )

            if is_categorical_dtype(new_dtype):
                arbitrary = arbitrary.dictionary_encode()
            else:
                if nan_as_null:
                    arbitrary = arbitrary.cast(np_to_pa_dtype(new_dtype))
                else:
                    # casting a null array doesn't make nans valid
                    # so we create one with valid nans from scratch:
                    if new_dtype == np.dtype("object"):
                        arbitrary = utils.scalar_broadcast_to(
                            None, (len(arbitrary),), dtype=new_dtype
                        )
                    else:
                        arbitrary = utils.scalar_broadcast_to(
                            np.nan, (len(arbitrary),), dtype=new_dtype
                        )
            data = as_column(arbitrary, nan_as_null=nan_as_null)
        elif isinstance(arbitrary, pa.DictionaryArray):
            codes = as_column(arbitrary.indices)
            if isinstance(arbitrary.dictionary, pa.NullArray):
                categories = as_column([], dtype="object")
            else:
                categories = as_column(arbitrary.dictionary)
            dtype = CategoricalDtype(
                categories=categories, ordered=arbitrary.type.ordered
            )
            data = categorical.CategoricalColumn(
                dtype=dtype,
                mask=codes.base_mask,
                children=(codes,),
                size=codes.size,
                offset=codes.offset,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            dtype = np.dtype("M8[{}]".format(arbitrary.type.unit))
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary, dtype=dtype
            )

            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                dtype=dtype,
                size=pa_size,
                offset=pa_offset,
            )
        elif isinstance(arbitrary, pa.Date64Array):
            raise NotImplementedError
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary, dtype="M8[ms]"
            )
            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                dtype=np.dtype("M8[ms]"),
                size=pa_size,
                offset=pa_offset,
            )
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value",
                UserWarning,
            )
            data = as_column(arbitrary.cast(pa.int32())).astype("M8[ms]")
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            dtype = np.dtype(np.bool)
            # Needed because of bug in PyArrow
            # https://issues.apache.org/jira/browse/ARROW-4766
            if len(arbitrary) > 0:
                arbitrary = arbitrary.cast(pa.int8())
            else:
                arbitrary = pa.array([], type=pa.int8())

            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary, dtype=dtype
            )
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                dtype=dtype,
                size=pa_size,
                offset=pa_offset,
            )
        else:
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary
            )
            data = numerical.NumericalColumn(
                data=padata,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()),
                mask=pamask,
                size=pa_size,
                offset=pa_offset,
            )

    elif isinstance(arbitrary, pa.ChunkedArray):
        gpu_cols = [
            as_column(chunk, dtype=dtype) for chunk in arbitrary.chunks
        ]

        if dtype and dtype != "empty":
            new_dtype = dtype
        else:
            pa_type = arbitrary.type
            if pa.types.is_dictionary(pa_type):
                new_dtype = "category"
            else:
                new_dtype = np.dtype(pa_type.to_pandas_dtype())

        data = ColumnBase._concat(gpu_cols, dtype=new_dtype)

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        elif arbitrary.dtype == np.bool:
            # Bug in PyArrow or HDF that requires us to do this
            data = as_column(
                pa.array(np.asarray(arbitrary), from_pandas=True)
            )
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))

    elif isinstance(arbitrary, pd.Timestamp):
        # This will always treat NaTs as nulls since it's not technically a
        # discrete value like NaN
        data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True))

    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        length = length or 1
        data = as_column(
            utils.scalar_broadcast_to(arbitrary, length, dtype=dtype)
        )
        if not nan_as_null:
            data = data.fillna(np.nan)

    elif isinstance(arbitrary, memoryview):
        data = as_column(
            np.asarray(arbitrary), dtype=dtype, nan_as_null=nan_as_null
        )

    else:
        try:
            data = as_column(
                memoryview(arbitrary), dtype=dtype, nan_as_null=nan_as_null
            )
        except TypeError:
            pa_type = None
            np_type = None
            try:
                if dtype is not None:
                    dtype = pd.api.types.pandas_dtype(dtype)
                    if is_categorical_dtype(dtype):
                        raise TypeError
                    else:
                        np_type = np.dtype(dtype).type
                        if np_type == np.bool_:
                            pa_type = pa.bool_()
                        else:
                            pa_type = np_to_pa_dtype(np.dtype(dtype))
                data = as_column(
                    pa.array(arbitrary, type=pa_type, from_pandas=nan_as_null),
                    dtype=dtype,
                    nan_as_null=nan_as_null,
                )
            except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                if is_categorical_dtype(dtype):
                    sr = pd.Series(arbitrary, dtype="category")
                    data = as_column(sr, nan_as_null=nan_as_null)
                elif np_type == np.str_:
                    sr = pd.Series(arbitrary, dtype="str")
                    data = as_column(sr, nan_as_null=nan_as_null)
                else:
                    data = as_column(
                        np.asarray(arbitrary, dtype=np.dtype(dtype)),
                        nan_as_null=nan_as_null,
                    )
    return data

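# The scalar path above defers to utils.scalar_broadcast_to; this is a
# numpy-only stand-in of the same idea (the real helper allocates on the
# GPU, and scalar_broadcast_to_sketch is an illustrative name, not cudf
# API).
import numpy as np


def scalar_broadcast_to_sketch(scalar, size, dtype=None):
    # Pick a dtype from the scalar when none is given, then repeat it.
    if dtype is None:
        dtype = np.min_scalar_type(scalar)
    return np.full(size, scalar, dtype=np.dtype(dtype))


col = scalar_broadcast_to_sketch(7, 4, dtype="int64")
assert col.tolist() == [7, 7, 7, 7]
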
def __setitem__(self, key, value):
    """
    Set the value of self[key] to value.

    If value and self are of different types,
    value is coerced to self.dtype
    """
    from cudf.core import column

    if isinstance(key, slice):
        key_start, key_stop, key_stride = key.indices(len(self))
        if key_stride != 1:
            raise NotImplementedError("Stride not supported in slice")
        nelem = abs(key_stop - key_start)
    else:
        key = column.as_column(key)
        if pd.api.types.is_bool_dtype(key.dtype):
            if not len(key) == len(self):
                raise ValueError(
                    "Boolean mask must be of same length as column"
                )
            key = column.as_column(cudautils.arange(len(self)))[key]
        nelem = len(key)

    if is_scalar(value):
        if is_categorical_dtype(self.dtype):
            from cudf.utils.cudautils import fill_value

            data = rmm.device_array(nelem, dtype=self.codes.dtype)
            fill_value(data, self._encode(value))
            value = build_categorical_column(
                categories=self.dtype.categories,
                codes=as_column(data),
                ordered=self.dtype.ordered,
            )
        elif value is None:
            value = column.column_empty(nelem, self.dtype, masked=True)
        else:
            to_dtype = pd.api.types.pandas_dtype(self.dtype)
            value = utils.scalar_broadcast_to(value, nelem, to_dtype)

    value = column.as_column(value).astype(self.dtype)

    if len(value) != nelem:
        msg = (
            f"Size mismatch: cannot set value "
            f"of size {len(value)} to indexing result of size "
            f"{nelem}"
        )
        raise ValueError(msg)

    if is_categorical_dtype(value.dtype):
        value = value.cat().set_categories(self.categories)
        assert self.dtype == value.dtype

    if isinstance(key, slice):
        out = libcudf.copying.copy_range(
            self, value, key_start, key_stop, 0
        )
    else:
        try:
            out = libcudf.copying.scatter(value, key, self)
        except RuntimeError as e:
            if "out of bounds" in str(e):
                raise IndexError(
                    f"index out of bounds for column of size {len(self)}"
                )
            raise

    self._mimic_inplace(out, inplace=True)

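# numpy stand-in for the scatter performed by
# libcudf.copying.scatter(value, key, self) above: element i of `value`
# lands at position key[i] of the target column.
import numpy as np

target = np.zeros(6, dtype="int64")
key = np.array([1, 4])
value = np.array([10, 40])
target[key] = value  # scatter
assert target.tolist() == [0, 10, 0, 0, 40, 0]
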
def as_column(arbitrary, nan_as_null=True, dtype=None, name=None):
    """Create a Column from an arbitrary object

    Currently supported inputs are:

    * ``Column``
    * ``Buffer``
    * ``Series``
    * ``Index``
    * numba device array
    * cuda array interface
    * numpy array
    * pyarrow array
    * pandas.Categorical
    * Object exposing ``__cuda_array_interface__``

    Returns
    -------
    result : subclass of TypedColumnBase
        - CategoricalColumn for pandas.Categorical input.
        - DatetimeColumn for datetime input.
        - StringColumn for string input.
        - NumericalColumn for all other inputs.
    """
    from cudf.dataframe import numerical, categorical, datetime, string
    from cudf.dataframe.series import Series
    from cudf.dataframe.index import Index
    from cudf.bindings.cudf_cpp import np_to_pa_dtype

    if name is None and hasattr(arbitrary, "name"):
        name = arbitrary.name

    if isinstance(arbitrary, Column):
        categories = None
        if hasattr(arbitrary, "categories"):
            categories = arbitrary.categories
        data = build_column(
            arbitrary.data,
            arbitrary.dtype,
            mask=arbitrary.mask,
            categories=categories,
        )
    elif isinstance(arbitrary, Series):
        data = arbitrary._column
        if dtype is not None:
            data = data.astype(dtype)
    elif isinstance(arbitrary, Index):
        data = arbitrary._values
        if dtype is not None:
            data = data.astype(dtype)
    elif isinstance(arbitrary, Buffer):
        data = numerical.NumericalColumn(data=arbitrary, dtype=arbitrary.dtype)
    elif isinstance(arbitrary, nvstrings.nvstrings):
        data = string.StringColumn(data=arbitrary)
    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary))
        if (
            data.dtype in [np.float16, np.float32, np.float64]
            and arbitrary.size > 0
        ):
            if nan_as_null:
                mask = cudf.bindings.utils.mask_from_devary(data)
                data = data.set_mask(mask)
    elif hasattr(arbitrary, "__cuda_array_interface__"):
        from cudf.bindings.cudf_cpp import count_nonzero_mask

        desc = arbitrary.__cuda_array_interface__
        data = _data_from_cuda_array_interface_desc(desc)
        mask = _mask_from_cuda_array_interface_desc(desc)

        if mask is not None:
            nelem = len(data.mem)
            nnz = count_nonzero_mask(mask.mem, size=nelem)
            null_count = nelem - nnz
        else:
            null_count = 0

        return build_column(
            data,
            dtype=data.dtype,
            mask=mask,
            name=name,
            null_count=null_count,
        )
    elif isinstance(arbitrary, np.ndarray):
        # CUDF assumes values are always contiguous
        if not arbitrary.flags["C_CONTIGUOUS"]:
            arbitrary = np.ascontiguousarray(arbitrary)

        if dtype is not None:
            arbitrary = arbitrary.astype(dtype)

        if arbitrary.dtype.kind == "M":
            data = datetime.DatetimeColumn.from_numpy(arbitrary)
        elif arbitrary.dtype.kind in ("O", "U"):
            data = as_column(pa.Array.from_pandas(arbitrary))
        else:
            data = as_column(rmm.to_device(arbitrary), nan_as_null=nan_as_null)
    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            count = len(arbitrary)
            null_count = arbitrary.null_count

            buffers = arbitrary.buffers()
            # Buffer of actual strings values
            if buffers[2] is not None:
                sbuf = np.frombuffer(buffers[2], dtype="int8")
            else:
                sbuf = np.empty(0, dtype="int8")
            # Buffer of offsets values
            obuf = np.frombuffer(buffers[1], dtype="int32")
            # Buffer of null bitmask
            nbuf = None
            if null_count > 0:
                nbuf = np.frombuffer(buffers[0], dtype="int8")

            data = as_column(
                nvstrings.from_offsets(
                    sbuf, obuf, count, nbuf=nbuf, ncount=null_count
                )
            )
        elif isinstance(arbitrary, pa.NullArray):
            new_dtype = pd.api.types.pandas_dtype(dtype)
            if (type(dtype) == str and dtype == "empty") or dtype is None:
                new_dtype = pd.api.types.pandas_dtype(
                    arbitrary.type.to_pandas_dtype()
                )

            if is_categorical_dtype(new_dtype):
                arbitrary = arbitrary.dictionary_encode()
            else:
                if nan_as_null:
                    arbitrary = arbitrary.cast(np_to_pa_dtype(new_dtype))
                else:
                    # casting a null array doesn't make nans valid
                    # so we create one with valid nans from scratch:
                    if new_dtype == np.dtype("object"):
                        arbitrary = utils.scalar_broadcast_to(
                            None, (len(arbitrary),), dtype=new_dtype
                        )
                    else:
                        arbitrary = utils.scalar_broadcast_to(
                            np.nan, (len(arbitrary),), dtype=new_dtype
                        )
            data = as_column(arbitrary, nan_as_null=nan_as_null)
        elif isinstance(arbitrary, pa.DictionaryArray):
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = categorical.CategoricalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                categories=arbitrary.dictionary,
                ordered=arbitrary.type.ordered,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            dtype = np.dtype("M8[{}]".format(arbitrary.type.unit))
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype=dtype)
            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=dtype,
            )
        elif isinstance(arbitrary, pa.Date64Array):
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype="M8[ms]")
            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype("M8[ms]"),
            )
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value",
                UserWarning,
            )
            data = as_column(arbitrary.cast(pa.int32())).astype("M8[ms]")
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            dtype = np.dtype(np.bool)
            # Needed because of bug in PyArrow
            # https://issues.apache.org/jira/browse/ARROW-4766
            if len(arbitrary) > 0:
                arbitrary = arbitrary.cast(pa.int8())
            else:
                arbitrary = pa.array([], type=pa.int8())
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype=dtype)
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=dtype,
            )
        else:
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()),
            )
    elif isinstance(arbitrary, pa.ChunkedArray):
        gpu_cols = [
            as_column(chunk, dtype=dtype) for chunk in arbitrary.chunks
        ]
        if dtype and dtype != "empty":
            new_dtype = dtype
        else:
            pa_type = arbitrary.type
            if pa.types.is_dictionary(pa_type):
                new_dtype = "category"
            else:
                new_dtype = np.dtype(pa_type.to_pandas_dtype())
        data = Column._concat(gpu_cols, dtype=new_dtype)
    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        elif arbitrary.dtype == np.bool:
            # Bug in PyArrow or HDF that requires us to do this
            data = as_column(pa.array(np.array(arbitrary), from_pandas=True))
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))
    elif isinstance(arbitrary, pd.Timestamp):
        # This will always treat NaTs as nulls since it's not technically a
        # discrete value like NaN
        data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True))
    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        if hasattr(arbitrary, "dtype"):
            data_type = np_to_pa_dtype(arbitrary.dtype)
            # PyArrow can't construct date64 or date32 arrays from np
            # datetime types
            if pa.types.is_date64(data_type) or pa.types.is_date32(data_type):
                arbitrary = arbitrary.astype("int64")
            data = as_column(pa.array([arbitrary], type=data_type))
        else:
            data = as_column(pa.array([arbitrary]), nan_as_null=nan_as_null)
    elif isinstance(arbitrary, memoryview):
        data = as_column(
            np.array(arbitrary), dtype=dtype, nan_as_null=nan_as_null
        )
    else:
        try:
            data = as_column(
                memoryview(arbitrary), dtype=dtype, nan_as_null=nan_as_null
            )
        except TypeError:
            pa_type = None
            np_type = None
            try:
                if dtype is not None:
                    dtype = pd.api.types.pandas_dtype(dtype)
                    if is_categorical_dtype(dtype):
                        raise TypeError
                    else:
                        np_type = np.dtype(dtype).type
                        if np_type == np.bool_:
                            pa_type = pa.bool_()
                        else:
                            pa_type = np_to_pa_dtype(np.dtype(dtype))
                data = as_column(
                    pa.array(arbitrary, type=pa_type, from_pandas=nan_as_null),
                    dtype=dtype,
                    nan_as_null=nan_as_null,
                )
            except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                if is_categorical_dtype(dtype):
                    data = as_column(
                        pd.Series(arbitrary, dtype="category"),
                        nan_as_null=nan_as_null,
                    )
                else:
                    data = as_column(
                        np.array(arbitrary, dtype=np_type),
                        nan_as_null=nan_as_null,
                    )
    if hasattr(data, "name") and (name is not None):
        data.name = name
    return data
