def normalize_binop_value(
    self, other: ScalarLike
) -> Union[ColumnBase, ScalarLike]:
    """Coerce the right-hand operand of a binary operation.

    Parameters
    ----------
    other
        ``None``, a ``cudf.Scalar``, a zero-dimensional ``np.ndarray``, or
        any value ``np.min_scalar_type`` can classify.

    Returns
    -------
    ``None`` and a ``cudf.Scalar`` whose dtype already equals ``self.dtype``
    are returned untouched.  Other host scalars are returned as NumPy
    scalars of the promoted dtype.  Non-scalar values are broadcast with
    ``utils.scalar_broadcast_to`` and wrapped in a column that reuses
    ``self.mask``.

    Raises
    ------
    TypeError
        If the operand's minimal dtype is not boolean, integer or float.
    """
    if other is None:
        return other

    if isinstance(other, cudf.Scalar):
        if self.dtype == other.dtype:
            return other
        # expensive device-host transfer just to
        # adjust the dtype
        other = other.value
    elif isinstance(other, np.ndarray) and other.ndim == 0:
        other = other.item()

    other_dtype = np.min_scalar_type(other)
    if other_dtype.kind not in {"b", "i", "u", "f"}:
        raise TypeError(f"cannot broadcast {type(other)}")

    # A cudf.Scalar can no longer reach this point: it was either returned
    # above or replaced by its host value, so no second Scalar check is
    # needed here.
    other_dtype = np.promote_types(self.dtype, other_dtype)
    if other_dtype == np.dtype("float16"):
        # float16 results are widened to float32.
        other_dtype = np.dtype("float32")
        other = other_dtype.type(other)
    if self.dtype.kind == "b":
        other_dtype = min_signed_type(other)

    if np.isscalar(other):
        return np.dtype(other_dtype).type(other)

    ary = utils.scalar_broadcast_to(other, size=len(self), dtype=other_dtype)
    return column.build_column(
        data=Buffer(ary),
        dtype=ary.dtype,
        mask=self.mask,
    )
def fillna(self, fill_value):
    """Replace nulls in this column with ``fill_value``.

    A scalar ``np.timedelta64`` is cast, together with the column, to the
    dtype returned by ``determine_out_dtype``.  Any other scalar that is not
    a ``Scalar`` is wrapped in ``np.timedelta64``.  Non-scalar input is
    converted with ``column.as_column``.
    """
    target = self
    if not is_scalar(fill_value):
        fill_value = column.as_column(fill_value, nan_as_null=False)
    elif isinstance(fill_value, np.timedelta64):
        out_dtype = determine_out_dtype(self.dtype, fill_value.dtype)
        fill_value = fill_value.astype(out_dtype)
        target = target.astype(out_dtype)
    elif not isinstance(fill_value, Scalar):
        fill_value = np.timedelta64(fill_value)

    filled = libcudf.replace.replace_nulls(target, fill_value)

    filling_with_nat = isinstance(fill_value, np.timedelta64) and np.isnat(
        fill_value
    )
    if not filling_with_nat:
        return filled

    # If the value we are filling is np.timedelta64("NAT")
    # we set the same mask as current column.
    # However where there are "<NA>" in the
    # columns, their corresponding locations
    # in base_data will contain min(int64) values.
    return column.build_column(
        data=filled.base_data,
        dtype=filled.dtype,
        mask=self.base_mask,
        size=filled.size,
        offset=filled.offset,
        children=filled.base_children,
    )
def sort_by_values(self, ascending=True, na_position="last"):
    """Return ``(sorted_keys, sort_indices)`` for this column.

    The ordering comes from ``get_sorted_inds``.  The keys are this column
    indexed by that ordering, and the indices are rebuilt as a column from
    the ordering's data, dtype and mask.
    """
    order = get_sorted_inds(self, ascending, na_position)
    sorted_keys = self[order]
    index_column = column.build_column(
        order.data,
        dtype=order.dtype,
        mask=order.mask,
    )
    return sorted_keys, index_column
def normalize_binop_value(self, other):
    """Coerce the right-hand operand of a binary operation.

    Parameters
    ----------
    other
        ``None`` or any value ``np.min_scalar_type`` can classify.

    Returns
    -------
    ``None`` is returned untouched.  Host scalars are returned as NumPy
    scalars of the promoted dtype.  Non-scalar values are broadcast with
    ``utils.scalar_broadcast_to`` and wrapped in a column that reuses
    ``self.mask``.

    Raises
    ------
    TypeError
        If the operand's minimal dtype is not boolean, integer or float.
    """
    if other is None:
        return other

    other_dtype = np.min_scalar_type(other)
    if other_dtype.kind not in {"b", "i", "u", "f"}:
        raise TypeError("cannot broadcast {}".format(type(other)))

    other_dtype = np.promote_types(self.dtype, other_dtype)
    if other_dtype == np.dtype("float16"):
        # float16 results are widened to float32.
        other = np.dtype("float32").type(other)
        other_dtype = other.dtype
    if self.dtype.kind == "b":
        other_dtype = min_signed_type(other)

    if np.isscalar(other):
        return np.dtype(other_dtype).type(other)

    ary = utils.scalar_broadcast_to(other, size=len(self), dtype=other_dtype)
    return column.build_column(
        # Fixed: the constructor name was misspelled as ``from_array_lik``.
        data=Buffer.from_array_like(ary),
        dtype=ary.dtype,
        mask=self.mask,
    )
def fillna(self, fill_value):
    """Fill null values with *fill_value*.

    Scalars are cast to ``self.dtype`` and rejected with ``TypeError`` when
    the cast changes a non-NaN value.  Non-scalars are converted with
    ``column.as_column`` and then cast to ``self.dtype``.  The returned
    column is rebuilt from the filled result without a mask.
    """
    if not np.isscalar(fill_value):
        fill_value = column.as_column(fill_value, nan_as_null=False)
        # cast safely to the same dtype as self
        if is_integer_dtype(self.dtype):
            fill_value = _safe_cast_to_int(fill_value, self.dtype)
        else:
            fill_value = fill_value.astype(self.dtype)
    else:
        # cast safely to the same dtype as self
        casted = self.dtype.type(fill_value)
        if not np.isnan(fill_value) and (casted != fill_value):
            raise TypeError(
                "Cannot safely cast non-equivalent {} to {}".format(
                    type(fill_value).__name__, self.dtype.name
                )
            )
        fill_value = casted

    filled = libcudfxx.replace.replace_nulls(self, fill_value)
    return column.build_column(
        filled.base_data,
        filled.dtype,
        mask=None,
        offset=filled.offset,
        size=filled.size,
    )
def deserialize(cls, header: dict, frames: list) -> CategoricalColumn:
    """Rebuild a categorical column from ``header`` and ``frames``.

    ``frames`` is consumed in order: the dtype frames, then the data frames,
    then one mask frame, which is read only when ``header`` has a "mask"
    entry.
    """
    dtype_end = header["dtype_frames_count"]
    dtype = CategoricalDtype.deserialize(header["dtype"], frames[:dtype_end])

    data_end = dtype_end + header["data_frames_count"]
    data_header = header["data"]
    # NOTE(review): pickle.loads runs on header content; confirm headers only
    # ever come from trusted peers.
    data_cls = pickle.loads(data_header["type-serialized"])
    data = data_cls.deserialize(data_header, frames[dtype_end:data_end])

    if "mask" in header:
        mask = Buffer.deserialize(header["mask"], [frames[data_end]])
    else:
        mask = None

    codes = column.as_column(data.base_data, dtype=data.dtype)
    rebuilt = column.build_column(
        data=None,
        dtype=dtype,
        mask=mask,
        children=(codes,),
    )
    return cast(CategoricalColumn, rebuilt)
def as_numerical(self) -> NumericalColumn:
    """Return a column built from the codes' data and dtype with ``self.mask``."""
    codes = self.codes
    numeric = column.build_column(
        data=codes.data,
        dtype=codes.dtype,
        mask=self.mask,
    )
    return cast(cudf.core.column.NumericalColumn, numeric)
def deserialize(cls, header, frames):
    """Rebuild a column with children from ``header`` and ``frames``.

    The last frame is used as the null mask when ``header["null_count"]`` is
    positive.  Each subheader consumes the next ``frame_count`` frames, in
    order.
    """
    # Get null mask
    mask = Buffer(frames[-1]) if header["null_count"] > 0 else None

    # Deserialize child columns, walking the frames with a running window
    children = []
    start = 0
    for subheader in header["subheaders"]:
        stop = start + subheader["frame_count"]
        # NOTE(review): pickle.loads runs on header content; confirm headers
        # only ever come from trusted peers.
        child_cls = pickle.loads(subheader["type-serialized"])
        children.append(child_cls.deserialize(subheader, frames[start:stop]))
        start = stop

    # Materialize list column
    return column.build_column(
        data=None,
        dtype=pickle.loads(header["dtype"]),
        mask=mask,
        children=tuple(children),
        size=header["size"],
    )
def as_numerical(self):
    """Build a column with dtype ``np.int64`` from this column's buffers.

    The new column uses this column's base data, base mask, offset and size.
    """
    numeric = column.build_column(
        data=self.base_data,
        dtype=np.int64,
        mask=self.base_mask,
        offset=self.offset,
        size=self.size,
    )
    return numeric
def get_dt_field(self, field):
    """Return an index holding datetime ``field`` of the underlying values.

    The index is named after this index.
    """
    extracted = self._values.get_dt_field(field)
    # column.column_empty_like always returns a Column object
    # but we need a NumericalColumn for GenericIndex..
    # how should this be handled?
    rebuilt = column.build_column(
        data=extracted.data,
        dtype=extracted.dtype,
        mask=extracted.mask,
    )
    return as_index(rebuilt, name=self.name)
def as_numerical_column(self, dtype, **kwargs):
    """Cast this column to ``dtype`` via ``libcudf.typecast.cast``.

    The result is rebuilt as a column from the cast output's data, dtype,
    mask, size and offset.  ``kwargs`` is accepted but not used here.
    """
    converted = libcudf.typecast.cast(self, dtype)
    build_args = dict(
        data=converted.data,
        dtype=converted.dtype,
        mask=converted.mask,
        size=converted.size,
        offset=converted.offset,
    )
    return column.build_column(**build_args)
def fillna(self, fill_value):
    """Replace nulls with ``fill_value`` and return a column without a mask.

    Scalars are converted to ``np.datetime64`` in this column's time unit.
    Anything else is converted with ``column.as_column``.
    """
    replacement = (
        np.datetime64(fill_value, self.time_unit)
        if is_scalar(fill_value)
        else column.as_column(fill_value, nan_as_null=False)
    )
    filled = libcudf.replace.replace_nulls(self, replacement)
    return column.build_column(filled.data, filled.dtype, mask=None)
def __init__(self, values, **kwargs):
    """Initialise the index from ``values``.

    A ``StringColumn`` is copied.  A ``StringIndex`` has its underlying
    values copied.  Any other input is passed to ``nvstrings.to_device`` and
    wrapped in an "object" dtype column.
    """
    kwargs = _setdefault_name(values, kwargs)

    if isinstance(values, StringColumn):
        string_values = values.copy()
    elif isinstance(values, StringIndex):
        string_values = values._values.copy()
    else:
        device_strings = nvstrings.to_device(values)
        string_values = column.build_column(device_strings, dtype="object")

    super(StringIndex, self).__init__(string_values, **kwargs)
def round(self, decimals=0):
    """Round the column to ``decimals`` decimal places.

    Integer columns are returned as-is (the same object).  Other columns
    are passed through ``cudautils.apply_round`` and rebuilt with the same
    dtype and mask.

    Raises
    ------
    NotImplementedError
        If ``decimals`` is negative.
    """
    if decimals < 0:
        raise NotImplementedError("Decimal values < 0 are not yet supported.")

    if np.issubdtype(self.dtype, np.integer):
        return self

    rounded = cudautils.apply_round(self.data_array_view, decimals)
    return column.build_column(
        data=Buffer(rounded),
        dtype=self.dtype,
        mask=self.mask,
    )
def _set_categories(self, new_categories, **kwargs):
    """Recode this categorical against ``new_categories``.

    Recognised keyword arguments are ``is_unique`` (skip de-duplicating the
    new categories), ``ordered`` (defaults to ``self.ordered``) and
    ``inplace`` (mutate the parent column and return ``None`` instead of
    returning a new column).

    Notes
    -----
    Assumes ``new_categories`` is the same dtype as the current categories
    """
    from cudf import DataFrame, Series

    current_categories = self._parent.categories
    target_categories = column.as_column(new_categories)

    # Ensure new_categories is unique first
    already_unique = kwargs.get("is_unique", False)
    if not already_unique and not target_categories.is_unique:
        # drop_duplicates() instead of unique() to preserve order
        target_categories = Series(target_categories).drop_duplicates()._column

    current_codes = self.codes
    code_dtype = current_codes.dtype
    row_order = cudautils.arange(len(current_codes))
    old_code_range = cudautils.arange(len(current_categories), dtype=code_dtype)
    new_code_range = cudautils.arange(len(target_categories), dtype=code_dtype)

    new_frame = DataFrame({"new_codes": new_code_range, "cats": target_categories})
    old_frame = DataFrame({"old_codes": old_code_range, "cats": current_categories})
    current_frame = DataFrame({"old_codes": current_codes, "order": row_order})

    # Join the old and new categories to build a map from
    # old to new codes, inserting na_sentinel for any old
    # categories that don't exist in the new categories
    code_map = old_frame.merge(new_frame, on="cats", how="left")

    # Join the old and new codes to "recode" the codes data buffer,
    # then restore the original row order
    recoded = current_frame.merge(code_map, on="old_codes", how="left")
    recoded = recoded.sort_values(by="order").reset_index(True)

    ordered = kwargs.get("ordered", self.ordered)
    recoded_codes = recoded["new_codes"]._column
    recoded_dtype = CategoricalDtype(categories=target_categories, ordered=ordered)

    if not kwargs.get("inplace", False):
        return column.build_column(
            data=None,
            dtype=recoded_dtype,
            mask=recoded_codes.mask,
            children=(recoded_codes,),
        )

    parent = self._parent
    parent.data = None
    parent.mask = recoded_codes.mask
    parent.dtype = recoded_dtype
    parent.children = (recoded_codes,)
    return None
def as_numerical(self) -> "cudf.core.column.NumericalColumn":
    """Build a column with dtype ``np.int64`` from this column's buffers.

    The new column uses this column's base data, base mask, offset and size.
    """
    numeric = column.build_column(
        data=self.base_data,
        dtype=np.int64,
        mask=self.base_mask,
        offset=self.offset,
        size=self.size,
    )
    return cast("cudf.core.column.NumericalColumn", numeric)
def children(self):
    """Return the child tuple, building and caching it on first access.

    The single child is built from the first base child's base data and
    base mask, using this column's size and offset.
    """
    if self._children is not None:
        return self._children

    base_codes = self.base_children[0]
    sliced_codes = column.build_column(
        data=base_codes.base_data,
        dtype=base_codes.dtype,
        mask=base_codes.base_mask,
        size=self.size,
        offset=self.offset,
    )
    self._children = (sliced_codes,)
    return self._children
def children(self):
    """Return ``(offsets, chars)`` children adjusted for this column's slice.

    The base children are reused unchanged when there are none, or when the
    offset is zero and the base offsets child has ``size + 1`` entries.
    Otherwise sliced children are built.  The result is stored in
    ``self._children``.
    """
    if self._children is not None:
        return self._children

    base = self.base_children
    reuse_base = base is None or (
        self.offset == 0 and base[0].size == (self.size + 1)
    )
    if reuse_base:
        self._children = base
        return self._children

    # First get the base columns for chars and offsets
    base_chars = base[1]
    base_offsets = base[0]

    # Shift offsets column by the parent offset.
    offsets = column.build_column(
        data=base_offsets.base_data,
        dtype=base_offsets.dtype,
        mask=base_offsets.base_mask,
        size=self.size + 1,
        offset=self.offset,
    )

    # Now run a subtraction binary op to shift all of the offsets
    # by the respective number of characters relative to the
    # parent offset
    first_char = offsets[0]
    offsets = offsets.binary_operator("sub", offsets.dtype.type(first_char))

    # Shift the chars offset by the new first element of the
    # offsets column
    n_chars = offsets[self.size]
    chars = column.build_column(
        data=base_chars.base_data,
        dtype=base_chars.dtype,
        mask=base_chars.base_mask,
        size=n_chars,
        offset=first_char,
    )

    self._children = (offsets, chars)
    return self._children
def children(self):
    """Return the child tuple, building and caching it on first access.

    The single child is built over a ``Buffer`` made from the first base
    child's base data.  The buffer's ``ptr`` is advanced by ``offset`` items
    and its ``size`` is set to ``size`` items.
    """
    if self._children is not None:
        return self._children

    base_codes = self.base_children[0]
    itemsize = base_codes.dtype.itemsize

    window = Buffer(base_codes.base_data)
    window.ptr = window.ptr + (self.offset * itemsize)
    window.size = self.size * itemsize

    sliced_codes = column.build_column(
        data=window,
        dtype=base_codes.dtype,
        size=self.size,
    )
    self._children = (sliced_codes,)
    return self._children
def test_dataframe_apply_rows(dtype, has_nulls, pessimistic):
    """Check ``apply_rows`` with ``_kernel_multiply`` against ``a * b``.

    With ``pessimistic`` set, the expected output is the product of the
    generated series.  Otherwise it is the product of series rebuilt from
    the data alone, without their masks.
    """
    count = 1000
    series_a, series_b, series_c = (
        gen_rand_series(dtype, count, has_nulls=has_nulls) for _ in range(3)
    )

    if pessimistic:
        # pessimistically combine the null masks
        expected_out = series_a * series_b
    else:
        # optimistically ignore the null masks
        unmasked_a = cudf.Series(column.build_column(series_a.data, dtype))
        unmasked_b = cudf.Series(column.build_column(series_b.data, dtype))
        expected_out = unmasked_a * unmasked_b

    inputs = {"a": series_a, "b": series_b, "c": series_c}
    df_expected = cudf.DataFrame({**inputs, "out": expected_out})
    df_original = cudf.DataFrame(inputs)

    df_actual = df_original.apply_rows(
        _kernel_multiply,
        ["a", "b"],
        {"out": dtype},
        {},
        pessimistic_nulls=pessimistic,
    )
    assert_eq(df_expected, df_actual)
def children(self) -> Tuple[NumericalColumn]:
    """Return the child tuple, building and caching it on first access.

    The single child is built over a ``Buffer`` made from the first base
    child's base data.  The buffer's ``ptr`` is advanced by ``offset`` items
    and its ``size`` is set to ``size`` items.
    """
    if self._children is not None:
        return self._children

    base_codes = self.base_children[0]
    itemsize = base_codes.dtype.itemsize

    window = Buffer(base_codes.base_data)
    window.ptr = window.ptr + (self.offset * itemsize)
    window.size = self.size * itemsize

    sliced_codes = column.build_column(
        data=window,
        dtype=base_codes.dtype,
        size=self.size,
    )
    self._children = (cast(cudf.core.column.NumericalColumn, sliced_codes),)
    return self._children
def from_dlpack(pycapsule_obj):
    """Converts from a DLPack tensor to a cuDF object.

    DLPack is an open-source memory tensor structure:
    `dmlc/dlpack <https://github.com/dmlc/dlpack>`_.

    This function takes a PyCapsule object which contains a pointer to
    a DLPack tensor as input, and returns a cuDF object. This function deep
    copies the data in the DLPack tensor into a cuDF object.

    Parameters
    ----------
    pycapsule_obj : PyCapsule
        Input DLPack tensor pointer which is encapsulated in a PyCapsule
        object.

    Returns
    -------
    A cuDF DataFrame or Series depending on if the input DLPack tensor is 1D
    or 2D.

    Raises
    ------
    ValueError
        If the conversion reports an empty dataset.
    """
    try:
        res, valids = cpp_dlpack.from_dlpack(pycapsule_obj)
    except GDFError as err:
        if str(err) != "b'GDF_DATASET_EMPTY'":
            raise err
        raise ValueError(
            "Cannot create a cuDF Object from a DLPack tensor of 0 size"
        )

    cols = []
    for idx, valid in enumerate(valids):
        mask = Buffer(valid) if valid else None
        values = res[idx]
        cols.append(
            column.build_column(Buffer(values), dtype=values.dtype, mask=mask)
        )

    if len(cols) == 1:
        return Series(cols[0])

    # More than one column (or none): assemble a frame keyed by position.
    df = DataFrame()
    for position, col in enumerate(cols):
        df[position] = col
    return df
def sort_by_values(self, ascending=True, na_position="last"):
    """Return ``(sorted_keys, sort_indices)`` for this column.

    Parameters
    ----------
    ascending : bool
        Passed to the underlying ``order`` call as ``asc``.
    na_position : {"last", "first"}
        Where nulls are placed in the ordering.

    Raises
    ------
    ValueError
        If ``na_position`` is neither "first" nor "last".  Previously this
        left ``nullfirst`` unbound and failed later with an
        ``UnboundLocalError``.
    """
    if na_position == "last":
        nullfirst = False
    elif na_position == "first":
        nullfirst = True
    else:
        raise ValueError(f"invalid na_position: {na_position}")

    # The ordering is written into a preallocated int32 device array.
    idx_dev_arr = rmm.device_array(len(self), dtype="int32")
    dev_ptr = libcudf.cudf.get_ctype_ptr(idx_dev_arr)
    self.data.order(2, asc=ascending, nullfirst=nullfirst, devptr=dev_ptr)

    col_inds = column.build_column(
        Buffer(idx_dev_arr), idx_dev_arr.dtype, mask=None
    )
    col_keys = self[col_inds.data.mem]
    return col_keys, col_inds
def normalize_binop_value(self, other):
    """Broadcast a datetime-like scalar to a column of ``self.dtype``.

    Parameters
    ----------
    other
        A ``datetime.datetime``, ``pd.Timestamp`` or ``np.datetime64``.

    Returns
    -------
    A column of ``len(self)`` elements with dtype ``self.dtype``.

    Raises
    ------
    TypeError
        If ``other`` is none of the supported types.
    """
    if isinstance(other, dt.datetime):
        other = np.datetime64(other)

    if isinstance(other, pd.Timestamp):
        # pd.Timestamp subclasses datetime.datetime, so a Timestamp is
        # normally converted above before reaching this branch.  It is kept
        # for safety, with the keyword fixed to ``size=`` to match the
        # np.datetime64 branch (it was ``shape=``).
        m = _numpy_to_pandas_conversion[self.time_unit]
        ary = utils.scalar_broadcast_to(
            other.value * m, size=len(self), dtype=self.dtype
        )
    elif isinstance(other, np.datetime64):
        other = other.astype(self.dtype)
        ary = utils.scalar_broadcast_to(
            other, size=len(self), dtype=self.dtype
        )
    else:
        raise TypeError("cannot broadcast {}".format(type(other)))

    return column.build_column(data=Buffer(ary), dtype=self.dtype)
def deserialize(cls, header, frames):
    """Rebuild a categorical column from ``header`` and ``frames``.

    ``frames`` is consumed in order: the dtype frames, then the data frames,
    then one mask frame, which is read only when ``header["frame_count"]``
    exceeds the first two groups combined.
    """
    dtype_end = header["dtype_frames_count"]
    dtype = CategoricalDtype.deserialize(header["dtype"], frames[:dtype_end])

    data_end = dtype_end + header["data_frames_count"]
    data_header = header["data"]
    # NOTE(review): pickle.loads runs on header content; confirm headers only
    # ever come from trusted peers.
    data_cls = pickle.loads(data_header["type"])
    data = data_cls.deserialize(data_header, frames[dtype_end:data_end])

    has_mask_frame = header["frame_count"] > data_end
    mask = Buffer(frames[data_end]) if has_mask_frame else None

    return column.build_column(
        data=None,
        dtype=dtype,
        mask=mask,
        children=(data,),
    )
def deserialize(cls, header, frames):
    """Rebuild a "str" column from ``header`` and ``frames``.

    Every frame is wrapped in a ``Buffer``.  The first two buffers are
    paired with the subheaders to rebuild the children, and the third is
    used as the mask when ``header["null_count"]`` is positive.
    """
    # Deserialize the mask, value, and offset frames
    buffers = [Buffer(frame) for frame in frames]
    null_mask = buffers[2] if header["null_count"] > 0 else None

    # NOTE(review): pickle.loads runs on header content; confirm headers only
    # ever come from trusted peers.
    children = tuple(
        pickle.loads(subheader["type"]).deserialize(subheader, [buf])
        for subheader, buf in zip(header["subheaders"], buffers[:2])
    )

    return column.build_column(
        data=None,
        dtype="str",
        mask=null_mask,
        children=children,
    )
def len(self):
    """
    Computes the length of each element in the Series/Index.

    Returns
    -------
      Series or Index of int: A Series or Index of integer values
        indicating the length of each element in the Series or Index.
    """
    from cudf.core.series import Series

    parent = self._parent

    # The lengths are written into a preallocated int32 device array.
    lengths = rmm.device_array(len(parent), dtype="int32")
    parent.nvstrings.len(libcudf.cudf.get_ctype_ptr(lengths))

    # Carry the parent's mask over only when it actually has nulls.
    mask = parent.mask if parent.has_nulls else None

    length_column = column.build_column(
        Buffer(lengths), np.dtype("int32"), mask=mask
    )
    return Series(length_column, index=self._index, name=self._name)
def fillna(self, fill_value, inplace=False):
    """
    Fill null values with *fill_value*

    Returns ``self`` unchanged when the column has no null mask.  A scalar
    must equal ``default_na_value()`` or be encodable as a category,
    otherwise ``ValueError`` is raised.  Non-scalars are converted to a
    column and recoded against this column's categories.
    """
    if not self.has_null_mask:
        return self

    codes_dtype_of = lambda: self.data.dtype  # noqa: E731 - looked up lazily

    if not np.isscalar(fill_value):
        fill_value = column.as_column(fill_value, nan_as_null=False)
        # TODO: only required if fill_value has a subset of the categories:
        fill_value = fill_value.cat()._set_categories(
            self._categories, is_unique=True
        )
        fill_value = column.as_column(fill_value.data).astype(
            codes_dtype_of()
        )
    elif fill_value == self.default_na_value():
        fill_value = codes_dtype_of().type(fill_value)
    else:
        try:
            fill_value = self._encode(fill_value)
            fill_value = codes_dtype_of().type(fill_value)
        except ValueError as err:
            raise ValueError("fill value must be in categories") from err

    filled = libcudf.replace.replace_nulls(self, fill_value)
    filled = column.build_column(
        filled.data, "category", filled.mask, categories=self._categories
    )
    return self._mimic_inplace(filled.replace(mask=None), inplace)
def as_numerical(self):
    """Return a column built from the codes' data and dtype with ``self.mask``."""
    numeric = column.build_column(
        data=self.codes.data,
        dtype=self.codes.dtype,
        mask=self.mask,
    )
    return numeric
def _find_segments(self):
    """Return ``(segments_column, markers)`` for ``self.gpu_values``.

    Both values come from ``cudautils.find_segments``.  The segments are
    wrapped in a ``Buffer`` and rebuilt as a column of the same dtype, and
    the markers are passed through untouched.
    """
    segments, markers = cudautils.find_segments(self.gpu_values)
    segments_column = column.build_column(
        data=Buffer(segments),
        dtype=segments.dtype,
    )
    return segments_column, markers