def set_categories(self, new_categories, **kwargs):
    """Returns a new Series with the categories set to the
    specified *new_categories*.
    """
    data = self._parent
    new_categories = column.as_column(new_categories)
    # when called with rename=True, the pandas behavior is
    # to replace the current category values with the new
    # categories.
    if kwargs.pop("rename", False):
        # enforce same length
        if len(new_categories) != len(data.categories):
            raise ValueError(
                "new_categories must have the same "
                "number of items as old categories"
            )
        elif not kwargs.get("inplace", False):
            # return a copy if inplace=False
            data = data.replace(categories=new_categories, **kwargs)
        else:
            # mutate in place if inplace=True
            data.categories = new_categories
            ordered = kwargs.get("ordered", self.ordered)
            data._dtype = CategoricalDtype(
                categories=column.as_column(new_categories),
                ordered=ordered,
            )
    elif not self._categories_equal(new_categories, **kwargs):
        data = self._set_categories(new_categories, **kwargs)
    if data is not None:
        from cudf import Series

        return Series(data=data)
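# Hedged usage sketch (not library code) for the accessor method above: with
# the default rename=False and a different category set, a new Series is
# returned whose values are recoded against *new_categories*. Values below
# are illustrative only.
def _example_series_set_categories():
    import cudf

    s = cudf.Series([1, 1, 2, 10], dtype="category")
    # 2 is dropped (those rows become null) and 3 is added as an unused category.
    return s.cat.set_categories([1, 3, 10])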
def build_categorical_column(
    categories, codes, mask=None, size=None, offset=0, ordered=None
):
    """
    Build a CategoricalColumn

    Parameters
    ----------
    categories : Column
        Column of categories
    codes : Column
        Column of codes, the size of the resulting Column will be
        the size of `codes`
    mask : Buffer
        Null mask
    size : int, optional
    offset : int, optional
    ordered : bool
        Indicates whether the categories are ordered
    """
    dtype = CategoricalDtype(categories=as_column(categories), ordered=ordered)
    return build_column(
        data=None,
        dtype=dtype,
        mask=mask,
        size=size,
        offset=offset,
        children=(as_column(codes),),
    )
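# Hedged construction sketch (not library code) for the factory above,
# assuming ``as_column`` from this section is in scope; the category and
# code values are made up.
def _example_build_categorical_column():
    import numpy as np

    cats = as_column(["a", "b", "c"])
    codes = as_column(np.array([0, 2, 1, 0], dtype="int32"))
    # An unordered categorical of length 4 whose values decode to a, c, b, a.
    return build_categorical_column(categories=cats, codes=codes, ordered=False)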
def as_categorical_column(self, dtype, **kwargs):
    if isinstance(dtype, str) and dtype == "category":
        return self
    if (
        isinstance(
            dtype, (cudf.core.dtypes.CategoricalDtype, pd.CategoricalDtype)
        )
        and (dtype.categories is None)
        and (dtype.ordered is None)
    ):
        return self
    if isinstance(dtype, pd.CategoricalDtype):
        dtype = CategoricalDtype(
            categories=dtype.categories, ordered=dtype.ordered
        )
    if not isinstance(self.categories, type(dtype.categories._values)):
        # If the two sets of categories are of different Column types,
        # return a column full of nulls.
        return _create_empty_categorical_column(self, dtype)

    return self.cat().set_categories(
        new_categories=dtype.categories, ordered=dtype.ordered
    )
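# Hedged usage sketch (not library code): at the Series level the cast above
# is what backs ``astype``; both the plain "category" string and an explicit
# pandas CategoricalDtype are accepted. Values are illustrative.
def _example_astype_category():
    import cudf
    import pandas as pd

    s = cudf.Series(["a", "b", "a"])
    plain = s.astype("category")
    explicit = s.astype(pd.CategoricalDtype(["a", "b"], ordered=True))
    return plain, explicit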
def __init__(self, **kwargs):
    """
    Parameters
    ----------
    data : Buffer
        The code values
    mask : Buffer; optional
        The validity mask
    null_count : int; optional
        The number of null values in the mask.
    categories : iterable
        The categories
    ordered : bool
        Whether the categorical has a logical ordering (e.g. less than)
    """
    ordered = bool(kwargs.pop("ordered"))
    categories = kwargs.pop("categories", [])
    # Default to String dtype if len(categories) == 0, like pandas does
    categories = (
        column.as_column(categories)
        if len(categories) > 0
        else column.column_empty(0, np.dtype("object"), masked=False)
    )
    dtype = CategoricalDtype(
        categories=column.as_column(categories), ordered=ordered
    )
    kwargs.update({"dtype": dtype})
    super(CategoricalColumn, self).__init__(**kwargs)
    self._categories = categories
    self._ordered = ordered
def _set_categories(self, new_categories, **kwargs):
    """Returns a new CategoricalColumn with the categories set to the
    specified *new_categories*.

    Notes
    -----
    Assumes ``new_categories`` is the same dtype as the current categories
    """
    from cudf import DataFrame, Series

    cur_cats = self._parent.categories
    new_cats = column.as_column(new_categories)

    # Join the old and new categories to build a map from
    # old to new codes, inserting na_sentinel for any old
    # categories that don't exist in the new categories

    # Ensure new_categories is unique first
    if not (kwargs.get("is_unique", False) or new_cats.is_unique):
        # drop_duplicates() instead of unique() to preserve order
        new_cats = Series(new_cats).drop_duplicates()._column

    cur_codes = self.codes
    cur_order = cudautils.arange(len(cur_codes))
    old_codes = cudautils.arange(len(cur_cats), dtype=cur_codes.dtype)
    new_codes = cudautils.arange(len(new_cats), dtype=cur_codes.dtype)

    new_df = DataFrame({"new_codes": new_codes, "cats": new_cats})
    old_df = DataFrame({"old_codes": old_codes, "cats": cur_cats})
    cur_df = DataFrame({"old_codes": cur_codes, "order": cur_order})

    # Join the old and new categories and line up their codes
    df = old_df.merge(new_df, on="cats", how="left")
    # Join the old and new codes to "recode" the codes data buffer
    df = cur_df.merge(df, on="old_codes", how="left")
    df = df.sort_values(by="order").reset_index(drop=True)

    ordered = kwargs.get("ordered", self.ordered)
    new_codes = df["new_codes"]._column
    new_dtype = CategoricalDtype(categories=new_cats, ordered=ordered)

    if kwargs.get("inplace", False):
        self._parent.data = None
        self._parent.mask = new_codes.mask
        self._parent.dtype = new_dtype
        self._parent.children = (new_codes,)
        return None

    return column.build_column(
        data=None,
        dtype=new_dtype,
        mask=new_codes.mask,
        children=(new_codes,),
    )
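# Hedged illustration (not library code) of the merge-based recode used in
# _set_categories above, written with plain pandas so the old->new code
# mapping is visible on the CPU; all values are made up.
def _example_recode_with_merge():
    import pandas as pd

    # Old categories "a", "b", "c" have codes 0, 1, 2; the new category order
    # is ["c", "a"], so "b" has no new code and will null out.
    old_df = pd.DataFrame({"old_codes": [0, 1, 2], "cats": ["a", "b", "c"]})
    new_df = pd.DataFrame({"new_codes": [0, 1], "cats": ["c", "a"]})
    # Current codes buffer plus an "order" column to restore row order.
    cur_df = pd.DataFrame({"old_codes": [0, 0, 1, 2], "order": range(4)})

    df = old_df.merge(new_df, on="cats", how="left")
    df = cur_df.merge(df, on="old_codes", how="left").sort_values("order")
    # df["new_codes"] is now [1, 1, NaN, 0]; the NaN marks the dropped "b".
    return df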
def categories(self, value):
    self.dtype = CategoricalDtype(
        categories=value, ordered=self.dtype.ordered
    )
def set_categories(
    self, new_categories, ordered=None, rename=False, inplace=False,
):
    """
    Set the categories to the specified new_categories.

    `new_categories` can include new categories (which will result in
    unused categories) or remove old categories (which results in values
    set to null). If `rename==True`, the categories will simply be renamed
    (fewer or more items than in the old categories will result in values
    set to null or in unused categories, respectively).

    This method can be used to perform more than one action of adding,
    removing, and reordering simultaneously and is therefore faster than
    performing the individual steps via the more specialised methods.

    On the other hand, this method does not do any checks (e.g., whether
    the old categories are included in the new categories on a reorder),
    which can result in surprising changes.

    Parameters
    ----------
    new_categories : list-like
        The categories in new order.
    ordered : bool, default None
        Whether or not the categorical is treated as an ordered
        categorical. If not given, do not change the ordered information.
    rename : bool, default False
        Whether or not the `new_categories` should be considered as a
        rename of the old categories or as reordered categories.
    inplace : bool, default False
        Whether or not to reorder the categories in-place or return a copy
        of this categorical with reordered categories.

    Returns
    -------
    cat
        Categorical with reordered categories or None if inplace.

    Examples
    --------
    >>> import cudf
    >>> s = cudf.Series([1, 1, 2, 10, 2, 10], dtype='category')
    >>> s
    0     1
    1     1
    2     2
    3    10
    4     2
    5    10
    dtype: category
    Categories (3, int64): [1, 2, 10]
    >>> s.cat.set_categories([1, 10])
    0       1
    1       1
    2    null
    3      10
    4    null
    5      10
    dtype: category
    Categories (2, int64): [1, 10]
    >>> s.cat.set_categories([1, 10], inplace=True)
    >>> s
    0       1
    1       1
    2    null
    3      10
    4    null
    5      10
    dtype: category
    Categories (2, int64): [1, 10]
    """
    ordered = ordered if ordered is not None else self.ordered
    new_categories = column.as_column(new_categories)

    if isinstance(new_categories, CategoricalColumn):
        new_categories = new_categories.categories

    # when called with rename=True, the pandas behavior is
    # to replace the current category values with the new
    # categories.
    if rename:
        # enforce same length
        if len(new_categories) != len(self._column.categories):
            raise ValueError(
                "new_categories must have the same "
                "number of items as old categories"
            )
        out_col = column.build_categorical_column(
            categories=new_categories,
            codes=self._column.base_children[0],
            mask=self._column.base_mask,
            size=self._column.size,
            offset=self._column.offset,
            ordered=ordered,
        )
    else:
        out_col = self._column
        if type(out_col.categories) is not type(new_categories):
            # If the two sets of categories are of different Column types,
            # return a column full of nulls.
            out_col = _create_empty_categorical_column(
                self._column,
                CategoricalDtype(
                    categories=new_categories, ordered=ordered
                ),
            )
        elif (
            not self._categories_equal(new_categories, ordered=ordered)
            or self.ordered != ordered
        ):
            out_col = self._set_categories(
                self._column.categories,
                new_categories,
                ordered=ordered,
            )
    return self._return_or_inplace(out_col, inplace=inplace)
def as_column(arbitrary, nan_as_null=True, dtype=None, length=None):
    """Create a Column from an arbitrary object

    Parameters
    ----------
    arbitrary : object
        Object to construct the Column from. See *Notes*.
    nan_as_null : bool, optional
        If True (default), treat NaN values in arbitrary as null.
    dtype : optional
        Optionally typecast the constructed Column to the given dtype.
    length : int, optional
        If `arbitrary` is a scalar, broadcast into a Column of the given
        length.

    Returns
    -------
    A Column of the appropriate type and size.

    Notes
    -----
    Currently supported inputs are:

    * ``Column``
    * ``Series``
    * ``Index``
    * Scalars (can be broadcasted to a specified `length`)
    * Objects exposing ``__cuda_array_interface__`` (e.g., numba device
      arrays)
    * Objects exposing ``__array_interface__`` (e.g., numpy arrays)
    * pyarrow array
    * pandas.Categorical objects
    """
    from cudf.core.column import numerical, categorical, datetime, string
    from cudf.core.series import Series
    from cudf.core.index import Index

    if isinstance(arbitrary, ColumnBase):
        if dtype is not None:
            return arbitrary.astype(dtype)
        else:
            return arbitrary

    elif isinstance(arbitrary, Series):
        data = arbitrary._column
        if dtype is not None:
            data = data.astype(dtype)

    elif isinstance(arbitrary, Index):
        data = arbitrary._values
        if dtype is not None:
            data = data.astype(dtype)

    elif isinstance(arbitrary, nvstrings.nvstrings):
        byte_count = arbitrary.byte_count()
        if byte_count > libcudfxx.MAX_STRING_COLUMN_BYTES:
            raise MemoryError(
                "Cannot construct string columns "
                "containing > {} bytes. "
                "Consider using dask_cudf to partition "
                "your data.".format(libcudfxx.MAX_STRING_COLUMN_BYTES_STR)
            )
        sbuf = Buffer.empty(arbitrary.byte_count())
        obuf = Buffer.empty(
            (arbitrary.size() + 1) * np.dtype("int32").itemsize
        )

        nbuf = None
        if arbitrary.null_count() > 0:
            mask_size = calc_chunk_size(arbitrary.size(), mask_bitsize)
            nbuf = Buffer.empty(mask_size)
            arbitrary.set_null_bitmask(nbuf.ptr, bdevmem=True)
        arbitrary.to_offsets(sbuf.ptr, obuf.ptr, None, bdevmem=True)
        children = (
            build_column(obuf, dtype="int32"),
            build_column(sbuf, dtype="int8"),
        )
        data = build_column(
            data=None, dtype="object", mask=nbuf, children=children
        )
        data._nvstrings = arbitrary

    elif isinstance(arbitrary, Buffer):
        if dtype is None:
            raise TypeError("dtype cannot be None if 'arbitrary' is a Buffer")
        data = build_column(arbitrary, dtype=dtype)

    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary), dtype=arbitrary.dtype)
        if (
            data.dtype in [np.float16, np.float32, np.float64]
            and arbitrary.size > 0
        ):
            if nan_as_null:
                mask = libcudf.unaryops.nans_to_nulls(data)
                data = data.set_mask(mask)
        elif data.dtype.kind == "M":
            null = column_empty_like(data, masked=True, newsize=1)
            col = libcudf.replace.replace(
                as_column(Buffer(arbitrary), dtype=arbitrary.dtype),
                as_column(
                    Buffer(
                        np.array([np.datetime64("NaT")], dtype=data.dtype)
                    ),
                    dtype=arbitrary.dtype,
                ),
                null,
            )
            data = datetime.DatetimeColumn(
                data=Buffer(arbitrary), dtype=data.dtype, mask=col.mask
            )

    elif hasattr(arbitrary, "__cuda_array_interface__"):
        desc = arbitrary.__cuda_array_interface__
        data = _data_from_cuda_array_interface_desc(arbitrary)
        mask = _mask_from_cuda_array_interface_desc(arbitrary)
        dtype = np.dtype(desc["typestr"])
        col = build_column(data, dtype=dtype, mask=mask)
        return col

    elif isinstance(arbitrary, np.ndarray):
        # CUDF assumes values are always contiguous
        if not arbitrary.flags["C_CONTIGUOUS"]:
            arbitrary = np.ascontiguousarray(arbitrary)

        if dtype is not None:
            arbitrary = arbitrary.astype(dtype)

        if arbitrary.dtype.kind == "M":
            data = datetime.DatetimeColumn.from_numpy(arbitrary)
        elif arbitrary.dtype.kind in ("O", "U"):
            data = as_column(pa.Array.from_pandas(arbitrary))
        else:
            data = as_column(rmm.to_device(arbitrary), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            pa_size, pa_offset, nbuf, obuf, sbuf = buffers_from_pyarrow(
                arbitrary
            )
            children = (
                build_column(data=obuf, dtype="int32"),
                build_column(data=sbuf, dtype="int8"),
            )
            data = string.StringColumn(
                mask=nbuf, children=children, size=pa_size, offset=pa_offset
            )
        elif isinstance(arbitrary, pa.NullArray):
            new_dtype = pd.api.types.pandas_dtype(dtype)
            if (type(dtype) == str and dtype == "empty") or dtype is None:
                new_dtype = pd.api.types.pandas_dtype(
                    arbitrary.type.to_pandas_dtype()
                )

            if is_categorical_dtype(new_dtype):
                arbitrary = arbitrary.dictionary_encode()
            else:
                if nan_as_null:
                    arbitrary = arbitrary.cast(np_to_pa_dtype(new_dtype))
                else:
                    # casting a null array doesn't make nans valid
                    # so we create one with valid nans from scratch:
                    if new_dtype == np.dtype("object"):
                        arbitrary = utils.scalar_broadcast_to(
                            None, (len(arbitrary),), dtype=new_dtype
                        )
                    else:
                        arbitrary = utils.scalar_broadcast_to(
                            np.nan, (len(arbitrary),), dtype=new_dtype
                        )
            data = as_column(arbitrary, nan_as_null=nan_as_null)
        elif isinstance(arbitrary, pa.DictionaryArray):
            codes = as_column(arbitrary.indices)
            if isinstance(arbitrary.dictionary, pa.NullArray):
                categories = as_column([], dtype="object")
            else:
                categories = as_column(arbitrary.dictionary)
            dtype = CategoricalDtype(
                categories=categories, ordered=arbitrary.type.ordered
            )
            data = categorical.CategoricalColumn(
                dtype=dtype,
                mask=codes.base_mask,
                children=(codes,),
                size=codes.size,
                offset=codes.offset,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            dtype = np.dtype("M8[{}]".format(arbitrary.type.unit))
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary, dtype=dtype
            )
            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                dtype=dtype,
                size=pa_size,
                offset=pa_offset,
            )
        elif isinstance(arbitrary, pa.Date64Array):
            raise NotImplementedError
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary, dtype="M8[ms]"
            )
            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                dtype=np.dtype("M8[ms]"),
                size=pa_size,
                offset=pa_offset,
            )
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value",
                UserWarning,
            )
            data = as_column(arbitrary.cast(pa.int32())).astype("M8[ms]")
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            dtype = np.dtype(np.bool)
            # Needed because of bug in PyArrow
            # https://issues.apache.org/jira/browse/ARROW-4766
            if len(arbitrary) > 0:
                arbitrary = arbitrary.cast(pa.int8())
            else:
                arbitrary = pa.array([], type=pa.int8())

            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary, dtype=dtype
            )
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                dtype=dtype,
                size=pa_size,
                offset=pa_offset,
            )
        else:
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary
            )
            data = numerical.NumericalColumn(
                data=padata,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()),
                mask=pamask,
                size=pa_size,
                offset=pa_offset,
            )

    elif isinstance(arbitrary, pa.ChunkedArray):
        gpu_cols = [
            as_column(chunk, dtype=dtype) for chunk in arbitrary.chunks
        ]

        if dtype and dtype != "empty":
            new_dtype = dtype
        else:
            pa_type = arbitrary.type
            if pa.types.is_dictionary(pa_type):
                new_dtype = "category"
            else:
                new_dtype = np.dtype(pa_type.to_pandas_dtype())

        data = ColumnBase._concat(gpu_cols, dtype=new_dtype)

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        elif arbitrary.dtype == np.bool:
            # Bug in PyArrow or HDF that requires us to do this
            data = as_column(
                pa.array(np.asarray(arbitrary), from_pandas=True)
            )
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))

    elif isinstance(arbitrary, pd.Timestamp):
        # This will always treat NaTs as nulls since it's not technically a
        # discrete value like NaN
        data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True))

    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        length = length or 1
        data = as_column(
            utils.scalar_broadcast_to(arbitrary, length, dtype=dtype)
        )
        if not nan_as_null:
            data = data.fillna(np.nan)

    elif isinstance(arbitrary, memoryview):
        data = as_column(
            np.asarray(arbitrary), dtype=dtype, nan_as_null=nan_as_null
        )

    else:
        try:
            data = as_column(
                memoryview(arbitrary), dtype=dtype, nan_as_null=nan_as_null
            )
        except TypeError:
            pa_type = None
            np_type = None
            try:
                if dtype is not None:
                    dtype = pd.api.types.pandas_dtype(dtype)
                    if is_categorical_dtype(dtype):
                        raise TypeError
                    else:
                        np_type = np.dtype(dtype).type
                        if np_type == np.bool_:
                            pa_type = pa.bool_()
                        else:
                            pa_type = np_to_pa_dtype(np.dtype(dtype))
                data = as_column(
                    pa.array(arbitrary, type=pa_type, from_pandas=nan_as_null),
                    dtype=dtype,
                    nan_as_null=nan_as_null,
                )
            except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                if is_categorical_dtype(dtype):
                    sr = pd.Series(arbitrary, dtype="category")
                    data = as_column(sr, nan_as_null=nan_as_null)
                elif np_type == np.str_:
                    sr = pd.Series(arbitrary, dtype="str")
                    data = as_column(sr, nan_as_null=nan_as_null)
                else:
                    data = as_column(
                        np.asarray(arbitrary, dtype=np.dtype(dtype)),
                        nan_as_null=nan_as_null,
                    )
    return data
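# Hedged usage sketch (not library code) exercising a few of the dispatch
# branches above; the inputs and the column types they should produce are
# illustrative only.
def _example_as_column_inputs():
    import numpy as np
    import pyarrow as pa
    import pandas as pd

    numeric = as_column(np.array([1.0, np.nan, 3.0]))     # numpy path, NaN becomes null
    strings = as_column(pa.array(["x", None, "y"]))       # pyarrow StringArray path
    cats = as_column(pd.Categorical(["a", "b", "a"]))     # pandas Categorical path
    scalar = as_column(7, dtype="int64", length=4)        # scalar broadcast to length 4
    return numeric, strings, cats, scalar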
def _concat(cls, objs, dtype=None):
    from cudf.core.series import Series
    from cudf.core.column import (
        StringColumn,
        CategoricalColumn,
        NumericalColumn,
    )

    if len(objs) == 0:
        dtype = pd.api.types.pandas_dtype(dtype)
        if is_categorical_dtype(dtype):
            dtype = CategoricalDtype()
        return column_empty(0, dtype=dtype, masked=True)

    # If all columns are `NumericalColumn` with different dtypes,
    # we cast them to a common dtype.
    # Notice, we can always cast pure null columns
    not_null_cols = list(filter(lambda o: len(o) != o.null_count, objs))
    if len(not_null_cols) > 0 and (
        len(
            [
                o
                for o in not_null_cols
                if not isinstance(o, NumericalColumn)
                or np.issubdtype(o.dtype, np.datetime64)
            ]
        )
        == 0
    ):
        col_dtypes = [o.dtype for o in not_null_cols]
        # Use NumPy to find a common dtype
        common_dtype = np.find_common_type(col_dtypes, [])
        # Cast all columns to the common dtype
        for i in range(len(objs)):
            objs[i] = objs[i].astype(common_dtype)

    # Find the first non-null column:
    head = objs[0]
    for i, obj in enumerate(objs):
        if len(obj) != obj.null_count:
            head = obj
            break

    for i, obj in enumerate(objs):
        # Check that all columns are the same type:
        if not pd.api.types.is_dtype_equal(objs[i].dtype, head.dtype):
            # if all null, cast to appropriate dtype
            if len(obj) == obj.null_count:
                from cudf.core.column import column_empty_like

                objs[i] = column_empty_like(
                    head, dtype=head.dtype, masked=True, newsize=len(obj)
                )

    # Handle categories for categoricals
    if all(isinstance(o, CategoricalColumn) for o in objs):
        cats = (
            Series(ColumnBase._concat([o.categories for o in objs]))
            .drop_duplicates()
            ._column
        )
        objs = [
            o.cat()._set_categories(cats, is_unique=True) for o in objs
        ]

    head = objs[0]
    for obj in objs:
        if obj.dtype != head.dtype:
            raise ValueError("All series must be of same type")

    newsize = sum(map(len, objs))
    if newsize > libcudfxx.MAX_COLUMN_SIZE:
        raise MemoryError(
            "Result of concat cannot have "
            "size > {}".format(libcudfxx.MAX_COLUMN_SIZE_STR)
        )

    # Handle strings separately
    if all(isinstance(o, StringColumn) for o in objs):
        result_nbytes = sum(o._nbytes for o in objs)
        if result_nbytes > libcudfxx.MAX_STRING_COLUMN_BYTES:
            raise MemoryError(
                "Result of concat cannot have > {} bytes".format(
                    libcudfxx.MAX_STRING_COLUMN_BYTES_STR
                )
            )
        objs = [o.nvstrings for o in objs]
        return as_column(nvstrings.from_strings(*objs))

    # Filter out inputs that have 0 length
    objs = [o for o in objs if len(o) > 0]
    nulls = any(col.nullable for col in objs)

    if is_categorical_dtype(head):
        data_dtype = head.codes.dtype
        data = None
        children = (column_empty(newsize, dtype=head.codes.dtype),)
    else:
        data_dtype = head.dtype
        data = Buffer.empty(size=newsize * data_dtype.itemsize)
        children = ()

    # Allocate output mask only if there are nulls in the input objects
    mask = None
    if nulls:
        mask = Buffer(utils.make_mask(newsize))

    col = build_column(
        data=data, dtype=head.dtype, mask=mask, children=children
    )

    # Perform the actual concatenation
    if newsize > 0:
        col = libcudf.concat._column_concat(objs, col)

    return col
def _concat(cls, objs, dtype=None):
    if len(objs) == 0:
        dtype = pd.api.types.pandas_dtype(dtype)
        if is_categorical_dtype(dtype):
            dtype = CategoricalDtype()
        return column_empty(0, dtype=dtype, masked=True)

    # If all columns are `NumericalColumn` with different dtypes,
    # we cast them to a common dtype.
    # Notice, we can always cast pure null columns
    not_null_cols = list(filter(lambda o: o.valid_count > 0, objs))
    if len(not_null_cols) > 0 and (
        len(
            [
                o
                for o in not_null_cols
                if not is_numerical_dtype(o.dtype)
                or np.issubdtype(o.dtype, np.datetime64)
            ]
        )
        == 0
    ):
        col_dtypes = [o.dtype for o in not_null_cols]
        # Use NumPy to find a common dtype
        common_dtype = np.find_common_type(col_dtypes, [])
        # Cast all columns to the common dtype
        for i in range(len(objs)):
            objs[i] = objs[i].astype(common_dtype)

    # Find the first non-null column:
    head = objs[0]
    for i, obj in enumerate(objs):
        if obj.valid_count > 0:
            head = obj
            break

    for i, obj in enumerate(objs):
        # Check that all columns are the same type:
        if not pd.api.types.is_dtype_equal(obj.dtype, head.dtype):
            # if all null, cast to appropriate dtype
            if obj.valid_count == 0:
                objs[i] = column_empty_like(
                    head, dtype=head.dtype, masked=True, newsize=len(obj)
                )
            else:
                raise ValueError("All columns must be the same type")

    cats = None
    is_categorical = all(is_categorical_dtype(o.dtype) for o in objs)

    # Combine CategoricalColumn categories
    if is_categorical:
        # Combine and de-dupe the categories
        cats = (
            cudf.concat([o.cat().categories for o in objs])
            .to_series()
            .drop_duplicates()
            ._column
        )
        objs = [
            o.cat()._set_categories(cats, is_unique=True) for o in objs
        ]
        # Map `objs` into a list of the codes until we port Categorical to
        # use the libcudf++ Category data type.
        objs = [o.cat().codes._column for o in objs]
        head = head.cat().codes._column

    newsize = sum(map(len, objs))
    if newsize > libcudfxx.MAX_COLUMN_SIZE:
        raise MemoryError(
            "Result of concat cannot have "
            "size > {}".format(libcudfxx.MAX_COLUMN_SIZE_STR)
        )

    # Filter out inputs that have 0 length
    objs = [o for o in objs if len(o) > 0]

    # Perform the actual concatenation
    if newsize > 0:
        col = libcudfxx.concat.concat_columns(objs)
    else:
        col = column_empty(0, head.dtype, masked=True)

    if is_categorical:
        col = build_categorical_column(
            categories=cats,
            codes=as_column(col.base_data, dtype=col.dtype),
            mask=col.base_mask,
            size=col.size,
            offset=col.offset,
        )

    return col
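# Hedged usage sketch (not library code): concatenating two categorical
# Series routes through the categorical branch above, so the result's
# categories are the de-duplicated union of the inputs' categories.
# Values are illustrative only.
def _example_concat_categoricals():
    import cudf

    a = cudf.Series(["x", "y"], dtype="category")
    b = cudf.Series(["y", "z"], dtype="category")
    out = cudf.concat([a, b])
    # Expected categories on the result: ["x", "y", "z"]
    return out.cat.categories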