def _check_and_cast_columns_with_other(
    source_col: ColumnBase,
    other: Union[ScalarLike, ColumnBase],
    inplace: bool,
) -> Tuple[ColumnBase, Union[ScalarLike, ColumnBase]]:
    """
    Returns type-casted column `source_col` and type-casted `other`
    based on the `inplace` parameter.
    """
    if cudf.utils.dtypes.is_categorical_dtype(source_col.dtype):
        return source_col, other

    if cudf.utils.dtypes.is_scalar(other):
        device_obj = _normalize_scalars(source_col, other)
    else:
        device_obj = other

    if other is None:
        return source_col, device_obj
    elif cudf.utils.dtypes.is_mixed_with_object_dtype(device_obj, source_col):
        raise TypeError(
            "cudf does not support mixed types, please type-cast "
            "the column of dataframe/series and other "
            "to same dtypes."
        )
    if inplace:
        if not cudf.utils.dtypes._can_cast(device_obj.dtype, source_col.dtype):
            warnings.warn(
                f"Type-casting from {device_obj.dtype} "
                f"to {source_col.dtype}, there could be potential data loss"
            )
        return source_col, device_obj.astype(source_col.dtype)
    else:
        if (
            cudf.utils.dtypes.is_scalar(other)
            and cudf.utils.dtypes._is_non_decimal_numeric_dtype(
                source_col.dtype
            )
            and cudf.utils.dtypes._can_cast(other, source_col.dtype)
        ):
            common_dtype = source_col.dtype
            return (
                source_col.astype(common_dtype),
                cudf.Scalar(other, dtype=common_dtype),
            )
        else:
            common_dtype = cudf.utils.dtypes.find_common_type(
                [
                    source_col.dtype,
                    np.min_scalar_type(other)
                    if cudf.utils.dtypes.is_scalar(other)
                    else other.dtype,
                ]
            )
            if cudf.utils.dtypes.is_scalar(device_obj):
                device_obj = cudf.Scalar(other, dtype=common_dtype)
            else:
                device_obj = device_obj.astype(common_dtype)
            return source_col.astype(common_dtype), device_obj
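
# Illustration only (not part of the library): the non-inplace branch above feeds
# np.min_scalar_type(other) into find_common_type, so the smallest dtype able to
# hold the scalar drives the promotion. A quick NumPy sketch of that behavior:
import numpy as np

print(np.min_scalar_type(3))    # uint8
print(np.min_scalar_type(-3))   # int8
print(np.min_scalar_type(3.5))  # float16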
def _one_hot_encode_column(
    column: ColumnBase,
    categories: ColumnBase,
    prefix: Optional[str],
    prefix_sep: Optional[str],
    dtype: Optional[Dtype],
) -> Dict[str, ColumnBase]:
    """Encode a single column with one hot encoding.

    The returned dictionary contains pairs of (category, encodings). The keys
    may be prefixed with `prefix` and joined to the category name with
    `prefix_sep`. The encoding columns may be coerced into `dtype`.
    """
    if isinstance(column, CategoricalColumn):
        if column.size == column.null_count:
            column = column_empty_like(categories, newsize=column.size)
        else:
            column = column._get_decategorized_column()

    if column.size * categories.size >= np.iinfo("int32").max:
        raise ValueError(
            "Size limitation exceeded: column.size * category.size < "
            "np.iinfo('int32').max. Consider reducing size of category"
        )
    data = one_hot_encode(column, categories)

    if prefix is not None and prefix_sep is not None:
        data = {f"{prefix}{prefix_sep}{col}": enc for col, enc in data.items()}
    if dtype:
        data = {k: v.astype(dtype) for k, v in data.items()}
    return data
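
# Usage sketch (assumes a GPU and cudf installed): the public entry point for
# this kind of column-level encoding is cudf.get_dummies, where `prefix` and
# `prefix_sep` become the "color_<category>" key names as built above.
import cudf

df = cudf.DataFrame({"color": ["red", "green", "red"]})
print(cudf.get_dummies(df, prefix="color", prefix_sep="_"))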
@classmethod
def from_arrow(cls, array):
    """Create from PyArrow Array/ChunkedArray.

    Parameters
    ----------
    array : PyArrow Array/ChunkedArray
        PyArrow Object which has to be converted.

    Raises
    ------
    TypeError for invalid input type.

    Returns
    -------
    SingleColumnFrame

    Examples
    --------
    >>> import cudf
    >>> import pyarrow as pa
    >>> cudf.Index.from_arrow(pa.array(["a", "b", None]))
    StringIndex(['a' 'b' None], dtype='object')
    >>> cudf.Series.from_arrow(pa.array(["a", "b", None]))
    0       a
    1       b
    2    <NA>
    dtype: object
    """
    return cls(ColumnBase.from_arrow(array))
def cov(self, other: ColumnBase) -> float:
    if (
        len(self) == 0
        or len(other) == 0
        or (len(self) == 1 and len(other) == 1)
    ):
        return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)
    result = (self - self.mean()) * (other - other.mean())
    cov_sample = result.sum() / (len(self) - 1)
    return cov_sample
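
# Worked check (illustration only): the same (n - 1)-normalized sample covariance
# that cov() above computes on device, reproduced with NumPy on host data.
import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0])
y = np.array([2.0, 4.0, 6.0, 8.0])
sample_cov = ((x - x.mean()) * (y - y.mean())).sum() / (len(x) - 1)
assert np.isclose(sample_cov, np.cov(x, y)[0, 1])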
def _match_categorical_dtypes(
    lcol: ColumnBase, rcol: ColumnBase, how: str
) -> Tuple[ColumnBase, ColumnBase]:
    # cast the keys lcol and rcol to a common dtype
    # when at least one of them is a categorical type
    ltype, rtype = lcol.dtype, rcol.dtype

    if isinstance(lcol, cudf.core.column.CategoricalColumn) and isinstance(
        rcol, cudf.core.column.CategoricalColumn
    ):
        # if both are categoricals, logic is complicated:
        return _match_categorical_dtypes_both(lcol, rcol, how)

    if isinstance(ltype, CategoricalDtype):
        if how in {"left", "leftsemi", "leftanti"}:
            return lcol, rcol.astype(ltype)
        common_type = ltype.categories.dtype
    elif isinstance(rtype, CategoricalDtype):
        common_type = rtype.categories.dtype

    return lcol.astype(common_type), rcol.astype(common_type)
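
# Usage sketch (assumes a GPU and cudf installed): for a left join with a
# categorical key on the left, the plain string key on the right is cast to
# the left's categorical dtype before matching, per the branch above.
import cudf

left = cudf.DataFrame(
    {"key": cudf.Series(["a", "b"], dtype="category"), "x": [1, 2]}
)
right = cudf.DataFrame({"key": ["b", "c"], "y": [3, 4]})
print(left.merge(right, on="key", how="left"))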
def corr(self, other: ColumnBase) -> float:
    if len(self) == 0 or len(other) == 0:
        return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)

    cov = self.cov(other)
    lhs_std, rhs_std = self.std(), other.std()

    if not cov or lhs_std == 0 or rhs_std == 0:
        return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)
    return cov / lhs_std / rhs_std
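
# Worked check (illustration only): Pearson correlation as the sample covariance
# divided by the two sample standard deviations, verified against NumPy.
import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0])
y = np.array([1.0, 3.0, 2.0, 5.0])
cov = ((x - x.mean()) * (y - y.mean())).sum() / (len(x) - 1)
corr = cov / (x.std(ddof=1) * y.std(ddof=1))
assert np.isclose(corr, np.corrcoef(x, y)[0, 1])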
@classmethod
def _concat(cls, objs):
    data = ColumnBase._concat([o._values for o in objs])
    names = {obj.name for obj in objs}
    if len(names) == 1:
        [name] = names
    else:
        name = None
    result = as_index(data)
    result.name = name
    return result
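
# Usage sketch (assumes a GPU and cudf installed): concatenated indexes keep a
# name only when every input agrees on it, mirroring the set logic above.
# Index.append is used here as the public route into index concatenation.
import cudf

a = cudf.Index([1, 2], name="id")
b = cudf.Index([3, 4], name="id")
c = cudf.Index([5, 6], name="other")
print(a.append(b).name)  # 'id'
print(a.append(c).name)  # None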
def _match_join_keys(
    lcol: ColumnBase, rcol: ColumnBase, how: str
) -> Tuple[ColumnBase, ColumnBase]:
    # Returns lcol and rcol cast to the common dtype they need as left and
    # right join keys. If no casting is necessary, they are returned as is.
    common_type = None

    # cast the keys lcol and rcol to a common dtype
    ltype = lcol.dtype
    rtype = rcol.dtype

    # if either side is categorical, different logic
    if isinstance(ltype, CategoricalDtype) or isinstance(
        rtype, CategoricalDtype
    ):
        return _match_categorical_dtypes(lcol, rcol, how)

    if pd.api.types.is_dtype_equal(ltype, rtype):
        return lcol, rcol

    if isinstance(ltype, cudf.Decimal64Dtype) or isinstance(
        rtype, cudf.Decimal64Dtype
    ):
        raise TypeError(
            "Decimal columns can only be merged with decimal columns "
            "of the same precision and scale"
        )

    if (np.issubdtype(ltype, np.number)) and (np.issubdtype(rtype, np.number)):
        common_type = (
            max(ltype, rtype)
            if ltype.kind == rtype.kind
            else np.find_common_type([], (ltype, rtype))
        )
    elif np.issubdtype(ltype, np.datetime64) and np.issubdtype(
        rtype, np.datetime64
    ):
        common_type = max(ltype, rtype)

    if how == "left":
        if rcol.fillna(0).can_cast_safely(ltype):
            return lcol, rcol.astype(ltype)
        else:
            warnings.warn(
                f"Can't safely cast column from {rtype} to {ltype}, "
                f"upcasting to {common_type}."
            )

    return lcol.astype(common_type), rcol.astype(common_type)
def _match_join_keys(
    lcol: ColumnBase, rcol: ColumnBase, how: str
) -> Tuple[ColumnBase, ColumnBase]:
    # Casts lcol and rcol to a common dtype for use as join keys. If no
    # casting is necessary, they are returned as is.
    common_type = None

    # cast the keys lcol and rcol to a common dtype
    ltype = lcol.dtype
    rtype = rcol.dtype

    # if either side is categorical, different logic
    left_is_categorical = isinstance(ltype, CategoricalDtype)
    right_is_categorical = isinstance(rtype, CategoricalDtype)
    if left_is_categorical and right_is_categorical:
        return _match_categorical_dtypes_both(
            cast(CategoricalColumn, lcol), cast(CategoricalColumn, rcol), how
        )
    elif left_is_categorical or right_is_categorical:
        if left_is_categorical:
            if how in {"left", "leftsemi", "leftanti"}:
                return lcol, rcol.astype(ltype)
            common_type = ltype.categories.dtype
        else:
            common_type = rtype.categories.dtype
        return lcol.astype(common_type), rcol.astype(common_type)

    if is_dtype_equal(ltype, rtype):
        return lcol, rcol

    if is_decimal_dtype(ltype) or is_decimal_dtype(rtype):
        raise TypeError(
            "Decimal columns can only be merged with decimal columns "
            "of the same precision and scale"
        )

    if (np.issubdtype(ltype, np.number)) and (np.issubdtype(rtype, np.number)):
        common_type = (
            max(ltype, rtype)
            if ltype.kind == rtype.kind
            else np.find_common_type([], (ltype, rtype))
        )
    elif np.issubdtype(ltype, np.datetime64) and np.issubdtype(
        rtype, np.datetime64
    ):
        common_type = max(ltype, rtype)

    if how == "left":
        if rcol.fillna(0).can_cast_safely(ltype):
            return lcol, rcol.astype(ltype)
        else:
            warnings.warn(
                f"Can't safely cast column from {rtype} to {ltype}, "
                f"upcasting to {common_type}."
            )

    return lcol.astype(common_type), rcol.astype(common_type)
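
# Illustration only: the numeric branch above picks the wider dtype when the
# kinds match and promotes across kinds otherwise. np.promote_types is used
# here purely to show the promotion result; the function itself calls
# np.find_common_type.
import numpy as np

assert max(np.dtype("int32"), np.dtype("int64")) == np.dtype("int64")
assert np.promote_types("int64", "float32") == np.dtype("float64")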
def _safe_cast_to_int(col: ColumnBase, dtype: DtypeObj) -> ColumnBase:
    """
    Cast given NumericalColumn to given integer dtype safely.
    """
    assert is_integer_dtype(dtype)

    if col.dtype == dtype:
        return col

    new_col = col.astype(dtype)
    if (new_col == col).all():
        return new_col
    else:
        raise TypeError(
            f"Cannot safely cast non-equivalent "
            f"{col.dtype.type.__name__} to {np.dtype(dtype).type.__name__}"
        )
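
# Illustration only: the "safe" check is a round trip, so the cast succeeds only
# when every value compares equal to its integer-cast counterpart.
import numpy as np

ok = np.array([1.0, 2.0, 3.0])
assert (ok.astype("int64") == ok).all()        # values survive the cast
bad = np.array([1.5, 2.0])
assert not (bad.astype("int64") == bad).all()  # 1.5 -> 1, so this would raise above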
def fillna(self, fill_value=None, method=None):
    if fill_value is not None:
        if cudf.utils.utils.isnat(fill_value):
            return _fillna_natwise(self)
        col = self
        dtype = None
        if is_scalar(fill_value):
            if isinstance(fill_value, np.timedelta64):
                dtype = determine_out_dtype(self.dtype, fill_value.dtype)
                fill_value = fill_value.astype(dtype)
                col = col.astype(dtype)
            if not isinstance(fill_value, cudf.Scalar):
                # dtype stays None for non-timedelta64 scalars, letting
                # cudf.Scalar infer the dtype from the value.
                fill_value = cudf.Scalar(fill_value, dtype=dtype)
        else:
            fill_value = column.as_column(fill_value, nan_as_null=False)
        return ColumnBase.fillna(col, fill_value)
    else:
        return super().fillna(method=method)
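
# Usage sketch (assumes a GPU and cudf installed): filling a coarser timedelta
# column with a finer-grained scalar widens the column to the common resolution,
# which is what the determine_out_dtype call above handles.
import cudf
import numpy as np

s = cudf.Series([np.timedelta64(1, "s"), None], dtype="timedelta64[s]")
print(s.fillna(np.timedelta64(500, "ms")))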