def normalize_binop_value(
    self, other: ScalarLike
) -> Union[ColumnBase, ScalarLike]:
    """
    Coerce the right-hand operand of a binary operation into a value
    that is compatible with this column.

    Parameters
    ----------
    other : ScalarLike
        The operand to normalize.

    Returns
    -------
    ColumnBase or ScalarLike
        ``None`` and a ``cudf.Scalar`` whose dtype equals ``self.dtype``
        are returned unchanged. Otherwise the value is either converted
        to a scalar of the computed dtype (when ``np.isscalar`` accepts
        it) or broadcast into a column of ``len(self)`` elements that
        reuses this column's mask.

    Raises
    ------
    TypeError
        If ``np.min_scalar_type(other)`` is not of kind ``"b"``, ``"i"``,
        ``"u"`` or ``"f"``.
    """
    if other is None:
        return other
    if isinstance(other, cudf.Scalar):
        if self.dtype == other.dtype:
            return other
        # expensive device-host transfer just to
        # adjust the dtype
        other = other.value
    elif isinstance(other, np.ndarray) and other.ndim == 0:
        # Zero-dimensional arrays are unwrapped into a plain Python scalar.
        other = other.item()
    other_dtype = np.min_scalar_type(other)
    if other_dtype.kind in {"b", "i", "u", "f"}:
        # NOTE(review): a cudf.Scalar has either been returned or replaced
        # by its ``.value`` above, so this check looks unreachable - confirm.
        if isinstance(other, cudf.Scalar):
            return other
        other_dtype = np.promote_types(self.dtype, other_dtype)
        if other_dtype == np.dtype("float16"):
            # A float16 promotion result is widened to float32
            # (presumably because float16 is unsupported - TODO confirm).
            other_dtype = cudf.dtype("float32")
            other = other_dtype.type(other)
        if self.dtype.kind == "b":
            # For boolean columns the dtype is chosen by ``min_signed_type``
            # instead of the promoted dtype computed above.
            other_dtype = min_signed_type(other)
        if np.isscalar(other):
            return cudf.dtype(other_dtype).type(other)
        else:
            # Non-scalar values are broadcast to the column's length.
            ary = utils.scalar_broadcast_to(
                other, size=len(self), dtype=other_dtype
            )
            return column.build_column(
                data=Buffer(ary), dtype=ary.dtype, mask=self.mask,
            )
    else:
        raise TypeError(f"cannot broadcast {type(other)}")
def can_cast_safely(self, to_dtype: Dtype) -> bool:
    """
    Report whether this column can be cast to ``to_dtype`` safely.

    For a ``datetime64`` target, the column's maximum and minimum are
    expressed as timedeltas in the column's own unit and compared with
    the int64 maximum expressed in the target unit. For any other target
    only ``int64`` and object (``"O"``) are accepted.

    Parameters
    ----------
    to_dtype : Dtype
        The dtype the caller wants to cast to.

    Returns
    -------
    bool
    """
    if not np.issubdtype(to_dtype, np.datetime64):
        # can safely cast to representation, or string
        return bool(
            to_dtype == cudf.dtype("int64") or to_dtype == cudf.dtype("O")
        )

    target_unit, _ = np.datetime_data(to_dtype)
    own_unit, _ = np.datetime_data(self.dtype)
    int64_max = np.iinfo(cudf.dtype("int64")).max

    largest = np.timedelta64(
        self.max().astype(cudf.dtype("int64"), copy=False), own_unit
    )
    smallest = np.timedelta64(
        self.min().astype(cudf.dtype("int64"), copy=False), own_unit
    )

    # Largest value representable in the target unit, converted to the
    # column's own timedelta dtype so both sides compare in one unit.
    own_delta_dtype = np.timedelta64(0, own_unit).dtype
    upper_limit = np.timedelta64(int64_max, target_unit).astype(
        own_delta_dtype
    )

    # NOTE(review): the minimum is compared against the same upper limit
    # as the maximum; a lower-bound check may have been intended - confirm.
    return bool(largest <= upper_limit and smallest <= upper_limit)
def determine_out_dtype(lhs_dtype: Dtype, rhs_dtype: Dtype) -> Dtype:
    """
    Pick whichever of the two dtypes the other one can be cast to.

    Parameters
    ----------
    lhs_dtype, rhs_dtype : Dtype
        Candidate dtypes; each is passed through ``cudf.dtype`` before
        being tested with ``np.can_cast``.

    Returns
    -------
    Dtype
        ``rhs_dtype`` (as given) if the left dtype can be cast to it,
        otherwise ``lhs_dtype`` (as given) if the right dtype can be cast
        to it.

    Raises
    ------
    TypeError
        If neither dtype can be cast to the other.
    """
    lhs_resolved = cudf.dtype(lhs_dtype)
    rhs_resolved = cudf.dtype(rhs_dtype)

    if np.can_cast(lhs_resolved, rhs_resolved):
        return rhs_dtype
    if np.can_cast(rhs_resolved, lhs_resolved):
        return lhs_dtype
    raise TypeError(f"Cannot type-cast {lhs_dtype} and {rhs_dtype}")
def _binop_result_dtype_or_error(self, other, op):
    """
    Work out the result dtype of ``self <op> other``.

    Comparison operators always yield ``np.bool_``. Otherwise the dtype
    comes from ``get_allowed_combinations_for_operator``; when that helper
    reports ``"M"`` or ``"m"`` the result is derived from the operands'
    own datetime/timedelta dtypes.

    Parameters
    ----------
    other : object exposing a ``dtype`` attribute
    op : str
        Dunder name of the operator, e.g. ``"__sub__"``.
    """
    if op in {"__eq__", "__ne__", "__lt__", "__gt__", "__le__", "__ge__"}:
        return np.bool_

    out_dtype = get_allowed_combinations_for_operator(
        self.dtype, other.dtype, op
    )

    if out_dtype not in {"M", "m"}:
        return cudf.dtype(out_dtype)

    # datetime handling
    temporal_chars = {"M", "m"}
    self_is_temporal = self.dtype.char in temporal_chars
    other_is_temporal = other.dtype.char in temporal_chars

    # Exactly one temporal operand: its dtype is the result dtype.
    if self_is_temporal and not other_is_temporal:
        return self.dtype
    if other_is_temporal and not self_is_temporal:
        return other.dtype

    # datetime - datetime gives a timedelta in the unit of max(dtypes).
    if op == "__sub__" and self.dtype.char == other.dtype.char == "M":
        res, _ = np.datetime_data(max(self.dtype, other.dtype))
        return cudf.dtype(f"m8[{res}]")
    return np.result_type(self.dtype, other.dtype)
def test_null_series(nrows, dtype):
    """
    Check that the repr of a cudf Series containing nulls matches the
    repr of the equivalent pandas Series once pandas' null spellings and
    nullable-integer dtype names are normalized.

    ``display.max_rows`` is reset in a ``finally`` block so that a failing
    assertion does not leak the modified pandas option.
    """
    size = 5
    mask = utils.random_bitmask(size)
    data = cudf.Series(np.random.randint(1, 9, size))
    column = data.set_mask(mask)
    sr = cudf.Series(column).astype(dtype)

    if dtype != "category" and cudf.dtype(dtype).kind in {"u", "i"}:
        # Integer dtypes: build the pandas Series from the raw host data
        # with the mapped pandas dtype, then blank out the null positions.
        ps = pd.Series(
            sr._column.data_array_view.copy_to_host(),
            dtype=cudf_dtypes_to_pandas_dtypes.get(
                cudf.dtype(dtype), cudf.dtype(dtype)
            ),
        )
        ps[sr.isnull().to_pandas()] = pd.NA
    else:
        ps = sr.to_pandas()

    pd.options.display.max_rows = int(nrows)
    try:
        psrepr = ps.__repr__()
        for null_spelling in ("NaN", "NaT", "None"):
            psrepr = psrepr.replace(null_spelling, "<NA>")

        if dtype.startswith(("int", "uint", "long")):
            psrepr = psrepr.replace(
                str(sr._column.default_na_value()) + "\n", "<NA>\n"
            )

        # pandas capitalizes its nullable integer dtype names.
        if "UInt" in psrepr:
            psrepr = psrepr.replace("UInt", "uint")
        elif "Int" in psrepr:
            psrepr = psrepr.replace("Int", "int")

        assert psrepr.split() == sr.__repr__().split()
    finally:
        pd.reset_option("display.max_rows")
def test_null_series(nrows, dtype):
    """
    Check that the repr of a cudf Series with randomly nulled entries
    matches the repr of the equivalent pandas Series once pandas' null
    spellings and nullable-integer dtype names are normalized.

    ``display.max_rows`` is reset in a ``finally`` block so that a failing
    assertion does not leak the modified pandas option.
    """
    size = 5
    sr = cudf.Series(np.random.randint(1, 9, size)).astype(dtype)
    sr[np.random.choice([False, True], size=size)] = None

    if dtype != "category" and cudf.dtype(dtype).kind in {"u", "i"}:
        # Integer dtypes: build the pandas Series from the raw host data
        # with the mapped pandas dtype, then blank out the null positions.
        ps = pd.Series(
            sr._column.data_array_view.copy_to_host(),
            dtype=np_dtypes_to_pandas_dtypes.get(
                cudf.dtype(dtype), cudf.dtype(dtype)
            ),
        )
        ps[sr.isnull().to_pandas()] = pd.NA
    else:
        ps = sr.to_pandas()

    pd.options.display.max_rows = int(nrows)
    try:
        psrepr = ps.__repr__()
        for null_spelling in ("NaN", "NaT", "None"):
            psrepr = psrepr.replace(null_spelling, "<NA>")

        # pandas capitalizes its nullable integer dtype names.
        if "UInt" in psrepr:
            psrepr = psrepr.replace("UInt", "uint")
        elif "Int" in psrepr:
            psrepr = psrepr.replace("Int", "int")

        assert psrepr.split() == sr.__repr__().split()
    finally:
        pd.reset_option("display.max_rows")
def _unaop_result_type_or_error(self, op):
    """
    Return the result dtype of applying unary operator ``op``.

    ``__ceil__`` and ``__floor__`` produce ``float32`` when the dtype's
    ``char`` is one of ``bBhHf?`` and ``float64`` otherwise; every other
    operator keeps ``self.dtype``.

    Raises
    ------
    TypeError
        If ``op`` is ``"__neg__"`` and the dtype is boolean.
    """
    if op == "__neg__" and self.dtype == "bool":
        raise TypeError(
            "Boolean scalars in cuDF do not support"
            " negation, use logical not"
        )

    if op not in {"__ceil__", "__floor__"}:
        return self.dtype

    fits_in_float32 = self.dtype.char in "bBhHf?"
    return cudf.dtype("float32" if fits_in_float32 else "float64")
def _can_cast(from_dtype, to_dtype):
    """
    Utility function to determine if we can cast from `from_dtype` to
    `to_dtype`. This function primarily calls `np.can_cast` but with some
    special handling around cudf specific dtypes.

    Parameters
    ----------
    from_dtype : dtype-like, None or cudf.NA
        Source dtype. ``None`` and ``cudf.NA`` are always castable.
    to_dtype : dtype-like
        Target dtype.

    Returns
    -------
    bool
        Always a boolean: a decimal source paired with a target that is
        neither a decimal nor a NumPy dtype yields ``False`` rather than
        falling through to an implicit ``None``.
    """
    if from_dtype in {None, cudf.NA}:
        return True
    if isinstance(from_dtype, type):
        from_dtype = cudf.dtype(from_dtype)
    if isinstance(to_dtype, type):
        to_dtype = cudf.dtype(to_dtype)

    # NumPy dtype kinds that are treated as interchangeable with decimals.
    decimal_compatible_kinds = {"i", "f", "u", "U", "O"}

    # TODO : Add precision & scale checking for
    # decimal types in future
    if isinstance(from_dtype, cudf.core.dtypes.DecimalDtype):
        if isinstance(to_dtype, cudf.core.dtypes.DecimalDtype):
            return True
        if isinstance(to_dtype, np.dtype):
            return to_dtype.kind in decimal_compatible_kinds
        return False

    if isinstance(from_dtype, np.dtype):
        if isinstance(to_dtype, np.dtype):
            return np.can_cast(from_dtype, to_dtype)
        if isinstance(to_dtype, cudf.core.dtypes.DecimalDtype):
            return from_dtype.kind in decimal_compatible_kinds
        # CategoricalDtype is referenced through ``cudf.core.dtypes``, the
        # same module path used by every other check in this function.
        return isinstance(to_dtype, cudf.core.dtypes.CategoricalDtype)

    if isinstance(from_dtype, cudf.core.dtypes.ListDtype):
        # TODO: Add level based checks too once casting of
        # list columns is supported
        if isinstance(to_dtype, cudf.core.dtypes.ListDtype):
            return np.can_cast(from_dtype.leaf_type, to_dtype.leaf_type)
        return False

    if isinstance(from_dtype, cudf.core.dtypes.CategoricalDtype):
        if isinstance(to_dtype, cudf.core.dtypes.CategoricalDtype):
            return True
        if isinstance(to_dtype, np.dtype):
            return np.can_cast(from_dtype._categories.dtype, to_dtype)
        return False

    return np.can_cast(from_dtype, to_dtype)
def _convert_str_col(col, errors, _downcast=None):
    """
    Convert a string column to a numeric column.

    The result is an ``i8`` column when every string is integer-like.
    Otherwise, when every string is float-like, the result is a float
    column: ``f`` if ``_downcast`` requests an integer downcast (a
    warning about float32 precision is emitted), ``d`` otherwise. If some
    strings are not numeric and ``errors == "coerce"``, the column is
    converted with ``stod`` and the non-numeric positions are set to
    null.

    Parameters
    ----------
    col : The string column to convert, must be string dtype
    errors : {'raise', 'ignore', 'coerce'}, same as ``to_numeric``
    _downcast : Same as ``to_numeric``, see description for use

    Returns
    -------
    Converted numeric column

    Raises
    ------
    TypeError
        If ``col`` is not of string dtype.
    ValueError
        If some strings are not numeric and ``errors`` is not
        ``"coerce"``.
    """
    if not is_string_dtype(col):
        raise TypeError("col must be string dtype.")

    if libstrings.is_integer(col).all():
        return col.as_numerical_column(dtype=cudf.dtype("i8"))

    col = _proc_inf_empty_strings(col)
    float_like = libstrings.is_float(col)

    if not float_like.all():
        if errors == "coerce":
            col = libcudf.string_casting.stod(col)
            col[float_like.unary_operator("not")] = None
            return col
        raise ValueError("Unable to convert some strings to numerics.")

    if _downcast in {"unsigned", "signed", "integer"}:
        warnings.warn(
            UserWarning(
                "Downcasting from float to int will be "
                "limited by float32 precision."
            )
        )
        return col.as_numerical_column(dtype=cudf.dtype("f"))
    return col.as_numerical_column(dtype=cudf.dtype("d"))
def find_common_type(dtypes):
    """
    Wrapper over np.find_common_type to handle special cases

    Corner cases:
    1. "M8", "M8" -> "M8" | "m8", "m8" -> "m8"

    Decimal dtypes are handled first: if every dtype is decimal or
    numerical the decimal dtypes are combined with
    ``_find_common_type_decimal``, otherwise object dtype is returned.

    Parameters
    ----------
    dtypes : iterable, sequence of dtypes to find common types

    Returns
    -------
    dtype : np.dtype optional, the result from np.find_common_type,
    None if input is empty
    """
    if len(dtypes) == 0:
        return None

    # Aggregate same types
    unique_dtypes = set(dtypes)

    decimal_dtypes = [dt for dt in unique_dtypes if is_decimal_dtype(dt)]
    if decimal_dtypes:
        only_decimal_or_numeric = all(
            is_decimal_dtype(dt) or is_numerical_dtype(dt)
            for dt in unique_dtypes
        )
        if only_decimal_or_numeric:
            return _find_common_type_decimal(decimal_dtypes)
        return cudf.dtype("O")

    # Corner case 1:
    # Resort to np.result_type to handle "M" and "m" types separately:
    # each family is collapsed into a single representative dtype.
    for belongs_to_family in (
        is_datetime_dtype,
        pd.api.types.is_timedelta64_dtype,
    ):
        family = {dt for dt in unique_dtypes if belongs_to_family(dt)}
        if len(family) > 0:
            unique_dtypes = unique_dtypes - family
            unique_dtypes.add(np.result_type(*family))

    common_dtype = np.find_common_type(list(unique_dtypes), [])
    return cudf.dtype(common_dtype)
def _binary_op_truediv(
    self, rhs: BinaryOperand
) -> Tuple["column.ColumnBase", BinaryOperand, DtypeObj]:
    """
    Prepare the operands and output dtype for a true division.

    When ``rhs`` is a timedelta, both operands are cast to a common dtype
    (via ``determine_out_dtype``) and then to ``float64``, and the output
    dtype is ``float64``. When ``rhs`` is of float/int/unsigned kind, the
    operands are returned untouched and the output keeps ``self.dtype``.

    Returns
    -------
    tuple
        ``(lhs, rhs, out_dtype)``.

    Raises
    ------
    TypeError
        If ``rhs`` is neither a timedelta nor of kind ``f``/``i``/``u``.
    """
    if not pd.api.types.is_timedelta64_dtype(rhs.dtype):
        if rhs.dtype.kind in ("f", "i", "u"):
            return self, rhs, self.dtype
        raise TypeError(
            f"Division of {self.dtype} with {rhs.dtype} "
            f"cannot be performed."
        )

    common_dtype = determine_out_dtype(self.dtype, rhs.dtype)
    lhs = self.astype(common_dtype).astype("float64")

    if not isinstance(rhs, cudf.Scalar):
        rhs = rhs.astype(common_dtype).astype("float64")
    elif rhs.is_valid():
        rhs = rhs.value.astype(common_dtype).astype("float64")
    else:
        # A null scalar stays null, but typed as float64.
        rhs = cudf.Scalar(None, "float64")

    return lhs, rhs, cudf.dtype("float64")
def __init__(
    self,
    data: Buffer,
    dtype: DtypeObj,
    mask: Buffer = None,
    size: int = None,  # TODO: make this non-optional
    offset: int = 0,
    null_count: int = None,
):
    """
    Validate the buffer against the element size and initialize the
    parent class.

    Parameters
    ----------
    data : Buffer
        Buffer whose ``size`` must be a multiple of the dtype's itemsize.
    dtype : DtypeObj
        Element dtype; normalized with ``cudf.dtype``.
    mask : Buffer, optional
    size : int, optional
        When omitted, computed as ``data.size // itemsize - offset``.
    offset : int, default 0
    null_count : int, optional

    Raises
    ------
    ValueError
        If ``data.size`` is not divisible by the element size.
    """
    dtype = cudf.dtype(dtype)
    element_width = dtype.itemsize

    if data.size % element_width:
        raise ValueError("Buffer size must be divisible by element size")

    if size is None:
        size = data.size // element_width - offset

    super().__init__(
        data,
        size=size,
        dtype=dtype,
        mask=mask,
        offset=offset,
        null_count=null_count,
    )
def test_generic_ptx(dtype):
    """
    Compile ``a ** 3 + b`` to PTX, run it through ``binaryop_udf`` on two
    random columns, and compare with the same expression in NumPy.
    """
    size = 500

    lhs_arr = np.random.random(size).astype(dtype)
    rhs_arr = np.random.random(size).astype(dtype)
    lhs_col = Series(lhs_arr)._column
    rhs_col = Series(rhs_arr)._column

    def generic_function(a, b):
        return a ** 3 + b

    numba_type = numpy_support.from_dtype(cudf.dtype(dtype))
    ptx_code, output_type = compile_ptx(
        generic_function, (numba_type, numba_type), device=True
    )

    # The UDF's output type is whatever the compiler inferred.
    out_type = numpy_support.as_dtype(output_type).type
    out_col = libcudf.binaryop.binaryop_udf(
        lhs_col, rhs_col, ptx_code, out_type
    )

    expected = lhs_arr ** 3 + rhs_arr
    np.testing.assert_almost_equal(expected, out_col.to_array())
def min_column_type(x, expected_type):
    """
    Return the smallest dtype that can represent both the minimum and the
    maximum of the `NumericalColumn` `x`.

    Narrowing is attempted when `x` has a floating dtype or when
    `expected_type` is an integer dtype; otherwise the dtype of `x` is
    returned unchanged. A column with no valid values also keeps its
    dtype.

    Raises
    ------
    TypeError
        If `x` is not a `NumericalColumn`.
    """
    if not isinstance(x, cudf.core.column.NumericalColumn):
        raise TypeError("Argument x must be of type column.NumericalColumn")
    if x.valid_count == 0:
        return x.dtype

    should_narrow = np.issubdtype(x.dtype, np.floating) or np.issubdtype(
        expected_type, np.integer
    )
    if not should_narrow:
        return cudf.dtype(x.dtype)

    upper_dtype = np.min_scalar_type(x.max())
    lower_dtype = np.min_scalar_type(x.min())
    return cudf.dtype(np.promote_types(upper_dtype, lower_dtype))
def __init__(
    self,
    data: Buffer,
    dtype: DtypeObj,
    mask: Buffer = None,
    size: int = None,  # TODO: make non-optional
    offset: int = 0,
    null_count: int = None,
):
    """
    Validate the buffer against the element size, initialize the parent
    class and record the datetime unit in ``self._time_unit``.

    Parameters
    ----------
    data : Buffer
        Buffer whose ``size`` must be a multiple of the dtype's itemsize.
    dtype : DtypeObj
        Element dtype; normalized with ``cudf.dtype``.
    mask : Buffer, optional
    size : int, optional
        When omitted, computed as ``data.size // itemsize - offset``.
    offset : int, default 0
    null_count : int, optional

    Raises
    ------
    ValueError
        If ``data.size`` is not divisible by the element size.
    TypeError
        If the resulting dtype is not a ``np.datetime64`` dtype.
    """
    dtype = cudf.dtype(dtype)
    element_width = dtype.itemsize

    if data.size % element_width:
        raise ValueError("Buffer size must be divisible by element size")

    if size is None:
        size = data.size // element_width - offset

    super().__init__(
        data,
        size=size,
        dtype=dtype,
        mask=mask,
        offset=offset,
        null_count=null_count,
    )

    if self.dtype.type is not np.datetime64:
        raise TypeError(f"{self.dtype} is not a supported datetime type")

    self._time_unit, _ = np.datetime_data(self.dtype)
def element_type(self) -> Dtype:
    """
    Return the dtype of the elements held by this list type.

    Nested list and struct value types are converted with the matching
    ``from_arrow`` constructor; any other value type yields the ``name``
    of the cudf dtype built from its pandas dtype.
    """
    value_type = self._typ.value_type

    if isinstance(value_type, pa.ListType):
        return ListDtype.from_arrow(value_type)
    if isinstance(value_type, pa.StructType):
        return StructDtype.from_arrow(value_type)
    return cudf.dtype(value_type.to_pandas_dtype()).name
def get_values_for_nested_data(dtype, lists_max_length=None, size=None):
    """
    Generate values of the given dtype.

    The number of values is ``size`` when provided; otherwise it is drawn
    with ``np.random.randint(0, lists_max_length)``.

    Raises
    ------
    TypeError
        If the dtype kind is not one of i, u, f, U, O, M, m or b.
    """
    cardinality = (
        np.random.randint(0, lists_max_length) if size is None else size
    )

    dtype = cudf.dtype(dtype)
    kind = dtype.kind

    if kind in ("i", "u"):
        return int_generator(dtype=dtype, size=cardinality)()
    if kind == "f":
        return float_generator(dtype=dtype, size=cardinality)()
    if kind in ("U", "O"):
        return [
            mimesis.random.random.schoice(string.printable, 100)
            for _ in range(cardinality)
        ]
    if kind == "M":
        return datetime_generator(dtype=dtype, size=cardinality)().astype(
            dtype
        )
    if kind == "m":
        return timedelta_generator(dtype=dtype, size=cardinality)().astype(
            dtype
        )
    if kind == "b":
        return boolean_generator(cardinality)().astype(dtype)

    raise TypeError(f"Unsupported dtype: {dtype}")
def binary_operator(
    self,
    op: str,
    rhs: Union[ColumnBase, "cudf.Scalar"],
    reflect: bool = False,
) -> ColumnBase:
    """
    Apply binary operator ``op`` between this column and ``rhs``.

    Parameters
    ----------
    op : str
        Operator name. Comparisons (``eq``, ``ne``, ``lt``, ``gt``,
        ``le``, ``ge``, ``NULL_EQUALS``) are accepted for any ``rhs``;
        ``add`` requires a timedelta ``rhs``; ``sub`` requires a
        timedelta or datetime ``rhs``.
    rhs : ColumnBase or cudf.Scalar
        The other operand. A ``cudf.DateOffset`` is delegated to its own
        ``_datetime_binop``.
    reflect : bool, default False
        When True the operands are swapped before the operation runs.

    Returns
    -------
    ColumnBase

    Raises
    ------
    TypeError
        If ``op`` is not supported for the dtype of ``rhs``.
    """
    if isinstance(rhs, cudf.DateOffset):
        return rhs._datetime_binop(self, op, reflect=reflect)
    lhs: Union[ScalarLike, ColumnBase] = self
    if op in ("eq", "ne", "lt", "gt", "le", "ge", "NULL_EQUALS"):
        out_dtype = cudf.dtype(np.bool_)  # type: Dtype
    elif op == "add" and pd.api.types.is_timedelta64_dtype(rhs.dtype):
        out_dtype = cudf.core.column.timedelta._timedelta_add_result_dtype(
            rhs, lhs
        )
    elif op == "sub" and pd.api.types.is_timedelta64_dtype(rhs.dtype):
        # The helper receives the operands in the order in which the
        # subtraction will actually run, i.e. swapped when reflecting.
        out_dtype = cudf.core.column.timedelta._timedelta_sub_result_dtype(
            rhs if reflect else lhs, lhs if reflect else rhs
        )
    elif op == "sub" and pd.api.types.is_datetime64_dtype(rhs.dtype):
        # datetime - datetime: the result is a timedelta whose unit is
        # whichever operand unit appears later in ``units``.
        units = ["s", "ms", "us", "ns"]
        lhs_time_unit = cudf.utils.dtypes.get_time_unit(lhs)
        lhs_unit = units.index(lhs_time_unit)
        rhs_time_unit = cudf.utils.dtypes.get_time_unit(rhs)
        rhs_unit = units.index(rhs_time_unit)
        out_dtype = np.dtype(
            f"timedelta64[{units[max(lhs_unit, rhs_unit)]}]"
        )
    else:
        raise TypeError(
            f"Series of dtype {self.dtype} cannot perform "
            f" the operation {op}"
        )

    # The swap happens only after the output dtype has been decided.
    if reflect:
        lhs, rhs = rhs, lhs
    return libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype)
def get_values_for_nested_data(dtype, lists_max_length):
    """
    Generate a random number (``np.random.randint(0, lists_max_length)``)
    of values of the given dtype.

    NumPy arrays are converted with ``tolist()`` before being returned;
    any other result is returned as produced.

    Raises
    ------
    TypeError
        If the dtype kind is not one of i, u, f, U, O, M, m or b.
    """
    count = np.random.randint(0, lists_max_length)
    dtype = cudf.dtype(dtype)
    kind = dtype.kind

    if kind in ("i", "u"):
        generated = int_generator(dtype=dtype, size=count)()
    elif kind == "f":
        generated = float_generator(dtype=dtype, size=count)()
    elif kind in ("U", "O"):
        generated = [
            mimesis.random.random.schoice(string.printable, 100)
            for _ in range(count)
        ]
    elif kind == "M":
        generated = datetime_generator(dtype=dtype, size=count)().astype(
            dtype
        )
    elif kind == "m":
        generated = timedelta_generator(dtype=dtype, size=count)().astype(
            dtype
        )
    elif kind == "b":
        generated = boolean_generator(count)().astype(dtype)
    else:
        raise TypeError(f"Unsupported dtype: {dtype}")

    # To ensure numpy arrays are not passed as input to
    # list constructor, returning a python list object here.
    return generated.tolist() if isinstance(generated, np.ndarray) else generated
def dtype(self):
    """
    Return the dtype of this scalar.

    When the host value is not current the dtype is read from
    ``self.device_value``. Otherwise a ``str`` host value reports object
    dtype and anything else reports ``self._host_dtype``.
    """
    if not self._is_host_value_current:
        return self.device_value.dtype
    if isinstance(self._host_value, str):
        return cudf.dtype("object")
    return self._host_dtype
def _preprocess_host_value(self, value, dtype):
    """
    Normalize a host-side ``value`` and resolve the ``dtype`` to pair
    with it.

    Parameters
    ----------
    value : object
        The host value; may be a list, a dict, a ``decimal.Decimal``, a
        null-like value, or anything ``to_cudf_compatible_scalar``
        accepts.
    dtype : dtype-like or None
        Requested dtype; inferred from ``value`` when ``None``.

    Returns
    -------
    tuple
        ``(value, dtype)``. ``value`` is replaced by ``NA`` when the
        input was detected as null.

    Raises
    ------
    TypeError
        If a list is given together with a dtype, if a null value is
        given without a dtype (unless it is a non-generic NaT), or if a
        generic NaT is given without a dtype.
    ValueError
        If a non-null, non-list value is paired with a ``ListDtype``, or
        a non-null, non-dict value is paired with a ``StructDtype``.
    """
    # Nullness is decided from the value as passed in, before any
    # conversion below.
    valid = not cudf._lib.scalar._is_null_host_scalar(value)

    if isinstance(value, list):
        if dtype is not None:
            raise TypeError("Lists may not be cast to a different dtype")
        else:
            # The list dtype is inferred from the value through pyarrow.
            dtype = ListDtype.from_arrow(
                pa.infer_type([value], from_pandas=True)
            )
            return value, dtype
    elif isinstance(dtype, ListDtype):
        # A ListDtype with a non-list value is only valid for nulls.
        if value not in {None, NA}:
            raise ValueError(f"Can not coerce {value} to ListDtype")
        else:
            return NA, dtype

    if isinstance(value, dict):
        if dtype is None:
            # The struct dtype is inferred from the value through pyarrow.
            dtype = StructDtype.from_arrow(
                pa.infer_type([value], from_pandas=True)
            )
        return value, dtype
    elif isinstance(dtype, StructDtype):
        # A StructDtype with a non-dict value is only valid for nulls.
        if value not in {None, NA}:
            raise ValueError(f"Can not coerce {value} to StructDType")
        else:
            return NA, dtype

    if isinstance(dtype, Decimal64Dtype):
        # Round-trip through a pyarrow decimal128 scalar built with the
        # dtype's precision and scale.
        value = pa.scalar(
            value, type=pa.decimal128(dtype.precision, dtype.scale)
        ).as_py()
    if isinstance(value, decimal.Decimal) and dtype is None:
        dtype = Decimal64Dtype._from_decimal(value)

    value = to_cudf_compatible_scalar(value, dtype=dtype)

    if dtype is None:
        if not valid:
            # A null without a dtype is only accepted when it is a NaT
            # that carries a concrete time unit.
            if isinstance(value, (np.datetime64, np.timedelta64)):
                unit, _ = np.datetime_data(value)
                if unit == "generic":
                    raise TypeError(
                        "Cant convert generic NaT to null scalar"
                    )
                else:
                    dtype = value.dtype
            else:
                raise TypeError(
                    "dtype required when constructing a null scalar"
                )
        else:
            dtype = value.dtype

    # Decimal64Dtype is kept as is; everything else goes through cudf.dtype.
    if not isinstance(dtype, Decimal64Dtype):
        dtype = cudf.dtype(dtype)

    if not valid:
        value = NA

    return value, dtype
def test_max(dtype, nelem):
    """Series.max() must equal the NumPy maximum cast to the same type."""
    scalar_type = cudf.dtype(dtype).type
    values = gen_rand(scalar_type, nelem)

    got = Series(values).max()
    expect = scalar_type(values.max())

    assert expect == got
def _buffer_data_from_array_interface(array_interface):
    """
    Extract the data pointer and the byte size from an array-interface
    dictionary.

    Parameters
    ----------
    array_interface : dict
        Must provide the ``"data"``, ``"typestr"`` and ``"shape"`` keys.

    Returns
    -------
    tuple
        ``(ptr, nbytes)``. ``ptr`` is 0 when the interface reports
        ``None``; an empty shape is treated as ``(1,)``.
    """
    raw_ptr = array_interface["data"][0]
    ptr = 0 if raw_ptr is None else raw_ptr

    itemsize = cudf.dtype(array_interface["typestr"]).itemsize

    shape = array_interface["shape"]
    if len(shape) == 0:
        shape = (1,)

    element_count = functools.reduce(operator.mul, shape)
    return ptr, element_count * itemsize
def test_sum(dtype, nelem):
    """
    Series.sum() must approximately equal the NumPy sum; float32 is
    compared to 4 significant digits, everything else to 6.
    """
    scalar_type = cudf.dtype(dtype).type
    values = gen_rand(scalar_type, nelem)

    got = Series(values).sum()
    expect = values.sum()

    if scalar_type == np.float32:
        significant = 4
    else:
        significant = 6
    np.testing.assert_approx_equal(expect, got, significant=significant)
def test_null_scalar(dtype):
    """
    A Scalar built from ``None`` must hold ``cudf.NA``, keep the
    requested dtype and report itself as invalid.
    """
    s = cudf.Scalar(None, dtype=dtype)
    assert s.value is cudf.NA

    # Decimal dtypes are compared as given; others go through cudf.dtype.
    if isinstance(dtype, cudf.core.dtypes.DecimalDtype):
        expected_dtype = dtype
    else:
        expected_dtype = cudf.dtype(dtype)
    assert s.dtype == expected_dtype

    assert s.is_valid() is False
def confirm_1d_contiguous(array_interface):
    """
    Validate that an array-interface dictionary describes single-byte,
    C-contiguous data.

    Parameters
    ----------
    array_interface : dict
        Must provide the ``"strides"``, ``"shape"`` and ``"typestr"``
        keys.

    Raises
    ------
    TypeError
        If the typestr is neither ``"|i1"`` nor ``"|u1"``.
    ValueError
        If ``get_c_contiguity`` reports the data as not C-contiguous.
    """
    strides = array_interface["strides"]
    shape = array_interface["shape"]
    typestr = array_interface["typestr"]
    itemsize = cudf.dtype(typestr).itemsize

    if typestr not in ("|i1", "|u1"):
        raise TypeError("Buffer data must be of uint8 type")

    is_contiguous = get_c_contiguity(shape, strides, itemsize)
    if not is_contiguous:
        raise ValueError("Buffer data must be 1D C-contiguous")
def test_product(dtype, nelem):
    """
    Series.product() must approximately equal pandas' product; float32
    is compared to 4 significant digits, everything else to 6.
    """
    np.random.seed(0)
    scalar_type = cudf.dtype(dtype).type

    if cudf.dtype(scalar_type).kind in {"u", "i"}:
        values = np.ones(nelem, dtype=scalar_type)
        # Set at most 30 items to [0..2) to keep the value within 2^32
        for _ in range(30):
            # Draw the value before the position: this keeps the random
            # stream in the same order as a single subscript assignment.
            replacement = np.random.uniform() * 2
            position = np.random.randint(low=0, high=nelem, size=1)
            values[position] = replacement
    else:
        values = gen_rand(scalar_type, nelem)

    got = Series(values).product()
    expect = pd.Series(values).product()

    if scalar_type == np.float32:
        significant = 4
    else:
        significant = 6
    np.testing.assert_approx_equal(expect, got, significant=significant)
def test_series_construction_with_nulls(input_obj, dtype):
    """
    A categorical cudf Series built from typed values mixed with
    ``cudf.NA`` must round-trip to the same pandas Series.
    """
    resolved = cudf.dtype(dtype)
    to_scalar = resolved.type

    # Cast every non-null entry to the requested scalar type.
    typed_values = [
        cudf.NA if item is cudf.NA else to_scalar(item) for item in input_obj
    ]

    expect = pd.Series(typed_values, dtype="category")
    got = cudf.Series(typed_values, dtype="category").to_pandas()
    assert_eq(expect, got)
def _get_nan_for_dtype(dtype):
    """
    Return the missing-value sentinel matching ``dtype``.

    Datetime and timedelta dtypes give a ``"nat"`` of the same type and
    unit, float dtypes give a ``"nan"`` of the same type, and every other
    dtype gives ``np.float64("nan")``.
    """
    dtype = cudf.dtype(dtype)

    is_temporal = pd.api.types.is_datetime64_dtype(
        dtype
    ) or pd.api.types.is_timedelta64_dtype(dtype)
    if is_temporal:
        time_unit, _ = np.datetime_data(dtype)
        return dtype.type("nat", time_unit)

    if dtype.kind == "f":
        return dtype.type("nan")
    return np.float64("nan")
def as_string_column(
    self, dtype: Dtype, format=None, **kwargs
) -> "cudf.core.column.StringColumn":
    """
    Convert this column to a string column.

    A non-empty column is converted by the function registered for its
    dtype in ``string._numeric_to_str_typecast_functions``; an empty
    column yields an empty object-dtype column. The ``dtype``, ``format``
    and ``kwargs`` arguments are not used by this implementation.
    """
    if len(self) == 0:
        return cast(
            "cudf.core.column.StringColumn", as_column([], dtype="object")
        )

    converter = string._numeric_to_str_typecast_functions[
        cudf.dtype(self.dtype)
    ]
    return converter(self)