def test_maybe_promote_float_with_float(dtype, fill_value, expected_dtype, box):
    """
    Check promotion of a float/complex dtype when filled with a float/complex value.

    ``dtype`` and ``expected_dtype`` are normalised with ``np.dtype``; ``box``
    is unpacked into ``(boxed, box_dtype)``.  Known-failing parameter
    combinations are xfailed, every other combination is handed to
    ``_check_promote``.
    """
    dtype = np.dtype(dtype)
    expected_dtype = np.dtype(expected_dtype)
    boxed, box_dtype = box  # read from parametrized fixture

    value_dependent_pairs = [
        ("float32", "float64"),
        ("float32", "complex64"),
        ("complex64", "complex128"),
    ]

    # pytest.xfail raises, so at most one of these guards ever fires
    if box_dtype == object:
        pytest.xfail("falsely upcasts to object")
    if boxed and is_float_dtype(dtype) and is_complex_dtype(expected_dtype):
        pytest.xfail("does not upcast to complex")
    if boxed and (dtype, expected_dtype) in value_dependent_pairs:
        pytest.xfail("does not upcast correctly depending on value")

    # output is not a generic float, but corresponds to expected_dtype
    scalar_expectation = np.array([fill_value], dtype=expected_dtype)[0]
    array_expectation = np.nan

    _check_promote(
        dtype,
        fill_value,
        boxed,
        box_dtype,
        expected_dtype,
        scalar_expectation,
        array_expectation,
    )
def _isfinite(values):
    """
    Return a mask flagging the entries of ``values`` that are *not* finite.

    Datetime/timedelta input is answered with ``isna(values)``.  Complex,
    float, integer and bool input is passed to ``np.isfinite`` directly;
    anything else is first cast to ``'float64'``.  In both of the latter
    cases the ``np.isfinite`` result is inverted before it is returned.
    """
    if is_datetime_or_timedelta_dtype(values):
        return isna(values)

    checkable_as_is = (
        is_complex_dtype(values)
        or is_float_dtype(values)
        or is_integer_dtype(values)
        or is_bool_dtype(values)
    )
    candidate = values if checkable_as_is else values.astype('float64')
    return ~np.isfinite(candidate)
def test_maybe_promote_any_numpy_dtype_with_na(any_numpy_dtype_reduced, fill_value):
    """
    Check promotion of any numpy dtype when the fill value is a missing-value marker.

    The expected dtype and expected scalar are derived from the input dtype
    and from whether ``fill_value`` is ``NaT``; the comparison itself is
    delegated to ``_check_promote``.
    """
    dtype = np.dtype(any_numpy_dtype_reduced)
    fill_is_nat = fill_value is NaT

    if is_integer_dtype(dtype) and not fill_is_nat:
        # integer + other missing value (np.nan / None) casts to float
        want_dtype, want_scalar = np.float64, np.nan
    elif is_object_dtype(dtype) and fill_is_nat:
        # inserting into object does not cast the value
        # but *does* cast None to np.nan
        want_dtype, want_scalar = np.dtype(object), fill_value
    elif is_datetime_or_timedelta_dtype(dtype):
        # datetime / timedelta cast all missing values to dtyped-NaT
        want_dtype, want_scalar = dtype, dtype.type("NaT", "ns")
    elif fill_is_nat:
        # NaT upcasts everything that's not datetime/timedelta to object
        want_dtype, want_scalar = np.dtype(object), NaT
    elif is_float_dtype(dtype) or is_complex_dtype(dtype):
        # float / complex + missing value (!= NaT) stays the same
        want_dtype, want_scalar = dtype, np.nan
    else:
        # all other cases cast to object, and use np.nan as missing value
        want_dtype, want_scalar = np.dtype(object), np.nan

    _check_promote(dtype, fill_value, want_dtype, want_scalar)
def _get_result_dtype(self, dtype: DtypeObj) -> DtypeObj:
    """
    Pick the result dtype for ``self.how`` applied to data of ``dtype``.

    Parameters
    ----------
    dtype : np.dtype or ExtensionDtype
        Input dtype.

    Returns
    -------
    np.dtype or ExtensionDtype
        For "add"/"cumsum"/"sum"/"prod": numpy bool becomes int64 and
        BooleanDtype/IntegerDtype become Int64Dtype.  For
        "mean"/"median"/"var": BooleanDtype/IntegerDtype become
        Float64Dtype, float and complex are kept, other numeric dtypes
        become float64.  In every remaining case ``dtype`` is returned
        unchanged.
    """
    how = self.how
    is_masked_bool_or_int = isinstance(dtype, (BooleanDtype, IntegerDtype))

    if how in ["add", "cumsum", "sum", "prod"]:
        if dtype == np.dtype(bool):
            return np.dtype(np.int64)
        if is_masked_bool_or_int:
            return Int64Dtype()
        return dtype

    if how in ["mean", "median", "var"]:
        if is_masked_bool_or_int:
            return Float64Dtype()
        if is_float_dtype(dtype) or is_complex_dtype(dtype):
            return dtype
        if is_numeric_dtype(dtype):
            return np.dtype(np.float64)

    return dtype
def test_maybe_promote_any_with_string(any_numpy_dtype_reduced, string_dtype, box):
    """
    Check that filling any numpy dtype with a string promotes to object.

    ``box`` is unpacked into ``(boxed, box_dtype)``; a ``box_dtype`` of
    ``'str'`` is replaced by the parametrized string dtype before the
    expectations are handed to ``_check_promote``.
    """
    dtype = np.dtype(any_numpy_dtype_reduced)
    fill_dtype = np.dtype(string_dtype)
    boxed, box_dtype = box  # read from parametrized fixture

    if is_datetime_or_timedelta_dtype(dtype) and box_dtype != object:
        pytest.xfail('does not upcast or raises')
    if boxed and box_dtype in (None, 'str'):
        if (
            is_integer_dtype(dtype)
            or is_float_dtype(dtype)
            or is_complex_dtype(dtype)
            or issubclass(dtype.type, np.bytes_)
        ):
            pytest.xfail('does not upcast correctly')

    # create array of given dtype
    fill_value = 'abc'

    # special case for box_dtype (cannot use fixture in parametrization)
    if box_dtype == 'str':
        box_dtype = fill_dtype

    # filling anything with a string casts to object
    want_dtype = np.dtype(object)
    want_scalar = fill_value
    want_array_marker = np.nan

    _check_promote(dtype, fill_value, boxed, box_dtype, want_dtype,
                   want_scalar, want_array_marker)
def _get_result_dtype(self, dtype: np.dtype) -> np.dtype:
    """
    Get the desired dtype of a result based on the input dtype
    and how it was computed.

    Parameters
    ----------
    dtype : np.dtype
        Input dtype.

    Returns
    -------
    np.dtype
        The desired dtype of the result: int64 for bool input under a
        summing/product operation ("add", "cumsum", "sum", "prod");
        for "mean"/"median"/"var" float and complex are kept and any
        other numeric dtype becomes float64; otherwise ``dtype`` itself.
    """
    how = self.how

    # "add" must be listed here: it is the key used for summation by the
    # other dtype/operation checks in this file.  The list previously
    # repeated "sum" in its place, so bool input with how="add" kept its
    # bool dtype instead of being widened to int64.
    if how in ["add", "cumsum", "sum", "prod"]:
        if dtype == np.dtype(bool):
            return np.dtype(np.int64)
    elif how in ["mean", "median", "var"]:
        if is_float_dtype(dtype) or is_complex_dtype(dtype):
            return dtype
        elif is_numeric_dtype(dtype):
            return np.dtype(np.float64)
    return dtype
def test_maybe_promote_any_numpy_dtype_with_na(
    any_numpy_dtype_reduced, fill_value, box
):
    """
    Check promotion of any numpy dtype with a missing-value fill, scalar and boxed.

    Computes the expected dtype and scalar from the input dtype and from
    whether ``fill_value`` is ``NaT``, then the matching na-marker for the
    array case, and delegates the comparison to ``_check_promote``.
    """
    dtype = np.dtype(any_numpy_dtype_reduced)
    boxed, box_dtype = box  # read from parametrized fixture
    fill_is_nat = fill_value is NaT

    if dtype == bytes and not boxed and fill_value is not None and not fill_is_nat:
        pytest.xfail("does not upcast to object")
    elif is_integer_dtype(dtype) and not fill_is_nat:
        # integer + other missing value (np.nan / None) casts to float
        want_dtype, want_scalar = np.float64, np.nan
    elif is_object_dtype(dtype) and fill_is_nat:
        # inserting into object does not cast the value
        # but *does* cast None to np.nan
        want_dtype, want_scalar = np.dtype(object), fill_value
    elif is_datetime_or_timedelta_dtype(dtype):
        # datetime / timedelta cast all missing values to dtyped-NaT
        want_dtype, want_scalar = dtype, dtype.type("NaT", "ns")
    elif fill_is_nat:
        # NaT upcasts everything that's not datetime/timedelta to object
        want_dtype, want_scalar = np.dtype(object), NaT
    elif is_float_dtype(dtype) or is_complex_dtype(dtype):
        # float / complex + missing value (!= NaT) stays the same
        want_dtype, want_scalar = dtype, np.nan
    else:
        # all other cases cast to object, and use np.nan as missing value
        want_dtype, want_scalar = np.dtype(object), np.nan

    # array case has same expected_dtype; but returns corresponding na-marker
    if is_integer_dtype(want_dtype):
        # integers cannot hold NaNs; maybe_promote_with_array returns None
        want_array_marker = None
    elif is_datetime_or_timedelta_dtype(want_dtype):
        want_array_marker = want_dtype.type("NaT", "ns")
    else:
        # expected dtype is float / complex / object
        want_array_marker = np.nan

    _check_promote(
        dtype,
        fill_value,
        boxed,
        box_dtype,
        want_dtype,
        want_scalar,
        want_array_marker,
    )
def test_is_complex_dtype():
    """
    ``is_complex_dtype`` rejects int/str inputs and accepts complex ones.

    Covers Python types, a numpy scalar type, a Series and ndarrays.
    """
    assert not com.is_complex_dtype(int)
    assert not com.is_complex_dtype(str)
    assert not com.is_complex_dtype(pd.Series([1, 2]))
    assert not com.is_complex_dtype(np.array(["a", "b"]))

    # np.complex_ was only an alias of np.complex128 and no longer exists
    # in NumPy 2.x, so the canonical name is used instead.
    assert com.is_complex_dtype(np.complex128)
    assert com.is_complex_dtype(complex)
    assert com.is_complex_dtype(np.array([1 + 1j, 5]))
def test_is_complex_dtype():
    """
    ``is_complex_dtype`` rejects int/str inputs and accepts complex ones.

    Covers Python types, a numpy scalar type, a Series and ndarrays.
    """
    assert not com.is_complex_dtype(int)
    assert not com.is_complex_dtype(str)
    assert not com.is_complex_dtype(pd.Series([1, 2]))
    assert not com.is_complex_dtype(np.array(['a', 'b']))

    # np.complex was only an alias of the builtin ``complex`` and has been
    # removed from NumPy (AttributeError on current releases); test the
    # builtin and the concrete numpy scalar type instead.
    assert com.is_complex_dtype(complex)
    assert com.is_complex_dtype(np.complex128)
    assert com.is_complex_dtype(np.array([1 + 1j, 5]))
def test_maybe_promote_any_numpy_dtype_with_na(any_numpy_dtype_reduced, nulls_fixture):
    """
    Check promotion of any numpy dtype against every null marker in ``nulls_fixture``.

    The expected dtype/scalar pair is worked out per null kind (``Decimal``,
    ``NaT``, ``pd.NA`` and the rest) and per input dtype, then checked via
    ``_check_promote``.
    """
    fill_value = nulls_fixture
    dtype = np.dtype(any_numpy_dtype_reduced)
    fill_is_nat = fill_value is NaT

    if isinstance(fill_value, Decimal):
        # Subject to change, but ATM (When Decimal(NAN) is being added to nulls_fixture)
        #  this is the existing behavior in maybe_promote,
        #  hinges on is_valid_na_for_dtype
        kind = dtype.kind
        if kind in ["i", "u"]:
            want_dtype, want_scalar = np.dtype(np.float64), np.nan
        elif kind in ["f", "c"]:
            want_dtype, want_scalar = dtype, np.nan
        else:
            want_dtype, want_scalar = np.dtype(object), fill_value
    elif is_integer_dtype(dtype) and not fill_is_nat:
        # integer + other missing value (np.nan / None) casts to float
        want_dtype, want_scalar = np.float64, np.nan
    elif is_object_dtype(dtype) and fill_is_nat:
        # inserting into object does not cast the value
        # but *does* cast None to np.nan
        want_dtype, want_scalar = np.dtype(object), fill_value
    elif is_datetime_or_timedelta_dtype(dtype):
        # datetime / timedelta cast all missing values to dtyped-NaT
        want_dtype, want_scalar = dtype, dtype.type("NaT", "ns")
    elif fill_is_nat:
        # NaT upcasts everything that's not datetime/timedelta to object
        want_dtype, want_scalar = np.dtype(object), NaT
    elif is_float_dtype(dtype) or is_complex_dtype(dtype):
        # float / complex + missing value (!= NaT) stays the same
        want_dtype, want_scalar = dtype, np.nan
    else:
        # all other cases cast to object, and use np.nan as missing value
        want_dtype = np.dtype(object)
        want_scalar = pd.NA if fill_value is pd.NA else np.nan

    _check_promote(dtype, fill_value, want_dtype, want_scalar)
def test_maybe_promote_any_with_timedelta64(any_numpy_dtype_reduced,
                                            timedelta64_dtype, fill_value, box):
    """
    Check promotion of any numpy dtype when filled with a timedelta value.

    Known-failing combinations are xfailed.  A ``box_dtype`` of
    ``"td_dtype"`` is replaced by the parametrized timedelta dtype.
    timedelta64 input is expected to keep its dtype; every other dtype is
    expected to become object.
    """
    dtype = np.dtype(any_numpy_dtype_reduced)
    boxed, box_dtype = box  # read from parametrized fixture

    dtype_is_td = is_timedelta64_dtype(dtype)
    fill_is_td = is_timedelta64_dtype(type(fill_value))

    if dtype_is_td:
        if boxed and (box_dtype == object
                      or (box_dtype is None and not fill_is_td)):
            pytest.xfail("falsely upcasts to object")
    else:
        if boxed and box_dtype is None and fill_is_td:
            pytest.xfail("does not upcast correctly")
        if not boxed and fill_is_td and (
                is_integer_dtype(dtype)
                or is_float_dtype(dtype)
                or is_complex_dtype(dtype)
                or issubclass(dtype.type, np.bytes_)):
            pytest.xfail("does not upcast correctly")
        if box_dtype == "td_dtype":
            pytest.xfail("falsely upcasts")
        if not boxed and is_datetime64_dtype(dtype):
            pytest.xfail("raises error")

    # special case for box_dtype
    if box_dtype == "td_dtype":
        box_dtype = np.dtype(timedelta64_dtype)

    # filling anything but timedelta with timedelta casts to object
    if dtype_is_td:
        want_dtype = dtype
        # for timedelta dtypes, scalar values get cast to pd.Timedelta.value
        want_scalar = pd.Timedelta(fill_value).value
        want_array_marker = iNaT
    else:
        want_dtype = np.dtype(object)
        want_scalar = fill_value
        want_array_marker = np.nan

    _check_promote(
        dtype,
        fill_value,
        boxed,
        box_dtype,
        want_dtype,
        want_scalar,
        want_array_marker,
    )
def find_common_type(types):
    """
    Find a common data type among the given dtypes.

    Parameters
    ----------
    types : list of dtypes

    Returns
    -------
    pandas extension or numpy dtype
        ``types[0]`` when every entry is equal to it; the builtin
        ``object`` when an ExtensionDtype is mixed in or when bool is
        mixed with integer/float/complex; ``datetime64[ns]`` /
        ``timedelta64[ns]`` when all entries are datetime64 / timedelta64;
        otherwise whatever ``np.find_common_type(types, [])`` yields.

    Raises
    ------
    ValueError
        If ``types`` is empty.

    See Also
    --------
    numpy.find_common_type
    """
    if len(types) == 0:
        raise ValueError("no types given")

    first = types[0]

    # workaround for find_common_type([np.dtype('datetime64[ns]')] * 2)
    # => object
    if all(is_dtype_equal(first, t) for t in types[1:]):
        return first

    # ``np.object`` was merely an alias of the builtin ``object`` and has
    # been removed from NumPy; returning the builtin gives the same value
    # without the AttributeError on current NumPy.
    if any(isinstance(t, ExtensionDtype) for t in types):
        return object

    # take lowest unit
    if all(is_datetime64_dtype(t) for t in types):
        return np.dtype("datetime64[ns]")
    if all(is_timedelta64_dtype(t) for t in types):
        return np.dtype("timedelta64[ns]")

    # don't mix bool / int or float or complex
    # this is different from numpy, which casts bool with float/int as int
    has_bools = any(is_bool_dtype(t) for t in types)
    if has_bools and any(
        is_integer_dtype(t) or is_float_dtype(t) or is_complex_dtype(t)
        for t in types
    ):
        return object

    # NOTE(review): np.find_common_type is deprecated in recent NumPy
    # releases -- confirm which NumPy versions this file must support.
    return np.find_common_type(types, [])
def test_maybe_promote_float_with_float(dtype, fill_value, expected_dtype, box):
    """
    Check promotion of a float/complex dtype when filled with a float/complex value.

    ``box`` is unpacked into ``(boxed, box_dtype)``.  Known-failing
    combinations are xfailed, including several that only apply to the
    unboxed (scalar) case; the rest are handed to ``_check_promote``.
    """
    dtype = np.dtype(dtype)
    expected_dtype = np.dtype(expected_dtype)
    boxed, box_dtype = box  # read from parametrized fixture

    value_dependent_pairs = [
        ("float32", "float64"),
        ("float32", "complex64"),
        ("complex64", "complex128"),
    ]

    if box_dtype == object:
        pytest.xfail("falsely upcasts to object")
    if boxed and is_float_dtype(dtype) and is_complex_dtype(expected_dtype):
        pytest.xfail("does not upcast to complex")
    if (dtype, expected_dtype) in value_dependent_pairs:
        pytest.xfail("does not upcast correctly depending on value")

    # this following xfails are "only" a consequence of the - now strictly
    # enforced - principle that maybe_promote_with_scalar always casts
    if not boxed:
        if abs(fill_value) < 2:
            pytest.xfail("wrong return type of fill_value")
        if (
            dtype == "complex128"
            and expected_dtype == "complex128"
            and is_float_dtype(type(fill_value))
        ):
            pytest.xfail("wrong return type of fill_value")

    # output is not a generic float, but corresponds to expected_dtype
    scalar_expectation = np.array([fill_value], dtype=expected_dtype)[0]
    array_expectation = np.nan

    _check_promote(
        dtype,
        fill_value,
        boxed,
        box_dtype,
        expected_dtype,
        scalar_expectation,
        array_expectation,
    )
def test_maybe_promote_bytes_with_any(bytes_dtype, any_numpy_dtype_reduced, box):
    """
    Check promotion of a bytes dtype when filled with a value of any numpy dtype.

    Every bytes-with-bytes combination is xfailed, as are several
    non-bytes ones.  For the remaining cases the fill value is ``1`` cast
    to the fill dtype and the expected result dtype is object.
    """
    dtype = np.dtype(bytes_dtype)
    fill_dtype = np.dtype(any_numpy_dtype_reduced)
    boxed, box_dtype = box  # read from parametrized fixture

    fill_is_bytes = issubclass(fill_dtype.type, np.bytes_)

    if fill_is_bytes:
        if not boxed or box_dtype == object:
            pytest.xfail("falsely upcasts to object")
        # takes the opinion that bool dtype has no missing value marker
        pytest.xfail("wrong missing value marker")
    else:
        if boxed and box_dtype is None:
            pytest.xfail("does not upcast to object")
        if (
            is_integer_dtype(fill_dtype)
            or is_float_dtype(fill_dtype)
            or is_complex_dtype(fill_dtype)
            or is_object_dtype(fill_dtype)
            or is_timedelta64_dtype(fill_dtype)
        ) and not boxed:
            pytest.xfail("does not upcast to object")

    # create array of given dtype; casts "1" to correct dtype
    fill_value = np.array([1], dtype=fill_dtype)[0]

    # filling bytes with anything but bytes casts to object
    want_dtype = dtype if fill_is_bytes else np.dtype(object)
    want_scalar = fill_value
    want_array_marker = None if fill_is_bytes else np.nan

    _check_promote(
        dtype,
        fill_value,
        boxed,
        box_dtype,
        want_dtype,
        want_scalar,
        want_array_marker,
    )
def _cython_operation(self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs) -> ArrayLike:
    """
    Returns the values of a cython operation.

    Parameters
    ----------
    kind : str
        Either "transform" or "aggregate" (asserted).
    values : array-like
        Data to operate on; at most 2 dimensions are supported.
    how : str
        Name of the operation; used to look up arity and the cython function.
    axis : int
        Must be 1 when ``values`` is 2-dimensional (asserted).
    min_count : int, default -1
        Forwarded to ``self._aggregate`` for the "aggregate" kind only.
    **kwargs
        Forwarded to ``self._transform`` (or to the extension-array wrapper).

    Returns
    -------
    ArrayLike
        The operation result, passed through ``maybe_downcast_to_dtype``
        unless ``how`` is in ``base.cython_cast_blocklist``.

    Raises
    ------
    NotImplementedError
        If ``values`` has more than 2 dimensions, or 2-D input is combined
        with an operation whose arity is greater than 1.
    """
    # keep the untouched input around: its dtype drives the final cast
    orig_values = values
    assert kind in ["transform", "aggregate"]

    if values.ndim > 2:
        raise NotImplementedError(
            "number of dimensions is currently limited to 2")
    elif values.ndim == 2:
        # Note: it is *not* the case that axis is always 0 for 1-dim values,
        # as we can have 1D ExtensionArrays that we need to treat as 2D
        assert axis == 1, axis

    dtype = values.dtype
    is_numeric = is_numeric_dtype(dtype)

    # can we do this operation with our cython functions
    # if not raise NotImplementedError
    self._disallow_invalid_ops(dtype, how, is_numeric)

    # extension arrays are handled entirely by a dedicated wrapper
    if is_extension_array_dtype(dtype):
        return self._ea_wrap_cython_operation(kind, values, how, axis, min_count, **kwargs)

    is_datetimelike = needs_i8_conversion(dtype)

    # normalise the values to a representation the cython functions accept
    if is_datetimelike:
        values = values.view("int64")
        is_numeric = True
    elif is_bool_dtype(dtype):
        values = ensure_int_or_float(values)
    elif is_integer_dtype(dtype):
        # we use iNaT for the missing value on ints
        # so pre-convert to guard this condition
        if (values == iNaT).any():
            values = ensure_float64(values)
        else:
            values = ensure_int_or_float(values)
    elif is_numeric and not is_complex_dtype(dtype):
        values = ensure_float64(values)
    else:
        values = values.astype(object)

    # operations not listed in _cython_arity default to arity 1
    arity = self._cython_arity.get(how, 1)

    vdim = values.ndim
    swapped = False
    if vdim == 1:
        # promote to a single column; squeezed back below when arity == 1
        values = values[:, None]
        out_shape = (self.ngroups, arity)
    else:
        if axis > 0:
            swapped = True
            assert axis == 1, axis
            values = values.T
        if arity > 1:
            raise NotImplementedError(
                "arity of more than 1 is not supported for the 'how' argument"
            )
        out_shape = (self.ngroups, ) + values.shape[1:]

    # the lookup may also hand back converted values, hence the rebind
    func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric)

    if how == "rank":
        out_dtype = "float"
    else:
        if is_numeric:
            # same kind and itemsize as the (possibly converted) input
            out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}"
        else:
            out_dtype = "object"

    codes, _, _ = self.group_info

    if kind == "aggregate":
        result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
        counts = np.zeros(self.ngroups, dtype=np.int64)
        result = self._aggregate(result, counts, values, codes, func, min_count)
    elif kind == "transform":
        result = maybe_fill(np.empty(values.shape, dtype=out_dtype))

        # TODO: min_count
        result = self._transform(result, values, codes, func, is_datetimelike, **kwargs)

    # iNaT in an integer (non-datetimelike) result marks a missing value:
    # switch to float64 so those slots can hold NaN
    if is_integer_dtype(result.dtype) and not is_datetimelike:
        mask = result == iNaT
        if mask.any():
            result = result.astype("float64")
            result[mask] = np.nan

    if kind == "aggregate" and self._filter_empty_groups and not counts.all():
        # NOTE(review): result was allocated with a 2-D out_shape above, so
        # this assert looks like it would fail if the branch were reached --
        # confirm whether the branch is expected to be unreachable here.
        assert result.ndim != 2
        result = result[counts > 0]

    # undo the 1-D -> 2-D promotion performed above
    if vdim == 1 and arity == 1:
        result = result[:, 0]

    # undo the transpose performed above
    if swapped:
        result = result.swapaxes(0, axis)

    if how not in base.cython_cast_blocklist:
        # e.g. if we are int64 and need to restore to datetime64/timedelta64
        # "rank" is the only member of cython_cast_blocklist we get here
        dtype = maybe_cast_result_dtype(orig_values.dtype, how)
        op_result = maybe_downcast_to_dtype(result, dtype)
    else:
        op_result = result

    return op_result
def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs):
    """
    Run the cython function for ``how`` over ``values`` and return ``(result, names)``.

    Parameters
    ----------
    kind : str
        Either 'transform' or 'aggregate' (asserted).
    values : array-like
        Data to operate on.
    how : str
        Name of the operation; used to look up arity, the cython function
        and an optional names callable in ``self._name_functions``.
    axis : int
        For non-1-D input, a positive axis is swapped with axis 0 before
        the operation and swapped back afterwards.
    min_count : int, default -1
        Forwarded to ``self._aggregate`` for the 'aggregate' kind only.
    **kwargs
        Forwarded to ``self._transform``.

    Returns
    -------
    tuple
        ``(result, names)`` where ``names`` is
        ``self._name_functions[how]()`` when ``how`` is registered there,
        else ``None``.

    Raises
    ------
    NotImplementedError
        For categorical input, for datetime64/timedelta64 input combined
        with an unsupported ``how``, for non-1-D input combined with an
        arity greater than 1, or when no cython function is available.
    """
    assert kind in ['transform', 'aggregate']

    # can we do this operation with our cython functions
    # if not raise NotImplementedError

    # we raise NotImplemented if this is an invalid operation
    # entirely, e.g. adding datetimes

    # categoricals are only 1d, so we
    # are not setup for dim transforming
    if is_categorical_dtype(values):
        raise NotImplementedError(
            "categoricals are not support in cython ops ATM")
    elif is_datetime64_any_dtype(values):
        if how in ['add', 'prod', 'cumsum', 'cumprod']:
            raise NotImplementedError(
                "datetime64 type does not support {} "
                "operations".format(how))
    elif is_timedelta64_dtype(values):
        if how in ['prod', 'cumprod']:
            raise NotImplementedError(
                "timedelta64 type does not support {} "
                "operations".format(how))

    # operations not listed in _cython_arity default to arity 1
    arity = self._cython_arity.get(how, 1)

    vdim = values.ndim
    swapped = False
    if vdim == 1:
        # promote to a single column; squeezed back below when arity == 1
        values = values[:, None]
        out_shape = (self.ngroups, arity)
    else:
        if axis > 0:
            swapped = True
            values = values.swapaxes(0, axis)
        if arity > 1:
            raise NotImplementedError("arity of more than 1 is not "
                                      "supported for the 'how' argument")
        out_shape = (self.ngroups,) + values.shape[1:]

    is_datetimelike = needs_i8_conversion(values.dtype)
    is_numeric = is_numeric_dtype(values.dtype)

    # normalise the values to a representation the cython functions accept
    if is_datetimelike:
        values = values.view('int64')
        is_numeric = True
    elif is_bool_dtype(values.dtype):
        values = ensure_float64(values)
    elif is_integer_dtype(values):
        # we use iNaT for the missing value on ints
        # so pre-convert to guard this condition
        if (values == iNaT).any():
            values = ensure_float64(values)
        else:
            values = ensure_int64_or_float64(values)
    elif is_numeric and not is_complex_dtype(values):
        values = ensure_float64(values)
    else:
        values = values.astype(object)

    try:
        func = self._get_cython_function(
            kind, how, values, is_numeric)
    except NotImplementedError:
        # for numeric data, retry once after converting via ensure_float64;
        # non-numeric data simply propagates the error
        if is_numeric:
            values = ensure_float64(values)
            func = self._get_cython_function(
                kind, how, values, is_numeric)
        else:
            raise

    if how == 'rank':
        out_dtype = 'float'
    else:
        if is_numeric:
            # same kind and itemsize as the (possibly converted) input
            out_dtype = '{kind}{itemsize}'.format(
                kind=values.dtype.kind, itemsize=values.dtype.itemsize)
        else:
            out_dtype = 'object'

    labels, _, _ = self.group_info

    if kind == 'aggregate':
        result = _maybe_fill(np.empty(out_shape, dtype=out_dtype),
                             fill_value=np.nan)
        counts = np.zeros(self.ngroups, dtype=np.int64)
        result = self._aggregate(
            result, counts, values, labels, func, is_numeric,
            is_datetimelike, min_count)
    elif kind == 'transform':
        result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
                             fill_value=np.nan)

        # TODO: min_count
        result = self._transform(
            result, values, labels, func, is_numeric,
            is_datetimelike, **kwargs)

    # iNaT in an integer (non-datetimelike) result marks a missing value:
    # switch to float64 so those slots can hold NaN
    if is_integer_dtype(result) and not is_datetimelike:
        mask = result == iNaT
        if mask.any():
            result = result.astype('float64')
            result[mask] = np.nan

    # keep only the rows whose count is positive
    if (kind == 'aggregate' and
            self._filter_empty_groups and not counts.all()):
        if result.ndim == 2:
            try:
                result = lib.row_bool_subset(
                    result, (counts > 0).view(np.uint8))
            except ValueError:
                # fall back to the object variant after converting the result
                result = lib.row_bool_subset_object(
                    ensure_object(result),
                    (counts > 0).view(np.uint8))
        else:
            result = result[counts > 0]

    # undo the 1-D -> 2-D promotion performed above
    if vdim == 1 and arity == 1:
        result = result[:, 0]

    if how in self._name_functions:
        # TODO
        names = self._name_functions[how]()
    else:
        names = None

    # undo the axis swap performed above
    if swapped:
        result = result.swapaxes(0, axis)

    return result, names
def array_equivalent(
    left,
    right,
    strict_nan: bool = False,
    dtype_equal: bool = False,
) -> bool:
    """
    Tell whether two arrays hold equal non-NaN elements with NaNs in
    matching positions.

    Both inputs are assumed to be NumPy arrays of the same dtype; the
    behavior (in particular regarding NaNs) is not defined when the dtypes
    differ.

    Parameters
    ----------
    left, right : ndarrays
    strict_nan : bool, default False
        If True, consider NaN and None to be different.
    dtype_equal : bool, default False
        Whether `left` and `right` are known to have the same dtype
        according to `is_dtype_equal`. Some methods, like
        `BlockManager.equals`, require that the dtypes match. Setting this
        to ``True`` can improve performance, but will give different
        results for arrays that are equal but different dtypes.

    Returns
    -------
    b : bool
        Returns True if the arrays are equivalent.

    Examples
    --------
    >>> array_equivalent(
    ...     np.array([1, 2, np.nan]),
    ...     np.array([1, 2, np.nan]))
    True
    >>> array_equivalent(
    ...     np.array([1, np.nan, 2]),
    ...     np.array([1, 2, np.nan]))
    False
    """
    left, right = np.asarray(left), np.asarray(right)

    # shape compat
    if left.shape != right.shape:
        return False

    if dtype_equal:
        # fastpath when we require that the dtypes match (Block.equals)
        if left.dtype.kind in ["f", "c"]:
            return _array_equivalent_float(left, right)
        if is_datetimelike_v_numeric(left.dtype, right.dtype):
            return False
        if needs_i8_conversion(left.dtype):
            return _array_equivalent_datetimelike(left, right)
        if is_string_or_object_np_dtype(left.dtype):
            # TODO: fastpath for pandas' StringDtype
            return _array_equivalent_object(left, right, strict_nan)
        return np.array_equal(left, right)

    # Slow path when we allow comparing different dtypes.

    # Object arrays can contain None, NaN and NaT.
    # string dtypes must be come to this path for NumPy 1.7.1 compat
    if left.dtype.kind in "OSU" or right.dtype.kind in "OSU":
        # Note: `in "OSU"` is non-trivially faster than `in ["O", "S", "U"]`
        # or `in ("O", "S", "U")`
        return _array_equivalent_object(left, right, strict_nan)

    # NaNs can occur in float and complex arrays.
    if is_float_dtype(left.dtype) or is_complex_dtype(left.dtype):
        if not (left.size and right.size):
            return True
        same_value = left == right
        both_missing = isna(left) & isna(right)
        return (same_value | both_missing).all()

    if is_datetimelike_v_numeric(left, right):
        # GH#29553 avoid numpy deprecation warning
        return False

    if needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype):
        # datetime64, timedelta64, Period
        if not is_dtype_equal(left.dtype, right.dtype):
            return False

        left = left.view("i8")
        right = right.view("i8")

    # if we have structured dtypes, compare first
    involves_void = left.dtype.type is np.void or right.dtype.type is np.void
    if involves_void and left.dtype != right.dtype:
        return False

    return np.array_equal(left, right)
def _cython_operation(
    self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs
) -> ArrayLike:
    """
    Returns the values of a cython operation.

    Parameters
    ----------
    kind : str
        Either "transform" or "aggregate" (asserted).
    values : array-like
        Data to operate on; at most 2 dimensions are supported.
    how : str
        Name of the operation, wrapped together with ``kind`` in a
        ``WrappedCythonOp``.
    axis : int
        Must be 1 whenever the 2-D code path is taken (asserted).
    min_count : int, default -1
        Passed to the cython function for the "aggregate" kind only.
    **kwargs
        Passed to the cython function for the "transform" kind (or to the
        extension-array wrapper).

    Returns
    -------
    ArrayLike
        The operation result, passed through ``maybe_downcast_to_dtype``
        unless ``how`` is in ``base.cython_cast_blocklist``.

    Raises
    ------
    NotImplementedError
        If ``values`` has more than 2 dimensions.
    """
    # keep the untouched input around: its dtype drives the final cast
    orig_values = values
    assert kind in ["transform", "aggregate"]

    if values.ndim > 2:
        raise NotImplementedError("number of dimensions is currently limited to 2")
    elif values.ndim == 2:
        # Note: it is *not* the case that axis is always 0 for 1-dim values,
        # as we can have 1D ExtensionArrays that we need to treat as 2D
        assert axis == 1, axis

    dtype = values.dtype
    is_numeric = is_numeric_dtype(dtype)

    cy_op = WrappedCythonOp(kind=kind, how=how)

    # can we do this operation with our cython functions
    # if not raise NotImplementedError
    cy_op.disallow_invalid_ops(dtype, is_numeric)

    # extension arrays are handled entirely by a dedicated wrapper
    if is_extension_array_dtype(dtype):
        return self._ea_wrap_cython_operation(
            kind, values, how, axis, min_count, **kwargs
        )

    elif values.ndim == 1:
        # expand to 2d, dispatch, then squeeze if appropriate
        values2d = values[None, :]
        # re-enter this same method with the single-row 2-D view
        res = self._cython_operation(
            kind=kind,
            values=values2d,
            how=how,
            axis=1,
            min_count=min_count,
            **kwargs,
        )
        if res.shape[0] == 1:
            return res[0]

        # otherwise we have OHLC
        return res.T

    is_datetimelike = needs_i8_conversion(dtype)

    # normalise the values to a representation the cython functions accept
    if is_datetimelike:
        values = values.view("int64")
        is_numeric = True
    elif is_bool_dtype(dtype):
        values = ensure_int_or_float(values)
    elif is_integer_dtype(dtype):
        # we use iNaT for the missing value on ints
        # so pre-convert to guard this condition
        if (values == iNaT).any():
            values = ensure_float64(values)
        else:
            values = ensure_int_or_float(values)
    elif is_numeric:
        # complex values are left as they are
        if not is_complex_dtype(dtype):
            values = ensure_float64(values)

    ngroups = self.ngroups
    comp_ids, _, _ = self.group_info

    # only 2-D input reaches this point; work on the transpose
    assert axis == 1
    values = values.T

    out_shape = cy_op.get_output_shape(ngroups, values)
    # the lookup may also hand back converted values, hence the rebind
    func, values = cy_op.get_cython_func_and_vals(values, is_numeric)
    out_dtype = cy_op.get_out_dtype(values.dtype)

    result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
    # the return value of func is ignored; `result` (and `counts`) are the
    # buffers handed to it and are read afterwards
    if kind == "aggregate":
        counts = np.zeros(ngroups, dtype=np.int64)
        func(result, counts, values, comp_ids, min_count)
    elif kind == "transform":
        # TODO: min_count
        func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)

    # iNaT in an integer (non-datetimelike) result marks a missing value:
    # switch to float64 so those slots can hold NaN
    if is_integer_dtype(result.dtype) and not is_datetimelike:
        mask = result == iNaT
        if mask.any():
            result = result.astype("float64")
            result[mask] = np.nan

    if kind == "aggregate" and self._filter_empty_groups and not counts.all():
        assert result.ndim != 2
        result = result[counts > 0]

    # undo the transpose performed above
    result = result.T

    if how not in base.cython_cast_blocklist:
        # e.g. if we are int64 and need to restore to datetime64/timedelta64
        # "rank" is the only member of cython_cast_blocklist we get here
        dtype = maybe_cast_result_dtype(orig_values.dtype, how)
        op_result = maybe_downcast_to_dtype(result, dtype)
    else:
        op_result = result

    return op_result
def array_equivalent(left, right, strict_nan: bool = False) -> bool:
    """
    Tell whether two arrays hold equal non-NaN elements with NaNs in
    matching positions.

    Both inputs are assumed to be NumPy arrays of the same dtype; the
    behavior (in particular regarding NaNs) is not defined when the dtypes
    differ.

    Parameters
    ----------
    left, right : ndarrays
    strict_nan : bool, default False
        If True, consider NaN and None to be different.

    Returns
    -------
    b : bool
        Returns True if the arrays are equivalent.

    Examples
    --------
    >>> array_equivalent(
    ...     np.array([1, 2, np.nan]),
    ...     np.array([1, 2, np.nan]))
    True
    >>> array_equivalent(
    ...     np.array([1, np.nan, 2]),
    ...     np.array([1, 2, np.nan]))
    False
    """
    left, right = np.asarray(left), np.asarray(right)

    # shape compat
    if left.shape != right.shape:
        return False

    # Object arrays can contain None, NaN and NaT.
    # string dtypes must be come to this path for NumPy 1.7.1 compat
    if is_string_dtype(left) or is_string_dtype(right):

        if not strict_nan:
            # isna considers NaN and None to be equivalent.
            return lib.array_equivalent_object(
                ensure_object(left.ravel()), ensure_object(right.ravel())
            )

        for lval, rval in zip(left, right):
            if lval is NaT and rval is not NaT:
                return False
            if lval is libmissing.NA and rval is not libmissing.NA:
                return False
            if isinstance(lval, float) and np.isnan(lval):
                # a float NaN only matches another float NaN
                if not (isinstance(rval, float) and np.isnan(rval)):
                    return False
                continue
            try:
                if np.any(np.asarray(lval != rval)):
                    return False
            except TypeError as err:
                message = str(err)
                # tzawareness compat failure, see GH#28507
                if (
                    "Cannot compare tz-naive" in message
                    or "boolean value of NA is ambiguous" in message
                ):
                    return False
                raise
        return True

    # NaNs can occur in float and complex arrays.
    if is_float_dtype(left.dtype) or is_complex_dtype(left.dtype):

        # empty
        if not (np.prod(left.shape) and np.prod(right.shape)):
            return True
        same_value = left == right
        both_missing = isna(left) & isna(right)
        return (same_value | both_missing).all()

    if is_datetimelike_v_numeric(left, right):
        # GH#29553 avoid numpy deprecation warning
        return False

    if needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype):
        # datetime64, timedelta64, Period
        if not is_dtype_equal(left.dtype, right.dtype):
            return False

        left = left.view("i8")
        right = right.view("i8")

    # if we have structured dtypes, compare first
    involves_void = left.dtype.type is np.void or right.dtype.type is np.void
    if involves_void and left.dtype != right.dtype:
        return False

    return np.array_equal(left, right)
def _cython_operation(
    self, kind: str, values, how: str, axis, min_count: int = -1, **kwargs
) -> Tuple[np.ndarray, Optional[List[str]]]:
    """
    Returns the values of a cython operation as a Tuple of [data, names].

    Names is only useful when dealing with 2D results, like ohlc
    (see self._name_functions).

    Parameters
    ----------
    kind : str
        Either "transform" or "aggregate" (asserted).
    values : array-like
        Data to operate on; at most 2 dimensions are supported.
    how : str
        Name of the operation; used to look up arity, the cython function
        and the optional names entry.
    axis : int
        Must be 1 when ``values`` is 2-dimensional (asserted).
    min_count : int, default -1
        Forwarded to ``self._aggregate`` for the "aggregate" kind only.
    **kwargs
        Forwarded to ``self._transform``.

    Raises
    ------
    NotImplementedError
        If ``values`` has more than 2 dimensions, is categorical or
        sparse, is datetime64/timedelta64 combined with an unsupported
        ``how``, or is 2-D combined with an arity greater than 1.
    """
    assert kind in ["transform", "aggregate"]
    # keep the untouched input around: its dtype drives the final casts
    orig_values = values

    if values.ndim > 2:
        raise NotImplementedError("number of dimensions is currently limited to 2")
    elif values.ndim == 2:
        # Note: it is *not* the case that axis is always 0 for 1-dim values,
        # as we can have 1D ExtensionArrays that we need to treat as 2D
        assert axis == 1, axis

    # can we do this operation with our cython functions
    # if not raise NotImplementedError

    # we raise NotImplemented if this is an invalid operation
    # entirely, e.g. adding datetimes

    # categoricals are only 1d, so we
    # are not setup for dim transforming
    if is_categorical_dtype(values) or is_sparse(values):
        raise NotImplementedError(f"{values.dtype} dtype not supported")
    elif is_datetime64_any_dtype(values):
        if how in ["add", "prod", "cumsum", "cumprod"]:
            raise NotImplementedError(
                f"datetime64 type does not support {how} operations"
            )
    elif is_timedelta64_dtype(values):
        if how in ["prod", "cumprod"]:
            raise NotImplementedError(
                f"timedelta64 type does not support {how} operations"
            )

    if is_datetime64tz_dtype(values.dtype):
        # Cast to naive; we'll cast back at the end of the function
        # TODO: possible need to reshape?
        # kludge can be avoided when 2D EA is allowed.
        values = values.view("M8[ns]")

    is_datetimelike = needs_i8_conversion(values.dtype)
    is_numeric = is_numeric_dtype(values.dtype)

    # normalise the values to a representation the cython functions accept
    if is_datetimelike:
        values = values.view("int64")
        is_numeric = True
    elif is_bool_dtype(values.dtype):
        values = ensure_float64(values)
    elif is_integer_dtype(values):
        # we use iNaT for the missing value on ints
        # so pre-convert to guard this condition
        if (values == iNaT).any():
            values = ensure_float64(values)
        else:
            values = ensure_int_or_float(values)
    elif is_numeric and not is_complex_dtype(values):
        values = ensure_float64(values)
    else:
        values = values.astype(object)

    # operations not listed in _cython_arity default to arity 1
    arity = self._cython_arity.get(how, 1)

    vdim = values.ndim
    swapped = False
    if vdim == 1:
        # promote to a single column; squeezed back below when arity == 1
        values = values[:, None]
        out_shape = (self.ngroups, arity)
    else:
        if axis > 0:
            swapped = True
            assert axis == 1, axis
            values = values.T
        if arity > 1:
            raise NotImplementedError(
                "arity of more than 1 is not supported for the 'how' argument"
            )
        out_shape = (self.ngroups,) + values.shape[1:]

    # the lookup may also hand back converted values, hence the rebind
    func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric)

    if how == "rank":
        out_dtype = "float"
    else:
        if is_numeric:
            # same kind and itemsize as the (possibly converted) input
            out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}"
        else:
            out_dtype = "object"

    codes, _, _ = self.group_info

    if kind == "aggregate":
        result = _maybe_fill(
            np.empty(out_shape, dtype=out_dtype), fill_value=np.nan
        )
        counts = np.zeros(self.ngroups, dtype=np.int64)
        result = self._aggregate(result, counts, values, codes, func, min_count)
    elif kind == "transform":
        result = _maybe_fill(
            np.empty_like(values, dtype=out_dtype), fill_value=np.nan
        )

        # TODO: min_count
        result = self._transform(
            result, values, codes, func, is_datetimelike, **kwargs
        )

    if is_integer_dtype(result) and not is_datetimelike:
        # iNaT in an integer result marks a missing value: switch to
        # float64 so those slots can hold NaN
        mask = result == iNaT
        if mask.any():
            result = result.astype("float64")
            result[mask] = np.nan
    elif (
        how == "add"
        and is_integer_dtype(orig_values.dtype)
        and is_extension_array_dtype(orig_values.dtype)
    ):
        # We need this to ensure that Series[Int64Dtype].resample().sum()
        # remains int64 dtype.
        # Two options for avoiding this special case
        # 1. mask-aware ops and avoid casting to float with NaN above
        # 2. specify the result dtype when calling this method
        result = result.astype("int64")

    if kind == "aggregate" and self._filter_empty_groups and not counts.all():
        assert result.ndim != 2
        result = result[counts > 0]

    # undo the 1-D -> 2-D promotion performed above
    if vdim == 1 and arity == 1:
        result = result[:, 0]

    # None when `how` has no entry in _name_functions
    names: Optional[List[str]] = self._name_functions.get(how, None)

    # undo the transpose performed above
    if swapped:
        result = result.swapaxes(0, axis)

    if is_datetime64tz_dtype(orig_values.dtype) or is_period_dtype(
        orig_values.dtype
    ):
        # We need to use the constructors directly for these dtypes
        # since numpy won't recognize them
        # https://github.com/pandas-dev/pandas/issues/31471
        result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype)
    elif is_datetimelike and kind == "aggregate":
        result = result.astype(orig_values.dtype)

    return result, names
def maybe_promote(dtype, fill_value=np.nan):
    """
    Find the minimal dtype that can hold both the given dtype and fill_value.

    Parameters
    ----------
    dtype : np.dtype or ExtensionDtype
    fill_value : scalar, default np.nan

    Returns
    -------
    dtype
        Upcasted from dtype argument if necessary.
    fill_value
        Upcasted from fill_value argument if necessary.

    Raises
    ------
    ValueError
        If fill_value is not a scalar and dtype is not object dtype.
    """
    if not is_scalar(fill_value) and not is_object_dtype(dtype):
        # with object dtype there is nothing to promote, and the user can
        # pass pretty much any weird fill_value they like
        raise ValueError("fill_value must be a scalar")

    # if we passed an array here, determine the fill value by dtype
    if isinstance(fill_value, np.ndarray):
        if issubclass(fill_value.dtype.type, (np.datetime64, np.timedelta64)):
            fill_value = fill_value.dtype.type("NaT", "ns")
        else:
            # we need to change to object type as our
            # fill_value is of object type
            if fill_value.dtype == np.object_:
                dtype = np.dtype(np.object_)
            fill_value = np.nan

    if dtype == np.object_ or dtype.kind in ["U", "S"]:
        # We treat string-like dtypes as object, and _always_ fill
        # with np.nan
        fill_value = np.nan
        dtype = np.dtype(np.object_)

    # From here on the first matching branch decides the promotion; the
    # dtype-specific branches come before the fill_value-specific ones.
    if issubclass(dtype.type, np.datetime64):
        if isinstance(fill_value, datetime) and fill_value.tzinfo is not None:
            # Trying to insert tzaware into tznaive, have to cast to object
            dtype = np.dtype(np.object_)
        elif is_integer(fill_value) or (is_float(fill_value) and not isna(fill_value)):
            # non-missing numbers are not treated as timestamps
            dtype = np.dtype(np.object_)
        else:
            try:
                fill_value = tslibs.Timestamp(fill_value).to_datetime64()
            except (TypeError, ValueError):
                dtype = np.dtype(np.object_)
    elif issubclass(dtype.type, np.timedelta64):
        if (
            is_integer(fill_value)
            or (is_float(fill_value) and not np.isnan(fill_value))
            or isinstance(fill_value, str)
        ):
            # TODO: What about str that can be a timedelta?
            dtype = np.dtype(np.object_)
        else:
            try:
                fv = tslibs.Timedelta(fill_value)
            except ValueError:
                dtype = np.dtype(np.object_)
            else:
                if fv is NaT:
                    # NaT has no `to_timedelta64` method
                    fill_value = np.timedelta64("NaT", "ns")
                else:
                    fill_value = fv.to_timedelta64()
    elif is_datetime64tz_dtype(dtype):
        if isna(fill_value):
            fill_value = NaT
        elif not isinstance(fill_value, datetime):
            dtype = np.dtype(np.object_)
        elif fill_value.tzinfo is None:
            dtype = np.dtype(np.object_)
        elif not tz_compare(fill_value.tzinfo, dtype.tz):
            # TODO: sure we want to cast here?
            dtype = np.dtype(np.object_)
    elif is_extension_array_dtype(dtype) and isna(fill_value):
        fill_value = dtype.na_value
    elif is_float(fill_value):
        if issubclass(dtype.type, np.bool_):
            dtype = np.dtype(np.object_)
        elif issubclass(dtype.type, np.integer):
            dtype = np.dtype(np.float64)
        elif dtype.kind == "f":
            mst = np.min_scalar_type(fill_value)
            if mst > dtype:
                # e.g. mst is np.float64 and dtype is np.float32
                dtype = mst
        elif dtype.kind == "c":
            mst = np.min_scalar_type(fill_value)
            dtype = np.promote_types(dtype, mst)
    elif is_bool(fill_value):
        if not issubclass(dtype.type, np.bool_):
            dtype = np.dtype(np.object_)
    elif is_integer(fill_value):
        if issubclass(dtype.type, np.bool_):
            dtype = np.dtype(np.object_)
        elif issubclass(dtype.type, np.integer):
            if not np.can_cast(fill_value, dtype):
                # upcast to prevent overflow
                mst = np.min_scalar_type(fill_value)
                dtype = np.promote_types(dtype, mst)
                if dtype.kind == "f":
                    # Case where we disagree with numpy
                    dtype = np.dtype(np.object_)
    elif is_complex(fill_value):
        if issubclass(dtype.type, np.bool_):
            dtype = np.dtype(np.object_)
        elif issubclass(dtype.type, (np.integer, np.floating)):
            mst = np.min_scalar_type(fill_value)
            dtype = np.promote_types(dtype, mst)
        elif dtype.kind == "c":
            mst = np.min_scalar_type(fill_value)
            if mst > dtype:
                # e.g. mst is np.complex128 and dtype is np.complex64
                dtype = mst
    elif fill_value is None:
        if is_float_dtype(dtype) or is_complex_dtype(dtype):
            fill_value = np.nan
        elif is_integer_dtype(dtype):
            # Wrap in np.dtype so this branch hands back a dtype instance
            # like every other branch, not the bare scalar type; the
            # result still compares equal to np.float64.
            dtype = np.dtype(np.float64)
            fill_value = np.nan
        elif is_datetime_or_timedelta_dtype(dtype):
            fill_value = dtype.type("NaT", "ns")
        else:
            dtype = np.dtype(np.object_)
            fill_value = np.nan
    else:
        dtype = np.dtype(np.object_)

    # in case we have a string that looked like a number
    if is_extension_array_dtype(dtype):
        pass
    elif issubclass(np.dtype(dtype).type, (bytes, str)):
        dtype = np.dtype(np.object_)

    fill_value = _ensure_dtype_type(fill_value, dtype)
    return dtype, fill_value
def _cython_operation(
    self,
    kind: str,
    values,
    how: str,
    axis: int,
    min_count: int = -1,
    **kwargs,
) -> Tuple[np.ndarray, Optional[List[str]]]:
    """
    Run the cython kernel for ``how`` and return ``(data, names)``.

    ``names`` only carries information for operations with 2D results,
    like ohlc (it is looked up in ``self._name_functions``).
    """
    assert kind in ["transform", "aggregate"]
    input_values = values

    if values.ndim > 2:
        raise NotImplementedError("number of dimensions is currently limited to 2")
    if values.ndim == 2:
        # Note: axis is *not* always 0 for 1-dim values, since 1D
        # ExtensionArrays may need to be treated as 2D
        assert axis == 1, axis

    # raises NotImplementedError when the cython functions cannot do this
    self._disallow_invalid_ops(values, how)

    if is_extension_array_dtype(values.dtype):
        return self._ea_wrap_cython_operation(
            kind, values, how, axis, min_count, **kwargs
        )

    is_datetimelike = needs_i8_conversion(values.dtype)
    is_numeric = is_numeric_dtype(values.dtype)

    # coerce the input to something the kernels accept
    if is_datetimelike:
        values = values.view("int64")
        is_numeric = True
    elif is_bool_dtype(values.dtype):
        values = ensure_int_or_float(values)
    elif is_integer_dtype(values):
        # iNaT marks missing values on ints, so pre-convert to float when
        # the data already contains that sentinel
        contains_sentinel = (values == iNaT).any()
        values = (
            ensure_float64(values)
            if contains_sentinel
            else ensure_int_or_float(values)
        )
    elif is_numeric and not is_complex_dtype(values):
        values = ensure_float64(ensure_float(values))
    else:
        values = values.astype(object)

    n_outputs = self._cython_arity.get(how, 1)
    input_ndim = values.ndim
    transposed = input_ndim != 1 and axis > 0

    if input_ndim == 1:
        values = values[:, None]
        out_shape = (self.ngroups, n_outputs)
    else:
        if transposed:
            assert axis == 1, axis
            values = values.T
        if n_outputs > 1:
            raise NotImplementedError(
                "arity of more than 1 is not supported for the 'how' argument"
            )
        out_shape = (self.ngroups,) + values.shape[1:]

    func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric)

    if how == "rank":
        out_dtype = "float"
    elif is_numeric:
        out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}"
    else:
        out_dtype = "object"

    group_codes, _, _ = self.group_info

    if kind == "aggregate":
        buffer = np.empty(out_shape, dtype=out_dtype)
        result = maybe_fill(buffer, fill_value=np.nan)
        counts = np.zeros(self.ngroups, dtype=np.int64)
        result = self._aggregate(
            result, counts, values, group_codes, func, min_count
        )
    elif kind == "transform":
        buffer = np.empty_like(values, dtype=out_dtype)
        result = maybe_fill(buffer, fill_value=np.nan)

        # TODO: min_count
        result = self._transform(
            result, values, group_codes, func, is_datetimelike, **kwargs
        )

    if is_integer_dtype(result) and not is_datetimelike:
        # integer results that contain the iNaT sentinel become float/NaN
        sentinel_hits = result == iNaT
        if sentinel_hits.any():
            result = result.astype("float64")
            result[sentinel_hits] = np.nan

    if kind == "aggregate" and self._filter_empty_groups and not counts.all():
        assert result.ndim != 2
        result = result[counts > 0]

    if input_ndim == 1 and n_outputs == 1:
        # undo the column axis added for 1-dim input
        result = result[:, 0]

    names: Optional[List[str]] = self._name_functions.get(how, None)

    if transposed:
        result = result.swapaxes(0, axis)

    if is_datetimelike and kind == "aggregate":
        # restore the dtype of the values that were passed in
        result = result.astype(input_values.dtype)

    return result, names
def _cython_operation(self, kind: str, values, how, axis, min_count=-1, **kwargs):
    """
    Dispatch the "aggregate" or "transform" operation ``how`` to a cython
    function and return ``(result, names)``.

    ``names`` comes from calling ``self._name_functions[how]`` when ``how``
    is registered there, and is None otherwise.

    Raises
    ------
    NotImplementedError
        For categorical or sparse values, for datetime64 values with
        add/prod/cumsum/cumprod, for timedelta64 values with prod/cumprod,
        for non-1-dim values when the arity of ``how`` exceeds 1, or when
        no cython function is found (after a float64 retry for numeric
        values).
    """
    assert kind in ["transform", "aggregate"]
    orig_values = values

    # can we do this operation with our cython functions
    # if not raise NotImplementedError

    # we raise NotImplemented if this is an invalid operation
    # entirely, e.g. adding datetimes

    # categoricals are only 1d, so we
    # are not setup for dim transforming
    if is_categorical_dtype(values) or is_sparse(values):
        raise NotImplementedError(
            "{dtype} dtype not supported".format(dtype=values.dtype))
    elif is_datetime64_any_dtype(values):
        if how in ["add", "prod", "cumsum", "cumprod"]:
            raise NotImplementedError(
                "datetime64 type does not support {how} operations".format(
                    how=how))
    elif is_timedelta64_dtype(values):
        if how in ["prod", "cumprod"]:
            raise NotImplementedError(
                "timedelta64 type does not support {how} operations".format(
                    how=how))

    if is_datetime64tz_dtype(values.dtype):
        # Cast to naive; we'll cast back at the end of the function
        # TODO: possible need to reshape? kludge can be avoided when
        # 2D EA is allowed.
        values = values.view("M8[ns]")

    is_datetimelike = needs_i8_conversion(values.dtype)
    is_numeric = is_numeric_dtype(values.dtype)

    # Coerce the values; the first matching branch wins, so bool is handled
    # before the integer and generic numeric branches.
    if is_datetimelike:
        values = values.view("int64")
        is_numeric = True
    elif is_bool_dtype(values.dtype):
        values = ensure_float64(values)
    elif is_integer_dtype(values):
        # we use iNaT for the missing value on ints
        # so pre-convert to guard this condition
        if (values == iNaT).any():
            values = ensure_float64(values)
        else:
            values = ensure_int_or_float(values)
    elif is_numeric and not is_complex_dtype(values):
        values = ensure_float64(values)
    else:
        values = values.astype(object)

    # number of output columns for this operation; 1 unless registered
    arity = self._cython_arity.get(how, 1)

    vdim = values.ndim
    swapped = False
    if vdim == 1:
        # add a trailing axis so the rest of the function sees 2-dim data
        values = values[:, None]
        out_shape = (self.ngroups, arity)
    else:
        if axis > 0:
            # remember the transpose so it can be undone on the result
            swapped = True
            assert axis == 1, axis
            values = values.T
        if arity > 1:
            raise NotImplementedError(
                "arity of more than 1 is not supported for the 'how' argument"
            )
        out_shape = (self.ngroups, ) + values.shape[1:]

    try:
        func = self._get_cython_function(kind, how, values, is_numeric)
    except NotImplementedError:
        # Retry once with float64 (or complex) values, numeric data only.
        if is_numeric:
            try:
                values = ensure_float64(values)
            except TypeError:
                if lib.infer_dtype(values, skipna=False) == "complex":
                    values = values.astype(complex)
                else:
                    # re-raises the TypeError from ensure_float64
                    raise
            func = self._get_cython_function(kind, how, values, is_numeric)
        else:
            # re-raises the original NotImplementedError
            raise

    if how == "rank":
        out_dtype = "float"
    else:
        if is_numeric:
            # same kind and item size as the (possibly coerced) values
            out_dtype = "{kind}{itemsize}".format(
                kind=values.dtype.kind, itemsize=values.dtype.itemsize)
        else:
            out_dtype = "object"

    labels, _, _ = self.group_info

    if kind == "aggregate":
        result = _maybe_fill(np.empty(out_shape, dtype=out_dtype),
                             fill_value=np.nan)
        counts = np.zeros(self.ngroups, dtype=np.int64)
        result = self._aggregate(result, counts, values, labels, func,
                                 is_datetimelike, min_count)
    elif kind == "transform":
        result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
                             fill_value=np.nan)

        # TODO: min_count
        result = self._transform(result, values, labels, func,
                                 is_datetimelike, **kwargs)

    if is_integer_dtype(result) and not is_datetimelike:
        # integer results holding the iNaT sentinel become float with NaN
        mask = result == iNaT
        if mask.any():
            result = result.astype("float64")
            result[mask] = np.nan

    # `counts` only exists for "aggregate"; the kind check comes first so it
    # is never read for "transform".
    if kind == "aggregate" and self._filter_empty_groups and not counts.all(
    ):
        assert result.ndim != 2
        result = result[counts > 0]

    if vdim == 1 and arity == 1:
        # drop the axis added above for 1-dim input
        result = result[:, 0]

    if how in self._name_functions:
        names = self._name_functions[how]()  # type: Optional[List[str]]
    else:
        names = None

    if swapped:
        result = result.swapaxes(0, axis)

    if is_datetime64tz_dtype(orig_values.dtype):
        # rebuild the original array type from the int64 result
        result = type(orig_values)(result.astype(np.int64),
                                   dtype=orig_values.dtype)
    elif is_datetimelike and kind == "aggregate":
        result = result.astype(orig_values.dtype)

    return result, names
def _cython_operation(
    self,
    kind: str,
    values,
    how: str,
    axis: int,
    min_count: int = -1,
    mask: np.ndarray | None = None,
    **kwargs,
) -> ArrayLike:
    """
    Compute the cython operation ``how`` of the given ``kind`` on ``values``
    and return the resulting values.
    """
    assert kind in ["transform", "aggregate"]
    input_values = values

    if values.ndim > 2:
        raise NotImplementedError("number of dimensions is currently limited to 2")
    if values.ndim == 2:
        # Note: axis is *not* always 0 for 1-dim values, since 1D
        # ExtensionArrays may need to be treated as 2D
        assert axis == 1, axis

    dtype = values.dtype
    is_numeric = is_numeric_dtype(dtype)

    cy_op = WrappedCythonOp(kind=kind, how=how)

    # raises NotImplementedError when the cython functions cannot do this
    cy_op.disallow_invalid_ops(dtype, is_numeric)

    func_uses_mask = cy_op.uses_mask()

    if is_extension_array_dtype(dtype):
        # masked arrays get their own wrapper when the function uses a mask
        ea_wrapper = (
            self._masked_ea_wrap_cython_operation
            if isinstance(values, BaseMaskedArray) and func_uses_mask
            else self._ea_wrap_cython_operation
        )
        return ea_wrapper(kind, values, how, axis, min_count, **kwargs)

    if values.ndim == 1:
        # expand to 2d, dispatch, then squeeze if appropriate
        res = self._cython_operation(
            kind=kind,
            values=values[None, :],
            how=how,
            axis=1,
            min_count=min_count,
            mask=mask,
            **kwargs,
        )
        # more than one row means we have OHLC
        return res[0] if res.shape[0] == 1 else res.T

    is_datetimelike = needs_i8_conversion(dtype)

    if is_datetimelike:
        values = values.view("int64")
        is_numeric = True
    elif is_bool_dtype(dtype):
        values = values.astype("int64")
    elif is_integer_dtype(dtype):
        # e.g. uint8 -> uint64, int16 -> int64
        widened = dtype.kind + "8"
        values = values.astype(widened, copy=False)
    elif is_numeric and not is_complex_dtype(dtype):
        values = ensure_float64(values)

    ngroups = self.ngroups
    group_ids, _, _ = self.group_info

    assert axis == 1
    values = values.T
    if mask is not None:
        mask = mask.reshape(values.shape, order="C")

    out_shape = cy_op.get_output_shape(ngroups, values)
    func, values = cy_op.get_cython_func_and_vals(values, is_numeric)
    out_dtype = cy_op.get_out_dtype(values.dtype)

    result = maybe_fill(np.empty(out_shape, dtype=out_dtype))

    if kind == "aggregate":
        counts = np.zeros(ngroups, dtype=np.int64)
        if how in ("min", "max"):
            func(
                result,
                counts,
                values,
                group_ids,
                min_count,
                is_datetimelike=is_datetimelike,
            )
        else:
            func(result, counts, values, group_ids, min_count)

        # Locations where count<min_count need to have the result set to
        # np.nan, which may require casting, see GH#40767
        if is_integer_dtype(result.dtype) and not is_datetimelike:
            below_min = counts < max(1, min_count)
            if below_min.any():
                # Note: this conversion could be lossy, see GH#40767
                result = result.astype("float64")
                result[below_min] = np.nan

        if self._filter_empty_groups and not counts.all():
            assert result.ndim != 2
            result = result[counts > 0]
    elif kind == "transform":
        # TODO: min_count
        if func_uses_mask:
            func(
                result,
                values,
                group_ids,
                ngroups,
                is_datetimelike,
                mask=mask,
                **kwargs,
            )
        else:
            func(result, values, group_ids, ngroups, is_datetimelike, **kwargs)

    result = result.T

    if how in cy_op.cast_blocklist:
        # "rank" is the only member of cast_blocklist we get here
        return result

    # e.g. if we are int64 and need to restore to datetime64/timedelta64
    res_dtype = maybe_cast_result_dtype(input_values.dtype, how)
    return maybe_downcast_to_dtype(result, res_dtype)
def test_maybe_promote_any_numpy_dtype_with_na(any_numpy_dtype_reduced,
                                               fill_value, box):
    """
    Check the dtype and missing-value marker expected when a missing
    fill_value is promoted into each numpy dtype, scalar and boxed.
    """
    dtype = np.dtype(any_numpy_dtype_reduced)
    boxed, box_dtype = box  # read from parametrized fixture

    # Known failures. pytest.xfail raises, so the first guard that fires
    # ends the test; the guards are checked in a fixed order.
    if (dtype == bytes and not boxed
            and fill_value is not None and fill_value is not NaT):
        pytest.xfail('does not upcast to object')
    if dtype == 'uint64' and not boxed and fill_value == iNaT:
        pytest.xfail('does not upcast correctly')
    if ((is_datetime_or_timedelta_dtype(dtype) and boxed)
            or (boxed
                and (is_integer_dtype(dtype) or is_float_dtype(dtype)
                     or is_complex_dtype(dtype))
                and fill_value is not NaT and dtype != 'uint64')
            or (boxed and dtype == 'uint64'
                and (fill_value is np.nan or fill_value is None))):
        pytest.xfail('falsely upcasts to object')
    # below: opinionated that iNaT should be interpreted as missing value
    if ((not boxed
         and (is_float_dtype(dtype) or is_complex_dtype(dtype))
         and fill_value == iNaT)
            or ((is_string_dtype(dtype) or dtype == bool)
                and not boxed and fill_value == iNaT)):
        pytest.xfail('does not cast to missing value marker correctly')

    # Expected dtype and scalar marker; the first matching rule applies.
    if is_integer_dtype(dtype) and dtype == 'uint64' and fill_value == iNaT:
        # uint64 + negative int casts to object; iNaT is considered as missing
        want_dtype, want_scalar = np.dtype(object), np.nan
    elif is_integer_dtype(dtype) and fill_value == iNaT:
        # other integer + iNaT casts to int64
        want_dtype, want_scalar = np.int64, iNaT
    elif is_integer_dtype(dtype) and fill_value is not NaT:
        # integer + other missing value (np.nan / None) casts to float
        want_dtype, want_scalar = np.float64, np.nan
    elif is_object_dtype(dtype) and (fill_value == iNaT or fill_value is NaT):
        # inserting into object does not cast the value
        # but *does* cast None to np.nan
        want_dtype, want_scalar = np.dtype(object), fill_value
    elif is_datetime_or_timedelta_dtype(dtype):
        # datetime / timedelta cast all missing values to iNaT
        want_dtype, want_scalar = dtype, iNaT
    elif fill_value is NaT:
        # NaT upcasts everything that's not datetime/timedelta to object
        want_dtype, want_scalar = np.dtype(object), NaT
    elif is_float_dtype(dtype) or is_complex_dtype(dtype):
        # float / complex + missing value (!= NaT) stays the same
        want_dtype, want_scalar = dtype, np.nan
    else:
        # all other cases cast to object, and use np.nan as missing value
        want_dtype, want_scalar = np.dtype(object), np.nan

    # array case has same expected dtype; but returns corresponding na-marker
    if is_integer_dtype(want_dtype):
        # integers cannot hold NaNs; maybe_promote_with_array returns None
        want_array = None
    elif is_datetime_or_timedelta_dtype(want_dtype):
        want_array = iNaT
    else:
        # want_dtype = float / complex / object
        want_array = np.nan

    _check_promote(dtype, fill_value, boxed, box_dtype, want_dtype,
                   want_scalar, want_array)
def _call_cython_op(
    self,
    values: np.ndarray,  # np.ndarray[ndim=2]
    *,
    min_count: int,
    ngroups: int,
    comp_ids: np.ndarray,
    mask: np.ndarray | None,
    result_mask: np.ndarray | None,
    **kwargs,
) -> np.ndarray:  # np.ndarray[ndim=2]
    """
    Coerce ``values``, call the cython function selected for
    ``self.kind`` / ``self.how`` and return the post-processed result.
    """
    input_values = values
    dtype = values.dtype

    is_numeric = is_numeric_dtype(dtype)
    is_datetimelike = needs_i8_conversion(dtype)

    if is_datetimelike:
        values = values.view("int64")
        is_numeric = True
    elif is_bool_dtype(dtype):
        values = values.astype("int64")
    elif is_integer_dtype(dtype):
        # GH#43329 If the dtype is explicitly of type uint64 the type is not
        # changed to prevent overflow.
        if dtype != np.uint64:
            values = values.astype(np.int64, copy=False)
    elif is_numeric and not is_complex_dtype(dtype):
        values = ensure_float64(values)

    values = values.T
    if mask is not None:
        mask = mask.T
        # the result mask is only transposed when a mask was passed
        if result_mask is not None:
            result_mask = result_mask.T

    out_shape = self._get_output_shape(ngroups, values)
    func, values = self.get_cython_func_and_vals(values, is_numeric)
    out_dtype = self.get_out_dtype(values.dtype)

    result = maybe_fill(np.empty(out_shape, dtype=out_dtype))

    if self.kind == "aggregate":
        counts = np.zeros(ngroups, dtype=np.int64)
        if self.how in ("min", "max", "mean"):
            func(
                result,
                counts,
                values,
                comp_ids,
                min_count,
                mask=mask,
                result_mask=result_mask,
                is_datetimelike=is_datetimelike,
            )
        elif self.how == "add":
            # We support datetimelike
            func(
                result,
                counts,
                values,
                comp_ids,
                min_count,
                datetimelike=is_datetimelike,
            )
        else:
            func(result, counts, values, comp_ids, min_count)

        # Locations where count<min_count need to have the result set to
        # np.nan, which may require casting, see GH#40767
        if is_integer_dtype(result.dtype) and not is_datetimelike:
            below_min = counts < max(1, min_count)
            if below_min.any():
                # Note: this conversion could be lossy, see GH#40767
                result = result.astype("float64")
                result[below_min] = np.nan
    else:
        # TODO: min_count
        if self.uses_mask():
            func(
                result,
                values,
                comp_ids,
                ngroups,
                is_datetimelike,
                mask=mask,
                **kwargs,
            )
        else:
            func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)

    result = result.T

    if self.how in self.cast_blocklist:
        # "rank" is the only member of cast_blocklist we get here
        return result

    # e.g. if we are int64 and need to restore to datetime64/timedelta64
    res_dtype = self._get_result_dtype(input_values.dtype)
    # error: Incompatible return value type (got "Union[ExtensionArray, ndarray]",
    # expected "ndarray")
    return maybe_downcast_to_dtype(  # type: ignore[return-value]
        result, res_dtype
    )
def _call_cython_op(
    self,
    values: np.ndarray,  # np.ndarray[ndim=2]
    *,
    min_count: int,
    ngroups: int,
    comp_ids: np.ndarray,
    mask: np.ndarray | None,
    **kwargs,
) -> np.ndarray:  # np.ndarray[ndim=2]
    """
    Coerce ``values``, call the cython function selected for
    ``self.kind`` / ``self.how`` and return the post-processed result.
    """
    input_values = values
    dtype = values.dtype

    is_numeric = is_numeric_dtype(dtype)
    is_datetimelike = needs_i8_conversion(dtype)

    if is_datetimelike:
        values = values.view("int64")
        is_numeric = True
    elif is_bool_dtype(dtype):
        values = values.astype("int64")
    elif is_integer_dtype(dtype):
        # e.g. uint8 -> uint64, int16 -> int64
        widened = dtype.kind + "8"
        values = values.astype(widened, copy=False)
    elif is_numeric and not is_complex_dtype(dtype):
        values = ensure_float64(values)

    values = values.T
    if mask is not None:
        # give the mask the shape of the transposed values
        mask = mask.reshape(values.shape, order="C")

    out_shape = self.get_output_shape(ngroups, values)
    func, values = self.get_cython_func_and_vals(values, is_numeric)
    out_dtype = self.get_out_dtype(values.dtype)

    result = maybe_fill(np.empty(out_shape, dtype=out_dtype))

    if self.kind == "aggregate":
        counts = np.zeros(ngroups, dtype=np.int64)
        if self.how in ("min", "max"):
            func(
                result,
                counts,
                values,
                comp_ids,
                min_count,
                is_datetimelike=is_datetimelike,
            )
        else:
            func(result, counts, values, comp_ids, min_count)

        # Locations where count<min_count need to have the result set to
        # np.nan, which may require casting, see GH#40767
        if is_integer_dtype(result.dtype) and not is_datetimelike:
            below_min = counts < max(1, min_count)
            if below_min.any():
                # Note: this conversion could be lossy, see GH#40767
                result = result.astype("float64")
                result[below_min] = np.nan
    else:
        # TODO: min_count
        if self.uses_mask():
            func(
                result,
                values,
                comp_ids,
                ngroups,
                is_datetimelike,
                mask=mask,
                **kwargs,
            )
        else:
            func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)

    result = result.T

    if self.how in self.cast_blocklist:
        # "rank" is the only member of cast_blocklist we get here
        return result

    # e.g. if we are int64 and need to restore to datetime64/timedelta64
    res_dtype = self.get_result_dtype(input_values.dtype)
    # error: Argument 2 to "maybe_downcast_to_dtype" has incompatible type
    # "Union[dtype[Any], ExtensionDtype]"; expected "Union[str, dtype[Any]]"
    # error: Incompatible return value type (got "Union[ExtensionArray, ndarray]",
    # expected "ndarray")
    return maybe_downcast_to_dtype(  # type: ignore[return-value]
        result, res_dtype  # type: ignore[arg-type]
    )