Ejemplo n.º 1
0
    def _call_cython_op(
        self,
        values: np.ndarray,  # np.ndarray[ndim=2]
        *,
        min_count: int,
        ngroups: int,
        comp_ids: np.ndarray,
        mask: np.ndarray | None,
        result_mask: np.ndarray | None,
        **kwargs,
    ) -> np.ndarray:  # np.ndarray[ndim=2]
        orig_values = values

        dtype = values.dtype
        is_numeric = is_numeric_dtype(dtype)

        is_datetimelike = needs_i8_conversion(dtype)

        if is_datetimelike:
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(dtype):
            values = values.astype("int64")
        elif is_integer_dtype(dtype):
            # GH#43329 If the dtype is explicitly of type uint64 the type is not
            # changed to prevent overflow.
            if dtype != np.uint64:
                values = values.astype(np.int64, copy=False)
        elif is_numeric:
            if not is_complex_dtype(dtype):
                values = ensure_float64(values)

        values = values.T
        if mask is not None:
            mask = mask.T
            if result_mask is not None:
                result_mask = result_mask.T

        out_shape = self._get_output_shape(ngroups, values)
        func, values = self.get_cython_func_and_vals(values, is_numeric)
        out_dtype = self.get_out_dtype(values.dtype)

        result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
        if self.kind == "aggregate":
            counts = np.zeros(ngroups, dtype=np.int64)
            if self.how in ["min", "max", "mean"]:
                func(
                    result,
                    counts,
                    values,
                    comp_ids,
                    min_count,
                    mask=mask,
                    result_mask=result_mask,
                    is_datetimelike=is_datetimelike,
                )
            elif self.how in ["add"]:
                # We support datetimelike
                func(
                    result,
                    counts,
                    values,
                    comp_ids,
                    min_count,
                    datetimelike=is_datetimelike,
                )
            else:
                func(result, counts, values, comp_ids, min_count)
        else:
            # TODO: min_count
            if self.uses_mask():
                func(
                    result,
                    values,
                    comp_ids,
                    ngroups,
                    is_datetimelike,
                    mask=mask,
                    **kwargs,
                )
            else:
                func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)

        if self.kind == "aggregate":
            # i.e. counts is defined.  Locations where count<min_count
            # need to have the result set to np.nan, which may require casting,
            # see GH#40767
            if is_integer_dtype(result.dtype) and not is_datetimelike:
                cutoff = max(1, min_count)
                empty_groups = counts < cutoff
                if empty_groups.any():
                    # Note: this conversion could be lossy, see GH#40767
                    result = result.astype("float64")
                    result[empty_groups] = np.nan

        result = result.T

        if self.how not in self.cast_blocklist:
            # e.g. if we are int64 and need to restore to datetime64/timedelta64
            # "rank" is the only member of cast_blocklist we get here
            res_dtype = self._get_result_dtype(orig_values.dtype)
            op_result = maybe_downcast_to_dtype(result, res_dtype)
        else:
            op_result = result

        # error: Incompatible return value type (got "Union[ExtensionArray, ndarray]",
        # expected "ndarray")
        return op_result  # type: ignore[return-value]
Ejemplo n.º 2
0
    def _cython_operation(
        self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs
    ) -> ArrayLike:
        """
        Returns the values of a cython operation.
        """
        orig_values = values
        assert kind in ["transform", "aggregate"]

        if values.ndim > 2:
            raise NotImplementedError("number of dimensions is currently limited to 2")
        elif values.ndim == 2:
            # Note: it is *not* the case that axis is always 0 for 1-dim values,
            #  as we can have 1D ExtensionArrays that we need to treat as 2D
            assert axis == 1, axis

        dtype = values.dtype
        is_numeric = is_numeric_dtype(dtype)

        cy_op = WrappedCythonOp(kind=kind, how=how)

        # can we do this operation with our cython functions
        # if not raise NotImplementedError
        cy_op.disallow_invalid_ops(dtype, is_numeric)

        if is_extension_array_dtype(dtype):
            return self._ea_wrap_cython_operation(
                kind, values, how, axis, min_count, **kwargs
            )

        elif values.ndim == 1:
            # expand to 2d, dispatch, then squeeze if appropriate
            values2d = values[None, :]
            res = self._cython_operation(
                kind=kind,
                values=values2d,
                how=how,
                axis=1,
                min_count=min_count,
                **kwargs,
            )
            if res.shape[0] == 1:
                return res[0]

            # otherwise we have OHLC
            return res.T

        is_datetimelike = needs_i8_conversion(dtype)

        if is_datetimelike:
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(dtype):
            values = ensure_int_or_float(values)
        elif is_integer_dtype(dtype):
            # we use iNaT for the missing value on ints
            # so pre-convert to guard this condition
            if (values == iNaT).any():
                values = ensure_float64(values)
            else:
                values = ensure_int_or_float(values)
        elif is_numeric:
            if not is_complex_dtype(dtype):
                values = ensure_float64(values)

        ngroups = self.ngroups
        comp_ids, _, _ = self.group_info

        assert axis == 1
        values = values.T

        out_shape = cy_op.get_output_shape(ngroups, values)
        func, values = cy_op.get_cython_func_and_vals(values, is_numeric)
        out_dtype = cy_op.get_out_dtype(values.dtype)

        result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
        if kind == "aggregate":
            counts = np.zeros(ngroups, dtype=np.int64)
            func(result, counts, values, comp_ids, min_count)
        elif kind == "transform":
            # TODO: min_count
            func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)

        if is_integer_dtype(result.dtype) and not is_datetimelike:
            mask = result == iNaT
            if mask.any():
                result = result.astype("float64")
                result[mask] = np.nan

        if kind == "aggregate" and self._filter_empty_groups and not counts.all():
            assert result.ndim != 2
            result = result[counts > 0]

        result = result.T

        if how not in base.cython_cast_blocklist:
            # e.g. if we are int64 and need to restore to datetime64/timedelta64
            # "rank" is the only member of cython_cast_blocklist we get here
            dtype = maybe_cast_result_dtype(orig_values.dtype, how)
            op_result = maybe_downcast_to_dtype(result, dtype)
        else:
            op_result = result

        return op_result
Ejemplo n.º 3
0
    def _cython_operation(self,
                          kind: str,
                          values,
                          how: str,
                          axis: int,
                          min_count: int = -1,
                          **kwargs) -> ArrayLike:
        """
        Returns the values of a cython operation.
        """
        orig_values = values
        assert kind in ["transform", "aggregate"]

        if values.ndim > 2:
            raise NotImplementedError(
                "number of dimensions is currently limited to 2")
        elif values.ndim == 2:
            # Note: it is *not* the case that axis is always 0 for 1-dim values,
            #  as we can have 1D ExtensionArrays that we need to treat as 2D
            assert axis == 1, axis

        dtype = values.dtype
        is_numeric = is_numeric_dtype(dtype)

        # can we do this operation with our cython functions
        # if not raise NotImplementedError
        self._disallow_invalid_ops(dtype, how, is_numeric)

        if is_extension_array_dtype(dtype):
            return self._ea_wrap_cython_operation(kind, values, how, axis,
                                                  min_count, **kwargs)

        is_datetimelike = needs_i8_conversion(dtype)

        if is_datetimelike:
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(dtype):
            values = ensure_int_or_float(values)
        elif is_integer_dtype(dtype):
            # we use iNaT for the missing value on ints
            # so pre-convert to guard this condition
            if (values == iNaT).any():
                values = ensure_float64(values)
            else:
                values = ensure_int_or_float(values)
        elif is_numeric and not is_complex_dtype(dtype):
            values = ensure_float64(values)
        else:
            values = values.astype(object)

        arity = self._cython_arity.get(how, 1)

        vdim = values.ndim
        swapped = False
        if vdim == 1:
            values = values[:, None]
            out_shape = (self.ngroups, arity)
        else:
            if axis > 0:
                swapped = True
                assert axis == 1, axis
                values = values.T
            if arity > 1:
                raise NotImplementedError(
                    "arity of more than 1 is not supported for the 'how' argument"
                )
            out_shape = (self.ngroups, ) + values.shape[1:]

        func, values = self._get_cython_func_and_vals(kind, how, values,
                                                      is_numeric)

        if how == "rank":
            out_dtype = "float"
        else:
            if is_numeric:
                out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}"
            else:
                out_dtype = "object"

        codes, _, _ = self.group_info

        if kind == "aggregate":
            result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
            counts = np.zeros(self.ngroups, dtype=np.int64)
            result = self._aggregate(result, counts, values, codes, func,
                                     min_count)
        elif kind == "transform":
            result = maybe_fill(np.empty(values.shape, dtype=out_dtype))

            # TODO: min_count
            result = self._transform(result, values, codes, func,
                                     is_datetimelike, **kwargs)

        if is_integer_dtype(result.dtype) and not is_datetimelike:
            mask = result == iNaT
            if mask.any():
                result = result.astype("float64")
                result[mask] = np.nan

        if kind == "aggregate" and self._filter_empty_groups and not counts.all(
        ):
            assert result.ndim != 2
            result = result[counts > 0]

        if vdim == 1 and arity == 1:
            result = result[:, 0]

        if swapped:
            result = result.swapaxes(0, axis)

        if how not in base.cython_cast_blocklist:
            # e.g. if we are int64 and need to restore to datetime64/timedelta64
            # "rank" is the only member of cython_cast_blocklist we get here
            dtype = maybe_cast_result_dtype(orig_values.dtype, how)
            op_result = maybe_downcast_to_dtype(result, dtype)
        else:
            op_result = result

        return op_result
Ejemplo n.º 4
0
    def _cython_operation(
        self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs
    ) -> Tuple[np.ndarray, Optional[List[str]]]:
        """
        Returns the values of a cython operation as a Tuple of [data, names].

        Names is only useful when dealing with 2D results, like ohlc
        (see self._name_functions).
        """
        assert kind in ["transform", "aggregate"]
        orig_values = values

        if values.ndim > 2:
            raise NotImplementedError("number of dimensions is currently limited to 2")
        elif values.ndim == 2:
            # Note: it is *not* the case that axis is always 0 for 1-dim values,
            #  as we can have 1D ExtensionArrays that we need to treat as 2D
            assert axis == 1, axis

        # can we do this operation with our cython functions
        # if not raise NotImplementedError

        # we raise NotImplemented if this is an invalid operation
        # entirely, e.g. adding datetimes

        # categoricals are only 1d, so we
        # are not setup for dim transforming
        if is_categorical_dtype(values.dtype) or is_sparse(values.dtype):
            raise NotImplementedError(f"{values.dtype} dtype not supported")
        elif is_datetime64_any_dtype(values.dtype):
            if how in ["add", "prod", "cumsum", "cumprod"]:
                raise NotImplementedError(
                    f"datetime64 type does not support {how} operations"
                )
        elif is_timedelta64_dtype(values.dtype):
            if how in ["prod", "cumprod"]:
                raise NotImplementedError(
                    f"timedelta64 type does not support {how} operations"
                )

        if is_datetime64tz_dtype(values.dtype):
            # Cast to naive; we'll cast back at the end of the function
            # TODO: possible need to reshape?
            # TODO(EA2D):kludge can be avoided when 2D EA is allowed.
            values = values.view("M8[ns]")

        is_datetimelike = needs_i8_conversion(values.dtype)
        is_numeric = is_numeric_dtype(values.dtype)

        if is_datetimelike:
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(values.dtype):
            values = ensure_int_or_float(values)
        elif is_integer_dtype(values):
            # we use iNaT for the missing value on ints
            # so pre-convert to guard this condition
            if (values == iNaT).any():
                values = ensure_float64(values)
            else:
                values = ensure_int_or_float(values)
        elif is_numeric and not is_complex_dtype(values):
            values = ensure_float64(ensure_float(values))
        else:
            values = values.astype(object)

        arity = self._cython_arity.get(how, 1)

        vdim = values.ndim
        swapped = False
        if vdim == 1:
            values = values[:, None]
            out_shape = (self.ngroups, arity)
        else:
            if axis > 0:
                swapped = True
                assert axis == 1, axis
                values = values.T
            if arity > 1:
                raise NotImplementedError(
                    "arity of more than 1 is not supported for the 'how' argument"
                )
            out_shape = (self.ngroups,) + values.shape[1:]

        func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric)

        if how == "rank":
            out_dtype = "float"
        else:
            if is_numeric:
                out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}"
            else:
                out_dtype = "object"

        codes, _, _ = self.group_info

        if kind == "aggregate":
            result = maybe_fill(np.empty(out_shape, dtype=out_dtype), fill_value=np.nan)
            counts = np.zeros(self.ngroups, dtype=np.int64)
            result = self._aggregate(result, counts, values, codes, func, min_count)
        elif kind == "transform":
            result = maybe_fill(
                np.empty_like(values, dtype=out_dtype), fill_value=np.nan
            )

            # TODO: min_count
            result = self._transform(
                result, values, codes, func, is_datetimelike, **kwargs
            )

        if is_integer_dtype(result) and not is_datetimelike:
            mask = result == iNaT
            if mask.any():
                result = result.astype("float64")
                result[mask] = np.nan

        if kind == "aggregate" and self._filter_empty_groups and not counts.all():
            assert result.ndim != 2
            result = result[counts > 0]

        if vdim == 1 and arity == 1:
            result = result[:, 0]

        names: Optional[List[str]] = self._name_functions.get(how, None)

        if swapped:
            result = result.swapaxes(0, axis)

        if is_datetime64tz_dtype(orig_values.dtype) or is_period_dtype(
            orig_values.dtype
        ):
            # We need to use the constructors directly for these dtypes
            # since numpy won't recognize them
            # https://github.com/pandas-dev/pandas/issues/31471
            result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype)
        elif is_datetimelike and kind == "aggregate":
            result = result.astype(orig_values.dtype)

        if is_extension_array_dtype(orig_values.dtype):
            result = maybe_cast_result(result=result, obj=orig_values, how=how)

        return result, names
Ejemplo n.º 5
0
    def _cython_operation(
        self,
        kind: str,
        values,
        how: str,
        axis: int,
        min_count: int = -1,
        mask: np.ndarray | None = None,
        **kwargs,
    ) -> ArrayLike:
        """
        Returns the values of a cython operation.
        """
        orig_values = values
        assert kind in ["transform", "aggregate"]

        if values.ndim > 2:
            raise NotImplementedError("number of dimensions is currently limited to 2")
        elif values.ndim == 2:
            # Note: it is *not* the case that axis is always 0 for 1-dim values,
            #  as we can have 1D ExtensionArrays that we need to treat as 2D
            assert axis == 1, axis

        dtype = values.dtype
        is_numeric = is_numeric_dtype(dtype)

        cy_op = WrappedCythonOp(kind=kind, how=how)

        # can we do this operation with our cython functions
        # if not raise NotImplementedError
        cy_op.disallow_invalid_ops(dtype, is_numeric)

        func_uses_mask = cy_op.uses_mask()
        if is_extension_array_dtype(dtype):
            if isinstance(values, BaseMaskedArray) and func_uses_mask:
                return self._masked_ea_wrap_cython_operation(
                    kind, values, how, axis, min_count, **kwargs
                )
            else:
                return self._ea_wrap_cython_operation(
                    kind, values, how, axis, min_count, **kwargs
                )

        elif values.ndim == 1:
            # expand to 2d, dispatch, then squeeze if appropriate
            values2d = values[None, :]
            res = self._cython_operation(
                kind=kind,
                values=values2d,
                how=how,
                axis=1,
                min_count=min_count,
                mask=mask,
                **kwargs,
            )
            if res.shape[0] == 1:
                return res[0]

            # otherwise we have OHLC
            return res.T

        is_datetimelike = needs_i8_conversion(dtype)

        if is_datetimelike:
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(dtype):
            values = values.astype("int64")
        elif is_integer_dtype(dtype):
            # e.g. uint8 -> uint64, int16 -> int64
            dtype = dtype.kind + "8"
            values = values.astype(dtype, copy=False)
        elif is_numeric:
            if not is_complex_dtype(dtype):
                values = ensure_float64(values)

        ngroups = self.ngroups
        comp_ids, _, _ = self.group_info

        assert axis == 1
        values = values.T

        if mask is not None:
            mask = mask.reshape(values.shape, order="C")

        out_shape = cy_op.get_output_shape(ngroups, values)
        func, values = cy_op.get_cython_func_and_vals(values, is_numeric)
        out_dtype = cy_op.get_out_dtype(values.dtype)

        result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
        if kind == "aggregate":
            counts = np.zeros(ngroups, dtype=np.int64)
            if how in ["min", "max"]:
                func(
                    result,
                    counts,
                    values,
                    comp_ids,
                    min_count,
                    is_datetimelike=is_datetimelike,
                )
            else:
                func(result, counts, values, comp_ids, min_count)
        elif kind == "transform":
            # TODO: min_count
            if func_uses_mask:
                func(
                    result,
                    values,
                    comp_ids,
                    ngroups,
                    is_datetimelike,
                    mask=mask,
                    **kwargs,
                )
            else:
                func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)

        if kind == "aggregate":
            # i.e. counts is defined.  Locations where count<min_count
            # need to have the result set to np.nan, which may require casting,
            # see GH#40767
            if is_integer_dtype(result.dtype) and not is_datetimelike:
                cutoff = max(1, min_count)
                empty_groups = counts < cutoff
                if empty_groups.any():
                    # Note: this conversion could be lossy, see GH#40767
                    result = result.astype("float64")
                    result[empty_groups] = np.nan

            if self._filter_empty_groups and not counts.all():
                assert result.ndim != 2
                result = result[counts > 0]

        result = result.T

        if how not in cy_op.cast_blocklist:
            # e.g. if we are int64 and need to restore to datetime64/timedelta64
            # "rank" is the only member of cast_blocklist we get here
            dtype = maybe_cast_result_dtype(orig_values.dtype, how)
            op_result = maybe_downcast_to_dtype(result, dtype)
        else:
            op_result = result

        return op_result
Ejemplo n.º 6
0
Archivo: ops.py Proyecto: tnir/pandas
    def _call_cython_op(
        self,
        values: np.ndarray,  # np.ndarray[ndim=2]
        *,
        min_count: int,
        ngroups: int,
        comp_ids: np.ndarray,
        mask: npt.NDArray[np.bool_] | None,
        result_mask: npt.NDArray[np.bool_] | None,
        **kwargs,
    ) -> np.ndarray:  # np.ndarray[ndim=2]
        orig_values = values

        dtype = values.dtype
        is_numeric = is_numeric_dtype(dtype)

        is_datetimelike = needs_i8_conversion(dtype)

        if is_datetimelike:
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(dtype):
            values = values.view("uint8")
        if values.dtype == "float16":
            values = values.astype(np.float32)

        values = values.T
        if mask is not None:
            mask = mask.T
            if result_mask is not None:
                result_mask = result_mask.T

        out_shape = self._get_output_shape(ngroups, values)
        func = self._get_cython_function(self.kind, self.how, values.dtype, is_numeric)
        values = self._get_cython_vals(values)
        out_dtype = self._get_out_dtype(values.dtype)

        result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
        if self.kind == "aggregate":
            counts = np.zeros(ngroups, dtype=np.int64)
            if self.how in ["min", "max", "mean", "last", "first"]:
                func(
                    out=result,
                    counts=counts,
                    values=values,
                    labels=comp_ids,
                    min_count=min_count,
                    mask=mask,
                    result_mask=result_mask,
                    is_datetimelike=is_datetimelike,
                )
            elif self.how in ["sum"]:
                # We support datetimelike
                func(
                    out=result,
                    counts=counts,
                    values=values,
                    labels=comp_ids,
                    min_count=min_count,
                    is_datetimelike=is_datetimelike,
                )
            else:
                func(result, counts, values, comp_ids, min_count)
        else:
            # TODO: min_count
            if self.uses_mask():
                if self.how != "rank":
                    # TODO: should rank take result_mask?
                    kwargs["result_mask"] = result_mask
                func(
                    out=result,
                    values=values,
                    labels=comp_ids,
                    ngroups=ngroups,
                    is_datetimelike=is_datetimelike,
                    mask=mask,
                    **kwargs,
                )
            else:
                func(
                    out=result,
                    values=values,
                    labels=comp_ids,
                    ngroups=ngroups,
                    is_datetimelike=is_datetimelike,
                    **kwargs,
                )

        if self.kind == "aggregate":
            # i.e. counts is defined.  Locations where count<min_count
            # need to have the result set to np.nan, which may require casting,
            # see GH#40767
            if is_integer_dtype(result.dtype) and not is_datetimelike:
                cutoff = max(1, min_count)
                empty_groups = counts < cutoff
                if empty_groups.any():
                    if result_mask is not None and self.uses_mask():
                        assert result_mask[empty_groups].all()
                    else:
                        # Note: this conversion could be lossy, see GH#40767
                        result = result.astype("float64")
                        result[empty_groups] = np.nan

        result = result.T

        if self.how not in self.cast_blocklist:
            # e.g. if we are int64 and need to restore to datetime64/timedelta64
            # "rank" is the only member of cast_blocklist we get here
            # Casting only needed for float16, bool, datetimelike,
            #  and self.how in ["sum", "prod", "ohlc", "cumprod"]
            res_dtype = self._get_result_dtype(orig_values.dtype)
            op_result = maybe_downcast_to_dtype(result, res_dtype)
        else:
            op_result = result

        return op_result
Ejemplo n.º 7
0
    def _cython_operation(self,
                          kind: str,
                          values,
                          how: str,
                          axis: int,
                          min_count: int = -1,
                          **kwargs) -> Tuple[np.ndarray, Optional[List[str]]]:
        """
        Returns the values of a cython operation as a Tuple of [data, names].

        Names is only useful when dealing with 2D results, like ohlc
        (see self._name_functions).
        """
        orig_values = values
        assert kind in ["transform", "aggregate"]

        if values.ndim > 2:
            raise NotImplementedError(
                "number of dimensions is currently limited to 2")
        elif values.ndim == 2:
            # Note: it is *not* the case that axis is always 0 for 1-dim values,
            #  as we can have 1D ExtensionArrays that we need to treat as 2D
            assert axis == 1, axis

        # can we do this operation with our cython functions
        # if not raise NotImplementedError
        self._disallow_invalid_ops(values, how)

        if is_extension_array_dtype(values.dtype):
            return self._ea_wrap_cython_operation(kind, values, how, axis,
                                                  min_count, **kwargs)

        is_datetimelike = needs_i8_conversion(values.dtype)
        is_numeric = is_numeric_dtype(values.dtype)

        if is_datetimelike:
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(values.dtype):
            values = ensure_int_or_float(values)
        elif is_integer_dtype(values):
            # we use iNaT for the missing value on ints
            # so pre-convert to guard this condition
            if (values == iNaT).any():
                values = ensure_float64(values)
            else:
                values = ensure_int_or_float(values)
        elif is_numeric and not is_complex_dtype(values):
            values = ensure_float64(ensure_float(values))
        else:
            values = values.astype(object)

        arity = self._cython_arity.get(how, 1)

        vdim = values.ndim
        swapped = False
        if vdim == 1:
            values = values[:, None]
            out_shape = (self.ngroups, arity)
        else:
            if axis > 0:
                swapped = True
                assert axis == 1, axis
                values = values.T
            if arity > 1:
                raise NotImplementedError(
                    "arity of more than 1 is not supported for the 'how' argument"
                )
            out_shape = (self.ngroups, ) + values.shape[1:]

        func, values = self._get_cython_func_and_vals(kind, how, values,
                                                      is_numeric)

        if how == "rank":
            out_dtype = "float"
        else:
            if is_numeric:
                out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}"
            else:
                out_dtype = "object"

        codes, _, _ = self.group_info

        if kind == "aggregate":
            result = maybe_fill(np.empty(out_shape, dtype=out_dtype),
                                fill_value=np.nan)
            counts = np.zeros(self.ngroups, dtype=np.int64)
            result = self._aggregate(result, counts, values, codes, func,
                                     min_count)
        elif kind == "transform":
            result = maybe_fill(np.empty_like(values, dtype=out_dtype),
                                fill_value=np.nan)

            # TODO: min_count
            result = self._transform(result, values, codes, func,
                                     is_datetimelike, **kwargs)

        if is_integer_dtype(result) and not is_datetimelike:
            mask = result == iNaT
            if mask.any():
                result = result.astype("float64")
                result[mask] = np.nan

        if kind == "aggregate" and self._filter_empty_groups and not counts.all(
        ):
            assert result.ndim != 2
            result = result[counts > 0]

        if vdim == 1 and arity == 1:
            result = result[:, 0]

        names: Optional[List[str]] = self._name_functions.get(how, None)

        if swapped:
            result = result.swapaxes(0, axis)

        if is_datetimelike and kind == "aggregate":
            result = result.astype(orig_values.dtype)

        return result, names
Ejemplo n.º 8
0
    def _call_cython_op(
        self,
        values: np.ndarray,  # np.ndarray[ndim=2]
        *,
        min_count: int,
        ngroups: int,
        comp_ids: np.ndarray,
        mask: np.ndarray | None,
        **kwargs,
    ) -> np.ndarray:  # np.ndarray[ndim=2]
        orig_values = values

        dtype = values.dtype
        is_numeric = is_numeric_dtype(dtype)

        is_datetimelike = needs_i8_conversion(dtype)

        if is_datetimelike:
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(dtype):
            values = values.astype("int64")
        elif is_integer_dtype(dtype):
            # e.g. uint8 -> uint64, int16 -> int64
            dtype_str = dtype.kind + "8"
            values = values.astype(dtype_str, copy=False)
        elif is_numeric:
            if not is_complex_dtype(dtype):
                values = ensure_float64(values)

        values = values.T

        if mask is not None:
            mask = mask.reshape(values.shape, order="C")

        out_shape = self.get_output_shape(ngroups, values)
        func, values = self.get_cython_func_and_vals(values, is_numeric)
        out_dtype = self.get_out_dtype(values.dtype)

        result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
        if self.kind == "aggregate":
            counts = np.zeros(ngroups, dtype=np.int64)
            if self.how in ["min", "max"]:
                func(
                    result,
                    counts,
                    values,
                    comp_ids,
                    min_count,
                    is_datetimelike=is_datetimelike,
                )
            else:
                func(result, counts, values, comp_ids, min_count)
        else:
            # TODO: min_count
            if self.uses_mask():
                func(
                    result,
                    values,
                    comp_ids,
                    ngroups,
                    is_datetimelike,
                    mask=mask,
                    **kwargs,
                )
            else:
                func(result, values, comp_ids, ngroups, is_datetimelike,
                     **kwargs)

        if self.kind == "aggregate":
            # i.e. counts is defined.  Locations where count<min_count
            # need to have the result set to np.nan, which may require casting,
            # see GH#40767
            if is_integer_dtype(result.dtype) and not is_datetimelike:
                cutoff = max(1, min_count)
                empty_groups = counts < cutoff
                if empty_groups.any():
                    # Note: this conversion could be lossy, see GH#40767
                    result = result.astype("float64")
                    result[empty_groups] = np.nan

        result = result.T

        if self.how not in self.cast_blocklist:
            # e.g. if we are int64 and need to restore to datetime64/timedelta64
            # "rank" is the only member of cast_blocklist we get here
            res_dtype = self.get_result_dtype(orig_values.dtype)
            # error: Argument 2 to "maybe_downcast_to_dtype" has incompatible type
            # "Union[dtype[Any], ExtensionDtype]"; expected "Union[str, dtype[Any]]"
            op_result = maybe_downcast_to_dtype(
                result,
                res_dtype  # type: ignore[arg-type]
            )
        else:
            op_result = result

        # error: Incompatible return value type (got "Union[ExtensionArray, ndarray]",
        # expected "ndarray")
        return op_result  # type: ignore[return-value]