Example #1
    def _aggregate_series_pure_python(self, obj: Series, func: F):
        group_index, _, ngroups = self.group_info

        counts = np.zeros(ngroups, dtype=int)
        result = np.empty(ngroups, dtype="O")
        initialized = False

        splitter = get_splitter(obj, group_index, ngroups, axis=0)

        for label, group in splitter:

            # Each step of this loop corresponds to
            #  libreduction._BaseGrouper._apply_to_group
            res = func(group)
            res = libreduction.extract_result(res)

            if not initialized:
                # We only do this validation on the first iteration
                libreduction.check_result_array(res, 0)
                initialized = True

            counts[label] = group.shape[0]
            result[label] = res

        out = lib.maybe_convert_objects(result, try_float=False)
        out = maybe_cast_result(out, obj, numeric_only=True)

        return out, counts
Example #2
    def _aggregate_series_pure_python(self, obj: Series, func: F):
        group_index, _, ngroups = self.group_info

        counts = np.zeros(ngroups, dtype=int)
        result = np.empty(ngroups, dtype="O")
        initialized = False

        splitter = get_splitter(obj, group_index, ngroups, axis=0)

        for label, group in splitter:

            # Each step of this loop corresponds to
            #  libreduction._BaseGrouper._apply_to_group
            res = func(group)
            res = libreduction.extract_result(res)

            if not initialized:
                # We only do this validation on the first iteration
                libreduction.check_result_array(res, 0)
                initialized = True

            counts[label] = group.shape[0]
            result[label] = res

        result = lib.maybe_convert_objects(result, try_float=False)
        # error: Incompatible types in assignment (expression has type
        # "Union[ExtensionArray, ndarray]", variable has type "ndarray")
        result = maybe_cast_result(  # type: ignore[assignment]
            result, obj, numeric_only=True)

        return result, counts
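
Both variants above implement the same pattern: split the Series into groups, apply `func` to each group, and record the group sizes alongside the object-dtype results. Below is a minimal standalone sketch of that pattern using only numpy; the function name `aggregate_by_label` and the boolean-mask splitting are illustrative stand-ins for `get_splitter` and the `libreduction` helpers, not the pandas internals.

    import numpy as np

    def aggregate_by_label(values, labels, ngroups, func):
        # One pass per group: apply `func`, keep the group size in `counts`
        # and the (object-dtype) result in `result`, mirroring the loop above.
        counts = np.zeros(ngroups, dtype=np.int64)
        result = np.empty(ngroups, dtype=object)
        for label in range(ngroups):
            group = values[labels == label]
            counts[label] = group.shape[0]
            result[label] = func(group)
        return result, counts

    # Usage: group-wise sums over three groups
    vals = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
    labs = np.array([0, 1, 0, 2, 1])
    res, cnt = aggregate_by_label(vals, labs, 3, np.sum)
    # res -> array([4.0, 7.0, 4.0], dtype=object); cnt -> array([2, 2, 1])
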
Example #3
    def _ea_wrap_cython_operation(
            self,
            kind: str,
            values,
            how: str,
            axis: int,
            min_count: int = -1,
            **kwargs) -> Tuple[np.ndarray, Optional[List[str]]]:
        """
        If we have an ExtensionArray, unwrap, call _cython_operation, and
        re-wrap if appropriate.
        """
        # TODO: general case implementation overrideable by EAs.
        orig_values = values

        if is_datetime64tz_dtype(values.dtype) or is_period_dtype(
                values.dtype):
            # All of the functions implemented here are ordinal, so we can
            #  operate on the tz-naive equivalents
            values = values.view("M8[ns]")
            res_values = self._cython_operation(kind, values, how, axis,
                                                min_count, **kwargs)
            if how in ["rank"]:
                # preserve float64 dtype
                return res_values

            res_values = res_values.astype("i8", copy=False)
            result = type(orig_values)._simple_new(res_values,
                                                   dtype=orig_values.dtype)
            return result

        elif is_integer_dtype(values.dtype) or is_bool_dtype(values.dtype):
            # IntegerArray or BooleanArray
            values = ensure_int_or_float(values)
            res_values = self._cython_operation(kind, values, how, axis,
                                                min_count, **kwargs)
            result = maybe_cast_result(result=res_values,
                                       obj=orig_values,
                                       how=how)
            return result

        raise NotImplementedError(values.dtype)
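
The branches above follow the docstring: unwrap the ExtensionArray to a plain ndarray the cython kernels understand, run the operation, then re-wrap the result in the original extension dtype. A rough public-API sketch of that unwrap / operate / re-wrap idea for the IntegerArray branch follows; the numpy reduction stands in for the cython kernel, and this is not the pandas implementation.

    import numpy as np
    import pandas as pd

    ea = pd.array([1, 2, 3, 4], dtype="Int64")      # IntegerArray input
    plain = ea.to_numpy(dtype="int64")              # unwrap (no NA values here)
    res_values = np.array([plain.sum()])            # stand-in for the cython op
    result = pd.array(res_values, dtype=ea.dtype)   # re-wrap as Int64, in the
                                                    # spirit of maybe_cast_result
    # result -> <IntegerArray> [10] with dtype Int64
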
Example #4
    def _cython_operation(self,
                          kind: str,
                          values,
                          how: str,
                          axis: int,
                          min_count: int = -1,
                          **kwargs) -> Tuple[np.ndarray, Optional[List[str]]]:
        """
        Returns the values of a cython operation as a Tuple of [data, names].

        Names is only useful when dealing with 2D results, like ohlc
        (see self._name_functions).
        """
        assert kind in ["transform", "aggregate"]
        orig_values = values

        if values.ndim > 2:
            raise NotImplementedError(
                "number of dimensions is currently limited to 2")
        elif values.ndim == 2:
            # Note: it is *not* the case that axis is always 0 for 1-dim values,
            #  as we can have 1D ExtensionArrays that we need to treat as 2D
            assert axis == 1, axis

        # can we do this operation with our cython functions
        # if not raise NotImplementedError

        # we raise NotImplementedError if this is an invalid operation
        # entirely, e.g. adding datetimes

        # categoricals are only 1d, so we
        # are not set up for dim transforming
        if is_categorical_dtype(values.dtype) or is_sparse(values.dtype):
            raise NotImplementedError(f"{values.dtype} dtype not supported")
        elif is_datetime64_any_dtype(values.dtype):
            if how in ["add", "prod", "cumsum", "cumprod"]:
                raise NotImplementedError(
                    f"datetime64 type does not support {how} operations")
        elif is_timedelta64_dtype(values.dtype):
            if how in ["prod", "cumprod"]:
                raise NotImplementedError(
                    f"timedelta64 type does not support {how} operations")

        if is_datetime64tz_dtype(values.dtype):
            # Cast to naive; we'll cast back at the end of the function
            # TODO: possible need to reshape?
            # TODO(EA2D): kludge can be avoided when 2D EA is allowed.
            values = values.view("M8[ns]")

        is_datetimelike = needs_i8_conversion(values.dtype)
        is_numeric = is_numeric_dtype(values.dtype)

        if is_datetimelike:
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(values.dtype):
            values = ensure_float64(values)
        elif is_integer_dtype(values):
            # we use iNaT for the missing value on ints
            # so pre-convert to guard this condition
            if (values == iNaT).any():
                values = ensure_float64(values)
            else:
                values = ensure_int_or_float(values)
        elif is_numeric and not is_complex_dtype(values):
            values = ensure_float64(values)
        else:
            values = values.astype(object)

        arity = self._cython_arity.get(how, 1)

        vdim = values.ndim
        swapped = False
        if vdim == 1:
            values = values[:, None]
            out_shape = (self.ngroups, arity)
        else:
            if axis > 0:
                swapped = True
                assert axis == 1, axis
                values = values.T
            if arity > 1:
                raise NotImplementedError(
                    "arity of more than 1 is not supported for the 'how' argument"
                )
            out_shape = (self.ngroups, ) + values.shape[1:]

        func, values = self._get_cython_func_and_vals(kind, how, values,
                                                      is_numeric)

        if how == "rank":
            out_dtype = "float"
        else:
            if is_numeric:
                out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}"
            else:
                out_dtype = "object"

        codes, _, _ = self.group_info

        if kind == "aggregate":
            result = _maybe_fill(np.empty(out_shape, dtype=out_dtype),
                                 fill_value=np.nan)
            counts = np.zeros(self.ngroups, dtype=np.int64)
            result = self._aggregate(result, counts, values, codes, func,
                                     min_count)
        elif kind == "transform":
            result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
                                 fill_value=np.nan)

            # TODO: min_count
            result = self._transform(result, values, codes, func,
                                     is_datetimelike, **kwargs)

        if is_integer_dtype(result) and not is_datetimelike:
            mask = result == iNaT
            if mask.any():
                result = result.astype("float64")
                result[mask] = np.nan

        if kind == "aggregate" and self._filter_empty_groups and not counts.all(
        ):
            assert result.ndim != 2
            result = result[counts > 0]

        if vdim == 1 and arity == 1:
            result = result[:, 0]

        names: Optional[List[str]] = self._name_functions.get(how, None)

        if swapped:
            result = result.swapaxes(0, axis)

        if is_datetime64tz_dtype(orig_values.dtype) or is_period_dtype(
                orig_values.dtype):
            # We need to use the constructors directly for these dtypes
            # since numpy won't recognize them
            # https://github.com/pandas-dev/pandas/issues/31471
            result = type(orig_values)(result.astype(np.int64),
                                       dtype=orig_values.dtype)
        elif is_datetimelike and kind == "aggregate":
            result = result.astype(orig_values.dtype)

        if is_extension_array_dtype(orig_values.dtype):
            result = maybe_cast_result(result=result, obj=orig_values, how=how)

        return result, names
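
Much of the length of `_cython_operation` is dtype preparation before the kernel runs: datetimelike values are viewed as int64, booleans become float64, integer arrays holding the iNaT sentinel are promoted to float64, and anything the kernels cannot handle falls back to object. The sketch below is a loosely simplified restatement of those rules; the function name and the exact promotions are illustrative, not the precise pandas behaviour.

    import numpy as np

    iNaT = np.iinfo(np.int64).min  # pandas' integer sentinel for NaT

    def prepare_values(values: np.ndarray) -> np.ndarray:
        # Simplified restatement of the dtype normalization above.
        if values.dtype.kind in "mM":            # datetime64 / timedelta64
            return values.view("int64")
        if values.dtype.kind == "b":             # bool -> float64
            return values.astype("float64")
        if values.dtype.kind in "iu":            # ints: promote if iNaT present
            if values.dtype.kind == "i" and (values == iNaT).any():
                return values.astype("float64")
            return values
        if values.dtype.kind == "f":             # floats normalized to float64
            return values.astype("float64", copy=False)
        return values.astype(object)             # everything else -> object

    # Usage:
    prepare_values(np.array(["2021-01-01"], dtype="M8[ns]")).dtype  # int64
    prepare_values(np.array([True, False])).dtype                   # float64
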