Exemple #1
0
def sanitize_index(data, index: Index):
    """
    Check that ``data`` is as long as ``index`` and normalize ndarray input.

    Parameters
    ----------
    data : sized object
        Values to validate; anything that supports ``len``.
    index : Index
        Index whose length ``data`` has to match.

    Returns
    -------
    object
        ``data`` itself when it is not an ``np.ndarray``; otherwise the result
        of ``sanitize_to_nanoseconds(data)``.

    Raises
    ------
    ValueError
        If ``len(data) != len(index)``.
    """
    n_values, n_index = len(data), len(index)
    if n_values != n_index:
        raise ValueError(
            "Length of values "
            f"({n_values}) "
            "does not match length of index "
            f"({n_index})"
        )

    # Anything that is not an ndarray is handed back untouched.
    if not isinstance(data, np.ndarray):
        return data

    # coerce datetimelike types to ns
    return sanitize_to_nanoseconds(data)
Exemple #2
0
    def __init__(
        self,
        index: Index,
        grouper=None,
        obj: FrameOrSeries | None = None,
        level=None,
        sort: bool = True,
        observed: bool = False,
        in_axis: bool = False,
        dropna: bool = True,
    ):
        """
        Store the grouping options and resolve ``grouper`` into grouping values.

        Parameters
        ----------
        index : Index
            Index the grouping is aligned with.
        grouper : object, optional
            Grouping key. Kept as passed in ``_orig_grouper``; the value
            returned by ``_convert_grouper(index, grouper)`` is what gets
            resolved below.
        obj : FrameOrSeries, optional
            Object being grouped. Replaced by the object returned from
            ``Grouper._get_grouper`` when the key is a ``Grouper``.
        level : optional
            Stored as ``self.level``.
        sort, observed, in_axis, dropna : bool
            Stored as the same-named attributes. ``sort`` and ``observed``
            are also forwarded to ``recode_for_groupby`` for categorical keys.

        Raises
        ------
        ValueError
            If a key that is not a Series/Index/ExtensionArray/ndarray
            reports an ``ndim`` other than 1.
        AssertionError
            If mapping such a key over ``index`` yields something whose
            length differs from ``len(index)``.
        """
        self.level = level
        self._orig_grouper = grouper
        self.grouper = _convert_grouper(index, grouper)
        self.all_grouper = None
        self.index = index
        self.sort = sort
        self.obj = obj
        self.observed = observed
        self.in_axis = in_axis
        self.dropna = dropna

        self._passed_categorical = False

        # The single key can take many forms, and how it is interpreted may
        # depend on the level that was passed in.
        ilevel = self._ilevel
        if ilevel is not None:
            # yields (Index, codes, group index) for the requested level
            level_parts = index._get_grouper_for_level(self.grouper, ilevel)
            self.grouper, self._codes, self._group_index = level_parts

        elif isinstance(self.grouper, Grouper):
            # A Grouper-like key: resolve it the same way a single-key groupby
            # would. key/level were disambiguated before reaching this point,
            # hence validate=False.
            _, resolved, resolved_obj = self.grouper._get_grouper(
                # error: Value of type variable "FrameOrSeries" of "_get_grouper"
                # of "Grouper" cannot be "Optional[FrameOrSeries]"
                self.obj,  # type: ignore[type-var]
                validate=False,
            )
            self.obj = resolved_obj

            inner = resolved._get_grouper()
            if not isinstance(resolved, ops.BinGrouper):
                # ops.BaseGrouper: wrap in an Index rather than keeping an
                # ndarray so the name can be recovered later.
                inner = Index(inner, name=resolved.result_index.name)
            # for a BinGrouper, `inner is resolved` and is stored as-is
            self.grouper = inner

        elif is_categorical_dtype(self.grouper):
            # a Categorical was passed
            self._passed_categorical = True

            recoded = recode_for_groupby(self.grouper, self.sort, observed)
            self.grouper, self.all_grouper = recoded

        elif not isinstance(self.grouper, (Series, Index, ExtensionArray, np.ndarray)):
            # no level passed and the key is not array-like: map it over index
            if getattr(self.grouper, "ndim", 1) != 1:
                label = self.name or str(type(self.grouper))
                raise ValueError(f"Grouper for '{label}' not 1-dimensional")

            self.grouper = self.index.map(self.grouper)

            if (
                not hasattr(self.grouper, "__len__")
                or len(self.grouper) != len(self.index)
            ):
                shown = pprint_thing(self.grouper)
                self.grouper = None  # Try for sanity
                raise AssertionError(
                    "Grouper result violates len(labels) == "
                    f"len(data)\nresult: {shown}"
                )

        if isinstance(self.grouper, np.ndarray):
            # date/time-like ndarray keys are normalized so that we end up
            # with Timestamp-like values
            self.grouper = sanitize_to_nanoseconds(self.grouper)
Exemple #3
0
def _try_cast(
    arr: list | np.ndarray,
    dtype: DtypeObj | None,
    copy: bool,
    raise_cast_failure: bool,
) -> ArrayLike:
    """
    Convert input to numpy ndarray and optionally cast to a given dtype.

    Parameters
    ----------
    arr : ndarray or list
        Excludes: ExtensionArray, Series, Index.
    dtype : np.dtype, ExtensionDtype or None
    copy : bool
        If False, don't copy the data if not needed.
    raise_cast_failure : bool
        If True, and if a dtype is specified, raise errors during casting.
        Otherwise an object array is returned.

    Returns
    -------
    np.ndarray or ExtensionArray

    Raises
    ------
    ValueError, TypeError
        Re-raised from the final numeric/NumPy cast when
        ``raise_cast_failure`` is True. Calls made in the earlier branches
        are not guarded by this function.

    Notes
    -----
    ``np.array(..., copy=False)`` is deliberately not used. Since NumPy 2.0
    that spelling raises ``ValueError`` whenever a copy cannot be avoided
    (always the case for a list), whereas ``np.asarray`` keeps the
    "copy only if needed" meaning on every NumPy version.
    """
    is_ndarray = isinstance(arr, np.ndarray)

    if dtype is None:
        # perf shortcut as this is the most common case
        if is_ndarray:
            arr = cast(np.ndarray, arr)
            if arr.dtype != object:
                return sanitize_to_nanoseconds(arr, copy=copy)

            out = maybe_infer_to_datetimelike(arr)
            if out is arr and copy:
                # inference returned the input itself; honor the copy request
                out = out.copy()
            return out

        else:
            # i.e. list; building an array from a list always allocates, so
            #  "copy only if needed" (asarray) is the right request here
            varr = np.asarray(arr)
            # filter out cases that we _dont_ want to go through
            #  maybe_infer_to_datetimelike
            if varr.dtype != object or varr.size == 0:
                return varr
            return maybe_infer_to_datetimelike(varr)

    elif isinstance(dtype, ExtensionDtype):
        # create an extension array from its dtype
        if isinstance(dtype, DatetimeTZDtype):
            # We can't go through _from_sequence because it handles dt64naive
            #  data differently; _from_sequence treats naive as wall times,
            #  while maybe_cast_to_datetime treats it as UTC
            #  see test_maybe_promote_any_numpy_dtype_with_datetimetz
            # TODO(2.0): with deprecations enforced, should be able to remove
            #  special case.
            return maybe_cast_to_datetime(arr, dtype)
            # TODO: copy?

        array_type = dtype.construct_array_type()._from_sequence
        subarr = array_type(arr, dtype=dtype, copy=copy)
        return subarr

    elif is_object_dtype(dtype):
        if not is_ndarray:
            subarr = construct_1d_object_array_from_listlike(arr)
            return subarr
        return ensure_wrapped_if_datetimelike(arr).astype(dtype, copy=copy)

    elif dtype.kind == "U":
        # TODO: test cases with arr.dtype.kind in ["m", "M"]
        return lib.ensure_string_array(arr, convert_na_value=False, copy=copy)

    elif dtype.kind in ["m", "M"]:
        return maybe_cast_to_datetime(arr, dtype)

    try:
        # GH#15832: Check if we are requesting a numeric dtype and
        # that we can convert the data to the requested dtype.
        if is_integer_dtype(dtype):
            # this will raise if we have e.g. floats

            subarr = maybe_cast_to_integer_array(arr, dtype)
        else:
            # 4 tests fail if we move this to a try/except/else; see
            #  test_constructor_compound_dtypes, test_constructor_cast_failure
            #  test_constructor_dict_cast2, test_loc_setitem_dtype
            if copy:
                subarr = np.array(arr, dtype=dtype)
            else:
                # asarray == "copy only if needed"; copy=False would raise on
                #  NumPy >= 2 and be mistaken for a cast failure below
                subarr = np.asarray(arr, dtype=dtype)

    except (ValueError, TypeError):
        if raise_cast_failure:
            raise
        else:
            # we only get here with raise_cast_failure False, which means
            #  called via the DataFrame constructor
            # GH#24435
            warnings.warn(
                f"Could not cast to {dtype}, falling back to object. This "
                "behavior is deprecated. In a future version, when a dtype is "
                "passed to 'DataFrame', either all columns will be cast to that "
                "dtype, or a TypeError will be raised.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
            if copy:
                subarr = np.array(arr, dtype=object)
            else:
                subarr = np.asarray(arr, dtype=object)
    return subarr
Exemple #4
0
def _try_cast(
    arr: list | np.ndarray,
    dtype: DtypeObj | None,
    copy: bool,
    raise_cast_failure: bool,
) -> ArrayLike:
    """
    Convert input to numpy ndarray and optionally cast to a given dtype.

    Parameters
    ----------
    arr : ndarray or list
        Excludes: ExtensionArray, Series, Index.
    dtype : np.dtype, ExtensionDtype or None
    copy : bool
        If False, don't copy the data if not needed.
    raise_cast_failure : bool
        If True, and if a dtype is specified, raise errors during casting.
        Otherwise an object array is returned.

    Returns
    -------
    np.ndarray or ExtensionArray

    Raises
    ------
    OutOfBoundsDatetime
        Always re-raised from the final cast.
    ValueError, TypeError
        Re-raised from the final cast when a dtype was given and
        ``raise_cast_failure`` is True, or when the message contains
        "Cannot cast". Calls made in the earlier branches are not guarded.

    Notes
    -----
    ``np.array(..., copy=False)`` is deliberately not used. Since NumPy 2.0
    that spelling raises ``ValueError`` whenever a copy cannot be avoided
    (always the case for a list), whereas ``np.asarray`` keeps the
    "copy only if needed" meaning on every NumPy version.
    """
    is_ndarray = isinstance(arr, np.ndarray)

    # perf shortcut as this is the most common case
    # Item "List[Any]" of "Union[List[Any], ndarray]" has no attribute "dtype"
    if (is_ndarray and arr.dtype != object  # type: ignore[union-attr]
            and not copy and dtype is None):
        # Argument 1 to "sanitize_to_nanoseconds" has incompatible type
        # "Union[List[Any], ndarray]"; expected "ndarray"
        return sanitize_to_nanoseconds(arr)  # type: ignore[arg-type]

    if isinstance(dtype, ExtensionDtype):
        # create an extension array from its dtype
        # DatetimeTZ case needs to go through maybe_cast_to_datetime but
        # SparseDtype does not
        if isinstance(dtype, DatetimeTZDtype):
            # We can't go through _from_sequence because it handles dt64naive
            #  data differently; _from_sequence treats naive as wall times,
            #  while maybe_cast_to_datetime treats it as UTC
            #  see test_maybe_promote_any_numpy_dtype_with_datetimetz

            # error: Incompatible return value type (got "Union[ExtensionArray,
            # ndarray, List[Any]]", expected "Union[ExtensionArray, ndarray]")
            return maybe_cast_to_datetime(arr,
                                          dtype)  # type: ignore[return-value]
            # TODO: copy?

        array_type = dtype.construct_array_type()._from_sequence
        subarr = array_type(arr, dtype=dtype, copy=copy)
        return subarr

    elif is_object_dtype(dtype):
        if not is_ndarray:
            subarr = construct_1d_object_array_from_listlike(arr)
            return subarr
        return ensure_wrapped_if_datetimelike(arr).astype(dtype, copy=copy)

    elif dtype is None and not is_ndarray:
        # filter out cases that we _dont_ want to go through maybe_cast_to_datetime
        # arr is a list here, so an allocation is unavoidable; asarray asks
        # for "copy only if needed" without tripping NumPy >= 2's strict
        # copy=False
        varr = np.asarray(arr)
        if varr.dtype != object or varr.size == 0:
            return varr
        # error: Incompatible return value type (got "Union[ExtensionArray,
        # ndarray, List[Any]]", expected "Union[ExtensionArray, ndarray]")
        return maybe_cast_to_datetime(varr, None)  # type: ignore[return-value]

    try:
        # GH#15832: Check if we are requesting a numeric dtype and
        # that we can convert the data to the requested dtype.
        if is_integer_dtype(dtype):
            # this will raise if we have e.g. floats

            dtype = cast(np.dtype, dtype)
            # called for validation only; the actual conversion happens in
            # construct_1d_ndarray_preserving_na below
            maybe_cast_to_integer_array(arr, dtype)
            subarr = arr
        else:
            subarr = maybe_cast_to_datetime(arr, dtype)
            if dtype is not None and dtype.kind == "M":
                return subarr

        if not isinstance(subarr, ABCExtensionArray):
            subarr = construct_1d_ndarray_preserving_na(subarr,
                                                        dtype,
                                                        copy=copy)
    except OutOfBoundsDatetime:
        # in case of out of bound datetime64 -> always raise
        raise
    except (ValueError, TypeError) as err:
        if dtype is not None and raise_cast_failure:
            raise
        elif "Cannot cast" in str(err):
            # via _disallow_mismatched_datetimelike
            raise
        else:
            # object fallback; avoid copy=False, which raises on NumPy >= 2
            # whenever a copy is unavoidable
            if copy:
                subarr = np.array(arr, dtype=object)
            else:
                subarr = np.asarray(arr, dtype=object)
    return subarr
Exemple #5
0
    def __init__(
        self,
        index: Index,
        grouper=None,
        obj: FrameOrSeries | None = None,
        name: Hashable = None,
        level=None,
        sort: bool = True,
        observed: bool = False,
        in_axis: bool = False,
        dropna: bool = True,
    ):
        """
        Store the grouping options and resolve ``grouper`` into grouping values.

        Parameters
        ----------
        index : Index
            Index the grouping is aligned with.
        grouper : object, optional
            Grouping key; first passed through ``_convert_grouper``.
        obj : FrameOrSeries, optional
            Object being grouped. Replaced by the ``obj`` attribute of the
            key when the key is a ``Grouper``.
        name : Hashable, optional
            Name of the grouping. When None it is filled in from the passed
            Series/Index, from the index level name, or from the resolved
            grouper's ``result_index``, depending on the kind of key.
        level : int or level name, optional
            A non-int value is looked up in ``index.names``.
        sort, observed, in_axis, dropna : bool
            Stored as the same-named attributes. ``sort`` and ``observed``
            also drive how a categorical key is recoded and ordered.

        Raises
        ------
        AssertionError
            If a non-int ``level`` is not in ``index.names``, or if mapping a
            key over ``index`` yields something whose length differs from
            ``len(index)``.
        ValueError
            If a key that is not a Series/Index/ExtensionArray/ndarray
            reports an ``ndim`` other than 1.
        """
        self.name = name
        self.level = level
        self.grouper = _convert_grouper(index, grouper)
        self.all_grouper = None
        self.index = index
        self.sort = sort
        self.obj = obj
        self.observed = observed
        self.in_axis = in_axis
        self.dropna = dropna

        # right place for this?
        if name is None and isinstance(grouper, (Series, Index)):
            self.name = grouper.name

        # The single key can take many forms, and how it is interpreted may
        # depend on the level that was passed in.
        if level is not None:
            if not isinstance(level, int):
                # translate a level name into its position
                if level not in index.names:
                    raise AssertionError(f"Level {level} not in index")
                level = index.names.index(level)

            if self.name is None:
                self.name = index.names[level]

            level_parts = index._get_grouper_for_level(self.grouper, level)
            self.grouper, self._codes, self._group_index = level_parts

        elif isinstance(self.grouper, Grouper):
            # A Grouper-like key: resolve it the same way a single-key groupby
            # would. key/level were disambiguated before reaching this point,
            # hence validate=False.
            _, resolved, _ = self.grouper._get_grouper(
                # error: Value of type variable "FrameOrSeries" of "_get_grouper"
                # of "Grouper" cannot be "Optional[FrameOrSeries]"
                self.obj,  # type: ignore[type-var]
                validate=False,
            )
            if self.name is None:
                self.name = resolved.result_index.name
            # read obj off the Grouper before self.grouper is overwritten
            self.obj = self.grouper.obj
            self.grouper = resolved._get_grouper()

        elif is_categorical_dtype(self.grouper):
            # a Categorical was passed
            recoded = recode_for_groupby(self.grouper, self.sort, observed)
            self.grouper, self.all_grouper = recoded
            cats = self.grouper.categories

            # Build a CategoricalIndex from the categorical key, keeping its
            # categories / ordered attributes.
            self._codes = self.grouper.codes
            if observed:
                # only codes that occur, with the -1 code dropped
                group_codes = algorithms.unique1d(self.grouper.codes)
                group_codes = group_codes[group_codes != -1]
                if sort or self.grouper.ordered:
                    group_codes = np.sort(group_codes)
            else:
                group_codes = np.arange(len(cats))

            cat_values = Categorical.from_codes(
                codes=group_codes,
                categories=cats,
                ordered=self.grouper.ordered,
            )
            self._group_index = CategoricalIndex(cat_values, name=self.name)

        elif isinstance(self.grouper, Grouping):
            # already a Grouping: take over its grouper and we are done
            self.grouper = self.grouper.grouper

        elif not isinstance(self.grouper, (Series, Index, ExtensionArray, np.ndarray)):
            # no level passed and the key is not array-like: map it over index
            if getattr(self.grouper, "ndim", 1) != 1:
                label = self.name or str(type(self.grouper))
                raise ValueError(f"Grouper for '{label}' not 1-dimensional")

            self.grouper = self.index.map(self.grouper)

            if (
                not hasattr(self.grouper, "__len__")
                or len(self.grouper) != len(self.index)
            ):
                shown = pprint_thing(self.grouper)
                self.grouper = None  # Try for sanity
                raise AssertionError(
                    "Grouper result violates len(labels) == "
                    f"len(data)\nresult: {shown}"
                )

        if isinstance(self.grouper, np.ndarray):
            # date/time-like ndarray keys are normalized so that we end up
            # with Timestamp-like values
            self.grouper = sanitize_to_nanoseconds(self.grouper)