def sanitize_index(data, index: Index):
    """
    Check ``data`` against ``index`` for length and coerce ndarray input.

    Parameters
    ----------
    data : sized object
        Values to validate; anything supporting ``len``.
    index : Index
        Only its length is consulted.

    Returns
    -------
    object
        ``data`` unchanged when it is not an ``np.ndarray``; otherwise the
        result of ``sanitize_to_nanoseconds(data)``.

    Raises
    ------
    ValueError
        If ``len(data)`` differs from ``len(index)``.
    """
    n_values = len(data)
    n_labels = len(index)
    if n_values != n_labels:
        raise ValueError(
            f"Length of values ({n_values}) "
            f"does not match length of index ({n_labels})"
        )

    # Only raw ndarrays need the extra pass; everything else is returned as-is.
    if not isinstance(data, np.ndarray):
        return data

    # coerce datetimelike types to ns
    return sanitize_to_nanoseconds(data)
def __init__(
    self,
    index: Index,
    grouper=None,
    obj: FrameOrSeries | None = None,
    level=None,
    sort: bool = True,
    observed: bool = False,
    in_axis: bool = False,
    dropna: bool = True,
) -> None:
    """
    Store the constructor arguments and normalize ``self.grouper``.

    After saving the arguments on the instance, ``self.grouper`` (the result
    of ``_convert_grouper(index, grouper)``) is rewritten by the first
    matching case:

    * ``self._ilevel`` is not None: grouper, ``_codes`` and ``_group_index``
      are taken from ``index._get_grouper_for_level``.
    * it is a ``Grouper``: it is resolved through its ``_get_grouper`` and
      ``self.obj`` is replaced by the object that call returns.
    * it has a categorical dtype: ``recode_for_groupby`` supplies both
      ``self.grouper`` and ``self.all_grouper``.
    * it is not a Series/Index/ExtensionArray/ndarray: it is mapped over
      ``index`` with ``Index.map`` and the result is length-checked.

    Finally, an ndarray grouper is passed through
    ``sanitize_to_nanoseconds``.

    Parameters
    ----------
    index : Index
    grouper : object, optional
        The original value is kept in ``self._orig_grouper``.
    obj : FrameOrSeries, optional
    level : object, optional
        Stored as ``self.level``; this method reads ``self._ilevel`` rather
        than ``level`` itself.
    sort : bool, default True
    observed : bool, default False
    in_axis : bool, default False
        Stored only; not otherwise used here.
    dropna : bool, default True
        Stored only; not otherwise used here.

    Raises
    ------
    ValueError
        If a grouper that has to be mapped reports ``ndim != 1``.
    AssertionError
        If mapping the grouper over ``index`` yields something without
        ``__len__`` or with a length different from ``len(index)``.
    """
    self.level = level
    self._orig_grouper = grouper
    self.grouper = _convert_grouper(index, grouper)
    self.all_grouper = None
    self.index = index
    self.sort = sort
    self.obj = obj
    self.observed = observed
    self.in_axis = in_axis
    self.dropna = dropna

    # flipped to True only in the categorical branch below
    self._passed_categorical = False

    # we have a single grouper which may be a myriad of things,
    # some of which are dependent on the passing in level

    # NOTE(review): _ilevel is defined elsewhere on the class; presumably
    # the integer position corresponding to self.level -- confirm.
    ilevel = self._ilevel
    if ilevel is not None:
        (
            self.grouper,  # Index
            self._codes,
            self._group_index,
        ) = index._get_grouper_for_level(self.grouper, ilevel)

    # a passed Grouper like, directly get the grouper in the same way
    # as single grouper groupby, use the group_info to get codes
    elif isinstance(self.grouper, Grouper):
        # get the new grouper; we already have disambiguated
        # what key/level refer to exactly, don't need to
        # check again as we have by this point converted these
        # to an actual value (rather than a pd.Grouper)
        _, newgrouper, newobj = self.grouper._get_grouper(
            # error: Value of type variable "FrameOrSeries" of "_get_grouper"
            # of "Grouper" cannot be "Optional[FrameOrSeries]"
            self.obj,  # type: ignore[type-var]
            validate=False,
        )
        # keep the object handed back by the Grouper, not the one passed in
        self.obj = newobj

        ng = newgrouper._get_grouper()
        if isinstance(newgrouper, ops.BinGrouper):
            # in this case we have `ng is newgrouper`
            self.grouper = ng
        else:
            # ops.BaseGrouper
            # use Index instead of ndarray so we can recover the name
            self.grouper = Index(ng, name=newgrouper.result_index.name)

    elif is_categorical_dtype(self.grouper):
        # a passed Categorical
        self._passed_categorical = True

        self.grouper, self.all_grouper = recode_for_groupby(
            self.grouper, self.sort, observed
        )

    elif not isinstance(self.grouper, (Series, Index, ExtensionArray, np.ndarray)):
        # no level passed
        # objects without an ``ndim`` attribute are treated as 1-dimensional
        if getattr(self.grouper, "ndim", 1) != 1:
            # self.name is not assigned in this method; it is expected to be
            # provided elsewhere on the class
            t = self.name or str(type(self.grouper))
            raise ValueError(f"Grouper for '{t}' not 1-dimensional")

        # apply the grouper to every index label
        self.grouper = self.index.map(self.grouper)

        # the mapped result must be sized and line up with the index
        if not (
            hasattr(self.grouper, "__len__")
            and len(self.grouper) == len(self.index)
        ):
            grper = pprint_thing(self.grouper)
            errmsg = (
                "Grouper result violates len(labels) == "
                f"len(data)\nresult: {grper}"
            )
            self.grouper = None  # Try for sanity
            raise AssertionError(errmsg)

    if isinstance(self.grouper, np.ndarray):
        # if we have a date/time-like grouper, make sure that we have
        # Timestamps like
        self.grouper = sanitize_to_nanoseconds(self.grouper)
def _try_cast(
    arr: list | np.ndarray,
    dtype: DtypeObj | None,
    copy: bool,
    raise_cast_failure: bool,
) -> ArrayLike:
    """
    Convert input to numpy ndarray and optionally cast to a given dtype.

    Parameters
    ----------
    arr : ndarray or list
        Excludes: ExtensionArray, Series, Index.
    dtype : np.dtype, ExtensionDtype or None
    copy : bool
        If False, don't copy the data if not needed.
    raise_cast_failure : bool
        If True, and if a dtype is specified, raise errors during casting.
        Otherwise an object array is returned.

    Returns
    -------
    np.ndarray or ExtensionArray

    Raises
    ------
    ValueError, TypeError
        Re-raised from the final numeric cast when ``raise_cast_failure`` is
        True.

    Warns
    -----
    FutureWarning
        When the final numeric cast fails and ``raise_cast_failure`` is
        False, before falling back to an object array.

    Notes
    -----
    "Copy only if needed" conversions use ``np.asarray`` instead of
    ``np.array(..., copy=False)``. Since NumPy 2.0 the latter raises
    ``ValueError`` whenever a copy cannot be avoided (always the case for
    list input), and inside the ``try`` block below that error would be
    mistaken for a cast failure.
    """
    is_ndarray = isinstance(arr, np.ndarray)

    if dtype is None:
        # perf shortcut as this is the most common case
        if is_ndarray:
            arr = cast(np.ndarray, arr)
            if arr.dtype != object:
                return sanitize_to_nanoseconds(arr, copy=copy)

            out = maybe_infer_to_datetimelike(arr)
            if out is arr and copy:
                # inference returned the input itself; honor the copy request
                out = out.copy()
            return out

        else:
            # i.e. list
            # asarray: building from a list always allocates, which
            # np.array(..., copy=False) rejects on NumPy >= 2.0
            varr = np.asarray(arr)
            # filter out cases that we _dont_ want to go through
            # maybe_infer_to_datetimelike
            if varr.dtype != object or varr.size == 0:
                return varr
            return maybe_infer_to_datetimelike(varr)

    elif isinstance(dtype, ExtensionDtype):
        # create an extension array from its dtype
        if isinstance(dtype, DatetimeTZDtype):
            # We can't go through _from_sequence because it handles dt64naive
            # data differently; _from_sequence treats naive as wall times,
            # while maybe_cast_to_datetime treats it as UTC
            # see test_maybe_promote_any_numpy_dtype_with_datetimetz
            # TODO(2.0): with deprecations enforced, should be able to remove
            # special case.
            return maybe_cast_to_datetime(arr, dtype)
            # TODO: copy?

        array_type = dtype.construct_array_type()._from_sequence
        subarr = array_type(arr, dtype=dtype, copy=copy)
        return subarr

    elif is_object_dtype(dtype):
        if not is_ndarray:
            subarr = construct_1d_object_array_from_listlike(arr)
            return subarr
        return ensure_wrapped_if_datetimelike(arr).astype(dtype, copy=copy)

    elif dtype.kind == "U":
        # TODO: test cases with arr.dtype.kind in ["m", "M"]
        return lib.ensure_string_array(arr, convert_na_value=False, copy=copy)

    elif dtype.kind in ["m", "M"]:
        return maybe_cast_to_datetime(arr, dtype)

    try:
        # GH#15832: Check if we are requesting a numeric dtype and
        # that we can convert the data to the requested dtype.
        if is_integer_dtype(dtype):
            # this will raise if we have e.g. floats
            subarr = maybe_cast_to_integer_array(arr, dtype)
        elif copy:
            # 4 tests fail if we move this to a try/except/else; see
            # test_constructor_compound_dtypes, test_constructor_cast_failure
            # test_constructor_dict_cast2, test_loc_setitem_dtype
            subarr = np.array(arr, dtype=dtype, copy=True)
        else:
            # copy only when the conversion requires it; an unavoidable copy
            # must not surface as a ValueError that looks like a cast failure
            subarr = np.asarray(arr, dtype=dtype)

    except (ValueError, TypeError):
        if raise_cast_failure:
            raise
        else:
            # we only get here with raise_cast_failure False, which means
            # called via the DataFrame constructor
            # GH#24435
            warnings.warn(
                f"Could not cast to {dtype}, falling back to object. This "
                "behavior is deprecated. In a future version, when a dtype is "
                "passed to 'DataFrame', either all columns will be cast to that "
                "dtype, or a TypeError will be raised.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
            if copy:
                subarr = np.array(arr, dtype=object, copy=True)
            else:
                subarr = np.asarray(arr, dtype=object)
    return subarr
def _try_cast(
    arr: list | np.ndarray,
    dtype: DtypeObj | None,
    copy: bool,
    raise_cast_failure: bool,
) -> ArrayLike:
    """
    Convert input to numpy ndarray and optionally cast to a given dtype.

    Parameters
    ----------
    arr : ndarray or list
        Excludes: ExtensionArray, Series, Index.
    dtype : np.dtype, ExtensionDtype or None
    copy : bool
        If False, don't copy the data if not needed.
    raise_cast_failure : bool
        If True, and if a dtype is specified, raise errors during casting.
        Otherwise an object array is returned.

    Returns
    -------
    np.ndarray or ExtensionArray

    Raises
    ------
    OutOfBoundsDatetime
        Always propagated from the cast.
    ValueError, TypeError
        Re-raised when a dtype was given and ``raise_cast_failure`` is True,
        or when the error message contains "Cannot cast".

    Notes
    -----
    "Copy only if needed" conversions use ``np.asarray`` instead of
    ``np.array(..., copy=False)``. Since NumPy 2.0 the latter raises
    ``ValueError`` whenever a copy cannot be avoided, which is always the
    case for list input.
    """
    is_ndarray = isinstance(arr, np.ndarray)

    # perf shortcut as this is the most common case
    # Item "List[Any]" of "Union[List[Any], ndarray]" has no attribute "dtype"
    if (
        is_ndarray
        and arr.dtype != object  # type: ignore[union-attr]
        and not copy
        and dtype is None
    ):
        # Argument 1 to "sanitize_to_nanoseconds" has incompatible type
        # "Union[List[Any], ndarray]"; expected "ndarray"
        return sanitize_to_nanoseconds(arr)  # type: ignore[arg-type]

    if isinstance(dtype, ExtensionDtype):
        # create an extension array from its dtype
        # DatetimeTZ case needs to go through maybe_cast_to_datetime but
        # SparseDtype does not
        if isinstance(dtype, DatetimeTZDtype):
            # We can't go through _from_sequence because it handles dt64naive
            # data differently; _from_sequence treats naive as wall times,
            # while maybe_cast_to_datetime treats it as UTC
            # see test_maybe_promote_any_numpy_dtype_with_datetimetz

            # error: Incompatible return value type (got "Union[ExtensionArray,
            # ndarray, List[Any]]", expected "Union[ExtensionArray, ndarray]")
            return maybe_cast_to_datetime(arr, dtype)  # type: ignore[return-value]
            # TODO: copy?

        array_type = dtype.construct_array_type()._from_sequence
        subarr = array_type(arr, dtype=dtype, copy=copy)
        return subarr

    elif is_object_dtype(dtype):
        if not is_ndarray:
            subarr = construct_1d_object_array_from_listlike(arr)
            return subarr
        return ensure_wrapped_if_datetimelike(arr).astype(dtype, copy=copy)

    elif dtype is None and not is_ndarray:
        # filter out cases that we _dont_ want to go through maybe_cast_to_datetime
        # asarray: building from a list always allocates, which
        # np.array(..., copy=False) rejects on NumPy >= 2.0
        varr = np.asarray(arr)
        if varr.dtype != object or varr.size == 0:
            return varr
        # error: Incompatible return value type (got "Union[ExtensionArray,
        # ndarray, List[Any]]", expected "Union[ExtensionArray, ndarray]")
        return maybe_cast_to_datetime(varr, None)  # type: ignore[return-value]

    try:
        # GH#15832: Check if we are requesting a numeric dtype and
        # that we can convert the data to the requested dtype.
        if is_integer_dtype(dtype):
            # this will raise if we have e.g. floats

            dtype = cast(np.dtype, dtype)
            # called for validation only: its result is discarded and the
            # untouched input is what gets converted below
            maybe_cast_to_integer_array(arr, dtype)
            subarr = arr
        else:
            subarr = maybe_cast_to_datetime(arr, dtype)
            if dtype is not None and dtype.kind == "M":
                return subarr

        if not isinstance(subarr, ABCExtensionArray):
            subarr = construct_1d_ndarray_preserving_na(subarr, dtype, copy=copy)

    except OutOfBoundsDatetime:
        # in case of out of bound datetime64 -> always raise
        raise
    except (ValueError, TypeError) as err:
        if dtype is not None and raise_cast_failure:
            raise
        elif "Cannot cast" in str(err):
            # via _disallow_mismatched_datetimelike
            raise
        else:
            # object fallback; with copy=False NumPy >= 2.0 would raise for
            # list input instead of copying on demand
            if copy:
                subarr = np.array(arr, dtype=object, copy=True)
            else:
                subarr = np.asarray(arr, dtype=object)
    return subarr
def __init__(
    self,
    index: Index,
    grouper=None,
    obj: FrameOrSeries | None = None,
    name: Hashable = None,
    level=None,
    sort: bool = True,
    observed: bool = False,
    in_axis: bool = False,
    dropna: bool = True,
) -> None:
    """
    Store the constructor arguments and normalize ``self.grouper``.

    After saving the arguments on the instance, ``self.grouper`` (the result
    of ``_convert_grouper(index, grouper)``) is rewritten by the first
    matching case:

    * ``level`` is not None: a non-int ``level`` is translated to its
      position in ``index.names``; grouper, ``_codes`` and ``_group_index``
      are then taken from ``index._get_grouper_for_level``.
    * it is a ``Grouper``: it is resolved through its ``_get_grouper``;
      ``self.obj`` is taken from the Grouper's ``obj`` attribute.
    * it has a categorical dtype: it is recoded with
      ``recode_for_groupby`` and ``_codes`` / ``_group_index`` are built
      from the recoded categorical.
    * it is a ``Grouping``: its own ``grouper`` attribute is used.
    * it is not a Series/Index/ExtensionArray/ndarray: it is mapped over
      ``index`` with ``Index.map`` and the result is length-checked.

    Finally, an ndarray grouper is passed through
    ``sanitize_to_nanoseconds``.

    Parameters
    ----------
    index : Index
    grouper : object, optional
    obj : FrameOrSeries, optional
    name : Hashable, optional
        When None, filled from the passed Series/Index ``grouper``, from
        ``index.names[level]``, or from the resolved Grouper's
        ``result_index.name``, whichever applies first.
    level : int or label, optional
    sort : bool, default True
    observed : bool, default False
    in_axis : bool, default False
        Stored only; not otherwise used here.
    dropna : bool, default True
        Stored only; not otherwise used here.

    Raises
    ------
    AssertionError
        If a non-int ``level`` is not in ``index.names``, or if mapping the
        grouper over ``index`` yields something without ``__len__`` or with
        a length different from ``len(index)``.
    ValueError
        If a grouper that has to be mapped reports ``ndim != 1``.
    """
    self.name = name
    self.level = level
    self.grouper = _convert_grouper(index, grouper)
    self.all_grouper = None
    self.index = index
    self.sort = sort
    self.obj = obj
    self.observed = observed
    self.in_axis = in_axis
    self.dropna = dropna

    # right place for this?
    # inherit the name from the *original* argument, not the converted one
    if isinstance(grouper, (Series, Index)) and name is None:
        self.name = grouper.name

    # we have a single grouper which may be a myriad of things,
    # some of which are dependent on the passing in level

    if level is not None:
        # translate a level label into its integer position
        if not isinstance(level, int):
            if level not in index.names:
                raise AssertionError(f"Level {level} not in index")
            level = index.names.index(level)

        if self.name is None:
            self.name = index.names[level]

        (
            self.grouper,
            self._codes,
            self._group_index,
        ) = index._get_grouper_for_level(self.grouper, level)

    # a passed Grouper like, directly get the grouper in the same way
    # as single grouper groupby, use the group_info to get codes
    elif isinstance(self.grouper, Grouper):
        # get the new grouper; we already have disambiguated
        # what key/level refer to exactly, don't need to
        # check again as we have by this point converted these
        # to an actual value (rather than a pd.Grouper)
        # NOTE: from here on the local name ``grouper`` refers to the
        # resolved grouper, shadowing the constructor argument
        _, grouper, _ = self.grouper._get_grouper(
            # error: Value of type variable "FrameOrSeries" of "_get_grouper"
            # of "Grouper" cannot be "Optional[FrameOrSeries]"
            self.obj,  # type: ignore[type-var]
            validate=False,
        )
        if self.name is None:
            self.name = grouper.result_index.name
        # the third value returned above is discarded; obj is read back from
        # the Grouper instance instead
        self.obj = self.grouper.obj
        self.grouper = grouper._get_grouper()

    else:
        # a passed Categorical
        if is_categorical_dtype(self.grouper):

            self.grouper, self.all_grouper = recode_for_groupby(
                self.grouper, self.sort, observed)
            categories = self.grouper.categories

            # we make a CategoricalIndex out of the cat grouper
            # preserving the categories / ordered attributes
            self._codes = self.grouper.codes
            if observed:
                # only codes that actually occur, minus the -1 entries
                codes = algorithms.unique1d(self.grouper.codes)
                codes = codes[codes != -1]
                if sort or self.grouper.ordered:
                    codes = np.sort(codes)
            else:
                # one code per category, whether or not it occurs
                codes = np.arange(len(categories))

            self._group_index = CategoricalIndex(
                Categorical.from_codes(codes=codes, categories=categories, ordered=self.grouper.ordered),
                name=self.name,
            )

        # we are done
        elif isinstance(self.grouper, Grouping):
            self.grouper = self.grouper.grouper

        # no level passed
        elif not isinstance(self.grouper, (Series, Index, ExtensionArray, np.ndarray)):
            # objects without an ``ndim`` attribute are treated as
            # 1-dimensional
            if getattr(self.grouper, "ndim", 1) != 1:
                t = self.name or str(type(self.grouper))
                raise ValueError(f"Grouper for '{t}' not 1-dimensional")
            # apply the grouper to every index label
            self.grouper = self.index.map(self.grouper)
            # the mapped result must be sized and line up with the index
            if not (hasattr(self.grouper, "__len__") and len(self.grouper) == len(self.index)):
                grper = pprint_thing(self.grouper)
                errmsg = ("Grouper result violates len(labels) == "
                          f"len(data)\nresult: {grper}")
                self.grouper = None  # Try for sanity
                raise AssertionError(errmsg)

    if isinstance(self.grouper, np.ndarray):
        # if we have a date/time-like grouper, make sure that we have
        # Timestamps like
        self.grouper = sanitize_to_nanoseconds(self.grouper)