def _isna_array(values: ArrayLike, inf_as_na: bool = False):
    """
    Return an array indicating which values of the input array are NaN / NA.

    Parameters
    ----------
    values : ndarray or ExtensionArray
        The input array whose elements are to be checked.
    inf_as_na : bool
        Whether or not to treat infinite values as NA.

    Returns
    -------
    array-like
        Array of boolean values denoting the NA status of each element.
    """
    dtype = values.dtype
    if is_extension_array_dtype(dtype):
        if inf_as_na and is_categorical_dtype(dtype):
            result = libmissing.isnaobj_old(values.to_numpy())
        else:
            result = values.isna()
    elif is_string_dtype(dtype):
        result = _isna_string_dtype(values, dtype, inf_as_na=inf_as_na)
    elif needs_i8_conversion(dtype):
        # this is the NaT pattern
        result = values.view("i8") == iNaT
    else:
        if inf_as_na:
            result = ~np.isfinite(values)
        else:
            result = np.isnan(values)

    return result
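# Usage sketch (hedged): pd.isna is the public entry point that dispatches to
# this private helper for array inputs; the examples below rely only on the
# documented public behavior, not on the internal dispatch path.
import numpy as np
import pandas as pd

print(pd.isna(np.array([1.0, np.nan, np.inf])))    # [False  True False] -- inf is not NA by default
print(pd.isna(pd.Categorical(["a", None, "b"])))   # [False  True False] -- EA path uses values.isna()
print(pd.isna(np.array(["2021-01-01", "NaT"], dtype="M8[ns]")))  # [False  True] -- the NaT / i8 pattern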
def ensure_int_or_float(arr: ArrayLike, copy: bool = False) -> np.ndarray:
    """
    Ensure that an array of some integer dtype has an int64 dtype if
    possible. If it's not possible, potentially because of overflow, convert
    the array to float64 instead.

    Parameters
    ----------
    arr : array-like
        The array whose data type we want to enforce.
    copy : bool
        Whether to copy the original array or reuse it in place, if possible.

    Returns
    -------
    out_arr : The input array cast as int64 if possible without overflow.
        Otherwise the input array cast to float64.

    Notes
    -----
    If the array is explicitly of type uint64 the type will remain unchanged.
    """
    # TODO: GH27506 potential bug with ExtensionArrays
    try:
        return arr.astype("int64", copy=copy, casting="safe")  # type: ignore
    except TypeError:
        pass
    try:
        return arr.astype("uint64", copy=copy, casting="safe")  # type: ignore
    except TypeError:
        return arr.astype("float64", copy=copy)
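# Sketch of the numpy casting rule this helper relies on (no pandas internals
# assumed): casting="safe" raises TypeError when the target dtype cannot
# represent every value of the source dtype, which is what drives the
# int64 -> uint64 -> float64 fallback chain above.
import numpy as np

print(np.array([1, 2, 3], dtype="int32").astype("int64", casting="safe").dtype)  # int64

big = np.array([2**63], dtype="uint64")
try:
    big.astype("int64", casting="safe")
except TypeError:
    # uint64 cannot safely become int64, so the helper falls through and
    # keeps uint64 (see the Notes section above)
    print(big.astype("uint64", casting="safe").dtype)  # uint64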
def cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
    """
    Helper function for `arr.astype(common_dtype)` but handling all special
    cases.
    """
    if is_dtype_equal(arr.dtype, dtype):
        return arr
    if (
        is_categorical_dtype(arr.dtype)
        and isinstance(dtype, np.dtype)
        and np.issubdtype(dtype, np.integer)
    ):
        # problem case: categorical of int -> gives int as result dtype,
        # but categorical can contain NAs -> fall back to object dtype
        try:
            return arr.astype(dtype, copy=False)
        except ValueError:
            return arr.astype(object, copy=False)

    if is_sparse(arr) and not is_sparse(dtype):
        # problem case: SparseArray.astype(dtype) doesn't follow the specified
        # dtype exactly, but converts this to Sparse[dtype] -> first manually
        # convert to dense array

        # error: Argument 1 to "astype" of "_ArrayOrScalarCommon" has incompatible type
        # "Union[dtype[Any], ExtensionDtype]"; expected "Union[dtype[Any], None, type, _
        # SupportsDType[dtype[Any]], str, Union[Tuple[Any, int], Tuple[Any,
        # Union[SupportsIndex, Sequence[SupportsIndex]]], List[Any], _DTypeDict,
        # Tuple[Any, Any]]]"  [arg-type]
        arr = cast("SparseArray", arr)
        return arr.to_dense().astype(dtype, copy=False)  # type: ignore[arg-type]

    # astype_array includes ensure_wrapped_if_datetimelike
    return astype_array(arr, dtype=dtype, copy=False)
def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
    """
    Helper function for `arr.astype(common_dtype)` but handling all special
    cases.
    """
    if (
        is_categorical_dtype(arr.dtype)
        and isinstance(dtype, np.dtype)
        and np.issubdtype(dtype, np.integer)
    ):
        # problem case: categorical of int -> gives int as result dtype,
        # but categorical can contain NAs -> fall back to object dtype
        try:
            return arr.astype(dtype, copy=False)
        except ValueError:
            return arr.astype(object, copy=False)

    if is_sparse(arr) and not is_sparse(dtype):
        # problem case: SparseArray.astype(dtype) doesn't follow the specified
        # dtype exactly, but converts this to Sparse[dtype] -> first manually
        # convert to dense array
        arr = cast(SparseArray, arr)
        return arr.to_dense().astype(dtype, copy=False)

    if (
        isinstance(arr, np.ndarray)
        and arr.dtype.kind in ["m", "M"]
        and dtype is np.dtype("object")
    ):
        # wrap datetime-likes in EA to ensure astype(object) gives Timestamp/Timedelta
        # this can happen when concat_compat is called directly on arrays (when arrays
        # are not coming from Index/Series._values), eg in BlockManager.quantile
        arr = array(arr)

    if is_extension_array_dtype(dtype):
        if isinstance(arr, np.ndarray):
            # numpy's astype cannot handle ExtensionDtypes
            return array(arr, dtype=dtype, copy=False)
    return arr.astype(dtype, copy=False)
def take_1d(
    arr: ArrayLike,
    indexer: npt.NDArray[np.intp],
    fill_value=None,
    allow_fill: bool = True,
    mask: npt.NDArray[np.bool_] | None = None,
) -> ArrayLike:
    """
    Specialized version for 1D arrays. Differences compared to `take_nd`:

    - Assumes input array has already been converted to numpy array / EA
    - Assumes indexer is already guaranteed to be intp dtype ndarray
    - Only works for 1D arrays

    To ensure the lowest possible overhead.

    Note: similarly to `take_nd`, this function assumes that the indexer is
    a valid(ated) indexer with no out of bound indices.

    Parameters
    ----------
    arr : np.ndarray or ExtensionArray
        Input array.
    indexer : ndarray
        1-D array of indices to take (validated indices, intp dtype).
    fill_value : any, default np.nan
        Fill value to replace -1 values with
    allow_fill : bool, default True
        If False, indexer is assumed to contain no -1 values so no filling
        will be done. This short-circuits computation of a mask. Result is
        undefined if allow_fill == False and -1 is present in indexer.
    mask : np.ndarray, optional, default None
        If `allow_fill` is True, and the mask (where indexer == -1) is already
        known, it can be passed to avoid recomputation.
    """
    if not isinstance(arr, np.ndarray):
        # ExtensionArray -> dispatch to their method
        return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)

    if not allow_fill:
        return arr.take(indexer)

    dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value(
        arr, indexer, fill_value, True, mask
    )

    # at this point, it's guaranteed that dtype can hold both the arr values
    # and the fill_value
    out = np.empty(indexer.shape, dtype=dtype)

    func = _get_take_nd_function(
        arr.ndim, arr.dtype, out.dtype, axis=0, mask_info=mask_info
    )
    func(arr, indexer, out, fill_value)

    return out
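# Usage sketch (hedged): take_1d itself is internal and its import path has
# moved between pandas versions, so this shows the identical -1 / allow_fill
# semantics through the public ExtensionArray.take contract instead.
import numpy as np
import pandas as pd

arr = pd.array([10, 20, 30], dtype="Int64")
indexer = np.array([2, -1, 0], dtype=np.intp)

print(arr.take(indexer, allow_fill=True))   # [30, <NA>, 10] -- -1 means "fill"
print(arr.take(indexer, allow_fill=False))  # [30, 30, 10]   -- -1 means "last element"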
def _isna_array(values: ArrayLike, inf_as_na: bool = False):
    """
    Return an array indicating which values of the input array are NaN / NA.

    Parameters
    ----------
    values : ndarray or ExtensionArray
        The input array whose elements are to be checked.
    inf_as_na : bool
        Whether or not to treat infinite values as NA.

    Returns
    -------
    array-like
        Array of boolean values denoting the NA status of each element.
    """
    dtype = values.dtype

    if is_extension_array_dtype(dtype):
        if inf_as_na and is_categorical_dtype(dtype):
            # error: Item "ndarray" of "Union[ExtensionArray, ndarray]" has no attribute
            # "to_numpy"
            result = libmissing.isnaobj_old(
                values.to_numpy()  # type: ignore[union-attr]
            )
        else:
            # error: Item "ndarray" of "Union[ExtensionArray, ndarray]" has no attribute
            # "isna"
            result = values.isna()  # type: ignore[union-attr]
    elif is_string_dtype(dtype):
        # error: Argument 1 to "_isna_string_dtype" has incompatible type
        # "ExtensionArray"; expected "ndarray"
        # error: Argument 2 to "_isna_string_dtype" has incompatible type
        # "ExtensionDtype"; expected "dtype[Any]"
        result = _isna_string_dtype(
            values, dtype, inf_as_na=inf_as_na  # type: ignore[arg-type]
        )
    elif needs_i8_conversion(dtype):
        # this is the NaT pattern
        result = values.view("i8") == iNaT
    else:
        if inf_as_na:
            # error: Argument 1 to "__call__" of "ufunc" has incompatible type
            # "ExtensionArray"; expected "Union[Union[int, float, complex, str, bytes,
            # generic], Sequence[Union[int, float, complex, str, bytes, generic]],
            # Sequence[Sequence[Any]], _SupportsArray]"
            result = ~np.isfinite(values)  # type: ignore[arg-type]
        else:
            # error: Argument 1 to "__call__" of "ufunc" has incompatible type
            # "ExtensionArray"; expected "Union[Union[int, float, complex, str, bytes,
            # generic], Sequence[Union[int, float, complex, str, bytes, generic]],
            # Sequence[Sequence[Any]], _SupportsArray]"
            result = np.isnan(values)  # type: ignore[arg-type]

    return result
def take_nd(
    arr: ArrayLike,
    indexer,
    axis: int = 0,
    fill_value=lib.no_default,
    allow_fill: bool = True,
) -> ArrayLike:
    """
    Specialized Cython take which sets NaN values in one pass

    This dispatches to ``take`` defined on ExtensionArrays. It does not
    currently dispatch to ``SparseArray.take`` for sparse ``arr``.

    Note: this function assumes that the indexer is a valid(ated) indexer with
    no out of bound indices.

    Parameters
    ----------
    arr : np.ndarray or ExtensionArray
        Input array.
    indexer : ndarray
        1-D array of indices to take, subarrays corresponding to -1 value
        indices are filled with fill_value
    axis : int, default 0
        Axis to take from
    fill_value : any, default np.nan
        Fill value to replace -1 values with
    allow_fill : bool, default True
        If False, indexer is assumed to contain no -1 values so no filling
        will be done. This short-circuits computation of a mask. Result is
        undefined if allow_fill == False and -1 is present in indexer.

    Returns
    -------
    subarray : np.ndarray or ExtensionArray
        May be the same type as the input, or cast to an ndarray.
    """
    if fill_value is lib.no_default:
        fill_value = na_value_for_dtype(arr.dtype, compat=False)

    if not isinstance(arr, np.ndarray):
        # i.e. ExtensionArray,
        # includes for EA to catch DatetimeArray, TimedeltaArray
        if not is_1d_only_ea_obj(arr):
            # i.e. DatetimeArray, TimedeltaArray
            arr = cast("NDArrayBackedExtensionArray", arr)
            return arr.take(
                indexer, fill_value=fill_value, allow_fill=allow_fill, axis=axis
            )
        return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)

    arr = np.asarray(arr)
    return _take_nd_ndarray(arr, indexer, axis, fill_value, allow_fill)
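# Usage sketch, assuming take_nd is importable from pandas.core.array_algos.take
# (an internal module whose location has changed between pandas versions):
# with the default fill_value, an int64 array is upcast to float64 so that
# -1 positions can be filled with NaN.
import numpy as np
from pandas.core.array_algos.take import take_nd  # internal API, may move

arr = np.array([1, 2, 3], dtype="int64")
out = take_nd(arr, np.array([0, -1, 2], dtype=np.intp))
print(out)        # [ 1. nan  3.]
print(out.dtype)  # float64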
def take_1d( arr: ArrayLike, indexer: np.ndarray, fill_value=None, allow_fill: bool = True, ) -> ArrayLike: """ Specialized version for 1D arrays. Differences compared to `take_nd`: - Assumes input array has already been converted to numpy array / EA - Assumes indexer is already guaranteed to be int64 dtype ndarray - Only works for 1D arrays To ensure the lowest possible overhead. Note: similarly to `take_nd`, this function assumes that the indexer is a valid(ated) indexer with no out of bound indices. TODO(ArrayManager): mainly useful for ArrayManager, otherwise can potentially be removed again if we don't end up with ArrayManager. """ if not isinstance(arr, np.ndarray): # ExtensionArray -> dispatch to their method # error: Argument 1 to "take" of "ExtensionArray" has incompatible type # "ndarray"; expected "Sequence[int]" return arr.take( indexer, # type: ignore[arg-type] fill_value=fill_value, allow_fill=allow_fill, ) if not allow_fill: return arr.take(indexer) indexer, dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value( arr, indexer, None, fill_value, allow_fill) # at this point, it's guaranteed that dtype can hold both the arr values # and the fill_value out = np.empty(indexer.shape, dtype=dtype) func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=0, mask_info=mask_info) func(arr, indexer, out, fill_value) return out
def cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
    """
    Helper function for `arr.astype(common_dtype)` but handling all special
    cases.
    """
    if is_dtype_equal(arr.dtype, dtype):
        return arr
    if (
        is_categorical_dtype(arr.dtype)
        and isinstance(dtype, np.dtype)
        and np.issubdtype(dtype, np.integer)
    ):
        # problem case: categorical of int -> gives int as result dtype,
        # but categorical can contain NAs -> fall back to object dtype
        try:
            return arr.astype(dtype, copy=False)
        except ValueError:
            return arr.astype(object, copy=False)

    if is_sparse(arr) and not is_sparse(dtype):
        # problem case: SparseArray.astype(dtype) doesn't follow the specified
        # dtype exactly, but converts this to Sparse[dtype] -> first manually
        # convert to dense array

        # error: Argument 1 to "astype" of "_ArrayOrScalarCommon" has incompatible type
        # "Union[dtype[Any], ExtensionDtype]"; expected "Union[dtype[Any], None, type, _
        # SupportsDType[dtype[Any]], str, Union[Tuple[Any, int], Tuple[Any,
        # Union[SupportsIndex, Sequence[SupportsIndex]]], List[Any], _DTypeDict,
        # Tuple[Any, Any]]]"  [arg-type]
        arr = cast(SparseArray, arr)
        return arr.to_dense().astype(dtype, copy=False)  # type: ignore[arg-type]

    if (
        isinstance(arr, np.ndarray)
        and arr.dtype.kind in ["m", "M"]
        and dtype is np.dtype("object")
    ):
        # wrap datetime-likes in EA to ensure astype(object) gives Timestamp/Timedelta
        # this can happen when concat_compat is called directly on arrays (when arrays
        # are not coming from Index/Series._values), eg in BlockManager.quantile
        arr = ensure_wrapped_if_datetimelike(arr)

    if isinstance(dtype, ExtensionDtype):
        if isinstance(arr, np.ndarray):
            # numpy's astype cannot handle ExtensionDtypes
            return pd_array(arr, dtype=dtype, copy=False)
        return arr.astype(dtype, copy=False)

    return arr.astype(dtype, copy=False)
def is_inferred_bool_dtype(arr: ArrayLike) -> bool:
    """
    Check if this is a ndarray[bool] or an ndarray[object] of bool objects.

    Parameters
    ----------
    arr : np.ndarray or ExtensionArray

    Returns
    -------
    bool

    Notes
    -----
    This does not include the special treatment is_bool_dtype uses for
    Categorical.
    """
    if not isinstance(arr, np.ndarray):
        return False

    dtype = arr.dtype
    if dtype == np.dtype(bool):
        return True
    elif dtype == np.dtype("object"):
        return lib.is_bool_array(arr.ravel("K"))

    return False
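# The interesting branch above is the object-dtype one: lib.is_bool_array does
# an element-wise type check rather than a dtype comparison. Minimal sketch
# calling that helper directly (pandas._libs.lib is semi-internal).
import numpy as np
from pandas._libs import lib

print(lib.is_bool_array(np.array([True, False], dtype=object)))  # True  -- all elements are bools
print(lib.is_bool_array(np.array([True, 1], dtype=object)))      # False -- 1 is an int, not a bool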
def cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
    """
    Helper function for `arr.astype(common_dtype)` but handling all special
    cases.
    """
    if is_dtype_equal(arr.dtype, dtype):
        return arr

    if is_sparse(arr) and not is_sparse(dtype):
        # TODO(2.0): remove special case once SparseArray.astype deprecation
        # is enforced.
        # problem case: SparseArray.astype(dtype) doesn't follow the specified
        # dtype exactly, but converts this to Sparse[dtype] -> first manually
        # convert to dense array

        # error: Argument 1 to "astype" of "_ArrayOrScalarCommon" has incompatible type
        # "Union[dtype[Any], ExtensionDtype]"; expected "Union[dtype[Any], None, type, _
        # SupportsDType[dtype[Any]], str, Union[Tuple[Any, int], Tuple[Any,
        # Union[SupportsIndex, Sequence[SupportsIndex]]], List[Any], _DTypeDict,
        # Tuple[Any, Any]]]"  [arg-type]
        arr = cast("SparseArray", arr)
        return arr.to_dense().astype(dtype, copy=False)  # type: ignore[arg-type]

    # astype_array includes ensure_wrapped_if_datetimelike
    return astype_array(arr, dtype=dtype, copy=False)
def astype_array(values: ArrayLike, dtype: DtypeObj, copy: bool = False) -> ArrayLike:
    """
    Cast array (ndarray or ExtensionArray) to the new dtype.

    Parameters
    ----------
    values : ndarray or ExtensionArray
    dtype : dtype object
    copy : bool, default False
        copy if indicated

    Returns
    -------
    ndarray or ExtensionArray
    """
    if (
        values.dtype.kind in ["m", "M"]
        and dtype.kind in ["i", "u"]
        and isinstance(dtype, np.dtype)
        and dtype.itemsize != 8
    ):
        # TODO(2.0) remove special case once deprecation on DTA/TDA is enforced
        msg = rf"cannot astype a datetimelike from [{values.dtype}] to [{dtype}]"
        raise TypeError(msg)

    if is_datetime64tz_dtype(dtype) and is_datetime64_dtype(values.dtype):
        return astype_dt64_to_dt64tz(values, dtype, copy, via_utc=True)

    if is_dtype_equal(values.dtype, dtype):
        if copy:
            return values.copy()
        return values

    if not isinstance(values, np.ndarray):
        # i.e. ExtensionArray
        values = values.astype(dtype, copy=copy)

    else:
        values = astype_nansafe(values, dtype, copy=copy)

    # in pandas we don't store numpy str dtypes, so convert to object
    if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str):
        values = np.array(values, dtype=object)

    return values
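# Usage sketch via the public astype entry points that funnel into this helper;
# shows the "numpy str -> object" normalization and the ndarray -> ExtensionArray path.
import pandas as pd

ser = pd.Series([1, 2, 3])
print(ser.astype(str).dtype)      # object -- pandas does not store numpy str dtypes
print(ser.astype("Int64").dtype)  # Int64  -- casting into an ExtensionDtype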
def _maybe_repeat(arr: ArrayLike, index: Index | None) -> ArrayLike:
    """
    If we have a length-1 array and an index describing how long we expect
    the result to be, repeat the array.
    """
    if index is not None:
        if 1 == len(arr) != len(index):
            arr = arr.repeat(len(index))
    return arr
def ensure_int_or_float(arr: ArrayLike, copy: bool = False) -> np.ndarray:
    """
    Ensure that an array of some integer dtype has an int64 dtype if
    possible. If it's not possible, potentially because of overflow, convert
    the array to float64 instead.

    Parameters
    ----------
    arr : array-like
        The array whose data type we want to enforce.
    copy : bool
        Whether to copy the original array or reuse it in place, if possible.

    Returns
    -------
    out_arr : The input array cast as int64 if possible without overflow.
        Otherwise the input array cast to float64.

    Notes
    -----
    If the array is explicitly of type uint64 the type will remain unchanged.
    """
    # TODO: GH27506 potential bug with ExtensionArrays
    try:
        # error: Unexpected keyword argument "casting" for "astype"
        return arr.astype("int64", copy=copy, casting="safe")  # type: ignore[call-arg]
    except TypeError:
        pass
    try:
        # error: Unexpected keyword argument "casting" for "astype"
        return arr.astype("uint64", copy=copy, casting="safe")  # type: ignore[call-arg]
    except TypeError:
        if is_extension_array_dtype(arr.dtype):
            # pandas/core/dtypes/common.py:168: error: Item "ndarray" of
            # "Union[ExtensionArray, ndarray]" has no attribute "to_numpy"  [union-attr]
            return arr.to_numpy(  # type: ignore[union-attr]
                dtype="float64", na_value=np.nan
            )
        return arr.astype("float64", copy=copy)
def array_equals(left: ArrayLike, right: ArrayLike) -> bool:
    """
    ExtensionArray-compatible implementation of array_equivalent.
    """
    if not is_dtype_equal(left.dtype, right.dtype):
        return False
    elif isinstance(left, ABCExtensionArray):
        return left.equals(right)
    else:
        return array_equivalent(left, right, dtype_equal=True)
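# Usage sketch (hedged): assumes array_equals is importable from
# pandas.core.dtypes.missing, the internal module where array_equivalent also
# lives; matching NA positions compare equal, but differing dtypes do not.
import numpy as np
import pandas as pd
from pandas.core.dtypes.missing import array_equals  # internal, may move

print(array_equals(np.array([1.0, np.nan]), np.array([1.0, np.nan])))  # True  -- NaN matches NaN here
print(array_equals(np.array([1, 2]), np.array([1.0, 2.0])))            # False -- dtypes differ
print(array_equals(pd.array([1, None], dtype="Int64"),
                   pd.array([1, None], dtype="Int64")))                # True  -- EA .equals path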
def take_1d(
    arr: ArrayLike,
    indexer: npt.NDArray[np.intp],
    fill_value=None,
    allow_fill: bool = True,
) -> ArrayLike:
    """
    Specialized version for 1D arrays. Differences compared to `take_nd`:

    - Assumes input array has already been converted to numpy array / EA
    - Assumes indexer is already guaranteed to be intp dtype ndarray
    - Only works for 1D arrays

    To ensure the lowest possible overhead.

    Note: similarly to `take_nd`, this function assumes that the indexer is
    a valid(ated) indexer with no out of bound indices.
    """
    indexer = ensure_platform_int(indexer)

    if not isinstance(arr, np.ndarray):
        # ExtensionArray -> dispatch to their method
        return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)

    if not allow_fill:
        return arr.take(indexer)

    dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value(
        arr, indexer, fill_value, True
    )

    # at this point, it's guaranteed that dtype can hold both the arr values
    # and the fill_value
    out = np.empty(indexer.shape, dtype=dtype)

    func = _get_take_nd_function(
        arr.ndim, arr.dtype, out.dtype, axis=0, mask_info=mask_info
    )
    func(arr, indexer, out, fill_value)

    return out
def extract_bool_array(mask: ArrayLike) -> npt.NDArray[np.bool_]:
    """
    If we have a SparseArray or BooleanArray, convert it to ndarray[bool].
    """
    if isinstance(mask, ExtensionArray):
        # We could have BooleanArray, Sparse[bool], ...
        # Except for BooleanArray, this is equivalent to just
        # np.asarray(mask, dtype=bool)
        mask = mask.to_numpy(dtype=bool, na_value=False)

    mask = np.asarray(mask, dtype=bool)
    return mask
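# Behavior sketch via the public ExtensionArray API the helper relies on:
# BooleanArray.to_numpy with na_value=False is what turns <NA> into False
# before the final np.asarray(..., dtype=bool).
import pandas as pd

mask = pd.array([True, None, False], dtype="boolean")
print(mask.to_numpy(dtype=bool, na_value=False))  # [ True False False]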
def _isna_array(values: ArrayLike, inf_as_na: bool = False):
    """
    Return an array indicating which values of the input array are NaN / NA.

    Parameters
    ----------
    values : ndarray or ExtensionArray
        The input array whose elements are to be checked.
    inf_as_na : bool
        Whether or not to treat infinite values as NA.

    Returns
    -------
    array-like
        Array of boolean values denoting the NA status of each element.
    """
    dtype = values.dtype

    if not isinstance(values, np.ndarray):
        # i.e. ExtensionArray
        if inf_as_na and is_categorical_dtype(dtype):
            result = libmissing.isnaobj(values.to_numpy(), inf_as_na=inf_as_na)
        else:
            # error: Incompatible types in assignment (expression has type
            # "Union[ndarray[Any, Any], ExtensionArraySupportsAnyAll]", variable has
            # type "ndarray[Any, dtype[bool_]]")
            result = values.isna()  # type: ignore[assignment]
    elif is_string_or_object_np_dtype(values.dtype):
        result = _isna_string_dtype(values, inf_as_na=inf_as_na)
    elif needs_i8_conversion(dtype):
        # this is the NaT pattern
        result = values.view("i8") == iNaT
    else:
        if inf_as_na:
            result = ~np.isfinite(values)
        else:
            result = np.isnan(values)

    return result
def take_nd(
    arr: ArrayLike,
    indexer,
    axis: int = 0,
    out: Optional[np.ndarray] = None,
    fill_value=lib.no_default,
    allow_fill: bool = True,
) -> ArrayLike:
    """
    Specialized Cython take which sets NaN values in one pass

    This dispatches to ``take`` defined on ExtensionArrays. It does not
    currently dispatch to ``SparseArray.take`` for sparse ``arr``.

    Parameters
    ----------
    arr : np.ndarray or ExtensionArray
        Input array.
    indexer : ndarray
        1-D array of indices to take, subarrays corresponding to -1 value
        indices are filled with fill_value
    axis : int, default 0
        Axis to take from
    out : ndarray or None, default None
        Optional output array, must be appropriate type to hold input and
        fill_value together, if indexer has any -1 value entries; call
        maybe_promote to determine this type for any fill_value
    fill_value : any, default np.nan
        Fill value to replace -1 values with
    allow_fill : bool, default True
        If False, indexer is assumed to contain no -1 values so no filling
        will be done. This short-circuits computation of a mask. Result is
        undefined if allow_fill == False and -1 is present in indexer.

    Returns
    -------
    subarray : np.ndarray or ExtensionArray
        May be the same type as the input, or cast to an ndarray.
    """
    if fill_value is lib.no_default:
        fill_value = na_value_for_dtype(arr.dtype, compat=False)

    if not isinstance(arr, np.ndarray):
        # i.e. ExtensionArray,
        # includes for EA to catch DatetimeArray, TimedeltaArray
        return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)

    arr = np.asarray(arr)
    return _take_nd_ndarray(arr, indexer, axis, out, fill_value, allow_fill)
def hash_array(
    vals: ArrayLike,
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
    categorize: bool = True,
) -> np.ndarray:
    """
    Given a 1d array, return an array of deterministic integers.

    Parameters
    ----------
    vals : ndarray or ExtensionArray
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash_key for string key to encode.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    ndarray[np.uint64, ndim=1]
        Hashed values, same length as the vals.
    """
    if not hasattr(vals, "dtype"):
        raise TypeError("must pass a ndarray-like")
    dtype = vals.dtype

    # For categoricals, we hash the categories, then remap the codes to the
    # hash values. (This check is above the complex check so that we don't ask
    # numpy if categorical is a subdtype of complex, as it will choke).
    if is_categorical_dtype(dtype):
        vals = cast("Categorical", vals)
        return _hash_categorical(vals, encoding, hash_key)
    elif isinstance(vals, ABCExtensionArray):
        vals, _ = vals._values_for_factorize()

    elif not isinstance(vals, np.ndarray):
        # GH#42003
        raise TypeError(
            "hash_array requires np.ndarray or ExtensionArray, not "
            f"{type(vals).__name__}. Use hash_pandas_object instead."
        )

    return _hash_ndarray(vals, encoding, hash_key, categorize)
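# Usage sketch: hash_array is exposed publicly as pandas.util.hash_array.
# Equal values hash to equal uint64 values, and categorize only changes the
# performance path, not the result.
import numpy as np
from pandas.util import hash_array

vals = np.array(["a", "b", "a"], dtype=object)
h1 = hash_array(vals, categorize=True)
h2 = hash_array(vals, categorize=False)
print(h1.dtype)          # uint64
print(h1[0] == h1[2])    # True -- duplicate values hash identically
print((h1 == h2).all())  # True -- categorize does not change the hashes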
def hash_array(
    vals: ArrayLike,
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
    categorize: bool = True,
) -> np.ndarray:
    """
    Given a 1d array, return an array of deterministic integers.

    Parameters
    ----------
    vals : ndarray or ExtensionArray
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash_key for string key to encode.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    1d uint64 numpy array of hash values, same length as the vals
    """
    if not hasattr(vals, "dtype"):
        raise TypeError("must pass a ndarray-like")
    dtype = vals.dtype

    # For categoricals, we hash the categories, then remap the codes to the
    # hash values. (This check is above the complex check so that we don't ask
    # numpy if categorical is a subdtype of complex, as it will choke).
    if is_categorical_dtype(dtype):
        # error: Incompatible types in assignment (expression has type "Categorical",
        # variable has type "ndarray")
        vals = cast("Categorical", vals)  # type: ignore[assignment]
        # error: Argument 1 to "_hash_categorical" has incompatible type "ndarray";
        # expected "Categorical"
        return _hash_categorical(vals, encoding, hash_key)  # type: ignore[arg-type]
    elif is_extension_array_dtype(dtype):
        # error: Incompatible types in assignment (expression has type "ndarray",
        # variable has type "ExtensionArray")
        # error: "ndarray" has no attribute "_values_for_factorize"
        vals, _ = vals._values_for_factorize()  # type: ignore[assignment,attr-defined]

    # error: Argument 1 to "_hash_ndarray" has incompatible type "ExtensionArray";
    # expected "ndarray"
    return _hash_ndarray(vals, encoding, hash_key, categorize)  # type: ignore[arg-type]
def extract_bool_array(mask: ArrayLike) -> np.ndarray:
    """
    If we have a SparseArray or BooleanArray, convert it to ndarray[bool].
    """
    if isinstance(mask, ExtensionArray):
        # We could have BooleanArray, Sparse[bool], ...
        # Except for BooleanArray, this is equivalent to just
        # np.asarray(mask, dtype=bool)

        # error: Incompatible types in assignment (expression has type "ndarray",
        # variable has type "ExtensionArray")
        mask = mask.to_numpy(dtype=bool, na_value=False)  # type: ignore[assignment]

    # error: Incompatible types in assignment (expression has type "ndarray", variable
    # has type "ExtensionArray")
    mask = np.asarray(mask, dtype=bool)  # type: ignore[assignment]
    # error: Incompatible return value type (got "ExtensionArray", expected "ndarray")
    return mask  # type: ignore[return-value]
def quantile_compat(
    values: ArrayLike, qs: npt.NDArray[np.float64], interpolation: str
) -> ArrayLike:
    """
    Compute the quantiles of the given values for each quantile in `qs`.

    Parameters
    ----------
    values : np.ndarray or ExtensionArray
    qs : np.ndarray[float64]
    interpolation : str

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    if isinstance(values, np.ndarray):
        fill_value = na_value_for_dtype(values.dtype, compat=False)
        mask = isna(values)
        return quantile_with_mask(values, mask, fill_value, qs, interpolation)
    else:
        return values._quantile(qs, interpolation)
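# Usage sketch via the public quantile API that routes through this helper;
# the ndarray path masks NA values before interpolating, while an
# ExtensionArray dispatches to its own _quantile.
import numpy as np
import pandas as pd

print(pd.Series([1.0, np.nan, 3.0]).quantile([0.0, 0.5, 1.0]).tolist())  # [1.0, 2.0, 3.0]
print(pd.Series([1, None, 3], dtype="Int64").quantile(0.5))              # 2.0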
def take_1d( arr: ArrayLike, indexer: np.ndarray, fill_value=None, allow_fill: bool = True, ) -> ArrayLike: """ Specialized version for 1D arrays. Differences compared to take_nd: - Assumes input (arr, indexer) has already been converted to numpy array / EA - Only works for 1D arrays To ensure the lowest possible overhead. TODO(ArrayManager): mainly useful for ArrayManager, otherwise can potentially be removed again if we don't end up with ArrayManager. """ if not isinstance(arr, np.ndarray): # ExtensionArray -> dispatch to their method return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) indexer, dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value( arr, indexer, 0, None, fill_value, allow_fill) # at this point, it's guaranteed that dtype can hold both the arr values # and the fill_value out = np.empty(indexer.shape, dtype=dtype) func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=0, mask_info=mask_info) func(arr, indexer, out, fill_value) return out
def na_accum_func(values: ArrayLike, accum_func, skipna: bool) -> ArrayLike:
    """
    Cumulative function with skipna support.

    Parameters
    ----------
    values : np.ndarray or ExtensionArray
    accum_func : {np.cumprod, np.maximum.accumulate, np.cumsum, np.minimum.accumulate}
    skipna : bool

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    mask_a, mask_b = {
        np.cumprod: (1.0, np.nan),
        np.maximum.accumulate: (-np.inf, np.nan),
        np.cumsum: (0.0, np.nan),
        np.minimum.accumulate: (np.inf, np.nan),
    }[accum_func]

    # We will be applying this function to block values
    if values.dtype.kind in ["m", "M"]:
        # GH#30460, GH#29058
        # numpy 1.18 started sorting NaTs at the end instead of beginning,
        # so we need to work around to maintain backwards-consistency.
        orig_dtype = values.dtype

        # We need to define mask before masking NaTs
        mask = isna(values)

        if accum_func == np.minimum.accumulate:
            # Note: the accum_func comparison fails as an "is" comparison
            y = values.view("i8")
            y[mask] = np.iinfo(np.int64).max
            changed = True
        else:
            y = values
            changed = False

        result = accum_func(y.view("i8"), axis=0)
        if skipna:
            result[mask] = iNaT
        elif accum_func == np.minimum.accumulate:
            # Restore NaTs that we masked previously
            nz = (~np.asarray(mask)).nonzero()[0]
            if len(nz):
                # everything up to the first non-na entry stays NaT
                result[: nz[0]] = iNaT

        if changed:
            # restore NaT elements
            y[mask] = iNaT  # TODO: could try/finally for this?

        if isinstance(values, np.ndarray):
            result = result.view(orig_dtype)
        else:
            # DatetimeArray
            result = type(values)._from_sequence(result, dtype=orig_dtype)

    elif skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)):
        vals = values.copy()
        mask = isna(vals)
        vals[mask] = mask_a
        result = accum_func(vals, axis=0)
        result[mask] = mask_b
    else:
        result = accum_func(values, axis=0)

    return result
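# Usage sketch via the public cumulative ops that call this function; note how
# NaT positions are preserved in place when skipna=True, and how everything
# after the first NaN stays NaN when skipna=False.
import pandas as pd

ser = pd.Series(pd.to_datetime(["2021-01-02", None, "2021-01-01"]))
print(ser.cummax(skipna=True).isna().tolist())                    # [False, True, False]

print(pd.Series([2.0, None, 1.0]).cummin(skipna=False).tolist())  # [2.0, nan, nan]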
def astype_array_safe(
    values: ArrayLike, dtype, copy: bool = False, errors: IgnoreRaise = "raise"
) -> ArrayLike:
    """
    Cast array (ndarray or ExtensionArray) to the new dtype.

    This basically is the implementation for DataFrame/Series.astype and
    includes all custom logic for pandas (NaN-safety, converting str to object).

    Parameters
    ----------
    values : ndarray or ExtensionArray
    dtype : str, dtype convertible
    copy : bool, default False
        copy if indicated
    errors : str, {'raise', 'ignore'}, default 'raise'
        - ``raise`` : allow exceptions to be raised
        - ``ignore`` : suppress exceptions. On error return original object

    Returns
    -------
    ndarray or ExtensionArray
    """
    errors_legal_values = ("raise", "ignore")

    if errors not in errors_legal_values:
        invalid_arg = (
            "Expected value of kwarg 'errors' to be one of "
            f"{list(errors_legal_values)}. Supplied value is '{errors}'"
        )
        raise ValueError(invalid_arg)

    if inspect.isclass(dtype) and issubclass(dtype, ExtensionDtype):
        msg = (
            f"Expected an instance of {dtype.__name__}, "
            "but got the class instead. Try instantiating 'dtype'."
        )
        raise TypeError(msg)

    dtype = pandas_dtype(dtype)
    if isinstance(dtype, PandasDtype):
        # Ensure we don't end up with a PandasArray
        dtype = dtype.numpy_dtype

    if (
        is_datetime64_dtype(values.dtype)
        # need to do np.dtype check instead of is_datetime64_dtype
        # otherwise pyright complains
        and isinstance(dtype, np.dtype)
        and dtype.kind == "M"
        and not is_unitless(dtype)
        and not is_dtype_equal(dtype, values.dtype)
    ):
        # unit conversion, we would re-cast to nanosecond, so this is
        # effectively just a copy (regardless of copy kwd)
        # TODO(2.0): remove special-case
        return values.copy()

    try:
        new_values = astype_array(values, dtype, copy=copy)
    except (ValueError, TypeError):
        # e.g. astype_nansafe can fail on object-dtype of strings
        # trying to convert to float
        if errors == "ignore":
            new_values = values
        else:
            raise

    return new_values
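# Usage sketch via Series.astype, whose errors= handling is implemented here:
# with errors="ignore" a failed cast hands back the original values unchanged,
# while the default errors="raise" propagates the exception.
import pandas as pd

ser = pd.Series(["1", "x"])
print(ser.astype(int, errors="ignore").tolist())  # ['1', 'x'] -- cast failed, original returned
try:
    ser.astype(int)  # errors="raise" is the default
except ValueError as err:
    print("raise path:", err)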
def astype_dt64_to_dt64tz(
    values: ArrayLike, dtype: DtypeObj, copy: bool, via_utc: bool = False
) -> DatetimeArray:
    # GH#33401 we have inconsistent behaviors between
    #  Datetimeindex[naive].astype(tzaware)
    #  Series[dt64].astype(tzaware)
    # This collects them in one place to prevent further fragmentation.

    from pandas.core.construction import ensure_wrapped_if_datetimelike

    values = ensure_wrapped_if_datetimelike(values)
    values = cast("DatetimeArray", values)
    aware = isinstance(dtype, DatetimeTZDtype)

    if via_utc:
        # Series.astype behavior

        # caller is responsible for checking this
        assert values.tz is None and aware
        dtype = cast(DatetimeTZDtype, dtype)

        if copy:
            # this should be the only copy
            values = values.copy()

        warnings.warn(
            "Using .astype to convert from timezone-naive dtype to "
            "timezone-aware dtype is deprecated and will raise in a "
            "future version. Use ser.dt.tz_localize instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

        # GH#33401 this doesn't match DatetimeArray.astype, which
        #  goes through the `not via_utc` path
        return values.tz_localize("UTC").tz_convert(dtype.tz)

    else:
        # DatetimeArray/DatetimeIndex.astype behavior
        if values.tz is None and aware:
            dtype = cast(DatetimeTZDtype, dtype)
            warnings.warn(
                "Using .astype to convert from timezone-naive dtype to "
                "timezone-aware dtype is deprecated and will raise in a "
                "future version. Use obj.tz_localize instead.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

            return values.tz_localize(dtype.tz)

        elif aware:
            # GH#18951: datetime64_tz dtype but not equal means different tz
            dtype = cast(DatetimeTZDtype, dtype)
            result = values.tz_convert(dtype.tz)
            if copy:
                result = result.copy()
            return result

        elif values.tz is not None:
            warnings.warn(
                "Using .astype to convert from timezone-aware dtype to "
                "timezone-naive dtype is deprecated and will raise in a "
                "future version. Use obj.tz_localize(None) or "
                "obj.tz_convert('UTC').tz_localize(None) instead",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

            result = values.tz_convert("UTC").tz_localize(None)
            if copy:
                result = result.copy()
            return result

        raise NotImplementedError("dtype_equal case should be handled elsewhere")