def make_na_array(dtype: DtypeObj, shape: Shape) -> ArrayLike:
    """
    Construct an np.ndarray or ExtensionArray of the given dtype and shape
    holding all-NA values.
    """
    if is_datetime64tz_dtype(dtype):
        # NaT here is analogous to dtype.na_value below
        nat_i8 = np.full(shape, NaT.value)
        return DatetimeArray(nat_i8, dtype=dtype)

    if is_1d_only_ea_dtype(dtype):
        dtype = cast(ExtensionDtype, dtype)
        array_type = dtype.construct_array_type()
        empty = array_type._from_sequence([], dtype=dtype)
        # Taking from an empty EA with an all -1 indexer and allow_fill=True
        # broadcasts the dtype's NA value across the requested length.
        indexer = np.full((shape[-1],), -1, dtype=np.intp)
        return empty.take(indexer, allow_fill=True, fill_value=dtype.na_value)

    if isinstance(dtype, ExtensionDtype):
        # TODO: no tests get here, a handful would if we disabled
        # the dt64tz special-case above (which is faster)
        array_type = dtype.construct_array_type()
        result = array_type._empty(shape=shape, dtype=dtype)
        result[:] = dtype.na_value
        return result

    # NB: we should never get here with dtype integer or bool;
    # if we did, the fill below would cast to gibberish
    result = np.empty(shape, dtype=dtype)
    result.fill(_dtype_to_na_value(dtype))
    return result
def to_array(self, dtype: DtypeObj) -> ArrayLike: """ Helper function to create the actual all-NA array from the NullArrayProxy object. Parameters ---------- arr : NullArrayProxy dtype : the dtype for the resulting array Returns ------- np.ndarray or ExtensionArray """ if isinstance(dtype, ExtensionDtype): empty = dtype.construct_array_type()._from_sequence([], dtype=dtype) indexer = -np.ones(self.n, dtype=np.intp) return empty.take(indexer, allow_fill=True) else: # when introducing missing values, int becomes float, bool becomes object dtype = ensure_dtype_can_hold_na(dtype) fill_value = na_value_for_dtype(dtype) arr = np.empty(self.n, dtype=dtype) arr.fill(fill_value) return ensure_wrapped_if_datetimelike(arr)
def _dtype_to_na_value(dtype: DtypeObj): """ Find the NA value to go with this dtype. """ if isinstance(dtype, ExtensionDtype): return dtype.na_value elif dtype.kind in ["m", "M"]: return dtype.type("NaT") elif dtype.kind in ["f", "c"]: return dtype.type("NaN") elif dtype.kind == "b": # different from missing.na_value_for_dtype return None elif dtype.kind in ["i", "u"]: return np.nan elif dtype.kind == "O": return np.nan raise NotImplementedError
def _dtype_to_na_value(dtype: DtypeObj, has_none_blocks: bool): """ Find the NA value to go with this dtype. """ if is_extension_array_dtype(dtype): return dtype.na_value elif dtype.kind in ["m", "M"]: return dtype.type("NaT") elif dtype.kind in ["f", "c"]: return dtype.type("NaN") elif dtype.kind == "b": return None elif dtype.kind in ["i", "u"]: if not has_none_blocks: return None return np.nan elif dtype.kind == "O": return np.nan raise NotImplementedError
def _dtype_to_na_value(dtype: DtypeObj, has_none_blocks: bool): """ Find the NA value to go with this dtype. """ if is_extension_array_dtype(dtype): # error: Item "dtype[Any]" of "Union[dtype[Any], ExtensionDtype]" has no # attribute "na_value" return dtype.na_value # type: ignore[union-attr] elif dtype.kind in ["m", "M"]: return dtype.type("NaT") elif dtype.kind in ["f", "c"]: return dtype.type("NaN") elif dtype.kind == "b": return None elif dtype.kind in ["i", "u"]: if not has_none_blocks: return None return np.nan elif dtype.kind == "O": return np.nan raise NotImplementedError
def na_value_for_dtype(dtype: DtypeObj, compat: bool = True): """ Return a dtype compat na value Parameters ---------- dtype : string / dtype compat : bool, default True Returns ------- np.dtype or a pandas dtype Examples -------- >>> na_value_for_dtype(np.dtype('int64')) 0 >>> na_value_for_dtype(np.dtype('int64'), compat=False) nan >>> na_value_for_dtype(np.dtype('float64')) nan >>> na_value_for_dtype(np.dtype('bool')) False >>> na_value_for_dtype(np.dtype('datetime64[ns]')) numpy.datetime64('NaT') """ if is_extension_array_dtype(dtype): # error: Item "dtype[Any]" of "Union[dtype[Any], ExtensionDtype]" has no # attribute "na_value" return dtype.na_value # type: ignore[union-attr] elif needs_i8_conversion(dtype): return dtype.type("NaT", "ns") elif is_float_dtype(dtype): return np.nan elif is_integer_dtype(dtype): if compat: return 0 return np.nan elif is_bool_dtype(dtype): if compat: return False return np.nan return np.nan
def na_value_for_dtype(dtype: DtypeObj, compat: bool = True): """ Return a dtype compat na value Parameters ---------- dtype : string / dtype compat : bool, default True Returns ------- np.dtype or a pandas dtype Examples -------- >>> na_value_for_dtype(np.dtype('int64')) 0 >>> na_value_for_dtype(np.dtype('int64'), compat=False) nan >>> na_value_for_dtype(np.dtype('float64')) nan >>> na_value_for_dtype(np.dtype('bool')) False >>> na_value_for_dtype(np.dtype('datetime64[ns]')) numpy.datetime64('NaT') """ if isinstance(dtype, ExtensionDtype): return dtype.na_value elif needs_i8_conversion(dtype): return dtype.type("NaT", "ns") elif is_float_dtype(dtype): return np.nan elif is_integer_dtype(dtype): if compat: return 0 return np.nan elif is_bool_dtype(dtype): if compat: return False return np.nan return np.nan
def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
    """
    Return this unit's values, filled/reindexed so they can be concatenated
    into a result of ``empty_dtype``.

    Parameters
    ----------
    empty_dtype : DtypeObj
        Target dtype of the concatenated result.
    upcasted_na : scalar or None
        NA value to fill with; ``None`` signals that no upcasting is needed.

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    if upcasted_na is None:
        # No upcasting is necessary
        fill_value = self.block.fill_value
        values = self.block.get_values()
    else:
        fill_value = upcasted_na

        if self.is_valid_na_for(empty_dtype):
            # The unit is all-NA for this target dtype, so build an
            # all-NA array directly instead of reindexing real values.
            blk_dtype = getattr(self.block, "dtype", None)

            if blk_dtype == np.dtype("object"):
                # we want to avoid filling with np.nan if we are
                # using None; we already know that we are all
                # nulls
                values = self.block.values.ravel(order="K")
                if len(values) and values[0] is None:
                    fill_value = None

            if is_datetime64tz_dtype(empty_dtype):
                # build tz-aware result from an all-NaT i8 buffer
                i8values = np.full(self.shape, fill_value.value)
                return DatetimeArray(i8values, dtype=empty_dtype)
            elif is_extension_array_dtype(blk_dtype):
                # EA-backed block: fall through to the generic path below
                pass
            elif is_1d_only_ea_dtype(empty_dtype):
                empty_dtype = cast(ExtensionDtype, empty_dtype)
                cls = empty_dtype.construct_array_type()
                missing_arr = cls._from_sequence([], dtype=empty_dtype)
                ncols, nrows = self.shape
                # 1D-only EAs can only back a single-column unit
                assert ncols == 1, ncols
                # taking with an all -1 indexer broadcasts fill_value
                empty_arr = -1 * np.ones((nrows,), dtype=np.intp)
                return missing_arr.take(
                    empty_arr, allow_fill=True, fill_value=fill_value
                )
            else:
                # NB: we should never get here with empty_dtype integer or bool;
                # if we did, the missing_arr.fill would cast to gibberish
                missing_arr = np.empty(self.shape, dtype=empty_dtype)
                missing_arr.fill(fill_value)
                return missing_arr

    if (not self.indexers) and (not self.block._can_consolidate):
        # preserve these for validation in concat_compat
        return self.block.values

    if self.block.is_bool:
        # External code requested filling/upcasting, bool values must
        # be upcasted to object to avoid being upcasted to numeric.
        values = self.block.astype(np.object_).values
    else:
        # No dtype upcasting is done here, it will be performed during
        # concatenation itself.
        values = self.block.values

    if not self.indexers:
        # If there's no indexing to be done, we want to signal outside
        # code that this array must be copied explicitly. This is done
        # by returning a view and checking `retval.base`.
        values = values.view()
    else:
        # apply the per-axis reindexers
        for ax, indexer in self.indexers.items():
            values = algos.take_nd(values, indexer, axis=ax)

    return values
def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na):
    """
    Return this unit's values, filled/reindexed so they can be concatenated
    into a result of ``empty_dtype``.

    Parameters
    ----------
    empty_dtype : DtypeObj
        Target dtype of the concatenated result.
    upcasted_na : scalar or None
        NA value to fill with; ``None`` signals that no upcasting is needed.

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    if upcasted_na is None:
        # No upcasting is necessary
        fill_value = self.block.fill_value
        values = self.block.get_values()
    else:
        fill_value = upcasted_na

        if self.is_na:
            # getattr guards: self.block may be None here — TODO confirm
            if getattr(self.block, "is_object", False):
                # we want to avoid filling with np.nan if we are
                # using None; we already know that we are all
                # nulls
                values = self.block.values.ravel(order="K")
                if len(values) and values[0] is None:
                    fill_value = None

            if getattr(self.block, "is_datetimetz", False) or is_datetime64tz_dtype(empty_dtype):
                if self.block is None:
                    # TODO(EA2D): special case unneeded with 2D EAs
                    return DatetimeArray(np.full(self.shape[1], fill_value.value), dtype=empty_dtype)
            elif getattr(self.block, "is_categorical", False):
                # categorical block: fall through to the generic path below
                pass
            elif getattr(self.block, "is_extension", False):
                # EA-backed block: fall through to the generic path below
                pass
            elif is_extension_array_dtype(empty_dtype):
                missing_arr = empty_dtype.construct_array_type()._from_sequence([], dtype=empty_dtype)
                ncols, nrows = self.shape
                # 1D-only EAs can only back a single-column unit
                assert ncols == 1, ncols
                # taking with an all -1 indexer broadcasts fill_value
                empty_arr = -1 * np.ones((nrows,), dtype=np.intp)
                return missing_arr.take(empty_arr, allow_fill=True, fill_value=fill_value)
            else:
                missing_arr = np.empty(self.shape, dtype=empty_dtype)
                missing_arr.fill(fill_value)
                return missing_arr

    if (not self.indexers) and (not self.block._can_consolidate):
        # preserve these for validation in concat_compat
        return self.block.values

    if self.block.is_bool and not self.block.is_categorical:
        # External code requested filling/upcasting, bool values must
        # be upcasted to object to avoid being upcasted to numeric.
        values = self.block.astype(np.object_).values
    elif self.block.is_extension:
        values = self.block.values
    else:
        # No dtype upcasting is done here, it will be performed during
        # concatenation itself.
        values = self.block.values

    if not self.indexers:
        # If there's no indexing to be done, we want to signal outside
        # code that this array must be copied explicitly. This is done
        # by returning a view and checking `retval.base`.
        values = values.view()
    else:
        # apply the per-axis reindexers, filling holes with fill_value
        for ax, indexer in self.indexers.items():
            values = algos.take_nd(values, indexer, axis=ax, fill_value=fill_value)

    return values
def astype_nansafe(
    arr: np.ndarray, dtype: DtypeObj, copy: bool = True, skipna: bool = False
) -> ArrayLike:
    """
    Cast the elements of an array to a given dtype a nan-safe manner.

    Parameters
    ----------
    arr : ndarray
    dtype : np.dtype or ExtensionDtype
    copy : bool, default True
        If False, a view will be attempted but may fail, if
        e.g. the item sizes don't align.
    skipna: bool, default False
        Whether or not we should skip NaN when casting as a string-type.

    Returns
    -------
    np.ndarray or ExtensionArray

    Raises
    ------
    ValueError
        The dtype was a datetime64/timedelta64 dtype, but it had no unit.
    """
    # We get here with 0-dim from sparse
    arr = np.atleast_1d(arr)

    # dispatch on extension dtype if needed
    if isinstance(dtype, ExtensionDtype):
        return dtype.construct_array_type()._from_sequence(arr, dtype=dtype, copy=copy)

    elif not isinstance(dtype, np.dtype):  # pragma: no cover
        raise ValueError("dtype must be np.dtype or ExtensionDtype")

    if arr.dtype.kind in ["m", "M"] and (
        issubclass(dtype.type, str) or dtype == _dtype_obj
    ):
        # datetimelike -> str/object: wrap so the pandas repr ("NaT" etc.)
        # is used for stringification
        from pandas.core.construction import ensure_wrapped_if_datetimelike

        arr = ensure_wrapped_if_datetimelike(arr)
        return arr.astype(dtype, copy=copy)

    if issubclass(dtype.type, str):
        # ensure_string_array operates on 1D; flatten and restore the shape
        shape = arr.shape
        if arr.ndim > 1:
            arr = arr.ravel()
        return lib.ensure_string_array(
            arr, skipna=skipna, convert_na_value=False
        ).reshape(shape)

    elif is_datetime64_dtype(arr.dtype):
        if dtype == np.int64:
            # refuse to silently turn NaT into a bogus integer
            if isna(arr).any():
                raise ValueError("Cannot convert NaT values to integer")
            return arr.view(dtype)

        # allow frequency conversions
        if dtype.kind == "M":
            return arr.astype(dtype)

        raise TypeError(f"cannot astype a datetimelike from [{arr.dtype}] to [{dtype}]")

    elif is_timedelta64_dtype(arr.dtype):
        if dtype == np.int64:
            # refuse to silently turn NaT into a bogus integer
            if isna(arr).any():
                raise ValueError("Cannot convert NaT values to integer")
            return arr.view(dtype)

        elif dtype.kind == "m":
            return astype_td64_unit_conversion(arr, dtype, copy=copy)

        raise TypeError(f"cannot astype a timedelta from [{arr.dtype}] to [{dtype}]")

    elif np.issubdtype(arr.dtype, np.floating) and is_integer_dtype(dtype):
        # float -> int must raise on NaN rather than produce garbage
        return _astype_float_to_int_nansafe(arr, dtype, copy)

    elif is_object_dtype(arr.dtype):

        # if we have a datetime/timedelta array of objects
        # then coerce to a proper dtype and recall astype_nansafe

        if is_datetime64_dtype(dtype):
            from pandas import to_datetime

            return astype_nansafe(
                to_datetime(arr.ravel()).values.reshape(arr.shape),
                dtype,
                copy=copy,
            )
        elif is_timedelta64_dtype(dtype):
            # bc we know arr.dtype == object, this is equivalent to
            # `np.asarray(to_timedelta(arr))`, but using a lower-level API that
            # does not require a circular import.
            return array_to_timedelta64(arr).view("m8[ns]").astype(dtype, copy=False)

    if dtype.name in ("datetime64", "timedelta64"):
        # unit-less datetimelike target dtype is ambiguous; require [ns]
        msg = (
            f"The '{dtype.name}' dtype has no unit. Please pass in "
            f"'{dtype.name}[ns]' instead."
        )
        raise ValueError(msg)

    if copy or is_object_dtype(arr.dtype) or is_object_dtype(dtype):
        # Explicit copy, or required since NumPy can't view from / to object.
        return arr.astype(dtype, copy=True)

    return arr.astype(dtype, copy=copy)
def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
    """
    Return this unit's values, filled/reindexed so they can be concatenated
    into a result of ``empty_dtype``.

    Parameters
    ----------
    empty_dtype : DtypeObj
        Target dtype of the concatenated result.
    upcasted_na : scalar or None
        NA value to fill with; ``None`` signals that no upcasting is needed.

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    if upcasted_na is None:
        # No upcasting is necessary
        fill_value = self.block.fill_value
        values = self.block.get_values()
    else:
        fill_value = upcasted_na

        if self.is_valid_na_for(empty_dtype):
            # The unit is all-NA for this target dtype, so build an
            # all-NA array directly instead of reindexing real values.
            blk_dtype = getattr(self.block, "dtype", None)

            # error: Value of type variable "_DTypeScalar" of "dtype" cannot be
            # "object"
            if blk_dtype == np.dtype(object):  # type: ignore[type-var]
                # we want to avoid filling with np.nan if we are
                # using None; we already know that we are all
                # nulls
                values = self.block.values.ravel(order="K")
                if len(values) and values[0] is None:
                    fill_value = None

            if is_datetime64tz_dtype(empty_dtype):
                # TODO(EA2D): special case unneeded with 2D EAs
                i8values = np.full(self.shape[1], fill_value.value)
                return DatetimeArray(i8values, dtype=empty_dtype)
            elif is_extension_array_dtype(blk_dtype):
                # EA-backed block: fall through to the generic path below
                pass
            elif is_extension_array_dtype(empty_dtype):
                # error: Item "dtype[Any]" of "Union[dtype[Any], ExtensionDtype]"
                # has no attribute "construct_array_type"
                cls = empty_dtype.construct_array_type()  # type: ignore[union-attr]
                missing_arr = cls._from_sequence([], dtype=empty_dtype)
                ncols, nrows = self.shape
                # 1D-only EAs can only back a single-column unit
                assert ncols == 1, ncols
                # taking with an all -1 indexer broadcasts fill_value
                empty_arr = -1 * np.ones((nrows,), dtype=np.intp)
                return missing_arr.take(empty_arr, allow_fill=True, fill_value=fill_value)
            else:
                # NB: we should never get here with empty_dtype integer or bool;
                # if we did, the missing_arr.fill would cast to gibberish
                # error: Argument "dtype" to "empty" has incompatible type
                # "Union[dtype[Any], ExtensionDtype]"; expected "Union[dtype[Any],
                # None, type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any,
                # Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any,
                # Any]]]"
                missing_arr = np.empty(
                    self.shape, dtype=empty_dtype  # type: ignore[arg-type]
                )
                missing_arr.fill(fill_value)
                return missing_arr

    if (not self.indexers) and (not self.block._can_consolidate):
        # preserve these for validation in concat_compat
        return self.block.values

    if self.block.is_bool and not self.block.is_categorical:
        # External code requested filling/upcasting, bool values must
        # be upcasted to object to avoid being upcasted to numeric.
        values = self.block.astype(np.object_).values
    elif self.block.is_extension:
        values = self.block.values
    else:
        # No dtype upcasting is done here, it will be performed during
        # concatenation itself.
        values = self.block.values

    if not self.indexers:
        # If there's no indexing to be done, we want to signal outside
        # code that this array must be copied explicitly. This is done
        # by returning a view and checking `retval.base`.
        values = values.view()
    else:
        # apply the per-axis reindexers
        for ax, indexer in self.indexers.items():
            values = algos.take_nd(values, indexer, axis=ax)

    return values
def astype_nansafe(arr: np.ndarray, dtype: DtypeObj, copy: bool = True, skipna: bool = False) -> ArrayLike:
    """
    Cast the elements of an array to a given dtype a nan-safe manner.

    Parameters
    ----------
    arr : ndarray
    dtype : np.dtype or ExtensionDtype
    copy : bool, default True
        If False, a view will be attempted but may fail, if
        e.g. the item sizes don't align.
    skipna: bool, default False
        Whether or not we should skip NaN when casting as a string-type.

    Returns
    -------
    np.ndarray or ExtensionArray

    Raises
    ------
    ValueError
        The dtype was a datetime64/timedelta64 dtype, but it had no unit.
    """
    if arr.ndim > 1:
        # recurse on the flattened array, then restore the shape
        flat = arr.ravel()
        result = astype_nansafe(flat, dtype, copy=copy, skipna=skipna)
        # error: Item "ExtensionArray" of "Union[ExtensionArray, ndarray]" has no
        # attribute "reshape"
        return result.reshape(arr.shape)  # type: ignore[union-attr]

    # We get here with 0-dim from sparse
    arr = np.atleast_1d(arr)

    # dispatch on extension dtype if needed
    if isinstance(dtype, ExtensionDtype):
        return dtype.construct_array_type()._from_sequence(arr, dtype=dtype, copy=copy)

    elif not isinstance(dtype, np.dtype):  # pragma: no cover
        raise ValueError("dtype must be np.dtype or ExtensionDtype")

    if arr.dtype.kind in ["m", "M"] and (issubclass(dtype.type, str) or dtype == _dtype_obj):
        # datetimelike -> str/object: wrap so the pandas repr ("NaT" etc.)
        # is used for stringification
        from pandas.core.construction import ensure_wrapped_if_datetimelike

        arr = ensure_wrapped_if_datetimelike(arr)
        return arr.astype(dtype, copy=copy)

    if issubclass(dtype.type, str):
        return lib.ensure_string_array(arr, skipna=skipna, convert_na_value=False)

    elif is_datetime64_dtype(arr.dtype):
        if dtype == np.int64:
            warnings.warn(
                f"casting {arr.dtype} values to int64 with .astype(...) "
                "is deprecated and will raise in a future version. "
                "Use .view(...) instead.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
            # refuse to silently turn NaT into a bogus integer
            if isna(arr).any():
                raise ValueError("Cannot convert NaT values to integer")
            return arr.view(dtype)

        # allow frequency conversions
        if dtype.kind == "M":
            return arr.astype(dtype)

        raise TypeError(
            f"cannot astype a datetimelike from [{arr.dtype}] to [{dtype}]")

    elif is_timedelta64_dtype(arr.dtype):
        if dtype == np.int64:
            warnings.warn(
                f"casting {arr.dtype} values to int64 with .astype(...) "
                "is deprecated and will raise in a future version. "
                "Use .view(...) instead.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
            # refuse to silently turn NaT into a bogus integer
            if isna(arr).any():
                raise ValueError("Cannot convert NaT values to integer")
            return arr.view(dtype)

        elif dtype.kind == "m":
            return astype_td64_unit_conversion(arr, dtype, copy=copy)

        raise TypeError(
            f"cannot astype a timedelta from [{arr.dtype}] to [{dtype}]")

    elif np.issubdtype(arr.dtype, np.floating) and np.issubdtype(
            dtype, np.integer):
        # float -> int must raise on NaN rather than produce garbage
        return _astype_float_to_int_nansafe(arr, dtype, copy)

    elif is_object_dtype(arr.dtype):

        # work around NumPy brokenness, #1987
        if np.issubdtype(dtype.type, np.integer):
            return lib.astype_intsafe(arr, dtype)

        # if we have a datetime/timedelta array of objects
        # then coerce to a proper dtype and recall astype_nansafe

        elif is_datetime64_dtype(dtype):
            from pandas import to_datetime

            return astype_nansafe(
                to_datetime(arr).values,
                dtype,
                copy=copy,
            )
        elif is_timedelta64_dtype(dtype):
            from pandas import to_timedelta

            return astype_nansafe(to_timedelta(arr)._values, dtype, copy=copy)

    if dtype.name in ("datetime64", "timedelta64"):
        # unit-less datetimelike target dtype is ambiguous; require [ns]
        msg = (f"The '{dtype.name}' dtype has no unit. Please pass in "
               f"'{dtype.name}[ns]' instead.")
        raise ValueError(msg)

    if copy or is_object_dtype(arr.dtype) or is_object_dtype(dtype):
        # Explicit copy, or required since NumPy can't view from / to object.
        return arr.astype(dtype, copy=True)

    return arr.astype(dtype, copy=copy)
def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
    """
    Return this unit's values, filled/reindexed so they can be concatenated
    into a result of ``empty_dtype``.

    Parameters
    ----------
    empty_dtype : DtypeObj
        Target dtype of the concatenated result.
    upcasted_na : scalar or None
        NA value to fill with; ``None`` signals that no upcasting is needed.

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    values: ArrayLike

    if upcasted_na is None and not self.is_na:
        # No upcasting is necessary
        fill_value = self.block.fill_value
        values = self.block.get_values()
    else:
        fill_value = upcasted_na

        if self.is_na:
            # The unit is all-NA, so build an all-NA array directly
            # instead of reindexing real values.
            if is_datetime64tz_dtype(empty_dtype):
                # build tz-aware result from an all-NaT i8 buffer
                i8values = np.full(self.shape, fill_value.value)
                return DatetimeArray(i8values, dtype=empty_dtype)

            elif is_1d_only_ea_dtype(empty_dtype):
                empty_dtype = cast(ExtensionDtype, empty_dtype)
                cls = empty_dtype.construct_array_type()
                missing_arr = cls._from_sequence([], dtype=empty_dtype)
                ncols, nrows = self.shape
                # 1D-only EAs can only back a single-column unit
                assert ncols == 1, ncols
                # taking with an all -1 indexer broadcasts fill_value
                empty_arr = -1 * np.ones((nrows,), dtype=np.intp)
                return missing_arr.take(empty_arr, allow_fill=True, fill_value=fill_value)
            elif isinstance(empty_dtype, ExtensionDtype):
                # TODO: no tests get here, a handful would if we disabled
                # the dt64tz special-case above (which is faster)
                cls = empty_dtype.construct_array_type()
                missing_arr = cls._empty(shape=self.shape, dtype=empty_dtype)
                missing_arr[:] = fill_value
                return missing_arr
            else:
                # NB: we should never get here with empty_dtype integer or bool;
                # if we did, the missing_arr.fill would cast to gibberish
                missing_arr = np.empty(self.shape, dtype=empty_dtype)
                missing_arr.fill(fill_value)
                return missing_arr

    if (not self.indexers) and (not self.block._can_consolidate):
        # preserve these for validation in concat_compat
        return self.block.values

    if self.block.is_bool:
        # External code requested filling/upcasting, bool values must
        # be upcasted to object to avoid being upcasted to numeric.
        values = self.block.astype(np.object_).values
    else:
        # No dtype upcasting is done here, it will be performed during
        # concatenation itself.
        values = self.block.values

    if not self.indexers:
        # If there's no indexing to be done, we want to signal outside
        # code that this array must be copied explicitly. This is done
        # by returning a view and checking `retval.base`.
        values = values.view()
    else:
        # apply the per-axis reindexers
        for ax, indexer in self.indexers.items():
            values = algos.take_nd(values, indexer, axis=ax)

    return values