def na_op(x, y): try: result = op(x, y) except TypeError: if isinstance(y, list): y = construct_1d_object_array_from_listlike(y) if isinstance(y, (np.ndarray, ABCSeries)): if (is_bool_dtype(x.dtype) and is_bool_dtype(y.dtype)): result = op(x, y) # when would this be hit? else: x = _ensure_object(x) y = _ensure_object(y) result = lib.vec_binop(x, y, op) else: # let null fall thru if not isna(y): y = bool(y) try: result = lib.scalar_binop(x, y, op) except: msg = ("cannot compare a dtyped [{dtype}] array " "with a scalar of type [{type}]" ).format(dtype=x.dtype, type=type(y).__name__) raise TypeError(msg) return result
def test_is_bool_dtype(): assert not com.is_bool_dtype(int) assert not com.is_bool_dtype(str) assert not com.is_bool_dtype(pd.Series([1, 2])) assert not com.is_bool_dtype(np.array(['a', 'b'])) assert not com.is_bool_dtype(pd.Index(['a', 'b'])) assert com.is_bool_dtype(bool) assert com.is_bool_dtype(np.bool) assert com.is_bool_dtype(np.array([True, False])) assert com.is_bool_dtype(pd.Index([True, False]))
def _evaluate_compare(self, other, op): """ We have been called because a comparison between 8 aware arrays. numpy >= 1.11 will now warn about NaT comparisons """ # Called by comparison methods when comparing datetimelike # with datetimelike if not isinstance(other, type(self)): # coerce to a similar object if not is_list_like(other): # scalar other = [other] elif lib.is_scalar(lib.item_from_zerodim(other)): # ndarray scalar other = [other.item()] other = type(self)(other) # compare result = op(self.asi8, other.asi8) # technically we could support bool dtyped Index # for now just return the indexing array directly mask = (self._isnan) | (other._isnan) filler = iNaT if is_bool_dtype(result): filler = False result[mask] = filler return result
def _isfinite(values): if is_datetime_or_timedelta_dtype(values): return isna(values) if (is_complex_dtype(values) or is_float_dtype(values) or is_integer_dtype(values) or is_bool_dtype(values)): return ~np.isfinite(values) return ~np.isfinite(values.astype('float64'))
def __array_wrap__(self, result, context=None): """ Gets called after a ufunc. Needs additional handling as PeriodIndex stores internal data as int dtype Replace this to __numpy_ufunc__ in future version """ if isinstance(context, tuple) and len(context) > 0: func = context[0] if func is np.add: pass elif func is np.subtract: name = self.name left = context[1][0] right = context[1][1] if (isinstance(left, PeriodIndex) and isinstance(right, PeriodIndex)): name = left.name if left.name == right.name else None return Index(result, name=name) elif isinstance(left, Period) or isinstance(right, Period): return Index(result, name=name) elif isinstance(func, np.ufunc): if 'M->M' not in func.types: msg = "ufunc '{0}' not supported for the PeriodIndex" # This should be TypeError, but TypeError cannot be raised # from here because numpy catches. raise ValueError(msg.format(func.__name__)) if is_bool_dtype(result): return result # the result is object dtype array of Period # cannot pass _simple_new as it is return type(self)(result, freq=self.freq, name=self.name)
def nanall(values, axis=None, skipna=True, mask=None): """ Check if all elements along an axis evaluate to True. Parameters ---------- values : ndarray axis: int, optional skipna : bool, default True mask : ndarray[bool], optional nan-mask if known Returns ------- result : bool Examples -------- >>> import pandas.core.nanops as nanops >>> s = pd.Series([1, 2, np.nan]) >>> nanops.nanall(s) True >>> import pandas.core.nanops as nanops >>> s = pd.Series([1, 0]) >>> nanops.nanall(s) False """ if is_bool_dtype(values.dtype) and mask is None: # Assume np.bool cannot store NaNs skipna = False values, _, _, _, _ = _get_values(values, skipna, True, copy=skipna, mask=mask, compute_mask=False) return values.all(axis)
def wrapper(self, other): msg = "cannot compare a TimedeltaIndex with type {0}" func = getattr(super(TimedeltaIndex, self), opname) if _is_convertible_to_td(other) or other is NaT: try: other = _to_m8(other) except ValueError: # failed to parse as timedelta raise TypeError(msg.format(type(other))) result = func(other) if isna(other): result.fill(nat_result) else: if not is_list_like(other): raise TypeError(msg.format(type(other))) other = TimedeltaIndex(other).values result = func(other) result = _values_from_object(result) if isinstance(other, Index): o_mask = other.values.view('i8') == iNaT else: o_mask = other.view('i8') == iNaT if o_mask.any(): result[o_mask] = nat_result if self.hasnans: result[self._isnan] = nat_result # support of bool dtype indexers if is_bool_dtype(result): return result return Index(result)
def wrapper(self, other): msg = "cannot compare a {cls} with type {typ}" func = getattr(super(TimedeltaIndex, self), opname) if _is_convertible_to_td(other) or other is NaT: try: other = _to_m8(other) except ValueError: # failed to parse as timedelta raise TypeError(msg.format(cls=type(self).__name__, typ=type(other).__name__)) result = func(other) if isna(other): result.fill(nat_result) elif not is_list_like(other): raise TypeError(msg.format(cls=type(self).__name__, typ=type(other).__name__)) else: other = TimedeltaIndex(other).values result = func(other) result = com._values_from_object(result) o_mask = np.array(isna(other)) if o_mask.any(): result[o_mask] = nat_result if self.hasnans: result[self._isnan] = nat_result # support of bool dtype indexers if is_bool_dtype(result): return result return Index(result)
def _evaluate_compare(self, other, op): result = self._eadata._evaluate_compare(other, op) if is_bool_dtype(result): return result try: return Index(result) except TypeError: return result
def _evaluate_compare(self, other, op): result = DatetimeLikeArrayMixin._evaluate_compare(self, other, op) if is_bool_dtype(result): return result try: return Index(result) except TypeError: return result
def _get_values(values, skipna, fill_value=None, fill_value_typ=None, isfinite=False, copy=True, mask=None, compute_mask=True): """ utility to get the values view, mask, dtype if necessary copy and mask using the specified fill_value copy = True will force the copy """ if skipna: compute_mask = True if is_datetime64tz_dtype(values): # com.values_from_object returns M8[ns] dtype instead of tz-aware, # so this case must be handled separately from the rest dtype = values.dtype values = getattr(values, "_values", values) else: values = com.values_from_object(values) dtype = values.dtype if mask is None and compute_mask: if isfinite: mask = _isfinite(values) else: mask = isna(values) if is_datetime_or_timedelta_dtype(values) or is_datetime64tz_dtype(values): # changing timedelta64/datetime64 to int64 needs to happen after # finding `mask` above values = getattr(values, "asi8", values) values = values.view(np.int64) dtype_ok = _na_ok_dtype(dtype) # get our fill value (in case we need to provide an alternative # dtype for it) fill_value = _get_fill_value(dtype, fill_value=fill_value, fill_value_typ=fill_value_typ) if skipna: if copy: values = values.copy() if dtype_ok: np.putmask(values, mask, fill_value) # promote if needed else: values, changed = maybe_upcast_putmask(values, mask, fill_value) elif copy: values = values.copy() # return a platform independent precision dtype dtype_max = dtype if is_integer_dtype(dtype) or is_bool_dtype(dtype): dtype_max = np.int64 elif is_float_dtype(dtype): dtype_max = np.float64 return values, mask, dtype, dtype_max, fill_value
def _convert_for_op(self, value): """ Convert value to be insertable to ndarray """ if is_bool(value) or is_bool_dtype(value): # force conversion to object # so we don't lose the bools raise TypeError return value
def _wrap_result(name, data, sparse_index, fill_value, dtype=None): """ wrap op result to have correct dtype """ if name in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'): dtype = np.bool if is_bool_dtype(dtype): # fill_value may be np.bool_ fill_value = bool(fill_value) return SparseArray(data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype)
def astype(self, dtype=None, copy=True): dtype = np.dtype(dtype) sp_values = astype_nansafe(self.sp_values, dtype, copy=copy) try: if is_bool_dtype(dtype): # to avoid np.bool_ dtype fill_value = bool(self.fill_value) else: fill_value = dtype.type(self.fill_value) except ValueError: msg = 'unable to coerce current fill_value {0} to {1} dtype' raise ValueError(msg.format(self.fill_value, dtype)) return self._simple_new(sp_values, self.sp_index, fill_value=fill_value)
def f(x, y): xmask = isna(x) ymask = isna(y) mask = xmask | ymask with np.errstate(all='ignore'): result = op(x, y) if mask.any(): if is_bool_dtype(result): result = result.astype('O') np.putmask(result, mask, np.nan) return result
def test_from_to_scipy(spmatrix, index, columns, fill_value, dtype): # GH 4343 tm.skip_if_no_package('scipy') # Make one ndarray and from it one sparse matrix, both to be used for # constructing frames and comparing results arr = np.eye(3, dtype=dtype) # GH 16179 arr[0, 1] = dtype(2) try: spm = spmatrix(arr) assert spm.dtype == arr.dtype except (TypeError, AssertionError): # If conversion to sparse fails for this spmatrix type and arr.dtype, # then the combination is not currently supported in NumPy, so we # can just skip testing it thoroughly return sdf = pd.SparseDataFrame(spm, index=index, columns=columns, default_fill_value=fill_value) # Expected result construction is kind of tricky for all # dtype-fill_value combinations; easiest to cast to something generic # and except later on rarr = arr.astype(object) rarr[arr == 0] = np.nan expected = pd.SparseDataFrame(rarr, index=index, columns=columns).fillna( fill_value if fill_value is not None else np.nan) # Assert frame is as expected sdf_obj = sdf.astype(object) tm.assert_sp_frame_equal(sdf_obj, expected) tm.assert_frame_equal(sdf_obj.to_dense(), expected.to_dense()) # Assert spmatrices equal assert dict(sdf.to_coo().todok()) == dict(spm.todok()) # Ensure dtype is preserved if possible was_upcast = ((fill_value is None or is_float(fill_value)) and not is_object_dtype(dtype) and not is_float_dtype(dtype)) res_dtype = (bool if is_bool_dtype(dtype) else float if was_upcast else dtype) tm.assert_contains_all(sdf.dtypes, {np.dtype(res_dtype)}) assert sdf.to_coo().dtype == res_dtype # However, adding a str column results in an upcast to object sdf['strings'] = np.arange(len(sdf)).astype(str) assert sdf.to_coo().dtype == np.object_
def is_bool_indexer(key): # type: (Any) -> bool """ Check whether `key` is a valid boolean indexer. Parameters ---------- key : Any Only list-likes may be considered boolean indexers. All other types are not considered a boolean indexer. For array-like input, boolean ndarrays or ExtensionArrays with ``_is_boolean`` set are considered boolean indexers. Returns ------- bool Raises ------ ValueError When the array is an object-dtype ndarray or ExtensionArray and contains missing values. """ na_msg = 'cannot index with vector containing NA / NaN values' if (isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or (is_array_like(key) and is_extension_array_dtype(key.dtype))): if key.dtype == np.object_: key = np.asarray(values_from_object(key)) if not lib.is_bool_array(key): if isna(key).any(): raise ValueError(na_msg) return False return True elif is_bool_dtype(key.dtype): # an ndarray with bool-dtype by definition has no missing values. # So we only need to check for NAs in ExtensionArrays if is_extension_array_dtype(key.dtype): if np.any(key.isna()): raise ValueError(na_msg) return True elif isinstance(key, list): try: arr = np.asarray(key) return arr.dtype == np.bool_ and len(arr) == len(key) except TypeError: # pragma: no cover return False return False
def __init__(self, values, mask, copy=False): if not (isinstance(values, np.ndarray) and is_integer_dtype(values.dtype)): raise TypeError("values should be integer numpy array. Use " "the 'integer_array' function instead") if not (isinstance(mask, np.ndarray) and is_bool_dtype(mask.dtype)): raise TypeError("mask should be boolean numpy array. Use " "the 'integer_array' function instead") if copy: values = values.copy() mask = mask.copy() self._data = values self._mask = mask
def as_json_table_type(x): """ Convert a NumPy / pandas type to its corresponding json_table. Parameters ---------- x : array or dtype Returns ------- t : str the Table Schema data types Notes ----- This table shows the relationship between NumPy / pandas dtypes, and Table Schema dtypes. ============== ================= Pandas type Table Schema type ============== ================= int64 integer float64 number bool boolean datetime64[ns] datetime timedelta64[ns] duration object str categorical any =============== ================= """ if is_integer_dtype(x): return 'integer' elif is_bool_dtype(x): return 'boolean' elif is_numeric_dtype(x): return 'number' elif (is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or is_period_dtype(x)): return 'datetime' elif is_timedelta64_dtype(x): return 'duration' elif is_categorical_dtype(x): return 'any' elif is_string_dtype(x): return 'string' else: return 'any'
def _get_values(values, skipna, fill_value=None, fill_value_typ=None, isfinite=False, copy=True, mask=None): """ utility to get the values view, mask, dtype if necessary copy and mask using the specified fill_value copy = True will force the copy """ values = com.values_from_object(values) if mask is None: if isfinite: mask = _isfinite(values) else: mask = isna(values) dtype = values.dtype dtype_ok = _na_ok_dtype(dtype) # get our fill value (in case we need to provide an alternative # dtype for it) fill_value = _get_fill_value(dtype, fill_value=fill_value, fill_value_typ=fill_value_typ) if skipna: if copy: values = values.copy() if dtype_ok: np.putmask(values, mask, fill_value) # promote if needed else: values, changed = maybe_upcast_putmask(values, mask, fill_value) elif copy: values = values.copy() values = _view_if_needed(values) # return a platform independent precision dtype dtype_max = dtype if is_integer_dtype(dtype) or is_bool_dtype(dtype): dtype_max = np.int64 elif is_float_dtype(dtype): dtype_max = np.float64 return values, mask, dtype, dtype_max
def _wrap_result(name, data, sparse_index, fill_value, dtype=None): """ wrap op result to have correct dtype """ if name.startswith('__'): # e.g. __eq__ --> eq name = name[2:-2] if name in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'): dtype = np.bool fill_value = lib.item_from_zerodim(fill_value) if is_bool_dtype(dtype): # fill_value may be np.bool_ fill_value = bool(fill_value) return SparseArray(data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype)
def get_dtype_kinds(l): """ Parameters ---------- l : list of arrays Returns ------- a set of kinds that exist in this list of arrays """ typs = set() for arr in l: dtype = arr.dtype if is_categorical_dtype(dtype): typ = 'category' elif is_sparse(arr): typ = 'sparse' elif isinstance(arr, ABCRangeIndex): typ = 'range' elif is_datetimetz(arr): # if to_concat contains different tz, # the result must be object dtype typ = str(arr.dtype) elif is_datetime64_dtype(dtype): typ = 'datetime' elif is_timedelta64_dtype(dtype): typ = 'timedelta' elif is_object_dtype(dtype): typ = 'object' elif is_bool_dtype(dtype): typ = 'bool' elif is_period_dtype(dtype): typ = str(arr.dtype) elif is_interval_dtype(dtype): typ = str(arr.dtype) else: typ = dtype.kind typs.add(typ) return typs
def _ensure_numeric(x): if isinstance(x, np.ndarray): if is_integer_dtype(x) or is_bool_dtype(x): x = x.astype(np.float64) elif is_object_dtype(x): try: x = x.astype(np.complex128) except: x = x.astype(np.float64) else: if not np.any(x.imag): x = x.real elif not (is_float(x) or is_integer(x) or is_complex(x)): try: x = float(x) except Exception: try: x = complex(x) except Exception: raise TypeError('Could not convert %s to numeric' % str(x)) return x
def __getitem__(self, key): if isinstance(key, tuple): if len(key) > 1: raise IndexError("too many indices for array.") key = key[0] if is_integer(key): return self._get_val_at(key) elif isinstance(key, tuple): data_slice = self.values[key] elif isinstance(key, slice): # special case to preserve dtypes if key == slice(None): return self.copy() # TODO: this logic is surely elsewhere # TODO: this could be more efficient indices = np.arange(len(self), dtype=np.int32)[key] return self.take(indices) else: # TODO: I think we can avoid densifying when masking a # boolean SparseArray with another. Need to look at the # key's fill_value for True / False, and then do an intersection # on the indicies of the sp_values. if isinstance(key, SparseArray): if is_bool_dtype(key): key = key.to_dense() else: key = np.asarray(key) if com.is_bool_indexer(key) and len(self) == len(key): # TODO(numpy 1.11): Remove this asarray. # Old NumPy didn't treat array-like as boolean masks. key = np.asarray(key) return self.take(np.arange(len(key), dtype=np.int32)[key]) elif hasattr(key, '__len__'): return self.take(key) else: raise ValueError("Cannot slice with '{}'".format(key)) return type(self)(data_slice, kind=self.kind)
def __getitem__(self, key): """ """ if is_integer(key): return self._get_val_at(key) elif isinstance(key, tuple): data_slice = self.values[key] else: if isinstance(key, SparseArray): if is_bool_dtype(key): key = key.to_dense() else: key = np.asarray(key) if hasattr(key, '__len__') and len(self) != len(key): return self.take(key) else: data_slice = self.values[key] return self._constructor(data_slice)
def coerce_to_array(values, dtype=None, mask=None, copy: bool = False) -> tuple[np.ndarray, np.ndarray]: """ Coerce the input values array to numpy arrays with a mask. Parameters ---------- values : 1D list-like dtype : float dtype mask : bool 1D array, optional copy : bool, default False if True, copy the input Returns ------- tuple of (values, mask) """ # if values is floating numpy array, preserve its dtype if dtype is None and hasattr(values, "dtype"): if is_float_dtype(values.dtype): dtype = values.dtype if dtype is not None: if isinstance(dtype, str) and dtype.startswith("Float"): # Avoid DeprecationWarning from NumPy about np.dtype("Float64") # https://github.com/numpy/numpy/pull/7476 dtype = dtype.lower() if not issubclass(type(dtype), FloatingDtype): try: dtype = FLOAT_STR_TO_DTYPE[str(np.dtype(dtype))] except KeyError as err: raise ValueError(f"invalid dtype specified {dtype}") from err if isinstance(values, FloatingArray): values, mask = values._data, values._mask if dtype is not None: values = values.astype(dtype.numpy_dtype, copy=False) if copy: values = values.copy() mask = mask.copy() return values, mask values = np.array(values, copy=copy) if is_object_dtype(values.dtype): inferred_type = lib.infer_dtype(values, skipna=True) if inferred_type == "empty": pass elif inferred_type == "boolean": raise TypeError( f"{values.dtype} cannot be converted to a FloatingDtype") elif is_bool_dtype(values) and is_float_dtype(dtype): values = np.array(values, dtype=float, copy=copy) elif not (is_integer_dtype(values) or is_float_dtype(values)): raise TypeError( f"{values.dtype} cannot be converted to a FloatingDtype") if values.ndim != 1: raise TypeError("values must be a 1D list-like") if mask is None: mask = libmissing.is_numeric_na(values) else: assert len(mask) == len(values) if not mask.ndim == 1: raise TypeError("mask must be a 1D list-like") # infer dtype if needed if dtype is None: dtype = np.dtype("float64") else: dtype = dtype.type # if we are float, let's make sure that we can # safely cast # we copy as need to coerce here # TODO should this be a safe cast? if mask.any(): values = values.copy() values[mask] = np.nan values = values.astype(dtype, copy=False) # , casting="safe") return values, mask
def _get_values( values: np.ndarray, skipna: bool, fill_value: Any = None, fill_value_typ: Optional[str] = None, mask: Optional[np.ndarray] = None, ) -> Tuple[np.ndarray, Optional[np.ndarray], np.dtype, np.dtype, Any]: """ Utility to get the values view, mask, dtype, dtype_max, and fill_value. If both mask and fill_value/fill_value_typ are not None and skipna is True, the values array will be copied. For input arrays of boolean or integer dtypes, copies will only occur if a precomputed mask, a fill_value/fill_value_typ, and skipna=True are provided. Parameters ---------- values : ndarray input array to potentially compute mask for skipna : bool boolean for whether NaNs should be skipped fill_value : Any value to fill NaNs with fill_value_typ : str Set to '+inf' or '-inf' to handle dtype-specific infinities mask : Optional[np.ndarray] nan-mask if known Returns ------- values : ndarray Potential copy of input value array mask : Optional[ndarray[bool]] Mask for values, if deemed necessary to compute dtype : np.dtype dtype for values dtype_max : np.dtype platform independent dtype fill_value : Any fill value used """ # In _get_values is only called from within nanops, and in all cases # with scalar fill_value. This guarantee is important for the # np.where call below assert is_scalar(fill_value) values = extract_array(values, extract_numpy=True) mask = _maybe_get_mask(values, skipna, mask) dtype = values.dtype datetimelike = False if needs_i8_conversion(values.dtype): # changing timedelta64/datetime64 to int64 needs to happen after # finding `mask` above values = np.asarray(values.view("i8")) datetimelike = True dtype_ok = _na_ok_dtype(dtype) # get our fill value (in case we need to provide an alternative # dtype for it) fill_value = _get_fill_value(dtype, fill_value=fill_value, fill_value_typ=fill_value_typ) if skipna and (mask is not None) and (fill_value is not None): if mask.any(): if dtype_ok or datetimelike: values = values.copy() np.putmask(values, mask, fill_value) else: # np.where will promote if needed values = np.where(~mask, values, fill_value) # return a platform independent precision dtype dtype_max = dtype if is_integer_dtype(dtype) or is_bool_dtype(dtype): dtype_max = np.dtype(np.int64) elif is_float_dtype(dtype): dtype_max = np.dtype(np.float64) return values, mask, dtype, dtype_max, fill_value
def _cython_operation(self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs) -> Tuple[np.ndarray, Optional[List[str]]]: """ Returns the values of a cython operation as a Tuple of [data, names]. Names is only useful when dealing with 2D results, like ohlc (see self._name_functions). """ assert kind in ["transform", "aggregate"] orig_values = values if values.ndim > 2: raise NotImplementedError( "number of dimensions is currently limited to 2") elif values.ndim == 2: # Note: it is *not* the case that axis is always 0 for 1-dim values, # as we can have 1D ExtensionArrays that we need to treat as 2D assert axis == 1, axis # can we do this operation with our cython functions # if not raise NotImplementedError # we raise NotImplemented if this is an invalid operation # entirely, e.g. adding datetimes # categoricals are only 1d, so we # are not setup for dim transforming if is_categorical_dtype(values.dtype) or is_sparse(values.dtype): raise NotImplementedError(f"{values.dtype} dtype not supported") elif is_datetime64_any_dtype(values.dtype): if how in ["add", "prod", "cumsum", "cumprod"]: raise NotImplementedError( f"datetime64 type does not support {how} operations") elif is_timedelta64_dtype(values.dtype): if how in ["prod", "cumprod"]: raise NotImplementedError( f"timedelta64 type does not support {how} operations") if is_datetime64tz_dtype(values.dtype): # Cast to naive; we'll cast back at the end of the function # TODO: possible need to reshape? # TODO(EA2D):kludge can be avoided when 2D EA is allowed. values = values.view("M8[ns]") is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(values.dtype): values = ensure_int_or_float(values) elif is_integer_dtype(values): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int_or_float(values) elif is_numeric and not is_complex_dtype(values): values = ensure_float64(values) else: values = values.astype(object) arity = self._cython_arity.get(how, 1) vdim = values.ndim swapped = False if vdim == 1: values = values[:, None] out_shape = (self.ngroups, arity) else: if axis > 0: swapped = True assert axis == 1, axis values = values.T if arity > 1: raise NotImplementedError( "arity of more than 1 is not supported for the 'how' argument" ) out_shape = (self.ngroups, ) + values.shape[1:] func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric) if how == "rank": out_dtype = "float" else: if is_numeric: out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}" else: out_dtype = "object" codes, _, _ = self.group_info if kind == "aggregate": result = _maybe_fill(np.empty(out_shape, dtype=out_dtype), fill_value=np.nan) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate(result, counts, values, codes, func, min_count) elif kind == "transform": result = _maybe_fill(np.empty_like(values, dtype=out_dtype), fill_value=np.nan) # TODO: min_count result = self._transform(result, values, codes, func, is_datetimelike, **kwargs) if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype("float64") result[mask] = np.nan if kind == "aggregate" and self._filter_empty_groups and not counts.all( ): assert result.ndim != 2 result = result[counts > 0] if vdim == 1 and arity == 1: result = result[:, 0] names: Optional[List[str]] = self._name_functions.get(how, None) if swapped: result = result.swapaxes(0, axis) if is_datetime64tz_dtype(orig_values.dtype) or is_period_dtype( orig_values.dtype): # We need to use the constructors directly for these dtypes # since numpy won't recognize them # https://github.com/pandas-dev/pandas/issues/31471 result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype) elif is_datetimelike and kind == "aggregate": result = result.astype(orig_values.dtype) if is_extension_array_dtype(orig_values.dtype): result = maybe_cast_result(result=result, obj=orig_values, how=how) return result, names
def test_is_bool_dtype_sparse(): result = is_bool_dtype(Series(SparseArray([True, False]))) assert result is True
def _get_values( values: np.ndarray, skipna: bool, fill_value: Any = None, fill_value_typ: Optional[str] = None, mask: Optional[np.ndarray] = None, ) -> Tuple[np.ndarray, Optional[np.ndarray], np.dtype, np.dtype, Any]: """ Utility to get the values view, mask, dtype, dtype_max, and fill_value. If both mask and fill_value/fill_value_typ are not None and skipna is True, the values array will be copied. For input arrays of boolean or integer dtypes, copies will only occur if a precomputed mask, a fill_value/fill_value_typ, and skipna=True are provided. Parameters ---------- values : ndarray input array to potentially compute mask for skipna : bool boolean for whether NaNs should be skipped fill_value : Any value to fill NaNs with fill_value_typ : str Set to '+inf' or '-inf' to handle dtype-specific infinities mask : Optional[np.ndarray] nan-mask if known Returns ------- values : ndarray Potential copy of input value array mask : Optional[ndarray[bool]] Mask for values, if deemed necessary to compute dtype : dtype dtype for values dtype_max : dtype platform independent dtype fill_value : Any fill value used """ # In _get_values is only called from within nanops, and in all cases # with scalar fill_value. This guarantee is important for the # maybe_upcast_putmask call below assert is_scalar(fill_value) mask = _maybe_get_mask(values, skipna, mask) if is_datetime64tz_dtype(values): # lib.values_from_object returns M8[ns] dtype instead of tz-aware, # so this case must be handled separately from the rest dtype = values.dtype values = getattr(values, "_values", values) else: values = lib.values_from_object(values) dtype = values.dtype if is_datetime_or_timedelta_dtype(values) or is_datetime64tz_dtype(values): # changing timedelta64/datetime64 to int64 needs to happen after # finding `mask` above values = getattr(values, "asi8", values) values = values.view(np.int64) dtype_ok = _na_ok_dtype(dtype) # get our fill value (in case we need to provide an alternative # dtype for it) fill_value = _get_fill_value(dtype, fill_value=fill_value, fill_value_typ=fill_value_typ) copy = (mask is not None) and (fill_value is not None) if skipna and copy: values = values.copy() if dtype_ok: np.putmask(values, mask, fill_value) # promote if needed else: values, _ = maybe_upcast_putmask(values, mask, fill_value) # return a platform independent precision dtype dtype_max = dtype if is_integer_dtype(dtype) or is_bool_dtype(dtype): dtype_max = np.int64 elif is_float_dtype(dtype): dtype_max = np.float64 return values, mask, dtype, dtype_max, fill_value
def _cython_operation(self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs) -> np.ndarray: """ Returns the values of a cython operation. """ orig_values = values assert kind in ["transform", "aggregate"] if values.ndim > 2: raise NotImplementedError( "number of dimensions is currently limited to 2") elif values.ndim == 2: # Note: it is *not* the case that axis is always 0 for 1-dim values, # as we can have 1D ExtensionArrays that we need to treat as 2D assert axis == 1, axis # can we do this operation with our cython functions # if not raise NotImplementedError self._disallow_invalid_ops(values, how) if is_extension_array_dtype(values.dtype): return self._ea_wrap_cython_operation(kind, values, how, axis, min_count, **kwargs) is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(values.dtype): values = ensure_int_or_float(values) elif is_integer_dtype(values): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int_or_float(values) elif is_numeric and not is_complex_dtype(values): values = ensure_float64(ensure_float(values)) else: values = values.astype(object) arity = self._cython_arity.get(how, 1) vdim = values.ndim swapped = False if vdim == 1: values = values[:, None] out_shape = (self.ngroups, arity) else: if axis > 0: swapped = True assert axis == 1, axis values = values.T if arity > 1: raise NotImplementedError( "arity of more than 1 is not supported for the 'how' argument" ) out_shape = (self.ngroups, ) + values.shape[1:] func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric) if how == "rank": out_dtype = "float" else: if is_numeric: out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}" else: out_dtype = "object" codes, _, _ = self.group_info if kind == "aggregate": result = maybe_fill(np.empty(out_shape, dtype=out_dtype), fill_value=np.nan) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate(result, counts, values, codes, func, min_count) elif kind == "transform": result = maybe_fill(np.empty_like(values, dtype=out_dtype), fill_value=np.nan) # TODO: min_count result = self._transform(result, values, codes, func, is_datetimelike, **kwargs) if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype("float64") result[mask] = np.nan if kind == "aggregate" and self._filter_empty_groups and not counts.all( ): assert result.ndim != 2 result = result[counts > 0] if vdim == 1 and arity == 1: result = result[:, 0] if swapped: result = result.swapaxes(0, axis) if how not in base.cython_cast_blocklist: # e.g. if we are int64 and need to restore to datetime64/timedelta64 # "rank" is the only member of cython_cast_blocklist we get here dtype = maybe_cast_result_dtype(orig_values.dtype, how) result = maybe_downcast_to_dtype(result, dtype) return result
def is_bool(self) -> bool: return is_bool_dtype(self.pd_dtype)
def _str_map(self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True): # TODO: de-duplicate with StringArray method. This method is moreless copy and # paste. from pandas.arrays import ( BooleanArray, IntegerArray, ) if dtype is None: dtype = self.dtype if na_value is None: na_value = self.dtype.na_value mask = isna(self) arr = np.asarray(self) if is_integer_dtype(dtype) or is_bool_dtype(dtype): constructor: type[IntegerArray] | type[BooleanArray] if is_integer_dtype(dtype): constructor = IntegerArray else: constructor = BooleanArray na_value_is_na = isna(na_value) if na_value_is_na: na_value = 1 result = lib.map_infer_mask( arr, f, mask.view("uint8"), convert=False, na_value=na_value, # error: Argument 1 to "dtype" has incompatible type # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected # "Type[object]" dtype=np.dtype(dtype), # type: ignore[arg-type] ) if not na_value_is_na: mask[:] = False return constructor(result, mask) elif is_string_dtype(dtype) and not is_object_dtype(dtype): # i.e. StringDtype result = lib.map_infer_mask(arr, f, mask.view("uint8"), convert=False, na_value=na_value) result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True) return type(self)(result) else: # This is when the result type is object. We reach this when # -> We know the result type is truly object (e.g. .encode returns bytes # or .findall returns a list). # -> We don't know the result type. E.g. `.get` can return anything. return lib.map_infer_mask(arr, f, mask.view("uint8"))
def test_is_bool_dtype_sparse(): result = is_bool_dtype(pd.SparseSeries([True, False])) assert result is True
def test_is_bool_dtype(dtype, expected): result = is_bool_dtype(dtype) assert result is expected
def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): assert kind in ["transform", "aggregate"] orig_values = values # can we do this operation with our cython functions # if not raise NotImplementedError # we raise NotImplemented if this is an invalid operation # entirely, e.g. adding datetimes # categoricals are only 1d, so we # are not setup for dim transforming if is_categorical_dtype(values) or is_sparse(values): raise NotImplementedError("{} dtype not supported".format( values.dtype)) elif is_datetime64_any_dtype(values): if how in ["add", "prod", "cumsum", "cumprod"]: raise NotImplementedError( "datetime64 type does not support {} operations".format( how)) elif is_timedelta64_dtype(values): if how in ["prod", "cumprod"]: raise NotImplementedError( "timedelta64 type does not support {} operations".format( how)) if is_datetime64tz_dtype(values.dtype): # Cast to naive; we'll cast back at the end of the function # TODO: possible need to reshape? kludge can be avoided when # 2D EA is allowed. values = values.view("M8[ns]") is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(values.dtype): values = ensure_float64(values) elif is_integer_dtype(values): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int_or_float(values) elif is_numeric and not is_complex_dtype(values): values = ensure_float64(values) else: values = values.astype(object) arity = self._cython_arity.get(how, 1) vdim = values.ndim swapped = False if vdim == 1: values = values[:, None] out_shape = (self.ngroups, arity) else: if axis > 0: swapped = True assert axis == 1, axis values = values.T if arity > 1: raise NotImplementedError( "arity of more than 1 is not supported for the 'how' argument" ) out_shape = (self.ngroups, ) + values.shape[1:] try: func = self._get_cython_function(kind, how, values, is_numeric) except NotImplementedError: if is_numeric: values = ensure_float64(values) func = self._get_cython_function(kind, how, values, is_numeric) else: raise if how == "rank": out_dtype = "float" else: if is_numeric: out_dtype = "{kind}{itemsize}".format( kind=values.dtype.kind, itemsize=values.dtype.itemsize) else: out_dtype = "object" labels, _, _ = self.group_info if kind == "aggregate": result = _maybe_fill(np.empty(out_shape, dtype=out_dtype), fill_value=np.nan) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate( result, counts, values, labels, func, is_numeric, is_datetimelike, min_count, ) elif kind == "transform": result = _maybe_fill(np.empty_like(values, dtype=out_dtype), fill_value=np.nan) # TODO: min_count result = self._transform(result, values, labels, func, is_numeric, is_datetimelike, **kwargs) if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype("float64") result[mask] = np.nan if kind == "aggregate" and self._filter_empty_groups and not counts.all( ): assert result.ndim != 2 result = result[counts > 0] if vdim == 1 and arity == 1: result = result[:, 0] if how in self._name_functions: # TODO names = self._name_functions[how]() else: names = None if swapped: result = result.swapaxes(0, axis) if is_datetime64tz_dtype(orig_values.dtype): result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype) elif is_datetimelike and kind == "aggregate": result = result.astype(orig_values.dtype) return result, names
def _call_cython_op( self, values: np.ndarray, # np.ndarray[ndim=2] *, min_count: int, ngroups: int, comp_ids: np.ndarray, mask: np.ndarray | None, result_mask: np.ndarray | None, **kwargs, ) -> np.ndarray: # np.ndarray[ndim=2] orig_values = values dtype = values.dtype is_numeric = is_numeric_dtype(dtype) is_datetimelike = needs_i8_conversion(dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(dtype): values = values.astype("int64") elif is_integer_dtype(dtype): # e.g. uint8 -> uint64, int16 -> int64 dtype_str = dtype.kind + "8" values = values.astype(dtype_str, copy=False) elif is_numeric: if not is_complex_dtype(dtype): values = ensure_float64(values) values = values.T if mask is not None: mask = mask.T if result_mask is not None: result_mask = result_mask.T out_shape = self._get_output_shape(ngroups, values) func, values = self.get_cython_func_and_vals(values, is_numeric) out_dtype = self.get_out_dtype(values.dtype) result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) if self.kind == "aggregate": counts = np.zeros(ngroups, dtype=np.int64) if self.how in ["min", "max"]: func( result, counts, values, comp_ids, min_count, mask=mask, result_mask=result_mask, is_datetimelike=is_datetimelike, ) else: func(result, counts, values, comp_ids, min_count) else: # TODO: min_count if self.uses_mask(): func( result, values, comp_ids, ngroups, is_datetimelike, mask=mask, **kwargs, ) else: func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) if self.kind == "aggregate": # i.e. counts is defined. Locations where count<min_count # need to have the result set to np.nan, which may require casting, # see GH#40767 if is_integer_dtype(result.dtype) and not is_datetimelike: cutoff = max(1, min_count) empty_groups = counts < cutoff if empty_groups.any(): # Note: this conversion could be lossy, see GH#40767 result = result.astype("float64") result[empty_groups] = np.nan result = result.T if self.how not in self.cast_blocklist: # e.g. if we are int64 and need to restore to datetime64/timedelta64 # "rank" is the only member of cast_blocklist we get here res_dtype = self._get_result_dtype(orig_values.dtype) op_result = maybe_downcast_to_dtype(result, res_dtype) else: op_result = result # error: Incompatible return value type (got "Union[ExtensionArray, ndarray]", # expected "ndarray") return op_result # type: ignore[return-value]
def coerce_to_array( values, dtype, mask=None, copy: bool = False, ) -> Tuple[np.ndarray, np.ndarray]: """ Coerce the input values array to numpy arrays with a mask Parameters ---------- values : 1D list-like dtype : integer dtype mask : bool 1D array, optional copy : bool, default False if True, copy the input Returns ------- tuple of (values, mask) """ # if values is integer numpy array, preserve it's dtype if dtype is None and hasattr(values, "dtype"): if is_integer_dtype(values.dtype): dtype = values.dtype if dtype is not None: if isinstance(dtype, str) and (dtype.startswith("Int") or dtype.startswith("UInt")): # Avoid DeprecationWarning from NumPy about np.dtype("Int64") # https://github.com/numpy/numpy/pull/7476 dtype = dtype.lower() if not issubclass(type(dtype), _IntegerDtype): try: dtype = _dtypes[str(np.dtype(dtype))] except KeyError: raise ValueError(f"invalid dtype specified {dtype}") if isinstance(values, IntegerArray): values, mask = values._data, values._mask if dtype is not None: values = values.astype(dtype.numpy_dtype, copy=False) if copy: values = values.copy() mask = mask.copy() return values, mask values = np.array(values, copy=copy) if is_object_dtype(values): inferred_type = lib.infer_dtype(values, skipna=True) if inferred_type == "empty": values = np.empty(len(values)) values.fill(np.nan) elif inferred_type not in [ "floating", "integer", "mixed-integer", "integer-na", "mixed-integer-float", ]: raise TypeError( f"{values.dtype} cannot be converted to an IntegerDtype") elif is_bool_dtype(values) and is_integer_dtype(dtype): values = np.array(values, dtype=int, copy=copy) elif not (is_integer_dtype(values) or is_float_dtype(values)): raise TypeError( f"{values.dtype} cannot be converted to an IntegerDtype") if mask is None: mask = isna(values) else: assert len(mask) == len(values) if not values.ndim == 1: raise TypeError("values must be a 1D list-like") if not mask.ndim == 1: raise TypeError("mask must be a 1D list-like") # infer dtype if needed if dtype is None: dtype = np.dtype("int64") else: dtype = dtype.type # if we are float, let's make sure that we can # safely cast # we copy as need to coerce here if mask.any(): values = values.copy() values[mask] = 1 values = safe_cast(values, dtype, copy=False) else: values = safe_cast(values, dtype, copy=False) return values, mask
def _is_boolean(self): return is_bool_dtype(self.subtype)
def test_is_boolean(self, categories, expected): cat = Categorical(categories) assert cat.dtype._is_boolean is expected assert is_bool_dtype(cat) is expected assert is_bool_dtype(cat.dtype) is expected
def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): assert kind in ['transform', 'aggregate'] # can we do this operation with our cython functions # if not raise NotImplementedError # we raise NotImplemented if this is an invalid operation # entirely, e.g. adding datetimes # categoricals are only 1d, so we # are not setup for dim transforming if is_categorical_dtype(values): raise NotImplementedError( "categoricals are not support in cython ops ATM") elif is_datetime64_any_dtype(values): if how in ['add', 'prod', 'cumsum', 'cumprod']: raise NotImplementedError( "datetime64 type does not support {} " "operations".format(how)) elif is_timedelta64_dtype(values): if how in ['prod', 'cumprod']: raise NotImplementedError( "timedelta64 type does not support {} " "operations".format(how)) arity = self._cython_arity.get(how, 1) vdim = values.ndim swapped = False if vdim == 1: values = values[:, None] out_shape = (self.ngroups, arity) else: if axis > 0: swapped = True values = values.swapaxes(0, axis) if arity > 1: raise NotImplementedError("arity of more than 1 is not " "supported for the 'how' argument") out_shape = (self.ngroups, ) + values.shape[1:] is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) if is_datetimelike: values = values.view('int64') is_numeric = True elif is_bool_dtype(values.dtype): values = ensure_float64(values) elif is_integer_dtype(values): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int64_or_float64(values) elif is_numeric and not is_complex_dtype(values): values = ensure_float64(values) else: values = values.astype(object) try: func = self._get_cython_function(kind, how, values, is_numeric) except NotImplementedError: if is_numeric: values = ensure_float64(values) func = self._get_cython_function(kind, how, values, is_numeric) else: raise if how == 'rank': out_dtype = 'float' else: if is_numeric: out_dtype = '%s%d' % (values.dtype.kind, values.dtype.itemsize) else: out_dtype = 'object' labels, _, _ = self.group_info if kind == 'aggregate': result = _maybe_fill(np.empty(out_shape, dtype=out_dtype), fill_value=np.nan) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate(result, counts, values, labels, func, is_numeric, is_datetimelike, min_count) elif kind == 'transform': result = _maybe_fill(np.empty_like(values, dtype=out_dtype), fill_value=np.nan) # TODO: min_count result = self._transform(result, values, labels, func, is_numeric, is_datetimelike, **kwargs) if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype('float64') result[mask] = np.nan if (kind == 'aggregate' and self._filter_empty_groups and not counts.all()): if result.ndim == 2: try: result = lib.row_bool_subset(result, (counts > 0).view(np.uint8)) except ValueError: result = lib.row_bool_subset_object( ensure_object(result), (counts > 0).view(np.uint8)) else: result = result[counts > 0] if vdim == 1 and arity == 1: result = result[:, 0] if how in self._name_functions: # TODO names = self._name_functions[how]() else: names = None if swapped: result = result.swapaxes(0, axis) return result, names
def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, converters=None, dtypes=None): result = {} for c, values in dct.items(): conv_f = None if converters is None else converters.get(c, None) if isinstance(dtypes, dict): cast_type = dtypes.get(c, None) else: # single dtype or None cast_type = dtypes if self.na_filter: col_na_values, col_na_fvalues = _get_na_values( c, na_values, na_fvalues, self.keep_default_na) else: col_na_values, col_na_fvalues = set(), set() if conv_f is not None: # conv_f applied to data before inference if cast_type is not None: warnings.warn( ("Both a converter and dtype were specified " f"for column {c} - only the converter will be used"), ParserWarning, stacklevel=7, ) try: values = lib.map_infer(values, conv_f) except ValueError: mask = algorithms.isin(values, list(na_values)).view(np.uint8) values = lib.map_infer_mask(values, conv_f, mask) cvals, na_count = self._infer_types(values, set(col_na_values) | col_na_fvalues, try_num_bool=False) else: is_ea = is_extension_array_dtype(cast_type) is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type) # skip inference if specified dtype is object # or casting to an EA try_num_bool = not (cast_type and is_str_or_ea_dtype) # general type inference and conversion cvals, na_count = self._infer_types( values, set(col_na_values) | col_na_fvalues, try_num_bool) # type specified in dtype param or cast_type is an EA if cast_type and (not is_dtype_equal(cvals, cast_type) or is_extension_array_dtype(cast_type)): if not is_ea and na_count > 0: try: if is_bool_dtype(cast_type): raise ValueError( f"Bool column has NA values in column {c}") except (AttributeError, TypeError): # invalid input to is_bool_dtype pass cast_type = pandas_dtype(cast_type) cvals = self._cast_types(cvals, cast_type, c) result[c] = cvals if verbose and na_count: print(f"Filled {na_count} NA values in column {c!s}") return result
def get_new_values(self, values, fill_value=None): if values.ndim == 1: values = values[:, np.newaxis] sorted_values = self._make_sorted_values(values) # place the values length, width = self.full_shape stride = values.shape[1] result_width = width * stride result_shape = (length, result_width) mask = self.mask mask_all = mask.all() # we can simply reshape if we don't have a mask if mask_all and len(values): new_values = ( sorted_values.reshape(length, width, stride) .swapaxes(1, 2) .reshape(result_shape) ) new_mask = np.ones(result_shape, dtype=bool) return new_values, new_mask # if our mask is all True, then we can use our existing dtype if mask_all: dtype = values.dtype new_values = np.empty(result_shape, dtype=dtype) else: dtype, fill_value = maybe_promote(values.dtype, fill_value) new_values = np.empty(result_shape, dtype=dtype) new_values.fill(fill_value) new_mask = np.zeros(result_shape, dtype=bool) name = np.dtype(dtype).name # we need to convert to a basic dtype # and possibly coerce an input to our output dtype # e.g. ints -> floats if needs_i8_conversion(values): sorted_values = sorted_values.view("i8") new_values = new_values.view("i8") elif is_bool_dtype(values): sorted_values = sorted_values.astype("object") new_values = new_values.astype("object") else: sorted_values = sorted_values.astype(name, copy=False) # fill in our values & mask libreshape.unstack( sorted_values, mask.view("u1"), stride, length, width, new_values, new_mask.view("u1"), ) # reconstruct dtype if needed if needs_i8_conversion(values): new_values = new_values.view(values.dtype) return new_values, new_mask
def wrapper(self, other): result = getattr(TimedeltaArrayMixin, opname)(self, other) if is_bool_dtype(result): # support of bool dtype indexers return result return Index(result)
def get_new_values(self, values, fill_value=None): if values.ndim == 1: values = values[:, np.newaxis] sorted_values = self._make_sorted_values(values) # place the values length, width = self.full_shape stride = values.shape[1] result_width = width * stride result_shape = (length, result_width) mask = self.mask mask_all = self.mask_all # we can simply reshape if we don't have a mask if mask_all and len(values): # TODO: Under what circumstances can we rely on sorted_values # matching values? When that holds, we can slice instead # of take (in particular for EAs) new_values = (sorted_values.reshape( length, width, stride).swapaxes(1, 2).reshape(result_shape)) new_mask = np.ones(result_shape, dtype=bool) return new_values, new_mask # if our mask is all True, then we can use our existing dtype if mask_all: dtype = values.dtype new_values = np.empty(result_shape, dtype=dtype) name = np.dtype(dtype).name else: dtype, fill_value = maybe_promote(values.dtype, fill_value) if isinstance(dtype, ExtensionDtype): # GH#41875 cls = dtype.construct_array_type() new_values = cls._empty(result_shape, dtype=dtype) new_values[:] = fill_value name = dtype.name else: new_values = np.empty(result_shape, dtype=dtype) new_values.fill(fill_value) name = np.dtype(dtype).name new_mask = np.zeros(result_shape, dtype=bool) # we need to convert to a basic dtype # and possibly coerce an input to our output dtype # e.g. ints -> floats if needs_i8_conversion(values.dtype): sorted_values = sorted_values.view("i8") new_values = new_values.view("i8") elif is_bool_dtype(values.dtype): sorted_values = sorted_values.astype("object") new_values = new_values.astype("object") else: sorted_values = sorted_values.astype(name, copy=False) # fill in our values & mask libreshape.unstack( sorted_values, mask.view("u1"), stride, length, width, new_values, new_mask.view("u1"), ) # reconstruct dtype if needed if needs_i8_conversion(values.dtype): # view as datetime64 so we can wrap in DatetimeArray and use # DTA's view method new_values = new_values.view("M8[ns]") new_values = ensure_wrapped_if_datetimelike(new_values) new_values = new_values.view(values.dtype) return new_values, new_mask
def test_is_bool_dtype_numpy_error(): # GH39010 assert not com.is_bool_dtype("0 - Name")
def test_is_bool_dtype(): assert not com.is_bool_dtype(int) assert not com.is_bool_dtype(str) assert not com.is_bool_dtype(pd.Series([1, 2])) assert not com.is_bool_dtype(np.array(["a", "b"])) assert not com.is_bool_dtype(pd.Index(["a", "b"])) assert not com.is_bool_dtype("Int64") assert com.is_bool_dtype(bool) assert com.is_bool_dtype(np.bool_) assert com.is_bool_dtype(np.array([True, False])) assert com.is_bool_dtype(pd.Index([True, False])) assert com.is_bool_dtype(pd.BooleanDtype()) assert com.is_bool_dtype(pd.array([True, False, None], dtype="boolean")) assert com.is_bool_dtype("boolean")
def get_new_values(self): values = self.values # place the values length, width = self.full_shape stride = values.shape[1] result_width = width * stride result_shape = (length, result_width) mask = self.mask mask_all = mask.all() # we can simply reshape if we don't have a mask if mask_all and len(values): new_values = (self.sorted_values .reshape(length, width, stride) .swapaxes(1, 2) .reshape(result_shape) ) new_mask = np.ones(result_shape, dtype=bool) return new_values, new_mask # if our mask is all True, then we can use our existing dtype if mask_all: dtype = values.dtype new_values = np.empty(result_shape, dtype=dtype) else: dtype, fill_value = maybe_promote(values.dtype, self.fill_value) new_values = np.empty(result_shape, dtype=dtype) new_values.fill(fill_value) new_mask = np.zeros(result_shape, dtype=bool) name = np.dtype(dtype).name sorted_values = self.sorted_values # we need to convert to a basic dtype # and possibly coerce an input to our output dtype # e.g. ints -> floats if needs_i8_conversion(values): sorted_values = sorted_values.view('i8') new_values = new_values.view('i8') name = 'int64' elif is_bool_dtype(values): sorted_values = sorted_values.astype('object') new_values = new_values.astype('object') name = 'object' else: sorted_values = sorted_values.astype(name, copy=False) # fill in our values & mask f = getattr(_reshape, "unstack_{}".format(name)) f(sorted_values, mask.view('u1'), stride, length, width, new_values, new_mask.view('u1')) # reconstruct dtype if needed if needs_i8_conversion(values): new_values = new_values.view(values.dtype) return new_values, new_mask
def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any: """ Check if `indexer` is a valid array indexer for `array`. For a boolean mask, `array` and `indexer` are checked to have the same length. The dtype is validated, and if it is an integer or boolean ExtensionArray, it is checked if there are missing values present, and it is converted to the appropriate numpy array. Other dtypes will raise an error. Non-array indexers (integer, slice, Ellipsis, tuples, ..) are passed through as is. .. versionadded:: 1.0.0 Parameters ---------- array : array-like The array that is being indexed (only used for the length). indexer : array-like or list-like The array-like that's used to index. List-like input that is not yet a numpy array or an ExtensionArray is converted to one. Other input types are passed through as is Returns ------- numpy.ndarray The validated indexer as a numpy array that can be used to index. Raises ------ IndexError When the lengths don't match. ValueError When `indexer` cannot be converted to a numpy ndarray to index (e.g. presence of missing values). See Also -------- api.types.is_bool_dtype : Check if `key` is of boolean dtype. Examples -------- When checking a boolean mask, a boolean ndarray is returned when the arguments are all valid. >>> mask = pd.array([True, False]) >>> arr = pd.array([1, 2]) >>> pd.api.indexers.check_array_indexer(arr, mask) array([ True, False]) An IndexError is raised when the lengths don't match. >>> mask = pd.array([True, False, True]) >>> pd.api.indexers.check_array_indexer(arr, mask) Traceback (most recent call last): ... IndexError: Boolean index has wrong length: 3 instead of 2. NA values in a boolean array are treated as False. >>> mask = pd.array([True, pd.NA]) >>> pd.api.indexers.check_array_indexer(arr, mask) array([ True, False]) A numpy boolean mask will get passed through (if the length is correct): >>> mask = np.array([True, False]) >>> pd.api.indexers.check_array_indexer(arr, mask) array([ True, False]) Similarly for integer indexers, an integer ndarray is returned when it is a valid indexer, otherwise an error is (for integer indexers, a matching length is not required): >>> indexer = pd.array([0, 2], dtype="Int64") >>> arr = pd.array([1, 2, 3]) >>> pd.api.indexers.check_array_indexer(arr, indexer) array([0, 2]) >>> indexer = pd.array([0, pd.NA], dtype="Int64") >>> pd.api.indexers.check_array_indexer(arr, indexer) Traceback (most recent call last): ... ValueError: Cannot index with an integer indexer containing NA values For non-integer/boolean dtypes, an appropriate error is raised: >>> indexer = np.array([0., 2.], dtype="float64") >>> pd.api.indexers.check_array_indexer(arr, indexer) Traceback (most recent call last): ... IndexError: arrays used as indices must be of integer or boolean type """ from pandas.core.construction import array as pd_array # whathever is not an array-like is returned as-is (possible valid array # indexers that are not array-like: integer, slice, Ellipsis, None) # In this context, tuples are not considered as array-like, as they have # a specific meaning in indexing (multi-dimensional indexing) if is_list_like(indexer): if isinstance(indexer, tuple): return indexer else: return indexer # convert list-likes to array if not is_array_like(indexer): indexer = pd_array(indexer) if len(indexer) == 0: # empty list is converted to float array by pd.array indexer = np.array([], dtype=np.intp) dtype = indexer.dtype if is_bool_dtype(dtype): if is_extension_array_dtype(dtype): indexer = indexer.to_numpy(dtype=bool, na_value=False) else: indexer = np.asarray(indexer, dtype=bool) # GH26658 if len(indexer) != len(array): raise IndexError(f"Boolean index has wrong length: " f"{len(indexer)} instead of {len(array)}") elif is_integer_dtype(dtype): try: indexer = np.asarray(indexer, dtype=np.intp) except ValueError: raise ValueError( "Cannot index with an integer indexer containing NA values") else: raise IndexError( "arrays used as indices must be of integer or boolean type") return indexer
def f(self): result = fget(self) if is_bool_dtype(result): # return numpy array b/c there is no BoolIndex return result return Index(result, name=self.name)
def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): assert kind in ['transform', 'aggregate'] # can we do this operation with our cython functions # if not raise NotImplementedError # we raise NotImplemented if this is an invalid operation # entirely, e.g. adding datetimes # categoricals are only 1d, so we # are not setup for dim transforming if is_categorical_dtype(values): raise NotImplementedError( "categoricals are not support in cython ops ATM") elif is_datetime64_any_dtype(values): if how in ['add', 'prod', 'cumsum', 'cumprod']: raise NotImplementedError( "datetime64 type does not support {} " "operations".format(how)) elif is_timedelta64_dtype(values): if how in ['prod', 'cumprod']: raise NotImplementedError( "timedelta64 type does not support {} " "operations".format(how)) arity = self._cython_arity.get(how, 1) vdim = values.ndim swapped = False if vdim == 1: values = values[:, None] out_shape = (self.ngroups, arity) else: if axis > 0: swapped = True values = values.swapaxes(0, axis) if arity > 1: raise NotImplementedError("arity of more than 1 is not " "supported for the 'how' argument") out_shape = (self.ngroups,) + values.shape[1:] is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) if is_datetimelike: values = values.view('int64') is_numeric = True elif is_bool_dtype(values.dtype): values = ensure_float64(values) elif is_integer_dtype(values): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int64_or_float64(values) elif is_numeric and not is_complex_dtype(values): values = ensure_float64(values) else: values = values.astype(object) try: func = self._get_cython_function( kind, how, values, is_numeric) except NotImplementedError: if is_numeric: values = ensure_float64(values) func = self._get_cython_function( kind, how, values, is_numeric) else: raise if how == 'rank': out_dtype = 'float' else: if is_numeric: out_dtype = '{kind}{itemsize}'.format( kind=values.dtype.kind, itemsize=values.dtype.itemsize) else: out_dtype = 'object' labels, _, _ = self.group_info if kind == 'aggregate': result = _maybe_fill(np.empty(out_shape, dtype=out_dtype), fill_value=np.nan) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate( result, counts, values, labels, func, is_numeric, is_datetimelike, min_count) elif kind == 'transform': result = _maybe_fill(np.empty_like(values, dtype=out_dtype), fill_value=np.nan) # TODO: min_count result = self._transform( result, values, labels, func, is_numeric, is_datetimelike, **kwargs) if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype('float64') result[mask] = np.nan if (kind == 'aggregate' and self._filter_empty_groups and not counts.all()): if result.ndim == 2: try: result = lib.row_bool_subset( result, (counts > 0).view(np.uint8)) except ValueError: result = lib.row_bool_subset_object( ensure_object(result), (counts > 0).view(np.uint8)) else: result = result[counts > 0] if vdim == 1 and arity == 1: result = result[:, 0] if how in self._name_functions: # TODO names = self._name_functions[how]() else: names = None if swapped: result = result.swapaxes(0, axis) return result, names
def _convert_to_ndarrays( self, dct: Mapping, na_values, na_fvalues, verbose: bool = False, converters=None, dtypes=None, ): result = {} for c, values in dct.items(): conv_f = None if converters is None else converters.get(c, None) if isinstance(dtypes, dict): cast_type = dtypes.get(c, None) else: # single dtype or None cast_type = dtypes if self.na_filter: col_na_values, col_na_fvalues = _get_na_values( c, na_values, na_fvalues, self.keep_default_na) else: col_na_values, col_na_fvalues = set(), set() if c in self._parse_date_cols: # GH#26203 Do not convert columns which get converted to dates # but replace nans to ensure to_datetime works mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues) np.putmask(values, mask, np.nan) result[c] = values continue if conv_f is not None: # conv_f applied to data before inference if cast_type is not None: warnings.warn( ("Both a converter and dtype were specified " f"for column {c} - only the converter will be used."), ParserWarning, stacklevel=find_stack_level(), ) try: values = lib.map_infer(values, conv_f) except ValueError: # error: Argument 2 to "isin" has incompatible type "List[Any]"; # expected "Union[Union[ExtensionArray, ndarray], Index, Series]" mask = algorithms.isin( values, list(na_values) # type: ignore[arg-type] ).view(np.uint8) values = lib.map_infer_mask(values, conv_f, mask) cvals, na_count = self._infer_types(values, set(col_na_values) | col_na_fvalues, try_num_bool=False) else: is_ea = is_extension_array_dtype(cast_type) is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type) # skip inference if specified dtype is object # or casting to an EA try_num_bool = not (cast_type and is_str_or_ea_dtype) # general type inference and conversion cvals, na_count = self._infer_types( values, set(col_na_values) | col_na_fvalues, try_num_bool) # type specified in dtype param or cast_type is an EA if cast_type and (not is_dtype_equal(cvals, cast_type) or is_extension_array_dtype(cast_type)): if not is_ea and na_count > 0: try: if is_bool_dtype(cast_type): raise ValueError( f"Bool column has NA values in column {c}") except (AttributeError, TypeError): # invalid input to is_bool_dtype pass cast_type = pandas_dtype(cast_type) cvals = self._cast_types(cvals, cast_type, c) result[c] = cvals if verbose and na_count: print(f"Filled {na_count} NA values in column {c!s}") return result
def get_new_values(self): values = self.values # place the values length, width = self.full_shape stride = values.shape[1] result_width = width * stride result_shape = (length, result_width) mask = self.mask mask_all = mask.all() # we can simply reshape if we don't have a mask if mask_all and len(values): new_values = (self.sorted_values .reshape(length, width, stride) .swapaxes(1, 2) .reshape(result_shape) ) new_mask = np.ones(result_shape, dtype=bool) return new_values, new_mask # if our mask is all True, then we can use our existing dtype if mask_all: dtype = values.dtype new_values = np.empty(result_shape, dtype=dtype) else: dtype, fill_value = maybe_promote(values.dtype, self.fill_value) new_values = np.empty(result_shape, dtype=dtype) new_values.fill(fill_value) new_mask = np.zeros(result_shape, dtype=bool) name = np.dtype(dtype).name sorted_values = self.sorted_values # we need to convert to a basic dtype # and possibly coerce an input to our output dtype # e.g. ints -> floats if needs_i8_conversion(values): sorted_values = sorted_values.view('i8') new_values = new_values.view('i8') name = 'int64' elif is_bool_dtype(values): sorted_values = sorted_values.astype('object') new_values = new_values.astype('object') name = 'object' else: sorted_values = sorted_values.astype(name, copy=False) # fill in our values & mask f = getattr(_reshape, "unstack_{name}".format(name=name)) f(sorted_values, mask.view('u1'), stride, length, width, new_values, new_mask.view('u1')) # reconstruct dtype if needed if needs_i8_conversion(values): new_values = new_values.view(values.dtype) return new_values, new_mask
def _cast_types(self, values, cast_type, column): """ Cast values to specified type Parameters ---------- values : ndarray cast_type : string or np.dtype dtype to cast values to column : string column name - used only for error reporting Returns ------- converted : ndarray """ if is_categorical_dtype(cast_type): known_cats = (isinstance(cast_type, CategoricalDtype) and cast_type.categories is not None) if not is_object_dtype(values) and not known_cats: # TODO: this is for consistency with # c-parser which parses all categories # as strings values = astype_nansafe(values, np.dtype(str)) cats = Index(values).unique().dropna() values = Categorical._from_inferred_categories( cats, cats.get_indexer(values), cast_type, true_values=self.true_values) # use the EA's implementation of casting elif is_extension_array_dtype(cast_type): # ensure cast_type is an actual dtype and not a string cast_type = pandas_dtype(cast_type) array_type = cast_type.construct_array_type() try: if is_bool_dtype(cast_type): return array_type._from_sequence_of_strings( values, dtype=cast_type, true_values=self.true_values, false_values=self.false_values, ) else: return array_type._from_sequence_of_strings( values, dtype=cast_type) except NotImplementedError as err: raise NotImplementedError( f"Extension Array: {array_type} must implement " "_from_sequence_of_strings in order to be used in parser methods" ) from err else: try: values = astype_nansafe(values, cast_type, copy=True, skipna=True) except ValueError as err: raise ValueError( f"Unable to convert column {column} to type {cast_type}" ) from err return values
def _is_boolean(self): from pandas.core.dtypes.common import is_bool_dtype return is_bool_dtype(self.categories)
def astype(self, dtype, copy: bool = True) -> ArrayLike: """ Cast to a NumPy array or ExtensionArray with 'dtype'. Parameters ---------- dtype : str or dtype Typecode or data-type to which the array is cast. copy : bool, default True Whether to copy the data, even if not necessary. If False, a copy is made only if the old dtype does not match the new dtype. Returns ------- ndarray or ExtensionArray NumPy ndarray, BooleanArray or IntegerArray with 'dtype' for its dtype. Raises ------ TypeError if incompatible type with an BooleanDtype, equivalent of same_kind casting """ from pandas.core.arrays.string_ import StringDtype dtype = pandas_dtype(dtype) if isinstance(dtype, BooleanDtype): values, mask = coerce_to_array(self, copy=copy) if not copy: return self else: return BooleanArray(values, mask, copy=False) elif isinstance(dtype, StringDtype): return dtype.construct_array_type()._from_sequence(self, copy=False) if is_bool_dtype(dtype): # astype_nansafe converts np.nan to True if self._hasna: raise ValueError("cannot convert float NaN to bool") else: return self._data.astype(dtype, copy=copy) if is_extension_array_dtype(dtype) and is_integer_dtype(dtype): from pandas.core.arrays import IntegerArray return IntegerArray(self._data.astype(dtype.numpy_dtype), self._mask.copy(), copy=False) # for integer, error if there are missing values if is_integer_dtype(dtype): if self._hasna: raise ValueError("cannot convert NA to integer") # for float dtype, ensure we use np.nan before casting (numpy cannot # deal with pd.NA) na_value = self._na_value if is_float_dtype(dtype): na_value = np.nan # coerce return self.to_numpy(dtype=dtype, na_value=na_value, copy=False)