def kleene_or( left: bool | np.ndarray | libmissing.NAType, right: bool | np.ndarray | libmissing.NAType, left_mask: np.ndarray | None, right_mask: np.ndarray | None, ): """ Boolean ``or`` using Kleene logic. Values are NA where we have ``NA | NA`` or ``NA | False``. ``NA | True`` is considered True. Parameters ---------- left, right : ndarray, NA, or bool The values of the array. left_mask, right_mask : ndarray, optional The masks. Only one of these may be None, which implies that the associated `left` or `right` value is a scalar. Returns ------- result, mask: ndarray[bool] The result of the logical or, and the new mask. """ # To reduce the number of cases, we ensure that `left` & `left_mask` # always come from an array, not a scalar. This is safe, since # A | B == B | A if left_mask is None: return kleene_or(right, left, right_mask, left_mask) if not isinstance(left, np.ndarray): raise TypeError("Either `left` or `right` need to be a np.ndarray.") raise_for_nan(right, method="or") if right is libmissing.NA: result = left.copy() else: result = left | right if right_mask is not None: # output is unknown where (False & NA), (NA & False), (NA & NA) left_false = ~(left | left_mask) right_false = ~(right | right_mask) mask = ( (left_false & right_mask) | (right_false & left_mask) | (left_mask & right_mask) ) else: if right is True: mask = np.zeros_like(left_mask) elif right is libmissing.NA: mask = (~left & ~left_mask) | left_mask else: # False mask = left_mask.copy() return result, mask
def _fillna_prep(values, mask: np.ndarray | None = None) -> np.ndarray: # boilerplate for _pad_1d, _backfill_1d, _pad_2d, _backfill_2d if mask is None: mask = isna(values) mask = mask.view(np.uint8) return mask
def kleene_and( left: bool | libmissing.NAType | np.ndarray, right: bool | libmissing.NAType | np.ndarray, left_mask: np.ndarray | None, right_mask: np.ndarray | None, ): """ Boolean ``and`` using Kleene logic. Values are ``NA`` for ``NA & NA`` or ``True & NA``. Parameters ---------- left, right : ndarray, NA, or bool The values of the array. left_mask, right_mask : ndarray, optional The masks. Only one of these may be None, which implies that the associated `left` or `right` value is a scalar. Returns ------- result, mask: ndarray[bool] The result of the logical xor, and the new mask. """ # To reduce the number of cases, we ensure that `left` & `left_mask` # always come from an array, not a scalar. This is safe, since # A & B == B & A if left_mask is None: return kleene_and(right, left, right_mask, left_mask) if not isinstance(left, np.ndarray): raise TypeError("Either `left` or `right` need to be a np.ndarray.") raise_for_nan(right, method="and") if right is libmissing.NA: result = np.zeros_like(left) else: result = left & right if right_mask is None: # Scalar `right` if right is libmissing.NA: mask = (left & ~left_mask) | left_mask else: mask = left_mask.copy() if right is False: # unmask everything mask[:] = False else: # unmask where either left or right is False left_false = ~(left | left_mask) right_false = ~(right | right_mask) mask = (left_mask & ~right_false) | (right_mask & ~left_false) return result, mask
def kleene_xor( left: bool | np.ndarray | libmissing.NAType, right: bool | np.ndarray | libmissing.NAType, left_mask: np.ndarray | None, right_mask: np.ndarray | None, ): """ Boolean ``xor`` using Kleene logic. This is the same as ``or``, with the following adjustments * True, True -> False * True, NA -> NA Parameters ---------- left, right : ndarray, NA, or bool The values of the array. left_mask, right_mask : ndarray, optional The masks. Only one of these may be None, which implies that the associated `left` or `right` value is a scalar. Returns ------- result, mask: ndarray[bool] The result of the logical xor, and the new mask. """ # To reduce the number of cases, we ensure that `left` & `left_mask` # always come from an array, not a scalar. This is safe, since # A ^ B == B ^ A if left_mask is None: return kleene_xor(right, left, right_mask, left_mask) if not isinstance(left, np.ndarray): raise TypeError("Either `left` or `right` need to be a np.ndarray.") raise_for_nan(right, method="xor") if right is libmissing.NA: result = np.zeros_like(left) else: result = left ^ right if right_mask is None: if right is libmissing.NA: mask = np.ones_like(left_mask) else: mask = left_mask.copy() else: mask = left_mask | right_mask return result, mask
def kleene_xor( left: bool | np.ndarray, right: bool | np.ndarray, left_mask: np.ndarray | None, right_mask: np.ndarray | None, ): """ Boolean ``xor`` using Kleene logic. This is the same as ``or``, with the following adjustments * True, True -> False * True, NA -> NA Parameters ---------- left, right : ndarray, NA, or bool The values of the array. left_mask, right_mask : ndarray, optional The masks. Only one of these may be None, which implies that the associated `left` or `right` value is a scalar. Returns ------- result, mask: ndarray[bool] The result of the logical xor, and the new mask. """ if left_mask is None: return kleene_xor(right, left, right_mask, left_mask) raise_for_nan(right, method="xor") if right is libmissing.NA: result = np.zeros_like(left) else: # error: Incompatible types in assignment (expression has type # "Union[bool, Any]", variable has type "ndarray") result = left ^ right # type: ignore[assignment] if right_mask is None: if right is libmissing.NA: mask = np.ones_like(left_mask) else: mask = left_mask.copy() else: mask = left_mask | right_mask return result, mask
def sample( obj_len: int, size: int, replace: bool, weights: np.ndarray | None, random_state: np.random.RandomState | np.random.Generator, ) -> np.ndarray: """ Randomly sample `size` indices in `np.arange(obj_len)` Parameters ---------- obj_len : int The length of the indices being considered size : int The number of values to choose replace : bool Allow or disallow sampling of the same row more than once. weights : np.ndarray[np.float64] or None If None, equal probability weighting, otherwise weights according to the vector normalized random_state: np.random.RandomState or np.random.Generator State used for the random sampling Returns ------- np.ndarray[np.intp] """ if weights is not None: weight_sum = weights.sum() if weight_sum != 0: weights = weights / weight_sum else: raise ValueError("Invalid weights: weights sum to zero") return random_state.choice(obj_len, size=size, replace=replace, p=weights).astype(np.intp, copy=False)
def _cython_operation(
    self,
    kind: str,
    values,
    how: str,
    axis: int,
    min_count: int = -1,
    mask: np.ndarray | None = None,
    **kwargs,
) -> ArrayLike:
    """
    Run the cython groupby kernel selected by ``kind``/``how`` on ``values``.

    Extension-dtype inputs are handed off to
    ``_masked_ea_wrap_cython_operation`` or ``_ea_wrap_cython_operation``.
    One-dimensional non-extension inputs are expanded to 2D, dispatched
    back through this method with ``axis=1`` and squeezed again. Everything
    else is coerced to a kernel-friendly dtype, transposed, run through the
    kernel obtained from ``WrappedCythonOp`` and post-processed.

    Parameters
    ----------
    kind : str
        Either "transform" or "aggregate" (asserted).
    values : array-like
        Data to operate on; must have at most 2 dimensions.
    how : str
        Name of the operation, forwarded to ``WrappedCythonOp``.
    axis : int
        Must be 1 whenever ``values`` is 2D (asserted).
    min_count : int, default -1
        Forwarded to aggregation kernels. For integer aggregation results,
        groups whose count is below ``max(1, min_count)`` are set to NaN
        after casting the result to float64.
    mask : np.ndarray, optional
        Reshaped to the transposed ``values`` shape; only passed to the
        kernel for transforms when ``cy_op.uses_mask()`` is true.
    **kwargs
        Forwarded to the extension-array wrappers and to transform kernels.

    Returns
    -------
    ArrayLike

    Raises
    ------
    NotImplementedError
        If ``values`` has more than 2 dimensions.
    """
    orig_values = values
    assert kind in ["transform", "aggregate"]

    if values.ndim > 2:
        raise NotImplementedError("number of dimensions is currently limited to 2")
    elif values.ndim == 2:
        # Note: it is *not* the case that axis is always 0 for 1-dim values,
        # as we can have 1D ExtensionArrays that we need to treat as 2D
        assert axis == 1, axis

    dtype = values.dtype
    is_numeric = is_numeric_dtype(dtype)

    cy_op = WrappedCythonOp(kind=kind, how=how)

    # can we do this operation with our cython functions
    # if not raise NotImplementedError
    cy_op.disallow_invalid_ops(dtype, is_numeric)

    func_uses_mask = cy_op.uses_mask()
    if is_extension_array_dtype(dtype):
        # Note: `mask` is not forwarded to either wrapper.
        if isinstance(values, BaseMaskedArray) and func_uses_mask:
            return self._masked_ea_wrap_cython_operation(
                kind, values, how, axis, min_count, **kwargs
            )
        else:
            return self._ea_wrap_cython_operation(
                kind, values, how, axis, min_count, **kwargs
            )

    elif values.ndim == 1:
        # expand to 2d, dispatch, then squeeze if appropriate
        values2d = values[None, :]
        res = self._cython_operation(
            kind=kind,
            values=values2d,
            how=how,
            axis=1,
            min_count=min_count,
            mask=mask,
            **kwargs,
        )
        if res.shape[0] == 1:
            return res[0]

        # otherwise we have OHLC
        return res.T

    is_datetimelike = needs_i8_conversion(dtype)

    # Coerce the values to a dtype the kernels are called with.
    if is_datetimelike:
        values = values.view("int64")
        is_numeric = True
    elif is_bool_dtype(dtype):
        values = values.astype("int64")
    elif is_integer_dtype(dtype):
        # e.g. uint8 -> uint64, int16 -> int64
        # (`dtype` is rebound to a string here; it is reassigned before its
        # next use further down)
        dtype = dtype.kind + "8"
        values = values.astype(dtype, copy=False)
    elif is_numeric:
        if not is_complex_dtype(dtype):
            values = ensure_float64(values)

    ngroups = self.ngroups
    comp_ids, _, _ = self.group_info

    assert axis == 1
    # The kernel is called on the transposed values; the result is
    # transposed back below.
    values = values.T
    if mask is not None:
        mask = mask.reshape(values.shape, order="C")

    out_shape = cy_op.get_output_shape(ngroups, values)
    func, values = cy_op.get_cython_func_and_vals(values, is_numeric)
    out_dtype = cy_op.get_out_dtype(values.dtype)

    # `func` writes into `result` in place; its return value is not used.
    result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
    if kind == "aggregate":
        counts = np.zeros(ngroups, dtype=np.int64)
        if how in ["min", "max"]:
            func(
                result,
                counts,
                values,
                comp_ids,
                min_count,
                is_datetimelike=is_datetimelike,
            )
        else:
            func(result, counts, values, comp_ids, min_count)
    elif kind == "transform":
        # TODO: min_count
        if func_uses_mask:
            func(
                result,
                values,
                comp_ids,
                ngroups,
                is_datetimelike,
                mask=mask,
                **kwargs,
            )
        else:
            func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)

    if kind == "aggregate":
        # i.e. counts is defined. Locations where count<min_count
        # need to have the result set to np.nan, which may require casting,
        # see GH#40767
        if is_integer_dtype(result.dtype) and not is_datetimelike:
            cutoff = max(1, min_count)
            empty_groups = counts < cutoff
            if empty_groups.any():
                # Note: this conversion could be lossy, see GH#40767
                result = result.astype("float64")
                result[empty_groups] = np.nan

        # NOTE(review): indentation was not recoverable from the reviewed
        # text; this check is kept inside the aggregate branch because
        # `counts` is only assigned there. Confirm against the real file.
        if self._filter_empty_groups and not counts.all():
            assert result.ndim != 2
            result = result[counts > 0]

    result = result.T

    if how not in cy_op.cast_blocklist:
        # e.g. if we are int64 and need to restore to datetime64/timedelta64
        # "rank" is the only member of cast_blocklist we get here
        dtype = maybe_cast_result_dtype(orig_values.dtype, how)
        op_result = maybe_downcast_to_dtype(result, dtype)
    else:
        op_result = result

    return op_result
def _call_cython_op(
    self,
    values: np.ndarray,  # np.ndarray[ndim=2]
    *,
    min_count: int,
    ngroups: int,
    comp_ids: np.ndarray,
    mask: np.ndarray | None,
    **kwargs,
) -> np.ndarray:  # np.ndarray[ndim=2]
    """
    Coerce ``values``, run the cython kernel for ``self.kind``/``self.how``
    on them, and cast the result back.

    Parameters
    ----------
    values : np.ndarray
        Two-dimensional input (per the signature comment). It is transposed
        before the kernel call and the result is transposed back.
    min_count : int
        Forwarded to aggregation kernels. For integer aggregation results,
        groups whose count is below ``max(1, min_count)`` are set to NaN
        after casting the result to float64.
    ngroups : int
        Length of the ``counts`` buffer; also passed to
        ``self._get_output_shape`` and to transform kernels.
    comp_ids : np.ndarray
        Passed straight to the kernel (presumably the group code of each
        row -- confirm against the caller).
    mask : np.ndarray or None
        Reshaped to the transposed ``values`` shape; only passed to the
        kernel in the non-aggregate branch when ``self.uses_mask()`` is true.
    **kwargs
        Forwarded to the kernel in the non-aggregate branch only.

    Returns
    -------
    np.ndarray
        The kernel output, downcast via ``maybe_downcast_to_dtype`` unless
        ``self.how`` is in ``self.cast_blocklist``.
    """
    orig_values = values

    dtype = values.dtype
    is_numeric = is_numeric_dtype(dtype)

    is_datetimelike = needs_i8_conversion(dtype)

    # Coerce the values to a dtype the kernels are called with.
    if is_datetimelike:
        values = values.view("int64")
        is_numeric = True
    elif is_bool_dtype(dtype):
        values = values.astype("int64")
    elif is_integer_dtype(dtype):
        # e.g. uint8 -> uint64, int16 -> int64
        dtype_str = dtype.kind + "8"
        values = values.astype(dtype_str, copy=False)
    elif is_numeric:
        if not is_complex_dtype(dtype):
            values = ensure_float64(values)

    values = values.T
    if mask is not None:
        mask = mask.reshape(values.shape, order="C")

    out_shape = self._get_output_shape(ngroups, values)
    func, values = self.get_cython_func_and_vals(values, is_numeric)
    out_dtype = self.get_out_dtype(values.dtype)

    # `func` writes into `result` in place; its return value is not used.
    result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
    if self.kind == "aggregate":
        counts = np.zeros(ngroups, dtype=np.int64)
        if self.how in ["min", "max"]:
            func(
                result,
                counts,
                values,
                comp_ids,
                min_count,
                is_datetimelike=is_datetimelike,
            )
        else:
            func(result, counts, values, comp_ids, min_count)
    else:
        # TODO: min_count
        if self.uses_mask():
            func(
                result,
                values,
                comp_ids,
                ngroups,
                is_datetimelike,
                mask=mask,
                **kwargs,
            )
        else:
            func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)

    if self.kind == "aggregate":
        # i.e. counts is defined. Locations where count<min_count
        # need to have the result set to np.nan, which may require casting,
        # see GH#40767
        if is_integer_dtype(result.dtype) and not is_datetimelike:
            cutoff = max(1, min_count)
            empty_groups = counts < cutoff
            if empty_groups.any():
                # Note: this conversion could be lossy, see GH#40767
                result = result.astype("float64")
                result[empty_groups] = np.nan

    # undo the transpose applied before the kernel call
    result = result.T

    if self.how not in self.cast_blocklist:
        # e.g. if we are int64 and need to restore to datetime64/timedelta64
        # "rank" is the only member of cast_blocklist we get here
        res_dtype = self._get_result_dtype(orig_values.dtype)
        op_result = maybe_downcast_to_dtype(result, res_dtype)
    else:
        op_result = result

    # error: Incompatible return value type (got "Union[ExtensionArray, ndarray]",
    # expected "ndarray")
    return op_result  # type: ignore[return-value]