def wrapper(self, other): msg = "cannot compare a {cls} with type {typ}" func = getattr(super(TimedeltaIndex, self), opname) if _is_convertible_to_td(other) or other is NaT: try: other = _to_m8(other) except ValueError: # failed to parse as timedelta raise TypeError(msg.format(cls=type(self).__name__, typ=type(other).__name__)) result = func(other) if isna(other): result.fill(nat_result) elif not is_list_like(other): raise TypeError(msg.format(cls=type(self).__name__, typ=type(other).__name__)) else: other = TimedeltaIndex(other).values result = func(other) result = com._values_from_object(result) o_mask = np.array(isna(other)) if o_mask.any(): result[o_mask] = nat_result if self.hasnans: result[self._isnan] = nat_result # support of bool dtype indexers if is_bool_dtype(result): return result return Index(result)
def test_isna_datetime(self): assert not isna(datetime.now()) assert notna(datetime.now()) idx = date_range('1/1/1990', periods=20) exp = np.ones(len(idx), dtype=bool) tm.assert_numpy_array_equal(notna(idx), exp) idx = np.asarray(idx) idx[0] = iNaT idx = DatetimeIndex(idx) mask = isna(idx) assert mask[0] exp = np.array([True] + [False] * (len(idx) - 1), dtype=bool) tm.assert_numpy_array_equal(mask, exp) # GH 9129 pidx = idx.to_period(freq='M') mask = isna(pidx) assert mask[0] exp = np.array([True] + [False] * (len(idx) - 1), dtype=bool) tm.assert_numpy_array_equal(mask, exp) mask = isna(pidx[1:]) exp = np.zeros(len(mask), dtype=bool) tm.assert_numpy_array_equal(mask, exp)
def wrapper(self, other): msg = "cannot compare a {cls} with type {typ}" meth = getattr(dtl.DatetimeLikeArrayMixin, opname) if _is_convertible_to_td(other) or other is NaT: try: other = _to_m8(other) except ValueError: # failed to parse as timedelta raise TypeError(msg.format(cls=type(self).__name__, typ=type(other).__name__)) result = meth(self, other) if isna(other): result.fill(nat_result) elif not is_list_like(other): raise TypeError(msg.format(cls=type(self).__name__, typ=type(other).__name__)) else: other = type(self)(other)._data result = meth(self, other) result = com.values_from_object(result) o_mask = np.array(isna(other)) if o_mask.any(): result[o_mask] = nat_result if self.hasnans: result[self._isnan] = nat_result return result
def _get_op_result_fill_value(self, other, func): own_default = self.default_fill_value if isinstance(other, DataFrame): # i.e. called from _combine_frame other_default = getattr(other, 'default_fill_value', np.nan) # if the fill values are the same use them? or use a valid one if own_default == other_default: # TOOD: won't this evaluate as False if both are np.nan? fill_value = own_default elif np.isnan(own_default) and not np.isnan(other_default): fill_value = other_default elif not np.isnan(own_default) and np.isnan(other_default): fill_value = own_default else: fill_value = None elif isinstance(other, SparseSeries): # i.e. called from _combine_match_index # fill_value is a function of our operator if isna(other.fill_value) or isna(own_default): fill_value = np.nan else: fill_value = func(np.float64(own_default), np.float64(other.fill_value)) else: raise NotImplementedError(type(other)) return fill_value
def _combine_match_index(self, other, func, level=None): new_data = {} if level is not None: raise NotImplementedError("'level' argument is not supported") new_index = self.index.union(other.index) this = self if self.index is not new_index: this = self.reindex(new_index) if other.index is not new_index: other = other.reindex(new_index) for col, series in compat.iteritems(this): new_data[col] = func(series.values, other.values) # fill_value is a function of our operator if isna(other.fill_value) or isna(self.default_fill_value): fill_value = np.nan else: fill_value = func(np.float64(self.default_fill_value), np.float64(other.fill_value)) return self._constructor( new_data, index=new_index, columns=self.columns, default_fill_value=fill_value).__finalize__(self)
def wrapper(self, other): if _is_convertible_to_td(other) or other is NaT: try: other = _to_m8(other) except ValueError: # failed to parse as timedelta return ops.invalid_comparison(self, other, op) result = meth(self, other) if isna(other): result.fill(nat_result) elif not is_list_like(other): return ops.invalid_comparison(self, other, op) elif len(other) != len(self): raise ValueError("Lengths must match") else: try: other = type(self)._from_sequence(other)._data except (ValueError, TypeError): return ops.invalid_comparison(self, other, op) result = meth(self, other) result = com.values_from_object(result) o_mask = np.array(isna(other)) if o_mask.any(): result[o_mask] = nat_result if self._hasnans: result[self._isnan] = nat_result return result
def test_isna_nat(self): result = isna([NaT]) exp = np.array([True]) tm.assert_numpy_array_equal(result, exp) result = isna(np.array([NaT], dtype=object)) exp = np.array([True]) tm.assert_numpy_array_equal(result, exp)
def na_op(x, y): # dispatch to the categorical if we have a categorical # in either operand if is_categorical_dtype(x): return op(x, y) elif is_categorical_dtype(y) and not is_scalar(y): return op(y, x) if is_object_dtype(x.dtype): result = _comp_method_OBJECT_ARRAY(op, x, y) else: # we want to compare like types # we only want to convert to integer like if # we are not NotImplemented, otherwise # we would allow datetime64 (but viewed as i8) against # integer comparisons if is_datetimelike_v_numeric(x, y): raise TypeError("invalid type comparison") # numpy does not like comparisons vs None if is_scalar(y) and isna(y): if name == '__ne__': return np.ones(len(x), dtype=bool) else: return np.zeros(len(x), dtype=bool) # we have a datetime/timedelta and may need to convert mask = None if (needs_i8_conversion(x) or (not is_scalar(y) and needs_i8_conversion(y))): if is_scalar(y): mask = isna(x) y = libindex.convert_scalar(x, com._values_from_object(y)) else: mask = isna(x) | isna(y) y = y.view('i8') x = x.view('i8') try: with np.errstate(all='ignore'): result = getattr(x, name)(y) if result is NotImplemented: raise TypeError("invalid type comparison") except AttributeError: result = op(x, y) if mask is not None and mask.any(): result[mask] = masker return result
def test_period(self): idx = pd.PeriodIndex(['2011-01', 'NaT', '2012-01'], freq='M') exp = np.array([False, True, False]) tm.assert_numpy_array_equal(isna(idx), exp) tm.assert_numpy_array_equal(notna(idx), ~exp) exp = pd.Series([False, True, False]) s = pd.Series(idx) tm.assert_series_equal(isna(s), exp) tm.assert_series_equal(notna(s), ~exp) s = pd.Series(idx, dtype=object) tm.assert_series_equal(isna(s), exp) tm.assert_series_equal(notna(s), ~exp)
def nargsort(items, kind='quicksort', ascending=True, na_position='last'): """ This is intended to be a drop-in replacement for np.argsort which handles NaNs. It adds ascending and na_position parameters. GH #6399, #5231 """ # specially handle Categorical if is_categorical_dtype(items): if na_position not in {'first', 'last'}: raise ValueError('invalid na_position: {!r}'.format(na_position)) mask = isna(items) cnt_null = mask.sum() sorted_idx = items.argsort(ascending=ascending, kind=kind) if ascending and na_position == 'last': # NaN is coded as -1 and is listed in front after sorting sorted_idx = np.roll(sorted_idx, -cnt_null) elif not ascending and na_position == 'first': # NaN is coded as -1 and is listed in the end after sorting sorted_idx = np.roll(sorted_idx, cnt_null) return sorted_idx with warnings.catch_warnings(): # https://github.com/pandas-dev/pandas/issues/25439 # can be removed once ExtensionArrays are properly handled by nargsort warnings.filterwarnings( "ignore", category=FutureWarning, message="Converting timezone-aware DatetimeArray to") items = np.asanyarray(items) idx = np.arange(len(items)) mask = isna(items) non_nans = items[~mask] non_nan_idx = idx[~mask] nan_idx = np.nonzero(mask)[0] if not ascending: non_nans = non_nans[::-1] non_nan_idx = non_nan_idx[::-1] indexer = non_nan_idx[non_nans.argsort(kind=kind)] if not ascending: indexer = indexer[::-1] # Finally, place the NaNs at the end or the beginning according to # na_position if na_position == 'last': indexer = np.concatenate([indexer, nan_idx]) elif na_position == 'first': indexer = np.concatenate([nan_idx, indexer]) else: raise ValueError('invalid na_position: {!r}'.format(na_position)) return indexer
def f(x, y): xmask = isna(x) ymask = isna(y) mask = xmask | ymask with np.errstate(all='ignore'): result = op(x, y) if mask.any(): if is_bool_dtype(result): result = result.astype('O') np.putmask(result, mask, np.nan) return result
def wrapper(self, other): meth = getattr(dtl.DatetimeLikeArrayMixin, opname) if isinstance(other, (datetime, np.datetime64, compat.string_types)): if isinstance(other, (datetime, np.datetime64)): # GH#18435 strings get a pass from tzawareness compat self._assert_tzawareness_compat(other) try: other = _to_m8(other, tz=self.tz) except ValueError: # string that cannot be parsed to Timestamp return ops.invalid_comparison(self, other, op) result = meth(self, other) if isna(other): result.fill(nat_result) elif lib.is_scalar(other): return ops.invalid_comparison(self, other, op) else: if isinstance(other, list): # FIXME: This can break for object-dtype with mixed types other = type(self)(other) elif not isinstance(other, (np.ndarray, ABCIndexClass, ABCSeries)): # Following Timestamp convention, __eq__ is all-False # and __ne__ is all True, others raise TypeError. return ops.invalid_comparison(self, other, op) if is_object_dtype(other): result = op(self.astype('O'), np.array(other)) elif not (is_datetime64_dtype(other) or is_datetime64tz_dtype(other)): # e.g. is_timedelta64_dtype(other) return ops.invalid_comparison(self, other, op) else: self._assert_tzawareness_compat(other) result = meth(self, np.asarray(other)) result = com.values_from_object(result) # Make sure to pass an array to result[...]; indexing with # Series breaks with older version of numpy o_mask = np.array(isna(other)) if o_mask.any(): result[o_mask] = nat_result if self.hasnans: result[self._isnan] = nat_result return result
def mask_missing(arr, values_to_mask): """ Return a masking array of same size/shape as arr with entries equaling any member of values_to_mask set to True """ dtype, values_to_mask = infer_dtype_from_array(values_to_mask) try: values_to_mask = np.array(values_to_mask, dtype=dtype) except Exception: values_to_mask = np.array(values_to_mask, dtype=object) na_mask = isna(values_to_mask) nonna = values_to_mask[~na_mask] mask = None for x in nonna: if mask is None: # numpy elementwise comparison warning if is_numeric_v_string_like(arr, x): mask = False else: mask = arr == x # if x is a string and arr is not, then we get False and we must # expand the mask to size arr.shape if is_scalar(mask): mask = np.zeros(arr.shape, dtype=bool) else: # numpy elementwise comparison warning if is_numeric_v_string_like(arr, x): mask |= False else: mask |= arr == x if na_mask.any(): if mask is None: mask = isna(arr) else: mask |= isna(arr) # GH 21977 if mask is None: mask = np.zeros(arr.shape, dtype=bool) return mask
def insert(self, loc, item): """ Make new Index inserting new item at location Parameters ---------- loc : int item : object if not either a Python datetime or a numpy integer-like, returned Index dtype will be object rather than datetime. Returns ------- new_index : Index """ # try to convert if possible if _is_convertible_to_td(item): try: item = Timedelta(item) except Exception: pass elif is_scalar(item) and isna(item): # GH 18295 item = self._na_value freq = None if isinstance(item, Timedelta) or (is_scalar(item) and isna(item)): # check freq can be preserved on edge cases if self.freq is not None: if ((loc == 0 or loc == -len(self)) and item + self.freq == self[0]): freq = self.freq elif (loc == len(self)) and item - self.freq == self[-1]: freq = self.freq item = Timedelta(item).asm8.view(_TD_DTYPE) try: new_tds = np.concatenate((self[:loc].asi8, [item.view(np.int64)], self[loc:].asi8)) return self._shallow_copy(new_tds, freq=freq) except (AttributeError, TypeError): # fall back to object index if isinstance(item, compat.string_types): return self.astype(object).insert(loc, item) raise TypeError( "cannot insert TimedeltaIndex with incompatible label")
def _evaluate_with_timedelta_like(self, other, op): if isinstance(other, ABCSeries): # GH#19042 return NotImplemented opstr = '__{opname}__'.format(opname=op.__name__).replace('__r', '__') # allow division by a timedelta if opstr in ['__div__', '__truediv__', '__floordiv__']: if _is_convertible_to_td(other): other = Timedelta(other) if isna(other): raise NotImplementedError( "division by pd.NaT not implemented") i8 = self.asi8 left, right = i8, other.value if opstr in ['__floordiv__']: result = op(left, right) else: result = op(left, np.float64(right)) result = self._maybe_mask_results(result, convert='float64') return result return NotImplemented
def is_na(self): if self.block is None: return True if not self.block._can_hold_na: return False # Usually it's enough to check but a small fraction of values to see if # a block is NOT null, chunks should help in such cases. 1000 value # was chosen rather arbitrarily. values = self.block.values if self.block.is_categorical: values_flat = values.categories elif is_sparse(self.block.values.dtype): return False elif self.block.is_extension: values_flat = values else: values_flat = values.ravel(order='K') total_len = values_flat.shape[0] chunk_len = max(total_len // 40, 1000) for i in range(0, total_len, chunk_len): if not isna(values_flat[i:i + chunk_len]).all(): return False return True
def __mul__(self, other): other = lib.item_from_zerodim(other) if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): return NotImplemented if is_scalar(other): # numpy will accept float and int, raise TypeError for others result = self._data * other freq = None if self.freq is not None and not isna(other): freq = self.freq * other return type(self)(result, freq=freq) if not hasattr(other, "dtype"): # list, tuple other = np.array(other) if len(other) != len(self) and not is_timedelta64_dtype(other): # Exclude timedelta64 here so we correctly raise TypeError # for that instead of ValueError raise ValueError("Cannot multiply with unequal lengths") if is_object_dtype(other): # this multiplication will succeed only if all elements of other # are int or float scalars, so we will end up with # timedelta64[ns]-dtyped result result = [self[n] * other[n] for n in range(len(self))] result = np.array(result) return type(self)(result) # numpy will accept float or int dtype, raise TypeError for others result = self._data * other return type(self)(result)
def _hash_scalar(val, encoding='utf8', hash_key=None): """ Hash scalar value Returns ------- 1d uint64 numpy array of hash value, of length 1 """ if isna(val): # this is to be consistent with the _hash_categorical implementation return np.array([np.iinfo(np.uint64).max], dtype='u8') if getattr(val, 'tzinfo', None) is not None: # for tz-aware datetimes, we need the underlying naive UTC value and # not the tz aware object or pd extension type (as # infer_dtype_from_scalar would do) if not isinstance(val, tslibs.Timestamp): val = tslibs.Timestamp(val) val = val.tz_convert(None) dtype, val = infer_dtype_from_scalar(val) vals = np.array([val], dtype=dtype) return hash_array(vals, hash_key=hash_key, encoding=encoding, categorize=False)
def shift(self, periods, freq=None, axis=0): if periods == 0: return self.copy() # no special handling of fill values yet if not isna(self.fill_value): shifted = self.to_dense().shift(periods, freq=freq, axis=axis) return shifted.to_sparse(fill_value=self.fill_value, kind=self.kind) if freq is not None: return self._constructor( self.sp_values, sparse_index=self.sp_index, index=self.index.shift(periods, freq), fill_value=self.fill_value).__finalize__(self) int_index = self.sp_index.to_int_index() new_indices = int_index.indices + periods start, end = new_indices.searchsorted([0, int_index.length]) new_indices = new_indices[start:end] new_sp_index = _make_index(len(self), new_indices, self.sp_index) arr = self.values._simple_new(self.sp_values[start:end].copy(), new_sp_index, fill_value=np.nan) return self._constructor(arr, index=self.index).__finalize__(self)
def insert(self, loc, item): """ Make new Index inserting new item at location. Follows Python list.append semantics for negative values Parameters ---------- loc : int item : object Returns ------- new_index : Index Raises ------ ValueError if the item is not in the categories """ code = self.categories.get_indexer([item]) if (code == -1) and not (is_scalar(item) and isna(item)): raise TypeError("cannot insert an item into a CategoricalIndex " "that is not already an existing category") codes = self.codes codes = np.concatenate((codes[:loc], code, codes[loc:])) return self._create_from_codes(codes)
def _evaluate_with_timedelta_like(self, other, op, opstr, reversed=False): if isinstance(other, ABCSeries): # GH#19042 return NotImplemented # allow division by a timedelta if opstr in ['__div__', '__truediv__', '__floordiv__']: if _is_convertible_to_td(other): other = Timedelta(other) if isna(other): raise NotImplementedError( "division by pd.NaT not implemented") i8 = self.asi8 left, right = i8, other.value if reversed: left, right = right, left if opstr in ['__floordiv__']: result = left // right else: result = op(left, np.float64(right)) result = self._maybe_mask_results(result, convert='float64') return Index(result, name=self.name, copy=False) return NotImplemented
def _wrap_results(result, dtype, fill_value=None): """ wrap our results if needed """ if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): if fill_value is None: # GH#24293 fill_value = iNaT if not isinstance(result, np.ndarray): tz = getattr(dtype, 'tz', None) assert not isna(fill_value), "Expected non-null fill_value" if result == fill_value: result = np.nan result = tslibs.Timestamp(result, tz=tz) else: result = result.view(dtype) elif is_timedelta64_dtype(dtype): if not isinstance(result, np.ndarray): if result == fill_value: result = np.nan # raise if we have a timedelta64[ns] which is too large if np.fabs(result) > _int64_max: raise ValueError("overflow in timedelta operation") result = tslibs.Timedelta(result, unit='ns') else: result = result.astype('i8').view(dtype) return result
def _isfinite(values): if is_datetime_or_timedelta_dtype(values): return isna(values) if (is_complex_dtype(values) or is_float_dtype(values) or is_integer_dtype(values) or is_bool_dtype(values)): return ~np.isfinite(values) return ~np.isfinite(values.astype('float64'))
def nanprod(values, axis=None, skipna=True, min_count=0, mask=None): """ Parameters ---------- values : ndarray[dtype] axis: int, optional skipna : bool, default True min_count: int, default 0 mask : ndarray[bool], optional nan-mask if known Returns ------- result : dtype Examples -------- >>> import pandas.core.nanops as nanops >>> s = pd.Series([1, 2, 3, np.nan]) >>> nanops.nanprod(s) 6.0 Returns -------- The product of all elements on a given axis. ( NaNs are treated as 1) """ if mask is None: mask = isna(values) if skipna and not is_any_int_dtype(values): values = values.copy() values[mask] = 1 result = values.prod(axis) return _maybe_null_out(result, axis, mask, min_count=min_count)
def na_op(x, y): try: result = op(x, y) except TypeError: if isinstance(y, list): y = construct_1d_object_array_from_listlike(y) if isinstance(y, (np.ndarray, ABCSeries)): if (is_bool_dtype(x.dtype) and is_bool_dtype(y.dtype)): result = op(x, y) # when would this be hit? else: x = _ensure_object(x) y = _ensure_object(y) result = lib.vec_binop(x, y, op) else: # let null fall thru if not isna(y): y = bool(y) try: result = lib.scalar_binop(x, y, op) except: msg = ("cannot compare a dtyped [{dtype}] array " "with a scalar of type [{type}]" ).format(dtype=x.dtype, type=type(y).__name__) raise TypeError(msg) return result
def insert(self, loc, item): """ Return a new IntervalIndex inserting new item at location. Follows Python list.append semantics for negative values. Only Interval objects and NA can be inserted into an IntervalIndex Parameters ---------- loc : int item : object Returns ------- new_index : IntervalIndex """ if isinstance(item, Interval): if item.closed != self.closed: raise ValueError('inserted item must be closed on the same ' 'side as the index') left_insert = item.left right_insert = item.right elif is_scalar(item) and isna(item): # GH 18295 left_insert = right_insert = item else: raise ValueError('can only insert Interval objects and NA into ' 'an IntervalIndex') new_left = self.left.insert(loc, left_insert) new_right = self.right.insert(loc, right_insert) return self._shallow_copy(new_left, new_right)
def backfill_2d(values, limit=None, mask=None, dtype=None): if dtype is None: dtype = values.dtype _method = None if is_float_dtype(values): name = 'backfill_2d_inplace_{name}'.format(name=dtype.name) _method = getattr(algos, name, None) elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): _method = _backfill_2d_datetime elif is_integer_dtype(values): values = ensure_float64(values) _method = algos.backfill_2d_inplace_float64 elif values.dtype == np.object_: _method = algos.backfill_2d_inplace_object if _method is None: raise ValueError('Invalid dtype for backfill_2d [{name}]' .format(name=dtype.name)) if mask is None: mask = isna(values) mask = mask.view(np.uint8) if np.all(values.shape): _method(values, mask, limit=limit) else: # for test coverage pass return values
def from_tuples(cls, data, closed='right', copy=False, dtype=None): if len(data): left, right = [], [] else: # ensure that empty data keeps input dtype left = right = data for d in data: if isna(d): lhs = rhs = np.nan else: name = cls.__name__ try: # need list of length 2 tuples, e.g. [(0, 1), (1, 2), ...] lhs, rhs = d except ValueError: msg = ('{name}.from_tuples requires tuples of ' 'length 2, got {tpl}').format(name=name, tpl=d) raise ValueError(msg) except TypeError: msg = ('{name}.from_tuples received an invalid ' 'item, {tpl}').format(name=name, tpl=d) raise TypeError(msg) left.append(lhs) right.append(rhs) return cls.from_arrays(left, right, closed, copy=False, dtype=dtype)
def nanprod(values, axis=None, skipna=True, min_count=0): mask = isna(values) if skipna and not is_any_int_dtype(values): values = values.copy() values[mask] = 1 result = values.prod(axis) return _maybe_null_out(result, axis, mask, min_count=min_count)
def wrapper(self, other): msg = "cannot compare a TimedeltaIndex with type {0}" func = getattr(super(TimedeltaIndex, self), opname) if _is_convertible_to_td(other) or other is NaT: try: other = _to_m8(other) except ValueError: # failed to parse as timedelta raise TypeError(msg.format(type(other))) result = func(other) if isna(other): result.fill(nat_result) else: if not is_list_like(other): raise TypeError(msg.format(type(other))) other = TimedeltaIndex(other).values result = func(other) result = _values_from_object(result) if isinstance(other, Index): o_mask = other.values.view('i8') == iNaT else: o_mask = other.view('i8') == iNaT if o_mask.any(): result[o_mask] = nat_result if self.hasnans: result[self._isnan] = nat_result # support of bool dtype indexers if is_bool_dtype(result): return result return Index(result)
def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: """ Evaluate a comparison operation `=`, `!=`, `>=`, `>`, `<=`, or `<`. Note: the caller is responsible for ensuring that numpy warnings are suppressed (with np.errstate(all="ignore")) if needed. Parameters ---------- left : np.ndarray or ExtensionArray right : object Cannot be a DataFrame, Series, or Index. op : {operator.eq, operator.ne, operator.gt, operator.ge, operator.lt, operator.le} Returns ------- ndarray or ExtensionArray """ # NB: We assume extract_array has already been called on left and right lvalues = ensure_wrapped_if_datetimelike(left) rvalues = ensure_wrapped_if_datetimelike(right) rvalues = lib.item_from_zerodim(rvalues) if isinstance(rvalues, list): # We don't catch tuple here bc we may be comparing e.g. MultiIndex # to a tuple that represents a single entry, see test_compare_tuple_strs rvalues = np.asarray(rvalues) if isinstance(rvalues, (np.ndarray, ABCExtensionArray)): # TODO: make this treatment consistent across ops and classes. # We are not catching all listlikes here (e.g. frozenset, tuple) # The ambiguous case is object-dtype. See GH#27803 if len(lvalues) != len(rvalues): raise ValueError( "Lengths must match to compare", lvalues.shape, rvalues.shape ) if should_extension_dispatch(lvalues, rvalues) or ( (isinstance(rvalues, (Timedelta, BaseOffset, Timestamp)) or right is NaT) and not is_object_dtype(lvalues.dtype) ): # Call the method on lvalues res_values = op(lvalues, rvalues) elif is_scalar(rvalues) and isna(rvalues): # TODO: but not pd.NA? # numpy does not like comparisons vs None if op is operator.ne: res_values = np.ones(lvalues.shape, dtype=bool) else: res_values = np.zeros(lvalues.shape, dtype=bool) elif is_numeric_v_string_like(lvalues, rvalues): # GH#36377 going through the numexpr path would incorrectly raise return invalid_comparison(lvalues, rvalues, op) elif is_object_dtype(lvalues.dtype) or isinstance(rvalues, str): res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues) else: res_values = _na_arithmetic_op(lvalues, rvalues, op, is_cmp=True) return res_values
def isna(self): return isna(self._ndarray)
def isna(self): return isna(self.left)
def maybe_upcast_putmask(result: np.ndarray, mask: np.ndarray, other): """ A safe version of putmask that potentially upcasts the result. The result is replaced with the first N elements of other, where N is the number of True values in mask. If the length of other is shorter than N, other will be repeated. Parameters ---------- result : ndarray The destination array. This will be mutated in-place if no upcasting is necessary. mask : boolean ndarray other : scalar The source value. Returns ------- result : ndarray changed : bool Set to true if the result array was upcasted. Examples -------- >>> result, _ = maybe_upcast_putmask(np.arange(1,6), np.array([False, True, False, True, True]), np.arange(21,23)) >>> result array([1, 21, 3, 22, 21]) """ if not isinstance(result, np.ndarray): raise ValueError("The result input must be a ndarray.") if not is_scalar(other): # We _could_ support non-scalar other, but until we have a compelling # use case, we assume away the possibility. raise ValueError("other must be a scalar") if mask.any(): # Two conversions for date-like dtypes that can't be done automatically # in np.place: # NaN -> NaT # integer or integer array -> date-like array if result.dtype.kind in ["m", "M"]: if is_scalar(other): if isna(other): other = result.dtype.type("nat") elif is_integer(other): other = np.array(other, dtype=result.dtype) elif is_integer_dtype(other): other = np.array(other, dtype=result.dtype) def changeit(): # try to directly set by expanding our array to full # length of the boolean try: om = other[mask] except (IndexError, TypeError): # IndexError occurs in test_upcast when we have a boolean # mask of the wrong shape # TypeError occurs in test_upcast when `other` is a bool pass else: om_at = om.astype(result.dtype) if (om == om_at).all(): new_result = result.values.copy() new_result[mask] = om_at result[:] = new_result return result, False # we are forced to change the dtype of the result as the input # isn't compatible r, _ = maybe_upcast(result, fill_value=other, copy=True) np.place(r, mask, other) return r, True # we want to decide whether place will work # if we have nans in the False portion of our mask then we need to # upcast (possibly), otherwise we DON't want to upcast (e.g. if we # have values, say integers, in the success portion then it's ok to not # upcast) new_dtype, _ = maybe_promote(result.dtype, other) if new_dtype != result.dtype: # we have a scalar or len 0 ndarray # and its nan and we are changing some values if is_scalar(other) or (isinstance(other, np.ndarray) and other.ndim < 1): if isna(other): return changeit() # we have an ndarray and the masking has nans in it else: if isna(other).any(): return changeit() try: np.place(result, mask, other) except TypeError: # e.g. int-dtype result and float-dtype other return changeit() return result, False
def _bins_to_cuts(x, bins, right=True, labels=None, precision=3, include_lowest=False, dtype=None, duplicates='raise'): if duplicates not in ['raise', 'drop']: raise ValueError("invalid value for 'duplicates' parameter, " "valid options are: raise, drop") if isinstance(bins, IntervalIndex): # we have a fast-path here ids = bins.get_indexer(x) result = algos.take_nd(bins, ids) result = Categorical(result, categories=bins, ordered=True) return result, bins unique_bins = algos.unique(bins) if len(unique_bins) < len(bins) and len(bins) != 2: if duplicates == 'raise': raise ValueError("Bin edges must be unique: {bins!r}.\nYou " "can drop duplicate edges by setting " "the 'duplicates' kwarg".format(bins=bins)) else: bins = unique_bins side = 'left' if right else 'right' ids = _ensure_int64(bins.searchsorted(x, side=side)) if include_lowest: ids[x == bins[0]] = 1 na_mask = isna(x) | (ids == len(bins)) | (ids == 0) has_nas = na_mask.any() if labels is not False: if labels is None: labels = _format_labels(bins, precision, right=right, include_lowest=include_lowest, dtype=dtype) else: if len(labels) != len(bins) - 1: raise ValueError('Bin labels must be one fewer than ' 'the number of bin edges') if not is_categorical_dtype(labels): labels = Categorical(labels, categories=labels, ordered=True) np.putmask(ids, na_mask, 0) result = algos.take_nd(labels, ids - 1) else: result = ids - 1 if has_nas: result = result.astype(np.float64) np.putmask(result, na_mask, np.nan) return result, bins
def na_accum_func(values: ArrayLike, accum_func, skipna: bool) -> ArrayLike: """ Cumulative function with skipna support. Parameters ---------- values : np.ndarray or ExtensionArray accum_func : {np.cumprod, np.maximum.accumulate, np.cumsum, np.minimum.accumulate} skipna : bool Returns ------- np.ndarray or ExtensionArray """ mask_a, mask_b = { np.cumprod: (1.0, np.nan), np.maximum.accumulate: (-np.inf, np.nan), np.cumsum: (0.0, np.nan), np.minimum.accumulate: (np.inf, np.nan), }[accum_func] # We will be applying this function to block values if values.dtype.kind in ["m", "M"]: # GH#30460, GH#29058 # numpy 1.18 started sorting NaTs at the end instead of beginning, # so we need to work around to maintain backwards-consistency. orig_dtype = values.dtype # We need to define mask before masking NaTs mask = isna(values) if accum_func == np.minimum.accumulate: # Note: the accum_func comparison fails as an "is" comparison y = values.view("i8") y[mask] = np.iinfo(np.int64).max changed = True else: y = values changed = False result = accum_func(y.view("i8"), axis=0) if skipna: result[mask] = iNaT elif accum_func == np.minimum.accumulate: # Restore NaTs that we masked previously nz = (~np.asarray(mask)).nonzero()[0] if len(nz): # everything up to the first non-na entry stays NaT result[:nz[0]] = iNaT if changed: # restore NaT elements y[mask] = iNaT # TODO: could try/finally for this? if isinstance(values, np.ndarray): result = result.view(orig_dtype) else: # DatetimeArray result = type(values)._from_sequence(result, dtype=orig_dtype) elif skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)): vals = values.copy() mask = isna(vals) vals[mask] = mask_a result = accum_func(vals, axis=0) result[mask] = mask_b else: result = accum_func(values, axis=0) return result
def f(x): if isna(x): return [np.nan] * len(columns) return x.components
def result_index(self): if len(self.binlabels) != 0 and isna(self.binlabels[0]): return self.binlabels[1:] return self.binlabels
def coerce_to_array(values, dtype=None, mask=None, copy: bool = False) -> tuple[np.ndarray, np.ndarray]: """ Coerce the input values array to numpy arrays with a mask. Parameters ---------- values : 1D list-like dtype : float dtype mask : bool 1D array, optional copy : bool, default False if True, copy the input Returns ------- tuple of (values, mask) """ # if values is floating numpy array, preserve its dtype if dtype is None and hasattr(values, "dtype"): if is_float_dtype(values.dtype): dtype = values.dtype if dtype is not None: if isinstance(dtype, str) and dtype.startswith("Float"): # Avoid DeprecationWarning from NumPy about np.dtype("Float64") # https://github.com/numpy/numpy/pull/7476 dtype = dtype.lower() if not issubclass(type(dtype), FloatingDtype): try: dtype = FLOAT_STR_TO_DTYPE[str(np.dtype(dtype))] except KeyError as err: raise ValueError(f"invalid dtype specified {dtype}") from err if isinstance(values, FloatingArray): values, mask = values._data, values._mask if dtype is not None: values = values.astype(dtype.numpy_dtype, copy=False) if copy: values = values.copy() mask = mask.copy() return values, mask values = np.array(values, copy=copy) if is_object_dtype(values): inferred_type = lib.infer_dtype(values, skipna=True) if inferred_type == "empty": values = np.empty(len(values)) values.fill(np.nan) elif inferred_type not in [ "floating", "integer", "mixed-integer", "integer-na", "mixed-integer-float", ]: raise TypeError( f"{values.dtype} cannot be converted to a FloatingDtype") elif is_bool_dtype(values) and is_float_dtype(dtype): values = np.array(values, dtype=float, copy=copy) elif not (is_integer_dtype(values) or is_float_dtype(values)): raise TypeError( f"{values.dtype} cannot be converted to a FloatingDtype") if mask is None: mask = isna(values) else: assert len(mask) == len(values) if not values.ndim == 1: raise TypeError("values must be a 1D list-like") if not mask.ndim == 1: raise TypeError("mask must be a 1D list-like") # infer dtype if needed if dtype is None: dtype = np.dtype("float64") else: dtype = dtype.type # if we are float, let's make sure that we can # safely cast # we copy as need to coerce here # TODO should this be a safe cast? if mask.any(): values = values.copy() values[mask] = np.nan values = values.astype(dtype, copy=False) # , casting="safe") else: values = values.astype(dtype, copy=False) # , casting="safe") return values, mask
def __init__( self, obj: NDFrame, com: float | None = None, span: float | None = None, halflife: float | TimedeltaConvertibleTypes | None = None, alpha: float | None = None, min_periods: int | None = 0, adjust: bool = True, ignore_na: bool = False, axis: Axis = 0, times: str | np.ndarray | NDFrame | None = None, method: str = "single", *, selection=None, ): super().__init__( obj=obj, min_periods=1 if min_periods is None else max(int(min_periods), 1), on=None, center=False, closed=None, method=method, axis=axis, selection=selection, ) self.com = com self.span = span self.halflife = halflife self.alpha = alpha self.adjust = adjust self.ignore_na = ignore_na self.times = times if self.times is not None: if not self.adjust: raise NotImplementedError( "times is not supported with adjust=False.") if isinstance(self.times, str): warnings.warn( ("Specifying times as a string column label is deprecated " "and will be removed in a future version. Pass the column " "into times instead."), FutureWarning, stacklevel=find_stack_level(), ) self.times = self._selected_obj[self.times] if not is_datetime64_ns_dtype(self.times): raise ValueError("times must be datetime64[ns] dtype.") # error: Argument 1 to "len" has incompatible type "Union[str, ndarray, # NDFrameT, None]"; expected "Sized" if len(self.times) != len(obj): # type: ignore[arg-type] raise ValueError( "times must be the same length as the object.") if not isinstance(self.halflife, (str, datetime.timedelta)): raise ValueError( "halflife must be a string or datetime.timedelta object") if isna(self.times).any(): raise ValueError("Cannot convert NaT values to integer") self._deltas = _calculate_deltas(self.times, self.halflife) # Halflife is no longer applicable when calculating COM # But allow COM to still be calculated if the user passes other decay args if common.count_not_none(self.com, self.span, self.alpha) > 0: self._com = get_center_of_mass(self.com, self.span, None, self.alpha) else: self._com = 1.0 else: if self.halflife is not None and isinstance( self.halflife, (str, datetime.timedelta)): raise ValueError( "halflife can only be a timedelta convertible argument if " "times is not None.") # Without times, points are equally spaced self._deltas = np.ones(max(self.obj.shape[self.axis] - 1, 0), dtype=np.float64) self._com = get_center_of_mass( # error: Argument 3 to "get_center_of_mass" has incompatible type # "Union[float, Any, None, timedelta64, signedinteger[_64Bit]]"; # expected "Optional[float]" self.com, self.span, self.halflife, # type: ignore[arg-type] self.alpha, )
def coerce_to_array(values, mask=None, copy: bool = False) -> Tuple[np.ndarray, np.ndarray]: """ Coerce the input values array to numpy arrays with a mask. Parameters ---------- values : 1D list-like mask : bool 1D array, optional copy : bool, default False if True, copy the input Returns ------- tuple of (values, mask) """ if isinstance(values, BooleanArray): if mask is not None: raise ValueError("cannot pass mask for BooleanArray input") values, mask = values._data, values._mask if copy: values = values.copy() mask = mask.copy() return values, mask mask_values = None if isinstance(values, np.ndarray) and values.dtype == np.bool_: if copy: values = values.copy() elif isinstance(values, np.ndarray) and is_numeric_dtype(values.dtype): mask_values = isna(values) values_bool = np.zeros(len(values), dtype=bool) values_bool[~mask_values] = values[~mask_values].astype(bool) if not np.all(values_bool[~mask_values].astype(values.dtype) == values[~mask_values]): raise TypeError("Need to pass bool-like values") values = values_bool else: values_object = np.asarray(values, dtype=object) inferred_dtype = lib.infer_dtype(values_object, skipna=True) integer_like = ("floating", "integer", "mixed-integer-float") if inferred_dtype not in ("boolean", "empty") + integer_like: raise TypeError("Need to pass bool-like values") mask_values = isna(values_object) values = np.zeros(len(values), dtype=bool) values[~mask_values] = values_object[~mask_values].astype(bool) # if the values were integer-like, validate it were actually 0/1's if inferred_dtype in integer_like: if not np.all(values[~mask_values].astype(float) == values_object[~mask_values].astype(float)): raise TypeError("Need to pass bool-like values") if mask is None and mask_values is None: mask = np.zeros(len(values), dtype=bool) elif mask is None: mask = mask_values else: if isinstance(mask, np.ndarray) and mask.dtype == np.bool_: if mask_values is not None: mask = mask | mask_values else: if copy: mask = mask.copy() else: mask = np.array(mask, dtype=bool) if mask_values is not None: mask = mask | mask_values if not values.ndim == 1: raise ValueError("values must be a 1D list-like") if not mask.ndim == 1: raise ValueError("mask must be a 1D list-like") return values, mask
def sanitize_array( data, index: Optional["Index"], dtype: Optional[DtypeObj] = None, copy: bool = False, raise_cast_failure: bool = False, ) -> ArrayLike: """ Sanitize input data to an ndarray or ExtensionArray, copy if specified, coerce to the dtype if specified. """ if isinstance(data, ma.MaskedArray): mask = ma.getmaskarray(data) if mask.any(): data, fill_value = maybe_upcast(data, copy=True) data.soften_mask() # set hardmask False if it was True data[mask] = fill_value else: data = data.copy() # extract ndarray or ExtensionArray, ensure we have no PandasArray data = extract_array(data, extract_numpy=True) # GH#846 if isinstance(data, np.ndarray): if dtype is not None and is_float_dtype( data.dtype) and is_integer_dtype(dtype): # possibility of nan -> garbage try: subarr = _try_cast(data, dtype, copy, True) except ValueError: if copy: subarr = data.copy() else: subarr = np.array(data, copy=False) else: # we will try to copy be-definition here subarr = _try_cast(data, dtype, copy, raise_cast_failure) elif isinstance(data, ABCExtensionArray): # it is already ensured above this is not a PandasArray subarr = data if dtype is not None: subarr = subarr.astype(dtype, copy=copy) elif copy: subarr = subarr.copy() return subarr elif isinstance(data, (list, tuple)) and len(data) > 0: if dtype is not None: subarr = _try_cast(data, dtype, copy, raise_cast_failure) else: subarr = maybe_convert_platform(data) subarr = maybe_cast_to_datetime(subarr, dtype) elif isinstance(data, range): # GH#16804 arr = np.arange(data.start, data.stop, data.step, dtype="int64") subarr = _try_cast(arr, dtype, copy, raise_cast_failure) elif isinstance(data, abc.Set): raise TypeError("Set type is unordered") elif lib.is_scalar(data) and index is not None and dtype is not None: data = maybe_cast_to_datetime(data, dtype) if not lib.is_scalar(data): data = data[0] subarr = construct_1d_arraylike_from_scalar(data, len(index), dtype) else: subarr = _try_cast(data, dtype, copy, raise_cast_failure) # scalar like, GH if getattr(subarr, "ndim", 0) == 0: if isinstance(data, list): # pragma: no cover subarr = np.array(data, dtype=object) elif index is not None: value = data # figure out the dtype from the value (upcast if necessary) if dtype is None: dtype, value = infer_dtype_from_scalar(value) else: # need to possibly convert the value here value = maybe_cast_to_datetime(value, dtype) subarr = construct_1d_arraylike_from_scalar( value, len(index), dtype) else: return subarr.item() # the result that we want elif subarr.ndim == 1: if index is not None: # a 1-element ndarray if len(subarr) != len(index) and len(subarr) == 1: subarr = construct_1d_arraylike_from_scalar( subarr[0], len(index), subarr.dtype) elif subarr.ndim > 1: if isinstance(data, np.ndarray): raise Exception("Data must be 1-dimensional") else: subarr = com.asarray_tuplesafe(data, dtype=dtype) if not (is_extension_array_dtype(subarr.dtype) or is_extension_array_dtype(dtype)): # This is to prevent mixed-type Series getting all casted to # NumPy string type, e.g. NaN --> '-1#IND'. if issubclass(subarr.dtype.type, str): # GH#16605 # If not empty convert the data to dtype # GH#19853: If data is a scalar, subarr has already the result if not lib.is_scalar(data): if not np.all(isna(data)): data = np.array(data, dtype=dtype, copy=False) subarr = np.array(data, dtype=object, copy=copy) if is_object_dtype(subarr.dtype) and not is_object_dtype(dtype): inferred = lib.infer_dtype(subarr, skipna=False) if inferred in {"interval", "period"}: subarr = array(subarr) return subarr
def _interpolate_scipy_wrapper(x, y, new_x, method, fill_value=None, bounds_error=False, order=None, **kwargs): """ Passed off to scipy.interpolate.interp1d. method is scipy's kind. Returns an array interpolated at new_x. Add any new methods to the list in _clean_interp_method. """ extra = f"{method} interpolation requires SciPy." import_optional_dependency("scipy", extra=extra) from scipy import interpolate new_x = np.asarray(new_x) # ignores some kwargs that could be passed along. alt_methods = { "barycentric": interpolate.barycentric_interpolate, "krogh": interpolate.krogh_interpolate, "from_derivatives": _from_derivatives, "piecewise_polynomial": _from_derivatives, } if getattr(x, "is_all_dates", False): # GH 5975, scipy.interp1d can't handle datetime64s x, new_x = x._values.astype("i8"), new_x.astype("i8") if method == "pchip": alt_methods["pchip"] = interpolate.pchip_interpolate elif method == "akima": alt_methods["akima"] = _akima_interpolate elif method == "cubicspline": alt_methods["cubicspline"] = _cubicspline_interpolate interp1d_methods = [ "nearest", "zero", "slinear", "quadratic", "cubic", "polynomial", ] if method in interp1d_methods: if method == "polynomial": method = order terp = interpolate.interp1d(x, y, kind=method, fill_value=fill_value, bounds_error=bounds_error) new_y = terp(new_x) elif method == "spline": # GH #10633, #24014 if isna(order) or (order <= 0): raise ValueError( f"order needs to be specified and greater than 0; got order: {order}" ) terp = interpolate.UnivariateSpline(x, y, k=order, **kwargs) new_y = terp(new_x) else: # GH 7295: need to be able to write for some reason # in some circumstances: check all three if not x.flags.writeable: x = x.copy() if not y.flags.writeable: y = y.copy() if not new_x.flags.writeable: new_x = new_x.copy() method = alt_methods[method] new_y = method(x, y, new_x, **kwargs) return new_y
def nargsort( items, kind: str = "quicksort", ascending: bool = True, na_position: str = "last", key: Optional[Callable] = None, mask: Optional[np.ndarray] = None, ): """ Intended to be a drop-in replacement for np.argsort which handles NaNs. Adds ascending, na_position, and key parameters. (GH #6399, #5231, #27237) Parameters ---------- kind : str, default 'quicksort' ascending : bool, default True na_position : {'first', 'last'}, default 'last' key : Optional[Callable], default None mask : Optional[np.ndarray], default None Passed when called by ExtensionArray.argsort. """ if key is not None: items = ensure_key_mapped(items, key) return nargsort( items, kind=kind, ascending=ascending, na_position=na_position, key=None, mask=mask, ) items = extract_array(items) if mask is None: mask = np.asarray(isna(items)) if is_extension_array_dtype(items): return items.argsort(ascending=ascending, kind=kind, na_position=na_position) else: items = np.asanyarray(items) idx = np.arange(len(items)) non_nans = items[~mask] non_nan_idx = idx[~mask] nan_idx = np.nonzero(mask)[0] if not ascending: non_nans = non_nans[::-1] non_nan_idx = non_nan_idx[::-1] indexer = non_nan_idx[non_nans.argsort(kind=kind)] if not ascending: indexer = indexer[::-1] # Finally, place the NaNs at the end or the beginning according to # na_position if na_position == "last": indexer = np.concatenate([indexer, nan_idx]) elif na_position == "first": indexer = np.concatenate([nan_idx, indexer]) else: raise ValueError(f"invalid na_position: {na_position}") return indexer
def interpolate_1d( xvalues: np.ndarray, yvalues: np.ndarray, method: Optional[str] = "linear", limit: Optional[int] = None, limit_direction: str = "forward", limit_area: Optional[str] = None, fill_value: Optional[Any] = None, bounds_error: bool = False, order: Optional[int] = None, **kwargs, ): """ Logic for the 1-d interpolation. The result should be 1-d, inputs xvalues and yvalues will each be 1-d arrays of the same length. Bounds_error is currently hardcoded to False since non-scipy ones don't take it as an argument. """ invalid = isna(yvalues) valid = ~invalid if not valid.any(): # have to call np.asarray(xvalues) since xvalues could be an Index # which can't be mutated result = np.empty_like(np.asarray(xvalues), dtype=np.float64) result.fill(np.nan) return result if valid.all(): return yvalues if method == "time": if not getattr(xvalues, "is_all_dates", None): # if not issubclass(xvalues.dtype.type, np.datetime64): raise ValueError("time-weighted interpolation only works " "on Series or DataFrames with a " "DatetimeIndex") method = "values" valid_limit_directions = ["forward", "backward", "both"] limit_direction = limit_direction.lower() if limit_direction not in valid_limit_directions: raise ValueError("Invalid limit_direction: expecting one of " f"{valid_limit_directions}, got '{limit_direction}'.") if limit_area is not None: valid_limit_areas = ["inside", "outside"] limit_area = limit_area.lower() if limit_area not in valid_limit_areas: raise ValueError( f"Invalid limit_area: expecting one of {valid_limit_areas}, got " f"{limit_area}.") # default limit is unlimited GH #16282 limit = algos._validate_limit(nobs=None, limit=limit) # These are sets of index pointers to invalid values... i.e. {0, 1, etc... all_nans = set(np.flatnonzero(invalid)) start_nans = set(range(find_valid_index(yvalues, "first"))) end_nans = set(range(1 + find_valid_index(yvalues, "last"), len(valid))) mid_nans = all_nans - start_nans - end_nans # Like the sets above, preserve_nans contains indices of invalid values, # but in this case, it is the final set of indices that need to be # preserved as NaN after the interpolation. # For example if limit_direction='forward' then preserve_nans will # contain indices of NaNs at the beginning of the series, and NaNs that # are more than'limit' away from the prior non-NaN. # set preserve_nans based on direction using _interp_limit preserve_nans: Union[List, Set] if limit_direction == "forward": preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0)) elif limit_direction == "backward": preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit)) else: # both directions... just use _interp_limit preserve_nans = set(_interp_limit(invalid, limit, limit)) # if limit_area is set, add either mid or outside indices # to preserve_nans GH #16284 if limit_area == "inside": # preserve NaNs on the outside preserve_nans |= start_nans | end_nans elif limit_area == "outside": # preserve NaNs on the inside preserve_nans |= mid_nans # sort preserve_nans and covert to list preserve_nans = sorted(preserve_nans) yvalues = getattr(yvalues, "values", yvalues) result = yvalues.copy() # xvalues to pass to NumPy/SciPy xvalues = getattr(xvalues, "values", xvalues) if method == "linear": inds = xvalues else: inds = np.asarray(xvalues) # hack for DatetimeIndex, #1646 if needs_i8_conversion(inds.dtype): inds = inds.view(np.int64) if method in ("values", "index"): if inds.dtype == np.object_: inds = lib.maybe_convert_objects(inds) if method in NP_METHODS: # np.interp requires sorted X values, #21037 indexer = np.argsort(inds[valid]) result[invalid] = np.interp(inds[invalid], inds[valid][indexer], yvalues[valid][indexer]) else: result[invalid] = _interpolate_scipy_wrapper( inds[valid], yvalues[valid], inds[invalid], method=method, fill_value=fill_value, bounds_error=bounds_error, order=order, **kwargs, ) result[preserve_nans] = np.nan return result
def _map_values(self, mapper, na_action=None): """ An internal function that maps values using the input correspondence (which can be a dict, Series, or function). Parameters ---------- mapper : function, dict, or Series The input correspondence object na_action : {None, 'ignore'} If 'ignore', propagate NA values, without passing them to the mapping function Returns ------- Union[Index, MultiIndex], inferred The output of the mapping function applied to the index. If the function returns a tuple with more than one element a MultiIndex will be returned. """ # we can fastpath dict/Series to an efficient map # as we know that we are not going to have to yield # python types if is_dict_like(mapper): if isinstance(mapper, dict) and hasattr(mapper, "__missing__"): # If a dictionary subclass defines a default value method, # convert mapper to a lookup function (GH #15999). dict_with_default = mapper mapper = lambda x: dict_with_default[x] else: # Dictionary does not have a default. Thus it's safe to # convert to an Series for efficiency. # we specify the keys here to handle the # possibility that they are tuples # The return value of mapping with an empty mapper is # expected to be pd.Series(np.nan, ...). As np.nan is # of dtype float64 the return value of this method should # be float64 as well mapper = create_series_with_explicit_dtype( mapper, dtype_if_empty=np.float64) if isinstance(mapper, ABCSeries): # Since values were input this means we came from either # a dict or a series and mapper should be an index if is_categorical_dtype(self.dtype): # use the built in categorical series mapper which saves # time by mapping the categories instead of all values cat = cast("Categorical", self._values) return cat.map(mapper) values = self._values indexer = mapper.index.get_indexer(values) new_values = algorithms.take_nd(mapper._values, indexer) return new_values # we must convert to python types if is_extension_array_dtype(self.dtype) and hasattr( self._values, "map"): # GH#23179 some EAs do not have `map` values = self._values if na_action is not None: raise NotImplementedError map_f = lambda values, f: values.map(f) else: values = self._values.astype(object) if na_action == "ignore": map_f = lambda values, f: lib.map_infer_mask( values, f, isna(values).view(np.uint8)) elif na_action is None: map_f = lib.map_infer else: msg = ("na_action must either be 'ignore' or None, " f"{na_action} was passed") raise ValueError(msg) # mapper is a function new_values = map_f(values, mapper) return new_values
def isna(self): return isna(self._values)
def _is_na_fill_value(self): return isna(self.fill_value)
def sanitize_array(data, index, dtype=None, copy=False, raise_cast_failure=False): """ Sanitize input data to an ndarray, copy if specified, coerce to the dtype if specified. """ if dtype is not None: dtype = pandas_dtype(dtype) if isinstance(data, ma.MaskedArray): mask = ma.getmaskarray(data) if mask.any(): data, fill_value = maybe_upcast(data, copy=True) data.soften_mask() # set hardmask False if it was True data[mask] = fill_value else: data = data.copy() data = extract_array(data, extract_numpy=True) # GH#846 if isinstance(data, np.ndarray): if dtype is not None: subarr = np.array(data, copy=False) # possibility of nan -> garbage if is_float_dtype(data.dtype) and is_integer_dtype(dtype): try: subarr = _try_cast(data, True, dtype, copy, True) except ValueError: if copy: subarr = data.copy() else: subarr = _try_cast(data, True, dtype, copy, raise_cast_failure) elif isinstance(data, Index): # don't coerce Index types # e.g. indexes can have different conversions (so don't fast path # them) # GH#6140 subarr = sanitize_index(data, index, copy=copy) else: # we will try to copy be-definition here subarr = _try_cast(data, True, dtype, copy, raise_cast_failure) elif isinstance(data, ExtensionArray): if isinstance(data, ABCPandasArray): # We don't want to let people put our PandasArray wrapper # (the output of Series/Index.array), into a Series. So # we explicitly unwrap it here. subarr = data.to_numpy() else: subarr = data # everything else in this block must also handle ndarray's, # becuase we've unwrapped PandasArray into an ndarray. if dtype is not None: subarr = data.astype(dtype) if copy: subarr = data.copy() return subarr elif isinstance(data, (list, tuple)) and len(data) > 0: if dtype is not None: try: subarr = _try_cast(data, False, dtype, copy, raise_cast_failure) except Exception: if raise_cast_failure: # pragma: no cover raise subarr = np.array(data, dtype=object, copy=copy) subarr = lib.maybe_convert_objects(subarr) else: subarr = maybe_convert_platform(data) subarr = maybe_cast_to_datetime(subarr, dtype) elif isinstance(data, range): # GH#16804 arr = np.arange(data.start, data.stop, data.step, dtype='int64') subarr = _try_cast(arr, False, dtype, copy, raise_cast_failure) else: subarr = _try_cast(data, False, dtype, copy, raise_cast_failure) # scalar like, GH if getattr(subarr, 'ndim', 0) == 0: if isinstance(data, list): # pragma: no cover subarr = np.array(data, dtype=object) elif index is not None: value = data # figure out the dtype from the value (upcast if necessary) if dtype is None: dtype, value = infer_dtype_from_scalar(value) else: # need to possibly convert the value here value = maybe_cast_to_datetime(value, dtype) subarr = construct_1d_arraylike_from_scalar( value, len(index), dtype) else: return subarr.item() # the result that we want elif subarr.ndim == 1: if index is not None: # a 1-element ndarray if len(subarr) != len(index) and len(subarr) == 1: subarr = construct_1d_arraylike_from_scalar( subarr[0], len(index), subarr.dtype) elif subarr.ndim > 1: if isinstance(data, np.ndarray): raise Exception('Data must be 1-dimensional') else: subarr = com.asarray_tuplesafe(data, dtype=dtype) # This is to prevent mixed-type Series getting all casted to # NumPy string type, e.g. NaN --> '-1#IND'. if issubclass(subarr.dtype.type, str): # GH#16605 # If not empty convert the data to dtype # GH#19853: If data is a scalar, subarr has already the result if not lib.is_scalar(data): if not np.all(isna(data)): data = np.array(data, dtype=dtype, copy=False) subarr = np.array(data, dtype=object, copy=copy) if is_object_dtype(subarr.dtype) and dtype != 'object': inferred = lib.infer_dtype(subarr, skipna=False) if inferred == 'period': try: subarr = period_array(subarr) except IncompatibleFrequency: pass return subarr
def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False): """ Cast the elements of an array to a given dtype a nan-safe manner. Parameters ---------- arr : ndarray dtype : np.dtype copy : bool, default True If False, a view will be attempted but may fail, if e.g. the item sizes don't align. skipna: bool, default False Whether or not we should skip NaN when casting as a string-type. Raises ------ ValueError The dtype was a datetime64/timedelta64 dtype, but it had no unit. """ # dispatch on extension dtype if needed if is_extension_array_dtype(dtype): return dtype.construct_array_type()._from_sequence(arr, dtype=dtype, copy=copy) if not isinstance(dtype, np.dtype): dtype = pandas_dtype(dtype) if issubclass(dtype.type, str): return lib.astype_str(arr.ravel(), skipna=skipna).reshape(arr.shape) elif is_datetime64_dtype(arr): if is_object_dtype(dtype): return tslib.ints_to_pydatetime(arr.view(np.int64)) elif dtype == np.int64: if isna(arr).any(): raise ValueError("Cannot convert NaT values to integer") return arr.view(dtype) # allow frequency conversions if dtype.kind == "M": return arr.astype(dtype) raise TypeError( f"cannot astype a datetimelike from [{arr.dtype}] to [{dtype}]") elif is_timedelta64_dtype(arr): if is_object_dtype(dtype): return tslibs.ints_to_pytimedelta(arr.view(np.int64)) elif dtype == np.int64: if isna(arr).any(): raise ValueError("Cannot convert NaT values to integer") return arr.view(dtype) if dtype not in [_INT64_DTYPE, _TD_DTYPE]: # allow frequency conversions # we return a float here! if dtype.kind == "m": mask = isna(arr) result = arr.astype(dtype).astype(np.float64) result[mask] = np.nan return result elif dtype == _TD_DTYPE: return arr.astype(_TD_DTYPE, copy=copy) raise TypeError( f"cannot astype a timedelta from [{arr.dtype}] to [{dtype}]") elif np.issubdtype(arr.dtype, np.floating) and np.issubdtype( dtype, np.integer): if not np.isfinite(arr).all(): raise ValueError( "Cannot convert non-finite values (NA or inf) to integer") elif is_object_dtype(arr): # work around NumPy brokenness, #1987 if np.issubdtype(dtype.type, np.integer): return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape) # if we have a datetime/timedelta array of objects # then coerce to a proper dtype and recall astype_nansafe elif is_datetime64_dtype(dtype): from pandas import to_datetime return astype_nansafe(to_datetime(arr).values, dtype, copy=copy) elif is_timedelta64_dtype(dtype): from pandas import to_timedelta return astype_nansafe(to_timedelta(arr).values, dtype, copy=copy) if dtype.name in ("datetime64", "timedelta64"): msg = (f"The '{dtype.name}' dtype has no unit. Please pass in " f"'{dtype.name}[ns]' instead.") raise ValueError(msg) if copy or is_object_dtype(arr) or is_object_dtype(dtype): # Explicit copy, or required since NumPy can't view from / to object. return arr.astype(dtype, copy=True) return arr.view(dtype)
def soft_convert_objects( values: np.ndarray, datetime: bool = True, numeric: bool = True, timedelta: bool = True, coerce: bool = False, copy: bool = True, ): """ if we have an object dtype, try to coerce dates and/or numbers """ validate_bool_kwarg(datetime, "datetime") validate_bool_kwarg(numeric, "numeric") validate_bool_kwarg(timedelta, "timedelta") validate_bool_kwarg(coerce, "coerce") validate_bool_kwarg(copy, "copy") conversion_count = sum((datetime, numeric, timedelta)) if conversion_count == 0: raise ValueError( "At least one of datetime, numeric or timedelta must be True.") elif conversion_count > 1 and coerce: raise ValueError("Only one of 'datetime', 'numeric' or " "'timedelta' can be True when when coerce=True.") if not is_object_dtype(values.dtype): # If not object, do not attempt conversion values = values.copy() if copy else values return values # If 1 flag is coerce, ensure 2 others are False if coerce: # Immediate return if coerce if datetime: from pandas import to_datetime return to_datetime(values, errors="coerce").to_numpy() elif timedelta: from pandas import to_timedelta return to_timedelta(values, errors="coerce").to_numpy() elif numeric: from pandas import to_numeric return to_numeric(values, errors="coerce") # Soft conversions if datetime: # GH 20380, when datetime is beyond year 2262, hence outside # bound of nanosecond-resolution 64-bit integers. try: values = lib.maybe_convert_objects(values, convert_datetime=True) except OutOfBoundsDatetime: pass if timedelta and is_object_dtype(values.dtype): # Object check to ensure only run if previous did not convert values = lib.maybe_convert_objects(values, convert_timedelta=True) if numeric and is_object_dtype(values.dtype): try: converted = lib.maybe_convert_numeric(values, set(), coerce_numeric=True) except (ValueError, TypeError): pass else: # If all NaNs, then do not-alter values = converted if not isna(converted).all() else values values = values.copy() if copy else values return values
def fmt(x): if isna(x): return 'NaN' return str(x)
def maybe_promote(dtype, fill_value=np.nan): """ Find the minimal dtype that can hold both the given dtype and fill_value. Parameters ---------- dtype : np.dtype or ExtensionDtype fill_value : scalar, default np.nan Returns ------- dtype Upcasted from dtype argument if necessary. fill_value Upcasted from fill_value argument if necessary. """ if not is_scalar(fill_value) and not is_object_dtype(dtype): # with object dtype there is nothing to promote, and the user can # pass pretty much any weird fill_value they like raise ValueError("fill_value must be a scalar") # if we passed an array here, determine the fill value by dtype if isinstance(fill_value, np.ndarray): if issubclass(fill_value.dtype.type, (np.datetime64, np.timedelta64)): fill_value = fill_value.dtype.type("NaT", "ns") else: # we need to change to object type as our # fill_value is of object type if fill_value.dtype == np.object_: dtype = np.dtype(np.object_) fill_value = np.nan if dtype == np.object_ or dtype.kind in ["U", "S"]: # We treat string-like dtypes as object, and _always_ fill # with np.nan fill_value = np.nan dtype = np.dtype(np.object_) # returns tuple of (dtype, fill_value) if issubclass(dtype.type, np.datetime64): if isinstance(fill_value, datetime) and fill_value.tzinfo is not None: # Trying to insert tzaware into tznaive, have to cast to object dtype = np.dtype(np.object_) elif is_integer(fill_value) or (is_float(fill_value) and not isna(fill_value)): dtype = np.dtype(np.object_) else: try: fill_value = tslibs.Timestamp(fill_value).to_datetime64() except (TypeError, ValueError): dtype = np.dtype(np.object_) elif issubclass(dtype.type, np.timedelta64): if (is_integer(fill_value) or (is_float(fill_value) and not np.isnan(fill_value)) or isinstance(fill_value, str)): # TODO: What about str that can be a timedelta? dtype = np.dtype(np.object_) else: try: fv = tslibs.Timedelta(fill_value) except ValueError: dtype = np.dtype(np.object_) else: if fv is NaT: # NaT has no `to_timedelta64` method fill_value = np.timedelta64("NaT", "ns") else: fill_value = fv.to_timedelta64() elif is_datetime64tz_dtype(dtype): if isna(fill_value): fill_value = NaT elif not isinstance(fill_value, datetime): dtype = np.dtype(np.object_) elif fill_value.tzinfo is None: dtype = np.dtype(np.object_) elif not tz_compare(fill_value.tzinfo, dtype.tz): # TODO: sure we want to cast here? dtype = np.dtype(np.object_) elif is_extension_array_dtype(dtype) and isna(fill_value): fill_value = dtype.na_value elif is_float(fill_value): if issubclass(dtype.type, np.bool_): dtype = np.dtype(np.object_) elif issubclass(dtype.type, np.integer): dtype = np.dtype(np.float64) elif dtype.kind == "f": mst = np.min_scalar_type(fill_value) if mst > dtype: # e.g. mst is np.float64 and dtype is np.float32 dtype = mst elif dtype.kind == "c": mst = np.min_scalar_type(fill_value) dtype = np.promote_types(dtype, mst) elif is_bool(fill_value): if not issubclass(dtype.type, np.bool_): dtype = np.dtype(np.object_) elif is_integer(fill_value): if issubclass(dtype.type, np.bool_): dtype = np.dtype(np.object_) elif issubclass(dtype.type, np.integer): if not np.can_cast(fill_value, dtype): # upcast to prevent overflow mst = np.min_scalar_type(fill_value) dtype = np.promote_types(dtype, mst) if dtype.kind == "f": # Case where we disagree with numpy dtype = np.dtype(np.object_) elif is_complex(fill_value): if issubclass(dtype.type, np.bool_): dtype = np.dtype(np.object_) elif issubclass(dtype.type, (np.integer, np.floating)): mst = np.min_scalar_type(fill_value) dtype = np.promote_types(dtype, mst) elif dtype.kind == "c": mst = np.min_scalar_type(fill_value) if mst > dtype: # e.g. mst is np.complex128 and dtype is np.complex64 dtype = mst elif fill_value is None: if is_float_dtype(dtype) or is_complex_dtype(dtype): fill_value = np.nan elif is_integer_dtype(dtype): dtype = np.float64 fill_value = np.nan elif is_datetime_or_timedelta_dtype(dtype): fill_value = dtype.type("NaT", "ns") else: dtype = np.dtype(np.object_) fill_value = np.nan else: dtype = np.dtype(np.object_) # in case we have a string that looked like a number if is_extension_array_dtype(dtype): pass elif issubclass(np.dtype(dtype).type, (bytes, str)): dtype = np.dtype(np.object_) fill_value = _ensure_dtype_type(fill_value, dtype) return dtype, fill_value
def _bins_to_cuts( x, bins, right: bool = True, labels=None, precision: int = 3, include_lowest: bool = False, dtype=None, duplicates: str = "raise", ordered: bool = True, ): if not ordered and labels is None: raise ValueError("'labels' must be provided if 'ordered = False'") if duplicates not in ["raise", "drop"]: raise ValueError( "invalid value for 'duplicates' parameter, valid options are: raise, drop" ) if isinstance(bins, IntervalIndex): # we have a fast-path here ids = bins.get_indexer(x) result = Categorical.from_codes(ids, categories=bins, ordered=True) return result, bins unique_bins = algos.unique(bins) if len(unique_bins) < len(bins) and len(bins) != 2: if duplicates == "raise": raise ValueError( f"Bin edges must be unique: {repr(bins)}.\n" f"You can drop duplicate edges by setting the 'duplicates' kwarg" ) else: bins = unique_bins side = "left" if right else "right" ids = ensure_int64(bins.searchsorted(x, side=side)) if include_lowest: ids[x == bins[0]] = 1 na_mask = isna(x) | (ids == len(bins)) | (ids == 0) has_nas = na_mask.any() if labels is not False: if not (labels is None or is_list_like(labels)): raise ValueError( "Bin labels must either be False, None or passed in as a " "list-like argument") elif labels is None: labels = _format_labels(bins, precision, right=right, include_lowest=include_lowest, dtype=dtype) elif ordered and len(set(labels)) != len(labels): raise ValueError( "labels must be unique if ordered=True; pass ordered=False for duplicate labels" # noqa ) else: if len(labels) != len(bins) - 1: raise ValueError( "Bin labels must be one fewer than the number of bin edges" ) if not is_categorical_dtype(labels): labels = Categorical( labels, categories=labels if len(set(labels)) == len(labels) else None, ordered=ordered, ) # TODO: handle mismatch between categorical label order and pandas.cut order. np.putmask(ids, na_mask, 0) result = algos.take_nd(labels, ids - 1) else: result = ids - 1 if has_nas: result = result.astype(np.float64) np.putmask(result, na_mask, np.nan) return result, bins
def map_f(values, f): return lib.map_infer_mask(values, f, isna(values).view(np.uint8))
def coerce_to_array(values, dtype, mask=None, copy=False): """ Coerce the input values array to numpy arrays with a mask Parameters ---------- values : 1D list-like dtype : integer dtype mask : boolean 1D array, optional copy : boolean, default False if True, copy the input Returns ------- tuple of (values, mask) """ # if values is integer numpy array, preserve it's dtype if dtype is None and hasattr(values, 'dtype'): if is_integer_dtype(values.dtype): dtype = values.dtype if dtype is not None: if (isinstance(dtype, str) and (dtype.startswith("Int") or dtype.startswith("UInt"))): # Avoid DeprecationWarning from NumPy about np.dtype("Int64") # https://github.com/numpy/numpy/pull/7476 dtype = dtype.lower() if not issubclass(type(dtype), _IntegerDtype): try: dtype = _dtypes[str(np.dtype(dtype))] except KeyError: raise ValueError("invalid dtype specified {}".format(dtype)) if isinstance(values, IntegerArray): values, mask = values._data, values._mask if dtype is not None: values = values.astype(dtype.numpy_dtype, copy=False) if copy: values = values.copy() mask = mask.copy() return values, mask values = np.array(values, copy=copy) if is_object_dtype(values): inferred_type = lib.infer_dtype(values, skipna=True) if inferred_type == 'empty': values = np.empty(len(values)) values.fill(np.nan) elif inferred_type not in ['floating', 'integer', 'mixed-integer', 'mixed-integer-float']: raise TypeError("{} cannot be converted to an IntegerDtype".format( values.dtype)) elif is_bool_dtype(values) and is_integer_dtype(dtype): values = np.array(values, dtype=int, copy=copy) elif not (is_integer_dtype(values) or is_float_dtype(values)): raise TypeError("{} cannot be converted to an IntegerDtype".format( values.dtype)) if mask is None: mask = isna(values) else: assert len(mask) == len(values) if not values.ndim == 1: raise TypeError("values must be a 1D list-like") if not mask.ndim == 1: raise TypeError("mask must be a 1D list-like") # infer dtype if needed if dtype is None: dtype = np.dtype('int64') else: dtype = dtype.type # if we are float, let's make sure that we can # safely cast # we copy as need to coerce here if mask.any(): values = values.copy() values[mask] = 1 values = safe_cast(values, dtype, copy=False) else: values = safe_cast(values, dtype, copy=False) return values, mask
def insert(self, loc, item): # treat NA values as nans: if is_scalar(item) and isna(item): item = self._na_value return super().insert(loc, item)
def hasnans(self): """ Return if I have any nans; enables various perf speedups. """ return bool(isna(self).any())
def __contains__(self, key) -> bool: # if key is a NaN, check if any NaN is in self. if is_scalar(key) and isna(key): return self.hasnans return contains(self, key, container=self._engine)
def compare_or_regex_search(a: ArrayLike, b: Scalar | Pattern, regex: bool, mask: np.ndarray) -> ArrayLike | bool: """ Compare two array_like inputs of the same shape or two scalar values Calls operator.eq or re.search, depending on regex argument. If regex is True, perform an element-wise regex matching. Parameters ---------- a : array_like b : scalar or regex pattern regex : bool mask : np.ndarray[bool] Returns ------- mask : array_like of bool """ if isna(b): return ~mask def _check_comparison_types(result: ArrayLike | bool, a: ArrayLike, b: Scalar | Pattern): """ Raises an error if the two arrays (a,b) cannot be compared. Otherwise, returns the comparison result as expected. """ if is_scalar(result) and isinstance(a, np.ndarray): type_names = [type(a).__name__, type(b).__name__] type_names[0] = f"ndarray(dtype={a.dtype})" raise TypeError( f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}" ) if not regex: op = lambda x: operator.eq(x, b) else: op = np.vectorize(lambda x: bool(re.search(b, x)) if isinstance( x, str) and isinstance(b, (str, Pattern)) else False) # GH#32621 use mask to avoid comparing to NAs if isinstance(a, np.ndarray): a = a[mask] if is_numeric_v_string_like(a, b): # GH#29553 avoid deprecation warnings from numpy return np.zeros(a.shape, dtype=bool) elif is_datetimelike_v_numeric(a, b): # GH#29553 avoid deprecation warnings from numpy _check_comparison_types(False, a, b) return False result = op(a) if isinstance(result, np.ndarray) and mask is not None: # The shape of the mask can differ to that of the result # since we may compare only a subset of a's or b's elements tmp = np.zeros(mask.shape, dtype=np.bool_) tmp[mask] = result result = tmp _check_comparison_types(result, a, b) return result