def get_value(self, series, key): """ we always want to get an index value, never a value """ if not is_scalar(key): raise InvalidIndexError k = com.values_from_object(key) loc = self.get_loc(k) new_values = com.values_from_object(series)[loc] return new_values
def wrapper(self, other): if _is_convertible_to_td(other) or other is NaT: try: other = _to_m8(other) except ValueError: # failed to parse as timedelta return ops.invalid_comparison(self, other, op) result = meth(self, other) if isna(other): result.fill(nat_result) elif not is_list_like(other): return ops.invalid_comparison(self, other, op) elif len(other) != len(self): raise ValueError("Lengths must match") else: try: other = type(self)._from_sequence(other)._data except (ValueError, TypeError): return ops.invalid_comparison(self, other, op) result = meth(self, other) result = com.values_from_object(result) o_mask = np.array(isna(other)) if o_mask.any(): result[o_mask] = nat_result if self._hasnans: result[self._isnan] = nat_result return result
def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): if columns is None: columns = _get_objs_combined_axis(data, sort=False) indexer_cache = {} aligned_values = [] for s in data: index = getattr(s, 'index', None) if index is None: index = ibase.default_index(len(s)) if id(index) in indexer_cache: indexer = indexer_cache[id(index)] else: indexer = indexer_cache[id(index)] = index.get_indexer(columns) values = com.values_from_object(s) aligned_values.append(algorithms.take_1d(values, indexer)) values = np.vstack(aligned_values) if values.dtype == np.object_: content = list(values.T) return _convert_object_array(content, columns, dtype=dtype, coerce_float=coerce_float) else: return values.T, columns
def get_value(self, series: AnyArrayLike, key: Any): """ Fast lookup of value from 1-dimensional ndarray. Only use this if you know what you're doing Parameters ---------- series : Series, ExtensionArray, Index, or ndarray 1-dimensional array to take values from key: : scalar The value of this index at the position of the desired value, otherwise the positional index of the desired value Returns ------- Any The element of the series at the position indicated by the key """ try: k = com.values_from_object(key) k = self._convert_scalar_indexer(k, kind='getitem') indexer = self.get_loc(k) return series.take([indexer])[0] except (KeyError, TypeError): pass # we might be a positional inexer return super().get_value(series, key)
def wrapper(self, other): msg = "cannot compare a {cls} with type {typ}" meth = getattr(dtl.DatetimeLikeArrayMixin, opname) if _is_convertible_to_td(other) or other is NaT: try: other = _to_m8(other) except ValueError: # failed to parse as timedelta raise TypeError(msg.format(cls=type(self).__name__, typ=type(other).__name__)) result = meth(self, other) if isna(other): result.fill(nat_result) elif not is_list_like(other): raise TypeError(msg.format(cls=type(self).__name__, typ=type(other).__name__)) else: other = type(self)(other)._data result = meth(self, other) result = com.values_from_object(result) o_mask = np.array(isna(other)) if o_mask.any(): result[o_mask] = nat_result if self.hasnans: result[self._isnan] = nat_result return result
def _get_values(values, skipna, fill_value=None, fill_value_typ=None, isfinite=False, copy=True, mask=None, compute_mask=True): """ utility to get the values view, mask, dtype if necessary copy and mask using the specified fill_value copy = True will force the copy """ if skipna: compute_mask = True if is_datetime64tz_dtype(values): # com.values_from_object returns M8[ns] dtype instead of tz-aware, # so this case must be handled separately from the rest dtype = values.dtype values = getattr(values, "_values", values) else: values = com.values_from_object(values) dtype = values.dtype if mask is None and compute_mask: if isfinite: mask = _isfinite(values) else: mask = isna(values) if is_datetime_or_timedelta_dtype(values) or is_datetime64tz_dtype(values): # changing timedelta64/datetime64 to int64 needs to happen after # finding `mask` above values = getattr(values, "asi8", values) values = values.view(np.int64) dtype_ok = _na_ok_dtype(dtype) # get our fill value (in case we need to provide an alternative # dtype for it) fill_value = _get_fill_value(dtype, fill_value=fill_value, fill_value_typ=fill_value_typ) if skipna: if copy: values = values.copy() if dtype_ok: np.putmask(values, mask, fill_value) # promote if needed else: values, changed = maybe_upcast_putmask(values, mask, fill_value) elif copy: values = values.copy() # return a platform independent precision dtype dtype_max = dtype if is_integer_dtype(dtype) or is_bool_dtype(dtype): dtype_max = np.int64 elif is_float_dtype(dtype): dtype_max = np.float64 return values, mask, dtype, dtype_max, fill_value
def indices(self): """ dict {group name -> group indices} """ if len(self.groupings) == 1: return self.groupings[0].indices else: label_list = [ping.labels for ping in self.groupings] keys = [com.values_from_object(ping.group_index) for ping in self.groupings] return get_indexer_dict(label_list, keys)
def convert_value(self, v): """ convert the expression that is in the term to something that is accepted by pytables """ def stringify(value): if self.encoding is not None: encoder = partial(pprint_thing_encoded, encoding=self.encoding) else: encoder = pprint_thing return encoder(value) kind = _ensure_decoded(self.kind) meta = _ensure_decoded(self.meta) if kind == 'datetime64' or kind == 'datetime': if isinstance(v, (int, float)): v = stringify(v) v = _ensure_decoded(v) v = Timestamp(v) if v.tz is not None: v = v.tz_convert('UTC') return TermValue(v, v.value, kind) elif kind == 'timedelta64' or kind == 'timedelta': v = Timedelta(v, unit='s').value return TermValue(int(v), v, kind) elif meta == 'category': metadata = com.values_from_object(self.metadata) result = metadata.searchsorted(v, side='left') # result returns 0 if v is first element or if v is not in metadata # check that metadata contains v if not result and v not in metadata: result = -1 return TermValue(result, result, 'integer') elif kind == 'integer': v = int(float(v)) return TermValue(v, v, kind) elif kind == 'float': v = float(v) return TermValue(v, v, kind) elif kind == 'bool': if isinstance(v, str): v = not v.strip().lower() in ['false', 'f', 'no', 'n', 'none', '0', '[]', '{}', ''] else: v = bool(v) return TermValue(v, v, kind) elif isinstance(v, str): # string quoting return TermValue(v, stringify(v), 'string') else: raise TypeError("Cannot compare {v} of type {typ} to {kind} column" .format(v=v, typ=type(v), kind=kind))
def convert_value(self, v): """ convert the expression that is in the term to something that is accepted by pytables """ def stringify(value): if self.encoding is not None: encoder = partial(pprint_thing_encoded, encoding=self.encoding) else: encoder = pprint_thing return encoder(value) kind = _ensure_decoded(self.kind) meta = _ensure_decoded(self.meta) if kind == u('datetime64') or kind == u('datetime'): if isinstance(v, (int, float)): v = stringify(v) v = _ensure_decoded(v) v = Timestamp(v) if v.tz is not None: v = v.tz_convert('UTC') return TermValue(v, v.value, kind) elif kind == u('timedelta64') or kind == u('timedelta'): v = Timedelta(v, unit='s').value return TermValue(int(v), v, kind) elif meta == u('category'): metadata = com.values_from_object(self.metadata) result = metadata.searchsorted(v, side='left') # result returns 0 if v is first element or if v is not in metadata # check that metadata contains v if not result and v not in metadata: result = -1 return TermValue(result, result, u('integer')) elif kind == u('integer'): v = int(float(v)) return TermValue(v, v, kind) elif kind == u('float'): v = float(v) return TermValue(v, v, kind) elif kind == u('bool'): if isinstance(v, string_types): v = not v.strip().lower() in [u('false'), u('f'), u('no'), u('n'), u('none'), u('0'), u('[]'), u('{}'), u('')] else: v = bool(v) return TermValue(v, v, kind) elif isinstance(v, string_types): # string quoting return TermValue(v, stringify(v), u('string')) else: raise TypeError("Cannot compare {v} of type {typ} to {kind} column" .format(v=v, typ=type(v), kind=kind))
def indices(self): """ dict {group name -> group indices} """ if len(self.groupings) == 1: return self.groupings[0].indices else: label_list = [ping.labels for ping in self.groupings] keys = [ com.values_from_object(ping.group_index) for ping in self.groupings ] return get_indexer_dict(label_list, keys)
def make_sparse(arr, kind="block", fill_value=None, dtype=None, copy=False): """ Convert ndarray to sparse format Parameters ---------- arr : ndarray kind : {'block', 'integer'} fill_value : NaN or another value dtype : np.dtype, optional copy : bool, default False Returns ------- (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar) """ arr = com.values_from_object(arr) if arr.ndim > 1: raise TypeError("expected dimension <= 1 data") if fill_value is None: fill_value = na_value_for_dtype(arr.dtype) if isna(fill_value): mask = notna(arr) else: # cast to object comparison to be safe if is_string_dtype(arr): arr = arr.astype(object) if is_object_dtype(arr.dtype): # element-wise equality check method in numpy doesn't treat # each element type, eg. 0, 0.0, and False are treated as # same. So we have to check the both of its type and value. mask = splib.make_mask_object_ndarray(arr, fill_value) else: mask = arr != fill_value length = len(arr) if length != len(mask): # the arr is a SparseArray indices = mask.sp_index.indices else: indices = mask.nonzero()[0].astype(np.int32) index = _make_index(length, indices, kind) sparsified_values = arr[mask] if dtype is not None: sparsified_values = astype_nansafe(sparsified_values, dtype=dtype) # TODO: copy return sparsified_values, index, fill_value
def get_value_maybe_box(self, series, key): # needed to localize naive datetimes if self.tz is not None: key = Timestamp(key) if key.tzinfo is not None: key = key.tz_convert(self.tz) else: key = key.tz_localize(self.tz) elif not isinstance(key, Timestamp): key = Timestamp(key) values = self._engine.get_value(com.values_from_object(series), key, tz=self.tz) return com.maybe_box(self, values, series, key)
def cartesian_product(X): """ Numpy version of itertools.product. Sometimes faster (for large inputs)... Parameters ---------- X : list-like of list-likes Returns ------- product : list of ndarrays Examples -------- >>> cartesian_product([list('ABC'), [1, 2]]) [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='|S1'), array([1, 2, 1, 2, 1, 2])] See Also -------- itertools.product : Cartesian product of input iterables. Equivalent to nested for-loops. """ msg = "Input must be a list-like of list-likes" if not is_list_like(X): raise TypeError(msg) for x in X: if not is_list_like(x): raise TypeError(msg) if len(X) == 0: return [] lenX = np.fromiter((len(x) for x in X), dtype=np.intp) cumprodX = np.cumproduct(lenX) a = np.roll(cumprodX, 1) a[0] = 1 if cumprodX[-1] != 0: b = cumprodX[-1] / cumprodX else: # if any factor is empty, the cartesian product is empty b = np.zeros_like(cumprodX) return [ np.tile( np.repeat(np.asarray(com.values_from_object(x)), b[i]), np.product(a[i]) ) for i, x in enumerate(X) ]
def _get_values(values, skipna, fill_value=None, fill_value_typ=None, isfinite=False, copy=True, mask=None): """ utility to get the values view, mask, dtype if necessary copy and mask using the specified fill_value copy = True will force the copy """ values = com.values_from_object(values) if mask is None: if isfinite: mask = _isfinite(values) else: mask = isna(values) dtype = values.dtype dtype_ok = _na_ok_dtype(dtype) # get our fill value (in case we need to provide an alternative # dtype for it) fill_value = _get_fill_value(dtype, fill_value=fill_value, fill_value_typ=fill_value_typ) if skipna: if copy: values = values.copy() if dtype_ok: np.putmask(values, mask, fill_value) # promote if needed else: values, changed = maybe_upcast_putmask(values, mask, fill_value) elif copy: values = values.copy() values = _view_if_needed(values) # return a platform independent precision dtype dtype_max = dtype if is_integer_dtype(dtype) or is_bool_dtype(dtype): dtype_max = np.int64 elif is_float_dtype(dtype): dtype_max = np.float64 return values, mask, dtype, dtype_max, fill_value
def cartesian_product(X): """ Numpy version of itertools.product or pandas.compat.product. Sometimes faster (for large inputs)... Parameters ---------- X : list-like of list-likes Returns ------- product : list of ndarrays Examples -------- >>> cartesian_product([list('ABC'), [1, 2]]) [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='|S1'), array([1, 2, 1, 2, 1, 2])] See also -------- itertools.product : Cartesian product of input iterables. Equivalent to nested for-loops. pandas.compat.product : An alias for itertools.product. """ msg = "Input must be a list-like of list-likes" if not is_list_like(X): raise TypeError(msg) for x in X: if not is_list_like(x): raise TypeError(msg) if len(X) == 0: return [] lenX = np.fromiter((len(x) for x in X), dtype=np.intp) cumprodX = np.cumproduct(lenX) a = np.roll(cumprodX, 1) a[0] = 1 if cumprodX[-1] != 0: b = cumprodX[-1] / cumprodX else: # if any factor is empty, the cartesian product is empty b = np.zeros_like(cumprodX) return [np.tile(np.repeat(np.asarray(com.values_from_object(x)), b[i]), np.product(a[i])) for i, x in enumerate(X)]
def wrapper(self, other): meth = getattr(dtl.DatetimeLikeArrayMixin, opname) if isinstance(other, (datetime, np.datetime64, compat.string_types)): if isinstance(other, (datetime, np.datetime64)): # GH#18435 strings get a pass from tzawareness compat self._assert_tzawareness_compat(other) try: other = _to_m8(other, tz=self.tz) except ValueError: # string that cannot be parsed to Timestamp return ops.invalid_comparison(self, other, op) result = meth(self, other) if isna(other): result.fill(nat_result) elif lib.is_scalar(other): return ops.invalid_comparison(self, other, op) else: if isinstance(other, list): # FIXME: This can break for object-dtype with mixed types other = type(self)(other) elif not isinstance(other, (np.ndarray, ABCIndexClass, ABCSeries)): # Following Timestamp convention, __eq__ is all-False # and __ne__ is all True, others raise TypeError. return ops.invalid_comparison(self, other, op) if is_object_dtype(other): result = op(self.astype('O'), np.array(other)) elif not (is_datetime64_dtype(other) or is_datetime64tz_dtype(other)): # e.g. is_timedelta64_dtype(other) return ops.invalid_comparison(self, other, op) else: self._assert_tzawareness_compat(other) result = meth(self, np.asarray(other)) result = com.values_from_object(result) # Make sure to pass an array to result[...]; indexing with # Series breaks with older version of numpy o_mask = np.array(isna(other)) if o_mask.any(): result[o_mask] = nat_result if self.hasnans: result[self._isnan] = nat_result return result
def get_value(self, series, key): """ Fast lookup of value from 1-dimensional ndarray. Only use this if you know what you're doing """ try: k = com.values_from_object(key) k = self._convert_scalar_indexer(k, kind='getitem') indexer = self.get_loc(k) return series.iloc[indexer] except (KeyError, TypeError): pass # we might be a positional inexer return super(CategoricalIndex, self).get_value(series, key)
def _get_values(values, skipna, fill_value=None, fill_value_typ=None, isfinite=False, copy=True, mask=None): """ utility to get the values view, mask, dtype if necessary copy and mask using the specified fill_value copy = True will force the copy """ values = com.values_from_object(values) if mask is None: if isfinite: mask = _isfinite(values) else: mask = isna(values) dtype = values.dtype dtype_ok = _na_ok_dtype(dtype) # get our fill value (in case we need to provide an alternative # dtype for it) fill_value = _get_fill_value(dtype, fill_value=fill_value, fill_value_typ=fill_value_typ) if skipna: if copy: values = values.copy() if dtype_ok: np.putmask(values, mask, fill_value) # promote if needed else: values, changed = maybe_upcast_putmask(values, mask, fill_value) elif copy: values = values.copy() values = _view_if_needed(values) # return a platform independent precision dtype dtype_max = dtype if is_integer_dtype(dtype) or is_bool_dtype(dtype): dtype_max = np.int64 elif is_float_dtype(dtype): dtype_max = np.float64 return values, mask, dtype, dtype_max
def get_value(self, series, key): """ Fast lookup of value from 1-dimensional ndarray. Only use this if you know what you're doing """ s = com.values_from_object(series) try: return com.maybe_box(self, super(PeriodIndex, self).get_value(s, key), series, key) except (KeyError, IndexError): try: asdt, parsed, reso = parse_time_string(key, self.freq) grp = resolution.Resolution.get_freq_group(reso) freqn = resolution.get_freq_group(self.freq) vals = self._ndarray_values # if our data is higher resolution than requested key, slice if grp < freqn: iv = Period(asdt, freq=(grp, 1)) ord1 = iv.asfreq(self.freq, how='S').ordinal ord2 = iv.asfreq(self.freq, how='E').ordinal if ord2 < vals[0] or ord1 > vals[-1]: raise KeyError(key) pos = np.searchsorted(self._ndarray_values, [ord1, ord2]) key = slice(pos[0], pos[1] + 1) return series[key] elif grp == freqn: key = Period(asdt, freq=self.freq).ordinal return com.maybe_box(self, self._engine.get_value(s, key), series, key) else: raise KeyError(key) except TypeError: pass period = Period(key, self.freq) key = period.value if isna(period) else period.ordinal return com.maybe_box(self, self._engine.get_value(s, key), series, key)
def wrapper(self, other): meth = getattr(dtl.DatetimeLikeArrayMixin, opname) if isinstance(other, (datetime, np.datetime64, compat.string_types)): if isinstance(other, datetime): # GH#18435 strings get a pass from tzawareness compat self._assert_tzawareness_compat(other) other = _to_m8(other, tz=self.tz) result = meth(self, other) if isna(other): result.fill(nat_result) else: if isinstance(other, list): other = type(self)(other) elif not isinstance(other, (np.ndarray, ABCIndexClass, ABCSeries)): # Following Timestamp convention, __eq__ is all-False # and __ne__ is all True, others raise TypeError. if opname == '__eq__': return np.zeros(shape=self.shape, dtype=bool) elif opname == '__ne__': return np.ones(shape=self.shape, dtype=bool) raise TypeError('%s type object %s' % (type(other), str(other))) if is_datetimelike(other): self._assert_tzawareness_compat(other) result = meth(self, np.asarray(other)) result = com.values_from_object(result) # Make sure to pass an array to result[...]; indexing with # Series breaks with older version of numpy o_mask = np.array(isna(other)) if o_mask.any(): result[o_mask] = nat_result if self.hasnans: result[self._isnan] = nat_result return result
def wrapper(self, other): if isinstance(other, str): try: other = self._scalar_from_string(other) except ValueError: # failed to parse as timedelta return invalid_comparison(self, other, op) if _is_convertible_to_td(other) or other is NaT: other = Timedelta(other) result = op(self.view("i8"), other.value) if isna(other): result.fill(nat_result) elif not is_list_like(other): return invalid_comparison(self, other, op) elif len(other) != len(self): raise ValueError("Lengths must match") else: try: other = type(self)._from_sequence(other)._data except (ValueError, TypeError): return invalid_comparison(self, other, op) result = op(self.view("i8"), other.view("i8")) result = com.values_from_object(result) o_mask = np.array(isna(other)) if o_mask.any(): result[o_mask] = nat_result if self._hasnans: result[self._isnan] = nat_result return result
def wrapper(self, other): other = lib.item_from_zerodim(other) if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): return NotImplemented if _is_convertible_to_td(other) or other is NaT: try: other = Timedelta(other) except ValueError: # failed to parse as timedelta return invalid_comparison(self, other, op) result = op(self.view("i8"), other.value) if isna(other): result.fill(nat_result) elif not is_list_like(other): return invalid_comparison(self, other, op) elif len(other) != len(self): raise ValueError("Lengths must match") else: try: other = type(self)._from_sequence(other)._data except (ValueError, TypeError): return invalid_comparison(self, other, op) result = op(self.view("i8"), other.view("i8")) result = com.values_from_object(result) o_mask = np.array(isna(other)) if o_mask.any(): result[o_mask] = nat_result if self._hasnans: result[self._isnan] = nat_result return result
def wrapper(self, other): other = lib.item_from_zerodim(other) if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): return NotImplemented if _is_convertible_to_td(other) or other is NaT: try: other = Timedelta(other) except ValueError: # failed to parse as timedelta return ops.invalid_comparison(self, other, op) result = op(self.view('i8'), other.value) if isna(other): result.fill(nat_result) elif not is_list_like(other): return ops.invalid_comparison(self, other, op) elif len(other) != len(self): raise ValueError("Lengths must match") else: try: other = type(self)._from_sequence(other)._data except (ValueError, TypeError): return ops.invalid_comparison(self, other, op) result = op(self.view('i8'), other.view('i8')) result = com.values_from_object(result) o_mask = np.array(isna(other)) if o_mask.any(): result[o_mask] = nat_result if self._hasnans: result[self._isnan] = nat_result return result
def __getitem__(self, key): try: return self.index.get_value(self, key) except InvalidIndexError: pass except KeyError: if isinstance(key, (int, np.integer)): return self._get_val_at(key) elif key is Ellipsis: return self raise Exception('Requested index not in this series!') except TypeError: # Could not hash item, must be array-like? pass key = com.values_from_object(key) if self.index.nlevels > 1 and isinstance(key, tuple): # to handle MultiIndex labels key = self.index.get_loc(key) return self._constructor(self.values[key], index=self.index[key]).__finalize__(self)
def nanvar(values, axis=None, skipna=True, ddof=1): values = com.values_from_object(values) dtype = values.dtype mask = isna(values) if is_any_int_dtype(values): values = values.astype('f8') values[mask] = np.nan if is_float_dtype(values): count, d = _get_counts_nanvar(mask, axis, ddof, values.dtype) else: count, d = _get_counts_nanvar(mask, axis, ddof) if skipna: values = values.copy() np.putmask(values, mask, 0) # xref GH10242 # Compute variance via two-pass algorithm, which is stable against # cancellation errors and relatively accurate for small numbers of # observations. # # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count if axis is not None: avg = np.expand_dims(avg, axis) sqr = _ensure_numeric((avg - values)**2) np.putmask(sqr, mask, 0) result = sqr.sum(axis=axis, dtype=np.float64) / d # Return variance as np.float64 (the datatype used in the accumulator), # unless we were dealing with a float array, in which case use the same # precision as the original values array. if is_float_dtype(dtype): result = result.astype(dtype) return _wrap_results(result, values.dtype)
def convert_value(self, v) -> "TermValue": """ convert the expression that is in the term to something that is accepted by pytables """ def stringify(value): if self.encoding is not None: encoder = partial(pprint_thing_encoded, encoding=self.encoding) else: encoder = pprint_thing return encoder(value) kind = _ensure_decoded(self.kind) meta = _ensure_decoded(self.meta) if kind == "datetime64" or kind == "datetime": if isinstance(v, (int, float)): v = stringify(v) v = _ensure_decoded(v) v = Timestamp(v) if v.tz is not None: v = v.tz_convert("UTC") return TermValue(v, v.value, kind) elif kind == "timedelta64" or kind == "timedelta": v = Timedelta(v, unit="s").value return TermValue(int(v), v, kind) elif meta == "category": metadata = com.values_from_object(self.metadata) result = metadata.searchsorted(v, side="left") # result returns 0 if v is first element or if v is not in metadata # check that metadata contains v if not result and v not in metadata: result = -1 return TermValue(result, result, "integer") elif kind == "integer": v = int(float(v)) return TermValue(v, v, kind) elif kind == "float": v = float(v) return TermValue(v, v, kind) elif kind == "bool": if isinstance(v, str): v = not v.strip().lower() in [ "false", "f", "no", "n", "none", "0", "[]", "{}", "", ] else: v = bool(v) return TermValue(v, v, kind) elif isinstance(v, str): # string quoting return TermValue(v, stringify(v), "string") else: raise TypeError( "Cannot compare {v} of type {typ} to {kind} column".format( v=v, typ=type(v), kind=kind ) )
def nanvar(values, axis=None, skipna=True, ddof=1, mask=None): """ Compute the variance along given axis while ignoring NaNs Parameters ---------- values : ndarray axis: int, optional skipna : bool, default True ddof : int, default 1 Delta Degrees of Freedom. The divisor used in calculations is N - ddof, where N represents the number of elements. mask : ndarray[bool], optional nan-mask if known Returns ------- result : float Unless input is a float array, in which case use the same precision as the input array. Examples -------- >>> import pandas.core.nanops as nanops >>> s = pd.Series([1, np.nan, 2, 3]) >>> nanops.nanvar(s) 1.0 """ values = com.values_from_object(values) dtype = values.dtype if mask is None: mask = isna(values) if is_any_int_dtype(values): values = values.astype('f8') values[mask] = np.nan if is_float_dtype(values): count, d = _get_counts_nanvar(mask, axis, ddof, values.dtype) else: count, d = _get_counts_nanvar(mask, axis, ddof) if skipna: values = values.copy() np.putmask(values, mask, 0) # xref GH10242 # Compute variance via two-pass algorithm, which is stable against # cancellation errors and relatively accurate for small numbers of # observations. # # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count if axis is not None: avg = np.expand_dims(avg, axis) sqr = _ensure_numeric((avg - values) ** 2) np.putmask(sqr, mask, 0) result = sqr.sum(axis=axis, dtype=np.float64) / d # Return variance as np.float64 (the datatype used in the accumulator), # unless we were dealing with a float array, in which case use the same # precision as the original values array. if is_float_dtype(dtype): result = result.astype(dtype) return _wrap_results(result, values.dtype)
def get_value_maybe_box(self, series, key: Timedelta): values = self._engine.get_value(com.values_from_object(series), key) return com.maybe_box(self, values, series, key)
def nanskew(values, axis=None, skipna=True, mask=None): """ Compute the sample skewness. The statistic computed here is the adjusted Fisher-Pearson standardized moment coefficient G1. The algorithm computes this coefficient directly from the second and third central moment. Parameters ---------- values : ndarray axis: int, optional skipna : bool, default True mask : ndarray[bool], optional nan-mask if known Returns ------- result : float64 Unless input is a float array, in which case use the same precision as the input array. Examples -------- >>> import pandas.core.nanops as nanops >>> s = pd.Series([1,np.nan, 1, 2]) >>> nanops.nanskew(s) 1.7320508075688787 """ values = com.values_from_object(values) if mask is None: mask = isna(values) if not is_float_dtype(values.dtype): values = values.astype('f8') count = _get_counts(mask, axis) else: count = _get_counts(mask, axis, dtype=values.dtype) if skipna: values = values.copy() np.putmask(values, mask, 0) mean = values.sum(axis, dtype=np.float64) / count if axis is not None: mean = np.expand_dims(mean, axis) adjusted = values - mean if skipna: np.putmask(adjusted, mask, 0) adjusted2 = adjusted**2 adjusted3 = adjusted2 * adjusted m2 = adjusted2.sum(axis, dtype=np.float64) m3 = adjusted3.sum(axis, dtype=np.float64) # floating point error # # #18044 in _libs/windows.pyx calc_skew follow this behavior # to fix the fperr to treat m2 <1e-14 as zero m2 = _zero_out_fperr(m2) m3 = _zero_out_fperr(m3) with np.errstate(invalid='ignore', divide='ignore'): result = (count * (count - 1)**0.5 / (count - 2)) * (m3 / m2**1.5) dtype = values.dtype if is_float_dtype(dtype): result = result.astype(dtype) if isinstance(result, np.ndarray): result = np.where(m2 == 0, 0, result) result[count < 3] = np.nan return result else: result = 0 if m2 == 0 else result if count < 3: return np.nan return result
def get_value_maybe_box(self, series, key): if not isinstance(key, Timedelta): key = Timedelta(key) values = self._engine.get_value(com.values_from_object(series), key) return com.maybe_box(self, values, series, key)
def _get_values(values, skipna, fill_value=None, fill_value_typ=None, isfinite=False, copy=True, mask=None): """ utility to get the values view, mask, dtype if necessary copy and mask using the specified fill_value copy = True will force the copy """ if is_datetime64tz_dtype(values): # com.values_from_object returns M8[ns] dtype instead of tz-aware, # so this case must be handled separately from the rest dtype = values.dtype values = getattr(values, "_values", values) else: values = com.values_from_object(values) dtype = values.dtype if mask is None: if isfinite: mask = _isfinite(values) else: mask = isna(values) if is_datetime_or_timedelta_dtype(values) or is_datetime64tz_dtype(values): # changing timedelta64/datetime64 to int64 needs to happen after # finding `mask` above values = getattr(values, "asi8", values) values = values.view(np.int64) dtype_ok = _na_ok_dtype(dtype) # get our fill value (in case we need to provide an alternative # dtype for it) fill_value = _get_fill_value(dtype, fill_value=fill_value, fill_value_typ=fill_value_typ) if skipna: if copy: values = values.copy() if dtype_ok: np.putmask(values, mask, fill_value) # promote if needed else: values, changed = maybe_upcast_putmask(values, mask, fill_value) elif copy: values = values.copy() # return a platform independent precision dtype dtype_max = dtype if is_integer_dtype(dtype) or is_bool_dtype(dtype): dtype_max = np.int64 elif is_float_dtype(dtype): dtype_max = np.float64 return values, mask, dtype, dtype_max, fill_value
def nanvar(values, axis=None, skipna=True, ddof=1, mask=None): """ Compute the variance along given axis while ignoring NaNs Parameters ---------- values : ndarray axis: int, optional skipna : bool, default True ddof : int, default 1 Delta Degrees of Freedom. The divisor used in calculations is N - ddof, where N represents the number of elements. mask : ndarray[bool], optional nan-mask if known Returns ------- result : float Unless input is a float array, in which case use the same precision as the input array. Examples -------- >>> import pandas.core.nanops as nanops >>> s = pd.Series([1, np.nan, 2, 3]) >>> nanops.nanvar(s) 1.0 """ values = com.values_from_object(values) dtype = values.dtype if mask is None: mask = isna(values) if is_any_int_dtype(values): values = values.astype('f8') values[mask] = np.nan if is_float_dtype(values): count, d = _get_counts_nanvar(mask, axis, ddof, values.dtype) else: count, d = _get_counts_nanvar(mask, axis, ddof) if skipna: values = values.copy() np.putmask(values, mask, 0) # xref GH10242 # Compute variance via two-pass algorithm, which is stable against # cancellation errors and relatively accurate for small numbers of # observations. # # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count if axis is not None: avg = np.expand_dims(avg, axis) sqr = _ensure_numeric((avg - values)**2) np.putmask(sqr, mask, 0) result = sqr.sum(axis=axis, dtype=np.float64) / d # Return variance as np.float64 (the datatype used in the accumulator), # unless we were dealing with a float array, in which case use the same # precision as the original values array. if is_float_dtype(dtype): result = result.astype(dtype) return _wrap_results(result, values.dtype)
def nankurt(values, axis=None, skipna=True, mask=None): """ Compute the sample excess kurtosis The statistic computed here is the adjusted Fisher-Pearson standardized moment coefficient G2, computed directly from the second and fourth central moment. Parameters ---------- values : ndarray axis: int, optional skipna : bool, default True mask : ndarray[bool], optional nan-mask if known Returns ------- result : float64 Unless input is a float array, in which case use the same precision as the input array. Examples -------- >>> import pandas.core.nanops as nanops >>> s = pd.Series([1,np.nan, 1, 3, 2]) >>> nanops.nankurt(s) -1.2892561983471076 """ values = com.values_from_object(values) if mask is None: mask = isna(values) if not is_float_dtype(values.dtype): values = values.astype('f8') count = _get_counts(mask, axis) else: count = _get_counts(mask, axis, dtype=values.dtype) if skipna: values = values.copy() np.putmask(values, mask, 0) mean = values.sum(axis, dtype=np.float64) / count if axis is not None: mean = np.expand_dims(mean, axis) adjusted = values - mean if skipna: np.putmask(adjusted, mask, 0) adjusted2 = adjusted ** 2 adjusted4 = adjusted2 ** 2 m2 = adjusted2.sum(axis, dtype=np.float64) m4 = adjusted4.sum(axis, dtype=np.float64) with np.errstate(invalid='ignore', divide='ignore'): adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3)) numer = count * (count + 1) * (count - 1) * m4 denom = (count - 2) * (count - 3) * m2 ** 2 # floating point error # # #18044 in _libs/windows.pyx calc_kurt follow this behavior # to fix the fperr to treat denom <1e-14 as zero numer = _zero_out_fperr(numer) denom = _zero_out_fperr(denom) if not isinstance(denom, np.ndarray): # if ``denom`` is a scalar, check these corner cases first before # doing division if count < 4: return np.nan if denom == 0: return 0 with np.errstate(invalid='ignore', divide='ignore'): result = numer / denom - adj dtype = values.dtype if is_float_dtype(dtype): result = result.astype(dtype) if isinstance(result, np.ndarray): result = np.where(denom == 0, 0, result) result[count < 4] = np.nan return result
def _where_standard(cond, a, b): return np.where(com.values_from_object(cond), com.values_from_object(a), com.values_from_object(b))
def get_value_maybe_box(self, series, key): key = self._maybe_cast_for_get_loc(key) values = self._engine.get_value(com.values_from_object(series), key, tz=self.tz) return com.maybe_box(self, values, series, key)
def nankurt(values, axis=None, skipna=True, mask=None): """ Compute the sample excess kurtosis The statistic computed here is the adjusted Fisher-Pearson standardized moment coefficient G2, computed directly from the second and fourth central moment. Parameters ---------- values : ndarray axis: int, optional skipna : bool, default True mask : ndarray[bool], optional nan-mask if known Returns ------- result : float64 Unless input is a float array, in which case use the same precision as the input array. Examples -------- >>> import pandas.core.nanops as nanops >>> s = pd.Series([1,np.nan, 1, 3, 2]) >>> nanops.nankurt(s) -1.2892561983471076 """ values = com.values_from_object(values) if mask is None: mask = isna(values) if not is_float_dtype(values.dtype): values = values.astype('f8') count = _get_counts(mask, axis) else: count = _get_counts(mask, axis, dtype=values.dtype) if skipna: values = values.copy() np.putmask(values, mask, 0) mean = values.sum(axis, dtype=np.float64) / count if axis is not None: mean = np.expand_dims(mean, axis) adjusted = values - mean if skipna: np.putmask(adjusted, mask, 0) adjusted2 = adjusted**2 adjusted4 = adjusted2**2 m2 = adjusted2.sum(axis, dtype=np.float64) m4 = adjusted4.sum(axis, dtype=np.float64) with np.errstate(invalid='ignore', divide='ignore'): adj = 3 * (count - 1)**2 / ((count - 2) * (count - 3)) numer = count * (count + 1) * (count - 1) * m4 denom = (count - 2) * (count - 3) * m2**2 # floating point error # # #18044 in _libs/windows.pyx calc_kurt follow this behavior # to fix the fperr to treat denom <1e-14 as zero numer = _zero_out_fperr(numer) denom = _zero_out_fperr(denom) if not isinstance(denom, np.ndarray): # if ``denom`` is a scalar, check these corner cases first before # doing division if count < 4: return np.nan if denom == 0: return 0 with np.errstate(invalid='ignore', divide='ignore'): result = numer / denom - adj dtype = values.dtype if is_float_dtype(dtype): result = result.astype(dtype) if isinstance(result, np.ndarray): result = np.where(denom == 0, 0, result) result[count < 4] = np.nan return result
def nanskew(values, axis=None, skipna=True, mask=None): """ Compute the sample skewness. The statistic computed here is the adjusted Fisher-Pearson standardized moment coefficient G1. The algorithm computes this coefficient directly from the second and third central moment. Parameters ---------- values : ndarray axis: int, optional skipna : bool, default True mask : ndarray[bool], optional nan-mask if known Returns ------- result : float64 Unless input is a float array, in which case use the same precision as the input array. Examples -------- >>> import pandas.core.nanops as nanops >>> s = pd.Series([1,np.nan, 1, 2]) >>> nanops.nanskew(s) 1.7320508075688787 """ values = com.values_from_object(values) if mask is None: mask = isna(values) if not is_float_dtype(values.dtype): values = values.astype('f8') count = _get_counts(mask, axis) else: count = _get_counts(mask, axis, dtype=values.dtype) if skipna: values = values.copy() np.putmask(values, mask, 0) mean = values.sum(axis, dtype=np.float64) / count if axis is not None: mean = np.expand_dims(mean, axis) adjusted = values - mean if skipna: np.putmask(adjusted, mask, 0) adjusted2 = adjusted ** 2 adjusted3 = adjusted2 * adjusted m2 = adjusted2.sum(axis, dtype=np.float64) m3 = adjusted3.sum(axis, dtype=np.float64) # floating point error # # #18044 in _libs/windows.pyx calc_skew follow this behavior # to fix the fperr to treat m2 <1e-14 as zero m2 = _zero_out_fperr(m2) m3 = _zero_out_fperr(m3) with np.errstate(invalid='ignore', divide='ignore'): result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2 ** 1.5) dtype = values.dtype if is_float_dtype(dtype): result = result.astype(dtype) if isinstance(result, np.ndarray): result = np.where(m2 == 0, 0, result) result[count < 3] = np.nan return result else: result = 0 if m2 == 0 else result if count < 3: return np.nan return result
def nankurt(values, axis=None, skipna=True): """ Compute the sample excess kurtosis. The statistic computed here is the adjusted Fisher-Pearson standardized moment coefficient G2, computed directly from the second and fourth central moment. """ values = com.values_from_object(values) mask = isna(values) if not is_float_dtype(values.dtype): values = values.astype('f8') count = _get_counts(mask, axis) else: count = _get_counts(mask, axis, dtype=values.dtype) if skipna: values = values.copy() np.putmask(values, mask, 0) mean = values.sum(axis, dtype=np.float64) / count if axis is not None: mean = np.expand_dims(mean, axis) adjusted = values - mean if skipna: np.putmask(adjusted, mask, 0) adjusted2 = adjusted**2 adjusted4 = adjusted2**2 m2 = adjusted2.sum(axis, dtype=np.float64) m4 = adjusted4.sum(axis, dtype=np.float64) with np.errstate(invalid='ignore', divide='ignore'): adj = 3 * (count - 1)**2 / ((count - 2) * (count - 3)) numer = count * (count + 1) * (count - 1) * m4 denom = (count - 2) * (count - 3) * m2**2 # floating point error # # #18044 in _libs/windows.pyx calc_kurt follow this behavior # to fix the fperr to treat denom <1e-14 as zero numer = _zero_out_fperr(numer) denom = _zero_out_fperr(denom) if not isinstance(denom, np.ndarray): # if ``denom`` is a scalar, check these corner cases first before # doing division if count < 4: return np.nan if denom == 0: return 0 with np.errstate(invalid='ignore', divide='ignore'): result = numer / denom - adj dtype = values.dtype if is_float_dtype(dtype): result = result.astype(dtype) if isinstance(result, np.ndarray): result = np.where(denom == 0, 0, result) result[count < 4] = np.nan return result
def _get_values( values: np.ndarray, skipna: bool, fill_value: Any = None, fill_value_typ: Optional[str] = None, mask: Optional[np.ndarray] = None, ) -> Tuple[np.ndarray, Optional[np.ndarray], np.dtype, np.dtype, Any]: """ Utility to get the values view, mask, dtype, dtype_max, and fill_value. If both mask and fill_value/fill_value_typ are not None and skipna is True, the values array will be copied. For input arrays of boolean or integer dtypes, copies will only occur if a precomputed mask, a fill_value/fill_value_typ, and skipna=True are provided. Parameters ---------- values : ndarray input array to potentially compute mask for skipna : bool boolean for whether NaNs should be skipped fill_value : Any value to fill NaNs with fill_value_typ : str Set to '+inf' or '-inf' to handle dtype-specific infinities mask : Optional[np.ndarray] nan-mask if known Returns ------- values : ndarray Potential copy of input value array mask : Optional[ndarray[bool]] Mask for values, if deemed necessary to compute dtype : dtype dtype for values dtype_max : dtype platform independent dtype fill_value : Any fill value used """ mask = _maybe_get_mask(values, skipna, mask) if is_datetime64tz_dtype(values): # com.values_from_object returns M8[ns] dtype instead of tz-aware, # so this case must be handled separately from the rest dtype = values.dtype values = getattr(values, "_values", values) else: values = com.values_from_object(values) dtype = values.dtype if is_datetime_or_timedelta_dtype(values) or is_datetime64tz_dtype(values): # changing timedelta64/datetime64 to int64 needs to happen after # finding `mask` above values = getattr(values, "asi8", values) values = values.view(np.int64) dtype_ok = _na_ok_dtype(dtype) # get our fill value (in case we need to provide an alternative # dtype for it) fill_value = _get_fill_value( dtype, fill_value=fill_value, fill_value_typ=fill_value_typ ) copy = (mask is not None) and (fill_value is not None) if skipna and copy: values = values.copy() if dtype_ok: np.putmask(values, mask, fill_value) # promote if needed else: values, changed = maybe_upcast_putmask(values, mask, fill_value) # return a platform independent precision dtype dtype_max = dtype if is_integer_dtype(dtype) or is_bool_dtype(dtype): dtype_max = np.int64 elif is_float_dtype(dtype): dtype_max = np.float64 return values, mask, dtype, dtype_max, fill_value