def _combine_match_index(self, other, func, level=None, fill_value=None):
    """Apply ``func`` between each column of this object and ``other``.

    Both operands are first reindexed onto the union of their indexes.

    Parameters
    ----------
    other : object exposing ``index``, ``reindex``, ``values``, ``fill_value``
    func : binary callable, called as ``func(column.values, other.values)``
    level : must be None
    fill_value : must be None

    Returns
    -------
    Object built by ``self._constructor`` and finalized against ``self``.

    Raises
    ------
    NotImplementedError
        If ``fill_value`` or ``level`` is supplied.
    """
    if fill_value is not None:
        raise NotImplementedError("'fill_value' argument is not supported")
    if level is not None:
        raise NotImplementedError("'level' argument is not supported")

    new_index = self.index.union(other.index)

    # Reindex only when union() did not hand back the very same object.
    this = self if self.index is new_index else self.reindex(new_index)
    if other.index is not new_index:
        other = other.reindex(new_index)

    new_data = {col: func(series.values, other.values)
                for col, series in compat.iteritems(this)}

    # The resulting fill value is a function of our operator; a null on
    # either side makes it NaN.
    if isnull(other.fill_value) or isnull(self.default_fill_value):
        combined_fill = np.nan
    else:
        combined_fill = func(np.float64(self.default_fill_value),
                             np.float64(other.fill_value))

    combined = self._constructor(new_data, index=new_index,
                                 columns=self.columns,
                                 default_fill_value=combined_fill)
    return combined.__finalize__(self)
def test_timedelta_other_units(self):
    """isnull/notnull must flag NaT in timedelta data of every resolution."""
    idx = pd.TimedeltaIndex(['1 days', 'NaT', '2 days'])
    null_mask = np.array([False, True, False])

    # The index itself first, then its underlying ndarray.
    for obj in (idx, idx.values):
        tm.assert_numpy_array_equal(isnull(obj), null_mask)
        tm.assert_numpy_array_equal(notnull(obj), ~null_mask)

    unit_dtypes = [
        'timedelta64[D]', 'timedelta64[h]', 'timedelta64[m]',
        'timedelta64[s]', 'timedelta64[ms]', 'timedelta64[us]',
        'timedelta64[ns]',
    ]
    null_series = pd.Series([False, True, False])

    for unit_dtype in unit_dtypes:
        values = idx.values.astype(unit_dtype)
        tm.assert_numpy_array_equal(isnull(values), null_mask)
        tm.assert_numpy_array_equal(notnull(values), ~null_mask)

        # Same data wrapped in a Series: inferred dtype, then object dtype.
        for series_kwargs in ({}, {'dtype': object}):
            ser = pd.Series(values, **series_kwargs)
            tm.assert_series_equal(isnull(ser), null_series)
            tm.assert_series_equal(notnull(ser), ~null_series)
def test_isnull_datetime(self):
    """isnull/notnull on datetime scalars, DatetimeIndex and PeriodIndex."""
    self.assertFalse(isnull(datetime.now()))
    self.assertTrue(notnull(datetime.now()))

    # A freshly generated range contains no nulls at all.
    rng = date_range('1/1/1990', periods=20)
    tm.assert_numpy_array_equal(notnull(rng), np.ones(len(rng), dtype=bool))

    # Overwrite the first entry with iNaT and rebuild the index.
    raw = np.asarray(rng)
    raw[0] = iNaT
    idx = DatetimeIndex(raw)

    first_only = np.zeros(len(idx), dtype=bool)
    first_only[0] = True

    mask = isnull(idx)
    self.assertTrue(mask[0])
    self.assert_numpy_array_equal(mask, first_only)

    # GH 9129
    pidx = idx.to_period(freq='M')
    mask = isnull(pidx)
    self.assertTrue(mask[0])
    self.assert_numpy_array_equal(mask, first_only)

    # Dropping the first element leaves no nulls behind.
    mask = isnull(pidx[1:])
    self.assert_numpy_array_equal(mask, np.zeros(len(mask), dtype=bool))
def test_datetime_other_units(self):
    """isnull/notnull must flag NaT in datetime data of every resolution."""
    idx = pd.DatetimeIndex(['2011-01-01', 'NaT', '2011-01-02'])
    null_mask = np.array([False, True, False])

    # The index itself first, then its underlying ndarray.
    for obj in (idx, idx.values):
        tm.assert_numpy_array_equal(isnull(obj), null_mask)
        tm.assert_numpy_array_equal(notnull(obj), ~null_mask)

    unit_dtypes = [
        'datetime64[D]', 'datetime64[h]', 'datetime64[m]',
        'datetime64[s]', 'datetime64[ms]', 'datetime64[us]',
        'datetime64[ns]',
    ]
    null_series = pd.Series([False, True, False])

    for unit_dtype in unit_dtypes:
        values = idx.values.astype(unit_dtype)
        tm.assert_numpy_array_equal(isnull(values), null_mask)
        tm.assert_numpy_array_equal(notnull(values), ~null_mask)

        # Same data wrapped in a Series: inferred dtype, then object dtype.
        for series_kwargs in ({}, {'dtype': object}):
            ser = pd.Series(values, **series_kwargs)
            tm.assert_series_equal(isnull(ser), null_series)
            tm.assert_series_equal(notnull(ser), ~null_series)
def test_isnull_nat():
    """NaT is reported as null both in a list and in an object ndarray."""
    all_null = np.array([True])
    for container in ([NaT], np.array([NaT], dtype=object)):
        outcome = isnull(container)
        assert (np.array_equal(outcome, all_null))
def test_isnull_nat(self):
    """NaT is reported as null both in a list and in an object ndarray."""
    all_null = np.array([True])
    for container in ([NaT], np.array([NaT], dtype=object)):
        outcome = isnull(container)
        tm.assert_numpy_array_equal(outcome, all_null)
def test_isnull():
    """isnull on scalars, Series, DataFrame, Panel and Panel4D inputs."""
    for null_scalar in (None, np.NaN):
        assert isnull(null_scalar)
    for valid_scalar in (1., np.inf, -np.inf):
        assert not isnull(valid_scalar)

    # series: the result keeps the Series type
    all_series = [tm.makeFloatSeries(), tm.makeStringSeries(),
                  tm.makeObjectSeries(), tm.makeTimeSeries(),
                  tm.makePeriodSeries()]
    for ser in all_series:
        assert (isinstance(isnull(ser), Series))

    # frame: must agree with applying isnull column by column
    all_frames = [tm.makeTimeDataFrame(), tm.makePeriodFrame(),
                  tm.makeMixedDataFrame()]
    for frame in all_frames:
        tm.assert_frame_equal(isnull(frame), frame.apply(isnull))

    # panel
    all_panels = [tm.makePanel(), tm.makePeriodPanel(),
                  tm.add_nans(tm.makePanel())]
    for panel in all_panels:
        tm.assert_panel_equal(isnull(panel), panel.apply(isnull))

    # panel 4d
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        all_panel4ds = [tm.makePanel4D(),
                        tm.add_nans_panel4d(tm.makePanel4D())]
        for panel4d in all_panel4ds:
            tm.assert_panel4d_equal(isnull(panel4d),
                                    panel4d.apply(isnull))
def test_isnull(self):
    """isnull on scalars, Series, DataFrame, Panel and Panel4D inputs."""
    self.assertFalse(isnull(1.))
    self.assertTrue(isnull(None))
    self.assertTrue(isnull(np.NaN))
    # The value under test has to go through isnull(): a bare float('nan')
    # is truthy, so asserting on it directly can never fail.
    self.assertTrue(isnull(float('nan')))
    self.assertFalse(isnull(np.inf))
    self.assertFalse(isnull(-np.inf))

    # series: the result keeps the Series type
    for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
              tm.makeObjectSeries(), tm.makeTimeSeries(),
              tm.makePeriodSeries()]:
        self.assertIsInstance(isnull(s), Series)

    # frame: must agree with applying isnull column by column
    for df in [tm.makeTimeDataFrame(), tm.makePeriodFrame(),
               tm.makeMixedDataFrame()]:
        result = isnull(df)
        expected = df.apply(isnull)
        tm.assert_frame_equal(result, expected)

    # panel
    with catch_warnings(record=True):
        for p in [tm.makePanel(), tm.makePeriodPanel(),
                  tm.add_nans(tm.makePanel())]:
            result = isnull(p)
            expected = p.apply(isnull)
            tm.assert_panel_equal(result, expected)

    # panel 4d
    with catch_warnings(record=True):
        for p in [tm.makePanel4D(), tm.add_nans_panel4d(tm.makePanel4D())]:
            result = isnull(p)
            expected = p.apply(isnull)
            tm.assert_panel4d_equal(result, expected)
def na_op(x, y):
    """Null-aware comparison of ``x`` against ``y``.

    ``op``, ``name`` and ``masker`` are taken from the enclosing scope.

    Raises
    ------
    TypeError
        For datetimelike-vs-numeric comparisons, or when the comparison
        method returns ``NotImplemented``.
    """
    # dispatch to the categorical if we have a categorical
    # in either operand
    if is_categorical_dtype(x):
        return op(x, y)
    elif is_categorical_dtype(y) and not isscalar(y):
        return op(y, x)

    if is_object_dtype(x.dtype):
        return _comp_method_OBJECT_ARRAY(op, x, y)

    # we want to compare like types
    # we only want to convert to integer like if
    # we are not NotImplemented, otherwise
    # we would allow datetime64 (but viewed as i8) against
    # integer comparisons
    if is_datetimelike_v_numeric(x, y):
        raise TypeError("invalid type comparison")

    y_is_scalar = isscalar(y)

    # numpy does not like comparisons vs None
    if y_is_scalar and isnull(y):
        make_constant = np.ones if name == '__ne__' else np.zeros
        return make_constant(len(x), dtype=bool)

    # we have a datetime/timedelta and may need to convert
    null_mask = None
    if needs_i8_conversion(x) or (not y_is_scalar and
                                  needs_i8_conversion(y)):
        null_mask = isnull(x)
        if y_is_scalar:
            y = _index.convert_scalar(x, _values_from_object(y))
        else:
            null_mask = null_mask | isnull(y)
            y = y.view('i8')
        x = x.view('i8')

    try:
        result = getattr(x, name)(y)
    except AttributeError:
        # x has no such comparison method; fall back to the operator
        result = op(x, y)
    else:
        if result is NotImplemented:
            raise TypeError("invalid type comparison")

    if null_mask is not None and null_mask.any():
        result[null_mask] = masker

    return result
def f(x, y):
    """Apply ``op`` (from the enclosing scope) and put NaN wherever either
    operand is null.

    A boolean result is cast to object dtype before NaN is written into it.
    """
    either_null = isnull(x) | isnull(y)
    result = op(x, y)

    if not either_null.any():
        return result

    if is_bool_dtype(result):
        result = result.astype('O')
    np.putmask(result, either_null, np.nan)
    return result
def test_period(self):
    """isnull/notnull must flag NaT in a PeriodIndex and in Series of it."""
    idx = pd.PeriodIndex(['2011-01', 'NaT', '2012-01'], freq='M')

    null_mask = np.array([False, True, False])
    tm.assert_numpy_array_equal(isnull(idx), null_mask)
    tm.assert_numpy_array_equal(notnull(idx), ~null_mask)

    # Series built from the index: inferred dtype, then object dtype.
    null_series = pd.Series([False, True, False])
    for series_kwargs in ({}, {'dtype': object}):
        ser = pd.Series(idx, **series_kwargs)
        tm.assert_series_equal(isnull(ser), null_series)
        tm.assert_series_equal(notnull(ser), ~null_series)
def f(x, y):
    """Apply ``op`` (from the enclosing scope) with numpy floating-point
    warnings silenced, and put NaN wherever either operand is null.

    A boolean result is cast to object dtype before NaN is written into it.
    """
    either_null = isnull(x) | isnull(y)

    with np.errstate(all='ignore'):
        result = op(x, y)

    if not either_null.any():
        return result

    if is_bool_dtype(result):
        result = result.astype('O')
    np.putmask(result, either_null, np.nan)
    return result
def test_isnull_lists():
    """isnull accepts (nested) lists and returns an array of matching shape."""
    nested_cases = [
        ([[False]], np.array([[False]])),
        ([[1], [2]], np.array([[False], [False]])),
    ]
    for data, expected in nested_cases:
        assert (np.array_equal(isnull(data), expected))

    # list of strings / unicode
    for strings in (['foo', 'bar'], [u('foo'), u('bar')]):
        assert (not isnull(strings).any())
def na_op(x, y):
    """Apply the boolean ``op`` (from the enclosing scope) to ``x`` and ``y``.

    ``op(x, y)`` is tried first. If it raises ``TypeError``, the operands
    are routed through ``lib.vec_binop`` (array-like ``y``) or
    ``lib.scalar_binop`` (anything else).

    Raises
    ------
    TypeError
        If the scalar fallback fails for any ordinary reason.
    """
    try:
        result = op(x, y)
    except TypeError:
        if isinstance(y, list):
            y = lib.list_to_object_array(y)

        if isinstance(y, (np.ndarray, ABCSeries)):
            if (is_bool_dtype(x.dtype) and is_bool_dtype(y.dtype)):
                result = op(x, y)  # when would this be hit?
            else:
                x = _ensure_object(x)
                y = _ensure_object(y)
                result = lib.vec_binop(x, y, op)
        else:
            try:
                # let null fall thru
                if not isnull(y):
                    y = bool(y)
                result = lib.scalar_binop(x, y, op)
            except Exception:
                # Deliberately not a bare ``except``: KeyboardInterrupt and
                # SystemExit must propagate rather than be rewritten as a
                # TypeError.
                raise TypeError("cannot compare a dtyped [{0}] array with "
                                "a scalar of type [{1}]".format(
                                    x.dtype, type(y).__name__))

    return result
def nanprod(values, axis=None, skipna=True):
    """Product of ``values`` along ``axis``.

    With ``skipna`` and a non-integer dtype, nulls are replaced by 1 in a
    copy before multiplying. The raw product is then passed through
    ``_maybe_null_out`` together with the null mask.
    """
    null_mask = isnull(values)

    replace_nulls = skipna and not is_any_int_dtype(values)
    if replace_nulls:
        # work on a copy so the caller's data is left untouched
        values = values.copy()
        values[null_mask] = 1

    return _maybe_null_out(values.prod(axis), axis, null_mask)
def nargsort(items, kind='quicksort', ascending=True, na_position='last'):
    """
    Null-aware stand-in for np.argsort with ``ascending`` and
    ``na_position`` parameters. GH #6399, #5231

    Categorical input is delegated to its own ``argsort`` (only
    ``ascending`` is forwarded). Otherwise the non-null entries are
    argsorted and the positions of the nulls are placed first or last.

    Raises
    ------
    ValueError
        If ``na_position`` is neither 'last' nor 'first'.
    """
    # specially handle Categorical
    if is_categorical_dtype(items):
        return items.argsort(ascending=ascending)

    items = np.asanyarray(items)
    positions = np.arange(len(items))
    null_mask = isnull(items)
    valid = ~null_mask

    valid_items = items[valid]
    valid_positions = positions[valid]
    null_positions = np.nonzero(null_mask)[0]

    if ascending:
        indexer = valid_positions[valid_items.argsort(kind=kind)]
    else:
        # reverse the input, sort, then reverse the outcome
        flipped_items = valid_items[::-1]
        flipped_positions = valid_positions[::-1]
        indexer = flipped_positions[flipped_items.argsort(kind=kind)][::-1]

    # Finally, place the NaNs at the end or the beginning according to
    # na_position
    if na_position == 'last':
        pieces = [indexer, null_positions]
    elif na_position == 'first':
        pieces = [null_positions, indexer]
    else:
        raise ValueError('invalid na_position: {!r}'.format(na_position))
    return np.concatenate(pieces)
def test_notnull():
    """notnull on scalars, on arrays under both inf-as-null modes, and on
    Series."""
    assert notnull(1.)
    assert not notnull(None)
    assert not notnull(np.NaN)

    with cf.option_context("mode.use_inf_as_null", False):
        assert notnull(np.inf)
        assert notnull(-np.inf)

        arr = np.array([1.5, np.inf, 3.5, -np.inf])
        result = notnull(arr)
        assert result.all()

    with cf.option_context("mode.use_inf_as_null", True):
        assert not notnull(np.inf)
        assert not notnull(-np.inf)

        arr = np.array([1.5, np.inf, 3.5, -np.inf])
        result = notnull(arr)
        assert result.sum() == 2

    with cf.option_context("mode.use_inf_as_null", False):
        for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
                  tm.makeObjectSeries(), tm.makeTimeSeries(),
                  tm.makePeriodSeries()]:
            # This test targets notnull; calling isnull here would leave
            # notnull on Series entirely unexercised.
            assert (isinstance(notnull(s), Series))
def _isfinite(values):
    """Return a boolean mask that is True where ``values`` is NOT finite.

    Note the inversion relative to the name. Datetime/timedelta input uses
    ``isnull``; complex, float, integer and bool input is checked with
    ``np.isfinite`` directly; anything else is cast to float64 first.
    """
    if is_datetime_or_timedelta_dtype(values):
        return isnull(values)

    natively_checkable = (is_complex_dtype(values) or
                          is_float_dtype(values) or
                          is_integer_dtype(values) or
                          is_bool_dtype(values))
    if not natively_checkable:
        values = values.astype('float64')
    return ~np.isfinite(values)
def test_empty_object(self):
    """np.empty object arrays are reported as entirely null."""
    shapes = [(4, 0), (4, )]
    for shape in shapes:
        blank = np.empty(shape=shape, dtype=object)
        all_null = np.ones(shape=shape, dtype=bool)
        tm.assert_numpy_array_equal(isnull(blank), all_null)
def shift(self, periods, freq=None, axis=0):
    """Shift by ``periods``.

    * ``periods == 0`` returns a copy.
    * A non-null ``fill_value`` round-trips through the dense
      representation.
    * With ``freq`` the index is shifted and the sparse values are reused.
    * Otherwise the sparse index positions are moved by ``periods`` and
      those falling outside ``[0, length)`` are dropped.
    """
    if periods == 0:
        return self.copy()

    # no special handling of fill values yet
    if not isnull(self.fill_value):
        dense_shifted = self.to_dense().shift(periods, freq=freq, axis=axis)
        return dense_shifted.to_sparse(fill_value=self.fill_value,
                                       kind=self.kind)

    if freq is not None:
        moved = self._constructor(self.sp_values,
                                  sparse_index=self.sp_index,
                                  index=self.index.shift(periods, freq),
                                  fill_value=self.fill_value)
        return moved.__finalize__(self)

    int_index = self.sp_index.to_int_index()
    moved_positions = int_index.indices + periods
    # slice bounds of the positions still inside [0, length)
    lo, hi = moved_positions.searchsorted([0, int_index.length])

    kept_positions = moved_positions[lo:hi]
    new_sp_index = _make_index(len(self), kept_positions, self.sp_index)

    kept_values = self.sp_values[lo:hi].copy()
    arr = self.values._simple_new(kept_values, new_sp_index,
                                  fill_value=np.nan)
    return self._constructor(arr, index=self.index).__finalize__(self)
def make_sparse(arr, kind='block', fill_value=nan):
    """
    Convert ndarray to sparse format

    Parameters
    ----------
    arr : ndarray
    kind : {'block', 'integer'}
    fill_value : NaN or another value
        Entries equal to it (or null, when it is null) are not stored.

    Returns
    -------
    (sparse_values, index) : (ndarray, SparseIndex)

    Raises
    ------
    TypeError
        If the sanitized input has more than one dimension.
    """
    arr = _sanitize_values(arr)
    if arr.ndim > 1:
        raise TypeError("expected dimension <= 1 data")

    # True marks the entries that must be stored explicitly.
    keep = notnull(arr) if isnull(fill_value) else arr != fill_value

    n = len(arr)
    if n == keep.size:
        indices = np.arange(n, dtype=np.int32)[keep]
    else:
        # the arr is a SparseArray
        indices = keep.sp_index.indices

    sparse_index = _make_index(n, indices, kind)
    return arr[keep], sparse_index
def backfill_2d(values, limit=None, mask=None, dtype=None):
    """Run the dtype-specific ``backfill_2d_inplace_*`` routine on ``values``.

    Parameters
    ----------
    values : ndarray
    limit : forwarded to the routine
    mask : optional null mask; computed with ``isnull`` when omitted
    dtype : optional dtype used for dispatch; defaults to ``values.dtype``

    Returns
    -------
    ``values`` (converted to float64 first when it had an integer dtype).

    Raises
    ------
    ValueError
        If no routine exists for the dtype.
    """
    if dtype is None:
        dtype = values.dtype

    filler = None
    if is_float_dtype(values):
        filler = getattr(algos, 'backfill_2d_inplace_%s' % dtype.name, None)
    elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
        filler = _backfill_2d_datetime
    elif is_integer_dtype(values):
        values = _ensure_float64(values)
        filler = algos.backfill_2d_inplace_float64
    elif values.dtype == np.object_:
        filler = algos.backfill_2d_inplace_object

    if filler is None:
        raise ValueError('Invalid dtype for backfill_2d [%s]' % dtype.name)

    if mask is None:
        mask = isnull(values)
    mask = mask.view(np.uint8)

    # The routine is skipped when any dimension has length zero.
    if np.all(values.shape):
        filler(values, mask, limit=limit)
    return values
def wrapper(self, other):
    """Compare this TimedeltaIndex with ``other`` using ``opname``.

    ``opname`` and ``nat_result`` come from the enclosing scope. Positions
    where either side is NaT are set to ``nat_result``.

    Raises
    ------
    TypeError
        If ``other`` cannot be parsed as a timedelta and is not list-like.
    """
    msg = "cannot compare a TimedeltaIndex with type {0}"
    func = getattr(super(TimedeltaIndex, self), opname)

    if _is_convertible_to_td(other) or other is tslib.NaT:
        try:
            other = _to_m8(other)
        except ValueError:
            # failed to parse as timedelta
            raise TypeError(msg.format(type(other)))
        result = func(other)
        if isnull(other):
            result.fill(nat_result)
    elif not is_list_like(other):
        raise TypeError(msg.format(type(other)))
    else:
        other = TimedeltaIndex(other).values
        result = _values_from_object(func(other))

        raw_other = other.values if isinstance(other, Index) else other
        other_is_nat = raw_other.view('i8') == tslib.iNaT
        if other_is_nat.any():
            result[other_is_nat] = nat_result

    if self.hasnans:
        result[self._isnan] = nat_result

    # support of bool dtype indexers
    return result if is_bool_dtype(result) else Index(result)
def get_loc(self, key, method=None, tolerance=None):
    """
    Get integer location for requested label

    A null key is looked up as NaT. Keys convertible to a timedelta are
    converted first; other keys are tried verbatim, then as a string
    slice, then as a parsed Timedelta.

    Returns
    -------
    loc : int

    Raises
    ------
    KeyError
        If none of the lookup strategies finds the key.
    """
    if isnull(key):
        key = tslib.NaT

    if tolerance is not None:
        # try converting tolerance now, so errors don't get swallowed by
        # the try/except clauses below
        tolerance = self._convert_tolerance(tolerance)

    if _is_convertible_to_td(key):
        return Index.get_loc(self, Timedelta(key), method, tolerance)

    try:
        return Index.get_loc(self, key, method, tolerance)
    except (KeyError, ValueError, TypeError):
        # second attempt: partial-string slicing
        try:
            return self._get_string_slice(key)
        except (TypeError, KeyError, ValueError):
            pass

        # last attempt: parse the key as a Timedelta
        try:
            return Index.get_loc(self, Timedelta(key), method, tolerance)
        except (KeyError, ValueError):
            raise KeyError(key)
def mode(values):
    """Returns the mode or mode(s) of the passed Series or ndarray (sorted)

    A Series input is rebuilt with its own constructor; anything else is
    returned as a plain Series.
    """
    # must sort because hash order isn't necessarily defined.
    from pandas.core.series import Series

    if isinstance(values, Series):
        constructor = values._constructor
        values = values.values
    else:
        values = np.asanyarray(values)
        constructor = Series

    dtype = values.dtype

    if is_integer_dtype(values):
        as_int64 = _ensure_int64(values)
        return constructor(sorted(htable.mode_int64(as_int64)), dtype=dtype)

    if issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
        # count on the int64 view, then restore the original dtype
        as_int64 = values.view(np.int64)
        return constructor(sorted(htable.mode_int64(as_int64)), dtype=dtype)

    if is_categorical_dtype(values):
        return constructor(values.mode())

    null_mask = isnull(values)
    modes = htable.mode_object(_ensure_object(values), null_mask)
    try:
        modes = sorted(modes)
    except TypeError as e:
        # unorderable objects: keep the modes in hash-table order
        warn("Unable to sort modes: %s" % e)
    return constructor(modes, dtype=dtype)
def nankurt(values, axis=None, skipna=True):
    """ Compute the sample excess kurtosis.

    The statistic computed here is the adjusted Fisher-Pearson standardized
    moment coefficient G2, computed directly from the second and fourth
    central moment.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
        When True, nulls are zeroed out before the moments are summed.

    Returns
    -------
    Scalar or ndarray. NaN where fewer than 4 observations are available,
    0 where the denominator is zero.
    """
    mask = isnull(values)
    if not is_float_dtype(values.dtype):
        values = values.astype('f8')
        count = _get_counts(mask, axis)
    else:
        count = _get_counts(mask, axis, dtype=values.dtype)

    if skipna:
        values = values.copy()
        np.putmask(values, mask, 0)

    mean = values.sum(axis, dtype=np.float64) / count
    if axis is not None:
        mean = np.expand_dims(mean, axis)

    adjusted = values - mean
    if skipna:
        np.putmask(adjusted, mask, 0)
    adjusted2 = adjusted ** 2
    adjusted4 = adjusted2 ** 2
    m2 = adjusted2.sum(axis, dtype=np.float64)
    m4 = adjusted4.sum(axis, dtype=np.float64)

    # count == 2 or count == 3 makes the divisor zero; those results are
    # discarded below, so only the warning is silenced.
    with np.errstate(invalid='ignore', divide='ignore'):
        adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3))
    numer = count * (count + 1) * (count - 1) * m4
    denom = (count - 2) * (count - 3) * m2**2

    # floating point error
    numer = _zero_out_fperr(numer)
    denom = _zero_out_fperr(denom)

    if not isinstance(denom, np.ndarray):
        # if ``denom`` is a scalar, check these corner cases first before
        # doing division
        if count < 4:
            return np.nan
        if denom == 0:
            return 0

    # Divide only once, after the floating-point clean-up of numer/denom.
    with np.errstate(invalid='ignore', divide='ignore'):
        result = numer / denom - adj

    dtype = values.dtype
    if is_float_dtype(dtype):
        result = result.astype(dtype)

    if isinstance(result, np.ndarray):
        result = np.where(denom == 0, 0, result)
        result[count < 4] = np.nan

    return result
def test_isnull_lists(self):
    """isnull accepts (nested) lists and returns an array of matching shape."""
    nested_cases = [
        ([[False]], np.array([[False]])),
        ([[1], [2]], np.array([[False], [False]])),
    ]
    for data, expected in nested_cases:
        tm.assert_numpy_array_equal(isnull(data), expected)

    # list of strings / unicode
    for strings in (['foo', 'bar'], [u('foo'), u('bar')]):
        none_null = np.array([False, False])
        tm.assert_numpy_array_equal(isnull(strings), none_null)
def _bins_to_cuts(x, bins, right=True, labels=None, precision=3,
                  include_lowest=False, dtype=None, duplicates='raise'):
    """Assign each element of ``x`` to the bin delimited by ``bins``.

    Parameters
    ----------
    x : array-like of values to bin
    bins : array of bin edges (must support ``searchsorted``)
    right : bool, whether intervals are closed on the right
    labels : None (generate labels), False (return integer codes) or a
        sequence with one label per bin
    precision : starting precision for generated labels
    include_lowest : bool, put values equal to the first edge in bin one
    dtype : forwarded to ``_format_levels``
    duplicates : {'raise', 'drop'}, handling of repeated bin edges

    Returns
    -------
    (fac, bins) : the binned result and the edges actually used

    Raises
    ------
    ValueError
        On an invalid ``duplicates`` value, on duplicated edges with
        ``duplicates='raise'``, on a wrong number of labels, or when label
        formatting keeps failing.
    """
    if duplicates not in ['raise', 'drop']:
        raise ValueError("invalid value for 'duplicates' parameter, "
                         "valid options are: raise, drop")

    unique_bins = algos.unique(bins)
    if len(unique_bins) < len(bins):
        if duplicates == 'raise':
            raise ValueError("Bin edges must be unique: {}.\nYou "
                             "can drop duplicate edges by setting "
                             "the 'duplicates' kwarg".format(repr(bins)))
        bins = unique_bins

    ids = bins.searchsorted(x, side='left' if right else 'right')
    if include_lowest:
        ids[x == bins[0]] = 1

    # null values and values outside the outermost edges get no bin
    na_mask = isnull(x) | (ids == len(bins)) | (ids == 0)

    if labels is False:
        fac = ids - 1
        if na_mask.any():
            fac = fac.astype(np.float64)
            np.putmask(fac, na_mask, np.nan)
        return fac, bins

    if labels is None:
        # Retry with one more digit of precision after each failure and
        # give up (re-raising) on the 20th.
        max_attempts = 20
        for attempt in range(max_attempts):
            try:
                levels = _format_levels(bins, precision + attempt,
                                        right=right,
                                        include_lowest=include_lowest,
                                        dtype=dtype)
            except ValueError:
                if attempt == max_attempts - 1:
                    raise
            else:
                break
    elif len(labels) != len(bins) - 1:
        raise ValueError('Bin labels must be one fewer than '
                         'the number of bin edges')
    else:
        levels = labels

    levels = np.asarray(levels, dtype=object)
    np.putmask(ids, na_mask, 0)
    fac = Categorical(ids - 1, levels, ordered=True, fastpath=True)
    return fac, bins
def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
                  precision=3, name=None, include_lowest=False):
    """Assign each element of ``x`` to the bin delimited by ``bins``.

    Parameters
    ----------
    x : array-like or Series of values to bin
    bins : array of bin edges (must support ``searchsorted``)
    right : bool, whether intervals are closed on the right
    labels : None (generate labels), False (return integer codes) or a
        sequence with one label per bin
    retbins : bool, also return the bin edges
    precision : starting precision for generated labels
    name : name of the resulting Series; defaults to the input's name
    include_lowest : bool, put values equal to the first edge in bin one

    Returns
    -------
    ``fac``, or ``(fac, bins)`` when ``retbins`` is true. A Series input
    yields a Series carrying the original index.

    Raises
    ------
    ValueError
        On duplicated bin edges, on a wrong number of labels, or when
        label formatting keeps failing.
    """
    series_index = None
    x_is_series = isinstance(x, Series)
    if x_is_series:
        series_index = x.index
        if name is None:
            name = x.name

    x = np.asarray(x)

    ids = bins.searchsorted(x, side="left" if right else "right")

    if len(algos.unique(bins)) < len(bins):
        raise ValueError("Bin edges must be unique: %s" % repr(bins))

    if include_lowest:
        ids[x == bins[0]] = 1

    # null values and values outside the outermost edges get no bin
    na_mask = isnull(x) | (ids == len(bins)) | (ids == 0)

    if labels is False:
        fac = ids - 1
        if na_mask.any():
            fac = fac.astype(np.float64)
            np.putmask(fac, na_mask, np.nan)
    else:
        if labels is None:
            # Retry with one more digit of precision after each failure
            # and give up (re-raising) on the 20th.
            max_attempts = 20
            for attempt in range(max_attempts):
                try:
                    levels = _format_levels(bins, precision + attempt,
                                            right=right,
                                            include_lowest=include_lowest)
                except ValueError:
                    if attempt == max_attempts - 1:
                        raise
                else:
                    break
        elif len(labels) != len(bins) - 1:
            raise ValueError("Bin labels must be one fewer than "
                             "the number of bin edges")
        else:
            levels = labels

        levels = np.asarray(levels, dtype=object)
        np.putmask(ids, na_mask, 0)
        fac = Categorical(ids - 1, levels, ordered=True, fastpath=True)

    if x_is_series:
        fac = Series(fac, index=series_index, name=name)

    if retbins:
        return fac, bins
    return fac
def mask_missing(arr, values_to_mask):
    """
    Return a masking array of same size/shape as arr
    with entries equaling any member of values_to_mask set to True

    Parameters
    ----------
    arr : ndarray
    values_to_mask : scalar, list or ndarray
        A null among these values selects the null entries of ``arr``.

    Returns
    -------
    ndarray of bool with ``arr``'s shape. When ``values_to_mask`` is empty
    the mask is all False (never ``None``).
    """
    if not isinstance(values_to_mask, (list, np.ndarray)):
        values_to_mask = [values_to_mask]

    try:
        values_to_mask = np.array(values_to_mask, dtype=arr.dtype)
    except Exception:
        # values not representable in arr's dtype: compare as objects
        values_to_mask = np.array(values_to_mask, dtype=object)

    na_mask = isnull(values_to_mask)
    nonna = values_to_mask[~na_mask]

    mask = None
    for x in nonna:
        if mask is None:
            # numpy elementwise comparison warning
            if is_numeric_v_string_like(arr, x):
                mask = False
            else:
                mask = arr == x

            # if x is a string and arr is not, then we get False and we must
            # expand the mask to size arr.shape
            if is_scalar(mask):
                mask = np.zeros(arr.shape, dtype=bool)
        else:
            # numpy elementwise comparison warning
            if is_numeric_v_string_like(arr, x):
                mask |= False
            else:
                mask |= arr == x

    if na_mask.any():
        if mask is None:
            mask = isnull(arr)
        else:
            mask |= isnull(arr)

    # Nothing to mask at all (empty ``values_to_mask``): still hand back a
    # proper boolean array so callers can use ``mask.any()`` / indexing.
    if mask is None:
        mask = np.zeros(arr.shape, dtype=bool)

    return mask
def _value_counts_arraylike(values, dropna=True):
    """Count distinct values with the hashtable routine for their dtype.

    Parameters
    ----------
    values : array-like
    dropna : bool, default True
        Whether nulls are left out of the counts.

    Returns
    -------
    (keys, counts)
    """
    tz_aware = is_datetimetz(values)
    period_like = (is_period_dtype(values) or is_period_arraylike(values))
    original = values

    from pandas.core.series import Series
    values = Series(values).values
    dtype = values.dtype

    if needs_i8_conversion(dtype) or period_like:
        from pandas.tseries.index import DatetimeIndex
        from pandas.tseries.period import PeriodIndex

        if period_like:
            # values may be an object
            values = PeriodIndex(values)
            freq = values.freq

        values = values.view(np.int64)
        keys, counts = htable.value_count_int64(values, dropna)

        if dropna:
            not_nat = keys != iNaT
            keys = keys[not_nat]
            counts = counts[not_nat]

        # convert the keys back to the dtype we came in
        keys = keys.astype(dtype)

        # dtype handling
        if tz_aware:
            keys = DatetimeIndex._simple_new(keys, tz=original.dtype.tz)
        if period_like:
            keys = PeriodIndex._simple_new(keys, freq=freq)
        return keys, counts

    if is_signed_integer_dtype(dtype):
        values = _ensure_int64(values)
        keys, counts = htable.value_count_int64(values, dropna)
        return keys, counts

    if is_unsigned_integer_dtype(dtype):
        values = _ensure_uint64(values)
        keys, counts = htable.value_count_uint64(values, dropna)
        return keys, counts

    if is_float_dtype(dtype):
        values = _ensure_float64(values)
        keys, counts = htable.value_count_float64(values, dropna)
        return keys, counts

    values = _ensure_object(values)
    keys, counts = htable.value_count_object(values, dropna)

    # with dropna=False, prepend one NaN entry holding the null count
    null_mask = isnull(values)
    if not dropna and null_mask.any():
        keys = np.insert(keys, 0, np.NaN)
        counts = np.insert(counts, 0, null_mask.sum())
    return keys, counts
def nansem(values, axis=None, skipna=True, ddof=1):
    """Standard error of the mean: ``sqrt(var) / sqrt(count)``.

    The variance comes from ``nanvar`` and the count from
    ``_get_counts_nanvar``. Non-float input is cast to float64 before the
    final variance is computed.
    """
    # NOTE(review): this result is overwritten below. The call presumably
    # exists so nanvar can reject unsupported input before the float cast;
    # confirm before removing it.
    var = nanvar(values, axis, skipna, ddof=ddof)

    null_mask = isnull(values)
    if not is_float_dtype(values.dtype):
        values = values.astype('f8')

    count = _get_counts_nanvar(null_mask, axis, ddof, values.dtype)[0]
    var = nanvar(values, axis, skipna, ddof=ddof)

    return np.sqrt(var) / np.sqrt(count)
def test_isnull_numpy_nat(self):
    """pandas NaT and numpy datetime64/timedelta64 NaT are all null."""
    nat_values = [
        NaT,
        np.datetime64('NaT'),
        np.timedelta64('NaT'),
        np.datetime64('NaT', 's'),
    ]
    arr = np.array(nat_values)
    all_null = np.array([True] * 4)
    tm.assert_numpy_array_equal(isnull(arr), all_null)
def _value_counts_arraylike(values, dropna=True):
    """Count distinct values with the hashtable routine for their dtype.

    Parameters
    ----------
    values : array-like
    dropna : bool, default True
        Whether nulls are left out of the counts.

    Returns
    -------
    (keys, counts)
    """
    tz_aware = is_datetimetz(values)
    period_like = (isinstance(values, ABCPeriodIndex) or
                   is_period_arraylike(values))
    original = values

    from pandas.core.series import Series
    values = Series(values).values
    dtype = values.dtype

    if is_datetime_or_timedelta_dtype(dtype) or period_like:
        from pandas.tseries.index import DatetimeIndex
        from pandas.tseries.period import PeriodIndex

        if period_like:
            values = PeriodIndex(values)
            freq = values.freq

        values = values.view(np.int64)
        keys, counts = htable.value_count_scalar64(values, dropna)

        if dropna:
            not_nat = keys != iNaT
            keys = keys[not_nat]
            counts = counts[not_nat]

        # convert the keys back to the dtype we came in
        keys = keys.astype(dtype)

        # dtype handling
        if tz_aware:
            # an index carries tz directly, a Series via its .dt accessor
            if isinstance(original, ABCDatetimeIndex):
                tz = original.tz
            else:
                tz = original.dt.tz
            keys = DatetimeIndex._simple_new(keys, tz=tz)
        if period_like:
            keys = PeriodIndex._simple_new(keys, freq=freq)
        return keys, counts

    if is_integer_dtype(dtype):
        values = _ensure_int64(values)
        keys, counts = htable.value_count_scalar64(values, dropna)
        return keys, counts

    if is_float_dtype(dtype):
        values = _ensure_float64(values)
        keys, counts = htable.value_count_scalar64(values, dropna)
        return keys, counts

    values = _ensure_object(values)
    null_mask = isnull(values)
    keys, counts = htable.value_count_object(values, null_mask)

    # with dropna=False, prepend one NaN entry holding the null count
    if not dropna and null_mask.any():
        keys = np.insert(keys, 0, np.NaN)
        counts = np.insert(counts, 0, null_mask.sum())
    return keys, counts
def wrapper(self, other):
    """Apply ``op`` between this sparse series and ``other``.

    ``op`` and ``name`` come from the enclosing scope. A Series operand is
    converted to sparse if needed and handled by ``_sparse_series_op``; a
    DataFrame yields ``NotImplemented``; a scalar is applied to the stored
    values and to the fill value.

    Raises
    ------
    TypeError
        For any other operand type.
    """
    if isinstance(other, Series):
        if not isinstance(other, SparseSeries):
            other = other.to_sparse(fill_value=self.fill_value)
        return _sparse_series_op(self, other, op, name)

    if isinstance(other, DataFrame):
        return NotImplemented

    if not is_scalar(other):  # pragma: no cover
        raise TypeError('operation with %s not supported' % type(other))

    # a null on either side makes the new fill value NaN
    if isnull(other) or isnull(self.fill_value):
        new_fill_value = np.nan
    else:
        new_fill_value = op(np.float64(self.fill_value),
                            np.float64(other))

    new_sp_values = op(self.sp_values, other)
    return SparseSeries(new_sp_values,
                        index=self.index,
                        sparse_index=self.sp_index,
                        fill_value=new_fill_value,
                        name=self.name)
def nanskew(values, axis=None, skipna=True):
    """ Compute the sample skewness.

    The statistic is the adjusted Fisher-Pearson standardized moment
    coefficient G1, obtained directly from the second and third central
    moments.

    Returns NaN where fewer than 3 observations are available and 0 where
    the second moment is zero.
    """
    mask = isnull(values)
    if is_float_dtype(values.dtype):
        count = _get_counts(mask, axis, dtype=values.dtype)
    else:
        values = values.astype('f8')
        count = _get_counts(mask, axis)

    if skipna:
        # zero the nulls in a copy so they do not contribute to the sums
        values = values.copy()
        np.putmask(values, mask, 0)

    mean = values.sum(axis, dtype=np.float64) / count
    if axis is not None:
        mean = np.expand_dims(mean, axis)

    adjusted = values - mean
    if skipna:
        np.putmask(adjusted, mask, 0)
    adjusted2 = adjusted ** 2
    adjusted3 = adjusted2 * adjusted

    # floating point error
    m2 = _zero_out_fperr(adjusted2.sum(axis, dtype=np.float64))
    m3 = _zero_out_fperr(adjusted3.sum(axis, dtype=np.float64))

    with np.errstate(invalid='ignore', divide='ignore'):
        result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2 ** 1.5)

    dtype = values.dtype
    if is_float_dtype(dtype):
        result = result.astype(dtype)

    if isinstance(result, np.ndarray):
        result = np.where(m2 == 0, 0, result)
        result[count < 3] = np.nan
        return result

    # scalar result
    if count < 3:
        return np.nan
    return 0 if m2 == 0 else result
def nanskew(values, axis=None, skipna=True):
    """ Compute the sample skewness.

    The statistic is the adjusted Fisher-Pearson standardized moment
    coefficient G1, obtained directly from the second and third central
    moments.

    Returns NaN where fewer than 3 observations are available and 0 where
    the second moment is zero.
    """
    mask = isnull(values)
    if is_float_dtype(values.dtype):
        count = _get_counts(mask, axis, dtype=values.dtype)
    else:
        values = values.astype('f8')
        count = _get_counts(mask, axis)

    if skipna:
        # zero the nulls in a copy so they do not contribute to the sums
        values = values.copy()
        np.putmask(values, mask, 0)

    mean = values.sum(axis, dtype=np.float64) / count
    if axis is not None:
        mean = np.expand_dims(mean, axis)

    adjusted = values - mean
    if skipna:
        np.putmask(adjusted, mask, 0)
    adjusted2 = adjusted**2
    adjusted3 = adjusted2 * adjusted

    # floating point error
    m2 = _zero_out_fperr(adjusted2.sum(axis, dtype=np.float64))
    m3 = _zero_out_fperr(adjusted3.sum(axis, dtype=np.float64))

    with np.errstate(invalid='ignore', divide='ignore'):
        result = (count * (count - 1)**0.5 / (count - 2)) * (m3 / m2**1.5)

    dtype = values.dtype
    if is_float_dtype(dtype):
        result = result.astype(dtype)

    if isinstance(result, np.ndarray):
        result = np.where(m2 == 0, 0, result)
        result[count < 3] = np.nan
        return result

    # scalar result
    if count < 3:
        return np.nan
    return 0 if m2 == 0 else result
def test_0d_array(self):
    """isnull on zero-dimensional arrays, inferred dtype and object dtype."""
    # first pass: inferred dtype; second pass: test object dtype
    for array_kwargs in ({}, {'dtype': object}):
        self.assertTrue(isnull(np.array(np.nan, **array_kwargs)))
        self.assertFalse(isnull(np.array(0.0, **array_kwargs)))
        self.assertFalse(isnull(np.array(0, **array_kwargs)))