def test_isna_datetime(self):
    assert not isna(datetime.now())
    assert notna(datetime.now())

    idx = date_range('1/1/1990', periods=20)
    exp = np.ones(len(idx), dtype=bool)
    tm.assert_numpy_array_equal(notna(idx), exp)

    idx = np.asarray(idx)
    idx[0] = iNaT
    idx = DatetimeIndex(idx)
    mask = isna(idx)
    assert mask[0]
    exp = np.array([True] + [False] * (len(idx) - 1), dtype=bool)
    tm.assert_numpy_array_equal(mask, exp)

    # GH 9129
    pidx = idx.to_period(freq='M')
    mask = isna(pidx)
    assert mask[0]
    exp = np.array([True] + [False] * (len(idx) - 1), dtype=bool)
    tm.assert_numpy_array_equal(mask, exp)

    mask = isna(pidx[1:])
    exp = np.zeros(len(mask), dtype=bool)
    tm.assert_numpy_array_equal(mask, exp)
def test_period(self):
    idx = pd.PeriodIndex(['2011-01', 'NaT', '2012-01'], freq='M')
    exp = np.array([False, True, False])
    tm.assert_numpy_array_equal(isna(idx), exp)
    tm.assert_numpy_array_equal(notna(idx), ~exp)

    exp = pd.Series([False, True, False])
    s = pd.Series(idx)
    tm.assert_series_equal(isna(s), exp)
    tm.assert_series_equal(notna(s), ~exp)
    s = pd.Series(idx, dtype=object)
    tm.assert_series_equal(isna(s), exp)
    tm.assert_series_equal(notna(s), ~exp)
def _validate(self):
    """
    Verify that the IntervalIndex is valid.
    """
    if self.closed not in _VALID_CLOSED:
        raise ValueError("invalid options for 'closed': %s" % self.closed)
    if len(self.left) != len(self.right):
        raise ValueError('left and right must have the same length')
    left_mask = notna(self.left)
    right_mask = notna(self.right)
    if not (left_mask == right_mask).all():
        raise ValueError('missing values must be missing in the same '
                         'location both left and right sides')
    if not (self.left[left_mask] <= self.right[left_mask]).all():
        raise ValueError('left side of interval must be <= right side')
    self._mask = ~left_mask
def nancov(a, b, min_periods=None):
    if len(a) != len(b):
        raise AssertionError('Operands to nancov must have same size')

    if min_periods is None:
        min_periods = 1

    valid = notna(a) & notna(b)
    if not valid.all():
        a = a[valid]
        b = b[valid]

    if len(a) < min_periods:
        return np.nan

    return np.cov(a, b)[0, 1]
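# Illustrative sketch (not part of the pandas source): the same pairwise
# deletion that nancov performs, written with public NumPy/pandas calls.
import numpy as np
import pandas as pd

a = np.array([1.0, 2.0, np.nan, 4.0])
b = np.array([2.0, np.nan, 6.0, 8.0])

valid = pd.notna(a) & pd.notna(b)        # keep only rows valid in both operands
print(np.cov(a[valid], b[valid])[0, 1])  # 9.0, covariance of the two valid pairs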
def na_op(x, y):
    import pandas.core.computation.expressions as expressions

    try:
        result = expressions.evaluate(op, str_rep, x, y, **eval_kwargs)
    except TypeError:
        xrav = x.ravel()
        if isinstance(y, (np.ndarray, ABCSeries)):
            dtype = np.find_common_type([x.dtype, y.dtype], [])
            result = np.empty(x.size, dtype=dtype)
            yrav = y.ravel()
            mask = notna(xrav) & notna(yrav)
            xrav = xrav[mask]

            if yrav.shape != mask.shape:
                # FIXME: GH#5284, GH#5035, GH#19448
                # Without specifically raising here we get mismatched
                # errors in Py3 (TypeError) vs Py2 (ValueError)
                raise ValueError('Cannot broadcast operands together.')

            yrav = yrav[mask]
            if xrav.size:
                with np.errstate(all='ignore'):
                    result[mask] = op(xrav, yrav)

        elif isinstance(x, np.ndarray):
            # mask is only meaningful for x
            result = np.empty(x.size, dtype=x.dtype)
            mask = notna(xrav)
            xrav = xrav[mask]
            if xrav.size:
                with np.errstate(all='ignore'):
                    result[mask] = op(xrav, y)
        else:
            raise TypeError("cannot perform operation {op} between "
                            "objects of type {x} and {y}".format(
                                op=name, x=type(x), y=type(y)))

        result, changed = maybe_upcast_putmask(result, ~mask, np.nan)
        result = result.reshape(x.shape)

    result = missing.fill_zeros(result, x, y, name, fill_zeros)

    return result
def stack(frame, level=-1, dropna=True):
    """
    Convert DataFrame to Series with multi-level Index. Columns become the
    second level of the resulting hierarchical index

    Returns
    -------
    stacked : Series
    """
    def factorize(index):
        if index.is_unique:
            return index, np.arange(len(index))
        codes, categories = _factorize_from_iterable(index)
        return categories, codes

    N, K = frame.shape
    if isinstance(frame.columns, MultiIndex):
        if frame.columns._reference_duplicate_name(level):
            msg = ("Ambiguous reference to {level}. The column "
                   "names are not unique.".format(level=level))
            raise ValueError(msg)

    # Will also convert negative level numbers and check if out of bounds.
    level_num = frame.columns._get_level_number(level)

    if isinstance(frame.columns, MultiIndex):
        return _stack_multi_columns(frame, level_num=level_num,
                                    dropna=dropna)
    elif isinstance(frame.index, MultiIndex):
        new_levels = list(frame.index.levels)
        new_labels = [lab.repeat(K) for lab in frame.index.labels]

        clev, clab = factorize(frame.columns)
        new_levels.append(clev)
        new_labels.append(np.tile(clab, N).ravel())

        new_names = list(frame.index.names)
        new_names.append(frame.columns.name)

        new_index = MultiIndex(levels=new_levels, labels=new_labels,
                               names=new_names, verify_integrity=False)
    else:
        levels, (ilab, clab) = zip(*map(factorize, (frame.index,
                                                    frame.columns)))
        labels = ilab.repeat(K), np.tile(clab, N).ravel()
        new_index = MultiIndex(levels=levels, labels=labels,
                               names=[frame.index.name, frame.columns.name],
                               verify_integrity=False)

    new_values = frame.values.ravel()
    if dropna:
        mask = notna(new_values)
        new_values = new_values[mask]
        new_index = new_index[mask]
    return frame._constructor_sliced(new_values, index=new_index)
def make_sparse(arr, kind='block', fill_value=None, dtype=None, copy=False):
    """
    Convert ndarray to sparse format

    Parameters
    ----------
    arr : ndarray
    kind : {'block', 'integer'}
    fill_value : NaN or another value
    dtype : np.dtype, optional
    copy : bool, default False

    Returns
    -------
    (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar)
    """
    arr = _sanitize_values(arr)

    if arr.ndim > 1:
        raise TypeError("expected dimension <= 1 data")

    if fill_value is None:
        fill_value = na_value_for_dtype(arr.dtype)

    if isna(fill_value):
        mask = notna(arr)
    else:
        # For str arrays in NumPy 1.12.0, operator!= below isn't
        # element-wise but just returns False if fill_value is not str,
        # so cast to object comparison to be safe
        if is_string_dtype(arr):
            arr = arr.astype(object)

        if is_object_dtype(arr.dtype):
            # element-wise equality check method in numpy doesn't treat
            # each element type, eg. 0, 0.0, and False are treated as
            # same. So we have to check the both of its type and value.
            mask = splib.make_mask_object_ndarray(arr, fill_value)
        else:
            mask = arr != fill_value

    length = len(arr)
    if length != len(mask):
        # the arr is a SparseArray
        indices = mask.sp_index.indices
    else:
        indices = mask.nonzero()[0].astype(np.int32)

    index = _make_index(length, indices, kind)
    sparsified_values = arr[mask]
    if dtype is not None:
        sparsified_values = astype_nansafe(sparsified_values, dtype=dtype)
    # TODO: copy
    return sparsified_values, index, fill_value
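# Illustrative only: the dense -> (values, positions) step that make_sparse
# performs, using plain NumPy instead of pandas' internal SparseIndex types.
import numpy as np
import pandas as pd

arr = np.array([0.0, np.nan, 3.0, np.nan, 5.0])
fill_value = np.nan

mask = pd.notna(arr) if pd.isna(fill_value) else arr != fill_value
indices = mask.nonzero()[0].astype(np.int32)  # positions of stored values
sparse_values = arr[mask]                     # the values actually kept
print(indices, sparse_values)                 # [0 2 4] [0. 3. 5.]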
def na_op(x, y):
    import pandas.core.computation.expressions as expressions

    try:
        result = expressions.evaluate(op, str_rep, x, y, **eval_kwargs)
    except TypeError:
        xrav = x.ravel()
        if isinstance(y, (np.ndarray, ABCSeries)):
            dtype = np.find_common_type([x.dtype, y.dtype], [])
            result = np.empty(x.size, dtype=dtype)
            yrav = y.ravel()
            mask = notna(xrav) & notna(yrav)
            xrav = xrav[mask]

            # we may need to manually
            # broadcast a 1 element array
            if yrav.shape != mask.shape:
                # capture the scalar before rebinding yrav, otherwise
                # item() would be called on the new uninitialized array
                fill = yrav.item()
                yrav = np.empty(mask.shape, dtype=yrav.dtype)
                yrav.fill(fill)

            yrav = yrav[mask]
            if np.prod(xrav.shape) and np.prod(yrav.shape):
                with np.errstate(all='ignore'):
                    result[mask] = op(xrav, yrav)
        elif hasattr(x, 'size'):
            result = np.empty(x.size, dtype=x.dtype)
            mask = notna(xrav)
            xrav = xrav[mask]
            if np.prod(xrav.shape):
                with np.errstate(all='ignore'):
                    result[mask] = op(xrav, y)
        else:
            raise TypeError("cannot perform operation {op} between "
                            "objects of type {x} and {y}".format(
                                op=name, x=type(x), y=type(y)))

        result, changed = maybe_upcast_putmask(result, ~mask, np.nan)
        result = result.reshape(x.shape)

    result = missing.fill_zeros(result, x, y, name, fill_zeros)

    return result
def nancorr(a, b, method='pearson', min_periods=None):
    """
    a, b: ndarrays
    """
    if len(a) != len(b):
        raise AssertionError('Operands to nancorr must have same size')

    if min_periods is None:
        min_periods = 1

    valid = notna(a) & notna(b)
    if not valid.all():
        a = a[valid]
        b = b[valid]

    if len(a) < min_periods:
        return np.nan

    f = get_corr_func(method)
    return f(a, b)
def na_op(x, y):
    import pandas.core.computation.expressions as expressions

    try:
        result = expressions.evaluate(op, str_rep, x, y, **eval_kwargs)
    except TypeError:
        if isinstance(y, (np.ndarray, ABCSeries, pd.Index)):
            dtype = find_common_type([x.dtype, y.dtype])
            result = np.empty(x.size, dtype=dtype)
            mask = notna(x) & notna(y)
            result[mask] = op(x[mask], com._values_from_object(y[mask]))
        else:
            assert isinstance(x, np.ndarray)
            result = np.empty(len(x), dtype=x.dtype)
            mask = notna(x)
            result[mask] = op(x[mask], y)

        result, changed = maybe_upcast_putmask(result, ~mask, np.nan)

    result = missing.fill_zeros(result, x, y, name, fill_zeros)
    return result
def _guess_time_format_for_array(arr):
    # Try to guess the format based on the first non-NaN element
    non_nan_elements = notna(arr).nonzero()[0]
    if len(non_nan_elements):
        element = arr[non_nan_elements[0]]
        for time_format in _time_formats:
            try:
                datetime.strptime(element, time_format)
                return time_format
            except ValueError:
                pass

    return None
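# Standalone sketch of the guessing loop above. The candidate format list is
# an assumption for illustration; the real _time_formats is defined elsewhere
# in pandas and may differ.
from datetime import datetime

candidate_formats = ["%H:%M", "%H%M", "%I:%M%p", "%I%M%p",
                     "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p"]

def guess_time_format(value):
    for fmt in candidate_formats:
        try:
            datetime.strptime(value, fmt)
            return fmt
        except ValueError:
            pass
    return None

print(guess_time_format("14:30:15"))  # '%H:%M:%S'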
def _validate(self):
    """Verify that the IntervalArray is valid.

    Checks that

    * closed is valid
    * left and right match lengths
    * left and right have the same missing values
    * left is always below right
    """
    if self.closed not in _VALID_CLOSED:
        raise ValueError("invalid option for 'closed': {closed}"
                         .format(closed=self.closed))
    if len(self.left) != len(self.right):
        raise ValueError('left and right must have the same length')
    left_mask = notna(self.left)
    right_mask = notna(self.right)
    if not (left_mask == right_mask).all():
        raise ValueError('missing values must be missing in the same '
                         'location both left and right sides')
    if not (self.left[left_mask] <= self.right[left_mask]).all():
        raise ValueError('left side of interval must be <= right side')
def _attempt_YYYYMMDD(arg, errors):
    """ try to parse the YYYYMMDD/%Y%m%d format, try to deal with NaT-like,
        arg is passed in as an object dtype, but could really be ints/strings
        with nan-like/or floats (e.g. with nan)

    Parameters
    ----------
    arg : passed value
    errors : 'raise','ignore','coerce'
    """

    def calc(carg):
        # calculate the actual result
        carg = carg.astype(object)
        parsed = parsing.try_parse_year_month_day(carg / 10000,
                                                  carg / 100 % 100,
                                                  carg % 100)
        return tslib.array_to_datetime(parsed, errors=errors)[0]

    def calc_with_mask(carg, mask):
        result = np.empty(carg.shape, dtype='M8[ns]')
        iresult = result.view('i8')
        iresult[~mask] = tslibs.iNaT
        masked_result = calc(carg[mask].astype(np.float64).astype(np.int64))
        result[mask] = masked_result.astype('M8[ns]')
        return result

    # try intlike / strings that are ints
    try:
        return calc(arg.astype(np.int64))
    except ValueError:
        pass

    # a float with actual np.nan
    try:
        carg = arg.astype(np.float64)
        return calc_with_mask(carg, notna(carg))
    except ValueError:
        pass

    # string with NaN-like
    try:
        mask = ~algorithms.isin(arg, list(tslib.nat_strings))
        return calc_with_mask(arg, mask)
    except ValueError:
        pass

    return None
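# Worked example of the YYYYMMDD arithmetic used by calc() above
# (floor division shown here for clarity on integer input).
import numpy as np

carg = np.array([20070301, 20081215], dtype=np.int64)
years = carg // 10000        # [2007 2008]
months = carg // 100 % 100   # [ 3 12]
days = carg % 100            # [ 1 15]
print(years, months, days)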
def na_op(x, y):
    try:
        with np.errstate(invalid='ignore'):
            result = op(x, y)
    except TypeError:
        xrav = x.ravel()
        result = np.empty(x.size, dtype=bool)
        if isinstance(y, (np.ndarray, ABCSeries)):
            yrav = y.ravel()
            mask = notna(xrav) & notna(yrav)
            result[mask] = op(np.array(list(xrav[mask])),
                              np.array(list(yrav[mask])))
        else:
            mask = notna(xrav)
            result[mask] = op(np.array(list(xrav[mask])), y)

        if op == operator.ne:  # pragma: no cover
            np.putmask(result, ~mask, True)
        else:
            np.putmask(result, ~mask, False)
        result = result.reshape(x.shape)

    return result
def na_op(x, y):
    import pandas.core.computation.expressions as expressions

    try:
        result = expressions.evaluate(op, str_rep, x, y)
    except TypeError:
        xrav = x.ravel()
        result = np.empty(x.size, dtype=bool)
        if isinstance(y, np.ndarray):
            yrav = y.ravel()
            mask = notna(xrav) & notna(yrav)
            result[mask] = op(np.array(list(xrav[mask])),
                              np.array(list(yrav[mask])))
        else:
            mask = notna(xrav)
            result[mask] = op(np.array(list(xrav[mask])), y)

        if op == operator.ne:  # pragma: no cover
            np.putmask(result, ~mask, True)
        else:
            np.putmask(result, ~mask, False)
        result = result.reshape(x.shape)

    return result
def test_timedelta_other_units(self):
    idx = pd.TimedeltaIndex(['1 days', 'NaT', '2 days'])
    exp = np.array([False, True, False])
    tm.assert_numpy_array_equal(isna(idx), exp)
    tm.assert_numpy_array_equal(notna(idx), ~exp)
    tm.assert_numpy_array_equal(isna(idx.values), exp)
    tm.assert_numpy_array_equal(notna(idx.values), ~exp)

    for dtype in ['timedelta64[D]', 'timedelta64[h]', 'timedelta64[m]',
                  'timedelta64[s]', 'timedelta64[ms]', 'timedelta64[us]',
                  'timedelta64[ns]']:
        values = idx.values.astype(dtype)

        exp = np.array([False, True, False])
        tm.assert_numpy_array_equal(isna(values), exp)
        tm.assert_numpy_array_equal(notna(values), ~exp)

        exp = pd.Series([False, True, False])
        s = pd.Series(values)
        tm.assert_series_equal(isna(s), exp)
        tm.assert_series_equal(notna(s), ~exp)
        s = pd.Series(values, dtype=object)
        tm.assert_series_equal(isna(s), exp)
        tm.assert_series_equal(notna(s), ~exp)
def test_datetime_other_units(self):
    idx = pd.DatetimeIndex(['2011-01-01', 'NaT', '2011-01-02'])
    exp = np.array([False, True, False])
    tm.assert_numpy_array_equal(isna(idx), exp)
    tm.assert_numpy_array_equal(notna(idx), ~exp)
    tm.assert_numpy_array_equal(isna(idx.values), exp)
    tm.assert_numpy_array_equal(notna(idx.values), ~exp)

    for dtype in ['datetime64[D]', 'datetime64[h]', 'datetime64[m]',
                  'datetime64[s]', 'datetime64[ms]', 'datetime64[us]',
                  'datetime64[ns]']:
        values = idx.values.astype(dtype)

        exp = np.array([False, True, False])
        tm.assert_numpy_array_equal(isna(values), exp)
        tm.assert_numpy_array_equal(notna(values), ~exp)

        exp = pd.Series([False, True, False])
        s = pd.Series(values)
        tm.assert_series_equal(isna(s), exp)
        tm.assert_series_equal(notna(s), ~exp)
        s = pd.Series(values, dtype=object)
        tm.assert_series_equal(isna(s), exp)
        tm.assert_series_equal(notna(s), ~exp)
def make_sparse(arr, kind='block', fill_value=None):
    """
    Convert ndarray to sparse format

    Parameters
    ----------
    arr : ndarray
    kind : {'block', 'integer'}
    fill_value : NaN or another value

    Returns
    -------
    (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar)
    """
    arr = _sanitize_values(arr)

    if arr.ndim > 1:
        raise TypeError("expected dimension <= 1 data")

    if fill_value is None:
        fill_value = na_value_for_dtype(arr.dtype)

    if isna(fill_value):
        mask = notna(arr)
    else:
        # For str arrays in NumPy 1.12.0, operator!= below isn't
        # element-wise but just returns False if fill_value is not str,
        # so cast to object comparison to be safe
        if is_string_dtype(arr):
            arr = arr.astype(object)

        mask = arr != fill_value

    length = len(arr)
    if length != mask.size:
        # the arr is a SparseArray
        indices = mask.sp_index.indices
    else:
        indices = mask.nonzero()[0].astype(np.int32)

    index = _make_index(length, indices, kind)
    sparsified_values = arr[mask]
    return sparsified_values, index, fill_value
def _reindex_columns(self, columns, method, copy, level, fill_value=None,
                     limit=None, takeable=False):
    if level is not None:
        raise TypeError('Reindex by level not supported for sparse')

    if notna(fill_value):
        raise NotImplementedError("'fill_value' argument is not supported")

    if limit:
        raise NotImplementedError("'limit' argument is not supported")

    if method is not None:
        raise NotImplementedError("'method' argument is not supported")

    # TODO: fill value handling
    sdict = {k: v for k, v in compat.iteritems(self) if k in columns}
    return self._constructor(
        sdict, index=self.index, columns=columns,
        default_fill_value=self._default_fill_value).__finalize__(self)
def take(self, indexer, allow_fill=False, fill_value=None):
    from pandas.api.extensions import take

    # we always fill with 1 internally
    # to avoid upcasting
    data_fill_value = 1 if isna(fill_value) else fill_value
    result = take(self._data, indexer, fill_value=data_fill_value,
                  allow_fill=allow_fill)

    mask = take(self._mask, indexer, fill_value=True,
                allow_fill=allow_fill)

    # if we are filling
    # we only fill where the indexer is null
    # not existing missing values
    # TODO(jreback) what if we have a non-na float as a fill value?
    if allow_fill and notna(fill_value):
        fill_mask = np.asarray(indexer) == -1
        result[fill_mask] = fill_value
        mask = mask ^ fill_mask

    return type(self)(result, mask, copy=False)
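# Usage sketch through the public nullable-integer API, which this take()
# implementation ultimately services (expected output inferred from the
# code above; exact repr varies across pandas versions).
import pandas as pd

arr = pd.array([1, None, 3], dtype="Int64")
print(arr.take([0, -1], allow_fill=True))                # [1, <NA>]
print(arr.take([0, -1], allow_fill=True, fill_value=7))  # [1, 7]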
def _add_timedeltalike_scalar(self, other):
    """
    Parameters
    ----------
    other : timedelta, Tick, np.timedelta64

    Returns
    -------
    result : ndarray[int64]
    """
    assert isinstance(self.freq, Tick)  # checked by calling function
    assert isinstance(other, (timedelta, np.timedelta64, Tick))

    if notna(other):
        # special handling for np.timedelta64("NaT"), avoid calling
        # _check_timedeltalike_freq_compat as that would raise TypeError
        other = self._check_timedeltalike_freq_compat(other)

    # Note: when calling parent class's _add_timedeltalike_scalar,
    # it will call delta_to_nanoseconds(delta).  Because delta here
    # is an integer, delta_to_nanoseconds will return it unchanged.
    ordinals = super(PeriodArray, self)._add_timedeltalike_scalar(other)
    return ordinals
def _reduce(self, name, skipna=True, **kwargs):
    data = self._data
    mask = self._mask

    # coerce to a nan-aware float if needed
    if mask.any():
        data = self._data.astype('float64')
        data[mask] = self._na_value

    op = getattr(nanops, 'nan' + name)
    result = op(data, axis=0, skipna=skipna, mask=mask)

    # if we have a boolean op, don't coerce
    if name in ['any', 'all']:
        pass

    # if we have a preservable numeric op,
    # provide coercion back to an integer type if possible
    elif name in ['sum', 'min', 'max', 'prod'] and notna(result):
        int_result = int(result)
        if int_result == result:
            result = int_result

    return result
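# Usage sketch with the public nullable-integer Series API; the reduction
# above is what lets sum() skip NA and hand back an integral result.
import pandas as pd

s = pd.Series([1, 2, None], dtype="Int64")
print(s.sum())   # 3 -- NA skipped, coerced back to an integer result
print(s.mean())  # 1.5 -- mean is not in the integer-preserving list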
def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False,
                   diagonal='hist', marker='.', density_kwds=None,
                   hist_kwds=None, range_padding=0.05, **kwds):
    """
    Draw a matrix of scatter plots.

    Parameters
    ----------
    frame : DataFrame
    alpha : float, optional
        amount of transparency applied
    figsize : (float,float), optional
        a tuple (width, height) in inches
    ax : Matplotlib axis object, optional
    grid : bool, optional
        setting this to True will show the grid
    diagonal : {'hist', 'kde'}
        pick between 'kde' and 'hist' for either Kernel Density Estimation or
        Histogram plot in the diagonal
    marker : str, optional
        Matplotlib marker type, default '.'
    hist_kwds : other plotting keyword arguments
        To be passed to hist function
    density_kwds : other plotting keyword arguments
        To be passed to kernel density estimate plot
    range_padding : float, optional
        relative extension of axis range in x and y with respect to
        (x_max - x_min) or (y_max - y_min), default 0.05
    kwds : other plotting keyword arguments
        To be passed to scatter function

    Examples
    --------
    >>> df = DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D'])
    >>> scatter_matrix(df, alpha=0.2)
    """

    df = frame._get_numeric_data()
    n = df.columns.size
    naxes = n * n
    fig, axes = _subplots(naxes=naxes, figsize=figsize, ax=ax,
                          squeeze=False)

    # no gaps between subplots
    fig.subplots_adjust(wspace=0, hspace=0)

    mask = notna(df)

    marker = _get_marker_compat(marker)

    hist_kwds = hist_kwds or {}
    density_kwds = density_kwds or {}

    # GH 14855
    kwds.setdefault('edgecolors', 'none')

    boundaries_list = []
    for a in df.columns:
        values = df[a].values[mask[a].values]
        rmin_, rmax_ = np.min(values), np.max(values)
        rdelta_ext = (rmax_ - rmin_) * range_padding / 2.
        boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))

    for i, a in zip(lrange(n), df.columns):
        for j, b in zip(lrange(n), df.columns):
            ax = axes[i, j]

            if i == j:
                values = df[a].values[mask[a].values]

                # Deal with the diagonal by drawing a histogram there.
                if diagonal == 'hist':
                    ax.hist(values, **hist_kwds)

                elif diagonal in ('kde', 'density'):
                    from scipy.stats import gaussian_kde
                    y = values
                    gkde = gaussian_kde(y)
                    ind = np.linspace(y.min(), y.max(), 1000)
                    ax.plot(ind, gkde.evaluate(ind), **density_kwds)

                ax.set_xlim(boundaries_list[i])

            else:
                common = (mask[a] & mask[b]).values

                ax.scatter(df[b][common], df[a][common],
                           marker=marker, alpha=alpha, **kwds)

                ax.set_xlim(boundaries_list[j])
                ax.set_ylim(boundaries_list[i])

            ax.set_xlabel(b)
            ax.set_ylabel(a)

            if j != 0:
                ax.yaxis.set_visible(False)
            if i != n - 1:
                ax.xaxis.set_visible(False)

    if len(df.columns) > 1:
        lim1 = boundaries_list[0]
        locs = axes[0][1].yaxis.get_majorticklocs()
        locs = locs[(lim1[0] <= locs) & (locs <= lim1[1])]
        adj = (locs - lim1[0]) / (lim1[1] - lim1[0])

        lim0 = axes[0][0].get_ylim()
        adj = adj * (lim0[1] - lim0[0]) + lim0[0]
        axes[0][0].yaxis.set_ticks(adj)

        if np.all(locs == locs.astype(int)):
            # if all ticks are int
            locs = locs.astype(int)
        axes[0][0].yaxis.set_ticklabels(locs)

    _set_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0)

    return axes
def notna(self):
    arr = SparseArray(notna(self.values.sp_values),
                      sparse_index=self.values.sp_index,
                      fill_value=notna(self.fill_value))
    return self._constructor(arr,
                             index=self.index).__finalize__(self)
def take(self, indices, allow_fill=False, fill_value=None, axis=None,
         **kwargs):
    """
    Take elements from the IntervalArray.

    Parameters
    ----------
    indices : sequence of integers
        Indices to be taken.

    allow_fill : bool, default False
        How to handle negative values in `indices`.

        * False: negative values in `indices` indicate positional indices
          from the right (the default). This is similar to
          :func:`numpy.take`.

        * True: negative values in `indices` indicate
          missing values. These values are set to `fill_value`. Any other
          negative values raise a ``ValueError``.

    fill_value : Interval or NA, optional
        Fill value to use for NA-indices when `allow_fill` is True.
        This may be ``None``, in which case the default NA value for
        the type, ``self.dtype.na_value``, is used.

        For many ExtensionArrays, there will be two representations of
        `fill_value`: a user-facing "boxed" scalar, and a low-level
        physical NA value. `fill_value` should be the user-facing version,
        and the implementation should handle translating that to the
        physical version for processing the take if necessary.

    axis : any, default None
        Present for compat with IntervalIndex; does nothing.

    Returns
    -------
    IntervalArray

    Raises
    ------
    IndexError
        When the indices are out of bounds for the array.
    ValueError
        When `indices` contains negative values other than ``-1``
        and `allow_fill` is True.
    """
    nv.validate_take(tuple(), kwargs)

    fill_left = fill_right = fill_value
    if allow_fill:
        if fill_value is None:
            fill_left = fill_right = self.left._na_value
        elif is_interval(fill_value):
            self._check_closed_matches(fill_value, name="fill_value")
            fill_left, fill_right = fill_value.left, fill_value.right
        elif not is_scalar(fill_value) and notna(fill_value):
            msg = ("'IntervalArray.fillna' only supports filling with a "
                   "'scalar pandas.Interval or NA'. "
                   f"Got a '{type(fill_value).__name__}' instead.")
            raise ValueError(msg)

    left_take = take(self.left, indices,
                     allow_fill=allow_fill, fill_value=fill_left)
    right_take = take(self.right, indices,
                      allow_fill=allow_fill, fill_value=fill_right)

    return self._shallow_copy(left_take, right_take)
def _guess_datetime_format_for_array(arr, **kwargs):
    # Try to guess the format based on the first non-NaN element
    non_nan_elements = notna(arr).nonzero()[0]
    if len(non_nan_elements):
        return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)
def get_median(x):
    mask = notna(x)
    if not skipna and not mask.all():
        return np.nan
    return np.nanmedian(x[mask])
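# Standalone sketch of the helper above; here skipna is an explicit argument
# instead of a closure variable captured from the enclosing nanmedian.
import numpy as np
import pandas as pd

def get_median(x, skipna=True):
    mask = pd.notna(x)
    if not skipna and not mask.all():
        return np.nan
    return np.nanmedian(x[mask])

print(get_median(np.array([1.0, np.nan, 3.0])))                # 2.0
print(get_median(np.array([1.0, np.nan, 3.0]), skipna=False))  # nan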
def stack(frame, level=-1, dropna=True):
    """
    Convert DataFrame to Series with multi-level Index. Columns become the
    second level of the resulting hierarchical index

    Returns
    -------
    stacked : Series
    """
    def factorize(index):
        if index.is_unique:
            return index, np.arange(len(index))
        codes, categories = _factorize_from_iterable(index)
        return categories, codes

    N, K = frame.shape

    # Will also convert negative level numbers and check if out of bounds.
    level_num = frame.columns._get_level_number(level)

    if isinstance(frame.columns, MultiIndex):
        return _stack_multi_columns(frame, level_num=level_num,
                                    dropna=dropna)
    elif isinstance(frame.index, MultiIndex):
        new_levels = list(frame.index.levels)
        new_codes = [lab.repeat(K) for lab in frame.index.codes]

        clev, clab = factorize(frame.columns)
        new_levels.append(clev)
        new_codes.append(np.tile(clab, N).ravel())

        new_names = list(frame.index.names)
        new_names.append(frame.columns.name)

        new_index = MultiIndex(levels=new_levels, codes=new_codes,
                               names=new_names, verify_integrity=False)
    else:
        levels, (ilab, clab) = zip(*map(factorize, (frame.index,
                                                    frame.columns)))
        codes = ilab.repeat(K), np.tile(clab, N).ravel()
        new_index = MultiIndex(levels=levels, codes=codes,
                               names=[frame.index.name, frame.columns.name],
                               verify_integrity=False)

    if frame._is_homogeneous_type:
        # For homogeneous EAs, frame.values will coerce to object. So
        # we concatenate instead.
        dtypes = list(frame.dtypes.values)
        dtype = dtypes[0]

        if is_extension_array_dtype(dtype):
            arr = dtype.construct_array_type()
            new_values = arr._concat_same_type([
                col._values for _, col in frame.iteritems()
            ])
            new_values = _reorder_for_extension_array_stack(new_values, N, K)
        else:
            # homogeneous, non-EA
            new_values = frame.values.ravel()
    else:
        # non-homogeneous
        new_values = frame.values.ravel()

    if dropna:
        mask = notna(new_values)
        new_values = new_values[mask]
        new_index = new_index[mask]

    return frame._constructor_sliced(new_values, index=new_index)
def stack(frame, level=-1, dropna=True):
    """
    Convert DataFrame to Series with multi-level Index. Columns become the
    second level of the resulting hierarchical index

    Returns
    -------
    stacked : Series
    """
    def factorize(index):
        if index.is_unique:
            return index, np.arange(len(index))
        codes, categories = factorize_from_iterable(index)
        return categories, codes

    N, K = frame.shape

    # Will also convert negative level numbers and check if out of bounds.
    level_num = frame.columns._get_level_number(level)

    if isinstance(frame.columns, MultiIndex):
        return _stack_multi_columns(frame, level_num=level_num,
                                    dropna=dropna)
    elif isinstance(frame.index, MultiIndex):
        new_levels = list(frame.index.levels)
        new_codes = [lab.repeat(K) for lab in frame.index.codes]

        clev, clab = factorize(frame.columns)
        new_levels.append(clev)
        new_codes.append(np.tile(clab, N).ravel())

        new_names = list(frame.index.names)
        new_names.append(frame.columns.name)

        new_index = MultiIndex(levels=new_levels, codes=new_codes,
                               names=new_names, verify_integrity=False)
    else:
        levels, (ilab, clab) = zip(*map(factorize, (frame.index,
                                                    frame.columns)))
        codes = ilab.repeat(K), np.tile(clab, N).ravel()
        new_index = MultiIndex(
            levels=levels,
            codes=codes,
            names=[frame.index.name, frame.columns.name],
            verify_integrity=False,
        )

    if not frame.empty and frame._is_homogeneous_type:
        # For homogeneous EAs, frame._values will coerce to object. So
        # we concatenate instead.
        dtypes = list(frame.dtypes._values)
        dtype = dtypes[0]

        if is_extension_array_dtype(dtype):
            arr = dtype.construct_array_type()
            new_values = arr._concat_same_type(
                [col._values for _, col in frame.items()])
            new_values = _reorder_for_extension_array_stack(new_values, N, K)
        else:
            # homogeneous, non-EA
            new_values = frame._values.ravel()
    else:
        # non-homogeneous
        new_values = frame._values.ravel()

    if dropna:
        mask = notna(new_values)
        new_values = new_values[mask]
        new_index = new_index[mask]

    return frame._constructor_sliced(new_values, index=new_index)
def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
             float_format=None, cols=None, header=True, index=True,
             index_label=None, mode='w', nanRep=None, encoding=None,
             compression=None, quoting=None, line_terminator='\n',
             chunksize=None, tupleize_cols=False, quotechar='"',
             date_format=None, doublequote=True, escapechar=None,
             decimal='.'):

    self.obj = obj

    if path_or_buf is None:
        path_or_buf = StringIO()

    self.path_or_buf = _expand_user(_stringify_path(path_or_buf))
    self.sep = sep
    self.na_rep = na_rep
    self.float_format = float_format
    self.decimal = decimal

    self.header = header
    self.index = index
    self.index_label = index_label
    self.mode = mode
    self.encoding = encoding
    self.compression = compression

    if quoting is None:
        quoting = csvlib.QUOTE_MINIMAL
    self.quoting = quoting

    if quoting == csvlib.QUOTE_NONE:
        # prevents crash in _csv
        quotechar = None
    self.quotechar = quotechar

    self.doublequote = doublequote
    self.escapechar = escapechar

    self.line_terminator = line_terminator

    self.date_format = date_format

    self.tupleize_cols = tupleize_cols
    self.has_mi_columns = (isinstance(obj.columns, MultiIndex) and
                           not self.tupleize_cols)

    # validate mi options
    if self.has_mi_columns:
        if cols is not None:
            raise TypeError("cannot specify cols with a MultiIndex on the "
                            "columns")

    if cols is not None:
        if isinstance(cols, Index):
            cols = cols.to_native_types(na_rep=na_rep,
                                        float_format=float_format,
                                        date_format=date_format,
                                        quoting=self.quoting)
        else:
            cols = list(cols)
        self.obj = self.obj.loc[:, cols]

    # update columns to include possible multiplicity of dupes
    # and make sure cols is just a list of labels
    cols = self.obj.columns
    if isinstance(cols, Index):
        cols = cols.to_native_types(na_rep=na_rep,
                                    float_format=float_format,
                                    date_format=date_format,
                                    quoting=self.quoting)
    else:
        cols = list(cols)

    # save it
    self.cols = cols

    # preallocate data 2d list
    self.blocks = self.obj._data.blocks
    ncols = sum(b.shape[0] for b in self.blocks)
    self.data = [None] * ncols

    if chunksize is None:
        chunksize = (100000 // (len(self.cols) or 1)) or 1
    self.chunksize = int(chunksize)

    self.data_index = obj.index
    if (isinstance(self.data_index, (DatetimeIndex, PeriodIndex)) and
            date_format is not None):
        self.data_index = Index([x.strftime(date_format) if notna(x) else ''
                                 for x in self.data_index])

    self.nlevels = getattr(self.data_index, 'nlevels', 1)
    if not index:
        self.nlevels = 0
def scatter_matrix(
    frame,
    alpha=0.5,
    figsize=None,
    ax=None,
    grid=False,
    diagonal="hist",
    marker=".",
    density_kwds=None,
    hist_kwds=None,
    range_padding=0.05,
    **kwds
):
    df = frame._get_numeric_data()
    n = df.columns.size
    naxes = n * n
    fig, axes = _subplots(naxes=naxes, figsize=figsize, ax=ax, squeeze=False)

    # no gaps between subplots
    fig.subplots_adjust(wspace=0, hspace=0)

    mask = notna(df)

    marker = _get_marker_compat(marker)

    hist_kwds = hist_kwds or {}
    density_kwds = density_kwds or {}

    # GH 14855
    kwds.setdefault("edgecolors", "none")

    boundaries_list = []
    for a in df.columns:
        values = df[a].values[mask[a].values]
        rmin_, rmax_ = np.min(values), np.max(values)
        rdelta_ext = (rmax_ - rmin_) * range_padding / 2.0
        boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))

    for i, a in enumerate(df.columns):
        for j, b in enumerate(df.columns):
            ax = axes[i, j]

            if i == j:
                values = df[a].values[mask[a].values]

                # Deal with the diagonal by drawing a histogram there.
                if diagonal == "hist":
                    ax.hist(values, **hist_kwds)

                elif diagonal in ("kde", "density"):
                    from scipy.stats import gaussian_kde

                    y = values
                    gkde = gaussian_kde(y)
                    ind = np.linspace(y.min(), y.max(), 1000)
                    ax.plot(ind, gkde.evaluate(ind), **density_kwds)

                ax.set_xlim(boundaries_list[i])

            else:
                common = (mask[a] & mask[b]).values

                ax.scatter(
                    df[b][common], df[a][common], marker=marker, alpha=alpha, **kwds
                )

                ax.set_xlim(boundaries_list[j])
                ax.set_ylim(boundaries_list[i])

            ax.set_xlabel(b)
            ax.set_ylabel(a)

            if j != 0:
                ax.yaxis.set_visible(False)
            if i != n - 1:
                ax.xaxis.set_visible(False)

    if len(df.columns) > 1:
        lim1 = boundaries_list[0]
        locs = axes[0][1].yaxis.get_majorticklocs()
        locs = locs[(lim1[0] <= locs) & (locs <= lim1[1])]
        adj = (locs - lim1[0]) / (lim1[1] - lim1[0])

        lim0 = axes[0][0].get_ylim()
        adj = adj * (lim0[1] - lim0[0]) + lim0[0]
        axes[0][0].yaxis.set_ticks(adj)

        if np.all(locs == locs.astype(int)):
            # if all ticks are int
            locs = locs.astype(int)
        axes[0][0].yaxis.set_ticklabels(locs)

    _set_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0)

    return axes
def take(self, indices, allow_fill=False, fill_value=None, axis=None,
         **kwargs):
    """
    Take elements from the IntervalArray.

    Parameters
    ----------
    indices : sequence of integers
        Indices to be taken.

    allow_fill : bool, default False
        How to handle negative values in `indices`.

        * False: negative values in `indices` indicate positional indices
          from the right (the default). This is similar to
          :func:`numpy.take`.

        * True: negative values in `indices` indicate
          missing values. These values are set to `fill_value`. Any other
          negative values raise a ``ValueError``.

    fill_value : Interval or NA, optional
        Fill value to use for NA-indices when `allow_fill` is True.
        This may be ``None``, in which case the default NA value for
        the type, ``self.dtype.na_value``, is used.

        For many ExtensionArrays, there will be two representations of
        `fill_value`: a user-facing "boxed" scalar, and a low-level
        physical NA value. `fill_value` should be the user-facing version,
        and the implementation should handle translating that to the
        physical version for processing the take if necessary.

    axis : any, default None
        Present for compat with IntervalIndex; does nothing.

    Returns
    -------
    IntervalArray

    Raises
    ------
    IndexError
        When the indices are out of bounds for the array.
    ValueError
        When `indices` contains negative values other than ``-1``
        and `allow_fill` is True.
    """
    from pandas.core.algorithms import take

    nv.validate_take(tuple(), kwargs)

    fill_left = fill_right = fill_value
    if allow_fill:
        if fill_value is None:
            fill_left = fill_right = self.left._na_value
        elif is_interval(fill_value):
            self._check_closed_matches(fill_value, name='fill_value')
            fill_left, fill_right = fill_value.left, fill_value.right
        elif not is_scalar(fill_value) and notna(fill_value):
            msg = ("'IntervalArray.fillna' only supports filling with a "
                   "'scalar pandas.Interval or NA'. Got a '{}' instead."
                   .format(type(fill_value).__name__))
            raise ValueError(msg)

    left_take = take(self.left, indices,
                     allow_fill=allow_fill, fill_value=fill_left)
    right_take = take(self.right, indices,
                      allow_fill=allow_fill, fill_value=fill_right)

    return self._shallow_copy(left_take, right_take)
def __init__(
    self,
    obj,
    path_or_buf: Optional[FilePathOrBuffer[str]] = None,
    sep: str = ",",
    na_rep: str = "",
    float_format: Optional[str] = None,
    cols=None,
    header: Union[bool, Sequence[Hashable]] = True,
    index: bool = True,
    index_label: Optional[Union[bool, Hashable, Sequence[Hashable]]] = None,
    mode: str = "w",
    encoding: Optional[str] = None,
    compression: Union[str, Mapping[str, str], None] = "infer",
    quoting: Optional[int] = None,
    line_terminator="\n",
    chunksize: Optional[int] = None,
    quotechar='"',
    date_format: Optional[str] = None,
    doublequote: bool = True,
    escapechar: Optional[str] = None,
    decimal=".",
):
    self.obj = obj

    if path_or_buf is None:
        path_or_buf = StringIO()

    # Extract compression mode as given, if dict
    compression, self.compression_args = get_compression_method(compression)

    self.path_or_buf, _, _, self.should_close = get_filepath_or_buffer(
        path_or_buf, encoding=encoding, compression=compression, mode=mode
    )
    self.sep = sep
    self.na_rep = na_rep
    self.float_format = float_format
    self.decimal = decimal

    self.header = header
    self.index = index
    self.index_label = index_label
    self.mode = mode
    if encoding is None:
        encoding = "utf-8"
    self.encoding = encoding
    self.compression = infer_compression(self.path_or_buf, compression)

    if quoting is None:
        quoting = csvlib.QUOTE_MINIMAL
    self.quoting = quoting

    if quoting == csvlib.QUOTE_NONE:
        # prevents crash in _csv
        quotechar = None
    self.quotechar = quotechar

    self.doublequote = doublequote
    self.escapechar = escapechar

    self.line_terminator = line_terminator or os.linesep

    self.date_format = date_format

    self.has_mi_columns = isinstance(obj.columns, ABCMultiIndex)

    # validate mi options
    if self.has_mi_columns:
        if cols is not None:
            raise TypeError(
                "cannot specify cols with a MultiIndex on the columns")

    if cols is not None:
        if isinstance(cols, ABCIndexClass):
            cols = cols.to_native_types(
                na_rep=na_rep,
                float_format=float_format,
                date_format=date_format,
                quoting=self.quoting,
            )
        else:
            cols = list(cols)
        self.obj = self.obj.loc[:, cols]

    # update columns to include possible multiplicity of dupes
    # and make sure cols is just a list of labels
    cols = self.obj.columns
    if isinstance(cols, ABCIndexClass):
        cols = cols.to_native_types(
            na_rep=na_rep,
            float_format=float_format,
            date_format=date_format,
            quoting=self.quoting,
        )
    else:
        cols = list(cols)

    # save it
    self.cols = cols

    # preallocate data 2d list
    ncols = self.obj.shape[-1]
    self.data = [None] * ncols

    if chunksize is None:
        chunksize = (100000 // (len(self.cols) or 1)) or 1
    self.chunksize = int(chunksize)

    self.data_index = obj.index
    if (isinstance(self.data_index, (ABCDatetimeIndex, ABCPeriodIndex))
            and date_format is not None):
        from pandas import Index

        self.data_index = Index([
            x.strftime(date_format) if notna(x) else ""
            for x in self.data_index
        ])

    self.nlevels = getattr(self.data_index, "nlevels", 1)
    if not index:
        self.nlevels = 0
def lreshape(data, groups, dropna=True, label=None):
    """
    Reshape long-format data to wide. Generalized inverse of DataFrame.pivot

    Parameters
    ----------
    data : DataFrame
    groups : dict
        {new_name : list_of_columns}
    dropna : boolean, default True

    Examples
    --------
    >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526],
    ...                      'team': ['Red Sox', 'Yankees'],
    ...                      'year1': [2007, 2007], 'year2': [2008, 2008]})
    >>> data
       hr1  hr2     team  year1  year2
    0  514  545  Red Sox   2007   2008
    1  573  526  Yankees   2007   2008

    >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']})
          team  year   hr
    0  Red Sox  2007  514
    1  Yankees  2007  573
    2  Red Sox  2008  545
    3  Yankees  2008  526

    Returns
    -------
    reshaped : DataFrame
    """
    if isinstance(groups, dict):
        keys = list(groups.keys())
        values = list(groups.values())
    else:
        keys, values = zip(*groups)

    all_cols = list(set.union(*[set(x) for x in values]))
    id_cols = list(data.columns.difference(all_cols))

    K = len(values[0])

    for seq in values:
        if len(seq) != K:
            raise ValueError('All column lists must be same length')

    import pandas.core.dtypes.concat as _concat

    mdata = {}
    pivot_cols = []

    for target, names in zip(keys, values):
        to_concat = [data[col].values for col in names]
        mdata[target] = _concat._concat_compat(to_concat)
        pivot_cols.append(target)

    for col in id_cols:
        mdata[col] = np.tile(data[col].values, K)

    if dropna:
        mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool)
        for c in pivot_cols:
            mask &= notna(mdata[c])
        if not mask.all():
            mdata = {k: v[mask] for k, v in compat.iteritems(mdata)}

    return data._constructor(mdata, columns=id_cols + pivot_cols)
def _masked_arith_op(x: np.ndarray, y, op):
    """
    If the given arithmetic operation fails, attempt it again on
    only the non-null elements of the input array(s).

    Parameters
    ----------
    x : np.ndarray
    y : np.ndarray, Series, Index
    op : binary operator
    """
    # For Series `x` is 1D so ravel() is a no-op; calling it anyway makes
    # the logic valid for both Series and DataFrame ops.
    xrav = x.ravel()
    assert isinstance(x, np.ndarray), type(x)
    if isinstance(y, np.ndarray):
        dtype = find_common_type([x.dtype, y.dtype])
        # error: Argument "dtype" to "empty" has incompatible type
        # "Union[dtype, ExtensionDtype]"; expected "Union[dtype, None, type,
        # _SupportsDtype, str, Tuple[Any, int], Tuple[Any, Union[int,
        # Sequence[int]]], List[Any], _DtypeDict, Tuple[Any, Any]]"
        result = np.empty(x.size, dtype=dtype)  # type: ignore[arg-type]

        if len(x) != len(y):
            raise ValueError(x.shape, y.shape)
        else:
            ymask = notna(y)

        # NB: ravel() is only safe since y is ndarray; for e.g. PeriodIndex
        #  we would get int64 dtype, see GH#19956
        yrav = y.ravel()
        mask = notna(xrav) & ymask.ravel()

        # See GH#5284, GH#5035, GH#19448 for historical reference
        if mask.any():
            with np.errstate(all="ignore"):
                result[mask] = op(xrav[mask], yrav[mask])

    else:
        if not is_scalar(y):
            raise TypeError(
                f"Cannot broadcast np.ndarray with operand of type {type(y)}"
            )

        # mask is only meaningful for x
        result = np.empty(x.size, dtype=x.dtype)
        mask = notna(xrav)

        # 1 ** np.nan is 1. So we have to unmask those.
        if op is pow:
            mask = np.where(x == 1, False, mask)
        elif op is rpow:
            mask = np.where(y == 1, False, mask)

        if mask.any():
            with np.errstate(all="ignore"):
                result[mask] = op(xrav[mask], y)

    result = maybe_upcast_putmask(result, ~mask)
    result = result.reshape(x.shape)  # 2D compat
    return result
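# Toy demonstration (not pandas internals) of the masking idea: apply the
# operation only where both operands are non-null, leave NaN elsewhere.
import numpy as np
import pandas as pd

x = np.array([1.0, np.nan, 3.0])
y = np.array([10.0, 20.0, np.nan])

result = np.full(x.shape, np.nan)
mask = pd.notna(x) & pd.notna(y)
result[mask] = x[mask] + y[mask]
print(result)  # [11. nan nan]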
def _valid_sp_values(self):
    sp_vals = self.sp_values
    mask = notna(sp_vals)
    return sp_vals[mask]