def na_op(x, y):
    try:
        result = expressions.evaluate(op, str_rep, x, y,
                                      raise_on_error=True, **eval_kwargs)
    except TypeError:
        xrav = x.ravel()
        if isinstance(y, (np.ndarray, ABCSeries)):
            dtype = np.find_common_type([x.dtype, y.dtype], [])
            result = np.empty(x.size, dtype=dtype)
            yrav = y.ravel()
            mask = notnull(xrav) & notnull(yrav)
            xrav = xrav[mask]
            yrav = yrav[mask]
            if np.prod(xrav.shape) and np.prod(yrav.shape):
                result[mask] = op(xrav, yrav)
        elif hasattr(x, 'size'):
            result = np.empty(x.size, dtype=x.dtype)
            mask = notnull(xrav)
            xrav = xrav[mask]
            if np.prod(xrav.shape):
                result[mask] = op(xrav, y)
        else:
            raise TypeError("cannot perform operation {op} between "
                            "objects of type {x} and {y}".format(
                                op=name, x=type(x), y=type(y)))

        result, changed = _maybe_upcast_putmask(result, ~mask, np.nan)
        result = result.reshape(x.shape)

    result = missing.fill_zeros(result, x, y, name, fill_zeros)

    return result
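# Hedged usage sketch (not pandas code): the masked-fallback pattern that
# na_op above applies when the fast vectorized path raises TypeError,
# reproduced standalone. The helper name `masked_op` is hypothetical.
import numpy as np
from pandas import notnull

def masked_op(op, x, y):
    # apply `op` only where both operands are non-null; NaN elsewhere
    result = np.full(x.shape, np.nan)
    mask = notnull(x) & notnull(y)
    result[mask] = op(x[mask], y[mask])
    return result

# masked_op(np.add, np.array([1., np.nan]), np.array([2., 3.]))
# -> array([3., nan])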
def test_datetime_other_units(self):
    idx = pd.DatetimeIndex(['2011-01-01', 'NaT', '2011-01-02'])
    exp = np.array([False, True, False])
    tm.assert_numpy_array_equal(isnull(idx), exp)
    tm.assert_numpy_array_equal(notnull(idx), ~exp)
    tm.assert_numpy_array_equal(isnull(idx.values), exp)
    tm.assert_numpy_array_equal(notnull(idx.values), ~exp)

    for dtype in ['datetime64[D]', 'datetime64[h]', 'datetime64[m]',
                  'datetime64[s]', 'datetime64[ms]', 'datetime64[us]',
                  'datetime64[ns]']:
        values = idx.values.astype(dtype)

        exp = np.array([False, True, False])
        tm.assert_numpy_array_equal(isnull(values), exp)
        tm.assert_numpy_array_equal(notnull(values), ~exp)

        exp = pd.Series([False, True, False])
        s = pd.Series(values)
        tm.assert_series_equal(isnull(s), exp)
        tm.assert_series_equal(notnull(s), ~exp)
        s = pd.Series(values, dtype=object)
        tm.assert_series_equal(isnull(s), exp)
        tm.assert_series_equal(notnull(s), ~exp)
def test_isnull_datetime(self):
    self.assertFalse(isnull(datetime.now()))
    self.assertTrue(notnull(datetime.now()))

    idx = date_range('1/1/1990', periods=20)
    exp = np.ones(len(idx), dtype=bool)
    tm.assert_numpy_array_equal(notnull(idx), exp)

    idx = np.asarray(idx)
    idx[0] = iNaT
    idx = DatetimeIndex(idx)
    mask = isnull(idx)
    self.assertTrue(mask[0])
    exp = np.array([True] + [False] * (len(idx) - 1), dtype=bool)
    self.assert_numpy_array_equal(mask, exp)

    # GH 9129
    pidx = idx.to_period(freq='M')
    mask = isnull(pidx)
    self.assertTrue(mask[0])
    exp = np.array([True] + [False] * (len(idx) - 1), dtype=bool)
    self.assert_numpy_array_equal(mask, exp)

    mask = isnull(pidx[1:])
    exp = np.zeros(len(mask), dtype=bool)
    self.assert_numpy_array_equal(mask, exp)
def test_timedelta_other_units(self):
    idx = pd.TimedeltaIndex(['1 days', 'NaT', '2 days'])
    exp = np.array([False, True, False])
    tm.assert_numpy_array_equal(isnull(idx), exp)
    tm.assert_numpy_array_equal(notnull(idx), ~exp)
    tm.assert_numpy_array_equal(isnull(idx.values), exp)
    tm.assert_numpy_array_equal(notnull(idx.values), ~exp)

    for dtype in ['timedelta64[D]', 'timedelta64[h]', 'timedelta64[m]',
                  'timedelta64[s]', 'timedelta64[ms]', 'timedelta64[us]',
                  'timedelta64[ns]']:
        values = idx.values.astype(dtype)

        exp = np.array([False, True, False])
        tm.assert_numpy_array_equal(isnull(values), exp)
        tm.assert_numpy_array_equal(notnull(values), ~exp)

        exp = pd.Series([False, True, False])
        s = pd.Series(values)
        tm.assert_series_equal(isnull(s), exp)
        tm.assert_series_equal(notnull(s), ~exp)
        s = pd.Series(values, dtype=object)
        tm.assert_series_equal(isnull(s), exp)
        tm.assert_series_equal(notnull(s), ~exp)
def test_notnull():
    assert notnull(1.)
    assert not notnull(None)
    assert not notnull(np.NaN)

    with cf.option_context("mode.use_inf_as_null", False):
        assert notnull(np.inf)
        assert notnull(-np.inf)

        arr = np.array([1.5, np.inf, 3.5, -np.inf])
        result = notnull(arr)
        assert result.all()

    with cf.option_context("mode.use_inf_as_null", True):
        assert not notnull(np.inf)
        assert not notnull(-np.inf)

        arr = np.array([1.5, np.inf, 3.5, -np.inf])
        result = notnull(arr)
        assert result.sum() == 2

    with cf.option_context("mode.use_inf_as_null", False):
        for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
                  tm.makeObjectSeries(), tm.makeTimeSeries(),
                  tm.makePeriodSeries()]:
            assert (isinstance(isnull(s), Series))
def test_period(self):
    idx = pd.PeriodIndex(['2011-01', 'NaT', '2012-01'], freq='M')
    exp = np.array([False, True, False])
    tm.assert_numpy_array_equal(isnull(idx), exp)
    tm.assert_numpy_array_equal(notnull(idx), ~exp)

    exp = pd.Series([False, True, False])
    s = pd.Series(idx)
    tm.assert_series_equal(isnull(s), exp)
    tm.assert_series_equal(notnull(s), ~exp)
    s = pd.Series(idx, dtype=object)
    tm.assert_series_equal(isnull(s), exp)
    tm.assert_series_equal(notnull(s), ~exp)
def _reindex_columns(self, columns, method, copy, level, fill_value=None,
                     limit=None, takeable=False):
    if level is not None:
        raise TypeError('Reindex by level not supported for sparse')

    if notnull(fill_value):
        raise NotImplementedError("'fill_value' argument is not supported")

    if limit:
        raise NotImplementedError("'limit' argument is not supported")

    if method is not None:
        raise NotImplementedError("'method' argument is not supported")

    # TODO: fill value handling
    sdict = dict((k, v) for k, v in compat.iteritems(self) if k in columns)
    return self._constructor(
        sdict, index=self.index, columns=columns,
        default_fill_value=self._default_fill_value).__finalize__(self)
def nancov(a, b, min_periods=None):
    if len(a) != len(b):
        raise AssertionError('Operands to nancov must have same size')

    if min_periods is None:
        min_periods = 1

    valid = notnull(a) & notnull(b)
    if not valid.all():
        a = a[valid]
        b = b[valid]

    if len(a) < min_periods:
        return np.nan

    return np.cov(a, b)[0, 1]
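# Hedged usage sketch for nancov: covariance over pairwise-complete
# observations only.
import numpy as np
a = np.array([1.0, 2.0, np.nan, 4.0])
b = np.array([2.0, np.nan, 6.0, 8.0])
nancov(a, b)                   # 9.0: computed from the pairs (1, 2), (4, 8)
nancov(a, b, min_periods=3)    # nan: only 2 valid pairs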
def make_sparse(arr, kind='block', fill_value=nan):
    """
    Convert ndarray to sparse format

    Parameters
    ----------
    arr : ndarray
    kind : {'block', 'integer'}
    fill_value : NaN or another value

    Returns
    -------
    (sparse_values, index) : (ndarray, SparseIndex)
    """

    arr = _sanitize_values(arr)

    if arr.ndim > 1:
        raise TypeError("expected dimension <= 1 data")

    if isnull(fill_value):
        mask = notnull(arr)
    else:
        mask = arr != fill_value

    length = len(arr)
    if length != mask.size:
        # the arr is a SparseArray
        indices = mask.sp_index.indices
    else:
        indices = np.arange(length, dtype=np.int32)[mask]

    index = _make_index(length, indices, kind)
    sparsified_values = arr[mask]
    return sparsified_values, index
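# Hedged usage sketch for make_sparse with the default NaN fill_value:
# NaN positions are dropped and the remaining positions go into the index.
import numpy as np
arr = np.array([1.0, np.nan, np.nan, 3.0])
sp_values, sp_index = make_sparse(arr, kind='integer')
# sp_values -> array([1., 3.]); sp_index.indices -> [0, 3]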
def _validate(self):
    """
    Verify that the IntervalIndex is valid.
    """
    if self.closed not in _VALID_CLOSED:
        raise ValueError("invalid options for 'closed': %s" % self.closed)
    if len(self.left) != len(self.right):
        raise ValueError('left and right must have the same length')
    left_mask = notnull(self.left)
    right_mask = notnull(self.right)
    if not (left_mask == right_mask).all():
        raise ValueError('missing values must be missing in the same '
                         'location both left and right sides')
    if not (self.left[left_mask] <= self.right[left_mask]).all():
        raise ValueError('left side of interval must be <= right side')
    self._mask = ~left_mask
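# Hedged illustration of the invariants _validate enforces, via the public
# IntervalIndex.from_arrays constructor (assumed to run this validation):
# missing values must be aligned on both sides, and left <= right.
import numpy as np
import pandas as pd
pd.IntervalIndex.from_arrays([0, np.nan, 2], [1, np.nan, 3])   # OK
# pd.IntervalIndex.from_arrays([0, np.nan, 2], [1, 4, 3])      # ValueError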
def stack(frame, level=-1, dropna=True):
    """
    Convert DataFrame to Series with multi-level Index. Columns become the
    second level of the resulting hierarchical index

    Returns
    -------
    stacked : Series
    """

    def factorize(index):
        if index.is_unique:
            return index, np.arange(len(index))
        codes, categories = _factorize_from_iterable(index)
        return categories, codes

    N, K = frame.shape
    if isinstance(frame.columns, MultiIndex):
        if frame.columns._reference_duplicate_name(level):
            msg = ("Ambiguous reference to {0}. The column "
                   "names are not unique.".format(level))
            raise ValueError(msg)

    # Will also convert negative level numbers and check if out of bounds.
    level_num = frame.columns._get_level_number(level)

    if isinstance(frame.columns, MultiIndex):
        return _stack_multi_columns(frame, level_num=level_num,
                                    dropna=dropna)
    elif isinstance(frame.index, MultiIndex):
        new_levels = list(frame.index.levels)
        new_labels = [lab.repeat(K) for lab in frame.index.labels]

        clev, clab = factorize(frame.columns)
        new_levels.append(clev)
        new_labels.append(np.tile(clab, N).ravel())
        new_names = list(frame.index.names)
        new_names.append(frame.columns.name)

        new_index = MultiIndex(levels=new_levels, labels=new_labels,
                               names=new_names, verify_integrity=False)
    else:
        levels, (ilab, clab) = zip(*map(factorize, (frame.index,
                                                    frame.columns)))
        labels = ilab.repeat(K), np.tile(clab, N).ravel()
        new_index = MultiIndex(levels=levels, labels=labels,
                               names=[frame.index.name, frame.columns.name],
                               verify_integrity=False)

    new_values = frame.values.ravel()
    if dropna:
        mask = notnull(new_values)
        new_values = new_values[mask]
        new_index = new_index[mask]

    return Series(new_values, index=new_index)
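# Hedged usage sketch for stack, via its public entry point DataFrame.stack:
# a 2x2 frame becomes a Series indexed by (row label, column label).
import pandas as pd
df = pd.DataFrame([[1, 2], [3, 4]], index=['r1', 'r2'], columns=['a', 'b'])
df.stack()
# r1  a    1
#     b    2
# r2  a    3
#     b    4
# dtype: int64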
def nancorr(a, b, method='pearson', min_periods=None):
    """
    a, b: ndarrays
    """
    if len(a) != len(b):
        raise AssertionError('Operands to nancorr must have same size')

    if min_periods is None:
        min_periods = 1

    valid = notnull(a) & notnull(b)
    if not valid.all():
        a = a[valid]
        b = b[valid]

    if len(a) < min_periods:
        return np.nan

    f = get_corr_func(method)
    return f(a, b)
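# Hedged usage sketch for nancorr: correlation over pairwise-complete
# observations; min_periods guards against too few valid pairs.
import numpy as np
a = np.array([1.0, 2.0, np.nan, 4.0])
b = np.array([2.0, 4.0, 6.0, 8.0])
nancorr(a, b)                    # 1.0: pairs at indices 0, 1, 3 are linear
nancorr(a, b, min_periods=4)     # nan: only 3 valid pairs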
def na_op(x, y):
    try:
        result = op(x, y)
    except TypeError:
        xrav = x.ravel()
        result = np.empty(x.size, dtype=x.dtype)
        if isinstance(y, (np.ndarray, ABCSeries)):
            yrav = y.ravel()
            mask = notnull(xrav) & notnull(yrav)
            result[mask] = op(np.array(list(xrav[mask])),
                              np.array(list(yrav[mask])))
        else:
            mask = notnull(xrav)
            result[mask] = op(np.array(list(xrav[mask])), y)

        if op == operator.ne:  # pragma: no cover
            np.putmask(result, ~mask, True)
        else:
            np.putmask(result, ~mask, False)
        result = result.reshape(x.shape)

    return result
def _guess_time_format_for_array(arr):
    # Try to guess the format based on the first non-NaN element
    non_nan_elements = notnull(arr).nonzero()[0]
    if len(non_nan_elements):
        element = arr[non_nan_elements[0]]
        for time_format in _time_formats:
            try:
                datetime.strptime(element, time_format)
                return time_format
            except ValueError:
                pass

    return None
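# Hedged usage sketch: the time-format guess keys off the first non-null
# element, assuming '%H:%M:%S' is among the module's _time_formats.
import numpy as np
arr = np.array([np.nan, '13:45:00'], dtype=object)
_guess_time_format_for_array(arr)   # -> '%H:%M:%S'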
def test_isnull_datetime():
    assert (not isnull(datetime.now()))
    assert notnull(datetime.now())

    idx = date_range('1/1/1990', periods=20)
    assert (notnull(idx).all())

    idx = np.asarray(idx)
    idx[0] = iNaT
    idx = DatetimeIndex(idx)
    mask = isnull(idx)
    assert (mask[0])
    assert (not mask[1:].any())

    # GH 9129
    pidx = idx.to_period(freq='M')
    mask = isnull(pidx)
    assert (mask[0])
    assert (not mask[1:].any())

    mask = isnull(pidx[1:])
    assert (not mask.any())
def na_op(x, y):
    try:
        result = expressions.evaluate(op, str_rep, x, y,
                                      raise_on_error=True, **eval_kwargs)
    except TypeError:

        # TODO: might need to find_common_type here?
        result = np.empty(len(x), dtype=x.dtype)
        mask = notnull(x)
        result[mask] = op(x[mask], y)
        result, changed = _maybe_upcast_putmask(result, ~mask, np.nan)

    result = missing.fill_zeros(result, x, y, name, fill_zeros)
    return result
def na_op(x, y):
    try:
        result = expressions.evaluate(op, str_rep, x, y,
                                      raise_on_error=True)
    except TypeError:
        xrav = x.ravel()
        result = np.empty(x.size, dtype=bool)
        if isinstance(y, np.ndarray):
            yrav = y.ravel()
            mask = notnull(xrav) & notnull(yrav)
            result[mask] = op(np.array(list(xrav[mask])),
                              np.array(list(yrav[mask])))
        else:
            mask = notnull(xrav)
            result[mask] = op(np.array(list(xrav[mask])), y)

        if op == operator.ne:  # pragma: no cover
            np.putmask(result, ~mask, True)
        else:
            np.putmask(result, ~mask, False)
        result = result.reshape(x.shape)

    return result
def na_op(x, y):
    try:
        result = expressions.evaluate(op, str_rep, x, y,
                                      raise_on_error=True, **eval_kwargs)
    except TypeError:
        if isinstance(y, (np.ndarray, ABCSeries, pd.Index)):
            dtype = np.find_common_type([x.dtype, y.dtype], [])
            result = np.empty(x.size, dtype=dtype)
            mask = notnull(x) & notnull(y)
            result[mask] = op(x[mask], _values_from_object(y[mask]))
        elif isinstance(x, np.ndarray):
            result = np.empty(len(x), dtype=x.dtype)
            mask = notnull(x)
            result[mask] = op(x[mask], y)
        else:
            raise TypeError("{typ} cannot perform the operation "
                            "{op}".format(typ=type(x).__name__,
                                          op=str_rep))

        result, changed = _maybe_upcast_putmask(result, ~mask, np.nan)

    result = missing.fill_zeros(result, x, y, name, fill_zeros)
    return result
def _attempt_YYYYMMDD(arg, errors):
    """ try to parse the YYYYMMDD/%Y%m%d format, try to deal with NaT-like,
        arg is a passed in as an object dtype, but could really be ints/strings
        with nan-like/or floats (e.g. with nan)

    Parameters
    ----------
    arg : passed value
    errors : 'raise','ignore','coerce'
    """

    def calc(carg):
        # calculate the actual result
        carg = carg.astype(object)
        parsed = lib.try_parse_year_month_day(carg / 10000,
                                              carg / 100 % 100,
                                              carg % 100)
        return tslib.array_to_datetime(parsed, errors=errors)

    def calc_with_mask(carg, mask):
        result = np.empty(carg.shape, dtype='M8[ns]')
        iresult = result.view('i8')
        iresult[~mask] = tslib.iNaT
        result[mask] = calc(carg[mask].astype(np.float64).astype(np.int64)).\
            astype('M8[ns]')
        return result

    # try intlike / strings that are ints
    try:
        return calc(arg.astype(np.int64))
    except:
        pass

    # a float with actual np.nan
    try:
        carg = arg.astype(np.float64)
        return calc_with_mask(carg, notnull(carg))
    except:
        pass

    # string with NaN-like
    try:
        mask = ~lib.ismember(arg, tslib._nat_strings)
        return calc_with_mask(arg, mask)
    except:
        pass

    return None
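# Hedged usage sketch for _attempt_YYYYMMDD: the int path fails on NaN, so
# the float path masks NaN slots to NaT and parses the rest. (Private
# helper; behavior assumed from the code above.)
import numpy as np
arg = np.array([20110101.0, np.nan, 20110102.0], dtype=object)
_attempt_YYYYMMDD(arg, errors='raise')
# -> ['2011-01-01', 'NaT', '2011-01-02'] as datetime64[ns]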
def cumsum(self, axis=0, *args, **kwargs):
    """
    Cumulative sum of values. Preserves locations of NaN values

    Returns
    -------
    cumsum : Series
    """
    nv.validate_cumsum(args, kwargs)

    # TODO: gh-12855 - return a SparseArray here
    if notnull(self.fill_value):
        return self.to_dense().cumsum()

    # TODO: what if sp_values contains NaN??
    return SparseArray(self.sp_values.cumsum(),
                       sparse_index=self.sp_index,
                       fill_value=self.fill_value)
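# Hedged usage sketch for the sparse cumsum above: with the default NaN
# fill_value the sparse branch is taken and NaN positions are preserved.
import numpy as np
import pandas as pd
sp = pd.SparseArray([1.0, np.nan, 2.0, np.nan, 3.0])
sp.cumsum().to_dense()
# -> [1.0, nan, 3.0, nan, 6.0]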
def make_sparse(arr, kind='block', fill_value=None):
    """
    Convert ndarray to sparse format

    Parameters
    ----------
    arr : ndarray
    kind : {'block', 'integer'}
    fill_value : NaN or another value

    Returns
    -------
    (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar)
    """

    arr = _sanitize_values(arr)

    if arr.ndim > 1:
        raise TypeError("expected dimension <= 1 data")

    if fill_value is None:
        fill_value = na_value_for_dtype(arr.dtype)

    if isnull(fill_value):
        mask = notnull(arr)
    else:
        # For str arrays in NumPy 1.12.0, operator!= below isn't
        # element-wise but just returns False if fill_value is not str,
        # so cast to object comparison to be safe
        if is_string_dtype(arr):
            arr = arr.astype(object)

        mask = arr != fill_value

    length = len(arr)
    if length != mask.size:
        # the arr is a SparseArray
        indices = mask.sp_index.indices
    else:
        indices = mask.nonzero()[0].astype(np.int32)

    index = _make_index(length, indices, kind)
    sparsified_values = arr[mask]
    return sparsified_values, index, fill_value
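# Hedged usage sketch for this make_sparse variant: with an explicit
# integer fill_value of 0, only the nonzero entries are stored, and the
# fill value is returned alongside the values and index.
import numpy as np
arr = np.array([0, 2, 0, 3], dtype=np.int64)
sp_values, sp_index, fill = make_sparse(arr, kind='integer', fill_value=0)
# sp_values -> array([2, 3]); sp_index.indices -> [1, 3]; fill -> 0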
def lreshape(data, groups, dropna=True, label=None):
    """
    Reshape long-format data to wide. Generalized inverse of DataFrame.pivot

    Parameters
    ----------
    data : DataFrame
    groups : dict
        {new_name : list_of_columns}
    dropna : boolean, default True

    Examples
    --------
    >>> import pandas as pd
    >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526],
    ...                      'team': ['Red Sox', 'Yankees'],
    ...                      'year1': [2007, 2008], 'year2': [2008, 2008]})
    >>> data
       hr1  hr2     team  year1  year2
    0  514  545  Red Sox   2007   2008
    1  573  526  Yankees   2007   2008

    >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']})
          team   hr  year
    0  Red Sox  514  2007
    1  Yankees  573  2007
    2  Red Sox  545  2008
    3  Yankees  526  2008

    Returns
    -------
    reshaped : DataFrame
    """
    if isinstance(groups, dict):
        keys = list(groups.keys())
        values = list(groups.values())
    else:
        keys, values = zip(*groups)

    all_cols = list(set.union(*[set(x) for x in values]))
    id_cols = list(data.columns.difference(all_cols))

    K = len(values[0])

    for seq in values:
        if len(seq) != K:
            raise ValueError('All column lists must be same length')

    mdata = {}
    pivot_cols = []

    for target, names in zip(keys, values):
        to_concat = [data[col].values for col in names]
        mdata[target] = _concat._concat_compat(to_concat)
        pivot_cols.append(target)

    for col in id_cols:
        mdata[col] = np.tile(data[col].values, K)

    if dropna:
        mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool)
        for c in pivot_cols:
            mask &= notnull(mdata[c])
        if not mask.all():
            mdata = dict((k, v[mask]) for k, v in compat.iteritems(mdata))

    return DataFrame(mdata, columns=id_cols + pivot_cols)
def isnotnull(self):
    arr = SparseArray(notnull(self.values.sp_values),
                      sparse_index=self.values.sp_index,
                      fill_value=notnull(self.fill_value))
    return self._constructor(arr, index=self.index).__finalize__(self)
def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False,
                   diagonal='hist', marker='.', density_kwds=None,
                   hist_kwds=None, range_padding=0.05, **kwds):
    """
    Draw a matrix of scatter plots.

    Parameters
    ----------
    frame : DataFrame
    alpha : float, optional
        amount of transparency applied
    figsize : (float,float), optional
        a tuple (width, height) in inches
    ax : Matplotlib axis object, optional
    grid : bool, optional
        setting this to True will show the grid
    diagonal : {'hist', 'kde'}
        pick between 'kde' and 'hist' for either Kernel Density Estimation
        or Histogram plot in the diagonal
    marker : str, optional
        Matplotlib marker type, default '.'
    hist_kwds : other plotting keyword arguments
        To be passed to hist function
    density_kwds : other plotting keyword arguments
        To be passed to kernel density estimate plot
    range_padding : float, optional
        relative extension of axis range in x and y with respect to
        (x_max - x_min) or (y_max - y_min), default 0.05
    kwds : other plotting keyword arguments
        To be passed to scatter function

    Examples
    --------
    >>> df = DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D'])
    >>> scatter_matrix(df, alpha=0.2)
    """

    df = frame._get_numeric_data()
    n = df.columns.size
    naxes = n * n
    fig, axes = _subplots(naxes=naxes, figsize=figsize, ax=ax,
                          squeeze=False)

    # no gaps between subplots
    fig.subplots_adjust(wspace=0, hspace=0)

    mask = notnull(df)

    marker = _get_marker_compat(marker)

    hist_kwds = hist_kwds or {}
    density_kwds = density_kwds or {}

    # GH 14855
    kwds.setdefault('edgecolors', 'none')

    boundaries_list = []
    for a in df.columns:
        values = df[a].values[mask[a].values]
        rmin_, rmax_ = np.min(values), np.max(values)
        rdelta_ext = (rmax_ - rmin_) * range_padding / 2.
        boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))

    for i, a in zip(lrange(n), df.columns):
        for j, b in zip(lrange(n), df.columns):
            ax = axes[i, j]

            if i == j:
                values = df[a].values[mask[a].values]

                # Deal with the diagonal by drawing a histogram there.
                if diagonal == 'hist':
                    ax.hist(values, **hist_kwds)

                elif diagonal in ('kde', 'density'):
                    from scipy.stats import gaussian_kde
                    y = values
                    gkde = gaussian_kde(y)
                    ind = np.linspace(y.min(), y.max(), 1000)
                    ax.plot(ind, gkde.evaluate(ind), **density_kwds)

                ax.set_xlim(boundaries_list[i])

            else:
                common = (mask[a] & mask[b]).values

                ax.scatter(df[b][common], df[a][common],
                           marker=marker, alpha=alpha, **kwds)

                ax.set_xlim(boundaries_list[j])
                ax.set_ylim(boundaries_list[i])

            ax.set_xlabel(b)
            ax.set_ylabel(a)

            if j != 0:
                ax.yaxis.set_visible(False)
            if i != n - 1:
                ax.xaxis.set_visible(False)

    if len(df.columns) > 1:
        lim1 = boundaries_list[0]
        locs = axes[0][1].yaxis.get_majorticklocs()
        locs = locs[(lim1[0] <= locs) & (locs <= lim1[1])]
        adj = (locs - lim1[0]) / (lim1[1] - lim1[0])

        lim0 = axes[0][0].get_ylim()
        adj = adj * (lim0[1] - lim0[0]) + lim0[0]
        axes[0][0].yaxis.set_ticks(adj)

        if np.all(locs == locs.astype(int)):
            # if all ticks are int
            locs = locs.astype(int)
        axes[0][0].yaxis.set_ticklabels(locs)

    _set_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0)

    return axes
def _guess_datetime_format_for_array(arr, **kwargs):
    # Try to guess the format based on the first non-NaN element
    non_nan_elements = notnull(arr).nonzero()[0]
    if len(non_nan_elements):
        return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)
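# Hedged usage sketch: like the time-format helper above, only the first
# non-null element is inspected.
import numpy as np
arr = np.array([np.nan, '2011-12-30 00:00:00'], dtype=object)
_guess_datetime_format_for_array(arr)   # -> '%Y-%m-%d %H:%M:%S'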
def get_median(x):
    mask = notnull(x)
    if not skipna and not mask.all():
        return np.nan
    return algos.median(_values_from_object(x[mask]))
def _valid_sp_values(self):
    sp_vals = self.sp_values
    mask = notnull(sp_vals)
    return sp_vals[mask]