def ols_results(self):
    """
    Fit one OLS regression per data column against the lagged data.

    The regressions are::

        x_1 ~ L(X)
        x_2 ~ L(X)
        ...
        x_k ~ L(X)

    where X = [x_1, x_2, ..., x_k] and L(X) represents the columns of X
    lagged 1, 2, ..., n lags (n is the user-provided number of lags).

    Returns
    -------
    dict
        Maps each column of ``self._data`` to the object returned by
        ``ols`` for that column.
    """
    from pandas.stats.api import ols

    # All regressions share the same regressors: every lagged series for
    # lags 1..self._p, keyed by the name built from (lag, column).
    regressors = {}
    for lag in range(1, self._p + 1):
        for col, series in compat.iteritems(self._lagged_data[lag]):
            regressors[_make_param_name(lag, col)] = series

    fits = {}
    for col, y in compat.iteritems(self._data):
        fits[col] = ols(y=y, x=regressors, intercept=self._intercept)
    return fits
def stack_sparse_frame(frame):
    """
    Stack the stored values of every column of a sparse frame into a
    single-column DataFrame indexed by (row label, column label).

    Only makes sense when fill_value is NaN

    Raises
    ------
    TypeError
        If any column has a fill value that is not NaN.
    """
    counts = [col.sp_index.npoints for _, col in compat.iteritems(frame)]
    total = sum(counts)

    # this is pretty fast
    # Column position repeated once per stored point of that column.
    minor_labels = np.repeat(np.arange(len(frame.columns)), counts)

    row_positions = []
    stored_values = []
    # TODO: Figure out whether this can be reached.
    # I think this currently can't be reached because you can't build a
    # SparseDataFrame with a non-np.NaN fill value (fails earlier).
    for _, col in compat.iteritems(frame):
        if not np.isnan(col.fill_value):
            raise TypeError('This routine assumes NaN fill value')

        row_positions.append(col.sp_index.to_int_index().indices)
        stored_values.append(col.sp_values)

    major_labels = np.concatenate(row_positions)
    flat_values = np.concatenate(stored_values)

    stacked_index = MultiIndex(levels=[frame.index, frame.columns],
                               labels=[major_labels, minor_labels],
                               verify_integrity=False)
    stacked = DataFrame(flat_values.reshape((total, 1)),
                        index=stacked_index, columns=['foo'])
    return stacked.sortlevel(level=0)
def test_replace_input_formats_listlike(self):
    """DataFrame.replace with dict/dict, scalar/dict and list/list inputs."""
    # both dicts
    to_rep = {'A': np.nan, 'B': 0, 'C': ''}
    values = {'A': 0, 'B': -1, 'C': 'missing'}
    df = DataFrame({'A': [np.nan, 0, np.inf],
                    'B': [0, 2, 5],
                    'C': ['', 'asdf', 'fd']})
    filled = df.replace(to_rep, values)
    # Column-by-column Series.replace must agree with the frame-level call.
    expected = {}
    for name, column in compat.iteritems(df):
        expected[name] = column.replace(to_rep[name], values[name])
    assert_frame_equal(filled, DataFrame(expected))

    result = df.replace([0, 2, 5], [5, 2, 0])
    expected = DataFrame({'A': [np.nan, 5, np.inf],
                          'B': [5, 2, 0],
                          'C': ['', 'asdf', 'fd']})
    assert_frame_equal(result, expected)

    # scalar to dict
    values = {'A': 0, 'B': -1, 'C': 'missing'}
    df = DataFrame({'A': [np.nan, 0, np.nan],
                    'B': [0, 2, 5],
                    'C': ['', 'asdf', 'fd']})
    filled = df.replace(np.nan, values)
    expected = {}
    for name, column in compat.iteritems(df):
        expected[name] = column.replace(np.nan, values[name])
    assert_frame_equal(filled, DataFrame(expected))

    # list to list
    to_rep = [np.nan, 0, '']
    values = [-2, -1, 'missing']
    result = df.replace(to_rep, values)
    expected = df.copy()
    for old, new in zip(to_rep, values):
        expected.replace(old, new, inplace=True)
    assert_frame_equal(result, expected)

    # Mismatched list lengths are rejected.
    with pytest.raises(ValueError):
        df.replace(to_rep, values[1:])
def test_reindex(self):
    """Exercise DataFrame.reindex on rows, including empty/corner cases."""

    def check_against_source(reindexed):
        # Every value either matches the source frame at the same label
        # (NaN matching NaN) or is NaN when the label is new.
        for col in reindexed.columns:
            for idx, val in compat.iteritems(reindexed[col]):
                if idx not in self.frame.index:
                    self.assertTrue(np.isnan(val))
                elif np.isnan(val):
                    self.assertTrue(np.isnan(self.frame[col][idx]))
                else:
                    self.assertEqual(val, self.frame[col][idx])

        for col, series in compat.iteritems(reindexed):
            self.assertTrue(tm.equalContents(series.index,
                                             reindexed.index))

    newFrame = self.frame.reindex(self.ts1.index)
    check_against_source(newFrame)

    emptyFrame = self.frame.reindex(Index([]))
    self.assertEqual(len(emptyFrame.index), 0)

    # Cython code should be unit-tested directly
    nonContigFrame = self.frame.reindex(self.ts1.index[::2])
    check_against_source(nonContigFrame)

    # corner cases

    # Same index, copies values but not index if copy=False
    newFrame = self.frame.reindex(self.frame.index, copy=False)
    self.assertIs(newFrame.index, self.frame.index)

    # length zero
    newFrame = self.frame.reindex([])
    self.assertTrue(newFrame.empty)
    self.assertEqual(len(newFrame.columns), len(self.frame.columns))

    # length zero with columns reindexed with non-empty index
    newFrame = self.frame.reindex([])
    newFrame = newFrame.reindex(self.frame.index)
    self.assertEqual(len(newFrame.index), len(self.frame.index))
    self.assertEqual(len(newFrame.columns), len(self.frame.columns))

    # pass non-Index
    newFrame = self.frame.reindex(list(self.ts1.index))
    self.assert_index_equal(newFrame.index, self.ts1.index)

    # copy with no axes
    result = self.frame.reindex()
    assert_frame_equal(result, self.frame)
    self.assertFalse(result is self.frame)
def _convert_frames(frames, index, columns, fill_value=np.nan, kind='block'):
    """
    Coerce a mapping of frames to SparseDataFrames sharing one index and
    one set of columns.

    Parameters
    ----------
    frames : mapping of item -> DataFrame or SparseDataFrame
    index, columns : index-like or None
        When None, the combined index/columns of all frames is used.
    fill_value, kind
        Passed as ``default_fill_value`` / ``default_kind`` when a frame
        has to be converted to SparseDataFrame.

    Returns
    -------
    tuple of (dict of item -> SparseDataFrame, index, columns)
    """
    from pandas.core.panel import _get_combined_index

    output = {}
    for item, df in compat.iteritems(frames):
        if isinstance(df, SparseDataFrame):
            output[item] = df
        else:
            output[item] = SparseDataFrame(df, default_kind=kind,
                                           default_fill_value=fill_value)

    if index is None:
        index = _get_combined_index([df.index for df in output.values()])
    if columns is None:
        columns = _get_combined_index(
            [df.columns for df in output.values()])

    index = _ensure_index(index)
    columns = _ensure_index(columns)

    # Only existing keys are reassigned here, so the dict size is stable
    # while it is being iterated.
    for item, df in compat.iteritems(output):
        aligned = df.index.equals(index) and df.columns.equals(columns)
        if not aligned:
            output[item] = df.reindex(index=index, columns=columns)

    return output, index, columns
def _update_method_mapper(cls, mapper):
    """
    Attach cls._method_mapper to passed mapper

    Each entry of ``cls._method_mapper`` maps a key to a dict of
    ``name -> attribute name on cls``. The attribute names are resolved
    with ``getattr(cls, ...)`` and merged over the existing
    ``mapper[key]`` entries. ``mapper`` is updated in place and returned;
    it is returned untouched when ``cls._method_mapper`` is None.
    """
    if cls._method_mapper is None:
        return mapper

    for key, name_map in compat.iteritems(cls._method_mapper):
        # mapping method_name to actual class method
        resolved = {}
        for method_name, attr_name in compat.iteritems(name_map):
            resolved[method_name] = getattr(cls, attr_name)
        mapper[key] = dict(mapper[key], **resolved)
    return mapper
def test_iteritems(self):
    """Iterating items yields (label, value) pairs matching indexing."""
    for label, value in compat.iteritems(self.series):
        assert value == self.series[label]

    for label, value in compat.iteritems(self.ts):
        assert value == self.ts[label]

    # assert is lazy (generators don't define reverse, lists do)
    lazy_items = self.series.iteritems()
    assert not hasattr(lazy_items, 'reverse')
def test_iteritems(self):
    """Iterating items yields (label, value) pairs matching indexing."""
    for label, value in compat.iteritems(self.series):
        self.assertEqual(value, self.series[label])

    for label, value in compat.iteritems(self.ts):
        self.assertEqual(value, self.ts[label])

    # assert is lazy (generators don't define reverse, lists do)
    lazy_items = self.series.iteritems()
    self.assertFalse(hasattr(lazy_items, 'reverse'))
def int_frame():
    """
    Fixture for DataFrame of ints with index of unique strings

    Columns are ['A', 'B', 'C', 'D']
    """
    as_int = {}
    for name, series in compat.iteritems(tm.getSeriesData()):
        as_int[name] = series.astype(int)
    df = DataFrame(as_int)

    # force these all to int64 to avoid platform testing issues
    return DataFrame(dict(compat.iteritems(df)), dtype=np.int64)
def test_apply_differently_indexed(self):
    """apply(Series.describe) matches describing each column/row by hand."""
    df = DataFrame(np.random.randn(20, 10))

    # axis=0: describe every column.
    result0 = df.apply(Series.describe, axis=0)
    by_column = {label: col.describe()
                 for label, col in compat.iteritems(df)}
    expected0 = DataFrame(by_column, columns=df.columns)
    assert_frame_equal(result0, expected0)

    # axis=1: describe every row (columns of the transpose), then flip back.
    result1 = df.apply(Series.describe, axis=1)
    by_row = {label: row.describe()
              for label, row in compat.iteritems(df.T)}
    expected1 = DataFrame(by_row, columns=df.index).T
    assert_frame_equal(result1, expected1)
def test_to_dict(self, mapping):
    """Round-trip a nested dict through DataFrame.to_dict orientations."""
    test_data = {
        'A': {'1': 1, '2': 2},
        'B': {'1': '1', '2': '2', '3': '3'},
    }

    def cells(data):
        # Flatten {col: {row: value}} into (col, row, value) triples.
        for col, rows in compat.iteritems(data):
            for row, value in compat.iteritems(rows):
                yield col, row, value

    # GH16122
    recons_data = DataFrame(test_data).to_dict(into=mapping)
    for k, k2, v2 in cells(test_data):
        assert (v2 == recons_data[k][k2])

    # "l": values are positional, so row label 'n' sits at position n - 1.
    recons_data = DataFrame(test_data).to_dict("l", mapping)
    for k, k2, v2 in cells(test_data):
        assert (v2 == recons_data[k][int(k2) - 1])

    recons_data = DataFrame(test_data).to_dict("s", mapping)
    for k, k2, v2 in cells(test_data):
        assert (v2 == recons_data[k][k2])

    recons_data = DataFrame(test_data).to_dict("sp", mapping)
    expected_split = {'columns': ['A', 'B'],
                      'index': ['1', '2', '3'],
                      'data': [[1.0, '1'], [2.0, '2'], [np.nan, '3']]}
    tm.assert_dict_equal(recons_data, expected_split)

    recons_data = DataFrame(test_data).to_dict("r", mapping)
    expected_records = [{'A': 1.0, 'B': '1'},
                        {'A': 2.0, 'B': '2'},
                        {'A': np.nan, 'B': '3'}]
    assert isinstance(recons_data, list)
    assert (len(recons_data) == 3)
    for got, want in zip(recons_data, expected_records):
        tm.assert_dict_equal(got, want)

    # GH10844
    # "i": outer key is the row label, inner key the column.
    recons_data = DataFrame(test_data).to_dict("i")
    for k, k2, v2 in cells(test_data):
        assert (v2 == recons_data[k2][k])

    df = DataFrame(test_data)
    df['duped'] = df[df.columns[0]]
    recons_data = df.to_dict("i")
    comp_data = test_data.copy()
    comp_data['duped'] = comp_data[df.columns[0]]
    for k, k2, v2 in cells(comp_data):
        assert (v2 == recons_data[k2][k])
def most_common(self, n=None):
    """List the n most common elements and their counts from the most
    common to the least.  If n is None, then list all element counts.

    >>> Counter('abcdeabcdabcaba').most_common(3)
    [('a', 5), ('b', 4), ('c', 3)]

    """
    # Emulate Bag.sortedByCount from Smalltalk
    by_count = _itemgetter(1)
    pairs = compat.iteritems(self)
    if n is not None:
        return _heapq.nlargest(n, pairs, key=by_count)
    return sorted(pairs, key=by_count, reverse=True)
def percentileRank(frame, column=None, kind='mean'):
    """
    Return score at percentile for each point in time (cross-section)

    Parameters
    ----------
    frame: DataFrame
    column: string or Series, optional
       Column name or specific Series to compute percentiles for.
       If not provided, percentiles are computed for all values at each
       point in time. Note that this can take a LONG time.
    kind: {'rank', 'weak', 'strict', 'mean'}, optional
        This optional parameter specifies the interpretation of the
        resulting score:

        - "rank": Average percentage ranking of score.  In case of
                  multiple matches, average the percentage rankings of
                  all matching scores.
        - "weak": This kind corresponds to the definition of a cumulative
                  distribution function.  A percentileofscore of 80%
                  means that 80% of values are less than or equal
                  to the provided score.
        - "strict": Similar to "weak", except that only values that are
                    strictly less than the given score are counted.
        - "mean": The average of the "weak" and "strict" scores, often used
                  in testing.  See

                  http://en.wikipedia.org/wiki/Percentile_rank

    Returns
    -------
    TimeSeries or DataFrame, depending on input
        A Series keyed by the index labels of ``frame`` when ``column`` is
        given, otherwise a DataFrame with one column per column of
        ``frame``.
    """
    from pandas.compat.scipy import percentileofscore

    def fun(xs, score):
        # Missing values are dropped from the cross-section before scoring.
        return percentileofscore(remove_na(xs), score, kind=kind)

    results = {}
    # Transpose once; each column of the transpose is one cross-section.
    framet = frame.T
    if column is not None:
        if isinstance(column, Series):
            # Score the externally supplied value for each date (NaN when
            # the Series has no entry for that date).
            for date, xs in compat.iteritems(framet):
                results[date] = fun(xs, column.get(date, NaN))
        else:
            for date, xs in compat.iteritems(framet):
                results[date] = fun(xs, xs[column])
        results = Series(results)
    else:
        # A separate loop name avoids clobbering the ``column`` parameter.
        for col in frame.columns:
            for date, xs in compat.iteritems(framet):
                results.setdefault(date, {})[col] = fun(xs, xs[col])
        results = DataFrame(results).T
    return results
def test_map(self):
    """Series.map with a Series, dict, function and categorical inputs."""
    index, data = tm.getMixedTypeDict()

    source = Series(data['B'], index=data['C'])
    target = Series(data['C'][:4], index=data['D'][:4])

    merged = target.map(source)
    for label, mapped in compat.iteritems(merged):
        assert mapped == source[target[label]]

    # input could be a dict
    merged = target.map(source.to_dict())
    for label, mapped in compat.iteritems(merged):
        assert mapped == source[target[label]]

    # function
    doubled = self.ts.map(lambda x: x * 2)
    tm.assert_series_equal(doubled, self.ts * 2)

    # GH 10324
    a = Series([1, 2, 3, 4])
    b = Series(["even", "odd", "even", "odd"], dtype="category")
    c = Series(["even", "odd", "even", "odd"])

    exp = Series(["odd", "even", "odd", np.nan], dtype="category")
    tm.assert_series_equal(a.map(b), exp)
    exp = Series(["odd", "even", "odd", np.nan])
    tm.assert_series_equal(a.map(c), exp)

    a = Series(['a', 'b', 'c', 'd'])
    b = Series([1, 2, 3, 4],
               index=pd.CategoricalIndex(['b', 'c', 'd', 'e']))
    c = Series([1, 2, 3, 4], index=Index(['b', 'c', 'd', 'e']))

    exp = Series([np.nan, 1, 2, 3])
    tm.assert_series_equal(a.map(b), exp)
    exp = Series([np.nan, 1, 2, 3])
    tm.assert_series_equal(a.map(c), exp)

    a = Series(['a', 'b', 'c', 'd'])
    b = Series(['B', 'C', 'D', 'E'], dtype='category',
               index=pd.CategoricalIndex(['b', 'c', 'd', 'e']))
    c = Series(['B', 'C', 'D', 'E'], index=Index(['b', 'c', 'd', 'e']))

    exp = Series(pd.Categorical([np.nan, 'B', 'C', 'D'],
                                categories=['B', 'C', 'D', 'E']))
    tm.assert_series_equal(a.map(b), exp)
    exp = Series([np.nan, 'B', 'C', 'D'])
    tm.assert_series_equal(a.map(c), exp)
def _init_dict(self, data, index, columns, dtype=None):
    """
    Build the internal manager from a dict of column-like values.

    Each value is converted to a sparse array using the frame's default
    kind and fill value; Series values are first aligned to ``index``.
    Columns requested but absent from ``data`` are filled with an
    all-NaN sparse array.

    Raises
    ------
    ValueError
        If a converted value's length differs from the index length.
    """
    # pre-filter out columns if we passed it
    if columns is None:
        columns = Index(com.dict_keys_to_ordered_list(data))
    else:
        columns = ensure_index(columns)
        data = {key: val for key, val in compat.iteritems(data)
                if key in columns}

    if index is None:
        index = extract_index(list(data.values()))

    def sp_maker(x):
        return SparseArray(x, kind=self._default_kind,
                           fill_value=self._default_fill_value,
                           copy=True, dtype=dtype)

    sdict = {}
    for key, val in compat.iteritems(data):
        if isinstance(val, Series):
            # Force alignment, no copy necessary
            if not val.index.equals(index):
                val = val.reindex(index)

            if not isinstance(val, SparseSeries):
                val = sp_maker(val.values)
        elif isinstance(val, SparseArray):
            val = val.copy()
        else:
            if isinstance(val, dict):
                # Look the dict up label by label; missing labels are NaN.
                val = [val.get(label, np.nan) for label in index]

            val = sp_maker(val)

        if index is not None and len(val) != len(index):
            msg = "Length of passed values is {}, index implies {}"
            raise ValueError(msg.format(len(val), len(index)))
        sdict[key] = val

    if len(columns.difference(sdict)):
        # TODO: figure out how to handle this case, all nan's?
        # add in any other columns we want to have (completeness)
        nan_arr = np.empty(len(index), dtype='float64')
        nan_arr.fill(np.nan)
        nan_arr = SparseArray(nan_arr, kind=self._default_kind,
                              fill_value=self._default_fill_value,
                              copy=False)
        # The same array object is shared by every missing column.
        for col in columns:
            if col not in sdict:
                sdict[col] = nan_arr

    return to_manager(sdict, columns, index)
def test_groups(self, df):
    """GroupBy.groups is cached and maps each key to matching row labels."""
    # single key
    grouped = df.groupby(['A'])
    groups = grouped.groups
    assert groups is grouped.groups  # caching works

    for key, labels in compat.iteritems(grouped.groups):
        assert (df.loc[labels]['A'] == key).all()

    # two keys: group keys are (A, B) tuples
    grouped = df.groupby(['A', 'B'])
    groups = grouped.groups
    assert groups is grouped.groups  # caching works

    for key, labels in compat.iteritems(grouped.groups):
        subset = df.loc[labels]
        assert (subset['A'] == key[0]).all()
        assert (df.loc[labels]['B'] == key[1]).all()
def f(values, axis=None, skipna=True, **kwds):
    """
    Reduce ``values`` with ``bn_func`` when eligible, otherwise ``alt``.

    ``self``, ``bn_func`` and ``alt`` come from the enclosing scope.
    Any exception raised on the main path falls back to calling ``alt``.
    """
    # Defaults stored on ``self.kwargs`` never override explicit keywords.
    if len(self.kwargs) > 0:
        for key, default in compat.iteritems(self.kwargs):
            kwds.setdefault(key, default)

    try:
        if self.zero_value is not None and values.size == 0:
            # Empty input: return zeros shaped like the reduced result.
            if values.ndim == 1:
                return 0
            reduced_shape = values.shape[:axis] + values.shape[axis + 1:]
            return np.zeros(reduced_shape)

        use_bn = _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype)
        if use_bn:
            result = bn_func(values, axis=axis, **kwds)

            # prefer to treat inf/-inf as NA, but must compute the func
            # twice :(
            if _has_infs(result):
                result = alt(values, axis=axis, skipna=skipna, **kwds)
        else:
            result = alt(values, axis=axis, skipna=skipna, **kwds)
    except Exception:
        result = alt(values, axis=axis, skipna=skipna, **kwds)

    return result
def apply(self, func, axis=0, broadcast=False):
    """
    Analogous to DataFrame.apply, for SparseDataFrame

    Parameters
    ----------
    func : function
        Function to apply to each column
    axis : {0, 1}
    broadcast : bool, default False
        For aggregation functions, return object of same size with values
        propagated

    Returns
    -------
    applied : Series or SparseDataFrame
    """
    if not len(self.columns):
        return self

    if not isinstance(func, np.ufunc):
        if broadcast:
            return self._apply_broadcast(func, axis)
        return self._apply_standard(func, axis)

    # ufunc: apply column by column, transforming the fill value as well
    # so unstored entries stay consistent with the stored ones.
    transformed = {}
    for name, column in compat.iteritems(self):
        applied = func(column)
        applied.fill_value = func(applied.fill_value)
        transformed[name] = applied
    return SparseDataFrame(transformed, index=self.index,
                           columns=self.columns,
                           default_fill_value=self.default_fill_value,
                           default_kind=self.default_kind)
def test_delete_slice(self):
    """Deleting a run of positions (as tuple or slice) from a TimedeltaIndex."""
    idx = timedelta_range(start='1 days', periods=10, freq='D', name='idx')

    # preserve freq
    expected_0_2 = timedelta_range(start='4 days', periods=7, freq='D',
                                   name='idx')
    expected_7_9 = timedelta_range(start='1 days', periods=7, freq='D',
                                   name='idx')

    # reset freq to None
    expected_3_5 = TimedeltaIndex(['1 d', '2 d', '3 d',
                                   '7 d', '8 d', '9 d', '10d'],
                                  freq=None, name='idx')

    cases = {(0, 1, 2): expected_0_2,
             (7, 8, 9): expected_7_9,
             (3, 4, 5): expected_3_5}
    for positions, expected in compat.iteritems(cases):
        # Positions given explicitly ...
        result = idx.delete(positions)
        tm.assert_index_equal(result, expected)
        assert result.name == expected.name
        assert result.freq == expected.freq

        # ... and as the equivalent contiguous slice.
        as_slice = slice(positions[0], positions[-1] + 1)
        result = idx.delete(as_slice)
        tm.assert_index_equal(result, expected)
        assert result.name == expected.name
        assert result.freq == expected.freq
def _combine_match_index(self, other, func, level=None, fill_value=None):
    """
    Combine every column of this frame with ``other`` via ``func`` after
    aligning both on the union of their indexes.

    Raises
    ------
    NotImplementedError
        If ``fill_value`` or ``level`` is given.
    """
    new_data = {}

    if fill_value is not None:
        raise NotImplementedError
    if level is not None:
        raise NotImplementedError

    new_index = self.index.union(other.index)

    # Reindex only the operand(s) whose index object is not the union.
    this = self if self.index is new_index else self.reindex(new_index)
    if other.index is not new_index:
        other = other.reindex(new_index)

    for col, series in compat.iteritems(this):
        new_data[col] = func(series.values, other.values)

    # fill_value is a function of our operator
    either_null = (isnull(other.fill_value) or
                   isnull(self.default_fill_value))
    if either_null:
        fill_value = np.nan
    else:
        fill_value = func(np.float64(self.default_fill_value),
                          np.float64(other.fill_value))

    combined = self._constructor(new_data, index=new_index,
                                 columns=self.columns,
                                 default_fill_value=fill_value,
                                 fill_value=self.default_fill_value)
    return combined.__finalize__(self)
def _combine_const(self, other, func):
    """
    Apply ``func(column, other)`` to every column and rebuild a frame
    with the same index and columns.
    """
    combined = {col: func(series, other)
                for col, series in compat.iteritems(self)}
    return self._constructor(data=combined, index=self.index,
                             columns=self.columns)
def _reindex_index(self, index, method, copy, level, fill_value=np.nan,
                   limit=None, takeable=False):
    """
    Conform the rows of this sparse frame to ``index``.

    Returns ``self`` (or a copy when ``copy`` is true) if the index is
    already equal; otherwise builds a new SparseDataFrame whose rows are
    taken from the existing ones, with ``fill_value`` written where a
    label has no match.

    Raises
    ------
    TypeError
        If ``level`` is given.
    """
    if level is not None:
        raise TypeError('Reindex by level not supported for sparse')

    if self.index.equals(index):
        return self.copy() if copy else self

    if len(self.index) == 0:
        return SparseDataFrame(index=index, columns=self.columns)

    indexer = self.index.get_indexer(index, method, limit=limit)
    indexer = com._ensure_platform_int(indexer)
    # -1 marks labels of ``index`` that were not found.
    missing = indexer == -1
    any_missing = missing.any()

    new_series = {}
    for col, series in compat.iteritems(self):
        taken = series.values.take(indexer)
        if any_missing:
            np.putmask(taken, missing, fill_value)
        new_series[col] = taken

    return SparseDataFrame(new_series, index=index, columns=self.columns,
                           default_fill_value=self.default_fill_value)
def test_delete(self):
    """Deleting a single position from a TimedeltaIndex."""
    idx = timedelta_range(start='1 Days', periods=5, freq='D', name='idx')

    # preserve freq
    expected_0 = timedelta_range(start='2 Days', periods=4, freq='D',
                                 name='idx')
    expected_4 = timedelta_range(start='1 Days', periods=4, freq='D',
                                 name='idx')

    # reset freq to None
    expected_1 = TimedeltaIndex(
        ['1 day', '3 day', '4 day', '5 day'], freq=None, name='idx')

    # Negative positions count from the end, so -5 == 0 and -1 == 4.
    cases = {0: expected_0,
             -5: expected_0,
             -1: expected_4,
             4: expected_4,
             1: expected_1}
    for position, expected in compat.iteritems(cases):
        result = idx.delete(position)
        tm.assert_index_equal(result, expected)
        assert result.name == expected.name
        assert result.freq == expected.freq

    with pytest.raises((IndexError, ValueError)):
        # either depending on numpy version
        result = idx.delete(5)
def checkMovingOLS(self, x, y, window_type='rolling', **kwds):
    """
    Check a moving OLS against static OLS fits on the matching windows.

    For every valid index of the moving regression, the inputs are
    truncated to that observation's window and fitted with a static
    ``ols``; ``self.compare`` then checks the pair.
    """
    window = 25  # must be larger than rank of x

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        moving = ols(y=y, x=x, window_type=window_type,
                     window=window, **kwds)

    index = moving._index

    for result_pos, event_pos in enumerate(moving._valid_indices):
        # Rolling windows start ``window`` observations back once enough
        # data exists; otherwise the window starts at the first date.
        if window_type == 'rolling' and event_pos >= window:
            start = index[event_pos - window + 1]
        else:
            start = index[0]
        end = index[event_pos]

        x_iter = {name: series.truncate(before=start, after=end)
                  for name, series in compat.iteritems(x)}
        y_iter = y.truncate(before=start, after=end)

        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            static = ols(y=y_iter, x=x_iter, **kwds)

        self.compare(static, moving, event_index=event_pos,
                     result_index=result_pos)

    _check_non_raw_results(moving)
def _make_table(self, ax, df, title, height=None):
    """
    Draw ``df`` as a table on ``ax`` with shaded header cells.

    When ``df`` is None the axes are hidden and nothing is drawn.
    ``height`` is the cell height; it defaults to ``1 / (rows + 1)``
    measured after the index has been inserted into the frame.
    """
    if df is None:
        ax.set_visible(False)
        return

    import pandas.tools.plotting as plotting

    # Level counts are read before the index is folded into the frame.
    n_index_levels = df.index.nlevels
    n_column_levels = df.columns.nlevels
    # must be convert here to get index levels for colorization
    df = self._insert_index(df)
    tb = plotting.table(ax, df, loc=9)
    tb.set_fontsize(self.font_size)

    if height is None:
        height = 1.0 / (len(df) + 1)

    props = tb.properties()
    for (row, col), cell in compat.iteritems(props['celld']):
        in_header_rows = row < n_column_levels
        in_index_cols = col < n_index_levels
        if col == -1 or (in_header_rows and in_index_cols):
            # Hide column -1 and the corner where both headers overlap.
            cell.set_visible(False)
        elif in_header_rows or in_index_cols:
            cell.set_facecolor('#AAAAAA')
        cell.set_height(height)

    ax.set_title(title, size=self.font_size)
    ax.axis('off')
def _simple_new(cls, start, stop=None, step=None, name=None,
                dtype=None, **kwargs):
    """
    Create an instance without going through the regular constructor.

    When ``start`` is missing or not an integer, construction is handed
    to ``RangeIndex(...)`` and, if that raises TypeError, to
    ``Index(...)``. Otherwise the attributes are set directly, with
    ``stop`` defaulting to 0 and ``step`` to 1; extra keyword arguments
    become attributes of the result.
    """
    new_index = object.__new__(cls)

    # handle passed None, non-integers
    if start is None and stop is None:
        # empty
        start, stop, step = 0, 0, 1

    needs_fallback = start is None or not is_integer(start)
    if needs_fallback:
        try:
            return RangeIndex(start, stop, step, name=name, **kwargs)
        except TypeError:
            return Index(start, stop, step, name=name, **kwargs)

    new_index._start = start
    new_index._stop = stop or 0
    new_index._step = step or 1
    new_index.name = name
    for attr, value in compat.iteritems(kwargs):
        setattr(new_index, attr, value)

    new_index._reset_identity()
    return new_index
def test_difference_base(self):
    """Index.difference against an index and against array-like inputs."""
    for name, idx in compat.iteritems(self.indices):
        first = idx[2:]
        second = idx[:4]
        answer = idx[4:]
        result = first.difference(second)

        # CategoricalIndex results are not checked here.
        if not isinstance(idx, CategoricalIndex):
            self.assertTrue(tm.equalContents(result, answer))

        # GH 10149
        cases = [klass(second.values)
                 for klass in [np.array, Series, list]]
        for case in cases:
            if isinstance(idx, PeriodIndex):
                msg = "can only call with other PeriodIndex-ed objects"
                with tm.assertRaisesRegexp(ValueError, msg):
                    result = first.difference(case)
            elif isinstance(idx, CategoricalIndex):
                pass
            elif isinstance(idx, (DatetimeIndex, TimedeltaIndex)):
                # NOTE(review): ``result`` is not recomputed from ``case``
                # in this branch, so it re-checks the earlier result --
                # confirm whether that is intended.
                self.assertEqual(result.__class__, answer.__class__)
                tm.assert_numpy_array_equal(result.asi8, answer.asi8)
            else:
                result = first.difference(case)
                self.assertTrue(tm.equalContents(result, answer))

        if isinstance(idx, MultiIndex):
            msg = "other must be a MultiIndex or a list of tuples"
            with tm.assertRaisesRegexp(TypeError, msg):
                result = first.difference([1, 2, 3])
def test_symmetric_difference(self):
    """Index.symmetric_difference against an index and array-likes."""
    for name, idx in compat.iteritems(self.indices):
        first = idx[1:]
        second = idx[:-1]

        # CategoricalIndex is skipped; ``answer`` is only defined for the
        # other index types.
        if not isinstance(idx, CategoricalIndex):
            # Only the first and last elements are unique to one side.
            answer = idx[[0, -1]]
            result = first.symmetric_difference(second)
            self.assertTrue(tm.equalContents(result, answer))

        # GH 10149
        cases = [klass(second.values)
                 for klass in [np.array, Series, list]]
        for case in cases:
            if isinstance(idx, PeriodIndex):
                msg = "can only call with other PeriodIndex-ed objects"
                with tm.assertRaisesRegexp(ValueError, msg):
                    result = first.symmetric_difference(case)
            elif isinstance(idx, CategoricalIndex):
                pass
            else:
                result = first.symmetric_difference(case)
                self.assertTrue(tm.equalContents(result, answer))

        if isinstance(idx, MultiIndex):
            msg = "other must be a MultiIndex or a list of tuples"
            with tm.assertRaisesRegexp(TypeError, msg):
                result = first.symmetric_difference([1, 2, 3])

    # 12591 deprecated
    with tm.assert_produces_warning(FutureWarning):
        first.sym_diff(second)
def test_ensure_copied_data(self):
    """The ``copy`` argument of each index constructor is respected."""
    # Check the "copy" argument of each Index.__new__ is honoured
    # GH12309
    for name, index in compat.iteritems(self.indices):
        init_kwargs = {}
        if isinstance(index, PeriodIndex):
            # Needs "freq" specification:
            init_kwargs['freq'] = index.freq
        elif isinstance(index, (RangeIndex, MultiIndex, CategoricalIndex)):
            # RangeIndex cannot be initialized from data
            # MultiIndex and CategoricalIndex are tested separately
            continue

        index_type = index.__class__

        # copy=True: equal index, values held in a different array.
        result = index_type(index.values, copy=True, **init_kwargs)
        tm.assert_index_equal(index, result)
        tm.assert_numpy_array_equal(index.values, result.values,
                                    check_same='copy')

        if isinstance(index, PeriodIndex):
            # .values an object array of Period, thus copied
            result = index_type(ordinal=index.asi8, copy=False,
                                **init_kwargs)
            tm.assert_numpy_array_equal(index._values, result._values,
                                        check_same='same')
        else:
            # copy=False: the underlying data must be shared.
            result = index_type(index.values, copy=False, **init_kwargs)
            tm.assert_numpy_array_equal(index.values, result.values,
                                        check_same='same')
            tm.assert_numpy_array_equal(index._values, result._values,
                                        check_same='same')
def test_union_base(self):
    """Index.union against an index and against array-like inputs."""
    for name, idx in compat.iteritems(self.indices):
        # Two overlapping slices whose union is the whole index.
        first = idx[3:]
        second = idx[:5]
        everything = idx
        union = first.union(second)
        self.assertTrue(tm.equalContents(union, everything))

        # GH 10149
        cases = [klass(second.values)
                 for klass in [np.array, Series, list]]
        for case in cases:
            if isinstance(idx, PeriodIndex):
                msg = "can only call with other PeriodIndex-ed objects"
                with tm.assertRaisesRegexp(ValueError, msg):
                    result = first.union(case)
            elif isinstance(idx, CategoricalIndex):
                pass
            else:
                result = first.union(case)
                self.assertTrue(tm.equalContents(result, everything))

        if isinstance(idx, MultiIndex):
            msg = "other must be a MultiIndex or a list of tuples"
            with tm.assertRaisesRegexp(TypeError, msg):
                result = first.union([1, 2, 3])
def test_replace_input_formats(self):
    """DataFrame.replace across dict, scalar and list input combinations."""
    # both dicts
    to_rep = {'A': np.nan, 'B': 0, 'C': ''}
    values = {'A': 0, 'B': -1, 'C': 'missing'}
    df = DataFrame({
        'A': [np.nan, 0, np.inf],
        'B': [0, 2, 5],
        'C': ['', 'asdf', 'fd'],
    })
    filled = df.replace(to_rep, values)
    # Column-by-column Series.replace must agree with the frame-level call.
    expected = {name: col.replace(to_rep[name], values[name])
                for name, col in compat.iteritems(df)}
    assert_frame_equal(filled, DataFrame(expected))

    result = df.replace([0, 2, 5], [5, 2, 0])
    expected = DataFrame({
        'A': [np.nan, 5, np.inf],
        'B': [5, 2, 0],
        'C': ['', 'asdf', 'fd'],
    })
    assert_frame_equal(result, expected)

    # dict to scalar
    filled = df.replace(to_rep, 0)
    expected = {name: col.replace(to_rep[name], 0)
                for name, col in compat.iteritems(df)}
    assert_frame_equal(filled, DataFrame(expected))

    self.assertRaises(TypeError, df.replace, to_rep, [np.nan, 0, ''])

    # scalar to dict
    values = {'A': 0, 'B': -1, 'C': 'missing'}
    df = DataFrame({
        'A': [np.nan, 0, np.nan],
        'B': [0, 2, 5],
        'C': ['', 'asdf', 'fd'],
    })
    filled = df.replace(np.nan, values)
    expected = {name: col.replace(np.nan, values[name])
                for name, col in compat.iteritems(df)}
    assert_frame_equal(filled, DataFrame(expected))

    # list to list
    to_rep = [np.nan, 0, '']
    values = [-2, -1, 'missing']
    result = df.replace(to_rep, values)
    expected = df.copy()
    for old, new in zip(to_rep, values):
        expected.replace(old, new, inplace=True)
    assert_frame_equal(result, expected)

    self.assertRaises(ValueError, df.replace, to_rep, values[1:])

    # list to scalar
    to_rep = [np.nan, 0, '']
    result = df.replace(to_rep, -1)
    expected = df.copy()
    for old in to_rep:
        expected.replace(old, -1, inplace=True)
    assert_frame_equal(result, expected)
class Resolution(object):
    """
    Lookup tables and helpers converting between resolution codes,
    resolution strings (e.g. 'second') and frequency strings (e.g. 'S').

    All lookups that use ``.get`` fall back to the 'day' resolution for
    unknown inputs; ``get_freq`` raises KeyError instead.
    """

    # defined in period.pyx
    # note that these are different from freq codes
    RESO_US = period.US_RESO
    RESO_MS = period.MS_RESO
    RESO_SEC = period.S_RESO
    RESO_MIN = period.T_RESO
    RESO_HR = period.H_RESO
    RESO_DAY = period.D_RESO

    # resolution code -> resolution str
    _reso_str_map = {
        RESO_US: 'microsecond',
        RESO_MS: 'millisecond',
        RESO_SEC: 'second',
        RESO_MIN: 'minute',
        RESO_HR: 'hour',
        RESO_DAY: 'day'
    }

    # resolution str -> resolution code (inverse of the map above)
    _str_reso_map = {v: k for k, v in compat.iteritems(_reso_str_map)}

    # resolution str -> frequency str
    _reso_freq_map = {
        'year': 'A',
        'quarter': 'Q',
        'month': 'M',
        'day': 'D',
        'hour': 'H',
        'minute': 'T',
        'second': 'S',
        'millisecond': 'L',
        'microsecond': 'U',
        'nanosecond': 'N'
    }

    # frequency str -> resolution str (inverse of the map above)
    _freq_reso_map = {v: k for k, v in compat.iteritems(_reso_freq_map)}

    @classmethod
    def get_str(cls, reso):
        """
        Return resolution str against resolution code.

        Unknown codes map to 'day'.

        Example
        -------
        >>> Resolution.get_str(Resolution.RESO_SEC)
        'second'
        """
        return cls._reso_str_map.get(reso, 'day')

    @classmethod
    def get_reso(cls, resostr):
        """
        Return resolution code against resolution str.

        Unknown strings map to ``RESO_DAY``.

        Example
        -------
        >>> Resolution.get_reso('second')
        2

        >>> Resolution.get_reso('second') == Resolution.RESO_SEC
        True
        """
        return cls._str_reso_map.get(resostr, cls.RESO_DAY)

    @classmethod
    def get_freq_group(cls, resostr):
        """
        Return the frequency group (as computed by ``get_freq_group``)
        against resolution str.

        Raises KeyError for a resolution str that has no frequency str.

        Example
        -------
        >>> f.Resolution.get_freq_group('day')
        4000
        """
        return get_freq_group(cls.get_freq(resostr))

    @classmethod
    def get_freq(cls, resostr):
        """
        Return frequency str against resolution str.

        Raises KeyError for an unknown resolution str.

        Example
        -------
        >>> f.Resolution.get_freq('day')
        'D'
        """
        return cls._reso_freq_map[resostr]

    @classmethod
    def get_str_from_freq(cls, freq):
        """
        Return resolution str against frequency str.

        Unknown frequency strings map to 'day'.

        Example
        -------
        >>> Resolution.get_str_from_freq('H')
        'hour'
        """
        return cls._freq_reso_map.get(freq, 'day')

    @classmethod
    def get_reso_from_freq(cls, freq):
        """
        Return resolution code against frequency str.

        Example
        -------
        >>> Resolution.get_reso_from_freq('H')
        4

        >>> Resolution.get_reso_from_freq('H') == Resolution.RESO_HR
        True
        """
        return cls.get_reso(cls.get_str_from_freq(freq))
def test_dict_iterators(self):
    """The compat dict iterators return iterators usable with next()."""
    single = {1: 2}
    assert next(itervalues({1: 2})) == 2
    assert next(iterkeys({1: 2})) == 1
    first_item = next(iteritems(single))
    assert first_item == (1, 2)
def lreshape(data, groups, dropna=True, label=None):
    """
    Reshape wide-format data to long. Generalized inverse of
    DataFrame.pivot

    Each group of columns is concatenated end to end into one new
    column; every column not named in any group is repeated once per
    column of a group.

    Parameters
    ----------
    data : DataFrame
    groups : dict
        {new_name : list_of_columns}. An iterable of
        (new_name, list_of_columns) pairs is also accepted.
    dropna : boolean, default True
        Drop rows in which any of the new columns is null.
    label : optional
        Not used by this implementation.

    Examples
    --------
    >>> import pandas as pd
    >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526],
    ...                      'team': ['Red Sox', 'Yankees'],
    ...                      'year1': [2007, 2008], 'year2': [2008, 2008]})
    >>> data
       hr1  hr2     team  year1  year2
    0  514  545  Red Sox   2007   2008
    1  573  526  Yankees   2007   2008

    >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']})
          team   hr  year
    0  Red Sox  514  2007
    1  Yankees  573  2007
    2  Red Sox  545  2008
    3  Yankees  526  2008

    Returns
    -------
    reshaped : DataFrame

    Raises
    ------
    ValueError
        If the column lists in ``groups`` do not all have the same length.
    """
    if isinstance(groups, dict):
        keys = list(groups.keys())
        values = list(groups.values())
    else:
        keys, values = zip(*groups)

    all_cols = list(set.union(*[set(x) for x in values]))
    # Columns that belong to no group are carried along as identifiers.
    id_cols = list(data.columns.difference(all_cols))

    K = len(values[0])

    for seq in values:
        if len(seq) != K:
            raise ValueError('All column lists must be same length')

    mdata = {}
    pivot_cols = []

    for target, names in zip(keys, values):
        to_concat = [data[col].values for col in names]
        mdata[target] = _concat._concat_compat(to_concat)
        pivot_cols.append(target)

    # Identifier columns repeat once per stacked source column.
    for col in id_cols:
        mdata[col] = np.tile(data[col].values, K)

    if dropna:
        # Keep only rows where every new column has a value.
        mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool)
        for c in pivot_cols:
            mask &= notnull(mdata[c])
        if not mask.all():
            mdata = {k: v[mask] for k, v in compat.iteritems(mdata)}

    return DataFrame(mdata, columns=id_cols + pivot_cols)
def intframe(self):
    """Return the module-level ``_intframe`` rebuilt with int64 columns."""
    columns = {name: series
               for name, series in compat.iteritems(_intframe)}
    # force these all to int64 to avoid platform testing issues
    return pd.DataFrame(columns, dtype=np.int64)
def _aggregate(self, arg, *args, **kwargs):
    """
    provide an implementation for the aggregators

    Dispatches on the type of ``arg``:

    * string : delegated to ``self._try_aggregate_string_function``
    * dict : each entry is aggregated separately and the pieces are
      combined into a single object
    * other list-like : delegated to ``self._aggregate_multiple_funcs``
    * anything else : ``self._is_cython_func(arg)`` is consulted; when it
      returns a name and no extra args/kwargs were passed, the method of
      that name is called on ``self``

    Parameters
    ----------
    arg : string, dict, function
    *args : args to pass on to the function
    **kwargs : kwargs to pass on to the function
        The private keys ``_axis`` and ``_level`` are popped from
        ``kwargs`` before anything is forwarded.

    Returns
    -------
    tuple of result, how

    Raises
    ------
    ValueError
        If a dict is passed and the axis is not 0, or if concatenating a
        dict of Series results raises a TypeError.
    SpecificationError
        If a nested dict is keyed by something that is not a column of the
        selected object, or if a nested dict selects a non-1-dim object.

    Notes
    -----
    how can be a string describe the required post-processing, or
    None if not required

    In this implementation the second element of the returned tuple is
    always either None or True. ``(None, True)`` is returned when nothing
    here handled ``arg``; the caller is expected to react to that.
    """
    is_aggregator = lambda x: isinstance(x, (list, tuple, dict))
    is_nested_renamer = False

    # an explicit _axis keyword wins; otherwise fall back to self.axis
    # (0 when the object has no such attribute)
    _axis = kwargs.pop('_axis', None)
    if _axis is None:
        _axis = getattr(self, 'axis', 0)
    # nesting depth; incremented by _agg_1dim for each nested aggregate call
    _level = kwargs.pop('_level', None)

    if isinstance(arg, compat.string_types):
        return self._try_aggregate_string_function(arg, *args,
                                                   **kwargs), None

    if isinstance(arg, dict):

        # aggregate based on the passed dict
        if _axis != 0:  # pragma: no cover
            raise ValueError('Can only pass dict with axis=0')

        obj = self._selected_obj

        def nested_renaming_depr(level=4):
            # deprecation of nested renaming
            # GH 15931
            warnings.warn(
                ("using a dict with renaming "
                 "is deprecated and will be removed in a future "
                 "version"),
                FutureWarning, stacklevel=level)

        # if we have a dict of any non-scalars
        # eg. {'A' : ['mean']}, normalize all to
        # be list-likes
        if any(is_aggregator(x) for x in compat.itervalues(arg)):
            new_arg = compat.OrderedDict()
            for k, v in compat.iteritems(arg):
                if not isinstance(v, (tuple, list, dict)):
                    new_arg[k] = [v]
                else:
                    new_arg[k] = v

                # the keys must be in the columns
                # for ndim=2, or renamers for ndim=1

                # ok for now, but deprecated
                # {'A': { 'ra': 'mean' }}
                # {'A': { 'ra': ['mean'] }}
                # {'ra': ['mean']}

                # not ok
                # {'ra' : { 'A' : 'mean' }}
                if isinstance(v, dict):
                    is_nested_renamer = True

                    if k not in obj.columns:
                        msg = ('cannot perform renaming for {key} with a '
                               'nested dictionary').format(key=k)
                        raise SpecificationError(msg)
                    # the stacklevel grows with the nesting depth
                    # NOTE(review): presumably so the warning is attributed
                    # to the user's call site -- confirm
                    nested_renaming_depr(4 + (_level or 0))

                elif isinstance(obj, ABCSeries):
                    nested_renaming_depr()

            arg = new_arg

        else:
            # deprecation of renaming keys
            # GH 15931
            # warn when some dict key is not an existing column of a
            # DataFrame (i.e. the key is being used as a new name)
            keys = list(compat.iterkeys(arg))
            if (isinstance(obj, ABCDataFrame) and
                    len(obj.columns.intersection(keys)) != len(keys)):
                nested_renaming_depr()

        from pandas.core.reshape.concat import concat

        def _agg_1dim(name, how, subset=None):
            """
            aggregate a 1-dim with how
            """
            colg = self._gotitem(name, ndim=1, subset=subset)
            if colg.ndim != 1:
                raise SpecificationError("nested dictionary is ambiguous "
                                         "in aggregation")
            return colg.aggregate(how, _level=(_level or 0) + 1)

        def _agg_2dim(name, how):
            """
            aggregate a 2-dim with how
            """
            # note: ``name`` is not used; the current selection is
            # re-selected from ``obj`` instead
            colg = self._gotitem(self._selection, ndim=2,
                                 subset=obj)
            return colg.aggregate(how, _level=None)

        def _agg(arg, func):
            """
            run the aggregations over the arg with func
            return an OrderedDict
            """
            result = compat.OrderedDict()
            for fname, agg_how in compat.iteritems(arg):
                result[fname] = func(fname, agg_how)
            return result

        # set the final keys
        keys = list(compat.iterkeys(arg))
        result = compat.OrderedDict()

        # nested renamer
        if is_nested_renamer:
            result = list(_agg(arg, _agg_1dim).values())

            if all(isinstance(r, dict) for r in result):

                # every per-key result is itself a dict: flatten them
                # into one OrderedDict and take the keys from that
                result, results = compat.OrderedDict(), result
                for r in results:
                    result.update(r)
                keys = list(compat.iterkeys(result))

            else:

                # result stays a list here; with a selection in place
                # the list is concatenated without keys
                if self._selection is not None:
                    keys = None

        # some selection on the object
        elif self._selection is not None:

            sl = set(self._selection_list)

            # we are a Series like object,
            # but may have multiple aggregations
            if len(sl) == 1:

                result = _agg(arg, lambda fname,
                              agg_how: _agg_1dim(self._selection, agg_how))

            # we are selecting the same set as we are aggregating
            elif not len(sl - set(keys)):

                result = _agg(arg, _agg_1dim)

            # we are a DataFrame, with possibly multiple aggregations
            else:

                result = _agg(arg, _agg_2dim)

        # no selection
        else:

            try:
                result = _agg(arg, _agg_1dim)
            except SpecificationError:

                # we are aggregating expecting all 1d-returns
                # but we have 2d
                result = _agg(arg, _agg_2dim)

        # combine results

        def is_any_series():
            # return a boolean if we have *any* nested series
            return any(isinstance(r, ABCSeries)
                       for r in compat.itervalues(result))

        def is_any_frame():
            # return a boolean if we have *any* nested frame
            return any(isinstance(r, ABCDataFrame)
                       for r in compat.itervalues(result))

        # the list check must come first: is_any_frame/is_any_series
        # iterate ``result`` as a dict
        if isinstance(result, list):
            return concat(result, keys=keys, axis=1), True

        elif is_any_frame():
            # we have a dict of DataFrames
            # return a MI DataFrame

            return concat([result[k] for k in keys],
                          keys=keys, axis=1), True

        elif isinstance(self, ABCSeries) and is_any_series():

            # we have a dict of Series
            # return a MI Series
            try:
                result = concat(result)
            except TypeError:
                # we want to give a nice error here if
                # we have non-same sized objects, so
                # we don't automatically broadcast

                raise ValueError("cannot perform both aggregation "
                                 "and transformation operations "
                                 "simultaneously")

            return result, True

        # fall thru
        from pandas import DataFrame, Series
        try:
            result = DataFrame(result)
        except ValueError:

            # we have a dict of scalars
            result = Series(result,
                            name=getattr(self, 'name', None))

        return result, True
    elif is_list_like(arg) and arg not in compat.string_types:
        # we require a list, but not an 'str'
        # NOTE(review): strings already returned above via isinstance, and
        # this tests membership of ``arg`` in compat.string_types rather
        # than its type -- confirm the intent of the second condition
        return self._aggregate_multiple_funcs(arg,
                                              _level=_level,
                                              _axis=_axis), None
    else:
        result = None

    # only reached for non-string, non-dict, non-list-like args
    f = self._is_cython_func(arg)
    if f and not args and not kwargs:
        return getattr(self, f)(), None

    # caller can react
    return result, True
import numpy as np

from pandas import compat
from pandas.util._decorators import cache_readonly
import pandas.util.testing as tm
import pandas as pd

# Raw data used to build the shared module-level frames below.
_seriesd = tm.getSeriesData()
_tsd = tm.getTimeSeriesData()

_frame = pd.DataFrame(_seriesd)
_frame2 = pd.DataFrame(_seriesd, columns=['D', 'C', 'B', 'A'])
_intframe = pd.DataFrame({label: values.astype(int)
                          for label, values in compat.iteritems(_seriesd)})

_tsframe = pd.DataFrame(_tsd)

# copy of _frame with an additional constant string column
_mixed_frame = _frame.copy()
_mixed_frame['foo'] = 'bar'


class TestData(object):
    """Cached accessors handing out copies of the module-level frames."""

    @cache_readonly
    def frame(self):
        """Copy of ``_frame``."""
        return _frame.copy()

    @cache_readonly
    def frame2(self):
        """Copy of ``_frame2`` (columns ordered D, C, B, A)."""
        return _frame2.copy()
def test_iteritems_names(self):
    """Each column yielded by iteritems carries its key as its name."""
    for label, column in compat.iteritems(self.mixed_frame):
        assert column.name == label
class Resolution(object):
    """
    Lookup tables and classmethods translating between resolution codes,
    resolution names (e.g. 'second') and frequency strings (e.g. 'S').
    """

    RESO_US = RESO_US
    RESO_MS = RESO_MS
    RESO_SEC = RESO_SEC
    RESO_MIN = RESO_MIN
    RESO_HR = RESO_HR
    RESO_DAY = RESO_DAY

    # resolution code -> resolution name
    _reso_str_map = {
        RESO_NS: 'nanosecond',
        RESO_US: 'microsecond',
        RESO_MS: 'millisecond',
        RESO_SEC: 'second',
        RESO_MIN: 'minute',
        RESO_HR: 'hour',
        RESO_DAY: 'day'
    }

    # factor to multiply a value by to convert it to the next finer grained
    # resolution
    _reso_mult_map = {
        RESO_NS: None,
        RESO_US: 1000,
        RESO_MS: 1000,
        RESO_SEC: 1000,
        RESO_MIN: 60,
        RESO_HR: 60,
        RESO_DAY: 24
    }

    # frequency string -> next finer grained frequency string
    _reso_str_bump_map = {
        'D': 'H',
        'H': 'T',
        'T': 'S',
        'S': 'L',
        'L': 'U',
        'U': 'N',
        'N': None
    }

    # resolution name -> resolution code (inverse of _reso_str_map)
    _str_reso_map = {name: code
                     for code, name in compat.iteritems(_reso_str_map)}

    # resolution name -> frequency string
    _reso_freq_map = {
        'year': 'A',
        'quarter': 'Q',
        'month': 'M',
        'day': 'D',
        'hour': 'H',
        'minute': 'T',
        'second': 'S',
        'millisecond': 'L',
        'microsecond': 'U',
        'nanosecond': 'N'
    }

    # frequency string -> resolution name (inverse of _reso_freq_map)
    _freq_reso_map = {freq: name
                      for name, freq in compat.iteritems(_reso_freq_map)}

    @classmethod
    def get_str(cls, reso):
        """
        Return the resolution name for a resolution code.

        Codes that are not in the table map to 'day'.

        Example
        -------
        >>> Resolution.get_str(Resolution.RESO_SEC)
        'second'
        """
        return cls._reso_str_map.get(reso, 'day')

    @classmethod
    def get_reso(cls, resostr):
        """
        Return the resolution code for a resolution name.

        Names that are not in the table map to ``RESO_DAY``.

        Example
        -------
        >>> Resolution.get_reso('second') == Resolution.RESO_SEC
        True
        """
        return cls._str_reso_map.get(resostr, cls.RESO_DAY)

    @classmethod
    def get_freq_group(cls, resostr):
        """
        Return the frequency group for a resolution name.

        The name is first converted with ``get_freq`` and the result is
        passed to the module-level ``get_freq_group``.
        """
        freq = cls.get_freq(resostr)
        return get_freq_group(freq)

    @classmethod
    def get_freq(cls, resostr):
        """
        Return the frequency string for a resolution name.

        Raises KeyError for names that are not in the table.

        Example
        -------
        >>> f.Resolution.get_freq('day')
        'D'
        """
        return cls._reso_freq_map[resostr]

    @classmethod
    def get_str_from_freq(cls, freq):
        """
        Return the resolution name for a frequency string.

        Frequency strings that are not in the table map to 'day'.

        Example
        -------
        >>> Resolution.get_str_from_freq('H')
        'hour'
        """
        return cls._freq_reso_map.get(freq, 'day')

    @classmethod
    def get_reso_from_freq(cls, freq):
        """
        Return the resolution code for a frequency string.

        Example
        -------
        >>> Resolution.get_reso_from_freq('H') == Resolution.RESO_HR
        True
        """
        resostr = cls.get_str_from_freq(freq)
        return cls.get_reso(resostr)

    @classmethod
    def get_stride_from_decimal(cls, value, freq):
        """
        Convert freq with decimal stride into a higher freq with integer
        stride

        Parameters
        ----------
        value : integer or float
        freq : string
            Frequency string

        Returns
        -------
        tuple of (int, string)
            The integer stride and the frequency string it applies to.

        Raises
        ------
        ValueError
            If the float cannot be converted to an integer at any
            resolution.

        Example
        -------
        >>> Resolution.get_stride_from_decimal(1.5, 'T')
        (90, 'S')

        >>> Resolution.get_stride_from_decimal(1.04, 'H')
        (3744, 'S')

        >>> Resolution.get_stride_from_decimal(1, 'D')
        (1, 'D')
        """
        # already (numerically) a whole number: nothing to convert
        if np.isclose(value % 1, 0):
            return int(value), freq

        start_reso = cls.get_reso_from_freq(freq)
        if start_reso == 0:
            raise ValueError(
                "Could not convert to integer offset at any resolution")

        # rescale to the next finer resolution and try again
        finer_value = cls._reso_mult_map[start_reso] * value
        finer_freq = cls._reso_str_bump_map[freq]
        return cls.get_stride_from_decimal(finer_value, finer_freq)
def test_iteritems(self):
    """iteritems yields the sliced type even with duplicate labels."""
    frame = self.klass([[1, 2, 3], [4, 5, 6]], columns=['a', 'a', 'b'])
    sliced_type = self.klass._constructor_sliced
    for _label, column in compat.iteritems(frame):
        assert isinstance(column, sliced_type)
def json_normalize(data, record_path=None, meta=None,
                   meta_prefix=None,
                   record_prefix=None,
                   errors='raise',
                   sep='.'):
    """
    "Normalize" semi-structured JSON data into a flat table

    Parameters
    ----------
    data : dict or list of dicts
        Unserialized JSON objects
    record_path : string or list of strings, default None
        Path in each object to list of records. If not passed, data will be
        assumed to be an array of records
    meta : list of paths (string or list of strings), default None
        Fields to use as metadata for each record in resulting table.
        The passed list is not modified.
    record_prefix : string, default None
        If not None, this string is prepended to the name of every column
        built from the records, e.g. 'foo.bar.' gives foo.bar.field
    meta_prefix : string, default None
        If not None, this string is prepended to the name of every
        metadata column
    errors : {'raise', 'ignore'}, default 'raise'

        * 'ignore' : will ignore KeyError if keys listed in meta are not
          always present
        * 'raise' : will raise KeyError if keys listed in meta are not
          always present

        .. versionadded:: 0.20.0

    sep : string, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar

        .. versionadded:: 0.20.0


    Returns
    -------
    frame : DataFrame

    Raises
    ------
    KeyError
        If a key listed in meta is missing from a record and
        errors is not 'ignore'.
    ValueError
        If a (prefixed) metadata name collides with a record column.

    Examples
    --------

    >>> data = [{'state': 'Florida',
    ...          'shortname': 'FL',
    ...          'info': {
    ...               'governor': 'Rick Scott'
    ...          },
    ...          'counties': [{'name': 'Dade', 'population': 12345},
    ...                      {'name': 'Broward', 'population': 40000},
    ...                      {'name': 'Palm Beach', 'population': 60000}]},
    ...         {'state': 'Ohio',
    ...          'shortname': 'OH',
    ...          'info': {
    ...               'governor': 'John Kasich'
    ...          },
    ...          'counties': [{'name': 'Summit', 'population': 1234},
    ...                       {'name': 'Cuyahoga', 'population': 1337}]}]
    >>> from pandas.io.json import json_normalize
    >>> result = json_normalize(data, 'counties', ['state', 'shortname',
    ...                                           ['info', 'governor']])
    >>> result
             name  population info.governor    state shortname
    0        Dade       12345    Rick Scott  Florida        FL
    1     Broward       40000    Rick Scott  Florida        FL
    2  Palm Beach       60000    Rick Scott  Florida        FL
    3      Summit        1234   John Kasich     Ohio        OH
    4    Cuyahoga        1337   John Kasich     Ohio        OH

    """
    def _pull_field(js, spec):
        # follow a single key, or a list of keys, down into js
        result = js
        if isinstance(spec, list):
            for field in spec:
                result = result[field]
        else:
            result = result[spec]

        return result

    if isinstance(data, list) and len(data) == 0:
        return DataFrame()

    # A bit of a hackjob
    if isinstance(data, dict):
        data = [data]

    if record_path is None:
        if any(isinstance(x, dict) for x in compat.itervalues(data[0])):
            # naive normalization, this is idempotent for flat records
            # and potentially will inflate the data considerably for
            # deeply nested structures:
            #  {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@}
            #
            # TODO: handle record value which are lists, at least error
            #       reasonably
            data = nested_to_record(data, sep=sep)
        return DataFrame(data)
    elif not isinstance(record_path, list):
        record_path = [record_path]

    if meta is None:
        meta = []
    elif not isinstance(meta, list):
        meta = [meta]

    # normalize every meta entry to a list-path; build a new list so the
    # caller's ``meta`` argument is left untouched
    meta = [m if isinstance(m, list) else [m] for m in meta]

    # Disastrously inefficient for now
    records = []
    lengths = []

    meta_vals = defaultdict(list)
    if not isinstance(sep, compat.string_types):
        sep = str(sep)
    meta_keys = [sep.join(val) for val in meta]

    def _recursive_extract(data, path, seen_meta, level=0):
        if len(path) > 1:
            for obj in data:
                # remember metadata that lives at this nesting level so
                # it can be attached to the records found further down
                for val, key in zip(meta, meta_keys):
                    if level + 1 == len(val):
                        seen_meta[key] = _pull_field(obj, val[-1])

                _recursive_extract(obj[path[0]], path[1:],
                                   seen_meta, level=level + 1)
        else:
            for obj in data:
                recs = _pull_field(obj, path[0])

                # For repeating the metadata later
                lengths.append(len(recs))

                for val, key in zip(meta, meta_keys):
                    if level + 1 > len(val):
                        meta_val = seen_meta[key]
                    else:
                        try:
                            meta_val = _pull_field(obj, val[level:])
                        except KeyError as e:
                            if errors == 'ignore':
                                meta_val = np.nan
                            else:
                                # interpolate the missing key into the
                                # message instead of passing it as a
                                # second exception argument
                                raise KeyError("Try running with "
                                               "errors='ignore' as key "
                                               "%s is not always present"
                                               % e)
                    meta_vals[key].append(meta_val)

                records.extend(recs)

    _recursive_extract(data, record_path, {}, level=0)

    result = DataFrame(records)

    if record_prefix is not None:
        result.rename(columns=lambda x: record_prefix + x, inplace=True)

    # Data types, a problem
    for k, v in compat.iteritems(meta_vals):
        if meta_prefix is not None:
            k = meta_prefix + k

        if k in result:
            raise ValueError('Conflicting metadata name %s, '
                             'need distinguishing prefix ' % k)

        # repeat each metadata value once per record it belongs to
        result[k] = np.array(v).repeat(lengths)

    return result
def test_combineSeries(self):
    """Frame/Series arithmetic: row-wise add, upcasting, add on index."""

    # Series
    row = self.frame.xs(self.frame.index[0])

    summed = self.frame + row

    for label, column in compat.iteritems(summed):
        assert_series_equal(column, self.frame[label] + row[label])

    # a Series with one extra label ('E') that the frame does not have
    wider_row = row.to_dict()
    wider_row['E'] = 1
    wider_row = Series(wider_row)
    wider_sum = self.frame + wider_row

    for label, column in compat.iteritems(self.frame):
        assert_series_equal(wider_sum[label], column + row[label])
    self.assertIn('E', wider_sum)
    self.assertTrue(np.isnan(wider_sum['E']).all())

    # vs mix (upcast) as needed
    summed = self.mixed_float + row
    _check_mixed_float(summed, dtype='float64')
    summed = self.mixed_float + row.astype('float32')
    _check_mixed_float(summed, dtype=dict(C=None))
    summed = self.mixed_float + row.astype('float16')
    _check_mixed_float(summed, dtype=dict(C=None))

    # these raise with numexpr.....as we are adding an int64 to an
    # uint64....weird vs int

    # added = self.mixed_int + (100*series).astype('int64')
    # _check_mixed_int(added, dtype = dict(A = 'int64', B = 'float64', C =
    # 'int64', D = 'int64'))
    # added = self.mixed_int + (100*series).astype('int32')
    # _check_mixed_int(added, dtype = dict(A = 'int32', B = 'float64', C =
    # 'int32', D = 'int64'))

    # TimeSeries
    ts = self.tsframe['A']

    # 10890
    # we no longer allow auto timeseries broadcasting
    # and require explict broadcasting
    summed = self.tsframe.add(ts, axis='index')

    for label, column in compat.iteritems(self.tsframe):
        column_sum = column + ts
        assert_series_equal(summed[label], column_sum, check_names=False)
        self.assertEqual(summed[label].name, label)
        if column.name == ts.name:
            self.assertEqual(column_sum.name, 'A')
        else:
            self.assertTrue(column_sum.name is None)

    # a shorter frame plus the full series covers the full index
    short_frame = self.tsframe[:-5]
    short_frame_sum = short_frame.add(ts, axis='index')

    self.assertTrue(short_frame_sum.index.equals(self.tsframe.index))

    # shortening the series instead gives the same result
    short_ts = ts[:-5]
    short_ts_sum = self.tsframe.add(short_ts, axis='index')
    assert_frame_equal(short_frame_sum, short_ts_sum)

    # length 0, result is all-nan
    outcome = self.tsframe.add(ts[:0], axis='index')
    all_nan = DataFrame(np.nan, index=self.tsframe.index,
                        columns=self.tsframe.columns)
    assert_frame_equal(outcome, all_nan)

    # Frame is all-nan
    outcome = self.tsframe[:0].add(ts, axis='index')
    all_nan = DataFrame(np.nan, index=self.tsframe.index,
                        columns=self.tsframe.columns)
    assert_frame_equal(outcome, all_nan)

    # empty but with non-empty index
    no_columns = self.tsframe[:1].reindex(columns=[])
    outcome = no_columns.mul(ts, axis='index')
    self.assertEqual(len(outcome), len(ts))
def test_constructor_subclass_dict(self):
    """A dict subclass builds the same series as the equivalent dict."""
    sub_dict = tm.TestSubDict((x, 10.0 * x) for x in range(10))
    from_subclass = self.series_klass(sub_dict)

    plain_dict = dict(compat.iteritems(sub_dict))
    from_plain = self.series_klass(plain_dict)

    self._assert_series_equal(from_subclass, from_plain)
'Min': 'T', 'min': 'T', 'ms': 'L', 'us': 'U' } #TODO: Can this be killed? for _i, _weekday in enumerate(['MON', 'TUE', 'WED', 'THU', 'FRI']): for _iweek in range(4): _name = 'WOM-%d%s' % (_iweek + 1, _weekday) _rule_aliases[_name.replace('-', '@')] = _name # Note that _rule_aliases is not 1:1 (d[BA]==d[A@DEC]), and so traversal # order matters when constructing an inverse. we pick one. #2331 _legacy_reverse_map = dict( (v, k) for k, v in reversed(sorted(compat.iteritems(_rule_aliases)))) def to_offset(freqstr): """ Return DateOffset object from string representation Examples -------- >>> to_offset('5Min') Minute(5) """ if freqstr is None: return None if isinstance(freqstr, DateOffset):
def _aggregate(self, arg, *args, **kwargs):
    """
    provide an implementation for the aggregators

    Dispatches on the type of ``arg``:

    * string : the attribute of that name is looked up on ``self`` and
      called with ``*args, **kwargs``
    * dict : each entry is aggregated separately and the pieces are
      combined into a single object
    * any other object with ``__iter__`` : delegated to
      ``self._aggregate_multiple_funcs``
    * anything else : ``self._is_cython_func(arg)`` is consulted; when it
      returns a name and no extra args/kwargs were passed, the method of
      that name is called on ``self``

    Parameters
    ----------
    arg : string, dict, function
    *args : args to pass on to the function
    **kwargs : kwargs to pass on to the function
        The private key ``_level`` is popped from ``kwargs`` before
        anything is forwarded.

    Returns
    -------
    tuple of result, how

    Raises
    ------
    ValueError
        If a dict is passed and ``self.axis`` is not 0.
    SpecificationError
        If a nested dict is keyed by something that is not a column of the
        selected object, or if a nested dict selects a non-1-dim object.

    Notes
    -----
    how can be a string describe the required post-processing, or
    None if not required

    In this implementation the second element of the returned tuple is
    always either None or True. ``(None, True)`` is returned when nothing
    here handled ``arg``; the caller is expected to react to that.
    """
    is_aggregator = lambda x: isinstance(x, (list, tuple, dict))
    is_nested_renamer = False

    # nesting depth; incremented by _agg_1dim for each nested aggregate call
    _level = kwargs.pop('_level', None)
    if isinstance(arg, compat.string_types):
        return getattr(self, arg)(*args, **kwargs), None

    if isinstance(arg, dict):

        # aggregate based on the passed dict
        if self.axis != 0:  # pragma: no cover
            raise ValueError('Can only pass dict with axis=0')

        obj = self._selected_obj

        # if we have a dict of any non-scalars
        # eg. {'A' : ['mean']}, normalize all to
        # be list-likes
        if any(is_aggregator(x) for x in compat.itervalues(arg)):
            new_arg = compat.OrderedDict()
            for k, v in compat.iteritems(arg):
                if not isinstance(v, (tuple, list, dict)):
                    new_arg[k] = [v]
                else:
                    new_arg[k] = v

                # the keys must be in the columns
                # for ndim=2, or renamers for ndim=1

                # ok
                # {'A': { 'ra': 'mean' }}
                # {'A': { 'ra': ['mean'] }}
                # {'ra': ['mean']}

                # not ok
                # {'ra' : { 'A' : 'mean' }}
                if isinstance(v, dict):
                    is_nested_renamer = True

                    if k not in obj.columns:
                        raise SpecificationError('cannot perform renaming '
                                                 'for {0} with a nested '
                                                 'dictionary'.format(k))

            arg = new_arg

        from pandas.tools.concat import concat

        def _agg_1dim(name, how, subset=None):
            """
            aggregate a 1-dim with how
            """
            colg = self._gotitem(name, ndim=1, subset=subset)
            if colg.ndim != 1:
                raise SpecificationError("nested dictionary is ambiguous "
                                         "in aggregation")
            return colg.aggregate(how, _level=(_level or 0) + 1)

        def _agg_2dim(name, how):
            """
            aggregate a 2-dim with how
            """
            # note: ``name`` is not used; the current selection is
            # re-selected from ``obj`` instead
            colg = self._gotitem(self._selection, ndim=2,
                                 subset=obj)
            return colg.aggregate(how, _level=None)

        def _agg(arg, func):
            """
            run the aggregations over the arg with func
            return an OrderedDict
            """
            result = compat.OrderedDict()
            for fname, agg_how in compat.iteritems(arg):
                result[fname] = func(fname, agg_how)
            return result

        # set the final keys
        keys = list(compat.iterkeys(arg))
        result = compat.OrderedDict()

        # nested renamer
        if is_nested_renamer:
            result = list(_agg(arg, _agg_1dim).values())

            if all(isinstance(r, dict) for r in result):

                # every per-key result is itself a dict: flatten them
                # into one OrderedDict and take the keys from that
                result, results = compat.OrderedDict(), result
                for r in results:
                    result.update(r)
                keys = list(compat.iterkeys(result))

            else:

                # result stays a list here; with a selection in place
                # the list is concatenated without keys
                if self._selection is not None:
                    keys = None

        # some selection on the object
        elif self._selection is not None:

            sl = set(self._selection_list)

            # we are a Series like object,
            # but may have multiple aggregations
            if len(sl) == 1:

                result = _agg(arg, lambda fname,
                              agg_how: _agg_1dim(self._selection, agg_how))

            # we are selecting the same set as we are aggregating
            elif not len(sl - set(compat.iterkeys(arg))):

                result = _agg(arg, _agg_1dim)

            # we are a DataFrame, with possibly multiple aggregations
            else:

                result = _agg(arg, _agg_2dim)

        # no selection
        else:

            try:
                result = _agg(arg, _agg_1dim)
            except SpecificationError:

                # we are aggregating expecting all 1d-returns
                # but we have 2d
                result = _agg(arg, _agg_2dim)

        # combine results
        if isinstance(result, list):
            result = concat(result, keys=keys, axis=1)
        # only the first value is inspected to decide whether the
        # results are DataFrames
        # NOTE(review): an empty ``result`` dict would make the [0]
        # lookup fail -- confirm callers never pass an empty dict
        elif isinstance(list(compat.itervalues(result))[0],
                        ABCDataFrame):
            result = concat([result[k] for k in keys], keys=keys, axis=1)
        else:
            from pandas import DataFrame
            result = DataFrame(result)

        return result, True
    elif hasattr(arg, '__iter__'):
        return self._aggregate_multiple_funcs(arg, _level=_level), None
    else:
        result = None

    # only reached for args that are not strings, dicts or iterables
    cy_func = self._is_cython_func(arg)
    if cy_func and not args and not kwargs:
        return getattr(self, cy_func)(), None

    # caller can react
    return result, True
def apply(self, func, axis=0, broadcast=None, reduce=None,
          result_type=None):
    """
    Analogous to DataFrame.apply, for SparseDataFrame

    A frame without columns is returned unchanged. A numpy ufunc is
    applied column by column, and also to each column's fill value; any
    other function is handed to ``frame_apply``.

    Parameters
    ----------
    func : function
        Function to apply to each column
    axis : {0, 1, 'index', 'columns'}
    broadcast : bool, default False
        For aggregation functions, return object of same size with values
        propagated

        .. deprecated:: 0.23.0
           This argument will be removed in a future version, replaced
           by result_type='broadcast'.

    reduce : boolean or None, default None
        Try to apply reduction procedures. If the DataFrame is empty,
        apply will use reduce to determine whether the result should be a
        Series or a DataFrame. If reduce is None (the default), apply's
        return value will be guessed by calling func an empty Series (note:
        while guessing, exceptions raised by func will be ignored). If
        reduce is True a Series will always be returned, and if False a
        DataFrame will always be returned.

        .. deprecated:: 0.23.0
           This argument will be removed in a future version, replaced
           by result_type='reduce'.

    result_type : {'expand', 'reduce', 'broadcast', None}
        These only act when axis=1 {columns}:

        * 'expand' : list-like results will be turned into columns.
        * 'reduce' : return a Series if possible rather than expanding
          list-like results. This is the opposite to 'expand'.
        * 'broadcast' : results will be broadcast to the original shape
          of the frame, the original index & columns will be retained.

        The default behaviour (None) depends on the return value of the
        applied function: list-like results will be returned as a Series
        of those. However if the apply function returns a Series these
        are expanded to columns.

        .. versionadded:: 0.23.0

    Returns
    -------
    applied : Series or SparseDataFrame
    """
    # nothing to apply over
    if len(self.columns) == 0:
        return self

    axis = self._get_axis_number(axis)

    if not isinstance(func, np.ufunc):
        from pandas.core.apply import frame_apply
        applier = frame_apply(self,
                              func=func,
                              axis=axis,
                              reduce=reduce,
                              broadcast=broadcast,
                              result_type=result_type)
        return applier.get_result()

    # ufunc: transform each column and its fill value alike
    transformed = {}
    for label, column in compat.iteritems(self):
        out = func(column)
        out.fill_value = func(column.fill_value)
        transformed[label] = out

    rebuilt = self._constructor(
        transformed, index=self.index, columns=self.columns,
        default_fill_value=self._default_fill_value,
        default_kind=self._default_kind)
    return rebuilt.__finalize__(self)
def test_constructor_subclass_dict(self):
    """A dict subclass builds the same Series as the equivalent dict."""
    sub_dict = tm.TestSubDict((x, 10.0 * x) for x in range(10))
    from_subclass = Series(sub_dict)

    plain_dict = dict(compat.iteritems(sub_dict))
    from_plain = Series(plain_dict)

    assert_series_equal(from_plain, from_subclass)
def test_iterkv_names(self):
    """Each column yielded by iteritems carries its key as its name."""
    for label, column in compat.iteritems(self.mixed_frame):
        self.assertEqual(column.name, label)
def data(self, convert_dates=True, convert_categoricals=True, index=None):
    """
    Reads observations from Stata file, converting them into a dataframe

    Parameters
    ----------
    convert_dates : boolean, defaults to True
        Convert date variables to DataFrame time values
    convert_categoricals : boolean, defaults to True
        Read value labels and convert columns to Categorical/Factor
        variables
    index : identifier of index column
        identifier of column that should be used as index of the DataFrame

    Returns
    -------
    y : DataFrame instance

    Raises
    ------
    Exception
        If the data has already been read from this reader.
    """
    if self._data_read:
        raise Exception("Data has already been read.")
    self._data_read = True

    rows = []
    for line in self._dataset():
        # doesn't handle missing value objects, just casts
        # None will only work without missing value object.
        for pos, val in enumerate(line):
            # NOTE: This will only be scalar types because missing strings
            # are empty not None in Stata
            if val is None:
                line[pos] = np.nan
        rows.append(tuple(line))

    if convert_categoricals:
        self._read_value_labels()

    frame = DataFrame(rows, columns=self.varlist, index=index)

    # re-cast every non-object column that has a truthy dtype entry
    for i in np.where(self.dtyplist)[0]:
        if self.dtyplist[i] is None:
            continue
        col = frame.columns[i]
        if frame[col].dtype is not np.dtype(object):
            frame[col] = Series(frame[col], frame[col].index,
                                self.dtyplist[i])

    if convert_dates:
        is_date = [fmt in _date_formats for fmt in self.fmtlist]
        for i in np.where(is_date)[0]:
            col = frame.columns[i]
            frame[col] = frame[col].apply(_stata_elapsed_date_to_datetime,
                                          args=(self.fmtlist[i],))

    if convert_categoricals:
        is_labelled = [lbl in compat.iterkeys(self.value_label_dict)
                       for lbl in self.lbllist]
        for i in np.where(is_labelled)[0]:
            col = frame.columns[i]
            # work on an object copy so values can be swapped for labels
            labelled = np.copy(frame[col]).astype(object)
            labels = self.value_label_dict[self.lbllist[i]]
            for raw, label in compat.iteritems(labels):
                labelled[(frame[col] == raw).values] = label
            frame[col] = Categorical.from_array(labelled)

    return frame
def _check_cast(df, v):
    """Check the first distinct column dtype name of ``df`` against ``v``.

    ``self`` is taken from the enclosing scope.
    """
    dtype_names = set([col.dtype.name
                       for _label, col in compat.iteritems(df)])
    first_name = list(dtype_names)[0]
    self.assertEqual(first_name, v)
def test_deepcopy(self):
    """Writing into a deep copy leaves the original frame untouched."""
    copied = deepcopy(self.frame)
    copied_col = copied['A']
    copied_col[:] = 10

    for label, new_value in compat.iteritems(copied_col):
        assert self.frame['A'][label] != new_value
def _check_cast(df, v):
    """Check the first distinct column dtype name of ``df`` against ``v``."""
    dtype_names = set([col.dtype.name
                       for _label, col in compat.iteritems(df)])
    first_name = list(dtype_names)[0]
    assert (first_name == v)
def test_series_put_names(self):
    """Each entry of the frame's ``_series`` is named after its key."""
    columns = self.mixed_frame._series
    for label, column in compat.iteritems(columns):
        assert column.name == label
def test_deepcopy(self):
    """Writing into a deep copy leaves the original frame untouched."""
    copied = deepcopy(self.frame)
    copied_col = copied['A']
    copied_col[:] = 10

    for label, new_value in compat.iteritems(copied_col):
        self.assertNotEqual(self.frame['A'][label], new_value)
def test_iteritems(self):
    """iteritems yields plain Series even with duplicate column labels."""
    frame = DataFrame([[1, 2, 3], [4, 5, 6]], columns=['a', 'a', 'b'])
    for _label, column in compat.iteritems(frame):
        self.assertEqual(type(column), Series)
def _zip_axes_from_type(typ, new_axes):
    """Map each axis name of ``typ`` to the matching entry of ``new_axes``.

    ``typ._AXIS_NAMES`` supplies (axis index, axis name) pairs; the index
    is used to look up the value in ``new_axes``.
    """
    return {ax_name: new_axes[ax_ind]
            for ax_ind, ax_name in compat.iteritems(typ._AXIS_NAMES)}
def test_to_dict(self):
    """Round-trip a nested dict through to_dict for several orients."""
    test_data = {
        'A': {'1': 1, '2': 2},
        'B': {'1': '1', '2': '2', '3': '3'},
    }

    # default orient: looked up as [column][index]
    as_dict = DataFrame(test_data).to_dict()
    for col, cells in compat.iteritems(test_data):
        for row, cell in compat.iteritems(cells):
            self.assertEqual(cell, as_dict[col][row])

    # "l": looked up as [column][position]
    as_lists = DataFrame(test_data).to_dict("l")
    for col, cells in compat.iteritems(test_data):
        for row, cell in compat.iteritems(cells):
            self.assertEqual(cell, as_lists[col][int(row) - 1])

    # "s": looked up as [column][index]
    as_series = DataFrame(test_data).to_dict("s")
    for col, cells in compat.iteritems(test_data):
        for row, cell in compat.iteritems(cells):
            self.assertEqual(cell, as_series[col][row])

    # "sp": columns / index / data
    as_split = DataFrame(test_data).to_dict("sp")
    expected_split = {'columns': ['A', 'B'], 'index': ['1', '2', '3'],
                      'data': [[1.0, '1'], [2.0, '2'], [nan, '3']]}
    tm.assert_almost_equal(as_split, expected_split)

    # "r": one dict per row
    as_records = DataFrame(test_data).to_dict("r")
    expected_records = [{'A': 1.0, 'B': '1'},
                        {'A': 2.0, 'B': '2'},
                        {'A': nan, 'B': '3'}]
    tm.assert_almost_equal(as_records, expected_records)

    # GH10844
    # "i": looked up as [index][column]
    by_index = DataFrame(test_data).to_dict("i")
    for col, cells in compat.iteritems(test_data):
        for row, cell in compat.iteritems(cells):
            self.assertEqual(cell, by_index[row][col])
def test_series_put_names(self):
    """Each entry of the frame's ``_series`` is named after its key."""
    columns = self.mixed_frame._series
    for label, column in compat.iteritems(columns):
        self.assertEqual(column.name, label)
def assert_array_dicts_equal(left, right):
    """Assert every array in ``left`` equals the same-keyed one in ``right``.

    Only the keys of ``left`` are checked.
    """
    for key, left_values in compat.iteritems(left):
        right_values = right[key]
        assert (np.array_equal(left_values, right_values))
for path in files: if not _should_count_file(path) or path.startswith('test_'): continue full_path = os.path.join(directory, path) counts[full_path] = _get_file_function_lengths(full_path) return counts counts = doit2() # counts = _get_file_function_lengths('pandas/tests/test_series.py') all_counts = [] for k, v in compat.iteritems(counts): all_counts.extend(v) all_counts = np.array(all_counts) fig = plt.figure(figsize=(10, 5)) ax = fig.add_subplot(111) ax.hist(all_counts, bins=100) n = len(all_counts) nmore = (all_counts > 50).sum() ax.set_title('%s function lengths, n=%d' % ('pandas', n)) ax.set_ylabel('N functions') ax.set_xlabel('Function length') ax.text(100, 300, '%.3f%% with > 50 lines' % ((n - nmore) / float(n)), fontsize=18)