class Size(object):

    def setup(self):
        n = 10**5
        offsets = np.random.randint(n, size=n).astype('timedelta64[ns]')
        dates = np.datetime64('now') + offsets
        self.df = DataFrame({'key1': np.random.randint(0, 500, size=n),
                             'key2': np.random.randint(0, 100, size=n),
                             'value1': np.random.randn(n),
                             'value2': np.random.randn(n),
                             'value3': np.random.randn(n),
                             'dates': dates})
        self.draws = Series(np.random.randn(n))
        labels = Series(['foo', 'bar', 'baz', 'qux'] * (n // 4))
        self.cats = labels.astype('category')

    def time_multi_size(self):
        self.df.groupby(['key1', 'key2']).size()

    def time_dt_timegrouper_size(self):
        with warnings.catch_warnings(record=True):
            self.df.groupby(TimeGrouper(key='dates', freq='M')).size()

    def time_category_size(self):
        self.draws.groupby(self.cats).size()
def test_groupby_categorical_no_compress(self):
    data = Series(np.random.randn(9))
    codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
    cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True)

    result = data.groupby(cats).mean()
    exp = data.groupby(codes).mean()
    exp.index = CategoricalIndex(exp.index, categories=cats.categories,
                                 ordered=cats.ordered)
    assert_series_equal(result, exp)

    codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3])
    cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True)

    result = data.groupby(cats).mean()
    exp = data.groupby(codes).mean().reindex(cats.categories)
    exp.index = CategoricalIndex(exp.index, categories=cats.categories,
                                 ordered=cats.ordered)
    assert_series_equal(result, exp)

    cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
                       categories=["a", "b", "c", "d"], ordered=True)
    data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})

    result = data.groupby("b").mean()
    result = result["a"].values
    exp = np.array([1, 2, 4, np.nan])
    tm.assert_numpy_array_equal(result, exp)
class ApplyDictReturn(object):

    def setup(self):
        self.labels = np.arange(1000).repeat(10)
        self.data = Series(np.random.randn(len(self.labels)))

    def time_groupby_apply_dict_return(self):
        self.data.groupby(self.labels).apply(
            lambda x: {'first': x.values[0], 'last': x.values[-1]})
def test_bins_unequal_len():
    # GH3011
    series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4])
    bins = pd.cut(series.dropna().values, 4)

    # len(bins) != len(series) here
    with pytest.raises(ValueError):
        series.groupby(bins).mean()
class TestTimeGrouper(unittest.TestCase):

    def setUp(self):
        self.ts = Series(np.random.randn(1000),
                         index=date_range('1/1/2000', periods=1000))

    def test_apply(self):
        grouper = TimeGrouper('A', label='right', closed='right')
        grouped = self.ts.groupby(grouper)

        f = lambda x: x.order()[-3:]

        applied = grouped.apply(f)
        expected = self.ts.groupby(lambda x: x.year).apply(f)

        applied.index = applied.index.droplevel(0)
        expected.index = expected.index.droplevel(0)
        assert_series_equal(applied, expected)

    def test_count(self):
        self.ts[::3] = np.nan

        grouper = TimeGrouper('A', label='right', closed='right')
        result = self.ts.resample('A', how='count')

        expected = self.ts.groupby(lambda x: x.year).count()
        expected.index = result.index

        assert_series_equal(result, expected)

    def test_numpy_reduction(self):
        result = self.ts.resample('A', how='prod', closed='right')

        expected = self.ts.groupby(lambda x: x.year).agg(np.prod)
        expected.index = result.index

        assert_series_equal(result, expected)

    def test_apply_iteration(self):
        # #2300
        N = 1000
        ind = pd.date_range(start="2000-01-01", freq="D", periods=N)
        df = DataFrame({'open': 1, 'close': 2}, index=ind)
        tg = TimeGrouper('M')

        grouper = tg.get_grouper(df)

        # Errors
        grouped = df.groupby(grouper, group_keys=False)
        f = lambda df: df['close'] / df['open']

        # it works!
        result = grouped.apply(f)
        self.assertTrue(result.index.equals(df.index))
class TestTimeSeriesDuplicates(unittest.TestCase):

    def setUp(self):
        dates = [datetime(2000, 1, 2), datetime(2000, 1, 2),
                 datetime(2000, 1, 2), datetime(2000, 1, 3),
                 datetime(2000, 1, 3), datetime(2000, 1, 3),
                 datetime(2000, 1, 4), datetime(2000, 1, 4),
                 datetime(2000, 1, 4), datetime(2000, 1, 5)]

        self.dups = Series(np.random.randn(len(dates)), index=dates)

    def test_constructor(self):
        self.assert_(isinstance(self.dups, TimeSeries))
        self.assert_(isinstance(self.dups.index, DatetimeIndex))

    def test_is_unique_monotonic(self):
        self.assert_(not self.dups.index.is_unique)

    def test_index_unique(self):
        uniques = self.dups.index.unique()
        self.assert_(uniques.dtype == "M8")  # sanity

    def test_duplicate_dates_indexing(self):
        ts = self.dups

        uniques = ts.index.unique()
        for date in uniques:
            result = ts[date]

            mask = ts.index == date
            total = (ts.index == date).sum()
            expected = ts[mask]
            if total > 1:
                assert_series_equal(result, expected)
            else:
                assert_almost_equal(result, expected[0])

            cp = ts.copy()
            cp[date] = 0
            expected = Series(np.where(mask, 0, ts), index=ts.index)
            assert_series_equal(cp, expected)

        self.assertRaises(KeyError, ts.__getitem__, datetime(2000, 1, 6))
        self.assertRaises(KeyError, ts.__setitem__, datetime(2000, 1, 6), 0)

    def test_groupby_average_dup_values(self):
        result = self.dups.groupby(level=0).mean()
        expected = self.dups.groupby(self.dups.index).mean()
        assert_series_equal(result, expected)
def test_groupby_count_dateparseerror(self):
    dr = date_range(start='1/1/2012', freq='5min', periods=10)

    # BAD Example, datetimes first
    s = Series(np.arange(10), index=[dr, lrange(10)])
    grouped = s.groupby(lambda x: x[1] % 2 == 0)
    result = grouped.count()

    s = Series(np.arange(10), index=[lrange(10), dr])
    grouped = s.groupby(lambda x: x[0] % 2 == 0)
    expected = grouped.count()

    assert_series_equal(result, expected)
def test_filter_against_workaround():
    np.random.seed(0)
    # Series of ints
    s = Series(np.random.randint(0, 100, 1000))
    grouper = s.apply(lambda x: np.round(x, -1))
    grouped = s.groupby(grouper)
    f = lambda x: x.mean() > 10

    old_way = s[grouped.transform(f).astype('bool')]
    new_way = grouped.filter(f)
    tm.assert_series_equal(new_way.sort_values(), old_way.sort_values())

    # Series of floats
    s = 100 * Series(np.random.random(1000))
    grouper = s.apply(lambda x: np.round(x, -1))
    grouped = s.groupby(grouper)
    f = lambda x: x.mean() > 10
    old_way = s[grouped.transform(f).astype('bool')]
    new_way = grouped.filter(f)
    tm.assert_series_equal(new_way.sort_values(), old_way.sort_values())

    # Set up DataFrame of ints, floats, strings.
    from string import ascii_lowercase
    letters = np.array(list(ascii_lowercase))
    N = 1000
    random_letters = letters.take(np.random.randint(0, 26, N))
    df = DataFrame({'ints': Series(np.random.randint(0, 100, N)),
                    'floats': N / 10 * Series(np.random.random(N)),
                    'letters': Series(random_letters)})

    # Group by ints; filter on floats.
    grouped = df.groupby('ints')
    old_way = df[grouped.floats.
                 transform(lambda x: x.mean() > N / 20).astype('bool')]
    new_way = grouped.filter(lambda x: x['floats'].mean() > N / 20)
    tm.assert_frame_equal(new_way, old_way)

    # Group by floats (rounded); filter on strings.
    grouper = df.floats.apply(lambda x: np.round(x, -1))
    grouped = df.groupby(grouper)
    old_way = df[grouped.letters.
                 transform(lambda x: len(x) < N / 10).astype('bool')]
    new_way = grouped.filter(lambda x: len(x.letters) < N / 10)
    tm.assert_frame_equal(new_way, old_way)

    # Group by strings; filter on ints.
    grouped = df.groupby('letters')
    old_way = df[grouped.ints.
                 transform(lambda x: x.mean() > N / 20).astype('bool')]
    new_way = grouped.filter(lambda x: x['ints'].mean() > N / 20)
    tm.assert_frame_equal(new_way, old_way)
def test_count_level_series(self):
    index = MultiIndex(
        levels=[["foo", "bar", "baz"], ["one", "two", "three", "four"]],
        labels=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]])

    s = Series(np.random.randn(len(index)), index=index)

    result = s.count(level=0)
    expected = s.groupby(level=0).count()
    assert_series_equal(result.astype("f8"),
                        expected.reindex(result.index).fillna(0))

    result = s.count(level=1)
    expected = s.groupby(level=1).count()
    assert_series_equal(result.astype("f8"),
                        expected.reindex(result.index).fillna(0))
def date_11():
    from pandas.tseries.offsets import Day, MonthEnd
    now = datetime(2011, 11, 17)
    print(now + 3 * Day())
    print(now + MonthEnd())
    print(now + MonthEnd(2))
    offset = MonthEnd()
    print(offset.rollforward(now))
    print(offset.rollback(now))
    ts = Series(np.random.randn(20),
                index=pd.date_range('1/15/2000', periods=20, freq='4d'))
    print(ts.groupby(offset.rollforward).mean())
    print(ts.resample('M', how='mean'))
def test_groupby_grouper_f_sanity_checked(self):
    dates = date_range('01-Jan-2013', periods=12, freq='MS')
    ts = Series(np.random.randn(12), index=dates)

    # GH3035
    # index.map is used to apply grouper to the index
    # if it fails on the elements, map tries it on the entire index as
    # a sequence. That can yield invalid results that cause trouble
    # down the line.
    # the surprise comes from using key[0:6] rather than str(key)[0:6]
    # when the elements are Timestamp.
    # the result is Index[0:6], very confusing.

    msg = r"Grouper result violates len\(labels\) == len\(data\)"
    with pytest.raises(AssertionError, match=msg):
        ts.groupby(lambda key: key[0:6])
def test_hourly(self):
    rng_hourly = date_range('1/1/1994', periods=(18 * 8760 + 4 * 24),
                            freq='H')
    data_hourly = np.random.randint(100, 350, rng_hourly.size)
    ts_hourly = Series(data_hourly, index=rng_hourly)

    grouped = ts_hourly.groupby(ts_hourly.index.year)
    hoy = grouped.apply(lambda x: x.reset_index(drop=True))
    hoy = hoy.index.droplevel(0).values
    hoy[~isleapyear(ts_hourly.index.year) & (hoy >= 1416)] += 24
    hoy += 1

    annual = pivot_annual(ts_hourly)

    ts_hourly = ts_hourly.astype(float)
    for i in [1, 1416, 1417, 1418, 1439, 1440, 1441, 8784]:
        subset = ts_hourly[hoy == i]
        subset.index = [x.year for x in subset.index]

        result = annual[i].dropna()
        tm.assert_series_equal(result, subset, check_names=False)
        self.assertEqual(result.name, i)

    leaps = ts_hourly[(ts_hourly.index.month == 2) &
                      (ts_hourly.index.day == 29) &
                      (ts_hourly.index.hour == 0)]
    hour = leaps.index.dayofyear[0] * 24 - 23
    leaps.index = leaps.index.year
    leaps.name = 1417
    tm.assert_series_equal(annual[hour].dropna(), leaps)
def test_custom_grouper(self):
    dti = DatetimeIndex(freq="Min", start=datetime(2005, 1, 1),
                        end=datetime(2005, 1, 10))

    data = np.array([1] * len(dti))
    s = Series(data, index=dti)

    b = TimeGrouper(Minute(5))
    g = s.groupby(b)

    # check all cython functions work
    funcs = ["add", "mean", "prod", "ohlc", "min", "max", "var"]
    for f in funcs:
        g._cython_agg_general(f)

    self.assertEquals(g.ngroups, 2593)
    self.assert_(notnull(g.mean()).all())

    # construct expected val
    arr = [5] * 2592
    arr.append(1)
    idx = dti[0:-1:5]
    idx = idx.append(DatetimeIndex([np.datetime64(dti[-1])]))
    expect = Series(arr, index=idx)

    # cython returns float for now
    result = g.agg(np.sum)
    assert_series_equal(result, expect.astype(float))

    data = np.random.rand(len(dti), 10)
    df = DataFrame(data, index=dti)
    r = df.groupby(b).agg(np.sum)

    self.assertEquals(len(r.columns), 10)
    self.assertEquals(len(r.index), 2593)
def _render_dimensional_metric_cell(row_data: pd.Series, metric: Metric):
    """
    Renders a table cell in a metric column for pivoted tables where there are
    two or more dimensions. This function is recursive to traverse
    multi-dimensional indices.

    :param row_data:
        A series containing the value for the metric and its index (for the
        dimension values).
    :param metric:
        A reference to the slicer metric to access the display formatting.
    :return:
        A deep dict in a tree structure with keys matching each dimension
        level. The top level will have keys matching the first level of
        dimension values, and the next level will contain the next level of
        dimension values, for as many index levels as there are. The last
        level will contain the return value of `_format_metric_cell`.
    """
    level = {}

    # Group by the last dimension, drop it, and fill the dict with either the
    # raw metric values or the next level of dicts.
    for key, next_row in row_data.groupby(level=1):
        next_row.reset_index(level=1, drop=True, inplace=True)

        df_key = format_metric_key(metric.key)
        level[key] = _render_dimensional_metric_cell(next_row, metric) \
            if isinstance(next_row.index, pd.MultiIndex) \
            else _format_metric_cell(next_row[df_key], metric)

    return level
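# A minimal sketch (not part of the original module) of the recursion above:
# a MultiIndex Series is peeled one index level at a time into a nested dict.
# The Metric/format_metric_key machinery is stubbed out with a plain to_dict().
import pandas as pd

def nested_dict_from_series(row_data: pd.Series) -> dict:
    level = {}
    for key, next_row in row_data.groupby(level=1):
        next_row = next_row.reset_index(level=1, drop=True)
        level[key] = (nested_dict_from_series(next_row)
                      if isinstance(next_row.index, pd.MultiIndex)
                      else next_row.to_dict())
    return level

idx = pd.MultiIndex.from_product([['clicks'], ['mobile', 'web'], ['US', 'EU']])
print(nested_dict_from_series(pd.Series(range(4), index=idx)))
# {'mobile': {'EU': {'clicks': 1}, 'US': {'clicks': 0}}, 'web': {...}}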
def test_first_last_nth_dtypes(df_mixed_floats):

    df = df_mixed_floats.copy()
    df['E'] = True
    df['F'] = 1

    # tests for first / last / nth
    grouped = df.groupby('A')
    first = grouped.first()
    expected = df.loc[[1, 0], ['B', 'C', 'D', 'E', 'F']]
    expected.index = Index(['bar', 'foo'], name='A')
    expected = expected.sort_index()
    assert_frame_equal(first, expected)

    last = grouped.last()
    expected = df.loc[[5, 7], ['B', 'C', 'D', 'E', 'F']]
    expected.index = Index(['bar', 'foo'], name='A')
    expected = expected.sort_index()
    assert_frame_equal(last, expected)

    nth = grouped.nth(1)
    expected = df.loc[[3, 2], ['B', 'C', 'D', 'E', 'F']]
    expected.index = Index(['bar', 'foo'], name='A')
    expected = expected.sort_index()
    assert_frame_equal(nth, expected)

    # GH 2763, first/last shifting dtypes
    idx = lrange(10)
    idx.append(9)
    s = Series(data=lrange(11), index=idx, name='IntCol')
    assert s.dtype == 'int64'
    f = s.groupby(level=0).first()
    assert f.dtype == 'int64'
def test_custom_grouper(self):
    dti = DatetimeIndex(freq='Min', start=datetime(2005, 1, 1),
                        end=datetime(2005, 1, 10))

    data = np.array([1] * len(dti))
    s = Series(data, index=dti)

    b = TimeGrouper(Minute(5))
    g = s.groupby(b)

    self.assertEquals(g.ngroups, 2593)

    # construct expected val
    arr = [5] * 2592
    arr.append(1)
    idx = dti[0:-1:5]
    idx = idx.append(DatetimeIndex([np.datetime64(dti[-1])]))
    expect = Series(arr, index=idx)

    # cython returns float for now
    result = g.agg(np.sum)
    assert_series_equal(result, expect.astype(float))

    data = np.random.rand(len(dti), 10)
    df = DataFrame(data, index=dti)
    r = df.groupby(b).agg(np.sum)

    self.assertEquals(len(r.columns), 10)
    self.assertEquals(len(r.index), 2593)
def test_nsmallest():
    a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
    b = Series(list('a' * 5 + 'b' * 5))
    gb = a.groupby(b)
    r = gb.nsmallest(3)
    e = Series([1, 2, 3, 0, 4, 6],
               index=MultiIndex.from_arrays([list('aaabbb'),
                                             [0, 4, 1, 6, 7, 8]]))
    tm.assert_series_equal(r, e)

    a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
    gb = a.groupby(b)
    e = Series([0, 1, 1, 0, 1, 2],
               index=MultiIndex.from_arrays([list('aaabbb'),
                                             [4, 1, 0, 9, 8, 7]]))
    tm.assert_series_equal(gb.nsmallest(3, keep='last'), e)
def quarter_plot(x, dates=None, ylabel=None, ax=None):
    """
    Seasonal plot of quarterly data

    Parameters
    ----------
    x : array-like
        Seasonal data to plot. If dates is None, x must be a pandas object
        with a PeriodIndex or DatetimeIndex with a quarterly frequency.
    dates : array-like, optional
        If `x` is not a pandas object, then dates must be supplied.
    ylabel : str, optional
        The label for the y-axis. Will attempt to use the `name` attribute
        of the Series.
    ax : matplotlib.axes, optional
        Existing axes instance.

    Returns
    -------
    matplotlib.Figure
    """
    from pandas import DataFrame

    if dates is None:
        from statsmodels.tools.data import _check_period_index
        _check_period_index(x, freq="Q")
    else:
        from pandas import Series, PeriodIndex
        x = Series(x, index=PeriodIndex(dates, freq="Q"))

    xticklabels = ['q1', 'q2', 'q3', 'q4']
    return seasonal_plot(x.groupby(lambda y: y.quarter), xticklabels,
                         ylabel=ylabel, ax=ax)
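# Hedged usage sketch (assumes this function is the one statsmodels exposes as
# statsmodels.graphics.tsaplots.quarter_plot): a quarterly PeriodIndex series
# is grouped by quarter, drawing one line of yearly values per quarter.
import numpy as np
import pandas as pd
from statsmodels.graphics.tsaplots import quarter_plot

idx = pd.period_range('2000Q1', periods=40, freq='Q')
x = pd.Series(np.random.randn(40).cumsum(), index=idx, name='gdp_growth')
fig = quarter_plot(x)  # same as seasonal_plot(x.groupby(lambda y: y.quarter), ...)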
def test_count_level_series(self):
    index = MultiIndex(levels=[['foo', 'bar', 'baz'],
                               ['one', 'two', 'three', 'four']],
                       labels=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]])

    s = Series(np.random.randn(len(index)), index=index)

    result = s.count(level=0)
    expected = s.groupby(level=0).count()
    assert_series_equal(result.astype('f8'),
                        expected.reindex(result.index).fillna(0))

    result = s.count(level=1)
    expected = s.groupby(level=1).count()
    assert_series_equal(result.astype('f8'),
                        expected.reindex(result.index).fillna(0))
def test_cython_fail_agg(self):
    dr = bdate_range('1/1/2000', periods=50)
    ts = Series(['A', 'B', 'C', 'D', 'E'] * 10, index=dr)

    grouped = ts.groupby(lambda x: x.month)
    summed = grouped.sum()
    expected = grouped.agg(np.sum)
    assert_series_equal(summed, expected)
def test_intercept_builtin_sum():
    s = Series([1., 2., np.nan, 3.])
    grouped = s.groupby([0, 1, 2, 2])

    result = grouped.agg(compat.builtins.sum)
    result2 = grouped.apply(compat.builtins.sum)
    expected = grouped.sum()
    tm.assert_series_equal(result, expected)
    tm.assert_series_equal(result2, expected)
class DateAttributes(object):

    def setup(self):
        rng = date_range('1/1/2000', '12/31/2005', freq='H')
        self.year, self.month, self.day = rng.year, rng.month, rng.day
        self.ts = Series(np.random.randn(len(rng)), index=rng)

    def time_len_groupby_object(self):
        len(self.ts.groupby([self.year, self.month, self.day]))
def date_06():
    dates = pd.DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000',
                              '1/2/2000', '1/3/2000'])
    dup_ts = Series(np.arange(5), index=dates)
    print(dup_ts)
    print(dup_ts.index.is_unique)
    print(dup_ts['1/3/2000'])
    print(dup_ts['1/2/2000'])
    grouped = dup_ts.groupby(level=0)
    print(grouped.mean())
    print(grouped.count())
def test_custom_grouper(self):
    dti = DatetimeIndex(freq='Min', start=datetime(2005, 1, 1),
                        end=datetime(2005, 1, 10))

    data = np.array([1] * len(dti))
    s = Series(data, index=dti)

    b = TimeGrouper(Minute(5))
    g = s.groupby(b)

    # check all cython functions work
    funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var']
    for f in funcs:
        g._cython_agg_general(f)

    b = TimeGrouper(Minute(5), closed='right', label='right')
    g = s.groupby(b)

    # check all cython functions work
    funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var']
    for f in funcs:
        g._cython_agg_general(f)

    self.assertEquals(g.ngroups, 2593)
    self.assert_(notnull(g.mean()).all())

    # construct expected val
    arr = [1] + [5] * 2592
    idx = dti[0:-1:5]
    idx = idx.append(dti[-1:])
    expect = Series(arr, index=idx)

    # cython returns float for now
    result = g.agg(np.sum)
    assert_series_equal(result, expect.astype(float))

    data = np.random.rand(len(dti), 10)
    df = DataFrame(data, index=dti)
    r = df.groupby(b).agg(np.sum)

    self.assertEquals(len(r.columns), 10)
    self.assertEquals(len(r.index), 2593)
def test_downsample_non_unique(self):
    rng = date_range('1/1/2000', '2/29/2000')
    rng2 = rng.repeat(5).values
    ts = Series(np.random.randn(len(rng2)), index=rng2)

    result = ts.resample('M', how='mean')

    expected = ts.groupby(lambda x: x.month).mean()
    self.assertEquals(len(result), 2)
    assert_almost_equal(result[0], expected[1])
    assert_almost_equal(result[1], expected[2])
class Size:

    def setup(self):
        n = 10**5
        offsets = np.random.randint(n, size=n).astype('timedelta64[ns]')
        dates = np.datetime64('now') + offsets
        self.df = DataFrame({'key1': np.random.randint(0, 500, size=n),
                             'key2': np.random.randint(0, 100, size=n),
                             'value1': np.random.randn(n),
                             'value2': np.random.randn(n),
                             'value3': np.random.randn(n),
                             'dates': dates})
        self.draws = Series(np.random.randn(n))
        labels = Series(['foo', 'bar', 'baz', 'qux'] * (n // 4))
        self.cats = labels.astype('category')

    def time_multi_size(self):
        self.df.groupby(['key1', 'key2']).size()

    def time_category_size(self):
        self.draws.groupby(self.cats).size()
def test_groupby_level_with_nas(self, sort):
    # GH 17537
    index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]],
                       labels=[[1, 1, 1, 1, 0, 0, 0, 0],
                               [0, 1, 2, 3, 0, 1, 2, 3]])

    # factorizing doesn't confuse things
    s = Series(np.arange(8.), index=index)
    result = s.groupby(level=0, sort=sort).sum()
    expected = Series([6., 22.], index=[0, 1])
    assert_series_equal(result, expected)

    index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]],
                       labels=[[1, 1, 1, 1, -1, 0, 0, 0],
                               [0, 1, 2, 3, 0, 1, 2, 3]])

    # factorizing doesn't confuse things
    s = Series(np.arange(8.), index=index)
    result = s.groupby(level=0, sort=sort).sum()
    expected = Series([6., 18.], index=[0.0, 1.0])
    assert_series_equal(result, expected)
def slide7():
    from pandas.tseries.offsets import Hour, Minute
    hour = Hour()
    print(hour)
    four_hours = Hour(4)
    print(four_hours)
    print(pd.date_range('1/1/2000', '1/3/2000 23:59', freq='4h'))
    print(Hour(2) + Minute(30))
    print(pd.date_range('1/1/2000', periods=10, freq='1h30min'))

    ts = Series(np.random.randn(4),
                index=pd.date_range('1/1/2000', periods=4, freq='M'))
    print(ts)
    print(ts.shift(2))
    print(ts.shift(-2))
    print('2 M')
    print(ts.shift(2, freq='M'))
    print('3 D')
    print(ts.shift(3, freq='D'))
    print('1 3D')
    print(ts.shift(1, freq='3D'))
    print('1 90T')
    print(ts.shift(1, freq='90T'))

    print('shifting dates with offsets')
    from pandas.tseries.offsets import Day, MonthEnd
    now = datetime(2011, 11, 17)
    print(now + 3 * Day())
    print(now + MonthEnd())
    print(now + MonthEnd(2))
    offset = MonthEnd()
    print(offset)
    print(offset.rollforward(now))
    print(offset.rollback(now))
    ts = Series(np.random.randn(20),
                index=pd.date_range('1/15/2000', periods=20, freq='4d'))
    print(ts.groupby(offset.rollforward).mean())
def slide4():
    dates = [datetime(2011, 1, 2), datetime(2011, 1, 5),
             datetime(2011, 1, 7), datetime(2011, 1, 8),
             datetime(2011, 1, 10), datetime(2011, 1, 12)]
    print('Series sample')
    ts = Series(np.random.randn(6), index=dates)
    print(ts)
    print(type(ts))
    print(ts.index)

    print('arithmetic operations')
    print(ts + ts[::2])
    print(ts.index.dtype)
    stamp = ts.index[2]
    print(stamp)

    print('indexing')
    print(ts[stamp])
    print(ts['1/10/2011'])
    print(ts['20110110'])
    longer_ts = Series(np.random.randn(1000),
                       index=pd.date_range('1/1/2000', periods=1000))
    print('longer timestamp')
    print(longer_ts)
    print(longer_ts['2001'])
    print(longer_ts['2001-05'])

    print('indexing range')
    print(ts[datetime(2011, 1, 7):])
    print(ts['1/6/2011':'1/11/2011'])

    print('truncate')
    print(ts.truncate(after='1/9/2011'))
    dates = pd.date_range('1/1/2000', periods=100, freq='W-WED')
    long_df = DataFrame(np.random.randn(100, 4), index=dates,
                        columns=['Colorado', 'Texas', 'New York', 'Ohio'])
    print(long_df.ix['5-2001'])

    print('duplicate')
    dates = pd.DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000',
                              '1/2/2000', '1/3/2000'])
    dup_ts = Series(np.arange(5), index=dates)
    print(dup_ts)
    print(dup_ts.index.is_unique)
    print(dup_ts['1/3/2000'])
    print(dup_ts['1/2/2000'])
    grouped = dup_ts.groupby(level=0)
    print(grouped.mean())
    print(grouped.count())
def test_custom_grouper(self):
    dti = DatetimeIndex(freq="Min", start=datetime(2005, 1, 1),
                        end=datetime(2005, 1, 10))
    s = Series(np.array([1] * len(dti)), index=dti, dtype="int64")

    b = TimeGrouper(Minute(5))
    g = s.groupby(b)

    # check all cython functions work
    funcs = ["add", "mean", "prod", "ohlc", "min", "max", "var"]
    for f in funcs:
        g._cython_agg_general(f)

    b = TimeGrouper(Minute(5), closed="right", label="right")
    g = s.groupby(b)

    # check all cython functions work
    funcs = ["add", "mean", "prod", "ohlc", "min", "max", "var"]
    for f in funcs:
        g._cython_agg_general(f)

    self.assertEquals(g.ngroups, 2593)
    self.assert_(notnull(g.mean()).all())

    # construct expected val
    arr = [1] + [5] * 2592
    idx = dti[0:-1:5]
    idx = idx.append(dti[-1:])
    expect = Series(arr, index=idx)

    # GH2763 - return input dtype if we can
    result = g.agg(np.sum)
    assert_series_equal(result, expect)

    df = DataFrame(np.random.rand(len(dti), 10), index=dti, dtype="float64")
    r = df.groupby(b).agg(np.sum)

    self.assertEquals(len(r.columns), 10)
    self.assertEquals(len(r.index), 2593)
def test_filter_series():
    s = Series([1, 3, 20, 5, 22, 24, 7])
    expected_odd = Series([1, 3, 5, 7], index=[0, 1, 3, 6])
    expected_even = Series([20, 22, 24], index=[2, 4, 5])
    grouper = s.apply(lambda x: x % 2)
    grouped = s.groupby(grouper)
    tm.assert_series_equal(grouped.filter(lambda x: x.mean() < 10),
                           expected_odd)
    tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 10),
                           expected_even)

    # Test dropna=False.
    tm.assert_series_equal(
        grouped.filter(lambda x: x.mean() < 10, dropna=False),
        expected_odd.reindex(s.index))
    tm.assert_series_equal(
        grouped.filter(lambda x: x.mean() > 10, dropna=False),
        expected_even.reindex(s.index))
def test_multi_iter(self):
    s = Series(np.arange(6))
    k1 = np.array(["a", "a", "a", "b", "b", "b"])
    k2 = np.array(["1", "2", "1", "2", "1", "2"])

    grouped = s.groupby([k1, k2])

    iterated = list(grouped)
    expected = [
        ("a", "1", s[[0, 2]]),
        ("a", "2", s[[1]]),
        ("b", "1", s[[4]]),
        ("b", "2", s[[3, 5]]),
    ]
    for i, ((one, two), three) in enumerate(iterated):
        e1, e2, e3 = expected[i]
        assert e1 == one
        assert e2 == two
        tm.assert_series_equal(three, e3)
def test_groupby_rolling_index_changed(self, func):
    # GH: #36018 nlevels of MultiIndex changed
    ds = Series(
        [1, 2, 2],
        index=pd.MultiIndex.from_tuples(
            [("a", "x"), ("a", "y"), ("c", "z")], names=["1", "2"]),
        name="a",
    )

    result = getattr(ds.groupby(ds).rolling(2), func)()
    expected = Series(
        [np.nan, np.nan, 2.0],
        index=pd.MultiIndex.from_tuples(
            [(1, "a", "x"), (2, "a", "y"), (2, "c", "z")],
            names=["a", "1", "2"]),
        name="a",
    )
    tm.assert_series_equal(result, expected)
def ts_rank(x: pd.Series, d: int or float) -> pd.Series:
    """
    ts_rank(x, d) = time-series rank in the past d days

    :param x:
    :param d:
    :return:
    """
    if isinstance(d, float):
        d = math.floor(d)

    def func(a):
        # double argsort: the first gives the sorting order, the second gives
        # each element's rank; [-1] + 1 is the 1-based rank of the most
        # recent value within the window
        return a.argsort().argsort()[-1] + 1

    if isinstance(x.index, pd.MultiIndex):
        return x.groupby(
            level=1).rolling(d).apply(func).droplevel(0).sort_index()
    else:
        return x.rolling(d).apply(func)
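# Worked example (not from the original source) of the double argsort inside
# ts_rank's func: argsort() gives the order that would sort the window,
# argsort().argsort() gives each element's 0-based rank, so [-1] + 1 is the
# 1-based rank of the most recent value within the window.
import numpy as np

a = np.array([3.0, 1.0, 4.0, 2.0])
print(a.argsort())                    # [1 3 0 2] -> order that would sort a
print(a.argsort().argsort())          # [2 0 3 1] -> rank of each element
print(a.argsort().argsort()[-1] + 1)  # 2 -> the last value, 2.0, is 2nd smallest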
def test_nlargest_mi_grouper():
    # see gh-21411
    npr = np.random.RandomState(123456789)

    dts = date_range("20180101", periods=10)
    iterables = [dts, ["one", "two"]]

    idx = MultiIndex.from_product(iterables, names=["first", "second"])
    s = Series(npr.randn(20), index=idx)

    result = s.groupby("first").nlargest(1)

    exp_idx = MultiIndex.from_tuples(
        [
            (dts[0], dts[0], "one"),
            (dts[1], dts[1], "one"),
            (dts[2], dts[2], "one"),
            (dts[3], dts[3], "two"),
            (dts[4], dts[4], "one"),
            (dts[5], dts[5], "one"),
            (dts[6], dts[6], "one"),
            (dts[7], dts[7], "one"),
            (dts[8], dts[8], "two"),
            (dts[9], dts[9], "one"),
        ],
        names=["first", "first", "second"],
    )
    exp_values = [
        2.2129019979039612,
        1.8417114045748335,
        0.858963679564603,
        1.3759151378258088,
        0.9430284594687134,
        0.5296914208183142,
        0.8318045593815487,
        -0.8476703342910327,
        0.3804446884133735,
        -0.8028845810770998,
    ]

    expected = Series(exp_values, index=exp_idx)
    tm.assert_series_equal(result, expected, check_exact=False, rtol=1e-3)
def build_ror_data(ror_capacity_ds: pd.Series, timestamps: pd.DatetimeIndex,
                   runoff_dataset: xr.Dataset,
                   runoff_points_region_ds: pd.Series) \
        -> Tuple[pd.Series, pd.DataFrame]:
    """
    Compute total ROR capacities (in GW) and inflow (p.u. of capacity) for a
    series of regions.

    Parameters
    ----------
    ror_capacity_ds: pd.Series
        Series containing ROR power (GW) capacity per plant, indexed by the
        region in which the plant is located.
    timestamps: pd.DatetimeIndex
        Time stamps over which the inflows must be computed.
    runoff_dataset: xr.Dataset
        ERA5 runoff dataset
    runoff_points_region_ds: pd.Series
        Indicates in which region each ERA5 point falls.

    Returns
    -------
    ror_capacity_ds: pd.Series
        Series containing ROR power (GW) capacity per region.
    ror_inflows_df: pd.DataFrame
        ROR inflow time-series (p.u. of power capacity) for each region.
    """
    ror_thresholds_fn = f"{data_path}generation/hydro/source/ror_flood_event_thresholds.csv"
    ror_thresholds = pd.read_csv(ror_thresholds_fn, index_col=0)

    ror_capacity_ds = ror_capacity_ds.groupby(ror_capacity_ds.index).sum() * 1e-3

    ror_inflows_df = pd.DataFrame(index=timestamps, columns=ror_capacity_ds.index)
    for region in ror_capacity_ds.index:
        points = runoff_points_region_ds[runoff_points_region_ds == region].index.to_list()
        flood_event_threshold = ror_thresholds.loc[replace_iso2_codes([region[:2]])[0], 'value']
        if points:
            ror_inflows_df[region] = compute_ror_series(runoff_dataset, points,
                                                        flood_event_threshold)
    ror_inflows_df.dropna(axis=1, inplace=True)

    missing_inflows_indexes = ~ror_capacity_ds.index.isin(ror_inflows_df.columns)
    missing_ror = ror_capacity_ds.loc[missing_inflows_indexes].dropna().sum()
    ror_capacity_ds = ror_capacity_ds[ror_inflows_df.columns]

    logger.info(f'ROR capacity factors computed. '
                f'{missing_ror} GW removed because of ERA5 point unavailability in regions.')

    return ror_capacity_ds, ror_inflows_df
def test_series_groupby_plotting_nominally_works(self):
    n = 10
    weight = Series(np.random.normal(166, 20, size=n))
    height = Series(np.random.normal(60, 10, size=n))
    with tm.RNGContext(42):
        gender = np.random.choice(['male', 'female'], size=n)

    weight.groupby(gender).plot()
    tm.close()
    height.groupby(gender).hist()
    tm.close()
    # Regression test for GH8733
    height.groupby(gender).plot(alpha=0.5)
    tm.close()
def compute_ytd_returns(time_series: pd.Series):
    """
    Compute year-to-date returns from a pandas time series

    :param time_series: pandas time series
    :return: pandas time series
    """
    ytd_returns = []
    group_by_object = time_series.groupby(pd.Grouper(freq='A'))
    for group_name, indexes in group_by_object.indices.items():
        start_index = indexes[0] - 1 if indexes[0] > 0 else indexes[0]
        for idx in indexes:
            ytd_return = time_series[idx] / time_series[start_index] - 1.0
            ytd_returns.append(ytd_return)

    result = pd.Series(index=time_series.index, data=ytd_returns)
    assert len(result) == len(time_series)
    return result
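# Minimal usage sketch with made-up daily prices (not from the original
# source). Grouper(freq='A') buckets the DatetimeIndex by calendar year;
# .indices maps each year label to positional row indexes, which the function
# above anchors to the last observation of the prior year.
import numpy as np
import pandas as pd

prices = pd.Series(100.0 + np.arange(730.0),
                   index=pd.date_range('2020-01-01', periods=730, freq='D'))
for year_end, positions in prices.groupby(pd.Grouper(freq='A')).indices.items():
    print(year_end, positions[0], positions[-1])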
def _group_mean(weights, ordered_snapshots):
    w = Series(weights)
    w = w.reindex(ordered_snapshots)
    nstates = 16
    # integer division: each state owns an equal, whole number of snapshots
    # (the assert below guards against a non-divisible snapshot count)
    snapshots_per_state = len(w) // nstates
    state_indices = []
    for i in range(nstates):
        state_indices += [i] * snapshots_per_state
    assert len(state_indices) == len(w)
    g_mean = w.groupby(state_indices).mean()
    states = []
    mean_w = []
    for i, v in g_mean.items():
        states.append(i)
        mean_w.append(v)
    return np.array(states, dtype=int), np.array(mean_w, dtype=float)
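# Minimal demo of the bucketing above (made-up weights; 32 snapshots over 16
# states means 2 snapshots per state, so each mean averages two values).
ordered = ['snap%02d' % i for i in range(32)]
weights = {k: float(i) for i, k in enumerate(ordered)}
states, mean_w = _group_mean(weights, ordered)
print(states[:3], mean_w[:3])  # [0 1 2] [0.5 2.5 4.5]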
def _calibrate_by_index(value_df: pd.DataFrame,
                        calib_series: pd.Series,
                        levels: Union[int, Tuple[int]],
                        ) -> pd.DataFrame:
    if not hasattr(levels, '__len__'):
        levels = (levels, )
    value_factors = value_df.groupby(level=levels).sum()
    calib_sums = calib_series.groupby(level=levels).sum()
    # calib_fractions = value_sums.join(calib_sums, how='outer')
    for col in value_factors:
        value_factors[col] = (calib_sums / value_factors[col]).fillna(1)
    # raise RuntimeError
    with_coefs = value_df.join(
        value_factors,
        on=[name for i, name in enumerate(value_df.index.names)
            if i in levels],
        rsuffix='_coef'
    )
    for col in value_df.columns:
        with_coefs[col] *= with_coefs[col + '_coef']
    return with_coefs[value_df.columns.tolist()]
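# Sketch of the calibration idea on made-up data (not from the original
# source): each 'region' group is rescaled so its column total matches the
# calibration series.
import pandas as pd

idx = pd.MultiIndex.from_tuples([('north', 1), ('north', 2), ('south', 1)],
                                names=['region', 'site'])
values = pd.DataFrame({'pop': [10.0, 30.0, 5.0]}, index=idx)
calib = pd.Series([80.0, 20.0],
                  index=pd.Index(['north', 'south'], name='region'))
print(_calibrate_by_index(values, calib, levels=0).groupby(level=0).sum())
# pop: north 80.0, south 20.0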
def get_feature(x: pd.Series, date):
    x = x.dropna()
    name = x.name
    x = pd.DataFrame(x.to_list(), columns=["road_id", "speed", "clock"])
    x = x[x["speed"] <= 30]
    # round the (hour, minute) pair down to a 10-minute bucket on the given day
    x["clock"] = x.apply(
        lambda row: pd.Timestamp(2019, int(date[0:2]), int(date[2:]),
                                 row["clock"][0],
                                 row["clock"][1] // 10 * 10, 0),
        axis=1)
    ret = x.groupby(by=["road_id", "clock"], as_index=False).agg(
        {"speed": ["mean", lambda x: len(x)]})
    ret.columns = ["road_id", "time", "avg", "count"]
    ret = ret[ret["count"] >= 5]
    ret["is_low"] = (ret["avg"] <= 6).astype(np.float64)
    # ret["is_high"] = (ret["avg"] >= 14).astype(np.float64)
    del ret["count"]
    return ret
def decay_linear(x: pd.Series, d: int) -> pd.Series:
    """
    decay_linear(x, d) = weighted moving average over the past d days with
    linearly decaying weights d, d - 1, ..., 1 (rescaled to sum up to 1)

    :param x:
    :param d:
    :return:
    """
    # todo https://www.joinquant.com/community/post/detailMobile?postId=10674&page=&limit=20&replyId=&tag=
    if isinstance(d, float):
        d = math.floor(d)

    def func(a):
        weights = np.arange(1, d + 1)
        weights = weights / weights.sum()
        return np.nansum(weights * a)

    if isinstance(x.index, pd.MultiIndex):
        return x.groupby(level=1).rolling(d).apply(func)
    else:
        return x.rolling(d).apply(func)
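# Worked example (not from the original source) of the linear-decay weights:
# for d = 4 the raw weights are [1, 2, 3, 4], so after rescaling the most
# recent observation in the rolling window carries weight 0.4.
import numpy as np

d = 4
weights = np.arange(1, d + 1)
weights = weights / weights.sum()
print(weights)  # [0.1 0.2 0.3 0.4]
print(np.nansum(weights * np.array([10.0, 10.0, 10.0, 20.0])))  # 14.0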
def _fit_distributors(
    self,
    original: pd.Series,
    transformed: pd.Series,
) -> List[Distributor]:
    distributors = []
    for cat, bin_vals in original.groupby(transformed):
        if len(bin_vals.index) > 0:
            n_unique = bin_vals.nunique()
            if n_unique < self.min_unique_continuous:
                d = self.discrete_distributor.copy()
            else:
                d = self.continuous_distributor.copy()
            d.fit(bin_vals.values)
        else:
            # no values in bin, return a mean-producing distributor
            # at the center of the interval
            d = MeanDistributor(seed=self.continuous_distributor.seed)
            d.fit(np.array([cat.left, cat.right]))
        distributors.append(d)
    return distributors
def test_groupby_rolling_group_keys(self, group_keys):
    # GH 37641
    # GH 38523: GH 37641 actually was not a bug.
    # group_keys only applies to groupby.apply directly
    arrays = [["val1", "val1", "val2"], ["val1", "val1", "val2"]]
    index = MultiIndex.from_arrays(arrays, names=("idx1", "idx2"))

    s = Series([1, 2, 3], index=index)
    result = s.groupby(["idx1", "idx2"], group_keys=group_keys).rolling(1).mean()
    expected = Series(
        [1.0, 2.0, 3.0],
        index=MultiIndex.from_tuples(
            [
                ("val1", "val1", "val1", "val1"),
                ("val1", "val1", "val1", "val1"),
                ("val2", "val2", "val2", "val2"),
            ],
            names=["idx1", "idx2", "idx1", "idx2"],
        ),
    )
    tm.assert_series_equal(result, expected)
def test_resample_basic(self):
    rng = date_range('1/1/2000 00:00:00', '1/1/2000 00:13:00', freq='min',
                     name='index')
    s = Series(np.random.randn(14), index=rng)
    result = s.resample('5min', how='mean', closed='right', label='right')
    expected = Series([s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()],
                      index=date_range('1/1/2000', periods=4, freq='5min'))
    assert_series_equal(result, expected)
    self.assertEqual(result.index.name, 'index')

    result = s.resample('5min', how='mean', closed='left', label='right')
    expected = Series([s[:5].mean(), s[5:10].mean(), s[10:].mean()],
                      index=date_range('1/1/2000 00:05', periods=3,
                                       freq='5min'))
    assert_series_equal(result, expected)

    s = self.series
    result = s.resample('5Min', how='last')
    grouper = TimeGrouper(Minute(5), closed='left', label='left')
    expect = s.groupby(grouper).agg(lambda x: x[-1])
    assert_series_equal(result, expect)
def calculate_cross_section_factor_returns(data: pd.DataFrame,
                                           position: pd.Series,
                                           price_key='close',
                                           factor_name='cross_sectional_factor') -> pd.DataFrame:
    """

    :param data:
    :param position:
    :param price_key:
    :param factor_name:
    :return:
    """
    # todo returns with different holding period
    # first shift the factor by date, because the factor can only decide
    # future return
    shifted_position = position.groupby(level=1).shift(1)
    rate_of_return = data[price_key].groupby(level=1).pct_change()
    # multiply elementwise
    factor_returns = pd.DataFrame(shifted_position.values * rate_of_return.values,
                                  index=rate_of_return.index)
    # sum up the returns within each date
    factor_returns = factor_returns.groupby(level=0).sum()
    factor_returns.rename({0: factor_name}, axis=1, inplace=True)
    return factor_returns
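# Usage sketch on made-up panel data (not from the original source): a
# (date, asset) MultiIndex, so level=1 shifts/differences within each asset
# and level=0 sums the weighted returns across assets on each date.
import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_product(
    [pd.date_range('2021-01-01', periods=4), ['AAPL', 'MSFT']],
    names=['date', 'asset'])
data = pd.DataFrame({'close': 100 + np.arange(8.0)}, index=idx)
position = pd.Series(np.random.randn(8), index=idx)  # factor-driven weights
print(calculate_cross_section_factor_returns(data, position))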
def part2():
    """Time series basics"""
    # 1. The most basic pandas time series: a Series indexed by timestamps
    dates = [datetime(2011, 1, 2), datetime(2011, 1, 5), datetime(2011, 1, 7),
             datetime(2011, 1, 8), datetime(2011, 1, 10),
             datetime(2011, 1, 12)]
    ts = Series(np.random.rand(6), index=dates)
    # print(ts)
    # print(type(ts))
    # print(ts.index)
    # print(ts.index[0])

    # Indexing, selection, subsetting
    # print(ts[ts.index[2]])  # via an index value
    # print(ts['1/10/2011'])  # via a string interpreted as a date
    longer_ts = Series(np.random.randn(1000),
                       index=pd.date_range('1/1/2000', periods=1000))
    # print(longer_ts)
    # print(longer_ts['2001-05'])  # for a longer series, passing a year or
    #                              # year-month selects a slice of the data
    # print(ts[datetime(2011, 1, 7)])
    # print(ts['1/6/2011': '1/10/2011'])  # range query
    dates = pd.date_range('1/1/2000', periods=100, freq='W-WED')
    long_df = DataFrame(np.random.rand(100, 4), index=dates,
                        columns=['bj', 'hb', 'cd', 'hn'])
    # print(long_df.ix['2001-05'])  # row indexing

    # Time series with duplicate indices
    dates = pd.DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000', '1/2/2000',
                              '1/3/2000'])
    dup_ts = Series(np.arange(5), index=dates)
    # print(dup_ts)
    # print(dup_ts.index.is_unique)  # the index is not unique
    # print(dup_ts['1/3/2000'])  # unique timestamp, yields a scalar
    # print(dup_ts['1/2/2000'])  # duplicated timestamp, yields a slice
    # goal: aggregate data with non-unique timestamps: groupby with level=0
    # (the only level of the index)
    grouped = dup_ts.groupby(level=0)
    print(grouped.mean())
    print(grouped.count())
def test_series_groupby_value_counts_on_categorical():
    # GH38672
    s = Series(Categorical(["a"], categories=["a", "b"]))
    result = s.groupby([0]).value_counts()

    expected = Series(
        data=[1, 0],
        index=MultiIndex.from_arrays(
            [
                [0, 0],
                CategoricalIndex(["a", "b"], categories=["a", "b"],
                                 ordered=False, dtype="category"),
            ]
        ),
    )

    # Expected:
    # 0  a    1
    #    b    0
    # dtype: int64

    tm.assert_series_equal(result, expected)
def test_groupby_dict_mapping(self):
    # GH #679
    from pandas import Series
    s = Series({'T1': 5})
    result = s.groupby({'T1': 'T2'}).agg(sum)
    expected = s.groupby(['T2']).agg(sum)
    assert_series_equal(result, expected)

    s = Series([1., 2., 3., 4.], index=list('abcd'))
    mapping = {'a': 0, 'b': 0, 'c': 1, 'd': 1}

    result = s.groupby(mapping).mean()
    result2 = s.groupby(mapping).agg(np.mean)
    expected = s.groupby([0, 0, 1, 1]).mean()
    expected2 = s.groupby([0, 0, 1, 1]).mean()

    assert_series_equal(result, expected)
    assert_series_equal(result, result2)
    assert_series_equal(result, expected2)
def stats_items_dist(iter, fn_conv, name, splitter):
    logger.info("Computing stats for %s", name)
    n2s = numberic2SignalFn(int, splitter.range_sum)
    volumes = []
    for i in iter:
        iv = fn_conv(i)
        s = n2s(iv)
        volumes.append(s)
    cs = Series(volumes)
    g = cs.groupby(cs.values).agg(len)
    total = len(volumes)
    sig2prob = {}
    for sig, count in g.items():
        if sig == 0:
            logger.warn("%s sig %s -- %s %s %s", name, sig,
                        count * 1.0 / total, count, total)
            # exclude the zero signal from the probability mass
            total -= count
        else:
            logger.info("%s sig %s -- %s %s %s", name, sig,
                        count * 1.0 / total, count, total)
            sig2prob[sig] = count * 1.0 / total
    return sig2prob
def test_groupby_dict_mapping(self):
    # GH #679
    from pandas import Series
    s = Series({"T1": 5})
    result = s.groupby({"T1": "T2"}).agg(sum)
    expected = s.groupby(["T2"]).agg(sum)
    tm.assert_series_equal(result, expected)

    s = Series([1.0, 2.0, 3.0, 4.0], index=list("abcd"))
    mapping = {"a": 0, "b": 0, "c": 1, "d": 1}

    result = s.groupby(mapping).mean()
    result2 = s.groupby(mapping).agg(np.mean)
    expected = s.groupby([0, 0, 1, 1]).mean()
    expected2 = s.groupby([0, 0, 1, 1]).mean()

    tm.assert_series_equal(result, expected)
    tm.assert_series_equal(result, result2)
    tm.assert_series_equal(result, expected2)
def data_density_filter(x, y, conv_matrix=None, min_count=5,
                        return_figures=True):
    """
    Use the 2D density cloud of observations to find outliers for any variables

    The data density filter needs tuning to work well. This uses convolution
    to create the density cloud - you can specify the exact convolution
    matrix, or its shape

    Parameters
    ----------
    x : np.array / pd.Series, shape=[n, ]
        e.g. temperature
    y : np.array / pd.Series, shape=[n, ]
        e.g. salinity
    conv_matrix : int, list, np.array, optional
        int = size of the isotropic round convolution window.
        [int, int] = anisotropic (oval) convolution window.
        2d array is a weighted convolution window, where;
        rectangle = np.ones([int, int]);
        oval = gt.tools.gaussian_kernel([int, int]) > 1e-5
        more advanced anisotropic windows can also be created
    min_count : int, default=5, optional
        masks the 2d histogram counts smaller than this limit when
        performing the convolution
    return_figures : bool, default=True, optional
        returns figures of the data plotted for blob detection...

    Returns
    -------
    mask : np.array, shape=[n, ]
        a mask that returns only values
    figure : only returned if return_figure is True
    """
    from scipy.signal import convolve2d
    from pandas import Series, cut
    from numpy import linspace, c_, where, inf, array, isnan

    def gaussian_kernel(*shape):
        """
        Returns a 2D array with gaussian values to be used in the
        blob_outliers detection function. Can be anisotropic (oblong).
        Scaling is determined automatically.

        Parameters
        ----------
        shape : int, int
            if one integer is passed the kernel will be isotropic
            if two integers are passed the kernel will be anisotropic

        Returns
        -------
        array (float)
            The 2D representation of the kernel
        """
        from matplotlib.cbook import flatten
        from numpy import exp, mgrid

        # make shape a list regardless of input
        shape = [int(a // 2) for a in flatten([shape])]

        # anisotropic if len(shape) == 2 else isotropic
        if len(shape) == 1:
            sx, sy = shape[0], shape[0]
        elif len(shape) == 2:
            sx, sy = shape

        # create the x and y grid
        x, y = mgrid[-sx:sx + 1, -sy:sy + 1]
        sigma = [sx / 8, sy / 8]  # sigma scaled by shape
        c = tuple([sx, sy])  # centre index of x and y
        g = 1 * exp(-((x - x[c])**2 / (2 * sigma[0])**2 +
                      (y - y[c])**2 / (2 * sigma[1])**2))
        return g

    # turning input into pandas.Series
    x = Series(x, name='X' if not isinstance(x, Series) else x.name)
    y = Series(y, name='Y' if not isinstance(y, Series) else y.name)

    ###############
    ##  BINNING  ##
    ###############
    # create bins for the data - equal bins
    xbins = linspace(x.min(), x.max(), 250)
    ybins = linspace(y.min(), y.max(), 250)
    # binning the data with pandas. This is quick to find outliers at the end
    xcut = cut(x, xbins, labels=c_[xbins[:-1], xbins[1:]].mean(1), right=False)
    ycut = cut(y, ybins, labels=c_[ybins[:-1], ybins[1:]].mean(1), right=False)
    # binning the data and returning as a 2D array (pandas.DataFrame)
    count = x.groupby([xcut, ycut]).count()
    count.name = 'count'  # to avoid an error when unstacking
    count = count.unstack()
    count = count.sort_index().sort_index(axis=1)

    ###################
    ##  CONVOLUTION  ##
    ###################
    # make convolution matrix if not given
    if conv_matrix is None:
        conv_matrix = (gaussian_kernel(21) > 1e-5).astype(int)
    elif isinstance(conv_matrix, (list, int, float)):
        conv_matrix = (gaussian_kernel(conv_matrix) > 1e-5).astype(int)
    else:
        ndim = array(conv_matrix).ndim
        if ndim != 2:
            raise UserWarning('conv_matrix must have 2 dimensions')

    # An array with which the convolution is done
    # use a threshold to mask out bins with low counts
    # thus only dense regions of data are considered
    count0 = count.fillna(0).values
    count0[count0 < min_count] = 0
    # 2d convolution with the input matrix
    convolved_count = convolve2d(count0, conv_matrix, mode='same')

    outliers = (convolved_count == 0) & ~isnan(count)
    cols = count.index
    rows = count.columns

    ########################################
    ##  FINDING OUTLIERS AND CREATE MASK  ##
    ########################################
    # find indicies of where there is no convolution,
    # but there are data. Then get the x and y values of these
    # points. Turn these into pairs for pandas multi-indexing.
    i, j = where(outliers)
    xi = cols[i].values
    yj = rows[j].values
    ij = list(zip(xi, yj))
    # Create a pandas dataframe with the pd.cut data as indicies
    # with a column for a numerical index.
    if len(ij) > 0:
        idx = x.to_frame().reset_index().drop(x.name, axis=1)
        idx = idx.set_axis([xcut, ycut], inplace=False)
        idx = idx.loc[ij]['index'].values
    else:
        idx = None
    # create a placeholder mask and fill outliers with True
    mask = (x > inf).values
    mask[idx] = True

    ###############
    ##  FIGURES  ##
    ###############
    if return_figures:
        from numpy import ma, power, diff, sum, r_
        from matplotlib import colors, cm, pyplot as plt

        # x and y plotting coordinates
        xp = cols.values.astype(float)
        yp = rows.values.astype(float)
        # plotting variables a, b, c
        a = ma.masked_invalid(count.T, 0)
        b = convolved_count.T
        c = ma.masked_where(a.mask, ~outliers.T)

        # create the figure
        fig, ax = plt.subplots(1, 2, figsize=[10, 5], dpi=90, sharey=True)
        # properties for the pcolormesh and contours
        pn = colors.PowerNorm(0.3)
        mesh_props = dict(cmap=cm.Spectral_r, norm=pn)
        # create the pcolormesh plots
        im = (ax[0].pcolormesh(xp, yp, a, vmax=a.max() / 2, **mesh_props),
              ax[1].pcolormesh(xp, yp, c, vmin=0, vmax=1))
        ct = ax[1].contour(xp, yp, b, levels=[0.5],
                           linestyles='-', colors='r', linewidths=2)

        # change figure parameters
        ax[0].set_title('Histogram of data (min_count = {})'.format(min_count))
        ax[1].set_title('{} Outliers found using \n{} convolution '
                        'with decision boundary'.format(mask.sum(),
                                                        str(conv_matrix.shape)))
        ax[0].set_xticks([])
        ax[0].set_ylabel(y.name)
        ax[1].set_xlabel(x.name)

        # tight layout before creating the axes for pcolormesh plots
        fig.tight_layout()

        # make colorbar axes based on axes [0, 1]
        p = ax[0].get_position()
        cax = fig.add_axes([p.x0, p.y0 - 0.05, p.width, 0.04])
        cb = plt.colorbar(im[0], cax=cax, orientation='horizontal')
        cb.set_label('Count')
        cb.set_ticks([1, 2, 3, 5, 10, 30, 80, 200])

        # plot the min_count on the colorbar
        cx = pn(r_[cb.get_clim(), min_count])[-1]
        cb.ax.plot(cx, 0, marker='^', color='k', markersize=8)
        cb.ax.plot(cx, 1, marker='v', color='k', markersize=8)

        return mask, fig

    return mask
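# Usage sketch on synthetic data (not from the original source, and assuming
# the pandas/matplotlib versions this module targets): a dense Gaussian cloud
# plus a few far-away points that the convolution mask should flag as outliers.
import numpy as np

rng = np.random.RandomState(0)
temp = np.r_[rng.normal(10, 1, 5000), [25.0, 26.0]]
salt = np.r_[rng.normal(35, 0.5, 5000), [20.0, 21.0]]
mask = data_density_filter(temp, salt, return_figures=False)
print(temp[mask], salt[mask])  # should recover the injected outliers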
def test_resample_basic(self):
    rng = date_range('1/1/2000 00:00:00', '1/1/2000 00:13:00', freq='min',
                     name='index')
    s = Series(np.random.randn(14), index=rng)
    result = s.resample('5min', how='mean', closed='right', label='right')
    expected = Series([s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()],
                      index=date_range('1/1/2000', periods=4, freq='5min'))
    assert_series_equal(result, expected)
    self.assert_(result.index.name == 'index')

    result = s.resample('5min', how='mean', closed='left', label='right')
    expected = Series([s[:5].mean(), s[5:10].mean(), s[10:].mean()],
                      index=date_range('1/1/2000 00:05', periods=3,
                                       freq='5min'))
    assert_series_equal(result, expected)

    s = self.series
    result = s.resample('5Min', how='last')
    grouper = TimeGrouper(Minute(5), closed='right', label='right')
    expect = s.groupby(grouper).agg(lambda x: x[-1])
    assert_series_equal(result, expect)

    # from daily
    dti = DatetimeIndex(start=datetime(2005, 1, 1), end=datetime(2005, 1, 10),
                        freq='D', name='index')
    s = Series(np.random.rand(len(dti)), dti)

    # to weekly
    result = s.resample('w-sun', how='last')
    self.assertEquals(len(result), 3)
    self.assert_((result.index.dayofweek == [6, 6, 6]).all())
    self.assertEquals(result.irow(0), s['1/2/2005'])
    self.assertEquals(result.irow(1), s['1/9/2005'])
    self.assertEquals(result.irow(2), s.irow(-1))

    result = s.resample('W-MON', how='last')
    self.assertEquals(len(result), 2)
    self.assert_((result.index.dayofweek == [0, 0]).all())
    self.assertEquals(result.irow(0), s['1/3/2005'])
    self.assertEquals(result.irow(1), s['1/10/2005'])

    result = s.resample('W-TUE', how='last')
    self.assertEquals(len(result), 2)
    self.assert_((result.index.dayofweek == [1, 1]).all())
    self.assertEquals(result.irow(0), s['1/4/2005'])
    self.assertEquals(result.irow(1), s['1/10/2005'])

    result = s.resample('W-WED', how='last')
    self.assertEquals(len(result), 2)
    self.assert_((result.index.dayofweek == [2, 2]).all())
    self.assertEquals(result.irow(0), s['1/5/2005'])
    self.assertEquals(result.irow(1), s['1/10/2005'])

    result = s.resample('W-THU', how='last')
    self.assertEquals(len(result), 2)
    self.assert_((result.index.dayofweek == [3, 3]).all())
    self.assertEquals(result.irow(0), s['1/6/2005'])
    self.assertEquals(result.irow(1), s['1/10/2005'])

    result = s.resample('W-FRI', how='last')
    self.assertEquals(len(result), 2)
    self.assert_((result.index.dayofweek == [4, 4]).all())
    self.assertEquals(result.irow(0), s['1/7/2005'])
    self.assertEquals(result.irow(1), s['1/10/2005'])

    # to biz day
    result = s.resample('B', how='last')
    self.assertEquals(len(result), 6)
    self.assert_((result.index.dayofweek == [0, 1, 2, 3, 4, 0]).all())
    self.assertEquals(result.irow(0), s['1/3/2005'])
    self.assertEquals(result.irow(1), s['1/4/2005'])
    self.assertEquals(result.irow(5), s['1/10/2005'])
    self.assert_(result.index.name == 'index')
def test_ngroup_series_matches_frame(self):
    df = DataFrame({"A": list("aaaba")})
    s = Series(list("aaaba"))

    tm.assert_series_equal(df.groupby(s).ngroup(), s.groupby(s).ngroup())
def test_grouper_creation_bug(self):

    # GH 8795
    df = DataFrame({'A': [0, 0, 1, 1, 2, 2], 'B': [1, 2, 3, 4, 5, 6]})
    g = df.groupby('A')
    expected = g.sum()

    g = df.groupby(pd.Grouper(key='A'))
    result = g.sum()
    assert_frame_equal(result, expected)

    result = g.apply(lambda x: x.sum())
    assert_frame_equal(result, expected)

    g = df.groupby(pd.Grouper(key='A', axis=0))
    result = g.sum()
    assert_frame_equal(result, expected)

    # GH14334
    # pd.Grouper(key=...) may be passed in a list
    df = DataFrame({'A': [0, 0, 0, 1, 1, 1],
                    'B': [1, 1, 2, 2, 3, 3],
                    'C': [1, 2, 3, 4, 5, 6]})
    # Group by single column
    expected = df.groupby('A').sum()
    g = df.groupby([pd.Grouper(key='A')])
    result = g.sum()
    assert_frame_equal(result, expected)

    # Group by two columns
    # using a combination of strings and Grouper objects
    expected = df.groupby(['A', 'B']).sum()

    # Group with two Grouper objects
    g = df.groupby([pd.Grouper(key='A'), pd.Grouper(key='B')])
    result = g.sum()
    assert_frame_equal(result, expected)

    # Group with a string and a Grouper object
    g = df.groupby(['A', pd.Grouper(key='B')])
    result = g.sum()
    assert_frame_equal(result, expected)

    # Group with a Grouper object and a string
    g = df.groupby([pd.Grouper(key='A'), 'B'])
    result = g.sum()
    assert_frame_equal(result, expected)

    # GH8866
    s = Series(np.arange(8, dtype='int64'),
               index=pd.MultiIndex.from_product(
                   [list('ab'), range(2), date_range('20130101', periods=2)],
                   names=['one', 'two', 'three']))
    result = s.groupby(pd.Grouper(level='three', freq='M')).sum()
    expected = Series([28], index=Index([Timestamp('2013-01-31')], freq='M',
                                        name='three'))
    assert_series_equal(result, expected)

    # just specifying a level breaks
    result = s.groupby(pd.Grouper(level='one')).sum()
    expected = s.groupby(level='one').sum()
    assert_series_equal(result, expected)
class TestTimeGrouper(tm.TestCase):

    def setUp(self):
        self.ts = Series(np.random.randn(1000),
                         index=date_range('1/1/2000', periods=1000))

    def test_apply(self):
        grouper = TimeGrouper('A', label='right', closed='right')

        grouped = self.ts.groupby(grouper)

        f = lambda x: x.order()[-3:]

        applied = grouped.apply(f)
        expected = self.ts.groupby(lambda x: x.year).apply(f)

        applied.index = applied.index.droplevel(0)
        expected.index = expected.index.droplevel(0)
        assert_series_equal(applied, expected)

    def test_count(self):
        self.ts[::3] = np.nan

        grouper = TimeGrouper('A', label='right', closed='right')
        result = self.ts.resample('A', how='count')

        expected = self.ts.groupby(lambda x: x.year).count()
        expected.index = result.index

        assert_series_equal(result, expected)

    def test_numpy_reduction(self):
        result = self.ts.resample('A', how='prod', closed='right')

        expected = self.ts.groupby(lambda x: x.year).agg(np.prod)
        expected.index = result.index

        assert_series_equal(result, expected)

    def test_apply_iteration(self):
        # #2300
        N = 1000
        ind = pd.date_range(start="2000-01-01", freq="D", periods=N)
        df = DataFrame({'open': 1, 'close': 2}, index=ind)
        tg = TimeGrouper('M')

        _, grouper, _ = tg._get_grouper(df)

        # Errors
        grouped = df.groupby(grouper, group_keys=False)
        f = lambda df: df['close'] / df['open']

        # it works!
        result = grouped.apply(f)
        self.assertTrue(result.index.equals(df.index))

    def test_panel_aggregation(self):
        ind = pd.date_range('1/1/2000', periods=100)
        data = np.random.randn(2, len(ind), 4)
        wp = pd.Panel(data, items=['Item1', 'Item2'], major_axis=ind,
                      minor_axis=['A', 'B', 'C', 'D'])

        tg = TimeGrouper('M', axis=1)
        _, grouper, _ = tg._get_grouper(wp)
        bingrouped = wp.groupby(grouper)
        binagg = bingrouped.mean()

        def f(x):
            assert (isinstance(x, Panel))
            return x.mean(1)

        result = bingrouped.agg(f)
        tm.assert_panel_equal(result, binagg)

    def test_fails_on_no_datetime_index(self):
        index_names = ('Int64Index', 'PeriodIndex', 'Index', 'Float64Index',
                       'MultiIndex')
        index_funcs = (tm.makeIntIndex, tm.makePeriodIndex,
                       tm.makeUnicodeIndex, tm.makeFloatIndex,
                       lambda m: tm.makeCustomIndex(m, 2))
        n = 2
        for name, func in zip(index_names, index_funcs):
            index = func(n)
            df = DataFrame({'a': np.random.randn(n)}, index=index)
            with tm.assertRaisesRegexp(TypeError,
                                       "axis must be a DatetimeIndex, "
                                       "but got an instance of %r" % name):
                df.groupby(TimeGrouper('D'))
def test_named_agg_nametuple(self, inp):
    # GH34422
    s = Series([1, 1, 2, 2, 3, 3, 4, 5])
    msg = f"func is expected but received {type(inp).__name__}"
    with pytest.raises(TypeError, match=msg):
        s.groupby(s.values).agg(a=inp)
def test_grouper_creation_bug(self):

    # GH 8795
    df = DataFrame({"A": [0, 0, 1, 1, 2, 2], "B": [1, 2, 3, 4, 5, 6]})
    g = df.groupby("A")
    expected = g.sum()

    g = df.groupby(pd.Grouper(key="A"))
    result = g.sum()
    tm.assert_frame_equal(result, expected)

    result = g.apply(lambda x: x.sum())
    tm.assert_frame_equal(result, expected)

    g = df.groupby(pd.Grouper(key="A", axis=0))
    result = g.sum()
    tm.assert_frame_equal(result, expected)

    # GH14334
    # pd.Grouper(key=...) may be passed in a list
    df = DataFrame({"A": [0, 0, 0, 1, 1, 1],
                    "B": [1, 1, 2, 2, 3, 3],
                    "C": [1, 2, 3, 4, 5, 6]})
    # Group by single column
    expected = df.groupby("A").sum()
    g = df.groupby([pd.Grouper(key="A")])
    result = g.sum()
    tm.assert_frame_equal(result, expected)

    # Group by two columns
    # using a combination of strings and Grouper objects
    expected = df.groupby(["A", "B"]).sum()

    # Group with two Grouper objects
    g = df.groupby([pd.Grouper(key="A"), pd.Grouper(key="B")])
    result = g.sum()
    tm.assert_frame_equal(result, expected)

    # Group with a string and a Grouper object
    g = df.groupby(["A", pd.Grouper(key="B")])
    result = g.sum()
    tm.assert_frame_equal(result, expected)

    # Group with a Grouper object and a string
    g = df.groupby([pd.Grouper(key="A"), "B"])
    result = g.sum()
    tm.assert_frame_equal(result, expected)

    # GH8866
    s = Series(
        np.arange(8, dtype="int64"),
        index=pd.MultiIndex.from_product(
            [list("ab"), range(2), date_range("20130101", periods=2)],
            names=["one", "two", "three"],
        ),
    )
    result = s.groupby(pd.Grouper(level="three", freq="M")).sum()
    expected = Series([28],
                      index=Index([Timestamp("2013-01-31")], freq="M",
                                  name="three"))
    tm.assert_series_equal(result, expected)

    # just specifying a level breaks
    result = s.groupby(pd.Grouper(level="one")).sum()
    expected = s.groupby(level="one").sum()
    tm.assert_series_equal(result, expected)
class TestTimeSeriesDuplicates(object):

    def setup_method(self, method):
        dates = [datetime(2000, 1, 2), datetime(2000, 1, 2),
                 datetime(2000, 1, 2), datetime(2000, 1, 3),
                 datetime(2000, 1, 3), datetime(2000, 1, 3),
                 datetime(2000, 1, 4), datetime(2000, 1, 4),
                 datetime(2000, 1, 4), datetime(2000, 1, 5)]

        self.dups = Series(np.random.randn(len(dates)), index=dates)

    def test_constructor(self):
        assert isinstance(self.dups, Series)
        assert isinstance(self.dups.index, DatetimeIndex)

    def test_is_unique_monotonic(self):
        assert not self.dups.index.is_unique

    def test_index_unique(self):
        uniques = self.dups.index.unique()
        expected = DatetimeIndex([datetime(2000, 1, 2), datetime(2000, 1, 3),
                                  datetime(2000, 1, 4), datetime(2000, 1, 5)])
        assert uniques.dtype == 'M8[ns]'  # sanity
        tm.assert_index_equal(uniques, expected)
        assert self.dups.index.nunique() == 4

        # #2563
        assert isinstance(uniques, DatetimeIndex)

        dups_local = self.dups.index.tz_localize('US/Eastern')
        dups_local.name = 'foo'
        result = dups_local.unique()
        expected = DatetimeIndex(expected, name='foo')
        expected = expected.tz_localize('US/Eastern')
        assert result.tz is not None
        assert result.name == 'foo'
        tm.assert_index_equal(result, expected)

        # NaT, note this is excluded
        arr = [1370745748 + t for t in range(20)] + [tslib.iNaT]
        idx = DatetimeIndex(arr * 3)
        tm.assert_index_equal(idx.unique(), DatetimeIndex(arr))
        assert idx.nunique() == 20
        assert idx.nunique(dropna=False) == 21

        arr = [Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t)
               for t in range(20)] + [NaT]
        idx = DatetimeIndex(arr * 3)
        tm.assert_index_equal(idx.unique(), DatetimeIndex(arr))
        assert idx.nunique() == 20
        assert idx.nunique(dropna=False) == 21

    def test_index_dupes_contains(self):
        d = datetime(2011, 12, 5, 20, 30)
        ix = DatetimeIndex([d, d])
        assert d in ix

    def test_duplicate_dates_indexing(self):
        ts = self.dups

        uniques = ts.index.unique()
        for date in uniques:
            result = ts[date]

            mask = ts.index == date
            total = (ts.index == date).sum()
            expected = ts[mask]
            if total > 1:
                assert_series_equal(result, expected)
            else:
                assert_almost_equal(result, expected[0])

            cp = ts.copy()
            cp[date] = 0
            expected = Series(np.where(mask, 0, ts), index=ts.index)
            assert_series_equal(cp, expected)

        pytest.raises(KeyError, ts.__getitem__, datetime(2000, 1, 6))

        # new index
        ts[datetime(2000, 1, 6)] = 0
        assert ts[datetime(2000, 1, 6)] == 0

    def test_range_slice(self):
        idx = DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000', '1/3/2000',
                             '1/4/2000'])

        ts = Series(np.random.randn(len(idx)), index=idx)

        result = ts['1/2/2000':]
        expected = ts[1:]
        assert_series_equal(result, expected)

        result = ts['1/2/2000':'1/3/2000']
        expected = ts[1:4]
        assert_series_equal(result, expected)

    def test_groupby_average_dup_values(self):
        result = self.dups.groupby(level=0).mean()
        expected = self.dups.groupby(self.dups.index).mean()
        assert_series_equal(result, expected)

    def test_indexing_over_size_cutoff(self):
        import datetime
        # #1821

        old_cutoff = _index._SIZE_CUTOFF
        try:
            _index._SIZE_CUTOFF = 1000

            # create large list of non periodic datetime
            dates = []
            sec = datetime.timedelta(seconds=1)
            half_sec = datetime.timedelta(microseconds=500000)
            d = datetime.datetime(2011, 12, 5, 20, 30)
            n = 1100
            for i in range(n):
                dates.append(d)
                dates.append(d + sec)
                dates.append(d + sec + half_sec)
                dates.append(d + sec + sec + half_sec)
                d += 3 * sec

            # duplicate some values in the list
            duplicate_positions = np.random.randint(0, len(dates) - 1, 20)
            for p in duplicate_positions:
                dates[p + 1] = dates[p]

            df = DataFrame(np.random.randn(len(dates), 4),
                           index=dates,
                           columns=list('ABCD'))

            pos = n * 3
            timestamp = df.index[pos]
            assert timestamp in df.index

            # it works!
            df.loc[timestamp]
            assert len(df.loc[[timestamp]]) > 0
        finally:
            _index._SIZE_CUTOFF = old_cutoff

    def test_indexing_unordered(self):
        # GH 2437
        rng = date_range(start='2011-01-01', end='2011-01-15')
        ts = Series(np.random.rand(len(rng)), index=rng)
        ts2 = pd.concat([ts[0:4], ts[-4:], ts[4:-4]])

        for t in ts.index:
            # TODO: unused?
            s = str(t)  # noqa

            expected = ts[t]
            result = ts2[t]
            assert expected == result

        # GH 3448 (ranges)
        def compare(slobj):
            result = ts2[slobj].copy()
            result = result.sort_index()
            expected = ts[slobj]
            assert_series_equal(result, expected)

        compare(slice('2011-01-01', '2011-01-15'))
        compare(slice('2010-12-30', '2011-01-15'))
        compare(slice('2011-01-01', '2011-01-16'))

        # partial ranges
        compare(slice('2011-01-01', '2011-01-6'))
        compare(slice('2011-01-06', '2011-01-8'))
        compare(slice('2011-01-06', '2011-01-12'))

        # single values
        result = ts2['2011'].sort_index()
        expected = ts['2011']
        assert_series_equal(result, expected)

        # diff freq
        rng = date_range(datetime(2005, 1, 1), periods=20, freq='M')
        ts = Series(np.arange(len(rng)), index=rng)
        ts = ts.take(np.random.permutation(20))

        result = ts['2005']
        for t in result.index:
            assert t.year == 2005

    def test_indexing(self):
        idx = date_range("2001-1-1", periods=20, freq='M')
        ts = Series(np.random.rand(len(idx)), index=idx)

        # getting

        # GH 3070, make sure semantics work on Series/Frame
        expected = ts['2001']
        expected.name = 'A'

        df = DataFrame(dict(A=ts))
        result = df['2001']['A']
        assert_series_equal(expected, result)

        # setting
        ts['2001'] = 1
        expected = ts['2001']
        expected.name = 'A'

        df.loc['2001', 'A'] = 1

        result = df['2001']['A']
        assert_series_equal(expected, result)

        # GH3546 (not including times on the last day)
        idx = date_range(start='2013-05-31 00:00', end='2013-05-31 23:00',
                         freq='H')
        ts = Series(lrange(len(idx)), index=idx)
        expected = ts['2013-05']
        assert_series_equal(expected, ts)

        idx = date_range(start='2013-05-31 00:00', end='2013-05-31 23:59',
                         freq='S')
        ts = Series(lrange(len(idx)), index=idx)
        expected = ts['2013-05']
        assert_series_equal(expected, ts)

        idx = [Timestamp('2013-05-31 00:00'),
               Timestamp(datetime(2013, 5, 31, 23, 59, 59, 999999))]
        ts = Series(lrange(len(idx)), index=idx)
        expected = ts['2013']
        assert_series_equal(expected, ts)

        # GH14826, indexing with a seconds resolution string / datetime object
        df = DataFrame(np.random.rand(5, 5),
                       columns=['open', 'high', 'low', 'close', 'volume'],
                       index=date_range('2012-01-02 18:01:00',
                                        periods=5, tz='US/Central', freq='s'))
        expected = df.loc[[df.index[2]]]

        # this is a single date, so will raise
        pytest.raises(KeyError, df.__getitem__, '2012-01-02 18:01:02')
        pytest.raises(KeyError, df.__getitem__, df.index[2])
def test_groupby_rolling_center_center(self):
    # GH 35552
    series = Series(range(1, 6))
    result = series.groupby(series).rolling(center=True, window=3).mean()
    expected = Series(
        [np.nan] * 5,
        index=pd.MultiIndex.from_tuples(
            ((1, 0), (2, 1), (3, 2), (4, 3), (5, 4))),
    )
    tm.assert_series_equal(result, expected)

    series = Series(range(1, 5))
    result = series.groupby(series).rolling(center=True, window=3).mean()
    expected = Series(
        [np.nan] * 4,
        index=pd.MultiIndex.from_tuples(((1, 0), (2, 1), (3, 2), (4, 3))),
    )
    tm.assert_series_equal(result, expected)

    df = pd.DataFrame({"a": ["a"] * 5 + ["b"] * 6, "b": range(11)})
    result = df.groupby("a").rolling(center=True, window=3).mean()
    expected = pd.DataFrame(
        [np.nan, 1, 2, 3, np.nan, np.nan, 6, 7, 8, 9, np.nan],
        index=pd.MultiIndex.from_tuples(
            (
                ("a", 0), ("a", 1), ("a", 2), ("a", 3), ("a", 4),
                ("b", 5), ("b", 6), ("b", 7), ("b", 8), ("b", 9), ("b", 10),
            ),
            names=["a", None],
        ),
        columns=["b"],
    )
    tm.assert_frame_equal(result, expected)

    df = pd.DataFrame({"a": ["a"] * 5 + ["b"] * 5, "b": range(10)})
    result = df.groupby("a").rolling(center=True, window=3).mean()
    expected = pd.DataFrame(
        [np.nan, 1, 2, 3, np.nan, np.nan, 6, 7, 8, np.nan],
        index=pd.MultiIndex.from_tuples(
            (
                ("a", 0), ("a", 1), ("a", 2), ("a", 3), ("a", 4),
                ("b", 5), ("b", 6), ("b", 7), ("b", 8), ("b", 9),
            ),
            names=["a", None],
        ),
        columns=["b"],
    )
    tm.assert_frame_equal(result, expected)