def test_timegrouper_with_reg_groups_freq(self, freq): # GH 6764 multiple grouping with/without sort df = DataFrame( { "date": pd.to_datetime( [ "20121002", "20121007", "20130130", "20130202", "20130305", "20121002", "20121207", "20130130", "20130202", "20130305", "20130202", "20130305", ] ), "user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5], "whole_cost": [ 1790, 364, 280, 259, 201, 623, 90, 312, 359, 301, 359, 801, ], "cost1": [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12], } ).set_index("date") expected = ( df.groupby("user_id")["whole_cost"] .resample(freq) .sum(min_count=1) # XXX .dropna() .reorder_levels(["date", "user_id"]) .sort_index() .astype("int64") ) expected.name = "whole_cost" result1 = ( df.sort_index().groupby([Grouper(freq=freq), "user_id"])["whole_cost"].sum() ) tm.assert_series_equal(result1, expected) result2 = df.groupby([Grouper(freq=freq), "user_id"])["whole_cost"].sum() tm.assert_series_equal(result2, expected)
def test_repr():
    # GH18203: TimeGrouper repr should expose all of its settings.
    grouper = Grouper(key="A", freq="H")
    expected = (
        "TimeGrouper(key='A', freq=<Hour>, axis=0, sort=True, "
        "closed='left', label='left', how='mean', "
        "convention='e', origin='start_day')"
    )
    assert repr(grouper) == expected

    # An explicit origin is rendered as a Timestamp.
    grouper = Grouper(key="A", freq="H", origin="2000-01-01")
    expected = (
        "TimeGrouper(key='A', freq=<Hour>, axis=0, sort=True, "
        "closed='left', label='left', how='mean', "
        "convention='e', origin=Timestamp('2000-01-01 00:00:00'))"
    )
    assert repr(grouper) == expected
def test_aggregate_with_nat_size():
    # GH 9925: size() on a TimeGrouper must match a normal groupby,
    # with NaT rows dropped and the empty bin reported as 0.
    n = 20
    data = np.random.randn(n, 4).astype('int64')

    normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
    normal_df['key'] = [1, 2, np.nan, 4, 5] * 4

    dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
    dt_df['key'] = [
        datetime(2013, 1, 1),
        datetime(2013, 1, 2),
        pd.NaT,
        datetime(2013, 1, 4),
        datetime(2013, 1, 5),
    ] * 4

    normal_grouped = normal_df.groupby('key')
    dt_grouped = dt_df.groupby(Grouper(key='key', freq='D'))

    normal_result = normal_grouped.size()
    dt_result = dt_grouped.size()

    # The NaN/NaT group vanishes from the normal result; pad it back in as
    # the empty 2013-01-03 bin (size 0) before relabeling with dates.
    pad = Series([0], index=[3])
    expected = normal_result.append(pad).sort_index()
    expected.index = date_range(start='2013-01-01', freq='D', periods=5,
                                name='key')
    assert_series_equal(expected, dt_result)
    assert dt_result.index.name == 'key'
def test_aggregate_with_nat(func, fill_value):
    # check TimeGrouper's aggregation is identical as normal groupby
    # if NaT is included, 'var', 'std', 'mean', 'first','last'
    # and 'nth' doesn't work yet
    n = 20
    data = np.random.randn(n, 4).astype('int64')

    normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
    normal_df['key'] = [1, 2, np.nan, 4, 5] * 4

    dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
    dt_df['key'] = [
        datetime(2013, 1, 1),
        datetime(2013, 1, 2),
        pd.NaT,
        datetime(2013, 1, 4),
        datetime(2013, 1, 5),
    ] * 4

    normal_grouped = normal_df.groupby('key')
    dt_grouped = dt_df.groupby(Grouper(key='key', freq='D'))

    normal_result = getattr(normal_grouped, func)()
    dt_result = getattr(dt_grouped, func)()

    # Pad the normal result with the empty bin (fill_value for every column)
    # that the TimeGrouper produces for the all-NaT 2013-01-03 group.
    pad = DataFrame([[fill_value] * 4], index=[3],
                    columns=['A', 'B', 'C', 'D'])
    expected = normal_result.append(pad).sort_index()
    expected.index = date_range(start='2013-01-01', freq='D', periods=5,
                                name='key')
    assert_frame_equal(expected, dt_result)
    assert dt_result.index.name == 'key'
def test_aggregate_normal(resample_method): """Check TimeGrouper's aggregation is identical as normal groupby.""" if resample_method == 'ohlc': pytest.xfail(reason='DataError: No numeric types to aggregate') data = np.random.randn(20, 4) normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D']) normal_df['key'] = [1, 2, 3, 4, 5] * 4 dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D']) dt_df['key'] = [ datetime(2013, 1, 1), datetime(2013, 1, 2), datetime(2013, 1, 3), datetime(2013, 1, 4), datetime(2013, 1, 5) ] * 4 normal_grouped = normal_df.groupby('key') dt_grouped = dt_df.groupby(Grouper(key='key', freq='D')) expected = getattr(normal_grouped, resample_method)() dt_result = getattr(dt_grouped, resample_method)() expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key') tm.assert_equal(expected, dt_result) # if TimeGrouper is used included, 'nth' doesn't work yet """
def test_aggregate_nth(resample_method):
    """Check TimeGrouper's aggregation is identical as normal groupby."""
    # NOTE(review): `resample_method` is unused in this body — looks like a
    # leftover parametrization; confirm against the pytest fixtures.
    data = np.random.randn(20, 4)

    normal_df = DataFrame(data, columns=["A", "B", "C", "D"])
    normal_df["key"] = [1, 2, 3, 4, 5] * 4

    dt_df = DataFrame(data, columns=["A", "B", "C", "D"])
    dt_df["key"] = [
        datetime(2013, 1, 1),
        datetime(2013, 1, 2),
        datetime(2013, 1, 3),
        datetime(2013, 1, 4),
        datetime(2013, 1, 5),
    ] * 4

    normal_grouped = normal_df.groupby("key")
    dt_grouped = dt_df.groupby(Grouper(key="key", freq="D"))

    # nth(3) on the normal groupby, relabeled with dates, must equal nth(3)
    # on the TimeGrouper groupby.
    expected = normal_grouped.nth(3)
    expected.index = date_range(start="2013-01-01", freq="D", periods=5, name="key")
    dt_result = dt_grouped.nth(3)
    tm.assert_frame_equal(expected, dt_result)
def test_repr():
    # GH18203: TimeGrouper repr should expose all of its settings.
    result = repr(Grouper(key="A", freq="H"))
    expected = (
        "TimeGrouper(key='A', freq=<Hour>, axis=0, sort=True, "
        "closed='left', label='left', how='mean', "
        "convention='e', base=0)"
    )
    assert result == expected
def test_timegrouper_apply_return_type_value(self): # Using `apply` with the `TimeGrouper` should give the # same return type as an `apply` with a `Grouper`. # Issue #11742 df = DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]}) df_dt = df.copy() df_dt["date"] = pd.to_datetime(df_dt["date"]) def sumfunc_value(x): return x.value.sum() expected = df.groupby(Grouper(key="date")).apply(sumfunc_value) result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_value) tm.assert_series_equal( result.reset_index(drop=True), expected.reset_index(drop=True) )
def test_groupby_with_timegrouper_methods(self, should_sort): # GH 3881 # make sure API of timegrouper conforms df = DataFrame( { "Branch": "A A A A A B".split(), "Buyer": "Carl Mark Carl Joe Joe Carl".split(), "Quantity": [1, 3, 5, 8, 9, 3], "Date": [ datetime(2013, 1, 1, 13, 0), datetime(2013, 1, 1, 13, 5), datetime(2013, 10, 1, 20, 0), datetime(2013, 10, 2, 10, 0), datetime(2013, 12, 2, 12, 0), datetime(2013, 12, 2, 14, 0), ], } ) if should_sort: df = df.sort_values(by="Quantity", ascending=False) df = df.set_index("Date", drop=False) g = df.groupby(Grouper(freq="6M")) assert g.group_keys assert isinstance(g.grouper, BinGrouper) groups = g.groups assert isinstance(groups, dict) assert len(groups) == 3
def test_aggregate_with_nat_size():
    # GH 9925: size() on a TimeGrouper must match a normal groupby,
    # with NaT rows dropped and the empty bin reported as 0.
    n = 20
    data = np.random.randn(n, 4).astype("int64")

    normal_df = DataFrame(data, columns=["A", "B", "C", "D"])
    normal_df["key"] = [1, 2, np.nan, 4, 5] * 4

    dt_df = DataFrame(data, columns=["A", "B", "C", "D"])
    dt_df["key"] = [
        datetime(2013, 1, 1),
        datetime(2013, 1, 2),
        pd.NaT,
        datetime(2013, 1, 4),
        datetime(2013, 1, 5),
    ] * 4

    normal_grouped = normal_df.groupby("key")
    dt_grouped = dt_df.groupby(Grouper(key="key", freq="D"))

    normal_result = normal_grouped.size()
    dt_result = dt_grouped.size()

    # Pad the missing NaN group back in as the empty 2013-01-03 bin.
    pad = Series([0], index=[3])
    expected = normal_result.append(pad).sort_index()
    expected.index = date_range(
        start="2013-01-01", freq="D", periods=5, name="key"
    )._with_freq(None)
    tm.assert_series_equal(expected, dt_result)
    assert dt_result.index.name == "key"
def test_aggregate_with_nat(func, fill_value):
    # check TimeGrouper's aggregation is identical as normal groupby
    # if NaT is included, 'var', 'std', 'mean', 'first','last'
    # and 'nth' doesn't work yet
    n = 20
    data = np.random.randn(n, 4).astype("int64")

    normal_df = DataFrame(data, columns=["A", "B", "C", "D"])
    normal_df["key"] = [1, 2, np.nan, 4, 5] * 4

    dt_df = DataFrame(data, columns=["A", "B", "C", "D"])
    dt_df["key"] = [
        datetime(2013, 1, 1),
        datetime(2013, 1, 2),
        pd.NaT,
        datetime(2013, 1, 4),
        datetime(2013, 1, 5),
    ] * 4

    normal_grouped = normal_df.groupby("key")
    dt_grouped = dt_df.groupby(Grouper(key="key", freq="D"))

    normal_result = getattr(normal_grouped, func)()
    dt_result = getattr(dt_grouped, func)()

    # Pad the normal result with the fill_value row the TimeGrouper emits
    # for the all-NaT 2013-01-03 bin.
    pad = DataFrame([[fill_value] * 4], index=[3], columns=["A", "B", "C", "D"])
    expected = normal_result.append(pad).sort_index()
    dti = date_range(start="2013-01-01", freq="D", periods=5, name="key")
    expected.index = dti._with_freq(None)  # TODO: is this desired?
    tm.assert_frame_equal(expected, dt_result)
    assert dt_result.index.name == "key"
def test_groupby_apply_timegrouper_with_nat_apply_squeeze(
    self, frame_for_truncated_bingrouper
):
    df = frame_for_truncated_bingrouper

    # We need to create a GroupBy object with only one non-NaT group,
    # so use a huge freq so that all non-NaT dates will be grouped together
    tdg = Grouper(key="Date", freq="100Y")

    with tm.assert_produces_warning(FutureWarning, match="`squeeze` parameter"):
        gb = df.groupby(tdg, squeeze=True)

    # check that we will go through the singular_series path
    # in _wrap_applied_output_series
    assert gb.ngroups == 1
    assert gb._selected_obj._get_axis(gb.axis).nlevels == 1

    # function that returns a Series
    res = gb.apply(lambda x: x["Quantity"] * 2)

    # All non-NaT rows land in the single 100-year bin ending 2013-12-31.
    key = Timestamp("2013-12-31")
    ordering = df["Date"].sort_values().dropna().index
    mi = MultiIndex.from_product([[key], ordering], names=["Date", None])

    ex_values = df["Quantity"].take(ordering).values * 2
    expected = Series(ex_values, index=mi, name="Quantity")
    tm.assert_series_equal(res, expected)
def test_aggregate_normal(resample_method): """Check TimeGrouper's aggregation is identical as normal groupby.""" if resample_method == "ohlc": pytest.xfail(reason="DataError: No numeric types to aggregate") data = np.random.randn(20, 4) normal_df = DataFrame(data, columns=["A", "B", "C", "D"]) normal_df["key"] = [1, 2, 3, 4, 5] * 4 dt_df = DataFrame(data, columns=["A", "B", "C", "D"]) dt_df["key"] = [ datetime(2013, 1, 1), datetime(2013, 1, 2), datetime(2013, 1, 3), datetime(2013, 1, 4), datetime(2013, 1, 5), ] * 4 normal_grouped = normal_df.groupby("key") dt_grouped = dt_df.groupby(Grouper(key="key", freq="D")) expected = getattr(normal_grouped, resample_method)() dt_result = getattr(dt_grouped, resample_method)() expected.index = date_range(start="2013-01-01", freq="D", periods=5, name="key") tm.assert_equal(expected, dt_result) # if TimeGrouper is used included, 'nth' doesn't work yet """
def test_groupby_with_timegrouper(self):
    # GH 4161
    # TimeGrouper requires a sorted index
    # also verifies that the resultant index has the correct name
    df_original = DataFrame(
        {
            "Buyer": "Carl Carl Carl Carl Joe Carl".split(),
            "Quantity": [18, 3, 5, 1, 9, 3],
            "Date": [
                datetime(2013, 9, 1, 13, 0),
                datetime(2013, 9, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 3, 10, 0),
                datetime(2013, 12, 2, 12, 0),
                datetime(2013, 9, 2, 14, 0),
            ],
        }
    )

    # GH 6908 change target column's order
    df_reordered = df_original.sort_values(by="Quantity")

    for df in [df_original, df_reordered]:
        df = df.set_index(["Date"])

        # Empty 5-day bins sum to 0; only three bins carry data.
        expected = DataFrame(
            {"Quantity": 0},
            index=date_range(
                "20130901", "20131205", freq="5D", name="Date", inclusive="left"
            ),
        )
        expected.iloc[[0, 6, 18], 0] = np.array([24, 6, 9], dtype="int64")

        msg = "The default value of numeric_only"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result1 = df.resample("5D").sum()
        tm.assert_frame_equal(result1, expected)

        df_sorted = df.sort_index()
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result2 = df_sorted.groupby(Grouper(freq="5D")).sum()
        tm.assert_frame_equal(result2, expected)

        with tm.assert_produces_warning(FutureWarning, match=msg):
            result3 = df.groupby(Grouper(freq="5D")).sum()
        tm.assert_frame_equal(result3, expected)
def test_apply_iteration():
    # #2300
    N = 1000
    ind = pd.date_range(start="2000-01-01", freq="D", periods=N)
    df = DataFrame({'open': 1, 'close': 2}, index=ind)
    tg = Grouper(freq='M')

    _, grouper, _ = tg._get_grouper(df)

    # Errors
    grouped = df.groupby(grouper, group_keys=False)

    def f(df):
        return df['close'] / df['open']

    # it works!
    result = grouped.apply(f)
    tm.assert_index_equal(result.index, df.index)
def test_fails_on_no_datetime_index(name, func): n = 2 index = func(n) df = DataFrame({"a": np.random.randn(n)}, index=index) msg = ("Only valid with DatetimeIndex, TimedeltaIndex " f"or PeriodIndex, but got an instance of '{name}'") with pytest.raises(TypeError, match=msg): df.groupby(Grouper(freq="D"))
def test_apply_iteration():
    # #2300
    N = 1000
    ind = pd.date_range(start="2000-01-01", freq="D", periods=N)
    df = DataFrame({"open": 1, "close": 2}, index=ind)
    tg = Grouper(freq="M")

    _, grouper, _ = tg._get_grouper(df)

    # Errors
    grouped = df.groupby(grouper, group_keys=False)

    def f(df):
        return df["close"] / df["open"]

    # it works!
    result = grouped.apply(f)
    tm.assert_index_equal(result.index, df.index)
def test_resampler_is_iterable(series):
    # GH 15314: iterating a Resampler must yield the same (key, group)
    # pairs as iterating the equivalent TimeGrouper groupby.
    freq = "H"
    tg = Grouper(freq=freq, convention="start")
    grouped = series.groupby(tg)
    resampled = series.resample(freq)
    for (rk, rv), (gk, gv) in zip(resampled, grouped):
        assert rk == gk
        tm.assert_series_equal(rv, gv)
def test_count():
    # count() through a yearly TimeGrouper and through resample must agree
    # with a plain groupby-by-year, NaNs excluded.
    test_series[::3] = np.nan

    expected = test_series.groupby(lambda x: x.year).count()

    grouper = Grouper(freq="A", label="right", closed="right")
    result = test_series.groupby(grouper).count()
    expected.index = result.index
    tm.assert_series_equal(result, expected)

    result = test_series.resample("A").count()
    expected.index = result.index
    tm.assert_series_equal(result, expected)
def test_count():
    # count() through a yearly TimeGrouper and through resample must agree
    # with a plain groupby-by-year, NaNs excluded.
    test_series[::3] = np.nan

    expected = test_series.groupby(lambda x: x.year).count()

    grouper = Grouper(freq='A', label='right', closed='right')
    result = test_series.groupby(grouper).count()
    expected.index = result.index
    assert_series_equal(result, expected)

    result = test_series.resample('A').count()
    expected.index = result.index
    assert_series_equal(result, expected)
def test_custom_grouper(index):
    """Exercise Grouper(freq=...) end to end on a minute-frequency index.

    Checks that every cython aggregation runs under both grouper variants,
    that group counts/means are sane, and that ``agg(np.sum)`` preserves the
    input dtype (GH2763).
    """
    dti = index
    s = Series(np.array([1] * len(dti)), index=dti, dtype='int64')

    # Both grouper variants must support every cython aggregation.
    # (The original duplicated this list verbatim before each loop.)
    funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var']

    b = Grouper(freq=Minute(5))
    g = s.groupby(b)
    for f in funcs:
        g._cython_agg_general(f)

    b = Grouper(freq=Minute(5), closed='right', label='right')
    g = s.groupby(b)
    for f in funcs:
        g._cython_agg_general(f)

    assert g.ngroups == 2593
    assert notna(g.mean()).all()

    # construct expected val
    arr = [1] + [5] * 2592
    idx = dti[0:-1:5]
    idx = idx.append(dti[-1:])
    expect = Series(arr, index=idx)

    # GH2763 - return input dtype if we can
    result = g.agg(np.sum)
    assert_series_equal(result, expect)

    df = DataFrame(np.random.rand(len(dti), 10), index=dti, dtype='float64')
    r = df.groupby(b).agg(np.sum)
    assert len(r.columns) == 10
    assert len(r.index) == 2593
def test_apply():
    # apply() through a yearly TimeGrouper must match apply() through a
    # plain groupby-by-year once the outer (group-key) level is dropped.
    grouper = Grouper(freq="A", label="right", closed="right")
    grouped = test_series.groupby(grouper)

    def f(x):
        return x.sort_values()[-3:]

    applied = grouped.apply(f)
    expected = test_series.groupby(lambda x: x.year).apply(f)

    applied.index = applied.index.droplevel(0)
    expected.index = expected.index.droplevel(0)
    tm.assert_series_equal(applied, expected)
def test_nunique_with_timegrouper_and_nat(self): # GH 17575 test = DataFrame({ "time": [ Timestamp("2016-06-28 09:35:35"), pd.NaT, Timestamp("2016-06-28 16:46:28"), ], "data": ["1", "2", "3"], }) grouper = Grouper(key="time", freq="h") result = test.groupby(grouper)["data"].nunique() expected = test[test.time.notnull()].groupby(grouper)["data"].nunique() expected.index = expected.index._with_freq(None) tm.assert_series_equal(result, expected)
def test_timegrouper_apply_return_type_value(self): # Using `apply` with the `TimeGrouper` should give the # same return type as an `apply` with a `Grouper`. # Issue #11742 df = pd.DataFrame({'date': ['10/10/2000', '11/10/2000'], 'value': [10, 13]}) df_dt = df.copy() df_dt['date'] = pd.to_datetime(df_dt['date']) def sumfunc_value(x): return x.value.sum() expected = df.groupby(pd.Grouper(key='date')).apply(sumfunc_value) result = (df_dt.groupby(Grouper(freq='M', key='date')) .apply(sumfunc_value)) assert_series_equal(result.reset_index(drop=True), expected.reset_index(drop=True))
def test_resample_frame_basic():
    # Smoke-test cython aggregations plus annual/monthly/period resampling
    # on a DataFrame.
    df = tm.makeTimeDataFrame()

    b = Grouper(freq="M")
    g = df.groupby(b)

    # check all cython functions work
    for f in ["add", "mean", "prod", "min", "max", "var"]:
        g._cython_agg_general(f)

    result = df.resample("A").mean()
    tm.assert_series_equal(result["A"], df["A"].resample("A").mean())

    result = df.resample("M").mean()
    tm.assert_series_equal(result["A"], df["A"].resample("M").mean())

    df.resample("M", kind="period").mean()
    df.resample("W-WED", kind="period").mean()
def test_scalar_call_versus_list_call(self): # Issue: 17530 data_frame = { "location": ["shanghai", "beijing", "shanghai"], "time": Series( ["2017-08-09 13:32:23", "2017-08-11 23:23:15", "2017-08-11 22:23:15"], dtype="datetime64[ns]", ), "value": [1, 2, 3], } data_frame = DataFrame(data_frame).set_index("time") grouper = Grouper(freq="D") grouped = data_frame.groupby(grouper) result = grouped.count() grouped = data_frame.groupby([grouper]) expected = grouped.count() tm.assert_frame_equal(result, expected)
def test_resample_frame_basic():
    # Smoke-test cython aggregations plus annual/monthly/period resampling
    # on a DataFrame.
    df = tm.makeTimeDataFrame()

    b = Grouper(freq='M')
    g = df.groupby(b)

    # check all cython functions work
    for f in ['add', 'mean', 'prod', 'min', 'max', 'var']:
        g._cython_agg_general(f)

    result = df.resample('A').mean()
    assert_series_equal(result['A'], df['A'].resample('A').mean())

    result = df.resample('M').mean()
    assert_series_equal(result['A'], df['A'].resample('M').mean())

    df.resample('M', kind='period').mean()
    df.resample('W-WED', kind='period').mean()
def test_aaa_group_order():
    # GH 12840
    # check TimeGrouper perform stable sorts
    n = 20
    data = np.random.randn(n, 4)
    df = DataFrame(data, columns=["A", "B", "C", "D"])
    df["key"] = [
        datetime(2013, 1, 1),
        datetime(2013, 1, 2),
        datetime(2013, 1, 3),
        datetime(2013, 1, 4),
        datetime(2013, 1, 5),
    ] * 4
    grouped = df.groupby(Grouper(key="key", freq="D"))

    # Rows repeat every 5 days, so each daily group is a stride of the frame
    # and must come back in original row order.
    for offset in range(5):
        tm.assert_frame_equal(
            grouped.get_group(datetime(2013, 1, 1 + offset)), df[offset::5]
        )
def test_aaa_group_order():
    # GH 12840
    # check TimeGrouper perform stable sorts
    n = 20
    data = np.random.randn(n, 4)
    df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
    df['key'] = [
        datetime(2013, 1, 1),
        datetime(2013, 1, 2),
        datetime(2013, 1, 3),
        datetime(2013, 1, 4),
        datetime(2013, 1, 5),
    ] * 4
    grouped = df.groupby(Grouper(key='key', freq='D'))

    # Rows repeat every 5 days, so each daily group is a stride of the frame
    # and must come back in original row order.
    for offset in range(5):
        tm.assert_frame_equal(
            grouped.get_group(datetime(2013, 1, 1 + offset)), df[offset::5]
        )
def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper):
    """
    GroupBy object such that gb.grouper is a BinGrouper and
    len(gb.grouper.result_index) < len(gb.grouper.group_keys_seq)

    Aggregations on this groupby should have

        dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date")

    As either the index or an index level.
    """
    df = frame_for_truncated_bingrouper
    tdg = Grouper(key="Date", freq="5D")
    gb = df.groupby(tdg)

    # check we're testing the case we're interested in
    assert len(gb.grouper.result_index) != len(gb.grouper.group_keys_seq)

    return gb
def test_resample_ohlc(series):
    # ohlc() over 5-minute bins: spot-check the first and second-to-last
    # bars against the raw series.
    s = series

    grouper = Grouper(freq=Minute(5))
    expect = s.groupby(grouper).agg(lambda x: x[-1])
    result = s.resample("5Min").ohlc()

    assert len(result) == len(expect)
    assert len(result.columns) == 4

    bar = result.iloc[-2]
    assert bar["open"] == s[-6]
    assert bar["high"] == s[-6:-1].max()
    assert bar["low"] == s[-6:-1].min()
    assert bar["close"] == s[-2]

    bar = result.iloc[0]
    assert bar["open"] == s[0]
    assert bar["high"] == s[:5].max()
    assert bar["low"] == s[:5].min()
    assert bar["close"] == s[4]