def downsample(self, freq, closed=None, label=None, axis=0, drop_empty=True):
    """
    Essentially use resample logic but returning the groupby object.

    Parameters
    ----------
    freq : offset string or DateOffset
        Target bin frequency.
    closed : {'right', 'left'}, optional
        Which side of the bin interval is closed; defaults to the
        offset's convention from ``_offset_defaults``.
    label : {'right', 'left'}, optional
        Which bin edge labels the bucket; defaults likewise.
    axis : int, default 0
        Axis to group along.
    drop_empty : bool, default True
        Remove bins that contain no observations, so irregular data can
        be grouped (e.g. into daily bins) without creating empty days.

    Returns
    -------
    GroupBy object keyed by the (possibly pruned) time bins.
    """
    # default closed/label on offset
    defaults = _offset_defaults(freq)
    if closed is None:
        closed = defaults['closed']
    if label is None:
        label = defaults['label']

    tg = TimeGrouper(freq, closed=closed, label=label, axis=axis)
    grouper = tg.get_grouper(self)

    # drop empty groups. this is when we have irregular data that
    # we just want to group into Daily without creating empty days.
    if drop_empty:
        bins = [0]  # prepend 0 so np.diff yields per-bin sizes
        bins.extend(grouper.bins)
        bins = np.array(bins)
        periods_in_bin = np.diff(bins)
        empty = periods_in_bin == 0

        binlabels = grouper.binlabels
        # skip the 0 we added, then mask out the empty bins
        bins = bins[1:][~empty]
        binlabels = binlabels[~empty]
        grouper = BinGrouper(bins, binlabels)

    return self.groupby(grouper, axis=axis)
def resample(self, rule, how='mean', axis=0, fill_method=None, closed='right',
             label='right', convention=None, kind=None, loffset=None,
             limit=None):
    """
    Convenience method for frequency conversion and resampling of
    regular time-series data.

    Parameters
    ----------
    rule : the offset string or object representing target conversion
    how : string, method for down- or re-sampling, default 'mean'
    fill_method : string, fill_method for upsampling, default None
    axis : int, optional, default 0
    closed : {'right', 'left'}, default 'right'
        Which side of bin interval is closed
    label : {'right', 'left'}, default 'right'
        Which bin edge label to label bucket with
    convention : {'start', 'end', 's', 'e'}
    loffset : timedelta
        Adjust the resampled time labels
    """
    from pandas.tseries.resample import TimeGrouper

    time_grouper = TimeGrouper(rule, label=label, closed=closed, how=how,
                               axis=axis, kind=kind, loffset=loffset,
                               fill_method=fill_method,
                               convention=convention, limit=limit)
    return time_grouper.resample(self)
def resample(self, rule, how=None, axis=0, fill_method=None, closed='right',
             label='right', convention=None, kind=None, loffset=None,
             limit=None, base=0):
    """
    Convenience method for frequency conversion and resampling of
    regular time-series data.

    Parameters
    ----------
    rule : the offset string or object representing target conversion
    how : string, method for down- or re-sampling, default to 'mean' for
          downsampling
    fill_method : string, fill_method for upsampling, default None
    axis : int, optional, default 0
    closed : {'right', 'left'}, default 'right'
        Which side of bin interval is closed
    label : {'right', 'left'}, default 'right'
        Which bin edge label to label bucket with
    convention : {'start', 'end', 's', 'e'}
    loffset : timedelta
        Adjust the resampled time labels
    base : int, default 0
        For frequencies that evenly subdivide 1 day, the "origin" of the
        aggregated intervals. For example, for '5min' frequency, base
        could range from 0 through 4. Defaults to 0
    """
    from pandas.tseries.resample import TimeGrouper

    time_grouper = TimeGrouper(rule, label=label, closed=closed, how=how,
                               axis=axis, kind=kind, loffset=loffset,
                               fill_method=fill_method,
                               convention=convention, limit=limit,
                               base=base)
    return time_grouper.resample(self)
def test_panel_aggregation(self):
    """agg() with a Panel-mean callable must match GroupBy.mean()."""
    index = pd.date_range("1/1/2000", periods=100)
    values = np.random.randn(2, len(index), 4)
    panel = pd.Panel(values, items=["Item1", "Item2"], major_axis=index,
                     minor_axis=["A", "B", "C", "D"])

    _, grouper, _ = TimeGrouper("M", axis=1).get_grouper(panel)
    grouped = panel.groupby(grouper)
    expected = grouped.mean()

    def mean_over_major(x):
        assert isinstance(x, Panel)
        return x.mean(1)

    tm.assert_panel_equal(grouped.agg(mean_over_major), expected)
def test_panel_aggregation(self):
    """agg() with a Panel-mean callable must match GroupBy.mean()."""
    index = pd.date_range('1/1/2000', periods=100)
    values = np.random.randn(2, len(index), 4)
    panel = pd.Panel(values, items=['Item1', 'Item2'], major_axis=index,
                     minor_axis=['A', 'B', 'C', 'D'])

    grouper = TimeGrouper('M', axis=1).get_grouper(panel)
    grouped = panel.groupby(grouper)
    expected = grouped.mean()

    def mean_over_major(x):
        assert isinstance(x, Panel)
        return x.mean(1)

    tm.assert_panel_equal(grouped.agg(mean_over_major), expected)
def test_apply_iteration(self):
    # regression test for GH #2300
    periods = 1000
    index = pd.date_range(start="2000-01-01", freq="D", periods=periods)
    df = DataFrame({'open': 1, 'close': 2}, index=index)

    _, grouper, _ = TimeGrouper('M')._get_grouper(df)  # Errors
    grouped = df.groupby(grouper, group_keys=False)

    def close_open_ratio(frame):
        return frame['close'] / frame['open']

    # it works!
    result = grouped.apply(close_open_ratio)
    self.assertTrue(result.index.equals(df.index))
def test_apply_iteration(self):
    # regression test for GH #2300
    periods = 1000
    index = pd.date_range(start="2000-01-01", freq="D", periods=periods)
    df = DataFrame({'open': 1, 'close': 2}, index=index)

    grouper = TimeGrouper('M').get_grouper(df)  # Errors
    grouped = df.groupby(grouper, group_keys=False)

    def close_open_ratio(frame):
        return frame['close'] / frame['open']

    # it works!
    result = grouped.apply(close_open_ratio)
    self.assertTrue(result.index.equals(df.index))
def test_panelgroupby(self):
    """Aggregating a TimeGrouper-grouped Panel must not raise."""
    def agg_func(pan):
        assert isinstance(pan, pd.Panel)
        return pan.mean()

    index = pd.date_range('1/1/2000', periods=100)
    values = np.random.randn(2, len(index), 4)
    panel = pd.Panel(values, items=['Item1', 'Item2'], major_axis=index,
                     minor_axis=['A', 'B', 'C', 'D'])

    from pandas.tseries.resample import TimeGrouper

    # timegrouper on the major (time) axis
    grouper = TimeGrouper('M', axis=1).get_grouper(panel)
    grouped = panel.groupby(grouper)

    # Failed 12-15-12
    # https://github.com/pydata/pandas/issues/2537
    grouped.agg(agg_func)
def test_panel_aggregation(self):
    """agg() with a Panel-mean callable must match GroupBy.mean()."""
    index = pd.date_range('1/1/2000', periods=100)
    values = np.random.randn(2, len(index), 4)
    panel = pd.Panel(values, items=['Item1', 'Item2'], major_axis=index,
                     minor_axis=['A', 'B', 'C', 'D'])

    _, grouper, _ = TimeGrouper('M', axis=1)._get_grouper(panel)
    grouped = panel.groupby(grouper)
    expected = grouped.mean()

    def mean_over_major(x):
        assert isinstance(x, Panel)
        return x.mean(1)

    tm.assert_panel_equal(grouped.agg(mean_over_major), expected)
def test_custom_grouper(self):
    # A TimeGrouper passed directly to groupby must drive all cython
    # aggregations and produce the expected bin structure/counts.
    dti = DatetimeIndex(freq='Min', start=datetime(2005, 1, 1),
                        end=datetime(2005, 1, 10))
    # constant int series, one value per minute over 9 days
    s = Series(np.array([1] * len(dti)), index=dti, dtype='int64')

    b = TimeGrouper(Minute(5))
    g = s.groupby(b)

    # check all cython functions work
    funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var']
    for f in funcs:
        g._cython_agg_general(f)

    # same checks with explicit closed/label on the right edge
    b = TimeGrouper(Minute(5), closed='right', label='right')
    g = s.groupby(b)

    # check all cython functions work
    funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var']
    for f in funcs:
        g._cython_agg_general(f)

    # 9 days of minutes in 5-min right-closed bins -> 2593 groups
    self.assertEquals(g.ngroups, 2593)
    self.assert_(notnull(g.mean()).all())

    # construct expected val: first bin holds 1 value, the rest hold 5
    arr = [1] + [5] * 2592
    idx = dti[0:-1:5]
    idx = idx.append(dti[-1:])
    expect = Series(arr, index=idx)

    # GH2763 - return input dtype if we can
    result = g.agg(np.sum)
    assert_series_equal(result, expect)

    # the same grouper applied to a DataFrame keeps all columns/bins
    df = DataFrame(np.random.rand(len(dti), 10), index=dti,
                   dtype='float64')
    r = df.groupby(b).agg(np.sum)

    self.assertEquals(len(r.columns), 10)
    self.assertEquals(len(r.index), 2593)
def test_count(self):
    # resample(how='count') should count only non-null values, matching
    # a plain by-year groupby count.
    # (Removed an unused local: a TimeGrouper('A', ...) was constructed
    # here but never referenced.)
    self.ts[::3] = np.nan

    result = self.ts.resample('A', how='count')

    expected = self.ts.groupby(lambda x: x.year).count()
    expected.index = result.index

    assert_series_equal(result, expected)
def downsample(self, freq, closed=None, label=None, axis=0):
    """
    Essentially use resample logic but returning the groupby object.

    Parameters
    ----------
    freq : offset string or DateOffset
        Target bin frequency.
    closed : {'right', 'left'}, optional
        Which side of the bin interval is closed; defaults to the
        offset's convention from ``_offset_defaults``.
    label : {'right', 'left'}, optional
        Which bin edge labels the bucket; defaults likewise.
    axis : int, default 0
        Axis to group along.

    Returns
    -------
    GroupBy object keyed by the time bins.
    """
    # default closed/label on offset
    defaults = _offset_defaults(freq)
    if closed is None:
        closed = defaults['closed']
    if label is None:
        label = defaults['label']

    tg = TimeGrouper(freq, closed=closed, label=label, axis=axis)
    grouper = tg.get_grouper(self)

    # TODO: drop empty bins (np.diff over grouper.bins would locate them)
    return self.groupby(grouper, axis=axis)
def test_apply(self):
    """apply through a TimeGrouper matches a by-year lambda groupby."""
    grouper = TimeGrouper('A', label='right', closed='right')

    top_three = lambda x: x.order()[-3:]

    applied = self.ts.groupby(grouper).apply(top_three)
    expected = self.ts.groupby(lambda x: x.year).apply(top_three)

    # drop the outer group level so the two indexes line up
    applied.index = applied.index.droplevel(0)
    expected.index = expected.index.droplevel(0)
    assert_series_equal(applied, expected)
def test_fails_on_no_datetime_index(self):
    """TimeGrouper must reject any index that is not a DatetimeIndex."""
    cases = [
        ('Int64Index', tm.makeIntIndex),
        ('PeriodIndex', tm.makePeriodIndex),
        ('Index', tm.makeUnicodeIndex),
        ('Float64Index', tm.makeFloatIndex),
        ('MultiIndex', lambda m: tm.makeCustomIndex(m, 2)),
    ]

    n = 2
    for name, make_index in cases:
        df = DataFrame({'a': np.random.randn(n)}, index=make_index(n))
        with tm.assertRaisesRegexp(
                TypeError, "axis must be a DatetimeIndex, "
                "but got an instance of %r" % name):
            df.groupby(TimeGrouper('D'))
def resample(self, rule, how=None, axis=0, fill_method=None, closed=None,
             label=None, convention='start', kind=None, loffset=None,
             limit=None, base=0):
    """
    Convenience method for frequency conversion and resampling of
    regular time-series data.

    Parameters
    ----------
    rule : the offset string or object representing target conversion
    how : string, method for down- or re-sampling, default to 'mean' for
          downsampling
    axis : int, optional, default 0
    fill_method : string, fill_method for upsampling, default None
    closed : {'right', 'left'}
        Which side of bin interval is closed
    label : {'right', 'left'}
        Which bin edge label to label bucket with
    convention : {'start', 'end', 's', 'e'}
    kind : "period"/"timestamp"
    loffset : timedelta
        Adjust the resampled time labels
    limit : int, default None
        Maximum size gap to when reindexing with fill_method
    base : int, default 0
        For frequencies that evenly subdivide 1 day, the "origin" of the
        aggregated intervals. For example, for '5min' frequency, base
        could range from 0 through 4. Defaults to 0
    """
    from pandas.tseries.resample import TimeGrouper

    # normalize axis names ('index'/'columns') to their integer position
    axis_number = self._get_axis_number(axis)
    time_grouper = TimeGrouper(rule, label=label, closed=closed, how=how,
                               axis=axis_number, kind=kind,
                               loffset=loffset, fill_method=fill_method,
                               convention=convention, limit=limit,
                               base=base)
    return time_grouper.resample(self)
def aggregate_data(df, timescale, method):
    """Aggregate data to the given timescale.

    Parameters
    ----------
    df : pandas.DataFrame
        Time-indexed data to aggregate.
    timescale : {'monthly', 'seasonal'}
        Target resolution; 'seasonal' groups into 3-month bins.
    method : {'sum', 'mean'}
        Aggregation statistic.

    Returns
    -------
    pandas.DataFrame
        Aggregated data with the final bin dropped (it is typically an
        incomplete period).

    Raises
    ------
    ValueError
        If ``timescale`` or ``method`` is not one of the allowed values.
    """
    # Validate with real exceptions rather than assert, which is
    # silently stripped when Python runs with -O.
    if timescale not in ('monthly', 'seasonal'):
        raise ValueError("timescale must be 'monthly' or 'seasonal', "
                         "got %r" % (timescale,))
    if method not in ('sum', 'mean'):
        raise ValueError("method must be 'sum' or 'mean', got %r"
                         % (method,))

    timescale_dict = {'monthly': '1M', 'seasonal': '3M'}

    grouped = df.groupby(
        TimeGrouper(freq=timescale_dict[timescale], closed='left'))
    aggregated_data = grouped.sum() if method == 'sum' else grouped.mean()

    # drop the trailing (usually partial) bin
    aggregated_data = aggregated_data.drop(aggregated_data.index[-1])

    return aggregated_data
def test_resample_frame_basic(self):
    """DataFrame resample agrees column-wise with Series resample."""
    df = tm.makeTimeDataFrame()

    grouped = df.groupby(TimeGrouper('M'))

    # check all cython functions work
    for func in ['add', 'mean', 'prod', 'min', 'max', 'var']:
        grouped._cython_agg_general(func)

    annual = df.resample('A')
    assert_series_equal(annual['A'], df['A'].resample('A'))

    monthly = df.resample('M')
    assert_series_equal(monthly['A'], df['A'].resample('M'))

    # period-kind resampling should at least not raise
    df.resample('M', kind='period')
    df.resample('W-WED', kind='period')
def test_panelgroupby(self):
    """Passing a TimeGrouper straight to Panel.groupby must aggregate."""
    def agg_func(pan):
        assert isinstance(pan, pd.Panel)
        return pan.mean()

    index = pd.date_range('1/1/2000', periods=100)
    values = np.random.randn(2, len(index), 4)
    panel = pd.Panel(values, items=['Item1', 'Item2'], major_axis=index,
                     minor_axis=['A', 'B', 'C', 'D'])

    from pandas.tseries.resample import TimeGrouper

    # timegrouper handed directly to groupby (no get_grouper step)
    grouped = panel.groupby(TimeGrouper('M', axis=1))

    # Failed 12-15-12
    # https://github.com/pydata/pandas/issues/2537
    grouped.agg(agg_func)
def test_resample_basic(self):
    """5-minute mean resampling under both closed conventions, plus
    how='last' versus an explicit TimeGrouper groupby/agg."""
    index = date_range('1/1/2000 00:00:00', '1/1/2000 00:13:00',
                       freq='min', name='index')
    s = Series(np.random.randn(14), index=index)

    # closed/label='right': the first observation forms its own bin
    result = s.resample('5min', how='mean', closed='right', label='right')
    expected = Series(
        [s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()],
        index=date_range('1/1/2000', periods=4, freq='5min'))
    assert_series_equal(result, expected)
    self.assertEqual(result.index.name, 'index')

    # closed='left', label='right': bins shift relative to their labels
    result = s.resample('5min', how='mean', closed='left', label='right')
    expected = Series(
        [s[:5].mean(), s[5:10].mean(), s[10:].mean()],
        index=date_range('1/1/2000 00:05', periods=3, freq='5min'))
    assert_series_equal(result, expected)

    s = self.series
    result = s.resample('5Min', how='last')
    grouper = TimeGrouper(Minute(5), closed='left', label='left')
    expected = s.groupby(grouper).agg(lambda x: x[-1])
    assert_series_equal(result, expected)
def test_resample_ohlc(self):
    """OHLC resampling: bin count matches a plain groupby, and each bar's
    open/high/low/close reflect the underlying 5-minute slice."""
    s = self.series

    expected = s.groupby(TimeGrouper(Minute(5))).agg(lambda x: x[-1])
    result = s.resample('5Min', how='ohlc')

    self.assertEquals(len(result), len(expected))
    self.assertEquals(len(result.columns), 4)

    # second-to-last bar covers s[-6:-1]
    bar = result.irow(-2)
    self.assertEquals(bar['open'], s[-6])
    self.assertEquals(bar['high'], s[-6:-1].max())
    self.assertEquals(bar['low'], s[-6:-1].min())
    self.assertEquals(bar['close'], s[-2])

    # first bar covers s[:5]
    bar = result.irow(0)
    self.assertEquals(bar['open'], s[0])
    self.assertEquals(bar['high'], s[:5].max())
    self.assertEquals(bar['low'], s[:5].min())
    self.assertEquals(bar['close'], s[4])
def test_resample_ohlc(self):
    """OHLC resampling with right-closed/right-labelled bins."""
    s = self.series

    expected = s.groupby(
        TimeGrouper(Minute(5), closed='right', label='right')).agg(
            lambda x: x[-1])
    result = s.resample('5Min', how='ohlc')

    self.assertEquals(len(result), len(expected))
    self.assertEquals(len(result.columns), 4)

    # last bar covers s[-5:]
    bar = result.irow(-1)
    self.assertEquals(bar['open'], s[-5])
    self.assertEquals(bar['high'], s[-5:].max())
    self.assertEquals(bar['low'], s[-5:].min())
    self.assertEquals(bar['close'], s[-1])

    # second bar covers s[1:6]
    bar = result.irow(1)
    self.assertEquals(bar['open'], s[1])
    self.assertEquals(bar['high'], s[1:6].max())
    self.assertEquals(bar['low'], s[1:6].min())
    self.assertEquals(bar['close'], s[5])
def sensorAggregate(oat, aggregation='mean', frequency='D', qilist=None,
                    min_obs=None, nan_data=np.nan, nan_qi=0, closed='left',
                    label='left', column_name=None):
    """
    Aggregate OAT.sensor according specified parameters

    Args:
        oat (OAT.sensor): OAT.sensor object to be aggregated
        aggregation (str): specific aggregation options:
            {'max', 'min', 'mean', 'count'}, default 'mean'
        frequency (str): pandas offset alias for the grouping bins,
            default 'D'
        qilist (list): list of quality Index values to select observations
            used in aggregation
        min_obs (float): minimum number of non null values recorded in the
            period to calculate the aggregation (note that this includes
            only valid qualityIndexed measures)
        nan_data (float): value to assign in aggregation when no or
            insufficient data are available
        nan_qi (int): value to assign in aggregation when no or
            insufficient data are available
        closed (str): which side of bin interval is closed:
            {'right', 'left'}, default 'left'
        label (str): which bin edge label to label bucket with:
            {'right', 'left'}, default 'left'
        column_name (str): optional new name for the 'data' column in
            the result

    NOTE(review): nan_data and nan_qi are currently only referenced from
    commented-out code below — confirm whether they are still part of the
    intended contract.
    """
    try:
        # per-column aggregations: 'num' counts rows per bin, 'quality'
        # keeps the worst (minimum) quality index seen in each bin
        aggregations = {'num': 'count', 'data': aggregation,
                        'quality': 'min'}
        toat = oat.copy()
        toat.ts['num'] = 1  # helper column so 'count' sees every row
        if qilist:
            # restrict to rows whose quality index is in the accepted list
            toat.ts = toat.ts[(toat.ts['quality'].isin(qilist)
                               & toat.ts['quality'].notnull())].groupby(
                TimeGrouper(freq=frequency, closed=closed,
                            label=label)).agg(aggregations)
        else:
            toat.ts = toat.ts.dropna(how='any').groupby(
                TimeGrouper(freq=frequency, closed=closed,
                            label=label)).agg(aggregations)
        toat_values = list(toat.ts.columns.values)
        if min_obs:
            # NOTE(review): only the FIRST bin's observation count is
            # checked against min_obs — confirm whether every bin should
            # be validated instead.
            if (toat.ts['num'][0] < min_obs):
                # assign null to non satisfactory
                # toat_values[toat_values.index("num")] = 0
                # toat_values[toat_values.index("data")] = nan_data
                # toat_values[toat_values.index("quality")] = nan_qi
                # toat.ts[toat.ts['num'] < min_obs] = toat_values
                raise Exception(
                    ("The aggregation does not satisfy the minimum" +
                     " number of observations [%s]") % (min_obs))
    except Exception as e:
        raise e
    else:
        # extract only data & quality
        toat.ts = toat.ts[['data', 'quality']]
        if column_name:
            toat.ts.rename(inplace=True,
                           columns={'data': column_name})
    toat.freq = frequency
    return toat
def downsample(self, freq, closed='right', label='right', axis=0):
    """Group ``self`` into ``freq`` time bins, returning the groupby.

    NOTE(review): ``axis`` is applied to the groupby call but not
    forwarded to TimeGrouper — confirm that is intended.
    """
    grouper = TimeGrouper(freq, closed=closed,
                          label=label).get_grouper(self)
    return self.groupby(grouper, axis=axis)
def test_resample_basic(self):
    # Minute data resampled to 5-minute bins under both closed/label
    # conventions, then daily data resampled to weekly anchors and
    # business days.
    rng = date_range('1/1/2000 00:00:00', '1/1/2000 00:13:00',
                     freq='min', name='index')
    s = Series(np.random.randn(14), index=rng)

    # closed/label='right': the first observation forms its own bin
    result = s.resample('5min', how='mean', closed='right', label='right')
    expected = Series([s[0], s[1:6].mean(), s[6:11].mean(),
                       s[11:].mean()],
                      index=date_range('1/1/2000', periods=4,
                                       freq='5min'))
    assert_series_equal(result, expected)
    # the index name should survive resampling
    self.assert_(result.index.name == 'index')

    # closed='left', label='right': bins shift relative to their labels
    result = s.resample('5min', how='mean', closed='left', label='right')
    expected = Series([s[:5].mean(), s[5:10].mean(), s[10:].mean()],
                      index=date_range('1/1/2000 00:05', periods=3,
                                       freq='5min'))
    assert_series_equal(result, expected)

    # how='last' must match an explicit TimeGrouper + agg
    s = self.series
    result = s.resample('5Min', how='last')
    grouper = TimeGrouper(Minute(5), closed='right', label='right')
    expect = s.groupby(grouper).agg(lambda x: x[-1])
    assert_series_equal(result, expect)

    # from daily
    dti = DatetimeIndex(start=datetime(2005,1,1), end=datetime(2005,1,10),
                        freq='D', name='index')
    s = Series(np.random.rand(len(dti)), dti)

    # to weekly
    result = s.resample('w-sun', how='last')
    self.assertEquals(len(result), 3)
    self.assert_((result.index.dayofweek == [6,6,6]).all())
    self.assertEquals(result.irow(0), s['1/2/2005'])
    self.assertEquals(result.irow(1), s['1/9/2005'])
    self.assertEquals(result.irow(2), s.irow(-1))

    result = s.resample('W-MON', how='last')
    self.assertEquals(len(result), 2)
    self.assert_((result.index.dayofweek == [0,0]).all())
    self.assertEquals(result.irow(0), s['1/3/2005'])
    self.assertEquals(result.irow(1), s['1/10/2005'])

    result = s.resample('W-TUE', how='last')
    self.assertEquals(len(result), 2)
    self.assert_((result.index.dayofweek == [1,1]).all())
    self.assertEquals(result.irow(0), s['1/4/2005'])
    self.assertEquals(result.irow(1), s['1/10/2005'])

    result = s.resample('W-WED', how='last')
    self.assertEquals(len(result), 2)
    self.assert_((result.index.dayofweek == [2,2]).all())
    self.assertEquals(result.irow(0), s['1/5/2005'])
    self.assertEquals(result.irow(1), s['1/10/2005'])

    result = s.resample('W-THU', how='last')
    self.assertEquals(len(result), 2)
    self.assert_((result.index.dayofweek == [3,3]).all())
    self.assertEquals(result.irow(0), s['1/6/2005'])
    self.assertEquals(result.irow(1), s['1/10/2005'])

    result = s.resample('W-FRI', how='last')
    self.assertEquals(len(result), 2)
    self.assert_((result.index.dayofweek == [4,4]).all())
    self.assertEquals(result.irow(0), s['1/7/2005'])
    self.assertEquals(result.irow(1), s['1/10/2005'])

    # to biz day
    result = s.resample('B', how='last')
    self.assertEquals(len(result), 6)
    self.assert_((result.index.dayofweek == [0,1,2,3,4,0]).all())
    self.assertEquals(result.irow(0), s['1/3/2005'])
    self.assertEquals(result.irow(1), s['1/4/2005'])
    self.assertEquals(result.irow(5), s['1/10/2005'])
    self.assert_(result.index.name == 'index')
def sensorStats(oat, stat='mean', frequency='D', qilist=None, min_obs=None,
                nan_data=np.nan, nan_qi=0, closed='left', label='left',
                column_name=None):
    """Compute a per-bin statistic ('mean', 'max' or 'min') for a sensor.

    For stat == 'mean' the result holds the bin mean plus a 'count'
    column; for 'max'/'min' it holds the extreme observation's row plus
    the timestamp at which it occurred.

    NOTE(review): qilist, min_obs, nan_data and nan_qi are accepted but
    never used in this body — confirm whether that is intentional.
    """
    try:
        # 'data' gets the requested stat plus a count; 'quality' keeps
        # the worst (minimum) quality index per bin
        aggregations = {
            'data': [stat, 'count'],
            'quality': 'min',
        }
        toat = oat.copy()
        if stat == 'mean':
            grouped = toat.ts.dropna(how='any').groupby(
                TimeGrouper(freq=frequency, closed=closed,
                            label=label)).agg(aggregations)
            col_list = ['data', 'count']
            df1 = pd.DataFrame(data=None, columns=col_list)
            # flatten the MultiIndex aggregation columns into plain names
            # (the loop body overwrites whole columns, so one pass
            # already yields the final frame)
            for i in grouped:
                df1['data'] = grouped[(u'data', 'mean')]
                df1['count'] = grouped[(u'data', 'count')]
        else:
            # for max/min keep the entire row of the extreme observation,
            # remembering its original timestamp in a 'time' column
            grouped = toat.ts.dropna(how='any').groupby(
                TimeGrouper(freq=frequency, closed=closed, label=label))
            col_list = list(toat.ts.columns.values)
            col_list.append(u'time')
            df1 = pd.DataFrame(data=None, columns=col_list)
            for i in grouped:
                df = i[1]  # the group's sub-frame
                df.loc[:, u'time'] = df.index
                if not df.empty:
                    if stat == 'max':
                        df1.loc[i[0]] = df.loc[df['data'].idxmax()]
                    else:
                        df1.loc[i[0]] = df.loc[df['data'].idxmin()]
        toat.ts = df1
    except Exception as e:
        raise e
    else:
        # extract only data & quality
        if stat == 'mean':
            toat.ts = df1[['data', 'count']]
        else:
            toat.ts = toat.ts[['data', 'quality', 'time']]
        if column_name:
            if stat == 'mean':
                toat.ts.rename(inplace=True, columns={
                    'data': column_name,
                    'count': '{}_COUNT'.format(column_name)
                })
            else:
                toat.ts.rename(inplace=True, columns={
                    'data': column_name,
                    'time': 'TIME_' + column_name
                })
    toat.freq = frequency
    return toat