def test_multi_assign(self): # GH 3626, an assignement of a sub-df to a df df = DataFrame({'FC':['a','b','a','b','a','b'], 'PF':[0,0,0,0,1,1], 'col1':range(6), 'col2':range(6,12)}) df.ix[1,0]=np.nan df2 = df.copy() mask=~df2.FC.isnull() cols=['col1', 'col2'] dft = df2 * 2 dft.ix[3,3] = np.nan expected = DataFrame({'FC':['a',np.nan,'a','b','a','b'], 'PF':[0,0,0,0,1,1], 'col1':Series([0,1,4,6,8,10]), 'col2':[12,7,16,np.nan,20,22]}) # frame on rhs df2.ix[mask, cols]= dft.ix[mask, cols] assert_frame_equal(df2,expected) df2.ix[mask, cols]= dft.ix[mask, cols] assert_frame_equal(df2,expected) # with an ndarray on rhs df2 = df.copy() df2.ix[mask, cols]= dft.ix[mask, cols].values assert_frame_equal(df2,expected) df2.ix[mask, cols]= dft.ix[mask, cols].values assert_frame_equal(df2,expected)
def test_multi_assign(self): # GH 3626, an assignement of a sub-df to a df df = DataFrame({'FC':['a','b','a','b','a','b'], 'PF':[0,0,0,0,1,1], 'col1':lrange(6), 'col2':lrange(6,12)}) df.ix[1,0]=np.nan df2 = df.copy() mask=~df2.FC.isnull() cols=['col1', 'col2'] dft = df2 * 2 dft.ix[3,3] = np.nan expected = DataFrame({'FC':['a',np.nan,'a','b','a','b'], 'PF':[0,0,0,0,1,1], 'col1':Series([0,1,4,6,8,10]), 'col2':[12,7,16,np.nan,20,22]}) # frame on rhs df2.ix[mask, cols]= dft.ix[mask, cols] assert_frame_equal(df2,expected) df2.ix[mask, cols]= dft.ix[mask, cols] assert_frame_equal(df2,expected) # with an ndarray on rhs df2 = df.copy() df2.ix[mask, cols]= dft.ix[mask, cols].values assert_frame_equal(df2,expected) df2.ix[mask, cols]= dft.ix[mask, cols].values assert_frame_equal(df2,expected)
def test_astype_assignment_with_iloc(self): # GH4312 df_orig = DataFrame([['1','2','3','.4',5,6.,'foo']],columns=list('ABCDEFG')) df = df_orig.copy() df.iloc[:,0:3] = df.iloc[:,0:3].astype(int) result = df.get_dtype_counts().sort_index() expected = Series({ 'int64' : 4, 'float64' : 1, 'object' : 2 }).sort_index() assert_series_equal(result,expected) df = df_orig.copy() df.iloc[:,0:3] = df.iloc[:,0:3].convert_objects(convert_numeric=True) result = df.get_dtype_counts().sort_index() expected = Series({ 'int64' : 4, 'float64' : 1, 'object' : 2 }).sort_index()
def test_filter_and_transform_with_non_unique_timestamp_index(self): # GH4620 t0 = Timestamp('2013-09-30 00:05:00') t1 = Timestamp('2013-10-30 00:05:00') t2 = Timestamp('2013-11-30 00:05:00') index = [t1, t1, t1, t2, t1, t1, t0, t1] df = DataFrame( { 'pid': [1, 1, 1, 2, 2, 3, 3, 3], 'tag': [23, 45, 62, 24, 45, 34, 25, 62] }, index=index) grouped_df = df.groupby('tag') ser = df['pid'] grouped_ser = ser.groupby(df['tag']) expected_indexes = [1, 2, 4, 7] # Filter DataFrame actual = grouped_df.filter(lambda x: len(x) > 1) expected = df.iloc[expected_indexes] assert_frame_equal(actual, expected) actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) expected = df.copy() expected.iloc[[0, 3, 5, 6]] = np.nan assert_frame_equal(actual, expected) # Filter Series actual = grouped_ser.filter(lambda x: len(x) > 1) expected = ser.take(expected_indexes) assert_series_equal(actual, expected) actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) NA = np.nan expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') # ^ made manually because this can get confusing! assert_series_equal(actual, expected) # Transform Series actual = grouped_ser.transform(len) expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') assert_series_equal(actual, expected) # Transform (a column from) DataFrameGroupBy actual = grouped_df.pid.transform(len) assert_series_equal(actual, expected)
def test_filter_and_transform_with_non_unique_timestamp_index(self): # GH4620 t0 = Timestamp('2013-09-30 00:05:00') t1 = Timestamp('2013-10-30 00:05:00') t2 = Timestamp('2013-11-30 00:05:00') index = [t1, t1, t1, t2, t1, t1, t0, t1] df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) grouped_df = df.groupby('tag') ser = df['pid'] grouped_ser = ser.groupby(df['tag']) expected_indexes = [1, 2, 4, 7] # Filter DataFrame actual = grouped_df.filter(lambda x: len(x) > 1) expected = df.iloc[expected_indexes] assert_frame_equal(actual, expected) actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) expected = df.copy() expected.iloc[[0, 3, 5, 6]] = np.nan assert_frame_equal(actual, expected) # Filter Series actual = grouped_ser.filter(lambda x: len(x) > 1) expected = ser.take(expected_indexes) assert_series_equal(actual, expected) actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) NA = np.nan expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') # ^ made manually because this can get confusing! assert_series_equal(actual, expected) # Transform Series actual = grouped_ser.transform(len) expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') assert_series_equal(actual, expected) # Transform (a column from) DataFrameGroupBy actual = grouped_df.pid.transform(len) assert_series_equal(actual, expected)
def test_ix_assign_column_mixed(self): # GH #1142 df = DataFrame(tm.getSeriesData()) df['foo'] = 'bar' orig = df.ix[:, 'B'].copy() df.ix[:, 'B'] = df.ix[:, 'B'] + 1 assert_series_equal(df.B, orig + 1) # GH 3668, mixed frame with series value df = DataFrame({'x':range(10), 'y':range(10,20),'z' : 'bar'}) expected = df.copy() expected.ix[0, 'y'] = 1000 expected.ix[2, 'y'] = 1200 expected.ix[4, 'y'] = 1400 expected.ix[6, 'y'] = 1600 expected.ix[8, 'y'] = 1800 df.ix[df.x % 2 == 0, 'y'] = df.ix[df.x % 2 == 0, 'y'] * 100 assert_frame_equal(df,expected)
def test_ix_assign_column_mixed(self): # GH #1142 df = DataFrame(tm.getSeriesData()) df['foo'] = 'bar' orig = df.ix[:, 'B'].copy() df.ix[:, 'B'] = df.ix[:, 'B'] + 1 assert_series_equal(df.B, orig + 1) # GH 3668, mixed frame with series value df = DataFrame({'x':range(10), 'y':range(10,20),'z' : 'bar'}) expected = df.copy() expected.ix[0, 'y'] = 1000 expected.ix[2, 'y'] = 1200 expected.ix[4, 'y'] = 1400 expected.ix[6, 'y'] = 1600 expected.ix[8, 'y'] = 1800 df.ix[df.x % 2 == 0, 'y'] = df.ix[df.x % 2 == 0, 'y'] * 100 assert_frame_equal(df,expected)
def test_ix_assign_column_mixed(self): # GH #1142 df = DataFrame(tm.getSeriesData()) df['foo'] = 'bar' orig = df.ix[:, 'B'].copy() df.ix[:, 'B'] = df.ix[:, 'B'] + 1 assert_series_equal(df.B, orig + 1) # GH 3668, mixed frame with series value df = DataFrame({'x':lrange(10), 'y':lrange(10,20),'z' : 'bar'}) expected = df.copy() for i in range(5): indexer = i*2 v = 1000 + i*200 expected.ix[indexer, 'y'] = v self.assert_(expected.ix[indexer, 'y'] == v) df.ix[df.x % 2 == 0, 'y'] = df.ix[df.x % 2 == 0, 'y'] * 100 assert_frame_equal(df,expected) # GH 4508, making sure consistency of assignments df = DataFrame({'a':[1,2,3],'b':[0,1,2]}) df.ix[[0,2,],'b'] = [100,-100] expected = DataFrame({'a' : [1,2,3], 'b' : [100,1,-100] }) assert_frame_equal(df,expected) df = pd.DataFrame({'a': lrange(4) }) df['b'] = np.nan df.ix[[1,3],'b'] = [100,-100] expected = DataFrame({'a' : [0,1,2,3], 'b' : [np.nan,100,np.nan,-100] }) assert_frame_equal(df,expected) # ok, but chained assignments are dangerous df = pd.DataFrame({'a': lrange(4) }) df['b'] = np.nan df['b'].ix[[1,3]] = [100,-100] assert_frame_equal(df,expected)
class TestMultiLevel(unittest.TestCase): def setUp(self): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]]) self.frame = DataFrame(np.random.randn(10, 3), index=index, columns=['A', 'B', 'C']) tm.N = 100 self.tdf = tm.makeTimeDataFrame() self.ymd = self.tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum() def test_pickle(self): import cPickle def _test_roundtrip(frame): pickled = cPickle.dumps(frame) unpickled = cPickle.loads(pickled) assert_frame_equal(frame, unpickled) _test_roundtrip(self.frame) _test_roundtrip(self.frame.T) _test_roundtrip(self.ymd) _test_roundtrip(self.ymd.T) def test_reindex(self): reindexed = self.frame.ix[[('foo', 'one'), ('bar', 'one')]] expected = self.frame.ix[[0, 3]] assert_frame_equal(reindexed, expected) def test_repr_to_string(self): repr(self.frame) repr(self.ymd) repr(self.frame.T) repr(self.ymd.T) buf = StringIO() self.frame.toString(buf=buf) self.ymd.toString(buf=buf) self.frame.T.toString(buf=buf) self.ymd.T.toString(buf=buf) def test_getitem_simple(self): df = self.frame.T col = df['foo', 'one'] assert_almost_equal(col.values, df.values[:, 0]) self.assertRaises(KeyError, df.__getitem__, ('foo', 'four')) self.assertRaises(KeyError, df.__getitem__, 'foobar') def test_series_getitem(self): s = self.ymd['A'] result = s[2000, 3] result2 = s.ix[2000, 3] expected = s[42:65] expected.index = expected.index.droplevel(0).droplevel(0) assert_series_equal(result, expected) result = s[2000, 3, 10] expected = s[49] self.assertEquals(result, expected) # fancy result = s.ix[[(2000, 3, 10), (2000, 3, 13)]] expected = s[49:51] assert_series_equal(result, expected) # key error self.assertRaises(KeyError, s.__getitem__, (2000, 3, 4)) def test_series_setitem(self): s = self.ymd['A'] s[2000, 3] = np.nan self.assert_(isnull(s[42:65]).all()) self.assert_(notnull(s[:42]).all()) self.assert_(notnull(s[65:]).all()) s[2000, 3, 10] = np.nan self.assert_(isnull(s[49])) def test_series_slice_partial(self): pass def test_xs(self): xs = self.frame.xs(('bar', 'two')) xs2 = self.frame.ix[('bar', 'two')] assert_series_equal(xs, xs2) assert_almost_equal(xs.values, self.frame.values[4]) def test_xs_partial(self): result = self.frame.xs('foo') result2 = self.frame.ix['foo'] expected = self.frame.T['foo'].T assert_frame_equal(result, expected) assert_frame_equal(result, result2) def test_fancy_2d(self): result = self.frame.ix['foo', 'B'] expected = self.frame.xs('foo')['B'] assert_series_equal(result, expected) ft = self.frame.T result = ft.ix['B', 'foo'] expected = ft.xs('B')['foo'] assert_series_equal(result, expected) def test_getitem_toplevel(self): df = self.frame.T result = df['foo'] expected = df.reindex(columns=df.columns[:3]) expected.columns = expected.columns.droplevel(0) assert_frame_equal(result, expected) result = df['bar'] result2 = df.ix[:, 'bar'] expected = df.reindex(columns=df.columns[3:5]) expected.columns = expected.columns.droplevel(0) assert_frame_equal(result, expected) assert_frame_equal(result, result2) def test_getitem_partial(self): ymd = self.ymd.T result = ymd[2000, 2] expected = ymd.reindex(columns=ymd.columns[ymd.columns.labels[1] == 1]) expected.columns = expected.columns.droplevel(0).droplevel(0) assert_frame_equal(result, expected) def test_setitem_change_dtype(self): dft = self.frame.T s = dft['foo', 'two'] dft['foo', 'two'] = s > s.median() assert_series_equal(dft['foo', 'two'], s > s.median()) self.assert_(isinstance(dft._data.blocks[1].items, MultiIndex)) reindexed = dft.reindex(columns=[('foo', 'two')]) assert_series_equal(reindexed['foo', 'two'], s > s.median()) def test_fancy_slice_partial(self): result = self.frame.ix['bar':'baz'] expected = self.frame[3:7] assert_frame_equal(result, expected) result = self.ymd.ix[(2000,2):(2000,4)] lev = self.ymd.index.labels[1] expected = self.ymd[(lev >= 1) & (lev <= 3)] assert_frame_equal(result, expected) def test_sortlevel(self): df = self.frame.copy() df.index = np.arange(len(df)) self.assertRaises(Exception, df.sortlevel, 0) # axis=1 # series a_sorted = self.frame['A'].sortlevel(0) self.assertRaises(Exception, self.frame.delevel()['A'].sortlevel) def test_sortlevel_mixed(self): sorted_before = self.frame.sortlevel(1) df = self.frame.copy() df['foo'] = 'bar' sorted_after = df.sortlevel(1) assert_frame_equal(sorted_before, sorted_after.drop(['foo'], axis=1)) dft = self.frame.T sorted_before = dft.sortlevel(1, axis=1) dft['foo', 'three'] = 'bar' sorted_after = dft.sortlevel(1, axis=1) assert_frame_equal(sorted_before.drop([('foo', 'three')], axis=1), sorted_after.drop([('foo', 'three')], axis=1)) def test_count_level(self): def _check_counts(frame, axis=0): index = frame._get_axis(axis) for i in range(index.nlevels): result = frame.count(axis=axis, level=i) expected = frame.groupby(axis=axis, level=i).count(axis=axis) _check_counts(self.frame) _check_counts(self.ymd) _check_counts(self.frame.T, axis=1) _check_counts(self.ymd.T, axis=1) # can't call with level on regular DataFrame df = tm.makeTimeDataFrame() self.assertRaises(Exception, df.count, level=0) def test_unstack(self): # just check that it works for now unstacked = self.ymd.unstack() unstacked2 = unstacked.unstack() def test_alignment(self): pass
class TestGroupByAggregate(tm.TestCase): _multiprocess_can_split_ = True def setUp(self): self.ts = tm.makeTimeSeries() self.seriesd = tm.getSeriesData() self.tsd = tm.getTimeSeriesData() self.frame = DataFrame(self.seriesd) self.tsframe = DataFrame(self.tsd) self.df = DataFrame({ 'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C': np.random.randn(8), 'D': np.random.randn(8) }) self.df_mixed_floats = DataFrame({ 'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C': np.random.randn(8), 'D': np.array(np.random.randn(8), dtype='float32') }) index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) self.mframe = DataFrame(np.random.randn(10, 3), index=index, columns=['A', 'B', 'C']) self.three_group = DataFrame({ 'A': [ 'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', 'foo', 'foo', 'foo' ], 'B': [ 'one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', 'two', 'two', 'one' ], 'C': [ 'dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', 'dull', 'shiny', 'shiny', 'shiny' ], 'D': np.random.randn(11), 'E': np.random.randn(11), 'F': np.random.randn(11) }) def test_agg_api(self): # GH 6337 # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error # different api for agg when passed custom function with mixed frame df = DataFrame({ 'data1': np.random.randn(5), 'data2': np.random.randn(5), 'key1': ['a', 'a', 'b', 'b', 'a'], 'key2': ['one', 'two', 'one', 'two', 'one'] }) grouped = df.groupby('key1') def peak_to_peak(arr): return arr.max() - arr.min() expected = grouped.agg([peak_to_peak]) expected.columns = ['data1', 'data2'] result = grouped.agg(peak_to_peak) assert_frame_equal(result, expected) def test_agg_regression1(self): grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month]) result = grouped.agg(np.mean) expected = grouped.mean() assert_frame_equal(result, expected) def test_agg_datetimes_mixed(self): data = [[1, '2012-01-01', 1.0], [2, '2012-01-02', 2.0], [3, None, 3.0]] df1 = DataFrame({ 'key': [x[0] for x in data], 'date': [x[1] for x in data], 'value': [x[2] for x in data] }) data = [[ row[0], datetime.strptime(row[1], '%Y-%m-%d').date() if row[1] else None, row[2] ] for row in data] df2 = DataFrame({ 'key': [x[0] for x in data], 'date': [x[1] for x in data], 'value': [x[2] for x in data] }) df1['weights'] = df1['value'] / df1['value'].sum() gb1 = df1.groupby('date').aggregate(np.sum) df2['weights'] = df1['value'] / df1['value'].sum() gb2 = df2.groupby('date').aggregate(np.sum) assert (len(gb1) == len(gb2)) def test_agg_period_index(self): from pandas import period_range, PeriodIndex prng = period_range('2012-1-1', freq='M', periods=3) df = DataFrame(np.random.randn(3, 2), index=prng) rs = df.groupby(level=0).sum() tm.assertIsInstance(rs.index, PeriodIndex) # GH 3579 index = period_range(start='1999-01', periods=5, freq='M') s1 = Series(np.random.rand(len(index)), index=index) s2 = Series(np.random.rand(len(index)), index=index) series = [('s1', s1), ('s2', s2)] df = DataFrame.from_items(series) grouped = df.groupby(df.index.month) list(grouped) def test_agg_dict_parameter_cast_result_dtypes(self): # GH 12821 df = DataFrame({ 'class': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'], 'time': date_range('1/1/2011', periods=8, freq='H') }) df.loc[[0, 1, 2, 5], 'time'] = None # test for `first` function exp = df.loc[[0, 3, 4, 6]].set_index('class') grouped = df.groupby('class') assert_frame_equal(grouped.first(), exp) assert_frame_equal(grouped.agg('first'), exp) assert_frame_equal(grouped.agg({'time': 'first'}), exp) assert_series_equal(grouped.time.first(), exp['time']) assert_series_equal(grouped.time.agg('first'), exp['time']) # test for `last` function exp = df.loc[[0, 3, 4, 7]].set_index('class') grouped = df.groupby('class') assert_frame_equal(grouped.last(), exp) assert_frame_equal(grouped.agg('last'), exp) assert_frame_equal(grouped.agg({'time': 'last'}), exp) assert_series_equal(grouped.time.last(), exp['time']) assert_series_equal(grouped.time.agg('last'), exp['time']) def test_agg_must_agg(self): grouped = self.df.groupby('A')['C'] self.assertRaises(Exception, grouped.agg, lambda x: x.describe()) self.assertRaises(Exception, grouped.agg, lambda x: x.index[:2]) def test_agg_ser_multi_key(self): # TODO(wesm): unused ser = self.df.C # noqa f = lambda x: x.sum() results = self.df.C.groupby([self.df.A, self.df.B]).aggregate(f) expected = self.df.groupby(['A', 'B']).sum()['C'] assert_series_equal(results, expected) def test_agg_apply_corner(self): # nothing to group, all NA grouped = self.ts.groupby(self.ts * np.nan) self.assertEqual(self.ts.dtype, np.float64) # groupby float64 values results in Float64Index exp = Series([], dtype=np.float64, index=pd.Index([], dtype=np.float64)) assert_series_equal(grouped.sum(), exp) assert_series_equal(grouped.agg(np.sum), exp) assert_series_equal(grouped.apply(np.sum), exp, check_index_type=False) # DataFrame grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan) exp_df = DataFrame(columns=self.tsframe.columns, dtype=float, index=pd.Index([], dtype=np.float64)) assert_frame_equal(grouped.sum(), exp_df, check_names=False) assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False) assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0], check_names=False) def test_agg_grouping_is_list_tuple(self): from pandas.core.groupby import Grouping df = tm.makeTimeDataFrame() grouped = df.groupby(lambda x: x.year) grouper = grouped.grouper.groupings[0].grouper grouped.grouper.groupings[0] = Grouping(self.ts.index, list(grouper)) result = grouped.agg(np.mean) expected = grouped.mean() tm.assert_frame_equal(result, expected) grouped.grouper.groupings[0] = Grouping(self.ts.index, tuple(grouper)) result = grouped.agg(np.mean) expected = grouped.mean() tm.assert_frame_equal(result, expected) def test_aggregate_api_consistency(self): # GH 9052 # make sure that the aggregates via dict # are consistent df = DataFrame({ 'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'two', 'two', 'two', 'one', 'two'], 'C': np.random.randn(8) + 1.0, 'D': np.arange(8) }) grouped = df.groupby(['A', 'B']) c_mean = grouped['C'].mean() c_sum = grouped['C'].sum() d_mean = grouped['D'].mean() d_sum = grouped['D'].sum() result = grouped['D'].agg(['sum', 'mean']) expected = pd.concat([d_sum, d_mean], axis=1) expected.columns = ['sum', 'mean'] assert_frame_equal(result, expected, check_like=True) result = grouped.agg([np.sum, np.mean]) expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1) expected.columns = MultiIndex.from_product([['C', 'D'], ['sum', 'mean']]) assert_frame_equal(result, expected, check_like=True) result = grouped[['D', 'C']].agg([np.sum, np.mean]) expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1) expected.columns = MultiIndex.from_product([['D', 'C'], ['sum', 'mean']]) assert_frame_equal(result, expected, check_like=True) result = grouped.agg({'C': 'mean', 'D': 'sum'}) expected = pd.concat([d_sum, c_mean], axis=1) assert_frame_equal(result, expected, check_like=True) result = grouped.agg({'C': ['mean', 'sum'], 'D': ['mean', 'sum']}) expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1) expected.columns = MultiIndex.from_product([['C', 'D'], ['mean', 'sum']]) result = grouped[['D', 'C']].agg({'r': np.sum, 'r2': np.mean}) expected = pd.concat([d_sum, c_sum, d_mean, c_mean], axis=1) expected.columns = MultiIndex.from_product([['r', 'r2'], ['D', 'C']]) assert_frame_equal(result, expected, check_like=True) def test_agg_compat(self): # GH 12334 df = DataFrame({ 'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'two', 'two', 'two', 'one', 'two'], 'C': np.random.randn(8) + 1.0, 'D': np.arange(8) }) g = df.groupby(['A', 'B']) expected = pd.concat([g['D'].sum(), g['D'].std()], axis=1) expected.columns = MultiIndex.from_tuples([('C', 'sum'), ('C', 'std')]) result = g['D'].agg({'C': ['sum', 'std']}) assert_frame_equal(result, expected, check_like=True) expected = pd.concat([g['D'].sum(), g['D'].std()], axis=1) expected.columns = ['C', 'D'] result = g['D'].agg({'C': 'sum', 'D': 'std'}) assert_frame_equal(result, expected, check_like=True) def test_agg_nested_dicts(self): # API change for disallowing these types of nested dicts df = DataFrame({ 'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'two', 'two', 'two', 'one', 'two'], 'C': np.random.randn(8) + 1.0, 'D': np.arange(8) }) g = df.groupby(['A', 'B']) def f(): g.aggregate({ 'r1': { 'C': ['mean', 'sum'] }, 'r2': { 'D': ['mean', 'sum'] } }) self.assertRaises(SpecificationError, f) result = g.agg({ 'C': { 'ra': ['mean', 'std'] }, 'D': { 'rb': ['mean', 'std'] } }) expected = pd.concat( [g['C'].mean(), g['C'].std(), g['D'].mean(), g['D'].std()], axis=1) expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), ('ra', 'std'), ('rb', 'mean'), ('rb', 'std')]) assert_frame_equal(result, expected, check_like=True) # same name as the original column # GH9052 expected = g['D'].agg({'result1': np.sum, 'result2': np.mean}) expected = expected.rename(columns={'result1': 'D'}) result = g['D'].agg({'D': np.sum, 'result2': np.mean}) assert_frame_equal(result, expected, check_like=True) def test_agg_python_multiindex(self): grouped = self.mframe.groupby(['A', 'B']) result = grouped.agg(np.mean) expected = grouped.mean() tm.assert_frame_equal(result, expected) def test_aggregate_str_func(self): def _check_results(grouped): # single series result = grouped['A'].agg('std') expected = grouped['A'].std() assert_series_equal(result, expected) # group frame by function name result = grouped.aggregate('var') expected = grouped.var() assert_frame_equal(result, expected) # group frame by function dict result = grouped.agg( OrderedDict([['A', 'var'], ['B', 'std'], ['C', 'mean'], ['D', 'sem']])) expected = DataFrame( OrderedDict([['A', grouped['A'].var()], ['B', grouped['B'].std()], ['C', grouped['C'].mean()], ['D', grouped['D'].sem()]])) assert_frame_equal(result, expected) by_weekday = self.tsframe.groupby(lambda x: x.weekday()) _check_results(by_weekday) by_mwkday = self.tsframe.groupby( [lambda x: x.month, lambda x: x.weekday()]) _check_results(by_mwkday) def test_aggregate_item_by_item(self): df = self.df.copy() df['E'] = ['a'] * len(self.df) grouped = self.df.groupby('A') # API change in 0.11 # def aggfun(ser): # return len(ser + 'a') # result = grouped.agg(aggfun) # self.assertEqual(len(result.columns), 1) aggfun = lambda ser: ser.size result = grouped.agg(aggfun) foo = (self.df.A == 'foo').sum() bar = (self.df.A == 'bar').sum() K = len(result.columns) # GH5782 # odd comparisons can result here, so cast to make easy exp = pd.Series(np.array([foo] * K), index=list('BCD'), dtype=np.float64, name='foo') tm.assert_series_equal(result.xs('foo'), exp) exp = pd.Series(np.array([bar] * K), index=list('BCD'), dtype=np.float64, name='bar') tm.assert_almost_equal(result.xs('bar'), exp) def aggfun(ser): return ser.size result = DataFrame().groupby(self.df.A).agg(aggfun) tm.assertIsInstance(result, DataFrame) self.assertEqual(len(result), 0) def test_agg_item_by_item_raise_typeerror(self): from numpy.random import randint df = DataFrame(randint(10, size=(20, 10))) def raiseException(df): pprint_thing('----------------------------------------') pprint_thing(df.to_string()) raise TypeError self.assertRaises(TypeError, df.groupby(0).agg, raiseException) def test_series_agg_multikey(self): ts = tm.makeTimeSeries() grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) result = grouped.agg(np.sum) expected = grouped.sum() assert_series_equal(result, expected) def test_series_agg_multi_pure_python(self): data = DataFrame({ 'A': [ 'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', 'foo', 'foo', 'foo' ], 'B': [ 'one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', 'two', 'two', 'one' ], 'C': [ 'dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', 'dull', 'shiny', 'shiny', 'shiny' ], 'D': np.random.randn(11), 'E': np.random.randn(11), 'F': np.random.randn(11) }) def bad(x): assert (len(x.base) > 0) return 'foo' result = data.groupby(['A', 'B']).agg(bad) expected = data.groupby(['A', 'B']).agg(lambda x: 'foo') assert_frame_equal(result, expected)
_frame = DataFrame(randn(10000, 4), columns=list('ABCD'), dtype='float64') _frame2 = DataFrame(randn(100, 4), columns=list('ABCD'), dtype='float64') _mixed = DataFrame({'A': _frame['A'].copy(), 'B': _frame['B'].astype('float32'), 'C': _frame['C'].astype('int64'), 'D': _frame['D'].astype('int32')}) _mixed2 = DataFrame({'A': _frame2['A'].copy(), 'B': _frame2['B'].astype('float32'), 'C': _frame2['C'].astype('int64'), 'D': _frame2['D'].astype('int32')}) _integer = DataFrame( np.random.randint(1, 100, size=(10001, 4)), columns=list('ABCD'), dtype='int64') _integer2 = DataFrame(np.random.randint(1, 100, size=(101, 4)), columns=list('ABCD'), dtype='int64') _frame_panel = Panel(dict(ItemA=_frame.copy(), ItemB=( _frame.copy() + 3), ItemC=_frame.copy(), ItemD=_frame.copy())) _frame2_panel = Panel(dict(ItemA=_frame2.copy(), ItemB=(_frame2.copy() + 3), ItemC=_frame2.copy(), ItemD=_frame2.copy())) _integer_panel = Panel(dict(ItemA=_integer, ItemB=(_integer + 34).astype( 'int64'))) _integer2_panel = Panel(dict(ItemA=_integer2, ItemB=(_integer2 + 34).astype( 'int64'))) _mixed_panel = Panel(dict(ItemA=_mixed, ItemB=(_mixed + 3))) _mixed2_panel = Panel(dict(ItemA=_mixed2, ItemB=(_mixed2 + 3))) class TestExpressions(tm.TestCase): _multiprocess_can_split_ = False
class PandasGroupby(SparklingPandasTestCase): def setUp(self): """ Setup the dataframes used for the groupby tests derived from pandas """ self.dateRange = bdate_range('1/1/2005', periods=250) self.stringIndex = Index([rands(8).upper() for x in range(250)]) self.groupId = Series([x[0] for x in self.stringIndex], index=self.stringIndex) self.groupDict = dict( (k, v) for k, v in compat.iteritems(self.groupId)) self.columnIndex = Index(['A', 'B', 'C', 'D', 'E']) randMat = np.random.randn(250, 5) self.stringMatrix = DataFrame(randMat, columns=self.columnIndex, index=self.stringIndex) self.timeMatrix = DataFrame(randMat, columns=self.columnIndex, index=self.dateRange) self.ts = tm.makeTimeSeries() self.seriesd = tm.getSeriesData() self.tsd = tm.getTimeSeriesData() self.frame = DataFrame(self.seriesd) self.tsframe = DataFrame(self.tsd) self.df = DataFrame({ 'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C': np.random.randn(8), 'D': np.random.randn(8) }) self.df_mixed_floats = DataFrame({ 'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C': np.random.randn(8), 'D': np.array(np.random.randn(8), dtype='float32') }) index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) self.mframe = DataFrame(np.random.randn(10, 3), index=index, columns=['A', 'B', 'C']) self.three_group = DataFrame({ 'A': [ 'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', 'foo', 'foo', 'foo' ], 'B': [ 'one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', 'two', 'two', 'one' ], 'C': [ 'dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', 'dull', 'shiny', 'shiny', 'shiny' ], 'D': np.random.randn(11), 'E': np.random.randn(11), 'F': np.random.randn(11) }) super(self.__class__, self).setUp() def test_first_last_nth(self): # tests for first / last / nth ddf = self.psc.from_data_frame(self.df) assert_frame_equal(ddf.collect(), self.df) grouped = self.psc.from_data_frame(self.df).groupby('A') first = grouped.first().collect() expected = self.df.ix[[1, 0], ['B', 'C', 'D']] expected.index = Index(['bar', 'foo'], name='A') expected = expected.sort_index() assert_frame_equal(first, expected) nth = grouped.nth(0).collect() assert_frame_equal(nth, expected) last = grouped.last().collect() expected = self.df.ix[[5, 7], ['B', 'C', 'D']] expected.index = Index(['bar', 'foo'], name='A') assert_frame_equal(last, expected) nth = grouped.nth(-1).collect() assert_frame_equal(nth, expected) nth = grouped.nth(1).collect() expected = self.df.ix[[2, 3], ['B', 'C', 'D']].copy() expected.index = Index(['foo', 'bar'], name='A') expected = expected.sort_index() assert_frame_equal(nth, expected) @unittest2.expectedFailure def test_getitem(self): # it works! grouped['B'].first() grouped['B'].last() grouped['B'].nth(0) self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan self.assertTrue(com.isnull(grouped['B'].first()['foo'])) self.assertTrue(com.isnull(grouped['B'].last()['foo'])) # not sure what this is testing self.assertTrue(com.isnull(grouped['B'].nth(0)[0])) @unittest2.expectedFailure def test_new_in0140(self): """ Test new functionality in 0.14.0. This currently doesn't work. """ # v0.14.0 whatsnew df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) ddf = self.psc.from_data_frame(df) g = ddf.groupby('A') result = g.first().collect() expected = df.iloc[[1, 2]].set_index('A') assert_frame_equal(result, expected) expected = df.iloc[[1, 2]].set_index('A') result = g.nth(0, dropna='any').collect() assert_frame_equal(result, expected) @unittest2.expectedFailure def test_first_last_nth_dtypes(self): """ We do groupby fine on mixed types, but our copy from local dataframe ends up re-running the guess type function, so the dtypes don't match. Issue #25 """ df = self.df_mixed_floats.copy() df['E'] = True df['F'] = 1 # tests for first / last / nth grouped = ddf.groupby('A') first = grouped.first().collect() expected = df.ix[[1, 0], ['B', 'C', 'D', 'E', 'F']] expected.index = Index(['bar', 'foo'], name='A') expected = expected.sort_index() assert_frame_equal(first, expected) last = grouped.last().collect() expected = df.ix[[5, 7], ['B', 'C', 'D', 'E', 'F']] expected.index = Index(['bar', 'foo'], name='A') expected = expected.sort_index() assert_frame_equal(last, expected) nth = grouped.nth(1).collect() expected = df.ix[[3, 2], ['B', 'C', 'D', 'E', 'F']] expected.index = Index(['bar', 'foo'], name='A') expected = expected.sort_index() assert_frame_equal(nth, expected) def test_var_on_multiplegroups(self): df = DataFrame({ 'data1': np.random.randn(5), 'data2': np.random.randn(5), 'data3': np.random.randn(5), 'key1': ['a', 'a', 'b', 'b', 'a'], 'key2': ['one', 'two', 'one', 'two', 'one'] }) ddf = self.psc.from_data_frame(df) dgrouped = ddf.groupby(['key1', 'key2']) grouped = df.groupby(['key1', 'key2']) assert_frame_equal(dgrouped.var().collect(), grouped.var()) def test_agg_api(self): # Note: needs a very recent version of pandas to pass # TODO(holden): Pass this test if local fails # GH 6337 # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error # different api for agg when passed custom function with mixed frame df = DataFrame({ 'data1': np.random.randn(5), 'data2': np.random.randn(5), 'key1': ['a', 'a', 'b', 'b', 'a'], 'key2': ['one', 'two', 'one', 'two', 'one'] }) ddf = self.psc.from_data_frame(df) dgrouped = ddf.groupby('key1') grouped = df.groupby('key1') def peak_to_peak(arr): return arr.max() - arr.min() expected = grouped.agg([peak_to_peak]) expected.columns = ['data1', 'data2'] result = dgrouped.agg(peak_to_peak).collect() assert_frame_equal(result, expected) def test_agg_regression1(self): grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month]) dgrouped = self.psc.from_data_frame(self.tsframe).groupby( [lambda x: x.year, lambda x: x.month]) result = dgrouped.agg(np.mean).collect() expected = grouped.agg(np.mean) assert_frame_equal(result, expected)
class TestMoments(unittest.TestCase): _nan_locs = np.arange(20, 40) _inf_locs = np.array([]) def setUp(self): arr = randn(N) arr[self._nan_locs] = np.NaN self.arr = arr self.rng = DateRange(datetime(2009, 1, 1), periods=N) self.series = Series(arr.copy(), index=self.rng) self.frame = DataFrame(randn(N, K), index=self.rng, columns=np.arange(K)) def test_rolling_sum(self): self._check_moment_func(mom.rolling_sum, np.sum) def test_rolling_count(self): counter = lambda x: np.isfinite(x).astype(float).sum() self._check_moment_func(mom.rolling_count, counter, has_min_periods=False, preserve_nan=False) def test_rolling_mean(self): self._check_moment_func(mom.rolling_mean, np.mean) def test_rolling_median(self): self._check_moment_func(mom.rolling_median, np.median) def test_rolling_min(self): self._check_moment_func(mom.rolling_min, np.min) def test_rolling_max(self): self._check_moment_func(mom.rolling_max, np.max) def test_rolling_quantile(self): qs = [.1, .5, .9] def scoreatpercentile(a, per): values = np.sort(a,axis=0) idx = per /1. * (values.shape[0] - 1) return values[int(idx)] for q in qs: def f(x, window, min_periods=None, time_rule=None): return mom.rolling_quantile(x, window, q, min_periods=min_periods, time_rule=time_rule) def alt(x): return scoreatpercentile(x, q) self._check_moment_func(f, alt) def test_rolling_apply(self): def roll_mean(x, window, min_periods=None, time_rule=None): return mom.rolling_apply(x, window, lambda x: x[np.isfinite(x)].mean(), min_periods=min_periods, time_rule=time_rule) self._check_moment_func(roll_mean, np.mean) def test_rolling_std(self): self._check_moment_func(mom.rolling_std, lambda x: np.std(x, ddof=1)) def test_rolling_var(self): self._check_moment_func(mom.rolling_var, lambda x: np.var(x, ddof=1)) def test_rolling_skew(self): try: from scipy.stats import skew except ImportError: raise nose.SkipTest('no scipy') self._check_moment_func(mom.rolling_skew, lambda x: skew(x, bias=False)) def test_rolling_kurt(self): try: from scipy.stats import kurtosis except ImportError: raise nose.SkipTest('no scipy') self._check_moment_func(mom.rolling_kurt, lambda x: kurtosis(x, bias=False)) def _check_moment_func(self, func, static_comp, window=50, has_min_periods=True, has_time_rule=True, preserve_nan=True): self._check_ndarray(func, static_comp, window=window, has_min_periods=has_min_periods, preserve_nan=preserve_nan) self._check_structures(func, static_comp, has_min_periods=has_min_periods, has_time_rule=has_time_rule) def _check_ndarray(self, func, static_comp, window=50, has_min_periods=True, preserve_nan=True): result = func(self.arr, window) assert_almost_equal(result[-1], static_comp(self.arr[-50:])) if preserve_nan: assert(np.isnan(result[self._nan_locs]).all()) # excluding NaNs correctly arr = randn(50) arr[:10] = np.NaN arr[-10:] = np.NaN if has_min_periods: result = func(arr, 50, min_periods=30) assert_almost_equal(result[-1], static_comp(arr[10:-10])) # min_periods is working correctly result = func(arr, 20, min_periods=15) self.assert_(np.isnan(result[23])) self.assert_(not np.isnan(result[24])) self.assert_(not np.isnan(result[-6])) self.assert_(np.isnan(result[-5])) # min_periods=0 result0 = func(arr, 20, min_periods=0) result1 = func(arr, 20, min_periods=1) assert_almost_equal(result0, result1) else: result = func(arr, 50) assert_almost_equal(result[-1], static_comp(arr[10:-10])) def _check_structures(self, func, static_comp, has_min_periods=True, has_time_rule=True): series_result = func(self.series, 50) self.assert_(isinstance(series_result, Series)) frame_result = func(self.frame, 50) self.assertEquals(type(frame_result), DataFrame) # check time_rule works if has_time_rule: win = 25 minp = 10 if has_min_periods: series_result = func(self.series[::2], win, min_periods=minp, time_rule='WEEKDAY') frame_result = func(self.frame[::2], win, min_periods=minp, time_rule='WEEKDAY') else: series_result = func(self.series[::2], win, time_rule='WEEKDAY') frame_result = func(self.frame[::2], win, time_rule='WEEKDAY') last_date = series_result.index[-1] prev_date = last_date - 24 * datetools.bday trunc_series = self.series[::2].truncate(prev_date, last_date) trunc_frame = self.frame[::2].truncate(prev_date, last_date) assert_almost_equal(series_result[-1], static_comp(trunc_series)) assert_almost_equal(frame_result.xs(last_date), trunc_frame.apply(static_comp)) def test_ewma(self): self._check_ew(mom.ewma) def test_ewmvar(self): self._check_ew(mom.ewmvar) def test_ewmvol(self): self._check_ew(mom.ewmvol) def test_ewma_span_com_args(self): A = mom.ewma(self.arr, com=9.5) B = mom.ewma(self.arr, span=20) assert_almost_equal(A, B) self.assertRaises(Exception, mom.ewma, self.arr, com=9.5, span=20) self.assertRaises(Exception, mom.ewma, self.arr) def _check_ew(self, func): self._check_ew_ndarray(func) self._check_ew_structures(func) def _check_ew_ndarray(self, func, preserve_nan=False): result = func(self.arr, com=10) if preserve_nan: assert(np.isnan(result[self._nan_locs]).all()) # excluding NaNs correctly arr = randn(50) arr[:10] = np.NaN arr[-10:] = np.NaN # ??? check something # pass in ints result2 = func(np.arange(50), span=10) self.assert_(result.dtype == np.float_) def _check_ew_structures(self, func): series_result = func(self.series, com=10) self.assert_(isinstance(series_result, Series)) frame_result = func(self.frame, com=10) self.assertEquals(type(frame_result), DataFrame) # binary moments def test_rolling_cov(self): A = self.series B = A + randn(len(A)) result = mom.rolling_cov(A, B, 50, min_periods=25) assert_almost_equal(result[-1], np.cov(A[-50:], B[-50:])[0, 1]) def test_rolling_corr(self): A = self.series B = A + randn(len(A)) result = mom.rolling_corr(A, B, 50, min_periods=25) assert_almost_equal(result[-1], np.corrcoef(A[-50:], B[-50:])[0, 1]) # test for correct bias correction a = tm.makeTimeSeries() b = tm.makeTimeSeries() a[:5] = np.nan b[:10] = np.nan result = mom.rolling_corr(a, b, len(a), min_periods=1) assert_almost_equal(result[-1], a.corr(b)) def test_rolling_corr_pairwise(self): panel = mom.rolling_corr_pairwise(self.frame, 10, min_periods=5) correl = panel.ix[:, 1, 5] exp = mom.rolling_corr(self.frame[1], self.frame[5], 10, min_periods=5) tm.assert_series_equal(correl, exp) def test_flex_binary_frame(self): def _check(method): series = self.frame[1] res = method(series, self.frame, 10) res2 = method(self.frame, series, 10) exp = self.frame.apply(lambda x: method(series, x, 10)) tm.assert_frame_equal(res, exp) tm.assert_frame_equal(res2, exp) frame2 = self.frame.copy() frame2.values[:] = np.random.randn(*frame2.shape) res3 = method(self.frame, frame2, 10) exp = DataFrame(dict((k, method(self.frame[k], frame2[k], 10)) for k in self.frame)) tm.assert_frame_equal(res3, exp) methods = [mom.rolling_corr, mom.rolling_cov] for meth in methods: _check(meth) def test_ewmcov(self): self._check_binary_ew(mom.ewmcov) def test_ewmcorr(self): self._check_binary_ew(mom.ewmcorr) def _check_binary_ew(self, func): A = Series(randn(50), index=np.arange(50)) B = A[2:] + randn(48) A[:10] = np.NaN B[-10:] = np.NaN result = func(A, B, 20, min_periods=5) self.assert_(np.isnan(result[:15]).all()) self.assert_(not np.isnan(result[15:]).any()) self.assertRaises(Exception, func, A, randn(50), 20, min_periods=5)
def test_partial_setting(self): # GH2578, allow ix and friends to partially set ### series ### s_orig = Series([1,2,3]) s = s_orig.copy() s[5] = 5 expected = Series([1,2,3,5],index=[0,1,2,5]) assert_series_equal(s,expected) s = s_orig.copy() s.loc[5] = 5 expected = Series([1,2,3,5],index=[0,1,2,5]) assert_series_equal(s,expected) s = s_orig.copy() s[5] = 5. expected = Series([1,2,3,5.],index=[0,1,2,5]) assert_series_equal(s,expected) s = s_orig.copy() s.loc[5] = 5. expected = Series([1,2,3,5.],index=[0,1,2,5]) assert_series_equal(s,expected) # iloc/iat raise s = s_orig.copy() def f(): s.iloc[3] = 5. self.assertRaises(IndexError, f) def f(): s.iat[3] = 5. self.assertRaises(IndexError, f) ### frame ### df_orig = DataFrame(np.arange(6).reshape(3,2),columns=['A','B']) # iloc/iat raise df = df_orig.copy() def f(): df.iloc[4,2] = 5. self.assertRaises(IndexError, f) def f(): df.iat[4,2] = 5. self.assertRaises(IndexError, f) # row setting where it exists expected = DataFrame(dict({ 'A' : [0,4,4], 'B' : [1,5,5] })) df = df_orig.copy() df.iloc[1] = df.iloc[2] assert_frame_equal(df,expected) expected = DataFrame(dict({ 'A' : [0,4,4], 'B' : [1,5,5] })) df = df_orig.copy() df.loc[1] = df.loc[2] assert_frame_equal(df,expected) expected = DataFrame(dict({ 'A' : [0,2,4,4], 'B' : [1,3,5,5] }),dtype='float64') df = df_orig.copy() df.loc[3] = df.loc[2] assert_frame_equal(df,expected) # single dtype frame, overwrite expected = DataFrame(dict({ 'A' : [0,2,4], 'B' : [0,2,4] })) df = df_orig.copy() df.ix[:,'B'] = df.ix[:,'A'] assert_frame_equal(df,expected) # mixed dtype frame, overwrite expected = DataFrame(dict({ 'A' : [0,2,4], 'B' : Series([0.,2.,4.]) })) df = df_orig.copy() df['B'] = df['B'].astype(np.float64) df.ix[:,'B'] = df.ix[:,'A'] assert_frame_equal(df,expected) # single dtype frame, partial setting expected = df_orig.copy() expected['C'] = df['A'].astype(np.float64) df = df_orig.copy() df.ix[:,'C'] = df.ix[:,'A'] assert_frame_equal(df,expected) # mixed frame, partial setting expected = df_orig.copy() expected['C'] = df['A'].astype(np.float64) df = df_orig.copy() df.ix[:,'C'] = df.ix[:,'A'] assert_frame_equal(df,expected) ### panel ### p_orig = Panel(np.arange(16).reshape(2,4,2),items=['Item1','Item2'],major_axis=pd.date_range('2001/1/12',periods=4),minor_axis=['A','B'],dtype='float64') # panel setting via item p_orig = Panel(np.arange(16).reshape(2,4,2),items=['Item1','Item2'],major_axis=pd.date_range('2001/1/12',periods=4),minor_axis=['A','B'],dtype='float64') expected = p_orig.copy() expected['Item3'] = expected['Item1'] p = p_orig.copy() p.loc['Item3'] = p['Item1'] assert_panel_equal(p,expected) # panel with aligned series expected = p_orig.copy() expected = expected.transpose(2,1,0) expected['C'] = DataFrame({ 'Item1' : [30,30,30,30], 'Item2' : [32,32,32,32] },index=p_orig.major_axis) expected = expected.transpose(2,1,0) p = p_orig.copy() p.loc[:,:,'C'] = Series([30,32],index=p_orig.items) assert_panel_equal(p,expected)
class TestGroupByAggregate(tm.TestCase): _multiprocess_can_split_ = True def setUp(self): self.ts = tm.makeTimeSeries() self.seriesd = tm.getSeriesData() self.tsd = tm.getTimeSeriesData() self.frame = DataFrame(self.seriesd) self.tsframe = DataFrame(self.tsd) self.df = DataFrame( {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C': np.random.randn(8), 'D': np.random.randn(8)}) self.df_mixed_floats = DataFrame( {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C': np.random.randn(8), 'D': np.array( np.random.randn(8), dtype='float32')}) index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) self.mframe = DataFrame(np.random.randn(10, 3), index=index, columns=['A', 'B', 'C']) self.three_group = DataFrame( {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', 'foo', 'foo', 'foo'], 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', 'two', 'two', 'one'], 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', 'dull', 'shiny', 'shiny', 'shiny'], 'D': np.random.randn(11), 'E': np.random.randn(11), 'F': np.random.randn(11)}) def test_agg_api(self): # GH 6337 # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error # different api for agg when passed custom function with mixed frame df = DataFrame({'data1': np.random.randn(5), 'data2': np.random.randn(5), 'key1': ['a', 'a', 'b', 'b', 'a'], 'key2': ['one', 'two', 'one', 'two', 'one']}) grouped = df.groupby('key1') def peak_to_peak(arr): return arr.max() - arr.min() expected = grouped.agg([peak_to_peak]) expected.columns = ['data1', 'data2'] result = grouped.agg(peak_to_peak) assert_frame_equal(result, expected) def test_agg_regression1(self): grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month]) result = grouped.agg(np.mean) expected = grouped.mean() assert_frame_equal(result, expected) def test_agg_datetimes_mixed(self): data = [[1, '2012-01-01', 1.0], [2, '2012-01-02', 2.0], [3, None, 3.0]] df1 = DataFrame({'key': [x[0] for x in data], 'date': [x[1] for x in data], 'value': [x[2] for x in data]}) data = [[row[0], datetime.strptime(row[1], '%Y-%m-%d').date() if row[1] else None, row[2]] for row in data] df2 = DataFrame({'key': [x[0] for x in data], 'date': [x[1] for x in data], 'value': [x[2] for x in data]}) df1['weights'] = df1['value'] / df1['value'].sum() gb1 = df1.groupby('date').aggregate(np.sum) df2['weights'] = df1['value'] / df1['value'].sum() gb2 = df2.groupby('date').aggregate(np.sum) assert (len(gb1) == len(gb2)) def test_agg_period_index(self): from pandas import period_range, PeriodIndex prng = period_range('2012-1-1', freq='M', periods=3) df = DataFrame(np.random.randn(3, 2), index=prng) rs = df.groupby(level=0).sum() tm.assertIsInstance(rs.index, PeriodIndex) # GH 3579 index = period_range(start='1999-01', periods=5, freq='M') s1 = Series(np.random.rand(len(index)), index=index) s2 = Series(np.random.rand(len(index)), index=index) series = [('s1', s1), ('s2', s2)] df = DataFrame.from_items(series) grouped = df.groupby(df.index.month) list(grouped) def test_agg_dict_parameter_cast_result_dtypes(self): # GH 12821 df = DataFrame( {'class': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'], 'time': date_range('1/1/2011', periods=8, freq='H')}) df.loc[[0, 1, 2, 5], 'time'] = None # test for `first` function exp = df.loc[[0, 3, 4, 6]].set_index('class') grouped = df.groupby('class') assert_frame_equal(grouped.first(), exp) assert_frame_equal(grouped.agg('first'), exp) assert_frame_equal(grouped.agg({'time': 'first'}), exp) assert_series_equal(grouped.time.first(), exp['time']) assert_series_equal(grouped.time.agg('first'), exp['time']) # test for `last` function exp = df.loc[[0, 3, 4, 7]].set_index('class') grouped = df.groupby('class') assert_frame_equal(grouped.last(), exp) assert_frame_equal(grouped.agg('last'), exp) assert_frame_equal(grouped.agg({'time': 'last'}), exp) assert_series_equal(grouped.time.last(), exp['time']) assert_series_equal(grouped.time.agg('last'), exp['time']) def test_agg_must_agg(self): grouped = self.df.groupby('A')['C'] self.assertRaises(Exception, grouped.agg, lambda x: x.describe()) self.assertRaises(Exception, grouped.agg, lambda x: x.index[:2]) def test_agg_ser_multi_key(self): # TODO(wesm): unused ser = self.df.C # noqa f = lambda x: x.sum() results = self.df.C.groupby([self.df.A, self.df.B]).aggregate(f) expected = self.df.groupby(['A', 'B']).sum()['C'] assert_series_equal(results, expected) def test_agg_apply_corner(self): # nothing to group, all NA grouped = self.ts.groupby(self.ts * np.nan) self.assertEqual(self.ts.dtype, np.float64) # groupby float64 values results in Float64Index exp = Series([], dtype=np.float64, index=pd.Index( [], dtype=np.float64)) assert_series_equal(grouped.sum(), exp) assert_series_equal(grouped.agg(np.sum), exp) assert_series_equal(grouped.apply(np.sum), exp, check_index_type=False) # DataFrame grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan) exp_df = DataFrame(columns=self.tsframe.columns, dtype=float, index=pd.Index([], dtype=np.float64)) assert_frame_equal(grouped.sum(), exp_df, check_names=False) assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False) assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0], check_names=False) def test_agg_grouping_is_list_tuple(self): from pandas.core.groupby import Grouping df = tm.makeTimeDataFrame() grouped = df.groupby(lambda x: x.year) grouper = grouped.grouper.groupings[0].grouper grouped.grouper.groupings[0] = Grouping(self.ts.index, list(grouper)) result = grouped.agg(np.mean) expected = grouped.mean() tm.assert_frame_equal(result, expected) grouped.grouper.groupings[0] = Grouping(self.ts.index, tuple(grouper)) result = grouped.agg(np.mean) expected = grouped.mean() tm.assert_frame_equal(result, expected) def test_aggregate_api_consistency(self): # GH 9052 # make sure that the aggregates via dict # are consistent df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'two', 'two', 'two', 'one', 'two'], 'C': np.random.randn(8) + 1.0, 'D': np.arange(8)}) grouped = df.groupby(['A', 'B']) c_mean = grouped['C'].mean() c_sum = grouped['C'].sum() d_mean = grouped['D'].mean() d_sum = grouped['D'].sum() result = grouped['D'].agg(['sum', 'mean']) expected = pd.concat([d_sum, d_mean], axis=1) expected.columns = ['sum', 'mean'] assert_frame_equal(result, expected, check_like=True) result = grouped.agg([np.sum, np.mean]) expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1) expected.columns = MultiIndex.from_product([['C', 'D'], ['sum', 'mean']]) assert_frame_equal(result, expected, check_like=True) result = grouped[['D', 'C']].agg([np.sum, np.mean]) expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1) expected.columns = MultiIndex.from_product([['D', 'C'], ['sum', 'mean']]) assert_frame_equal(result, expected, check_like=True) result = grouped.agg({'C': 'mean', 'D': 'sum'}) expected = pd.concat([d_sum, c_mean], axis=1) assert_frame_equal(result, expected, check_like=True) result = grouped.agg({'C': ['mean', 'sum'], 'D': ['mean', 'sum']}) expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1) expected.columns = MultiIndex.from_product([['C', 'D'], ['mean', 'sum']]) result = grouped[['D', 'C']].agg({'r': np.sum, 'r2': np.mean}) expected = pd.concat([d_sum, c_sum, d_mean, c_mean], axis=1) expected.columns = MultiIndex.from_product([['r', 'r2'], ['D', 'C']]) assert_frame_equal(result, expected, check_like=True) def test_agg_compat(self): # GH 12334 df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'two', 'two', 'two', 'one', 'two'], 'C': np.random.randn(8) + 1.0, 'D': np.arange(8)}) g = df.groupby(['A', 'B']) expected = pd.concat([g['D'].sum(), g['D'].std()], axis=1) expected.columns = MultiIndex.from_tuples([('C', 'sum'), ('C', 'std')]) result = g['D'].agg({'C': ['sum', 'std']}) assert_frame_equal(result, expected, check_like=True) expected = pd.concat([g['D'].sum(), g['D'].std()], axis=1) expected.columns = ['C', 'D'] result = g['D'].agg({'C': 'sum', 'D': 'std'}) assert_frame_equal(result, expected, check_like=True) def test_agg_nested_dicts(self): # API change for disallowing these types of nested dicts df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'two', 'two', 'two', 'one', 'two'], 'C': np.random.randn(8) + 1.0, 'D': np.arange(8)}) g = df.groupby(['A', 'B']) def f(): g.aggregate({'r1': {'C': ['mean', 'sum']}, 'r2': {'D': ['mean', 'sum']}}) self.assertRaises(SpecificationError, f) result = g.agg({'C': {'ra': ['mean', 'std']}, 'D': {'rb': ['mean', 'std']}}) expected = pd.concat([g['C'].mean(), g['C'].std(), g['D'].mean(), g['D'].std()], axis=1) expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), ( 'ra', 'std'), ('rb', 'mean'), ('rb', 'std')]) assert_frame_equal(result, expected, check_like=True) # same name as the original column # GH9052 expected = g['D'].agg({'result1': np.sum, 'result2': np.mean}) expected = expected.rename(columns={'result1': 'D'}) result = g['D'].agg({'D': np.sum, 'result2': np.mean}) assert_frame_equal(result, expected, check_like=True) def test_agg_python_multiindex(self): grouped = self.mframe.groupby(['A', 'B']) result = grouped.agg(np.mean) expected = grouped.mean() tm.assert_frame_equal(result, expected) def test_aggregate_str_func(self): def _check_results(grouped): # single series result = grouped['A'].agg('std') expected = grouped['A'].std() assert_series_equal(result, expected) # group frame by function name result = grouped.aggregate('var') expected = grouped.var() assert_frame_equal(result, expected) # group frame by function dict result = grouped.agg(OrderedDict([['A', 'var'], ['B', 'std'], ['C', 'mean'], ['D', 'sem']])) expected = DataFrame(OrderedDict([['A', grouped['A'].var( )], ['B', grouped['B'].std()], ['C', grouped['C'].mean()], ['D', grouped['D'].sem()]])) assert_frame_equal(result, expected) by_weekday = self.tsframe.groupby(lambda x: x.weekday()) _check_results(by_weekday) by_mwkday = self.tsframe.groupby([lambda x: x.month, lambda x: x.weekday()]) _check_results(by_mwkday) def test_aggregate_item_by_item(self): df = self.df.copy() df['E'] = ['a'] * len(self.df) grouped = self.df.groupby('A') # API change in 0.11 # def aggfun(ser): # return len(ser + 'a') # result = grouped.agg(aggfun) # self.assertEqual(len(result.columns), 1) aggfun = lambda ser: ser.size result = grouped.agg(aggfun) foo = (self.df.A == 'foo').sum() bar = (self.df.A == 'bar').sum() K = len(result.columns) # GH5782 # odd comparisons can result here, so cast to make easy exp = pd.Series(np.array([foo] * K), index=list('BCD'), dtype=np.float64, name='foo') tm.assert_series_equal(result.xs('foo'), exp) exp = pd.Series(np.array([bar] * K), index=list('BCD'), dtype=np.float64, name='bar') tm.assert_almost_equal(result.xs('bar'), exp) def aggfun(ser): return ser.size result = DataFrame().groupby(self.df.A).agg(aggfun) tm.assertIsInstance(result, DataFrame) self.assertEqual(len(result), 0) def test_agg_item_by_item_raise_typeerror(self): from numpy.random import randint df = DataFrame(randint(10, size=(20, 10))) def raiseException(df): pprint_thing('----------------------------------------') pprint_thing(df.to_string()) raise TypeError self.assertRaises(TypeError, df.groupby(0).agg, raiseException) def test_series_agg_multikey(self): ts = tm.makeTimeSeries() grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) result = grouped.agg(np.sum) expected = grouped.sum() assert_series_equal(result, expected) def test_series_agg_multi_pure_python(self): data = DataFrame( {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', 'foo', 'foo', 'foo'], 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', 'two', 'two', 'one'], 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', 'dull', 'shiny', 'shiny', 'shiny'], 'D': np.random.randn(11), 'E': np.random.randn(11), 'F': np.random.randn(11)}) def bad(x): assert (len(x.base) > 0) return 'foo' result = data.groupby(['A', 'B']).agg(bad) expected = data.groupby(['A', 'B']).agg(lambda x: 'foo') assert_frame_equal(result, expected)
'D': _frame['D'].astype('int32') }) _mixed2 = DataFrame({ 'A': _frame2['A'].copy(), 'B': _frame2['B'].astype('float32'), 'C': _frame2['C'].astype('int64'), 'D': _frame2['D'].astype('int32') }) _integer = DataFrame(np.random.randint(1, 100, size=(10001, 4)), columns=list('ABCD'), dtype='int64') _integer2 = DataFrame(np.random.randint(1, 100, size=(101, 4)), columns=list('ABCD'), dtype='int64') _frame_panel = Panel( dict(ItemA=_frame.copy(), ItemB=(_frame.copy() + 3), ItemC=_frame.copy(), ItemD=_frame.copy())) _frame2_panel = Panel( dict(ItemA=_frame2.copy(), ItemB=(_frame2.copy() + 3), ItemC=_frame2.copy(), ItemD=_frame2.copy())) _integer_panel = Panel( dict(ItemA=_integer, ItemB=(_integer + 34).astype('int64'))) _integer2_panel = Panel( dict(ItemA=_integer2, ItemB=(_integer2 + 34).astype('int64'))) _mixed_panel = Panel(dict(ItemA=_mixed, ItemB=(_mixed + 3))) _mixed2_panel = Panel(dict(ItemA=_mixed2, ItemB=(_mixed2 + 3)))
class TestMoments(unittest.TestCase): _nan_locs = np.arange(20, 40) _inf_locs = np.array([]) def setUp(self): arr = randn(N) arr[self._nan_locs] = np.NaN self.arr = arr self.rng = DateRange(datetime(2009, 1, 1), periods=N) self.series = Series(arr.copy(), index=self.rng) self.frame = DataFrame(randn(N, K), index=self.rng, columns=np.arange(K)) def test_rolling_sum(self): self._check_moment_func(mom.rolling_sum, np.sum) def test_rolling_count(self): counter = lambda x: np.isfinite(x).astype(float).sum() self._check_moment_func(mom.rolling_count, counter, has_min_periods=False, preserve_nan=False) def test_rolling_mean(self): self._check_moment_func(mom.rolling_mean, np.mean) def test_rolling_median(self): self._check_moment_func(mom.rolling_median, np.median) def test_rolling_min(self): self._check_moment_func(mom.rolling_min, np.min) def test_rolling_max(self): self._check_moment_func(mom.rolling_max, np.max) def test_rolling_quantile(self): qs = [.1, .5, .9] def scoreatpercentile(a, per): values = np.sort(a, axis=0) idx = per / 1. * (values.shape[0] - 1) return values[int(idx)] for q in qs: def f(x, window, min_periods=None, time_rule=None): return mom.rolling_quantile(x, window, q, min_periods=min_periods, time_rule=time_rule) def alt(x): return scoreatpercentile(x, q) self._check_moment_func(f, alt) def test_rolling_apply(self): def roll_mean(x, window, min_periods=None, time_rule=None): return mom.rolling_apply(x, window, lambda x: x[np.isfinite(x)].mean(), min_periods=min_periods, time_rule=time_rule) self._check_moment_func(roll_mean, np.mean) def test_rolling_std(self): self._check_moment_func(mom.rolling_std, lambda x: np.std(x, ddof=1)) def test_rolling_var(self): self._check_moment_func(mom.rolling_var, lambda x: np.var(x, ddof=1)) def test_rolling_skew(self): try: from scipy.stats import skew except ImportError: raise nose.SkipTest('no scipy') self._check_moment_func(mom.rolling_skew, lambda x: skew(x, bias=False)) def test_rolling_kurt(self): try: from scipy.stats import kurtosis except ImportError: raise nose.SkipTest('no scipy') self._check_moment_func(mom.rolling_kurt, lambda x: kurtosis(x, bias=False)) def _check_moment_func(self, func, static_comp, window=50, has_min_periods=True, has_time_rule=True, preserve_nan=True): self._check_ndarray(func, static_comp, window=window, has_min_periods=has_min_periods, preserve_nan=preserve_nan) self._check_structures(func, static_comp, has_min_periods=has_min_periods, has_time_rule=has_time_rule) def _check_ndarray(self, func, static_comp, window=50, has_min_periods=True, preserve_nan=True): result = func(self.arr, window) assert_almost_equal(result[-1], static_comp(self.arr[-50:])) if preserve_nan: assert (np.isnan(result[self._nan_locs]).all()) # excluding NaNs correctly arr = randn(50) arr[:10] = np.NaN arr[-10:] = np.NaN if has_min_periods: result = func(arr, 50, min_periods=30) assert_almost_equal(result[-1], static_comp(arr[10:-10])) # min_periods is working correctly result = func(arr, 20, min_periods=15) self.assert_(np.isnan(result[23])) self.assert_(not np.isnan(result[24])) self.assert_(not np.isnan(result[-6])) self.assert_(np.isnan(result[-5])) # min_periods=0 result0 = func(arr, 20, min_periods=0) result1 = func(arr, 20, min_periods=1) assert_almost_equal(result0, result1) else: result = func(arr, 50) assert_almost_equal(result[-1], static_comp(arr[10:-10])) def _check_structures(self, func, static_comp, has_min_periods=True, has_time_rule=True): series_result = func(self.series, 50) self.assert_(isinstance(series_result, Series)) frame_result = func(self.frame, 50) self.assertEquals(type(frame_result), DataFrame) # check time_rule works if has_time_rule: win = 25 minp = 10 if has_min_periods: series_result = func(self.series[::2], win, min_periods=minp, time_rule='WEEKDAY') frame_result = func(self.frame[::2], win, min_periods=minp, time_rule='WEEKDAY') else: series_result = func(self.series[::2], win, time_rule='WEEKDAY') frame_result = func(self.frame[::2], win, time_rule='WEEKDAY') last_date = series_result.index[-1] prev_date = last_date - 24 * datetools.bday trunc_series = self.series[::2].truncate(prev_date, last_date) trunc_frame = self.frame[::2].truncate(prev_date, last_date) assert_almost_equal(series_result[-1], static_comp(trunc_series)) assert_almost_equal(frame_result.xs(last_date), trunc_frame.apply(static_comp)) def test_ewma(self): self._check_ew(mom.ewma) def test_ewmvar(self): self._check_ew(mom.ewmvar) def test_ewmvol(self): self._check_ew(mom.ewmvol) def test_ewma_span_com_args(self): A = mom.ewma(self.arr, com=9.5) B = mom.ewma(self.arr, span=20) assert_almost_equal(A, B) self.assertRaises(Exception, mom.ewma, self.arr, com=9.5, span=20) self.assertRaises(Exception, mom.ewma, self.arr) def _check_ew(self, func): self._check_ew_ndarray(func) self._check_ew_structures(func) def _check_ew_ndarray(self, func, preserve_nan=False): result = func(self.arr, com=10) if preserve_nan: assert (np.isnan(result[self._nan_locs]).all()) # excluding NaNs correctly arr = randn(50) arr[:10] = np.NaN arr[-10:] = np.NaN # ??? check something # pass in ints result2 = func(np.arange(50), span=10) self.assert_(result.dtype == np.float_) def _check_ew_structures(self, func): series_result = func(self.series, com=10) self.assert_(isinstance(series_result, Series)) frame_result = func(self.frame, com=10) self.assertEquals(type(frame_result), DataFrame) # binary moments def test_rolling_cov(self): A = self.series B = A + randn(len(A)) result = mom.rolling_cov(A, B, 50, min_periods=25) assert_almost_equal(result[-1], np.cov(A[-50:], B[-50:])[0, 1]) def test_rolling_corr(self): A = self.series B = A + randn(len(A)) result = mom.rolling_corr(A, B, 50, min_periods=25) assert_almost_equal(result[-1], np.corrcoef(A[-50:], B[-50:])[0, 1]) # test for correct bias correction a = tm.makeTimeSeries() b = tm.makeTimeSeries() a[:5] = np.nan b[:10] = np.nan result = mom.rolling_corr(a, b, len(a), min_periods=1) assert_almost_equal(result[-1], a.corr(b)) def test_rolling_corr_pairwise(self): panel = mom.rolling_corr_pairwise(self.frame, 10, min_periods=5) correl = panel.ix[:, 1, 5] exp = mom.rolling_corr(self.frame[1], self.frame[5], 10, min_periods=5) tm.assert_series_equal(correl, exp) def test_flex_binary_frame(self): def _check(method): series = self.frame[1] res = method(series, self.frame, 10) res2 = method(self.frame, series, 10) exp = self.frame.apply(lambda x: method(series, x, 10)) tm.assert_frame_equal(res, exp) tm.assert_frame_equal(res2, exp) frame2 = self.frame.copy() frame2.values[:] = np.random.randn(*frame2.shape) res3 = method(self.frame, frame2, 10) exp = DataFrame( dict((k, method(self.frame[k], frame2[k], 10)) for k in self.frame)) tm.assert_frame_equal(res3, exp) methods = [mom.rolling_corr, mom.rolling_cov] for meth in methods: _check(meth) def test_ewmcov(self): self._check_binary_ew(mom.ewmcov) def test_ewmcorr(self): self._check_binary_ew(mom.ewmcorr) def _check_binary_ew(self, func): A = Series(randn(50), index=np.arange(50)) B = A[2:] + randn(48) A[:10] = np.NaN B[-10:] = np.NaN result = func(A, B, 20, min_periods=5) self.assert_(np.isnan(result.values[:15]).all()) self.assert_(not np.isnan(result.values[15:]).any()) self.assertRaises(Exception, func, A, randn(50), 20, min_periods=5)
class PandasGroupby(SparklingPandasTestCase): def setUp(self): """ Setup the dataframes used for the groupby tests derived from pandas """ self.dateRange = bdate_range('1/1/2005', periods=250) self.stringIndex = Index([rands(8).upper() for x in range(250)]) self.groupId = Series([x[0] for x in self.stringIndex], index=self.stringIndex) self.groupDict = dict((k, v) for k, v in compat.iteritems(self.groupId)) self.columnIndex = Index(['A', 'B', 'C', 'D', 'E']) randMat = np.random.randn(250, 5) self.stringMatrix = DataFrame(randMat, columns=self.columnIndex, index=self.stringIndex) self.timeMatrix = DataFrame(randMat, columns=self.columnIndex, index=self.dateRange) self.ts = tm.makeTimeSeries() self.seriesd = tm.getSeriesData() self.tsd = tm.getTimeSeriesData() self.frame = DataFrame(self.seriesd) self.tsframe = DataFrame(self.tsd) self.df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C': np.random.randn(8), 'D': np.random.randn(8)}) self.df_mixed_floats = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C': np.random.randn(8), 'D': np.array(np.random.randn(8), dtype='float32')}) index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) self.mframe = DataFrame(np.random.randn(10, 3), index=index, columns=['A', 'B', 'C']) self.three_group = DataFrame({'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', 'foo', 'foo', 'foo'], 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', 'two', 'two', 'one'], 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', 'dull', 'shiny', 'shiny', 'shiny'], 'D': np.random.randn(11), 'E': np.random.randn(11), 'F': np.random.randn(11)}) super(self.__class__, self).setUp() def test_first_last_nth(self): # tests for first / last / nth ddf = self.psc.from_data_frame(self.df) assert_frame_equal(ddf.collect(), self.df) grouped = self.psc.from_data_frame(self.df).groupby('A') first = grouped.first().collect() expected = self.df.ix[[1, 0], ['B', 'C', 'D']] expected.index = Index(['bar', 'foo'], name='A') expected = expected.sort_index() assert_frame_equal(first, expected) nth = grouped.nth(0).collect() assert_frame_equal(nth, expected) last = grouped.last().collect() expected = self.df.ix[[5, 7], ['B', 'C', 'D']] expected.index = Index(['bar', 'foo'], name='A') assert_frame_equal(last, expected) nth = grouped.nth(-1).collect() assert_frame_equal(nth, expected) nth = grouped.nth(1).collect() expected = self.df.ix[[2, 3], ['B', 'C', 'D']].copy() expected.index = Index(['foo', 'bar'], name='A') expected = expected.sort_index() assert_frame_equal(nth, expected) @unittest2.expectedFailure def test_getitem(self): # it works! grouped['B'].first() grouped['B'].last() grouped['B'].nth(0) self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan self.assertTrue(com.isnull(grouped['B'].first()['foo'])) self.assertTrue(com.isnull(grouped['B'].last()['foo'])) # not sure what this is testing self.assertTrue(com.isnull(grouped['B'].nth(0)[0])) @unittest2.expectedFailure def test_new_in0140(self): """ Test new functionality in 0.14.0. This currently doesn't work. """ # v0.14.0 whatsnew df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) ddf = self.psc.from_data_frame(df) g = ddf.groupby('A') result = g.first().collect() expected = df.iloc[[1, 2]].set_index('A') assert_frame_equal(result, expected) expected = df.iloc[[1, 2]].set_index('A') result = g.nth(0, dropna='any').collect() assert_frame_equal(result, expected) @unittest2.expectedFailure def test_first_last_nth_dtypes(self): """ We do groupby fine on mixed types, but our copy from local dataframe ends up re-running the guess type function, so the dtypes don't match. Issue #25 """ df = self.df_mixed_floats.copy() df['E'] = True df['F'] = 1 # tests for first / last / nth grouped = ddf.groupby('A') first = grouped.first().collect() expected = df.ix[[1, 0], ['B', 'C', 'D', 'E', 'F']] expected.index = Index(['bar', 'foo'], name='A') expected = expected.sort_index() assert_frame_equal(first, expected) last = grouped.last().collect() expected = df.ix[[5, 7], ['B', 'C', 'D', 'E', 'F']] expected.index = Index(['bar', 'foo'], name='A') expected = expected.sort_index() assert_frame_equal(last, expected) nth = grouped.nth(1).collect() expected = df.ix[[3, 2], ['B', 'C', 'D', 'E', 'F']] expected.index = Index(['bar', 'foo'], name='A') expected = expected.sort_index() assert_frame_equal(nth, expected) def test_var_on_multiplegroups(self): df = DataFrame({'data1': np.random.randn(5), 'data2': np.random.randn(5), 'data3': np.random.randn(5), 'key1': ['a', 'a', 'b', 'b', 'a'], 'key2': ['one', 'two', 'one', 'two', 'one']}) ddf = self.psc.from_data_frame(df) dgrouped = ddf.groupby(['key1', 'key2']) grouped = df.groupby(['key1', 'key2']) assert_frame_equal(dgrouped.var().collect(), grouped.var()) def test_agg_api(self): # Note: needs a very recent version of pandas to pass # TODO(holden): Pass this test if local fails # GH 6337 # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error # different api for agg when passed custom function with mixed frame df = DataFrame({'data1': np.random.randn(5), 'data2': np.random.randn(5), 'key1': ['a', 'a', 'b', 'b', 'a'], 'key2': ['one', 'two', 'one', 'two', 'one']}) ddf = self.psc.from_data_frame(df) dgrouped = ddf.groupby('key1') grouped = df.groupby('key1') def peak_to_peak(arr): return arr.max() - arr.min() expected = grouped.agg([peak_to_peak]) expected.columns = ['data1', 'data2'] result = dgrouped.agg(peak_to_peak).collect() assert_frame_equal(result, expected) def test_agg_regression1(self): grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month]) dgrouped = self.psc.from_data_frame( self.tsframe).groupby( [lambda x: x.year, lambda x: x.month]) result = dgrouped.agg(np.mean).collect() expected = grouped.agg(np.mean) assert_frame_equal(result, expected)
_frame = DataFrame(randn(10000, 4), columns=list('ABCD'), dtype='float64') _frame2 = DataFrame(randn(100, 4), columns=list('ABCD'), dtype='float64') _mixed = DataFrame({'A': _frame['A'].copy(), 'B': _frame['B'].astype('float32'), 'C': _frame['C'].astype('int64'), 'D': _frame['D'].astype('int32')}) _mixed2 = DataFrame({'A': _frame2['A'].copy(), 'B': _frame2['B'].astype('float32'), 'C': _frame2['C'].astype('int64'), 'D': _frame2['D'].astype('int32')}) _integer = DataFrame( np.random.randint(1, 100, size=(10001, 4)), columns=list('ABCD'), dtype='int64') _integer2 = DataFrame(np.random.randint(1, 100, size=(101, 4)), columns=list('ABCD'), dtype='int64') _frame_panel = Panel(dict(ItemA=_frame.copy(), ItemB=( _frame.copy() + 3), ItemC=_frame.copy(), ItemD=_frame.copy())) _frame2_panel = Panel(dict(ItemA=_frame2.copy(), ItemB=(_frame2.copy() + 3), ItemC=_frame2.copy(), ItemD=_frame2.copy())) _integer_panel = Panel(dict(ItemA=_integer, ItemB=(_integer + 34).astype( 'int64'))) _integer2_panel = Panel(dict(ItemA=_integer2, ItemB=(_integer2 + 34).astype( 'int64'))) _mixed_panel = Panel(dict(ItemA=_mixed, ItemB=(_mixed + 3))) _mixed2_panel = Panel(dict(ItemA=_mixed2, ItemB=(_mixed2 + 3))) class TestExpressions(tm.TestCase): _multiprocess_can_split_ = False
class TestMultiLevel(unittest.TestCase): def setUp(self): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]]) self.frame = DataFrame(np.random.randn(10, 3), index=index, columns=['A', 'B', 'C']) tm.N = 100 self.tdf = tm.makeTimeDataFrame() self.ymd = self.tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum() def test_pickle(self): import cPickle def _test_roundtrip(frame): pickled = cPickle.dumps(frame) unpickled = cPickle.loads(pickled) assert_frame_equal(frame, unpickled) _test_roundtrip(self.frame) _test_roundtrip(self.frame.T) _test_roundtrip(self.ymd) _test_roundtrip(self.ymd.T) def test_reindex(self): reindexed = self.frame.ix[[('foo', 'one'), ('bar', 'one')]] expected = self.frame.ix[[0, 3]] assert_frame_equal(reindexed, expected) def test_reindex_preserve_levels(self): new_index = self.ymd.index[::10] chunk = self.ymd.reindex(new_index) self.assert_(chunk.index is new_index) chunk = self.ymd.ix[new_index] self.assert_(chunk.index is new_index) ymdT = self.ymd.T chunk = ymdT.reindex(columns=new_index) self.assert_(chunk.columns is new_index) chunk = ymdT.ix[:, new_index] self.assert_(chunk.columns is new_index) def test_repr_to_string(self): repr(self.frame) repr(self.ymd) repr(self.frame.T) repr(self.ymd.T) buf = StringIO() self.frame.to_string(buf=buf) self.ymd.to_string(buf=buf) self.frame.T.to_string(buf=buf) self.ymd.T.to_string(buf=buf) def test_getitem_simple(self): df = self.frame.T col = df['foo', 'one'] assert_almost_equal(col.values, df.values[:, 0]) self.assertRaises(KeyError, df.__getitem__, ('foo', 'four')) self.assertRaises(KeyError, df.__getitem__, 'foobar') def test_series_getitem(self): s = self.ymd['A'] result = s[2000, 3] result2 = s.ix[2000, 3] expected = s[42:65] expected.index = expected.index.droplevel(0).droplevel(0) assert_series_equal(result, expected) result = s[2000, 3, 10] expected = s[49] self.assertEquals(result, expected) # fancy result = s.ix[[(2000, 3, 10), (2000, 3, 13)]] expected = s[49:51] assert_series_equal(result, expected) # key error self.assertRaises(KeyError, s.__getitem__, (2000, 3, 4)) def test_series_setitem(self): s = self.ymd['A'] s[2000, 3] = np.nan self.assert_(isnull(s[42:65]).all()) self.assert_(notnull(s[:42]).all()) self.assert_(notnull(s[65:]).all()) s[2000, 3, 10] = np.nan self.assert_(isnull(s[49])) def test_series_slice_partial(self): pass def test_xs(self): xs = self.frame.xs(('bar', 'two')) xs2 = self.frame.ix[('bar', 'two')] assert_series_equal(xs, xs2) assert_almost_equal(xs.values, self.frame.values[4]) def test_xs_partial(self): result = self.frame.xs('foo') result2 = self.frame.ix['foo'] expected = self.frame.T['foo'].T assert_frame_equal(result, expected) assert_frame_equal(result, result2) def test_fancy_2d(self): result = self.frame.ix['foo', 'B'] expected = self.frame.xs('foo')['B'] assert_series_equal(result, expected) ft = self.frame.T result = ft.ix['B', 'foo'] expected = ft.xs('B')['foo'] assert_series_equal(result, expected) def test_getitem_toplevel(self): df = self.frame.T result = df['foo'] expected = df.reindex(columns=df.columns[:3]) expected.columns = expected.columns.droplevel(0) assert_frame_equal(result, expected) result = df['bar'] result2 = df.ix[:, 'bar'] expected = df.reindex(columns=df.columns[3:5]) expected.columns = expected.columns.droplevel(0) assert_frame_equal(result, expected) assert_frame_equal(result, result2) def test_getitem_int(self): levels = [[0, 1], [0, 1, 2]] labels = [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]] index = MultiIndex(levels=levels, labels=labels) frame = DataFrame(np.random.randn(6, 2), index=index) result = frame.ix[1] expected = frame[-3:] expected.index = expected.index.droplevel(0) assert_frame_equal(result, expected) # raises exception self.assertRaises(KeyError, frame.ix.__getitem__, 3) # however this will work result = self.frame.ix[2] expected = self.frame.xs(self.frame.index[2]) assert_series_equal(result, expected) def test_getitem_partial(self): ymd = self.ymd.T result = ymd[2000, 2] expected = ymd.reindex(columns=ymd.columns[ymd.columns.labels[1] == 1]) expected.columns = expected.columns.droplevel(0).droplevel(0) assert_frame_equal(result, expected) def test_setitem_change_dtype(self): dft = self.frame.T s = dft['foo', 'two'] dft['foo', 'two'] = s > s.median() assert_series_equal(dft['foo', 'two'], s > s.median()) self.assert_(isinstance(dft._data.blocks[1].items, MultiIndex)) reindexed = dft.reindex(columns=[('foo', 'two')]) assert_series_equal(reindexed['foo', 'two'], s > s.median()) def test_fancy_slice_partial(self): result = self.frame.ix['bar':'baz'] expected = self.frame[3:7] assert_frame_equal(result, expected) result = self.ymd.ix[(2000,2):(2000,4)] lev = self.ymd.index.labels[1] expected = self.ymd[(lev >= 1) & (lev <= 3)] assert_frame_equal(result, expected) def test_sortlevel(self): df = self.frame.copy() df.index = np.arange(len(df)) self.assertRaises(Exception, df.sortlevel, 0) # axis=1 # series a_sorted = self.frame['A'].sortlevel(0) self.assertRaises(Exception, self.frame.delevel()['A'].sortlevel) def test_sortlevel_mixed(self): sorted_before = self.frame.sortlevel(1) df = self.frame.copy() df['foo'] = 'bar' sorted_after = df.sortlevel(1) assert_frame_equal(sorted_before, sorted_after.drop(['foo'], axis=1)) dft = self.frame.T sorted_before = dft.sortlevel(1, axis=1) dft['foo', 'three'] = 'bar' sorted_after = dft.sortlevel(1, axis=1) assert_frame_equal(sorted_before.drop([('foo', 'three')], axis=1), sorted_after.drop([('foo', 'three')], axis=1)) def test_count_level(self): def _check_counts(frame, axis=0): index = frame._get_axis(axis) for i in range(index.nlevels): result = frame.count(axis=axis, level=i) expected = frame.groupby(axis=axis, level=i).count(axis=axis) _check_counts(self.frame) _check_counts(self.ymd) _check_counts(self.frame.T, axis=1) _check_counts(self.ymd.T, axis=1) # can't call with level on regular DataFrame df = tm.makeTimeDataFrame() self.assertRaises(Exception, df.count, level=0) def test_unstack(self): # just check that it works for now unstacked = self.ymd.unstack() unstacked2 = unstacked.unstack() # test that ints work unstacked = self.ymd.astype(int).unstack() def test_stack(self): # regular roundtrip unstacked = self.ymd.unstack() restacked = unstacked.stack() assert_frame_equal(restacked, self.ymd) unlexsorted = self.ymd.sortlevel(2) unstacked = unlexsorted.unstack(2) restacked = unstacked.stack() assert_frame_equal(restacked.sortlevel(0), self.ymd) unlexsorted = unlexsorted[::-1] unstacked = unlexsorted.unstack(1) restacked = unstacked.stack().swaplevel(1, 2) assert_frame_equal(restacked.sortlevel(0), self.ymd) unlexsorted = unlexsorted.swaplevel(0, 1) unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1) restacked = unstacked.stack(0).swaplevel(1, 2) assert_frame_equal(restacked.sortlevel(0), self.ymd) # columns unsorted unstacked = self.ymd.unstack() unstacked = unstacked.sort(axis=1, ascending=False) restacked = unstacked.stack() assert_frame_equal(restacked, self.ymd) # more than 2 levels in the columns unstacked = self.ymd.unstack(1).unstack(1) result = unstacked.stack(1) expected = self.ymd.unstack() assert_frame_equal(result, expected) result = unstacked.stack(2) expected = self.ymd.unstack(1) assert_frame_equal(result, expected) result = unstacked.stack(0) expected = self.ymd.stack().unstack(1).unstack(1) assert_frame_equal(result, expected) # not all levels present in each echelon unstacked = self.ymd.unstack(2).ix[:, ::3] stacked = unstacked.stack().stack() ymd_stacked = self.ymd.stack() assert_series_equal(stacked, ymd_stacked.reindex(stacked.index)) def test_stack_mixed_dtype(self): df = self.frame.T df['foo', 'four'] = 'foo' df = df.sortlevel(1, axis=1) stacked = df.stack() assert_series_equal(stacked['foo'], df['foo'].stack()) self.assert_(stacked['bar'].dtype == np.float_) def test_swaplevel(self): swapped = self.frame['A'].swaplevel(0, 1) self.assert_(not swapped.index.equals(self.frame.index)) back = swapped.swaplevel(0, 1) self.assert_(back.index.equals(self.frame.index)) def test_insert_index(self): df = self.ymd[:5].T df[2000, 1, 10] = df[2000, 1, 7] self.assert_(isinstance(df.columns, MultiIndex)) self.assert_((df[2000, 1, 10] == df[2000, 1, 7]).all()) def test_alignment(self): pass def test_is_lexsorted(self): levels = [[0, 1], [0, 1, 2]] index = MultiIndex(levels=levels, labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]) self.assert_(index.is_lexsorted()) index = MultiIndex(levels=levels, labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]]) self.assert_(not index.is_lexsorted()) index = MultiIndex(levels=levels, labels=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]]) self.assert_(not index.is_lexsorted()) self.assert_(index.lexsort_depth == 0) def test_frame_getitem_view(self): df = self.frame.T df['foo'].values[:] = 0 self.assert_((df['foo'].values == 0).all()) # but not if it's mixed-type df['foo', 'four'] = 'foo' df = df.sortlevel(0, axis=1) df['foo']['one'] = 2 self.assert_((df['foo', 'one'] == 0).all()) def test_frame_getitem_not_sorted(self): df = self.frame.T df['foo', 'four'] = 'foo' arrays = [np.array(x) for x in zip(*df.columns.get_tuple_index())] result = df['foo'] result2 = df.ix[:, 'foo'] expected = df.reindex(columns=df.columns[arrays[0] == 'foo']) expected.columns = expected.columns.droplevel(0) assert_frame_equal(result, expected) assert_frame_equal(result2, expected) df = df.T result = df.xs('foo') result2 = df.ix['foo'] expected = df.reindex(df.index[arrays[0] == 'foo']) expected.index = expected.index.droplevel(0) assert_frame_equal(result, expected) assert_frame_equal(result2, expected) def test_series_getitem_not_sorted(self): arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'], ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] tuples = zip(*arrays) index = MultiIndex.from_tuples(tuples) s = Series(randn(8), index=index) arrays = [np.array(x) for x in zip(*index.get_tuple_index())] result = s['qux'] result2 = s.ix['qux'] expected = s[arrays[0] == 'qux'] expected.index = expected.index.droplevel(0) assert_series_equal(result, expected) assert_series_equal(result2, expected)