Code example #1
    def test_multi_assign(self):

        # GH 3626, an assignment of a sub-df to a df
        df = DataFrame({'FC':['a','b','a','b','a','b'],
                        'PF':[0,0,0,0,1,1],
                        'col1':range(6),
                        'col2':range(6,12)})
        df.ix[1,0]=np.nan
        df2 = df.copy()

        mask=~df2.FC.isnull()
        cols=['col1', 'col2']

        dft = df2 * 2
        dft.ix[3,3] = np.nan

        expected = DataFrame({'FC':['a',np.nan,'a','b','a','b'],
                              'PF':[0,0,0,0,1,1],
                              'col1':Series([0,1,4,6,8,10]),
                              'col2':[12,7,16,np.nan,20,22]})


        # frame on rhs
        df2.ix[mask, cols]= dft.ix[mask, cols]
        assert_frame_equal(df2,expected)
        df2.ix[mask, cols]= dft.ix[mask, cols]
        assert_frame_equal(df2,expected)

        # with an ndarray on rhs
        df2 = df.copy()
        df2.ix[mask, cols]= dft.ix[mask, cols].values
        assert_frame_equal(df2,expected)
        df2.ix[mask, cols]= dft.ix[mask, cols].values
        assert_frame_equal(df2,expected)
Code example #2
File: test_indexing.py  Project: rgbkrk/pandas
    def test_multi_assign(self):

        # GH 3626, an assignment of a sub-df to a df
        df = DataFrame({'FC':['a','b','a','b','a','b'],
                        'PF':[0,0,0,0,1,1],
                        'col1':lrange(6),
                        'col2':lrange(6,12)})
        df.ix[1,0]=np.nan
        df2 = df.copy()

        mask=~df2.FC.isnull()
        cols=['col1', 'col2']

        dft = df2 * 2
        dft.ix[3,3] = np.nan

        expected = DataFrame({'FC':['a',np.nan,'a','b','a','b'],
                              'PF':[0,0,0,0,1,1],
                              'col1':Series([0,1,4,6,8,10]),
                              'col2':[12,7,16,np.nan,20,22]})


        # frame on rhs
        df2.ix[mask, cols]= dft.ix[mask, cols]
        assert_frame_equal(df2,expected)

        df2.ix[mask, cols]= dft.ix[mask, cols]
        assert_frame_equal(df2,expected)

        # with an ndarray on rhs
        df2 = df.copy()
        df2.ix[mask, cols]= dft.ix[mask, cols].values
        assert_frame_equal(df2,expected)
        df2.ix[mask, cols]= dft.ix[mask, cols].values
        assert_frame_equal(df2,expected)
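Both example #1 and example #2 drive the masked, multi-column assignment through the long-deprecated .ix indexer. A minimal sketch of the same operation with the current .loc API, assuming a recent pandas version (this is not part of the original test):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'FC': ['a', 'b', 'a', 'b', 'a', 'b'],
                       'PF': [0, 0, 0, 0, 1, 1],
                       'col1': range(6),
                       'col2': range(6, 12)})
    df.loc[1, 'FC'] = np.nan            # same setup as the test, label-based
    mask = df['FC'].notna()             # rows where FC is not missing
    cols = ['col1', 'col2']
    dft = df[cols] * 2
    df.loc[mask, cols] = dft.loc[mask, cols]   # frame on the rhs, aligned by index/columns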
Code example #3
File: test_indexing.py  Project: rgbkrk/pandas
    def test_astype_assignment_with_iloc(self):

        # GH4312
        df_orig = DataFrame([['1','2','3','.4',5,6.,'foo']],columns=list('ABCDEFG'))

        df = df_orig.copy()
        df.iloc[:,0:3] = df.iloc[:,0:3].astype(int)
        result = df.get_dtype_counts().sort_index()
        expected = Series({ 'int64' : 4, 'float64' : 1, 'object' : 2 }).sort_index()
        assert_series_equal(result,expected)

        df = df_orig.copy()
        df.iloc[:,0:3] = df.iloc[:,0:3].convert_objects(convert_numeric=True)
        result = df.get_dtype_counts().sort_index()
        expected = Series({ 'int64' : 4, 'float64' : 1, 'object' : 2 }).sort_index()
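get_dtype_counts and convert_objects used in example #3 have since been removed from pandas. A rough sketch of the replacement calls, assuming a recent pandas version (no claim is made here about the exact dtype counts, which is what GH4312 was about):

    import pandas as pd

    df = pd.DataFrame([['1', '2', '3', '.4', 5, 6., 'foo']], columns=list('ABCDEFG'))
    counts = df.dtypes.value_counts()                               # replaces get_dtype_counts()
    df[['A', 'B', 'C']] = df[['A', 'B', 'C']].apply(pd.to_numeric)  # replaces convert_objects(convert_numeric=True)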
Code example #4
File: test_filters.py  Project: tzinckgraf/pandas
    def test_filter_and_transform_with_non_unique_timestamp_index(self):
        # GH4620
        t0 = Timestamp('2013-09-30 00:05:00')
        t1 = Timestamp('2013-10-30 00:05:00')
        t2 = Timestamp('2013-11-30 00:05:00')
        index = [t1, t1, t1, t2, t1, t1, t0, t1]
        df = DataFrame(
            {
                'pid': [1, 1, 1, 2, 2, 3, 3, 3],
                'tag': [23, 45, 62, 24, 45, 34, 25, 62]
            },
            index=index)
        grouped_df = df.groupby('tag')
        ser = df['pid']
        grouped_ser = ser.groupby(df['tag'])
        expected_indexes = [1, 2, 4, 7]

        # Filter DataFrame
        actual = grouped_df.filter(lambda x: len(x) > 1)
        expected = df.iloc[expected_indexes]
        assert_frame_equal(actual, expected)

        actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
        expected = df.copy()
        expected.iloc[[0, 3, 5, 6]] = np.nan
        assert_frame_equal(actual, expected)

        # Filter Series
        actual = grouped_ser.filter(lambda x: len(x) > 1)
        expected = ser.take(expected_indexes)
        assert_series_equal(actual, expected)

        actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
        NA = np.nan
        expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid')
        # ^ made manually because this can get confusing!
        assert_series_equal(actual, expected)

        # Transform Series
        actual = grouped_ser.transform(len)
        expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid')
        assert_series_equal(actual, expected)

        # Transform (a column from) DataFrameGroupBy
        actual = grouped_df.pid.transform(len)
        assert_series_equal(actual, expected)
Code example #5
File: test_filters.py  Project: AlexisMignon/pandas
    def test_filter_and_transform_with_non_unique_timestamp_index(self):
        # GH4620
        t0 = Timestamp('2013-09-30 00:05:00')
        t1 = Timestamp('2013-10-30 00:05:00')
        t2 = Timestamp('2013-11-30 00:05:00')
        index = [t1, t1, t1, t2, t1, t1, t0, t1]
        df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
                        'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index)
        grouped_df = df.groupby('tag')
        ser = df['pid']
        grouped_ser = ser.groupby(df['tag'])
        expected_indexes = [1, 2, 4, 7]

        # Filter DataFrame
        actual = grouped_df.filter(lambda x: len(x) > 1)
        expected = df.iloc[expected_indexes]
        assert_frame_equal(actual, expected)

        actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
        expected = df.copy()
        expected.iloc[[0, 3, 5, 6]] = np.nan
        assert_frame_equal(actual, expected)

        # Filter Series
        actual = grouped_ser.filter(lambda x: len(x) > 1)
        expected = ser.take(expected_indexes)
        assert_series_equal(actual, expected)

        actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
        NA = np.nan
        expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid')
        # ^ made manually because this can get confusing!
        assert_series_equal(actual, expected)

        # Transform Series
        actual = grouped_ser.transform(len)
        expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid')
        assert_series_equal(actual, expected)

        # Transform (a column from) DataFrameGroupBy
        actual = grouped_df.pid.transform(len)
        assert_series_equal(actual, expected)
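Examples #4 and #5 exercise GroupBy.filter on a non-unique timestamp index. As a minimal standalone sketch of the filter semantics, assuming a recent pandas version: rows from failing groups are dropped by default and set to NaN when dropna=False.

    import pandas as pd

    ser = pd.Series([1, 1, 2, 3, 3, 3])
    grouped = ser.groupby(ser)

    kept = grouped.filter(lambda x: len(x) > 1)                   # only rows from groups of size > 1
    masked = grouped.filter(lambda x: len(x) > 1, dropna=False)   # same length as ser, failing groups -> NaN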
Code example #6
    def test_ix_assign_column_mixed(self):
        # GH #1142
        df = DataFrame(tm.getSeriesData())
        df['foo'] = 'bar'

        orig = df.ix[:, 'B'].copy()
        df.ix[:, 'B'] = df.ix[:, 'B'] + 1
        assert_series_equal(df.B, orig + 1)

        # GH 3668, mixed frame with series value
        df = DataFrame({'x':range(10), 'y':range(10,20),'z' : 'bar'})
        expected = df.copy()
        expected.ix[0, 'y'] = 1000
        expected.ix[2, 'y'] = 1200
        expected.ix[4, 'y'] = 1400
        expected.ix[6, 'y'] = 1600
        expected.ix[8, 'y'] = 1800

        df.ix[df.x % 2 == 0, 'y'] = df.ix[df.x % 2 == 0, 'y'] * 100
        assert_frame_equal(df,expected)
Code example #7
File: test_indexing.py  Project: ElmPartnersAM/pandas
    def test_ix_assign_column_mixed(self):
        # GH #1142
        df = DataFrame(tm.getSeriesData())
        df['foo'] = 'bar'

        orig = df.ix[:, 'B'].copy()
        df.ix[:, 'B'] = df.ix[:, 'B'] + 1
        assert_series_equal(df.B, orig + 1)

        # GH 3668, mixed frame with series value
        df = DataFrame({'x':range(10), 'y':range(10,20),'z' : 'bar'})
        expected = df.copy()
        expected.ix[0, 'y'] = 1000
        expected.ix[2, 'y'] = 1200
        expected.ix[4, 'y'] = 1400
        expected.ix[6, 'y'] = 1600
        expected.ix[8, 'y'] = 1800

        df.ix[df.x % 2 == 0, 'y'] = df.ix[df.x % 2 == 0, 'y'] * 100
        assert_frame_equal(df,expected)
Code example #8
File: test_indexing.py  Project: rgbkrk/pandas
    def test_ix_assign_column_mixed(self):
        # GH #1142
        df = DataFrame(tm.getSeriesData())
        df['foo'] = 'bar'

        orig = df.ix[:, 'B'].copy()
        df.ix[:, 'B'] = df.ix[:, 'B'] + 1
        assert_series_equal(df.B, orig + 1)

        # GH 3668, mixed frame with series value
        df = DataFrame({'x':lrange(10), 'y':lrange(10,20),'z' : 'bar'})
        expected = df.copy()

        for i in range(5):
            indexer = i*2
            v = 1000 + i*200
            expected.ix[indexer, 'y'] = v
            self.assert_(expected.ix[indexer, 'y'] == v)

        df.ix[df.x % 2 == 0, 'y'] = df.ix[df.x % 2 == 0, 'y'] * 100
        assert_frame_equal(df,expected)

        # GH 4508, making sure consistency of assignments
        df = DataFrame({'a':[1,2,3],'b':[0,1,2]})
        df.ix[[0,2,],'b'] = [100,-100]
        expected = DataFrame({'a' : [1,2,3], 'b' : [100,1,-100] })
        assert_frame_equal(df,expected)

        df = pd.DataFrame({'a': lrange(4) })
        df['b'] = np.nan
        df.ix[[1,3],'b'] = [100,-100]
        expected = DataFrame({'a' : [0,1,2,3], 'b' : [np.nan,100,np.nan,-100] })
        assert_frame_equal(df,expected)

        # ok, but chained assignments are dangerous
        df = pd.DataFrame({'a': lrange(4) })
        df['b'] = np.nan
        df['b'].ix[[1,3]] = [100,-100]
        assert_frame_equal(df,expected)
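The last block of example #8 points out that chained assignments are dangerous: df['b'].ix[[1, 3]] = ... may silently operate on a copy. A minimal sketch of the safe single-call pattern with .loc, assuming a recent pandas version:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'a': range(4)})
    df['b'] = np.nan
    df.loc[[1, 3], 'b'] = [100, -100]   # one .loc call, no chained indexing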
Code example #9
File: test_multilevel.py  Project: GunioRobot/pandas
class TestMultiLevel(unittest.TestCase):

    def setUp(self):
        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]])
        self.frame = DataFrame(np.random.randn(10, 3), index=index,
                               columns=['A', 'B', 'C'])

        tm.N = 100
        self.tdf = tm.makeTimeDataFrame()
        self.ymd = self.tdf.groupby([lambda x: x.year, lambda x: x.month,
                                     lambda x: x.day]).sum()

    def test_pickle(self):
        import cPickle
        def _test_roundtrip(frame):
            pickled = cPickle.dumps(frame)
            unpickled = cPickle.loads(pickled)
            assert_frame_equal(frame, unpickled)

        _test_roundtrip(self.frame)
        _test_roundtrip(self.frame.T)
        _test_roundtrip(self.ymd)
        _test_roundtrip(self.ymd.T)

    def test_reindex(self):
        reindexed = self.frame.ix[[('foo', 'one'), ('bar', 'one')]]
        expected = self.frame.ix[[0, 3]]
        assert_frame_equal(reindexed, expected)

    def test_repr_to_string(self):
        repr(self.frame)
        repr(self.ymd)
        repr(self.frame.T)
        repr(self.ymd.T)

        buf = StringIO()
        self.frame.toString(buf=buf)
        self.ymd.toString(buf=buf)
        self.frame.T.toString(buf=buf)
        self.ymd.T.toString(buf=buf)

    def test_getitem_simple(self):
        df = self.frame.T

        col = df['foo', 'one']
        assert_almost_equal(col.values, df.values[:, 0])
        self.assertRaises(KeyError, df.__getitem__, ('foo', 'four'))
        self.assertRaises(KeyError, df.__getitem__, 'foobar')

    def test_series_getitem(self):
        s = self.ymd['A']

        result = s[2000, 3]
        result2 = s.ix[2000, 3]
        expected = s[42:65]
        expected.index = expected.index.droplevel(0).droplevel(0)
        assert_series_equal(result, expected)

        result = s[2000, 3, 10]
        expected = s[49]
        self.assertEquals(result, expected)

        # fancy
        result = s.ix[[(2000, 3, 10), (2000, 3, 13)]]
        expected = s[49:51]
        assert_series_equal(result, expected)

        # key error
        self.assertRaises(KeyError, s.__getitem__, (2000, 3, 4))

    def test_series_setitem(self):
        s = self.ymd['A']

        s[2000, 3] = np.nan
        self.assert_(isnull(s[42:65]).all())
        self.assert_(notnull(s[:42]).all())
        self.assert_(notnull(s[65:]).all())

        s[2000, 3, 10] = np.nan
        self.assert_(isnull(s[49]))

    def test_series_slice_partial(self):
        pass

    def test_xs(self):
        xs = self.frame.xs(('bar', 'two'))
        xs2 = self.frame.ix[('bar', 'two')]

        assert_series_equal(xs, xs2)
        assert_almost_equal(xs.values, self.frame.values[4])

    def test_xs_partial(self):
        result = self.frame.xs('foo')
        result2 = self.frame.ix['foo']
        expected = self.frame.T['foo'].T
        assert_frame_equal(result, expected)
        assert_frame_equal(result, result2)

    def test_fancy_2d(self):
        result = self.frame.ix['foo', 'B']
        expected = self.frame.xs('foo')['B']
        assert_series_equal(result, expected)

        ft = self.frame.T
        result = ft.ix['B', 'foo']
        expected = ft.xs('B')['foo']
        assert_series_equal(result, expected)

    def test_getitem_toplevel(self):
        df = self.frame.T

        result = df['foo']
        expected = df.reindex(columns=df.columns[:3])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)

        result = df['bar']
        result2 = df.ix[:, 'bar']

        expected = df.reindex(columns=df.columns[3:5])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result, result2)

    def test_getitem_partial(self):
        ymd = self.ymd.T
        result = ymd[2000, 2]

        expected = ymd.reindex(columns=ymd.columns[ymd.columns.labels[1] == 1])
        expected.columns = expected.columns.droplevel(0).droplevel(0)
        assert_frame_equal(result, expected)

    def test_setitem_change_dtype(self):
        dft = self.frame.T
        s = dft['foo', 'two']
        dft['foo', 'two'] = s > s.median()
        assert_series_equal(dft['foo', 'two'], s > s.median())
        self.assert_(isinstance(dft._data.blocks[1].items, MultiIndex))

        reindexed = dft.reindex(columns=[('foo', 'two')])
        assert_series_equal(reindexed['foo', 'two'], s > s.median())

    def test_fancy_slice_partial(self):
        result = self.frame.ix['bar':'baz']
        expected = self.frame[3:7]
        assert_frame_equal(result, expected)

        result = self.ymd.ix[(2000,2):(2000,4)]
        lev = self.ymd.index.labels[1]
        expected = self.ymd[(lev >= 1) & (lev <= 3)]
        assert_frame_equal(result, expected)

    def test_sortlevel(self):
        df = self.frame.copy()
        df.index = np.arange(len(df))
        self.assertRaises(Exception, df.sortlevel, 0)

        # axis=1

        # series
        a_sorted = self.frame['A'].sortlevel(0)
        self.assertRaises(Exception,
                          self.frame.delevel()['A'].sortlevel)

    def test_sortlevel_mixed(self):
        sorted_before = self.frame.sortlevel(1)

        df = self.frame.copy()
        df['foo'] = 'bar'
        sorted_after = df.sortlevel(1)
        assert_frame_equal(sorted_before, sorted_after.drop(['foo'], axis=1))

        dft = self.frame.T
        sorted_before = dft.sortlevel(1, axis=1)
        dft['foo', 'three'] = 'bar'

        sorted_after = dft.sortlevel(1, axis=1)
        assert_frame_equal(sorted_before.drop([('foo', 'three')], axis=1),
                           sorted_after.drop([('foo', 'three')], axis=1))

    def test_count_level(self):
        def _check_counts(frame, axis=0):
            index = frame._get_axis(axis)
            for i in range(index.nlevels):
                result = frame.count(axis=axis, level=i)
                expected = frame.groupby(axis=axis, level=i).count(axis=axis)

        _check_counts(self.frame)
        _check_counts(self.ymd)
        _check_counts(self.frame.T, axis=1)
        _check_counts(self.ymd.T, axis=1)

        # can't call with level on regular DataFrame
        df = tm.makeTimeDataFrame()
        self.assertRaises(Exception, df.count, level=0)

    def test_unstack(self):
        # just check that it works for now
        unstacked = self.ymd.unstack()
        unstacked2 = unstacked.unstack()

    def test_alignment(self):
        pass
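The MultiIndex constructor in example #9 uses the labels= keyword, which newer pandas renames to codes=, and .ix for label selection. A minimal construction sketch, assuming pandas >= 0.24 (not part of the original test):

    import numpy as np
    import pandas as pd

    index = pd.MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                  ['one', 'two', 'three']],
                          codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                 [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]])
    frame = pd.DataFrame(np.random.randn(10, 3), index=index, columns=['A', 'B', 'C'])
    sub = frame.loc[['foo', 'bar']]     # label-based selection on the first level, instead of .ix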
Code example #10
File: test_aggregate.py  Project: vanusy/pandas
class TestGroupByAggregate(tm.TestCase):

    _multiprocess_can_split_ = True

    def setUp(self):
        self.ts = tm.makeTimeSeries()

        self.seriesd = tm.getSeriesData()
        self.tsd = tm.getTimeSeriesData()
        self.frame = DataFrame(self.seriesd)
        self.tsframe = DataFrame(self.tsd)

        self.df = DataFrame({
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
            'C':
            np.random.randn(8),
            'D':
            np.random.randn(8)
        })

        self.df_mixed_floats = DataFrame({
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
            'C':
            np.random.randn(8),
            'D':
            np.array(np.random.randn(8), dtype='float32')
        })

        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        self.mframe = DataFrame(np.random.randn(10, 3),
                                index=index,
                                columns=['A', 'B', 'C'])

        self.three_group = DataFrame({
            'A': [
                'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', 'foo',
                'foo', 'foo'
            ],
            'B': [
                'one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', 'two',
                'two', 'one'
            ],
            'C': [
                'dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
                'dull', 'shiny', 'shiny', 'shiny'
            ],
            'D':
            np.random.randn(11),
            'E':
            np.random.randn(11),
            'F':
            np.random.randn(11)
        })

    def test_agg_api(self):

        # GH 6337
        # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error
        # different api for agg when passed custom function with mixed frame

        df = DataFrame({
            'data1': np.random.randn(5),
            'data2': np.random.randn(5),
            'key1': ['a', 'a', 'b', 'b', 'a'],
            'key2': ['one', 'two', 'one', 'two', 'one']
        })
        grouped = df.groupby('key1')

        def peak_to_peak(arr):
            return arr.max() - arr.min()

        expected = grouped.agg([peak_to_peak])
        expected.columns = ['data1', 'data2']
        result = grouped.agg(peak_to_peak)
        assert_frame_equal(result, expected)

    def test_agg_regression1(self):
        grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month])
        result = grouped.agg(np.mean)
        expected = grouped.mean()
        assert_frame_equal(result, expected)

    def test_agg_datetimes_mixed(self):
        data = [[1, '2012-01-01', 1.0], [2, '2012-01-02', 2.0], [3, None, 3.0]]

        df1 = DataFrame({
            'key': [x[0] for x in data],
            'date': [x[1] for x in data],
            'value': [x[2] for x in data]
        })

        data = [[
            row[0],
            datetime.strptime(row[1], '%Y-%m-%d').date() if row[1] else None,
            row[2]
        ] for row in data]

        df2 = DataFrame({
            'key': [x[0] for x in data],
            'date': [x[1] for x in data],
            'value': [x[2] for x in data]
        })

        df1['weights'] = df1['value'] / df1['value'].sum()
        gb1 = df1.groupby('date').aggregate(np.sum)

        df2['weights'] = df1['value'] / df1['value'].sum()
        gb2 = df2.groupby('date').aggregate(np.sum)

        assert (len(gb1) == len(gb2))

    def test_agg_period_index(self):
        from pandas import period_range, PeriodIndex
        prng = period_range('2012-1-1', freq='M', periods=3)
        df = DataFrame(np.random.randn(3, 2), index=prng)
        rs = df.groupby(level=0).sum()
        tm.assertIsInstance(rs.index, PeriodIndex)

        # GH 3579
        index = period_range(start='1999-01', periods=5, freq='M')
        s1 = Series(np.random.rand(len(index)), index=index)
        s2 = Series(np.random.rand(len(index)), index=index)
        series = [('s1', s1), ('s2', s2)]
        df = DataFrame.from_items(series)
        grouped = df.groupby(df.index.month)
        list(grouped)

    def test_agg_dict_parameter_cast_result_dtypes(self):
        # GH 12821

        df = DataFrame({
            'class': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'],
            'time': date_range('1/1/2011', periods=8, freq='H')
        })
        df.loc[[0, 1, 2, 5], 'time'] = None

        # test for `first` function
        exp = df.loc[[0, 3, 4, 6]].set_index('class')
        grouped = df.groupby('class')
        assert_frame_equal(grouped.first(), exp)
        assert_frame_equal(grouped.agg('first'), exp)
        assert_frame_equal(grouped.agg({'time': 'first'}), exp)
        assert_series_equal(grouped.time.first(), exp['time'])
        assert_series_equal(grouped.time.agg('first'), exp['time'])

        # test for `last` function
        exp = df.loc[[0, 3, 4, 7]].set_index('class')
        grouped = df.groupby('class')
        assert_frame_equal(grouped.last(), exp)
        assert_frame_equal(grouped.agg('last'), exp)
        assert_frame_equal(grouped.agg({'time': 'last'}), exp)
        assert_series_equal(grouped.time.last(), exp['time'])
        assert_series_equal(grouped.time.agg('last'), exp['time'])

    def test_agg_must_agg(self):
        grouped = self.df.groupby('A')['C']
        self.assertRaises(Exception, grouped.agg, lambda x: x.describe())
        self.assertRaises(Exception, grouped.agg, lambda x: x.index[:2])

    def test_agg_ser_multi_key(self):
        # TODO(wesm): unused
        ser = self.df.C  # noqa

        f = lambda x: x.sum()
        results = self.df.C.groupby([self.df.A, self.df.B]).aggregate(f)
        expected = self.df.groupby(['A', 'B']).sum()['C']
        assert_series_equal(results, expected)

    def test_agg_apply_corner(self):
        # nothing to group, all NA
        grouped = self.ts.groupby(self.ts * np.nan)
        self.assertEqual(self.ts.dtype, np.float64)

        # groupby float64 values results in Float64Index
        exp = Series([],
                     dtype=np.float64,
                     index=pd.Index([], dtype=np.float64))
        assert_series_equal(grouped.sum(), exp)
        assert_series_equal(grouped.agg(np.sum), exp)
        assert_series_equal(grouped.apply(np.sum), exp, check_index_type=False)

        # DataFrame
        grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan)
        exp_df = DataFrame(columns=self.tsframe.columns,
                           dtype=float,
                           index=pd.Index([], dtype=np.float64))
        assert_frame_equal(grouped.sum(), exp_df, check_names=False)
        assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False)
        assert_frame_equal(grouped.apply(np.sum),
                           exp_df.iloc[:, :0],
                           check_names=False)

    def test_agg_grouping_is_list_tuple(self):
        from pandas.core.groupby import Grouping

        df = tm.makeTimeDataFrame()

        grouped = df.groupby(lambda x: x.year)
        grouper = grouped.grouper.groupings[0].grouper
        grouped.grouper.groupings[0] = Grouping(self.ts.index, list(grouper))

        result = grouped.agg(np.mean)
        expected = grouped.mean()
        tm.assert_frame_equal(result, expected)

        grouped.grouper.groupings[0] = Grouping(self.ts.index, tuple(grouper))

        result = grouped.agg(np.mean)
        expected = grouped.mean()
        tm.assert_frame_equal(result, expected)

    def test_aggregate_api_consistency(self):
        # GH 9052
        # make sure that the aggregates via dict
        # are consistent

        df = DataFrame({
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'two', 'two', 'two', 'one', 'two'],
            'C':
            np.random.randn(8) + 1.0,
            'D':
            np.arange(8)
        })

        grouped = df.groupby(['A', 'B'])
        c_mean = grouped['C'].mean()
        c_sum = grouped['C'].sum()
        d_mean = grouped['D'].mean()
        d_sum = grouped['D'].sum()

        result = grouped['D'].agg(['sum', 'mean'])
        expected = pd.concat([d_sum, d_mean], axis=1)
        expected.columns = ['sum', 'mean']
        assert_frame_equal(result, expected, check_like=True)

        result = grouped.agg([np.sum, np.mean])
        expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1)
        expected.columns = MultiIndex.from_product([['C', 'D'],
                                                    ['sum', 'mean']])
        assert_frame_equal(result, expected, check_like=True)

        result = grouped[['D', 'C']].agg([np.sum, np.mean])
        expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1)
        expected.columns = MultiIndex.from_product([['D', 'C'],
                                                    ['sum', 'mean']])
        assert_frame_equal(result, expected, check_like=True)

        result = grouped.agg({'C': 'mean', 'D': 'sum'})
        expected = pd.concat([d_sum, c_mean], axis=1)
        assert_frame_equal(result, expected, check_like=True)

        result = grouped.agg({'C': ['mean', 'sum'], 'D': ['mean', 'sum']})
        expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1)
        expected.columns = MultiIndex.from_product([['C', 'D'],
                                                    ['mean', 'sum']])

        result = grouped[['D', 'C']].agg({'r': np.sum, 'r2': np.mean})
        expected = pd.concat([d_sum, c_sum, d_mean, c_mean], axis=1)
        expected.columns = MultiIndex.from_product([['r', 'r2'], ['D', 'C']])
        assert_frame_equal(result, expected, check_like=True)

    def test_agg_compat(self):

        # GH 12334

        df = DataFrame({
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'two', 'two', 'two', 'one', 'two'],
            'C':
            np.random.randn(8) + 1.0,
            'D':
            np.arange(8)
        })

        g = df.groupby(['A', 'B'])

        expected = pd.concat([g['D'].sum(), g['D'].std()], axis=1)
        expected.columns = MultiIndex.from_tuples([('C', 'sum'), ('C', 'std')])
        result = g['D'].agg({'C': ['sum', 'std']})
        assert_frame_equal(result, expected, check_like=True)

        expected = pd.concat([g['D'].sum(), g['D'].std()], axis=1)
        expected.columns = ['C', 'D']
        result = g['D'].agg({'C': 'sum', 'D': 'std'})
        assert_frame_equal(result, expected, check_like=True)

    def test_agg_nested_dicts(self):

        # API change for disallowing these types of nested dicts
        df = DataFrame({
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'two', 'two', 'two', 'one', 'two'],
            'C':
            np.random.randn(8) + 1.0,
            'D':
            np.arange(8)
        })

        g = df.groupby(['A', 'B'])

        def f():
            g.aggregate({
                'r1': {
                    'C': ['mean', 'sum']
                },
                'r2': {
                    'D': ['mean', 'sum']
                }
            })

        self.assertRaises(SpecificationError, f)

        result = g.agg({
            'C': {
                'ra': ['mean', 'std']
            },
            'D': {
                'rb': ['mean', 'std']
            }
        })
        expected = pd.concat(
            [g['C'].mean(), g['C'].std(), g['D'].mean(), g['D'].std()], axis=1)
        expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'),
                                                      ('ra', 'std'),
                                                      ('rb', 'mean'),
                                                      ('rb', 'std')])
        assert_frame_equal(result, expected, check_like=True)

        # same name as the original column
        # GH9052
        expected = g['D'].agg({'result1': np.sum, 'result2': np.mean})
        expected = expected.rename(columns={'result1': 'D'})
        result = g['D'].agg({'D': np.sum, 'result2': np.mean})
        assert_frame_equal(result, expected, check_like=True)

    def test_agg_python_multiindex(self):
        grouped = self.mframe.groupby(['A', 'B'])

        result = grouped.agg(np.mean)
        expected = grouped.mean()
        tm.assert_frame_equal(result, expected)

    def test_aggregate_str_func(self):
        def _check_results(grouped):
            # single series
            result = grouped['A'].agg('std')
            expected = grouped['A'].std()
            assert_series_equal(result, expected)

            # group frame by function name
            result = grouped.aggregate('var')
            expected = grouped.var()
            assert_frame_equal(result, expected)

            # group frame by function dict
            result = grouped.agg(
                OrderedDict([['A', 'var'], ['B', 'std'], ['C', 'mean'],
                             ['D', 'sem']]))
            expected = DataFrame(
                OrderedDict([['A', grouped['A'].var()],
                             ['B', grouped['B'].std()],
                             ['C', grouped['C'].mean()],
                             ['D', grouped['D'].sem()]]))
            assert_frame_equal(result, expected)

        by_weekday = self.tsframe.groupby(lambda x: x.weekday())
        _check_results(by_weekday)

        by_mwkday = self.tsframe.groupby(
            [lambda x: x.month, lambda x: x.weekday()])
        _check_results(by_mwkday)

    def test_aggregate_item_by_item(self):

        df = self.df.copy()
        df['E'] = ['a'] * len(self.df)
        grouped = self.df.groupby('A')

        # API change in 0.11
        # def aggfun(ser):
        #     return len(ser + 'a')
        # result = grouped.agg(aggfun)
        # self.assertEqual(len(result.columns), 1)

        aggfun = lambda ser: ser.size
        result = grouped.agg(aggfun)
        foo = (self.df.A == 'foo').sum()
        bar = (self.df.A == 'bar').sum()
        K = len(result.columns)

        # GH5782
        # odd comparisons can result here, so cast to make easy
        exp = pd.Series(np.array([foo] * K),
                        index=list('BCD'),
                        dtype=np.float64,
                        name='foo')
        tm.assert_series_equal(result.xs('foo'), exp)

        exp = pd.Series(np.array([bar] * K),
                        index=list('BCD'),
                        dtype=np.float64,
                        name='bar')
        tm.assert_almost_equal(result.xs('bar'), exp)

        def aggfun(ser):
            return ser.size

        result = DataFrame().groupby(self.df.A).agg(aggfun)
        tm.assertIsInstance(result, DataFrame)
        self.assertEqual(len(result), 0)

    def test_agg_item_by_item_raise_typeerror(self):
        from numpy.random import randint

        df = DataFrame(randint(10, size=(20, 10)))

        def raiseException(df):
            pprint_thing('----------------------------------------')
            pprint_thing(df.to_string())
            raise TypeError

        self.assertRaises(TypeError, df.groupby(0).agg, raiseException)

    def test_series_agg_multikey(self):
        ts = tm.makeTimeSeries()
        grouped = ts.groupby([lambda x: x.year, lambda x: x.month])

        result = grouped.agg(np.sum)
        expected = grouped.sum()
        assert_series_equal(result, expected)

    def test_series_agg_multi_pure_python(self):
        data = DataFrame({
            'A': [
                'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', 'foo',
                'foo', 'foo'
            ],
            'B': [
                'one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', 'two',
                'two', 'one'
            ],
            'C': [
                'dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
                'dull', 'shiny', 'shiny', 'shiny'
            ],
            'D':
            np.random.randn(11),
            'E':
            np.random.randn(11),
            'F':
            np.random.randn(11)
        })

        def bad(x):
            assert (len(x.base) > 0)
            return 'foo'

        result = data.groupby(['A', 'B']).agg(bad)
        expected = data.groupby(['A', 'B']).agg(lambda x: 'foo')
        assert_frame_equal(result, expected)
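test_agg_nested_dicts in example #10 exercises dict-of-dict relabelling, part of which already raises SpecificationError; current pandas rejects these nested dicts entirely. The replacement since pandas 0.25 is named aggregation; a minimal sketch, where the output names are arbitrary:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar'],
                       'C': np.random.randn(4),
                       'D': np.arange(4)})
    result = df.groupby('A').agg(c_mean=('C', 'mean'),   # hypothetical output column names
                                 c_std=('C', 'std'),
                                 d_sum=('D', 'sum'))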
Code example #11
_frame = DataFrame(randn(10000, 4), columns=list('ABCD'), dtype='float64')
_frame2 = DataFrame(randn(100, 4), columns=list('ABCD'), dtype='float64')
_mixed = DataFrame({'A': _frame['A'].copy(),
                    'B': _frame['B'].astype('float32'),
                    'C': _frame['C'].astype('int64'),
                    'D': _frame['D'].astype('int32')})
_mixed2 = DataFrame({'A': _frame2['A'].copy(),
                     'B': _frame2['B'].astype('float32'),
                     'C': _frame2['C'].astype('int64'),
                     'D': _frame2['D'].astype('int32')})
_integer = DataFrame(
    np.random.randint(1, 100,
                      size=(10001, 4)), columns=list('ABCD'), dtype='int64')
_integer2 = DataFrame(np.random.randint(1, 100, size=(101, 4)),
                      columns=list('ABCD'), dtype='int64')
_frame_panel = Panel(dict(ItemA=_frame.copy(), ItemB=(
    _frame.copy() + 3), ItemC=_frame.copy(), ItemD=_frame.copy()))
_frame2_panel = Panel(dict(ItemA=_frame2.copy(), ItemB=(_frame2.copy() + 3),
                           ItemC=_frame2.copy(), ItemD=_frame2.copy()))
_integer_panel = Panel(dict(ItemA=_integer, ItemB=(_integer + 34).astype(
    'int64')))
_integer2_panel = Panel(dict(ItemA=_integer2, ItemB=(_integer2 + 34).astype(
    'int64')))
_mixed_panel = Panel(dict(ItemA=_mixed, ItemB=(_mixed + 3)))
_mixed2_panel = Panel(dict(ItemA=_mixed2, ItemB=(_mixed2 + 3)))


class TestExpressions(tm.TestCase):

    _multiprocess_can_split_ = False
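The fixtures in example #11 feed pandas' numexpr-backed expression tests; note that the Panel fixtures require a pandas version that still ships Panel (removed in pandas 1.0). A minimal sketch of the user-facing side of that machinery, assuming numexpr is installed:

    import numpy as np
    import pandas as pd

    frame = pd.DataFrame(np.random.randn(10000, 4), columns=list('ABCD'))
    fast = pd.eval('frame.A + frame.B * frame.C')   # evaluated via numexpr when available
    same = frame.eval('A + B * C')                  # same expression, names resolved inside the frame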
Code example #12
class PandasGroupby(SparklingPandasTestCase):
    def setUp(self):
        """
        Setup the dataframes used for the groupby tests derived from pandas
        """
        self.dateRange = bdate_range('1/1/2005', periods=250)
        self.stringIndex = Index([rands(8).upper() for x in range(250)])

        self.groupId = Series([x[0] for x in self.stringIndex],
                              index=self.stringIndex)
        self.groupDict = dict(
            (k, v) for k, v in compat.iteritems(self.groupId))

        self.columnIndex = Index(['A', 'B', 'C', 'D', 'E'])

        randMat = np.random.randn(250, 5)
        self.stringMatrix = DataFrame(randMat,
                                      columns=self.columnIndex,
                                      index=self.stringIndex)

        self.timeMatrix = DataFrame(randMat,
                                    columns=self.columnIndex,
                                    index=self.dateRange)
        self.ts = tm.makeTimeSeries()

        self.seriesd = tm.getSeriesData()
        self.tsd = tm.getTimeSeriesData()
        self.frame = DataFrame(self.seriesd)
        self.tsframe = DataFrame(self.tsd)

        self.df = DataFrame({
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
            'C':
            np.random.randn(8),
            'D':
            np.random.randn(8)
        })

        self.df_mixed_floats = DataFrame({
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
            'C':
            np.random.randn(8),
            'D':
            np.array(np.random.randn(8), dtype='float32')
        })

        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        self.mframe = DataFrame(np.random.randn(10, 3),
                                index=index,
                                columns=['A', 'B', 'C'])

        self.three_group = DataFrame({
            'A': [
                'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', 'foo',
                'foo', 'foo'
            ],
            'B': [
                'one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', 'two',
                'two', 'one'
            ],
            'C': [
                'dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
                'dull', 'shiny', 'shiny', 'shiny'
            ],
            'D':
            np.random.randn(11),
            'E':
            np.random.randn(11),
            'F':
            np.random.randn(11)
        })
        super(self.__class__, self).setUp()

    def test_first_last_nth(self):
        # tests for first / last / nth
        ddf = self.psc.from_data_frame(self.df)
        assert_frame_equal(ddf.collect(), self.df)
        grouped = self.psc.from_data_frame(self.df).groupby('A')
        first = grouped.first().collect()
        expected = self.df.ix[[1, 0], ['B', 'C', 'D']]
        expected.index = Index(['bar', 'foo'], name='A')
        expected = expected.sort_index()
        assert_frame_equal(first, expected)

        nth = grouped.nth(0).collect()
        assert_frame_equal(nth, expected)

        last = grouped.last().collect()
        expected = self.df.ix[[5, 7], ['B', 'C', 'D']]
        expected.index = Index(['bar', 'foo'], name='A')
        assert_frame_equal(last, expected)

        nth = grouped.nth(-1).collect()
        assert_frame_equal(nth, expected)

        nth = grouped.nth(1).collect()
        expected = self.df.ix[[2, 3], ['B', 'C', 'D']].copy()
        expected.index = Index(['foo', 'bar'], name='A')
        expected = expected.sort_index()
        assert_frame_equal(nth, expected)

    @unittest2.expectedFailure
    def test_getitem(self):
        # it works!
        grouped['B'].first()
        grouped['B'].last()
        grouped['B'].nth(0)

        self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan
        self.assertTrue(com.isnull(grouped['B'].first()['foo']))
        self.assertTrue(com.isnull(grouped['B'].last()['foo']))
        # not sure what this is testing
        self.assertTrue(com.isnull(grouped['B'].nth(0)[0]))

    @unittest2.expectedFailure
    def test_new_in0140(self):
        """
        Test new functionality in 0.14.0. This currently doesn't work.
        """
        # v0.14.0 whatsnew
        df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
        ddf = self.psc.from_data_frame(df)
        g = ddf.groupby('A')
        result = g.first().collect()
        expected = df.iloc[[1, 2]].set_index('A')
        assert_frame_equal(result, expected)

        expected = df.iloc[[1, 2]].set_index('A')
        result = g.nth(0, dropna='any').collect()
        assert_frame_equal(result, expected)

    @unittest2.expectedFailure
    def test_first_last_nth_dtypes(self):
        """
        We do groupby fine on mixed types, but our copy from local dataframe
        ends up re-running the guess type function, so the dtypes don't match.
        Issue #25
        """
        df = self.df_mixed_floats.copy()
        df['E'] = True
        df['F'] = 1

        # tests for first / last / nth
        grouped = ddf.groupby('A')
        first = grouped.first().collect()
        expected = df.ix[[1, 0], ['B', 'C', 'D', 'E', 'F']]
        expected.index = Index(['bar', 'foo'], name='A')
        expected = expected.sort_index()
        assert_frame_equal(first, expected)

        last = grouped.last().collect()
        expected = df.ix[[5, 7], ['B', 'C', 'D', 'E', 'F']]
        expected.index = Index(['bar', 'foo'], name='A')
        expected = expected.sort_index()
        assert_frame_equal(last, expected)

        nth = grouped.nth(1).collect()
        expected = df.ix[[3, 2], ['B', 'C', 'D', 'E', 'F']]
        expected.index = Index(['bar', 'foo'], name='A')
        expected = expected.sort_index()
        assert_frame_equal(nth, expected)

    def test_var_on_multiplegroups(self):
        df = DataFrame({
            'data1': np.random.randn(5),
            'data2': np.random.randn(5),
            'data3': np.random.randn(5),
            'key1': ['a', 'a', 'b', 'b', 'a'],
            'key2': ['one', 'two', 'one', 'two', 'one']
        })
        ddf = self.psc.from_data_frame(df)
        dgrouped = ddf.groupby(['key1', 'key2'])
        grouped = df.groupby(['key1', 'key2'])
        assert_frame_equal(dgrouped.var().collect(), grouped.var())

    def test_agg_api(self):
        # Note: needs a very recent version of pandas to pass
        # TODO(holden): Pass this test if local fails
        # GH 6337
        # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error
        # different api for agg when passed custom function with mixed frame

        df = DataFrame({
            'data1': np.random.randn(5),
            'data2': np.random.randn(5),
            'key1': ['a', 'a', 'b', 'b', 'a'],
            'key2': ['one', 'two', 'one', 'two', 'one']
        })
        ddf = self.psc.from_data_frame(df)
        dgrouped = ddf.groupby('key1')
        grouped = df.groupby('key1')

        def peak_to_peak(arr):
            return arr.max() - arr.min()

        expected = grouped.agg([peak_to_peak])
        expected.columns = ['data1', 'data2']
        result = dgrouped.agg(peak_to_peak).collect()
        assert_frame_equal(result, expected)

    def test_agg_regression1(self):
        grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month])
        dgrouped = self.psc.from_data_frame(self.tsframe).groupby(
            [lambda x: x.year, lambda x: x.month])
        result = dgrouped.agg(np.mean).collect()
        expected = grouped.agg(np.mean)
        assert_frame_equal(result, expected)
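The SparklingPandas tests in example #12 mirror plain-pandas GroupBy behaviour through .collect(). A minimal local-pandas sketch of the first/last/nth calls being ported, assuming a recent pandas version:

    import pandas as pd

    df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
                       'B': range(8)})
    grouped = df.groupby('A')
    first = grouped.first()   # first non-null value per column in each group
    last = grouped.last()     # last non-null value per column in each group
    nth0 = grouped.nth(0)     # first row per group by position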
Code example #13
File: test_moments.py  Project: ashokez/pandas
class TestMoments(unittest.TestCase):

    _nan_locs = np.arange(20, 40)
    _inf_locs = np.array([])

    def setUp(self):
        arr = randn(N)
        arr[self._nan_locs] = np.NaN

        self.arr = arr
        self.rng = DateRange(datetime(2009, 1, 1), periods=N)

        self.series = Series(arr.copy(), index=self.rng)

        self.frame = DataFrame(randn(N, K), index=self.rng,
                               columns=np.arange(K))

    def test_rolling_sum(self):
        self._check_moment_func(mom.rolling_sum, np.sum)

    def test_rolling_count(self):
        counter = lambda x: np.isfinite(x).astype(float).sum()
        self._check_moment_func(mom.rolling_count, counter,
                                has_min_periods=False,
                                preserve_nan=False)

    def test_rolling_mean(self):
        self._check_moment_func(mom.rolling_mean, np.mean)

    def test_rolling_median(self):
        self._check_moment_func(mom.rolling_median, np.median)

    def test_rolling_min(self):
        self._check_moment_func(mom.rolling_min, np.min)

    def test_rolling_max(self):
        self._check_moment_func(mom.rolling_max, np.max)

    def test_rolling_quantile(self):
        qs = [.1, .5, .9]

        def scoreatpercentile(a, per):
            values = np.sort(a,axis=0)

            idx = per /1. * (values.shape[0] - 1)
            return values[int(idx)]

        for q in qs:
            def f(x, window, min_periods=None, time_rule=None):
                return mom.rolling_quantile(x, window, q,
                                                min_periods=min_periods,
                                                time_rule=time_rule)
            def alt(x):
                return scoreatpercentile(x, q)

            self._check_moment_func(f, alt)

    def test_rolling_apply(self):
        def roll_mean(x, window, min_periods=None, time_rule=None):
            return mom.rolling_apply(x, window,
                                         lambda x: x[np.isfinite(x)].mean(),
                                         min_periods=min_periods,
                                         time_rule=time_rule)
        self._check_moment_func(roll_mean, np.mean)

    def test_rolling_std(self):
        self._check_moment_func(mom.rolling_std,
                                lambda x: np.std(x, ddof=1))

    def test_rolling_var(self):
        self._check_moment_func(mom.rolling_var,
                                lambda x: np.var(x, ddof=1))

    def test_rolling_skew(self):
        try:
            from scipy.stats import skew
        except ImportError:
            raise nose.SkipTest('no scipy')
        self._check_moment_func(mom.rolling_skew,
                                lambda x: skew(x, bias=False))

    def test_rolling_kurt(self):
        try:
            from scipy.stats import kurtosis
        except ImportError:
            raise nose.SkipTest('no scipy')
        self._check_moment_func(mom.rolling_kurt,
                                lambda x: kurtosis(x, bias=False))

    def _check_moment_func(self, func, static_comp, window=50,
                           has_min_periods=True,
                           has_time_rule=True,
                           preserve_nan=True):

        self._check_ndarray(func, static_comp, window=window,
                            has_min_periods=has_min_periods,
                            preserve_nan=preserve_nan)

        self._check_structures(func, static_comp,
                               has_min_periods=has_min_periods,
                               has_time_rule=has_time_rule)

    def _check_ndarray(self, func, static_comp, window=50,
                       has_min_periods=True,
                       preserve_nan=True):

        result = func(self.arr, window)
        assert_almost_equal(result[-1],
                            static_comp(self.arr[-50:]))

        if preserve_nan:
            assert(np.isnan(result[self._nan_locs]).all())

        # excluding NaNs correctly
        arr = randn(50)
        arr[:10] = np.NaN
        arr[-10:] = np.NaN

        if has_min_periods:
            result = func(arr, 50, min_periods=30)
            assert_almost_equal(result[-1], static_comp(arr[10:-10]))

            # min_periods is working correctly
            result = func(arr, 20, min_periods=15)
            self.assert_(np.isnan(result[23]))
            self.assert_(not np.isnan(result[24]))

            self.assert_(not np.isnan(result[-6]))
            self.assert_(np.isnan(result[-5]))

            # min_periods=0
            result0 = func(arr, 20, min_periods=0)
            result1 = func(arr, 20, min_periods=1)
            assert_almost_equal(result0, result1)
        else:
            result = func(arr, 50)
            assert_almost_equal(result[-1], static_comp(arr[10:-10]))

    def _check_structures(self, func, static_comp,
                          has_min_periods=True, has_time_rule=True):

        series_result = func(self.series, 50)
        self.assert_(isinstance(series_result, Series))

        frame_result = func(self.frame, 50)
        self.assertEquals(type(frame_result), DataFrame)

        # check time_rule works
        if has_time_rule:
            win = 25
            minp = 10

            if has_min_periods:
                series_result = func(self.series[::2], win, min_periods=minp,
                                     time_rule='WEEKDAY')
                frame_result = func(self.frame[::2], win, min_periods=minp,
                                    time_rule='WEEKDAY')
            else:
                series_result = func(self.series[::2], win, time_rule='WEEKDAY')
                frame_result = func(self.frame[::2], win, time_rule='WEEKDAY')

            last_date = series_result.index[-1]
            prev_date = last_date - 24 * datetools.bday

            trunc_series = self.series[::2].truncate(prev_date, last_date)
            trunc_frame = self.frame[::2].truncate(prev_date, last_date)

            assert_almost_equal(series_result[-1], static_comp(trunc_series))

            assert_almost_equal(frame_result.xs(last_date),
                                trunc_frame.apply(static_comp))

    def test_ewma(self):
        self._check_ew(mom.ewma)

    def test_ewmvar(self):
        self._check_ew(mom.ewmvar)

    def test_ewmvol(self):
        self._check_ew(mom.ewmvol)

    def test_ewma_span_com_args(self):
        A = mom.ewma(self.arr, com=9.5)
        B = mom.ewma(self.arr, span=20)
        assert_almost_equal(A, B)

        self.assertRaises(Exception, mom.ewma, self.arr, com=9.5, span=20)
        self.assertRaises(Exception, mom.ewma, self.arr)

    def _check_ew(self, func):
        self._check_ew_ndarray(func)
        self._check_ew_structures(func)

    def _check_ew_ndarray(self, func, preserve_nan=False):
        result = func(self.arr, com=10)
        if preserve_nan:
            assert(np.isnan(result[self._nan_locs]).all())

        # excluding NaNs correctly
        arr = randn(50)
        arr[:10] = np.NaN
        arr[-10:] = np.NaN

        # ??? check something

        # pass in ints
        result2 = func(np.arange(50), span=10)
        self.assert_(result.dtype == np.float_)

    def _check_ew_structures(self, func):
        series_result = func(self.series, com=10)
        self.assert_(isinstance(series_result, Series))
        frame_result = func(self.frame, com=10)
        self.assertEquals(type(frame_result), DataFrame)

    # binary moments
    def test_rolling_cov(self):
        A = self.series
        B = A + randn(len(A))

        result = mom.rolling_cov(A, B, 50, min_periods=25)
        assert_almost_equal(result[-1], np.cov(A[-50:], B[-50:])[0, 1])

    def test_rolling_corr(self):
        A = self.series
        B = A + randn(len(A))

        result = mom.rolling_corr(A, B, 50, min_periods=25)
        assert_almost_equal(result[-1], np.corrcoef(A[-50:], B[-50:])[0, 1])

        # test for correct bias correction
        a = tm.makeTimeSeries()
        b = tm.makeTimeSeries()
        a[:5] = np.nan
        b[:10] = np.nan

        result = mom.rolling_corr(a, b, len(a), min_periods=1)
        assert_almost_equal(result[-1], a.corr(b))

    def test_rolling_corr_pairwise(self):
        panel = mom.rolling_corr_pairwise(self.frame, 10, min_periods=5)

        correl = panel.ix[:, 1, 5]
        exp = mom.rolling_corr(self.frame[1], self.frame[5],
                               10, min_periods=5)
        tm.assert_series_equal(correl, exp)

    def test_flex_binary_frame(self):
        def _check(method):
            series = self.frame[1]

            res = method(series, self.frame, 10)
            res2 = method(self.frame, series, 10)
            exp = self.frame.apply(lambda x: method(series, x, 10))

            tm.assert_frame_equal(res, exp)
            tm.assert_frame_equal(res2, exp)

            frame2 = self.frame.copy()
            frame2.values[:] = np.random.randn(*frame2.shape)

            res3 = method(self.frame, frame2, 10)
            exp = DataFrame(dict((k, method(self.frame[k], frame2[k], 10))
                                 for k in self.frame))
            tm.assert_frame_equal(res3, exp)

        methods = [mom.rolling_corr, mom.rolling_cov]
        for meth in methods:
            _check(meth)

    def test_ewmcov(self):
        self._check_binary_ew(mom.ewmcov)

    def test_ewmcorr(self):
        self._check_binary_ew(mom.ewmcorr)

    def _check_binary_ew(self, func):
        A = Series(randn(50), index=np.arange(50))
        B = A[2:] + randn(48)

        A[:10] = np.NaN
        B[-10:] = np.NaN

        result = func(A, B, 20, min_periods=5)

        self.assert_(np.isnan(result[:15]).all())
        self.assert_(not np.isnan(result[15:]).any())

        self.assertRaises(Exception, func, A, randn(50), 20, min_periods=5)
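The module-level moment functions used in example #13 (mom.rolling_mean, mom.ewma, mom.rolling_corr, ...) were later deprecated and removed in favour of the .rolling/.ewm accessors. A minimal sketch of the equivalent calls, assuming pandas >= 0.18:

    import numpy as np
    import pandas as pd

    s = pd.Series(np.random.randn(100))
    roll_mean = s.rolling(window=50, min_periods=30).mean()            # was mom.rolling_mean(s, 50, min_periods=30)
    roll_corr = s.rolling(window=50, min_periods=25).corr(s.shift(1))  # was mom.rolling_corr(...)
    ewma = s.ewm(com=10).mean()                                        # was mom.ewma(s, com=10)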
Code example #14
File: test_indexing.py  Project: rgbkrk/pandas
    def test_partial_setting(self):

        # GH2578, allow ix and friends to partially set

        ### series ###
        s_orig = Series([1,2,3])

        s = s_orig.copy()
        s[5] = 5
        expected = Series([1,2,3,5],index=[0,1,2,5])
        assert_series_equal(s,expected)

        s = s_orig.copy()
        s.loc[5] = 5
        expected = Series([1,2,3,5],index=[0,1,2,5])
        assert_series_equal(s,expected)

        s = s_orig.copy()
        s[5] = 5.
        expected = Series([1,2,3,5.],index=[0,1,2,5])
        assert_series_equal(s,expected)

        s = s_orig.copy()
        s.loc[5] = 5.
        expected = Series([1,2,3,5.],index=[0,1,2,5])
        assert_series_equal(s,expected)

        # iloc/iat raise
        s = s_orig.copy()
        def f():
            s.iloc[3] = 5.
        self.assertRaises(IndexError, f)
        def f():
            s.iat[3] = 5.
        self.assertRaises(IndexError, f)

        ### frame ###

        df_orig = DataFrame(np.arange(6).reshape(3,2),columns=['A','B'])

        # iloc/iat raise
        df = df_orig.copy()
        def f():
            df.iloc[4,2] = 5.
        self.assertRaises(IndexError, f)
        def f():
            df.iat[4,2] = 5.
        self.assertRaises(IndexError, f)

        # row setting where it exists
        expected = DataFrame(dict({ 'A' : [0,4,4], 'B' : [1,5,5] }))
        df = df_orig.copy()
        df.iloc[1] = df.iloc[2]
        assert_frame_equal(df,expected)

        expected = DataFrame(dict({ 'A' : [0,4,4], 'B' : [1,5,5] }))
        df = df_orig.copy()
        df.loc[1] = df.loc[2]
        assert_frame_equal(df,expected)

        expected = DataFrame(dict({ 'A' : [0,2,4,4], 'B' : [1,3,5,5] }),dtype='float64')
        df = df_orig.copy()
        df.loc[3] = df.loc[2]
        assert_frame_equal(df,expected)

        # single dtype frame, overwrite
        expected = DataFrame(dict({ 'A' : [0,2,4], 'B' : [0,2,4] }))
        df = df_orig.copy()
        df.ix[:,'B'] = df.ix[:,'A']
        assert_frame_equal(df,expected)

        # mixed dtype frame, overwrite
        expected = DataFrame(dict({ 'A' : [0,2,4], 'B' : Series([0.,2.,4.]) }))
        df = df_orig.copy()
        df['B'] = df['B'].astype(np.float64)
        df.ix[:,'B'] = df.ix[:,'A']
        assert_frame_equal(df,expected)

        # single dtype frame, partial setting
        expected = df_orig.copy()
        expected['C'] = df['A'].astype(np.float64)
        df = df_orig.copy()
        df.ix[:,'C'] = df.ix[:,'A']
        assert_frame_equal(df,expected)

        # mixed frame, partial setting
        expected = df_orig.copy()
        expected['C'] = df['A'].astype(np.float64)
        df = df_orig.copy()
        df.ix[:,'C'] = df.ix[:,'A']
        assert_frame_equal(df,expected)

        ### panel ###
        p_orig = Panel(np.arange(16).reshape(2,4,2),items=['Item1','Item2'],major_axis=pd.date_range('2001/1/12',periods=4),minor_axis=['A','B'],dtype='float64')

        # panel setting via item
        p_orig = Panel(np.arange(16).reshape(2,4,2),items=['Item1','Item2'],major_axis=pd.date_range('2001/1/12',periods=4),minor_axis=['A','B'],dtype='float64')
        expected = p_orig.copy()
        expected['Item3'] = expected['Item1']
        p = p_orig.copy()
        p.loc['Item3'] = p['Item1']
        assert_panel_equal(p,expected)

        # panel with aligned series
        expected = p_orig.copy()
        expected = expected.transpose(2,1,0)
        expected['C'] = DataFrame({ 'Item1' : [30,30,30,30], 'Item2' : [32,32,32,32] },index=p_orig.major_axis)
        expected = expected.transpose(2,1,0)
        p = p_orig.copy()
        p.loc[:,:,'C'] = Series([30,32],index=p_orig.items)
        assert_panel_equal(p,expected)
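Example #14 tests setting with enlargement: .loc (and plain []) may grow a Series or DataFrame, while .iloc/.iat raise IndexError for out-of-bounds positions; the Panel part no longer applies in current pandas, since Panel was removed. A minimal sketch of the enlargement cases, assuming a recent pandas version:

    import pandas as pd

    s = pd.Series([1, 2, 3])
    s.loc[5] = 5                      # enlarges the Series: index becomes [0, 1, 2, 5]

    df = pd.DataFrame({'A': [0, 2, 4], 'B': [1, 3, 5]})
    df.loc[3] = df.loc[2]             # appends a new row labelled 3
    df['C'] = df['A'].astype(float)   # adds a new column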
Code example #15
File: test_aggregate.py  Project: AlexisMignon/pandas
class TestGroupByAggregate(tm.TestCase):

    _multiprocess_can_split_ = True

    def setUp(self):
        self.ts = tm.makeTimeSeries()

        self.seriesd = tm.getSeriesData()
        self.tsd = tm.getTimeSeriesData()
        self.frame = DataFrame(self.seriesd)
        self.tsframe = DataFrame(self.tsd)

        self.df = DataFrame(
            {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
             'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
             'C': np.random.randn(8),
             'D': np.random.randn(8)})

        self.df_mixed_floats = DataFrame(
            {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
             'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
             'C': np.random.randn(8),
             'D': np.array(
                 np.random.randn(8), dtype='float32')})

        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two',
                                                                  'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        self.mframe = DataFrame(np.random.randn(10, 3), index=index,
                                columns=['A', 'B', 'C'])

        self.three_group = DataFrame(
            {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
                   'foo', 'foo', 'foo'],
             'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
                   'two', 'two', 'one'],
             'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
                   'dull', 'shiny', 'shiny', 'shiny'],
             'D': np.random.randn(11),
             'E': np.random.randn(11),
             'F': np.random.randn(11)})

    def test_agg_api(self):

        # GH 6337
        # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error
        # different api for agg when passed custom function with mixed frame

        df = DataFrame({'data1': np.random.randn(5),
                        'data2': np.random.randn(5),
                        'key1': ['a', 'a', 'b', 'b', 'a'],
                        'key2': ['one', 'two', 'one', 'two', 'one']})
        grouped = df.groupby('key1')

        def peak_to_peak(arr):
            return arr.max() - arr.min()

        expected = grouped.agg([peak_to_peak])
        expected.columns = ['data1', 'data2']
        result = grouped.agg(peak_to_peak)
        assert_frame_equal(result, expected)
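
    # Added sketch (not part of the original suite): the expected.columns
    # flattening in test_agg_api is needed because agg([func]) labels its
    # result with a (column, function-name) MultiIndex, while agg(func)
    # keeps the plain column names.
    def _sketch_agg_column_naming(self):
        df = DataFrame({'data1': np.random.randn(5),
                        'data2': np.random.randn(5),
                        'key1': ['a', 'a', 'b', 'b', 'a']})

        def peak_to_peak(arr):
            return arr.max() - arr.min()

        grouped = df.groupby('key1')
        wrapped = grouped.agg([peak_to_peak])  # MultiIndex columns
        plain = grouped.agg(peak_to_peak)      # flat columns ['data1', 'data2']
        return wrapped.columns, plain.columns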

    def test_agg_regression1(self):
        grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month])
        result = grouped.agg(np.mean)
        expected = grouped.mean()
        assert_frame_equal(result, expected)

    def test_agg_datetimes_mixed(self):
        data = [[1, '2012-01-01', 1.0], [2, '2012-01-02', 2.0], [3, None, 3.0]]

        df1 = DataFrame({'key': [x[0] for x in data],
                         'date': [x[1] for x in data],
                         'value': [x[2] for x in data]})

        data = [[row[0], datetime.strptime(row[1], '%Y-%m-%d').date() if row[1]
                 else None, row[2]] for row in data]

        df2 = DataFrame({'key': [x[0] for x in data],
                         'date': [x[1] for x in data],
                         'value': [x[2] for x in data]})

        df1['weights'] = df1['value'] / df1['value'].sum()
        gb1 = df1.groupby('date').aggregate(np.sum)

        df2['weights'] = df1['value'] / df1['value'].sum()
        gb2 = df2.groupby('date').aggregate(np.sum)

        assert (len(gb1) == len(gb2))

    def test_agg_period_index(self):
        from pandas import period_range, PeriodIndex
        prng = period_range('2012-1-1', freq='M', periods=3)
        df = DataFrame(np.random.randn(3, 2), index=prng)
        rs = df.groupby(level=0).sum()
        tm.assertIsInstance(rs.index, PeriodIndex)

        # GH 3579
        index = period_range(start='1999-01', periods=5, freq='M')
        s1 = Series(np.random.rand(len(index)), index=index)
        s2 = Series(np.random.rand(len(index)), index=index)
        series = [('s1', s1), ('s2', s2)]
        df = DataFrame.from_items(series)
        grouped = df.groupby(df.index.month)
        list(grouped)
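
    # Added sketch (not part of the original suite): the GH 3579 case above in
    # plain form -- grouping a PeriodIndex'd frame by a derived attribute such
    # as the month.
    def _sketch_groupby_period_month(self):
        from pandas import period_range
        idx = period_range(start='1999-01', periods=5, freq='M')
        df = DataFrame({'x': np.arange(5.0)}, index=idx)
        return df.groupby(df.index.month).sum()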

    def test_agg_dict_parameter_cast_result_dtypes(self):
        # GH 12821

        df = DataFrame(
            {'class': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'],
             'time': date_range('1/1/2011', periods=8, freq='H')})
        df.loc[[0, 1, 2, 5], 'time'] = None

        # test for `first` function
        exp = df.loc[[0, 3, 4, 6]].set_index('class')
        grouped = df.groupby('class')
        assert_frame_equal(grouped.first(), exp)
        assert_frame_equal(grouped.agg('first'), exp)
        assert_frame_equal(grouped.agg({'time': 'first'}), exp)
        assert_series_equal(grouped.time.first(), exp['time'])
        assert_series_equal(grouped.time.agg('first'), exp['time'])

        # test for `last` function
        exp = df.loc[[0, 3, 4, 7]].set_index('class')
        grouped = df.groupby('class')
        assert_frame_equal(grouped.last(), exp)
        assert_frame_equal(grouped.agg('last'), exp)
        assert_frame_equal(grouped.agg({'time': 'last'}), exp)
        assert_series_equal(grouped.time.last(), exp['time'])
        assert_series_equal(grouped.time.agg('last'), exp['time'])
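
    # Added sketch (not part of the original suite): the point of GH 12821 is
    # that 'first'/'last' passed through a dict should keep the datetime64
    # dtype instead of degrading to object; a minimal check of that idea.
    def _sketch_first_keeps_datetime_dtype(self):
        df = DataFrame({'class': ['A', 'A', 'B'],
                        'time': date_range('1/1/2011', periods=3, freq='H')})
        out = df.groupby('class').agg({'time': 'first'})
        return out['time'].dtype  # expected: datetime64[ns]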

    def test_agg_must_agg(self):
        grouped = self.df.groupby('A')['C']
        self.assertRaises(Exception, grouped.agg, lambda x: x.describe())
        self.assertRaises(Exception, grouped.agg, lambda x: x.index[:2])

    def test_agg_ser_multi_key(self):
        # TODO(wesm): unused
        ser = self.df.C  # noqa

        f = lambda x: x.sum()
        results = self.df.C.groupby([self.df.A, self.df.B]).aggregate(f)
        expected = self.df.groupby(['A', 'B']).sum()['C']
        assert_series_equal(results, expected)

    def test_agg_apply_corner(self):
        # nothing to group, all NA
        grouped = self.ts.groupby(self.ts * np.nan)
        self.assertEqual(self.ts.dtype, np.float64)

        # groupby float64 values results in Float64Index
        exp = Series([], dtype=np.float64, index=pd.Index(
            [], dtype=np.float64))
        assert_series_equal(grouped.sum(), exp)
        assert_series_equal(grouped.agg(np.sum), exp)
        assert_series_equal(grouped.apply(np.sum), exp, check_index_type=False)

        # DataFrame
        grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan)
        exp_df = DataFrame(columns=self.tsframe.columns, dtype=float,
                           index=pd.Index([], dtype=np.float64))
        assert_frame_equal(grouped.sum(), exp_df, check_names=False)
        assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False)
        assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0],
                           check_names=False)

    def test_agg_grouping_is_list_tuple(self):
        from pandas.core.groupby import Grouping

        df = tm.makeTimeDataFrame()

        grouped = df.groupby(lambda x: x.year)
        grouper = grouped.grouper.groupings[0].grouper
        grouped.grouper.groupings[0] = Grouping(self.ts.index, list(grouper))

        result = grouped.agg(np.mean)
        expected = grouped.mean()
        tm.assert_frame_equal(result, expected)

        grouped.grouper.groupings[0] = Grouping(self.ts.index, tuple(grouper))

        result = grouped.agg(np.mean)
        expected = grouped.mean()
        tm.assert_frame_equal(result, expected)

    def test_aggregate_api_consistency(self):
        # GH 9052
        # make sure that the aggregates via dict
        # are consistent

        df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                              'foo', 'bar', 'foo', 'foo'],
                        'B': ['one', 'one', 'two', 'two',
                              'two', 'two', 'one', 'two'],
                        'C': np.random.randn(8) + 1.0,
                        'D': np.arange(8)})

        grouped = df.groupby(['A', 'B'])
        c_mean = grouped['C'].mean()
        c_sum = grouped['C'].sum()
        d_mean = grouped['D'].mean()
        d_sum = grouped['D'].sum()

        result = grouped['D'].agg(['sum', 'mean'])
        expected = pd.concat([d_sum, d_mean],
                             axis=1)
        expected.columns = ['sum', 'mean']
        assert_frame_equal(result, expected, check_like=True)

        result = grouped.agg([np.sum, np.mean])
        expected = pd.concat([c_sum,
                              c_mean,
                              d_sum,
                              d_mean],
                             axis=1)
        expected.columns = MultiIndex.from_product([['C', 'D'],
                                                    ['sum', 'mean']])
        assert_frame_equal(result, expected, check_like=True)

        result = grouped[['D', 'C']].agg([np.sum, np.mean])
        expected = pd.concat([d_sum,
                              d_mean,
                              c_sum,
                              c_mean],
                             axis=1)
        expected.columns = MultiIndex.from_product([['D', 'C'],
                                                    ['sum', 'mean']])
        assert_frame_equal(result, expected, check_like=True)

        result = grouped.agg({'C': 'mean', 'D': 'sum'})
        expected = pd.concat([d_sum,
                              c_mean],
                             axis=1)
        assert_frame_equal(result, expected, check_like=True)

        result = grouped.agg({'C': ['mean', 'sum'],
                              'D': ['mean', 'sum']})
        expected = pd.concat([c_mean,
                              c_sum,
                              d_mean,
                              d_sum],
                             axis=1)
        expected.columns = MultiIndex.from_product([['C', 'D'],
                                                    ['mean', 'sum']])
        assert_frame_equal(result, expected, check_like=True)

        result = grouped[['D', 'C']].agg({'r': np.sum,
                                          'r2': np.mean})
        expected = pd.concat([d_sum,
                              c_sum,
                              d_mean,
                              c_mean],
                             axis=1)
        expected.columns = MultiIndex.from_product([['r', 'r2'],
                                                    ['D', 'C']])
        assert_frame_equal(result, expected, check_like=True)

    def test_agg_compat(self):

        # GH 12334

        df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                              'foo', 'bar', 'foo', 'foo'],
                        'B': ['one', 'one', 'two', 'two',
                              'two', 'two', 'one', 'two'],
                        'C': np.random.randn(8) + 1.0,
                        'D': np.arange(8)})

        g = df.groupby(['A', 'B'])

        expected = pd.concat([g['D'].sum(),
                              g['D'].std()],
                             axis=1)
        expected.columns = MultiIndex.from_tuples([('C', 'sum'),
                                                   ('C', 'std')])
        result = g['D'].agg({'C': ['sum', 'std']})
        assert_frame_equal(result, expected, check_like=True)

        expected = pd.concat([g['D'].sum(),
                              g['D'].std()],
                             axis=1)
        expected.columns = ['C', 'D']
        result = g['D'].agg({'C': 'sum', 'D': 'std'})
        assert_frame_equal(result, expected, check_like=True)

    def test_agg_nested_dicts(self):

        # API change for disallowing these types of nested dicts
        df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                              'foo', 'bar', 'foo', 'foo'],
                        'B': ['one', 'one', 'two', 'two',
                              'two', 'two', 'one', 'two'],
                        'C': np.random.randn(8) + 1.0,
                        'D': np.arange(8)})

        g = df.groupby(['A', 'B'])

        def f():
            g.aggregate({'r1': {'C': ['mean', 'sum']},
                         'r2': {'D': ['mean', 'sum']}})

        self.assertRaises(SpecificationError, f)

        result = g.agg({'C': {'ra': ['mean', 'std']},
                        'D': {'rb': ['mean', 'std']}})
        expected = pd.concat([g['C'].mean(), g['C'].std(), g['D'].mean(),
                              g['D'].std()], axis=1)
        expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), (
            'ra', 'std'), ('rb', 'mean'), ('rb', 'std')])
        assert_frame_equal(result, expected, check_like=True)

        # same name as the original column
        # GH9052
        expected = g['D'].agg({'result1': np.sum, 'result2': np.mean})
        expected = expected.rename(columns={'result1': 'D'})
        result = g['D'].agg({'D': np.sum, 'result2': np.mean})
        assert_frame_equal(result, expected, check_like=True)
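
    # Added sketch (not part of the original suite): instead of the nested
    # {'r1': {'C': [...]}} form rejected above, aggregate per column and
    # flatten/rename the resulting (column, func) MultiIndex afterwards.
    def _sketch_flatten_instead_of_nesting(self):
        df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar'],
                        'C': np.arange(4.0),
                        'D': np.arange(4)})
        out = df.groupby('A').agg({'C': ['mean', 'sum'],
                                   'D': ['mean', 'sum']})
        out.columns = ['_'.join(col) for col in out.columns]  # e.g. 'C_mean'
        return out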

    def test_agg_python_multiindex(self):
        grouped = self.mframe.groupby(['A', 'B'])

        result = grouped.agg(np.mean)
        expected = grouped.mean()
        tm.assert_frame_equal(result, expected)

    def test_aggregate_str_func(self):
        def _check_results(grouped):
            # single series
            result = grouped['A'].agg('std')
            expected = grouped['A'].std()
            assert_series_equal(result, expected)

            # group frame by function name
            result = grouped.aggregate('var')
            expected = grouped.var()
            assert_frame_equal(result, expected)

            # group frame by function dict
            result = grouped.agg(OrderedDict([['A', 'var'], ['B', 'std'],
                                              ['C', 'mean'], ['D', 'sem']]))
            expected = DataFrame(OrderedDict([['A', grouped['A'].var(
            )], ['B', grouped['B'].std()], ['C', grouped['C'].mean()],
                ['D', grouped['D'].sem()]]))
            assert_frame_equal(result, expected)

        by_weekday = self.tsframe.groupby(lambda x: x.weekday())
        _check_results(by_weekday)

        by_mwkday = self.tsframe.groupby([lambda x: x.month,
                                          lambda x: x.weekday()])
        _check_results(by_mwkday)

    def test_aggregate_item_by_item(self):

        df = self.df.copy()
        df['E'] = ['a'] * len(self.df)
        grouped = self.df.groupby('A')

        # API change in 0.11
        # def aggfun(ser):
        #     return len(ser + 'a')
        # result = grouped.agg(aggfun)
        # self.assertEqual(len(result.columns), 1)

        aggfun = lambda ser: ser.size
        result = grouped.agg(aggfun)
        foo = (self.df.A == 'foo').sum()
        bar = (self.df.A == 'bar').sum()
        K = len(result.columns)

        # GH5782
        # odd comparisons can result here, so cast to make easy
        exp = pd.Series(np.array([foo] * K), index=list('BCD'),
                        dtype=np.float64, name='foo')
        tm.assert_series_equal(result.xs('foo'), exp)

        exp = pd.Series(np.array([bar] * K), index=list('BCD'),
                        dtype=np.float64, name='bar')
        tm.assert_almost_equal(result.xs('bar'), exp)

        def aggfun(ser):
            return ser.size

        result = DataFrame().groupby(self.df.A).agg(aggfun)
        tm.assertIsInstance(result, DataFrame)
        self.assertEqual(len(result), 0)

    def test_agg_item_by_item_raise_typeerror(self):
        from numpy.random import randint

        df = DataFrame(randint(10, size=(20, 10)))

        def raiseException(df):
            pprint_thing('----------------------------------------')
            pprint_thing(df.to_string())
            raise TypeError

        self.assertRaises(TypeError, df.groupby(0).agg, raiseException)

    def test_series_agg_multikey(self):
        ts = tm.makeTimeSeries()
        grouped = ts.groupby([lambda x: x.year, lambda x: x.month])

        result = grouped.agg(np.sum)
        expected = grouped.sum()
        assert_series_equal(result, expected)

    def test_series_agg_multi_pure_python(self):
        data = DataFrame(
            {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
                   'foo', 'foo', 'foo'],
             'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
                   'two', 'two', 'one'],
             'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
                   'dull', 'shiny', 'shiny', 'shiny'],
             'D': np.random.randn(11),
             'E': np.random.randn(11),
             'F': np.random.randn(11)})

        def bad(x):
            assert (len(x.base) > 0)
            return 'foo'

        result = data.groupby(['A', 'B']).agg(bad)
        expected = data.groupby(['A', 'B']).agg(lambda x: 'foo')
        assert_frame_equal(result, expected)
コード例 #16
0
_frame = DataFrame(randn(10000, 4), columns=list('ABCD'), dtype='float64')
_frame2 = DataFrame(randn(100, 4), columns=list('ABCD'), dtype='float64')
_mixed = DataFrame({
    'A': _frame['A'].copy(),
    'B': _frame['B'].astype('float32'),
    'C': _frame['C'].astype('int64'),
    'D': _frame['D'].astype('int32')
})
_mixed2 = DataFrame({
    'A': _frame2['A'].copy(),
    'B': _frame2['B'].astype('float32'),
    'C': _frame2['C'].astype('int64'),
    'D': _frame2['D'].astype('int32')
})
_integer = DataFrame(np.random.randint(1, 100, size=(10001, 4)),
                     columns=list('ABCD'),
                     dtype='int64')
_integer2 = DataFrame(np.random.randint(1, 100, size=(101, 4)),
                      columns=list('ABCD'),
                      dtype='int64')
_frame_panel = Panel(
    dict(ItemA=_frame.copy(),
         ItemB=(_frame.copy() + 3),
         ItemC=_frame.copy(),
         ItemD=_frame.copy()))
_frame2_panel = Panel(
    dict(ItemA=_frame2.copy(),
         ItemB=(_frame2.copy() + 3),
         ItemC=_frame2.copy(),
         ItemD=_frame2.copy()))
_integer_panel = Panel(
    dict(ItemA=_integer, ItemB=(_integer + 34).astype('int64')))
_integer2_panel = Panel(
    dict(ItemA=_integer2, ItemB=(_integer2 + 34).astype('int64')))
_mixed_panel = Panel(dict(ItemA=_mixed, ItemB=(_mixed + 3)))
_mixed2_panel = Panel(dict(ItemA=_mixed2, ItemB=(_mixed2 + 3)))
コード例 #17
0
class TestMoments(unittest.TestCase):

    _nan_locs = np.arange(20, 40)
    _inf_locs = np.array([])

    def setUp(self):
        arr = randn(N)
        arr[self._nan_locs] = np.NaN

        self.arr = arr
        self.rng = DateRange(datetime(2009, 1, 1), periods=N)

        self.series = Series(arr.copy(), index=self.rng)

        self.frame = DataFrame(randn(N, K),
                               index=self.rng,
                               columns=np.arange(K))

    def test_rolling_sum(self):
        self._check_moment_func(mom.rolling_sum, np.sum)

    def test_rolling_count(self):
        counter = lambda x: np.isfinite(x).astype(float).sum()
        self._check_moment_func(mom.rolling_count,
                                counter,
                                has_min_periods=False,
                                preserve_nan=False)

    def test_rolling_mean(self):
        self._check_moment_func(mom.rolling_mean, np.mean)

    def test_rolling_median(self):
        self._check_moment_func(mom.rolling_median, np.median)

    def test_rolling_min(self):
        self._check_moment_func(mom.rolling_min, np.min)

    def test_rolling_max(self):
        self._check_moment_func(mom.rolling_max, np.max)

    def test_rolling_quantile(self):
        qs = [.1, .5, .9]

        def scoreatpercentile(a, per):
            values = np.sort(a, axis=0)

            idx = per / 1. * (values.shape[0] - 1)
            return values[int(idx)]

        for q in qs:

            def f(x, window, min_periods=None, time_rule=None):
                return mom.rolling_quantile(x,
                                            window,
                                            q,
                                            min_periods=min_periods,
                                            time_rule=time_rule)

            def alt(x):
                return scoreatpercentile(x, q)

            self._check_moment_func(f, alt)

    def test_rolling_apply(self):
        def roll_mean(x, window, min_periods=None, time_rule=None):
            return mom.rolling_apply(x,
                                     window,
                                     lambda x: x[np.isfinite(x)].mean(),
                                     min_periods=min_periods,
                                     time_rule=time_rule)

        self._check_moment_func(roll_mean, np.mean)

    def test_rolling_std(self):
        self._check_moment_func(mom.rolling_std, lambda x: np.std(x, ddof=1))

    def test_rolling_var(self):
        self._check_moment_func(mom.rolling_var, lambda x: np.var(x, ddof=1))

    def test_rolling_skew(self):
        try:
            from scipy.stats import skew
        except ImportError:
            raise nose.SkipTest('no scipy')
        self._check_moment_func(mom.rolling_skew,
                                lambda x: skew(x, bias=False))

    def test_rolling_kurt(self):
        try:
            from scipy.stats import kurtosis
        except ImportError:
            raise nose.SkipTest('no scipy')
        self._check_moment_func(mom.rolling_kurt,
                                lambda x: kurtosis(x, bias=False))
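
    # Added note (not part of the original tests): mom.rolling_* is the legacy
    # module-level API; modern pandas reaches the same statistics through the
    # .rolling() accessor.  Rough sketch, assuming pandas >= 0.18:
    @staticmethod
    def _sketch_modern_rolling(series, window=50, min_periods=None):
        r = series.rolling(window=window, min_periods=min_periods)
        return r.sum(), r.mean(), r.median(), r.std(), r.var()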

    def _check_moment_func(self,
                           func,
                           static_comp,
                           window=50,
                           has_min_periods=True,
                           has_time_rule=True,
                           preserve_nan=True):

        self._check_ndarray(func,
                            static_comp,
                            window=window,
                            has_min_periods=has_min_periods,
                            preserve_nan=preserve_nan)

        self._check_structures(func,
                               static_comp,
                               has_min_periods=has_min_periods,
                               has_time_rule=has_time_rule)

    def _check_ndarray(self,
                       func,
                       static_comp,
                       window=50,
                       has_min_periods=True,
                       preserve_nan=True):

        result = func(self.arr, window)
        assert_almost_equal(result[-1], static_comp(self.arr[-50:]))

        if preserve_nan:
            assert (np.isnan(result[self._nan_locs]).all())

        # excluding NaNs correctly
        arr = randn(50)
        arr[:10] = np.NaN
        arr[-10:] = np.NaN

        if has_min_periods:
            result = func(arr, 50, min_periods=30)
            assert_almost_equal(result[-1], static_comp(arr[10:-10]))

            # min_periods is working correctly
            result = func(arr, 20, min_periods=15)
            self.assert_(np.isnan(result[23]))
            self.assert_(not np.isnan(result[24]))

            self.assert_(not np.isnan(result[-6]))
            self.assert_(np.isnan(result[-5]))

            # min_periods=0
            result0 = func(arr, 20, min_periods=0)
            result1 = func(arr, 20, min_periods=1)
            assert_almost_equal(result0, result1)
        else:
            result = func(arr, 50)
            assert_almost_equal(result[-1], static_comp(arr[10:-10]))

    def _check_structures(self,
                          func,
                          static_comp,
                          has_min_periods=True,
                          has_time_rule=True):

        series_result = func(self.series, 50)
        self.assert_(isinstance(series_result, Series))

        frame_result = func(self.frame, 50)
        self.assertEquals(type(frame_result), DataFrame)

        # check time_rule works
        if has_time_rule:
            win = 25
            minp = 10

            if has_min_periods:
                series_result = func(self.series[::2],
                                     win,
                                     min_periods=minp,
                                     time_rule='WEEKDAY')
                frame_result = func(self.frame[::2],
                                    win,
                                    min_periods=minp,
                                    time_rule='WEEKDAY')
            else:
                series_result = func(self.series[::2],
                                     win,
                                     time_rule='WEEKDAY')
                frame_result = func(self.frame[::2], win, time_rule='WEEKDAY')

            last_date = series_result.index[-1]
            prev_date = last_date - 24 * datetools.bday

            trunc_series = self.series[::2].truncate(prev_date, last_date)
            trunc_frame = self.frame[::2].truncate(prev_date, last_date)

            assert_almost_equal(series_result[-1], static_comp(trunc_series))

            assert_almost_equal(frame_result.xs(last_date),
                                trunc_frame.apply(static_comp))

    def test_ewma(self):
        self._check_ew(mom.ewma)

    def test_ewmvar(self):
        self._check_ew(mom.ewmvar)

    def test_ewmvol(self):
        self._check_ew(mom.ewmvol)

    def test_ewma_span_com_args(self):
        A = mom.ewma(self.arr, com=9.5)
        B = mom.ewma(self.arr, span=20)
        assert_almost_equal(A, B)

        self.assertRaises(Exception, mom.ewma, self.arr, com=9.5, span=20)
        self.assertRaises(Exception, mom.ewma, self.arr)
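
    # Added note (not part of the original tests): com and span parameterise
    # the same exponential decay, alpha = 1/(1+com) = 2/(span+1), hence
    # com = (span - 1) / 2 -- which is why com=9.5 and span=20 must agree.
    @staticmethod
    def _sketch_com_span_equivalence(span=20.0):
        com = (span - 1.0) / 2.0  # 9.5 for span=20
        assert abs(1.0 / (1.0 + com) - 2.0 / (span + 1.0)) < 1e-12
        return com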

    def _check_ew(self, func):
        self._check_ew_ndarray(func)
        self._check_ew_structures(func)

    def _check_ew_ndarray(self, func, preserve_nan=False):
        result = func(self.arr, com=10)
        if preserve_nan:
            assert (np.isnan(result[self._nan_locs]).all())

        # excluding NaNs correctly
        arr = randn(50)
        arr[:10] = np.NaN
        arr[-10:] = np.NaN

        # ??? check something

        # pass in ints
        result2 = func(np.arange(50), span=10)
        self.assert_(result2.dtype == np.float_)

    def _check_ew_structures(self, func):
        series_result = func(self.series, com=10)
        self.assert_(isinstance(series_result, Series))
        frame_result = func(self.frame, com=10)
        self.assertEquals(type(frame_result), DataFrame)

    # binary moments
    def test_rolling_cov(self):
        A = self.series
        B = A + randn(len(A))

        result = mom.rolling_cov(A, B, 50, min_periods=25)
        assert_almost_equal(result[-1], np.cov(A[-50:], B[-50:])[0, 1])

    def test_rolling_corr(self):
        A = self.series
        B = A + randn(len(A))

        result = mom.rolling_corr(A, B, 50, min_periods=25)
        assert_almost_equal(result[-1], np.corrcoef(A[-50:], B[-50:])[0, 1])

        # test for correct bias correction
        a = tm.makeTimeSeries()
        b = tm.makeTimeSeries()
        a[:5] = np.nan
        b[:10] = np.nan

        result = mom.rolling_corr(a, b, len(a), min_periods=1)
        assert_almost_equal(result[-1], a.corr(b))
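
    # Added sketch (not part of the original tests): the accessor-based
    # spelling of the rolling-correlation check above, assuming pandas >= 0.18.
    @staticmethod
    def _sketch_rolling_corr_accessor(a, b, window, min_periods=None):
        return a.rolling(window=window, min_periods=min_periods).corr(b)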

    def test_rolling_corr_pairwise(self):
        panel = mom.rolling_corr_pairwise(self.frame, 10, min_periods=5)

        correl = panel.ix[:, 1, 5]
        exp = mom.rolling_corr(self.frame[1], self.frame[5], 10, min_periods=5)
        tm.assert_series_equal(correl, exp)

    def test_flex_binary_frame(self):
        def _check(method):
            series = self.frame[1]

            res = method(series, self.frame, 10)
            res2 = method(self.frame, series, 10)
            exp = self.frame.apply(lambda x: method(series, x, 10))

            tm.assert_frame_equal(res, exp)
            tm.assert_frame_equal(res2, exp)

            frame2 = self.frame.copy()
            frame2.values[:] = np.random.randn(*frame2.shape)

            res3 = method(self.frame, frame2, 10)
            exp = DataFrame(
                dict((k, method(self.frame[k], frame2[k], 10))
                     for k in self.frame))
            tm.assert_frame_equal(res3, exp)

        methods = [mom.rolling_corr, mom.rolling_cov]
        for meth in methods:
            _check(meth)

    def test_ewmcov(self):
        self._check_binary_ew(mom.ewmcov)

    def test_ewmcorr(self):
        self._check_binary_ew(mom.ewmcorr)

    def _check_binary_ew(self, func):
        A = Series(randn(50), index=np.arange(50))
        B = A[2:] + randn(48)

        A[:10] = np.NaN
        B[-10:] = np.NaN

        result = func(A, B, 20, min_periods=5)

        self.assert_(np.isnan(result.values[:15]).all())
        self.assert_(not np.isnan(result.values[15:]).any())

        self.assertRaises(Exception, func, A, randn(50), 20, min_periods=5)
コード例 #18
0
class PandasGroupby(SparklingPandasTestCase):

    def setUp(self):
        """
        Setup the dataframes used for the groupby tests derived from pandas
        """
        self.dateRange = bdate_range('1/1/2005', periods=250)
        self.stringIndex = Index([rands(8).upper() for x in range(250)])

        self.groupId = Series([x[0] for x in self.stringIndex],
                              index=self.stringIndex)
        self.groupDict = dict((k, v)
                              for k, v in compat.iteritems(self.groupId))

        self.columnIndex = Index(['A', 'B', 'C', 'D', 'E'])

        randMat = np.random.randn(250, 5)
        self.stringMatrix = DataFrame(randMat, columns=self.columnIndex,
                                      index=self.stringIndex)

        self.timeMatrix = DataFrame(randMat, columns=self.columnIndex,
                                    index=self.dateRange)
        self.ts = tm.makeTimeSeries()

        self.seriesd = tm.getSeriesData()
        self.tsd = tm.getTimeSeriesData()
        self.frame = DataFrame(self.seriesd)
        self.tsframe = DataFrame(self.tsd)

        self.df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                                   'foo', 'bar', 'foo', 'foo'],
                             'B': ['one', 'one', 'two', 'three',
                                   'two', 'two', 'one', 'three'],
                             'C': np.random.randn(8),
                             'D': np.random.randn(8)})

        self.df_mixed_floats = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                                                'foo', 'bar', 'foo', 'foo'],
                                          'B': ['one', 'one', 'two', 'three',
                                                'two', 'two', 'one', 'three'],
                                          'C': np.random.randn(8),
                                          'D': np.array(np.random.randn(8),
                                                        dtype='float32')})

        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        self.mframe = DataFrame(np.random.randn(10, 3), index=index,
                                columns=['A', 'B', 'C'])

        self.three_group = DataFrame({'A': ['foo', 'foo', 'foo', 'foo',
                                            'bar', 'bar', 'bar', 'bar',
                                            'foo', 'foo', 'foo'],
                                      'B': ['one', 'one', 'one', 'two',
                                            'one', 'one', 'one', 'two',
                                            'two', 'two', 'one'],
                                      'C': ['dull', 'dull', 'shiny', 'dull',
                                            'dull', 'shiny', 'shiny', 'dull',
                                            'shiny', 'shiny', 'shiny'],
                                      'D': np.random.randn(11),
                                      'E': np.random.randn(11),
                                      'F': np.random.randn(11)})
        super(self.__class__, self).setUp()

    def test_first_last_nth(self):
        # tests for first / last / nth
        ddf = self.psc.from_data_frame(self.df)
        assert_frame_equal(ddf.collect(), self.df)
        grouped = self.psc.from_data_frame(self.df).groupby('A')
        first = grouped.first().collect()
        expected = self.df.ix[[1, 0], ['B', 'C', 'D']]
        expected.index = Index(['bar', 'foo'], name='A')
        expected = expected.sort_index()
        assert_frame_equal(first, expected)

        nth = grouped.nth(0).collect()
        assert_frame_equal(nth, expected)

        last = grouped.last().collect()
        expected = self.df.ix[[5, 7], ['B', 'C', 'D']]
        expected.index = Index(['bar', 'foo'], name='A')
        assert_frame_equal(last, expected)

        nth = grouped.nth(-1).collect()
        assert_frame_equal(nth, expected)

        nth = grouped.nth(1).collect()
        expected = self.df.ix[[2, 3], ['B', 'C', 'D']].copy()
        expected.index = Index(['foo', 'bar'], name='A')
        expected = expected.sort_index()
        assert_frame_equal(nth, expected)
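
    # Added sketch (not part of SparklingPandas): the plain-pandas behaviour
    # mirrored above -- first()/last() skip NaN within each group, while
    # nth(0)/nth(-1) are positional unless a dropna argument is given.
    def _sketch_pandas_first_vs_nth(self):
        grouped = self.df.groupby('A')
        return (grouped.first(), grouped.nth(0),
                grouped.last(), grouped.nth(-1))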

    @unittest2.expectedFailure
    def test_getitem(self):
        # it works!
        grouped['B'].first()
        grouped['B'].last()
        grouped['B'].nth(0)

        self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan
        self.assertTrue(com.isnull(grouped['B'].first()['foo']))
        self.assertTrue(com.isnull(grouped['B'].last()['foo']))
        # not sure what this is testing
        self.assertTrue(com.isnull(grouped['B'].nth(0)[0]))

    @unittest2.expectedFailure
    def test_new_in0140(self):
        """
        Test new functionality in 0.14.0. This currently doesn't work.
        """
        # v0.14.0 whatsnew
        df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
        ddf = self.psc.from_data_frame(df)
        g = ddf.groupby('A')
        result = g.first().collect()
        expected = df.iloc[[1, 2]].set_index('A')
        assert_frame_equal(result, expected)

        expected = df.iloc[[1, 2]].set_index('A')
        result = g.nth(0, dropna='any').collect()
        assert_frame_equal(result, expected)

    @unittest2.expectedFailure
    def test_first_last_nth_dtypes(self):
        """
        We do groupby fine on mixed types, but our copy from local dataframe
        ends up re-running the guess type function, so the dtypes don't match.
        Issue #25
        """
        df = self.df_mixed_floats.copy()
        df['E'] = True
        df['F'] = 1

        # tests for first / last / nth
        grouped = ddf.groupby('A')
        first = grouped.first().collect()
        expected = df.ix[[1, 0], ['B', 'C', 'D', 'E', 'F']]
        expected.index = Index(['bar', 'foo'], name='A')
        expected = expected.sort_index()
        assert_frame_equal(first, expected)

        last = grouped.last().collect()
        expected = df.ix[[5, 7], ['B', 'C', 'D', 'E', 'F']]
        expected.index = Index(['bar', 'foo'], name='A')
        expected = expected.sort_index()
        assert_frame_equal(last, expected)

        nth = grouped.nth(1).collect()
        expected = df.ix[[3, 2], ['B', 'C', 'D', 'E', 'F']]
        expected.index = Index(['bar', 'foo'], name='A')
        expected = expected.sort_index()
        assert_frame_equal(nth, expected)

    def test_var_on_multiplegroups(self):
        df = DataFrame({'data1': np.random.randn(5),
                        'data2': np.random.randn(5),
                        'data3': np.random.randn(5),
                        'key1': ['a', 'a', 'b', 'b', 'a'],
                        'key2': ['one', 'two', 'one', 'two', 'one']})
        ddf = self.psc.from_data_frame(df)
        dgrouped = ddf.groupby(['key1', 'key2'])
        grouped = df.groupby(['key1', 'key2'])
        assert_frame_equal(dgrouped.var().collect(), grouped.var())

    def test_agg_api(self):
        # Note: needs a very recent version of pandas to pass
        # TODO(holden): Pass this test if local fails
        # GH 6337
        # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error
        # different api for agg when passed custom function with mixed frame

        df = DataFrame({'data1': np.random.randn(5),
                        'data2': np.random.randn(5),
                        'key1': ['a', 'a', 'b', 'b', 'a'],
                        'key2': ['one', 'two', 'one', 'two', 'one']})
        ddf = self.psc.from_data_frame(df)
        dgrouped = ddf.groupby('key1')
        grouped = df.groupby('key1')

        def peak_to_peak(arr):
            return arr.max() - arr.min()

        expected = grouped.agg([peak_to_peak])
        expected.columns = ['data1', 'data2']
        result = dgrouped.agg(peak_to_peak).collect()
        assert_frame_equal(result, expected)

    def test_agg_regression1(self):
        grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month])
        dgrouped = self.psc.from_data_frame(
            self.tsframe).groupby(
            [lambda x: x.year, lambda x: x.month])
        result = dgrouped.agg(np.mean).collect()
        expected = grouped.agg(np.mean)
        assert_frame_equal(result, expected)
コード例 #19
0
ファイル: test_expressions.py プロジェクト: BRGM/Pic-EAU
_frame = DataFrame(randn(10000, 4), columns=list('ABCD'), dtype='float64')
_frame2 = DataFrame(randn(100, 4), columns=list('ABCD'), dtype='float64')
_mixed = DataFrame({'A': _frame['A'].copy(),
                    'B': _frame['B'].astype('float32'),
                    'C': _frame['C'].astype('int64'),
                    'D': _frame['D'].astype('int32')})
_mixed2 = DataFrame({'A': _frame2['A'].copy(),
                     'B': _frame2['B'].astype('float32'),
                     'C': _frame2['C'].astype('int64'),
                     'D': _frame2['D'].astype('int32')})
_integer = DataFrame(
    np.random.randint(1, 100,
                      size=(10001, 4)), columns=list('ABCD'), dtype='int64')
_integer2 = DataFrame(np.random.randint(1, 100, size=(101, 4)),
                      columns=list('ABCD'), dtype='int64')
_frame_panel = Panel(dict(ItemA=_frame.copy(), ItemB=(
    _frame.copy() + 3), ItemC=_frame.copy(), ItemD=_frame.copy()))
_frame2_panel = Panel(dict(ItemA=_frame2.copy(), ItemB=(_frame2.copy() + 3),
                           ItemC=_frame2.copy(), ItemD=_frame2.copy()))
_integer_panel = Panel(dict(ItemA=_integer, ItemB=(_integer + 34).astype(
    'int64')))
_integer2_panel = Panel(dict(ItemA=_integer2, ItemB=(_integer2 + 34).astype(
    'int64')))
_mixed_panel = Panel(dict(ItemA=_mixed, ItemB=(_mixed + 3)))
_mixed2_panel = Panel(dict(ItemA=_mixed2, ItemB=(_mixed2 + 3)))


class TestExpressions(tm.TestCase):

    _multiprocess_can_split_ = False
コード例 #20
0
ファイル: test_multilevel.py プロジェクト: MLnick/pandas
class TestMultiLevel(unittest.TestCase):

    def setUp(self):
        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]])
        self.frame = DataFrame(np.random.randn(10, 3), index=index,
                               columns=['A', 'B', 'C'])

        tm.N = 100
        self.tdf = tm.makeTimeDataFrame()
        self.ymd = self.tdf.groupby([lambda x: x.year, lambda x: x.month,
                                     lambda x: x.day]).sum()

    def test_pickle(self):
        import cPickle
        def _test_roundtrip(frame):
            pickled = cPickle.dumps(frame)
            unpickled = cPickle.loads(pickled)
            assert_frame_equal(frame, unpickled)

        _test_roundtrip(self.frame)
        _test_roundtrip(self.frame.T)
        _test_roundtrip(self.ymd)
        _test_roundtrip(self.ymd.T)

    def test_reindex(self):
        reindexed = self.frame.ix[[('foo', 'one'), ('bar', 'one')]]
        expected = self.frame.ix[[0, 3]]
        assert_frame_equal(reindexed, expected)

    def test_reindex_preserve_levels(self):
        new_index = self.ymd.index[::10]
        chunk = self.ymd.reindex(new_index)
        self.assert_(chunk.index is new_index)

        chunk = self.ymd.ix[new_index]
        self.assert_(chunk.index is new_index)

        ymdT = self.ymd.T
        chunk = ymdT.reindex(columns=new_index)
        self.assert_(chunk.columns is new_index)

        chunk = ymdT.ix[:, new_index]
        self.assert_(chunk.columns is new_index)

    def test_repr_to_string(self):
        repr(self.frame)
        repr(self.ymd)
        repr(self.frame.T)
        repr(self.ymd.T)

        buf = StringIO()
        self.frame.to_string(buf=buf)
        self.ymd.to_string(buf=buf)
        self.frame.T.to_string(buf=buf)
        self.ymd.T.to_string(buf=buf)

    def test_getitem_simple(self):
        df = self.frame.T

        col = df['foo', 'one']
        assert_almost_equal(col.values, df.values[:, 0])
        self.assertRaises(KeyError, df.__getitem__, ('foo', 'four'))
        self.assertRaises(KeyError, df.__getitem__, 'foobar')

    def test_series_getitem(self):
        s = self.ymd['A']

        result = s[2000, 3]
        result2 = s.ix[2000, 3]
        expected = s[42:65]
        expected.index = expected.index.droplevel(0).droplevel(0)
        assert_series_equal(result, expected)

        result = s[2000, 3, 10]
        expected = s[49]
        self.assertEquals(result, expected)

        # fancy
        result = s.ix[[(2000, 3, 10), (2000, 3, 13)]]
        expected = s[49:51]
        assert_series_equal(result, expected)

        # key error
        self.assertRaises(KeyError, s.__getitem__, (2000, 3, 4))
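
    # Added sketch (not part of the original test): with .ix deprecated, the
    # same partial MultiIndex selections are written with .loc.
    def _sketch_loc_partial_selection(self):
        s = self.ymd['A']
        by_year_month = s.loc[(2000, 3)]               # drops the matched levels
        scalar = s.loc[(2000, 3, 10)]                  # full-key lookup
        fancy = s.loc[[(2000, 3, 10), (2000, 3, 13)]]  # list of full keys
        return by_year_month, scalar, fancy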

    def test_series_setitem(self):
        s = self.ymd['A']

        s[2000, 3] = np.nan
        self.assert_(isnull(s[42:65]).all())
        self.assert_(notnull(s[:42]).all())
        self.assert_(notnull(s[65:]).all())

        s[2000, 3, 10] = np.nan
        self.assert_(isnull(s[49]))

    def test_series_slice_partial(self):
        pass

    def test_xs(self):
        xs = self.frame.xs(('bar', 'two'))
        xs2 = self.frame.ix[('bar', 'two')]

        assert_series_equal(xs, xs2)
        assert_almost_equal(xs.values, self.frame.values[4])

    def test_xs_partial(self):
        result = self.frame.xs('foo')
        result2 = self.frame.ix['foo']
        expected = self.frame.T['foo'].T
        assert_frame_equal(result, expected)
        assert_frame_equal(result, result2)

    def test_fancy_2d(self):
        result = self.frame.ix['foo', 'B']
        expected = self.frame.xs('foo')['B']
        assert_series_equal(result, expected)

        ft = self.frame.T
        result = ft.ix['B', 'foo']
        expected = ft.xs('B')['foo']
        assert_series_equal(result, expected)

    def test_getitem_toplevel(self):
        df = self.frame.T

        result = df['foo']
        expected = df.reindex(columns=df.columns[:3])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)

        result = df['bar']
        result2 = df.ix[:, 'bar']

        expected = df.reindex(columns=df.columns[3:5])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result, result2)

    def test_getitem_int(self):
        levels = [[0, 1], [0, 1, 2]]
        labels = [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]
        index = MultiIndex(levels=levels, labels=labels)

        frame = DataFrame(np.random.randn(6, 2), index=index)

        result = frame.ix[1]
        expected = frame[-3:]
        expected.index = expected.index.droplevel(0)
        assert_frame_equal(result, expected)

        # raises exception
        self.assertRaises(KeyError, frame.ix.__getitem__, 3)

        # however this will work
        result = self.frame.ix[2]
        expected = self.frame.xs(self.frame.index[2])
        assert_series_equal(result, expected)

    def test_getitem_partial(self):
        ymd = self.ymd.T
        result = ymd[2000, 2]

        expected = ymd.reindex(columns=ymd.columns[ymd.columns.labels[1] == 1])
        expected.columns = expected.columns.droplevel(0).droplevel(0)
        assert_frame_equal(result, expected)

    def test_setitem_change_dtype(self):
        dft = self.frame.T
        s = dft['foo', 'two']
        dft['foo', 'two'] = s > s.median()
        assert_series_equal(dft['foo', 'two'], s > s.median())
        self.assert_(isinstance(dft._data.blocks[1].items, MultiIndex))

        reindexed = dft.reindex(columns=[('foo', 'two')])
        assert_series_equal(reindexed['foo', 'two'], s > s.median())

    def test_fancy_slice_partial(self):
        result = self.frame.ix['bar':'baz']
        expected = self.frame[3:7]
        assert_frame_equal(result, expected)

        result = self.ymd.ix[(2000,2):(2000,4)]
        lev = self.ymd.index.labels[1]
        expected = self.ymd[(lev >= 1) & (lev <= 3)]
        assert_frame_equal(result, expected)

    def test_sortlevel(self):
        df = self.frame.copy()
        df.index = np.arange(len(df))
        self.assertRaises(Exception, df.sortlevel, 0)

        # axis=1

        # series
        a_sorted = self.frame['A'].sortlevel(0)
        self.assertRaises(Exception,
                          self.frame.delevel()['A'].sortlevel)

    def test_sortlevel_mixed(self):
        sorted_before = self.frame.sortlevel(1)

        df = self.frame.copy()
        df['foo'] = 'bar'
        sorted_after = df.sortlevel(1)
        assert_frame_equal(sorted_before, sorted_after.drop(['foo'], axis=1))

        dft = self.frame.T
        sorted_before = dft.sortlevel(1, axis=1)
        dft['foo', 'three'] = 'bar'

        sorted_after = dft.sortlevel(1, axis=1)
        assert_frame_equal(sorted_before.drop([('foo', 'three')], axis=1),
                           sorted_after.drop([('foo', 'three')], axis=1))

    def test_count_level(self):
        def _check_counts(frame, axis=0):
            index = frame._get_axis(axis)
            for i in range(index.nlevels):
                result = frame.count(axis=axis, level=i)
                expected = frame.groupby(axis=axis, level=i).count(axis=axis)

        _check_counts(self.frame)
        _check_counts(self.ymd)
        _check_counts(self.frame.T, axis=1)
        _check_counts(self.ymd.T, axis=1)

        # can't call with level on regular DataFrame
        df = tm.makeTimeDataFrame()
        self.assertRaises(Exception, df.count, level=0)

    def test_unstack(self):
        # just check that it works for now
        unstacked = self.ymd.unstack()
        unstacked2 = unstacked.unstack()

        # test that ints work
        unstacked = self.ymd.astype(int).unstack()

    def test_stack(self):
        # regular roundtrip
        unstacked = self.ymd.unstack()
        restacked = unstacked.stack()
        assert_frame_equal(restacked, self.ymd)

        unlexsorted = self.ymd.sortlevel(2)

        unstacked = unlexsorted.unstack(2)
        restacked = unstacked.stack()
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        unlexsorted = unlexsorted[::-1]
        unstacked = unlexsorted.unstack(1)
        restacked = unstacked.stack().swaplevel(1, 2)
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        unlexsorted = unlexsorted.swaplevel(0, 1)
        unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1)
        restacked = unstacked.stack(0).swaplevel(1, 2)
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        # columns unsorted
        unstacked = self.ymd.unstack()
        unstacked = unstacked.sort(axis=1, ascending=False)
        restacked = unstacked.stack()
        assert_frame_equal(restacked, self.ymd)

        # more than 2 levels in the columns
        unstacked = self.ymd.unstack(1).unstack(1)

        result = unstacked.stack(1)
        expected = self.ymd.unstack()
        assert_frame_equal(result, expected)

        result = unstacked.stack(2)
        expected = self.ymd.unstack(1)
        assert_frame_equal(result, expected)

        result = unstacked.stack(0)
        expected = self.ymd.stack().unstack(1).unstack(1)
        assert_frame_equal(result, expected)

        # not all levels present in each echelon
        unstacked = self.ymd.unstack(2).ix[:, ::3]
        stacked = unstacked.stack().stack()
        ymd_stacked = self.ymd.stack()
        assert_series_equal(stacked, ymd_stacked.reindex(stacked.index))

    def test_stack_mixed_dtype(self):
        df = self.frame.T
        df['foo', 'four'] = 'foo'
        df = df.sortlevel(1, axis=1)

        stacked = df.stack()
        assert_series_equal(stacked['foo'], df['foo'].stack())
        self.assert_(stacked['bar'].dtype == np.float_)

    def test_swaplevel(self):
        swapped = self.frame['A'].swaplevel(0, 1)
        self.assert_(not swapped.index.equals(self.frame.index))

        back = swapped.swaplevel(0, 1)
        self.assert_(back.index.equals(self.frame.index))

    def test_insert_index(self):
        df = self.ymd[:5].T
        df[2000, 1, 10] = df[2000, 1, 7]
        self.assert_(isinstance(df.columns, MultiIndex))
        self.assert_((df[2000, 1, 10] == df[2000, 1, 7]).all())

    def test_alignment(self):
        pass

    def test_is_lexsorted(self):
        levels = [[0, 1], [0, 1, 2]]

        index = MultiIndex(levels=levels,
                           labels=[[0, 0, 0, 1, 1, 1],
                                   [0, 1, 2, 0, 1, 2]])
        self.assert_(index.is_lexsorted())

        index = MultiIndex(levels=levels,
                           labels=[[0, 0, 0, 1, 1, 1],
                                   [0, 1, 2, 0, 2, 1]])
        self.assert_(not index.is_lexsorted())

        index = MultiIndex(levels=levels,
                           labels=[[0, 0, 1, 0, 1, 1],
                                   [0, 1, 0, 2, 2, 1]])
        self.assert_(not index.is_lexsorted())
        self.assert_(index.lexsort_depth == 0)
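
    # Added note (not part of the original test): lexsort_depth counts how many
    # leading levels are already lexicographically sorted; a fully lexsorted
    # index reports its nlevels, while the unsorted one above reports 0.
    def _sketch_lexsort_depth(self):
        idx = MultiIndex(levels=[[0, 1], [0, 1, 2]],
                         labels=[[0, 0, 0, 1, 1, 1],
                                 [0, 1, 2, 0, 1, 2]])
        return idx.lexsort_depth  # 2 for this fully sorted index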

    def test_frame_getitem_view(self):
        df = self.frame.T
        df['foo'].values[:] = 0
        self.assert_((df['foo'].values == 0).all())

        # but not if it's mixed-type
        df['foo', 'four'] = 'foo'
        df = df.sortlevel(0, axis=1)
        df['foo']['one'] = 2
        self.assert_((df['foo', 'one'] == 0).all())

    def test_frame_getitem_not_sorted(self):
        df = self.frame.T
        df['foo', 'four'] = 'foo'

        arrays = [np.array(x) for x in zip(*df.columns.get_tuple_index())]

        result = df['foo']
        result2 = df.ix[:, 'foo']
        expected = df.reindex(columns=df.columns[arrays[0] == 'foo'])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)

        df = df.T
        result = df.xs('foo')
        result2 = df.ix['foo']
        expected = df.reindex(df.index[arrays[0] == 'foo'])
        expected.index = expected.index.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)

    def test_series_getitem_not_sorted(self):
        arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'],
                  ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
        tuples = zip(*arrays)
        index = MultiIndex.from_tuples(tuples)
        s = Series(randn(8), index=index)

        arrays = [np.array(x) for x in zip(*index.get_tuple_index())]

        result = s['qux']
        result2 = s.ix['qux']
        expected = s[arrays[0] == 'qux']
        expected.index = expected.index.droplevel(0)
        assert_series_equal(result, expected)
        assert_series_equal(result2, expected)