Example #1
0
def test_rank_object_raises(ties_method, ascending, na_option,
                            pct, vals):
    # Ranking object-dtype values inside a groupby must raise TypeError
    # regardless of the rank options used.
    frame = DataFrame({'key': ['foo'] * 5, 'val': vals})
    grouped = frame.groupby('key')
    with tm.assert_raises_regex(TypeError, "not callable"):
        grouped.rank(method=ties_method, ascending=ascending,
                     na_option=na_option, pct=pct)
Example #2
0
    def test_crosstab_margins(self):
        """crosstab(margins=True) appends an 'All' row/column whose
        counts match the equivalent groupby().size() aggregations."""
        a = np.random.randint(0, 7, size=100)
        b = np.random.randint(0, 3, size=100)
        c = np.random.randint(0, 5, size=100)

        df = DataFrame({'a': a, 'b': b, 'c': c})

        result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c'),
                          margins=True)

        self.assertEqual(result.index.names, ('a',))
        self.assertEqual(result.columns.names, ['b', 'c'])

        # margin column: per-'a' group sizes plus the grand total
        all_cols = result['All', '']
        exp_cols = df.groupby(['a']).size().astype('i8')
        exp_cols = exp_cols.append(Series([len(df)], index=['All']))

        tm.assert_series_equal(all_cols, exp_cols)

        # BUG fix: .ix was removed from pandas; label lookup is .loc now.
        all_rows = result.loc['All']
        exp_rows = df.groupby(['b', 'c']).size().astype('i8')
        exp_rows = exp_rows.append(Series([len(df)], index=[('All', '')]))

        # absent (b, c) combinations count as 0 in the margin row
        exp_rows = exp_rows.reindex(all_rows.index)
        exp_rows = exp_rows.fillna(0).astype(np.int64)
        tm.assert_series_equal(all_rows, exp_rows)
Example #3
0
    def test_groupby_groups_datetimeindex(self):
        """Grouping over a DatetimeIndex: .groups keys are datetimes and
        get_group accepts a plain date string."""
        # GH#1430
        periods = 1000
        ind = pd.date_range(start='2012/1/1', freq='5min', periods=periods)
        df = DataFrame({'high': np.arange(periods),
                        'low': np.arange(periods)}, index=ind)
        # collapse the 5-minute rows into calendar-day groups
        grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))

        # it works!
        groups = grouped.groups
        assert isinstance(list(groups.keys())[0], datetime)

        # GH#11442
        index = pd.date_range('2015/01/01', periods=5, name='date')
        df = pd.DataFrame({'A': [5, 6, 7, 8, 9],
                           'B': [1, 2, 3, 4, 5]}, index=index)
        result = df.groupby(level='date').groups
        dates = ['2015-01-05', '2015-01-04', '2015-01-03',
                 '2015-01-02', '2015-01-01']
        # each day forms its own single-row group keyed by Timestamp
        expected = {pd.Timestamp(date): pd.DatetimeIndex([date], name='date')
                    for date in dates}
        tm.assert_dict_equal(result, expected)

        # get_group should accept the date given as a string label
        grouped = df.groupby(level='date')
        for date in dates:
            result = grouped.get_group(date)
            data = [[df.loc[date, 'A'], df.loc[date, 'B']]]
            expected_index = pd.DatetimeIndex([date], name='date')
            expected = pd.DataFrame(data,
                                    columns=list('AB'),
                                    index=expected_index)
            tm.assert_frame_equal(result, expected)
Example #4
0
    def test_nunique(self):
        """groupby().nunique() counts distinct values per column, with
        and without as_index, and honours the dropna flag."""
        df = DataFrame({
            'A': list('abbacc'),
            'B': list('abxacc'),
            'C': list('abbacx'),
        })

        expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]})
        tm.assert_frame_equal(
            df.groupby('A', as_index=False).nunique(), expected)

        # as_index=True: the group labels become the index
        expected.index = list('abc')
        expected.index.name = 'A'
        tm.assert_frame_equal(df.groupby('A').nunique(), expected)

        # dropna=False counts None as its own distinct value, so after
        # mapping 'x' to None the result is unchanged
        replaced = df.replace({'x': None})
        tm.assert_frame_equal(
            replaced.groupby('A').nunique(dropna=False), expected)

        # default dropna=True ignores the None entries entirely
        expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3},
                             index=list('abc'))
        expected.index.name = 'A'
        tm.assert_frame_equal(replaced.groupby('A').nunique(), expected)
Example #5
0
def test_group_selection_cache():
    # GH 12839: nth, head, and tail must not poison each other's cached
    # group selection -- every call ordering yields the same answers.
    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
    expected = df.iloc[[0, 2]].set_index('A')

    for full_op in (lambda g: g.head(n=2), lambda g: g.tail(n=2)):
        # full-frame op first, nth second
        g = df.groupby('A')
        result1 = full_op(g)
        result2 = g.nth(0)
        assert_frame_equal(result1, df)
        assert_frame_equal(result2, expected)

        # nth first, full-frame op second
        g = df.groupby('A')
        result1 = g.nth(0)
        result2 = full_op(g)
        assert_frame_equal(result1, expected)
        assert_frame_equal(result2, df)
Example #6
0
def test_rank_apply():
    # Grouped rank must match ranking each (key1, key2) slice on its
    # own and stitching the pieces back together, for both absolute
    # and percentile ranks.
    lev1 = tm.rands_array(10, 100)
    lev2 = tm.rands_array(10, 130)
    lab1 = np.random.randint(0, 100, size=500)
    lab2 = np.random.randint(0, 130, size=500)

    df = DataFrame({'value': np.random.randn(500),
                    'key1': lev1.take(lab1),
                    'key2': lev2.take(lab2)})

    for pct in (False, True):
        result = df.groupby(['key1', 'key2']).value.rank(pct=pct)

        pieces = [piece.value.rank(pct=pct)
                  for _, piece in df.groupby(['key1', 'key2'])]
        expected = concat(pieces, axis=0).reindex(result.index)
        tm.assert_series_equal(result, expected)
Example #7
0
    def setup(self, dtype, method, application):
        """ASV benchmark setup: build a two-column frame and bind the
        groupby operation under test as ``self.as_group_method`` and
        ``self.as_field_method``.

        ``dtype`` picks the key column's dtype, ``method`` the groupby
        reduction name, ``application`` either 'transform' or a direct
        method call.
        """
        # combinations known not to work are skipped, not benchmarked
        if method in method_blacklist.get(dtype, {}):
            raise NotImplementedError  # skip benchmark
        ngroups = 1000
        size = ngroups * 2
        rng = np.arange(ngroups)
        # each group id appears with random multiplicity
        values = rng.take(np.random.randint(0, ngroups, size=size))
        if dtype == 'int':
            key = np.random.randint(0, size, size=size)
        elif dtype == 'float':
            # two clusters of float keys at very different scales
            key = np.concatenate([np.random.random(ngroups) * 0.1,
                                  np.random.random(ngroups) * 10.0])
        elif dtype == 'object':
            # a single repeated object key (degenerate one-group case)
            key = ['foo'] * size
        elif dtype == 'datetime':
            key = date_range('1/1/2011', periods=size, freq='s')

        df = DataFrame({'values': values, 'key': key})

        if application == 'transform':
            if method == 'describe':
                # describe is excluded under 'transform'; raising skips it
                raise NotImplementedError

            self.as_group_method = lambda: df.groupby(
                'key')['values'].transform(method)
            self.as_field_method = lambda: df.groupby(
                'values')['key'].transform(method)
        else:
            # bind the bound-method so the timed call is just the reduction
            self.as_group_method = getattr(df.groupby('key')['values'], method)
            self.as_field_method = getattr(df.groupby('values')['key'], method)
Example #8
0
    def test_multi_iter_frame(self, three_group):
        """Iterating a two-key DataFrame groupby yields ((k1, k2), frame)
        pairs in sorted key order and skips empty groups."""
        k1 = np.array(['b', 'b', 'b', 'a', 'a', 'a'])
        k2 = np.array(['1', '2', '1', '2', '1', '2'])
        df = DataFrame({'v1': np.random.randn(6),
                        'v2': np.random.randn(6),
                        'k1': k1, 'k2': k2},
                       index=['one', 'two', 'three', 'four', 'five', 'six'])

        grouped = df.groupby(['k1', 'k2'])

        # things get sorted!
        iterated = list(grouped)
        idx = df.index
        # expected triples: (key1, key2, rows belonging to that group)
        expected = [('a', '1', df.loc[idx[[4]]]),
                    ('a', '2', df.loc[idx[[3, 5]]]),
                    ('b', '1', df.loc[idx[[0, 2]]]),
                    ('b', '2', df.loc[idx[[1]]])]
        for i, ((one, two), three) in enumerate(iterated):
            e1, e2, e3 = expected[i]
            assert e1 == one
            assert e2 == two
            assert_frame_equal(three, e3)

        # don't iterate through groups with no data
        df['k1'] = np.array(['b', 'b', 'b', 'a', 'a', 'a'])
        df['k2'] = np.array(['1', '1', '1', '2', '2', '2'])
        grouped = df.groupby(['k1', 'k2'])
        # only ('a','2') and ('b','1') actually occur in the data
        groups = {key: gp for key, gp in grouped}
        assert len(groups) == 2

        # axis = 1
        # smoke test: iterate a column-axis groupby over index levels 1, 2
        three_levels = three_group.groupby(['A', 'B', 'C']).mean()
        grouped = three_levels.T.groupby(axis=1, level=(1, 2))
        for key, group in grouped:
            pass
Example #9
0
def test__cython_agg_general(op, targop):
    # The cython fast-path aggregation must agree with generic agg()
    # applied with the equivalent python-level reducer.
    frame = DataFrame(np.random.randn(1000))
    keys = np.random.randint(0, 50, size=1000).astype(float)

    result = frame.groupby(keys)._cython_agg_general(op)
    expected = frame.groupby(keys).agg(targop)
    tm.assert_frame_equal(result, expected)
Example #10
0
def test_preserve_categorical_dtype():
    # GH13743, GH13854: grouping by a categorical column must keep the
    # category order and dtype in the result, for both as_index modes.
    codes = list("abaab")
    order = list("bac")
    df = DataFrame({'A': [1, 2, 1, 1, 2],
                    'B': [10, 16, 22, 28, 34],
                    'C1': Categorical(codes, categories=order,
                                      ordered=False),
                    'C2': Categorical(codes, categories=order,
                                      ordered=True)})

    # the unobserved category 'c' yields NaN means but must stay present
    exp_full = DataFrame({'A': [2.0, 1.0, np.nan],
                          'B': [25.0, 20.0, np.nan],
                          'C1': Categorical(order, categories=order,
                                            ordered=False),
                          'C2': Categorical(order, categories=order,
                                            ordered=True)})

    # single grouper
    for col in ['C1', 'C2']:
        result1 = df.groupby(by=col, as_index=False, observed=False).mean()
        result2 = (df.groupby(by=col, as_index=True, observed=False)
                     .mean().reset_index())
        expected = exp_full.reindex(columns=result1.columns)
        tm.assert_frame_equal(result1, expected)
        tm.assert_frame_equal(result2, expected)
Example #11
0
    def test_groupby_categorical_index_and_columns(self, observed):
        # GH18432, adapted for GH25871: grouping over a CategoricalIndex
        # of duplicated labels sums the duplicates and keeps the
        # categorical dtype (and ordering) on the result axis.
        labels = ['A', 'B', 'A', 'B']
        categories = ['B', 'A']
        data = np.tile([1, 2, 1, 2], (5, 1)).astype(int)
        cat_columns = CategoricalIndex(labels,
                                       categories=categories,
                                       ordered=True)
        expected_data = np.tile([4, 2], (5, 1)).astype(int)
        expected_columns = CategoricalIndex(categories,
                                            categories=categories,
                                            ordered=True)

        # group along the column axis
        df = DataFrame(data=data, columns=cat_columns)
        result = df.groupby(axis=1, level=0, observed=observed).sum()
        expected = DataFrame(data=expected_data, columns=expected_columns)
        assert_frame_equal(result, expected)

        # the transposed frame grouped along the index axis must agree
        df = DataFrame(data.T, index=cat_columns)
        result = df.groupby(axis=0, level=0, observed=observed).sum()
        expected = DataFrame(data=expected_data.T, index=expected_columns)
        assert_frame_equal(result, expected)
Example #12
0
def test_preserve_categories():
    # GH-13179: the result index of a categorical groupby keeps the
    # declared categories; sort=False only changes the order when the
    # categorical is unordered.
    categories = list('abc')

    # ordered=True: category order wins regardless of the sort flag
    df = DataFrame({'A': pd.Categorical(list('ba'),
                                        categories=categories,
                                        ordered=True)})
    index = pd.CategoricalIndex(categories, categories, ordered=True)
    for sort in (True, False):
        tm.assert_index_equal(
            df.groupby('A', sort=sort, observed=False).first().index,
            index)

    # ordered=False: sort=False keeps appearance order instead
    df = DataFrame({'A': pd.Categorical(list('ba'),
                                        categories=categories,
                                        ordered=False)})
    sort_index = pd.CategoricalIndex(categories, categories, ordered=False)
    nosort_index = pd.CategoricalIndex(list('bac'), list('bac'),
                                       ordered=False)
    tm.assert_index_equal(
        df.groupby('A', sort=True, observed=False).first().index,
        sort_index)
    tm.assert_index_equal(
        df.groupby('A', sort=False, observed=False).first().index,
        nosort_index)
def test_deferred_with_groupby():
    """GH 12486: a deferred resample on a groupby must match applying
    the same resample per group via apply()."""

    # GH 12486
    # support deferred resample ops with groupby
    data = [['2010-01-01', 'A', 2], ['2010-01-02', 'A', 3],
            ['2010-01-05', 'A', 8], ['2010-01-10', 'A', 7],
            ['2010-01-13', 'A', 3], ['2010-01-01', 'B', 5],
            ['2010-01-03', 'B', 2], ['2010-01-04', 'B', 1],
            ['2010-01-11', 'B', 7], ['2010-01-14', 'B', 3]]

    df = DataFrame(data, columns=['date', 'id', 'score'])
    df.date = pd.to_datetime(df.date)

    def f(x):
        # reference path: resample each group to daily frequency
        return x.set_index('date').resample('D').asfreq()
    expected = df.groupby('id').apply(f)
    result = df.set_index('date').groupby('id').resample('D').asfreq()
    assert_frame_equal(result, expected)

    # same comparison with ffill() on an already date-indexed frame
    df = DataFrame({'date': pd.date_range(start='2016-01-01',
                                          periods=4,
                                          freq='W'),
                    'group': [1, 1, 2, 2],
                    'val': [5, 6, 7, 8]}).set_index('date')

    def f(x):
        return x.resample('1D').ffill()
    expected = df.groupby('group').apply(f)
    result = df.groupby('group').resample('1D').ffill()
    assert_frame_equal(result, expected)
Example #14
0
 def test_cython_agg_nothing_to_agg_with_dates(self):
     """Aggregating a datetime-only column selection must raise
     DataError, since there is nothing numeric to reduce."""
     frame = DataFrame({'a': np.random.randint(0, 5, 50),
                        'b': ['foo', 'bar'] * 25,
                        'dates': pd.date_range('now', periods=50,
                                               freq='T')})
     # BUG fix: tm.assertRaisesRegexp was removed from pandas; use the
     # pytest.raises(match=...) idiom instead.
     import pytest
     with pytest.raises(DataError, match="No numeric types to aggregate"):
         frame.groupby('b').dates.mean()
Example #15
0
def test_aggregate_with_nat(func, fill_value):
    """Check TimeGrouper's aggregation is identical to a normal groupby
    when the key contains NaT.

    The NaT group is dropped by groupby, so the normal result is padded
    with ``fill_value`` at the missing slot before comparing.
    ('var', 'std', 'mean', 'first', 'last' and 'nth' don't work yet.)
    """
    n = 20
    data = np.random.randn(n, 4).astype('int64')
    normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
    normal_df['key'] = [1, 2, np.nan, 4, 5] * 4

    dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
    dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), pd.NaT,
                    datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4

    normal_grouped = normal_df.groupby('key')
    dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D'))

    normal_result = getattr(normal_grouped, func)()
    dt_result = getattr(dt_grouped, func)()

    # Pad the dropped NaN-key group. BUG fix: DataFrame.append was
    # removed from pandas -- build the padded frame with pd.concat.
    pad = DataFrame([[fill_value] * 4], index=[3],
                    columns=['A', 'B', 'C', 'D'])
    expected = pd.concat([normal_result, pad]).sort_index()
    expected.index = date_range(start='2013-01-01', freq='D',
                                periods=5, name='key')
    assert_frame_equal(expected, dt_result)
    assert dt_result.index.name == 'key'
Example #16
0
def test_cython_agg_nothing_to_agg_with_dates():
    # Selecting the datetime column leaves nothing numeric to aggregate,
    # so mean() must raise DataError.
    df = DataFrame({'a': np.random.randint(0, 5, 50),
                    'b': ['foo', 'bar'] * 25,
                    'dates': pd.date_range('now', periods=50, freq='T')})
    with pytest.raises(DataError, match="No numeric types to aggregate"):
        df.groupby('b').dates.mean()
Example #17
0
def test_resample_timegrouper():
    """GH 7227: resample('M') on a datetime index and
    groupby(Grouper(freq='M')) must produce identical monthly counts,
    with or without NaT entries mixed into the dates."""
    # GH 7227
    dates1 = [datetime(2014, 10, 1), datetime(2014, 9, 3),
              datetime(2014, 11, 5), datetime(2014, 9, 5),
              datetime(2014, 10, 8), datetime(2014, 7, 15)]

    # dates2 has NaT interspersed; dates3 has NaT at both ends
    dates2 = dates1[:2] + [pd.NaT] + dates1[2:4] + [pd.NaT] + dates1[4:]
    dates3 = [pd.NaT] + dates1 + [pd.NaT]

    for dates in [dates1, dates2, dates3]:
        df = DataFrame(dict(A=dates, B=np.arange(len(dates))))
        result = df.set_index('A').resample('M').count()
        # August has no observations but must still appear with count 0
        exp_idx = pd.DatetimeIndex(['2014-07-31', '2014-08-31',
                                    '2014-09-30',
                                    '2014-10-31', '2014-11-30'],
                                   freq='M', name='A')
        expected = DataFrame({'B': [1, 0, 2, 2, 1]}, index=exp_idx)
        assert_frame_equal(result, expected)

        # the Grouper path must match the resample path
        result = df.groupby(pd.Grouper(freq='M', key='A')).count()
        assert_frame_equal(result, expected)

        # same comparison with a second value column
        df = DataFrame(dict(A=dates, B=np.arange(len(dates)), C=np.arange(
            len(dates))))
        result = df.set_index('A').resample('M').count()
        expected = DataFrame({'B': [1, 0, 2, 2, 1], 'C': [1, 0, 2, 2, 1]},
                             index=exp_idx, columns=['B', 'C'])
        assert_frame_equal(result, expected)

        result = df.groupby(pd.Grouper(freq='M', key='A')).count()
        assert_frame_equal(result, expected)
Example #18
0
    def test_crosstab_margins(self):
        """crosstab(margins=True): the named 'All' row/column must match
        the corresponding groupby().size() counts."""
        a = np.random.randint(0, 7, size=100)
        b = np.random.randint(0, 3, size=100)
        c = np.random.randint(0, 5, size=100)

        df = DataFrame({"a": a, "b": b, "c": c})

        result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c"), margins=True)

        self.assertEqual(result.index.names, ("a",))
        self.assertEqual(result.columns.names, ["b", "c"])

        # margin column: per-'a' group sizes plus the grand total
        all_cols = result["All", ""]
        exp_cols = df.groupby(["a"]).size().astype("i8")
        exp_cols = exp_cols.append(Series([len(df)], index=["All"]))
        exp_cols.name = ("All", "")

        tm.assert_series_equal(all_cols, exp_cols)

        # BUG fix: .ix was removed from pandas; label lookup is .loc now.
        all_rows = result.loc["All"]
        exp_rows = df.groupby(["b", "c"]).size().astype("i8")
        exp_rows = exp_rows.append(Series([len(df)], index=[("All", "")]))
        exp_rows.name = "All"

        # absent (b, c) combinations count as 0 in the margin row
        exp_rows = exp_rows.reindex(all_rows.index)
        exp_rows = exp_rows.fillna(0).astype(np.int64)
        tm.assert_series_equal(all_rows, exp_rows)
Example #19
0
class Size(object):
    """ASV benchmarks for the cost of groupby().size() with integer,
    datetime and categorical group keys."""

    def setup(self):
        n = 10 ** 5
        offsets = np.random.randint(n, size=n).astype('timedelta64[ns]')
        dates = np.datetime64('now') + offsets
        columns = {'key1': np.random.randint(0, 500, size=n),
                   'key2': np.random.randint(0, 100, size=n),
                   'value1': np.random.randn(n),
                   'value2': np.random.randn(n),
                   'value3': np.random.randn(n),
                   'dates': dates}
        self.df = DataFrame(columns)
        self.draws = Series(np.random.randn(n))
        labels = Series(['foo', 'bar', 'baz', 'qux'] * (n // 4))
        self.cats = labels.astype('category')

    def time_multi_size(self):
        # size() over a two-column group key
        self.df.groupby(['key1', 'key2']).size()

    def time_dt_timegrouper_size(self):
        # suppress warnings from the TimeGrouper path so they don't
        # pollute the benchmark output
        with warnings.catch_warnings(record=True):
            self.df.groupby(TimeGrouper(key='dates', freq='M')).size()

    def time_category_size(self):
        # size() on a Series grouped by an aligned categorical
        self.draws.groupby(self.cats).size()
Example #20
0
 def test_groupby_corner(self):
     # Grouping by a named MultiIndex level on a single-row frame
     # should simply construct without error.
     midx = MultiIndex(levels=[['foo'], ['bar'], ['baz']],
                       labels=[[0], [0], [0]],
                       names=['one', 'two', 'three'])
     df = DataFrame([np.random.rand(4)],
                    columns=['a', 'b', 'c', 'd'],
                    index=midx)
     # should work
     df.groupby(level='three')
Example #21
0
def test_agg_datetimes_mixed():
    """Aggregating keyed by string dates vs. datetime.date objects
    (each with a missing entry) must yield the same number of groups."""
    data = [[1, '2012-01-01', 1.0],
            [2, '2012-01-02', 2.0],
            [3, None, 3.0]]

    df1 = DataFrame({'key': [x[0] for x in data],
                     'date': [x[1] for x in data],
                     'value': [x[2] for x in data]})

    # same rows with the date strings parsed to datetime.date
    # (None stays None)
    data = [[row[0],
             (dt.datetime.strptime(row[1], '%Y-%m-%d').date()
              if row[1] else None),
             row[2]]
            for row in data]

    df2 = DataFrame({'key': [x[0] for x in data],
                     'date': [x[1] for x in data],
                     'value': [x[2] for x in data]})

    df1['weights'] = df1['value'] / df1['value'].sum()
    gb1 = df1.groupby('date').aggregate(np.sum)

    # BUG fix: this previously derived df2's weights from df1's values.
    # The two frames hold identical 'value' columns so the numbers are
    # unchanged, but compute from df2 for consistency.
    df2['weights'] = df2['value'] / df2['value'].sum()
    gb2 = df2.groupby('date').aggregate(np.sum)

    # groupby drops null keys, so both frames yield the same groups
    assert (len(gb1) == len(gb2))
Example #22
0
def test_groupby_as_index_apply(df):
    # GH #4648 and #3417: head() keeps the original row index for both
    # as_index settings, while apply() re-orders rows and (for
    # as_index=True) prepends the group-key level.
    df = DataFrame({'item_id': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'user_id': [1, 2, 1, 1, 3, 1],
                    'time': range(6)})

    g_as = df.groupby('user_id', as_index=True)
    g_not_as = df.groupby('user_id', as_index=False)

    exp = Index([0, 1, 2, 4])
    tm.assert_index_equal(g_as.head(2).index, exp)
    tm.assert_index_equal(g_not_as.head(2).index, exp)

    # apply doesn't maintain the original ordering
    # changed in GH5610 as the as_index=False returns a MI here
    exp_not_as_apply = MultiIndex.from_tuples(
        [(0, 0), (0, 2), (1, 1), (2, 4)])
    exp_as_apply = MultiIndex.from_tuples(
        [(1, 0), (1, 2), (2, 1), (3, 4)], names=['user_id', None])

    tm.assert_index_equal(g_as.apply(lambda x: x.head(2)).index,
                          exp_as_apply)
    tm.assert_index_equal(g_not_as.apply(lambda x: x.head(2)).index,
                          exp_not_as_apply)

    # identity apply with as_index=False keeps the plain original index
    ind = Index(list('abcde'))
    df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind)
    res = df.groupby(0, as_index=False).apply(lambda x: x).index
    tm.assert_index_equal(res, ind)
Example #23
0
    def test_size(self):
        """grouped.size() equals len(group) for every key, agrees with a
        per-group row-count apply, and returns an empty int64 Series
        for an empty frame (GH11699)."""
        grouped = self.df.groupby(['A', 'B'])
        result = grouped.size()
        for key, group in grouped:
            assert result[key] == len(group)

        grouped = self.df.groupby('A')
        result = grouped.size()
        for key, group in grouped:
            assert result[key] == len(group)

        grouped = self.df.groupby('B')
        result = grouped.size()
        for key, group in grouped:
            assert result[key] == len(group)

        # size() must match counting rows per group, sorted or not
        df = DataFrame(np.random.choice(20, (1000, 3)), columns=list('abc'))
        for sort, key in cart_product((False, True), ('a', 'b', ['a', 'b'])):
            left = df.groupby(key, sort=sort).size()
            right = df.groupby(key, sort=sort)['c'].apply(lambda a: a.shape[0])
            assert_series_equal(left, right, check_names=False)

        # GH11699
        df = DataFrame([], columns=['A', 'B'])
        out = Series([], dtype='int64', index=Index([], name='A'))
        assert_series_equal(df.groupby('A').size(), out)
Example #24
0
    def test_cython_agg_boolean(self):
        frame = DataFrame({'a': np.random.randint(0, 5, 50),
                           'b': np.random.randint(0, 2, 50).astype('bool')})
        result = frame.groupby('a')['b'].mean()
        expected = frame.groupby('a')['b'].agg(np.mean)

        assert_series_equal(result, expected)
Example #25
0
    def test_groupby_categorical_index_and_columns(self, observed):
        # GH18432: summing over duplicated categorical column labels.
        labels = ['A', 'B', 'A', 'B']
        categories = ['B', 'A']
        data = np.ones((5, 4), int)
        cat_columns = CategoricalIndex(labels,
                                       categories=categories,
                                       ordered=True)
        expected_data = 2 * np.ones((5, 2), int)

        # observed=True keeps the labels in observed order ['A', 'B'];
        # otherwise the result is reindexed to the declared category
        # order ['B', 'A'].
        label_order = ['A', 'B'] if observed else categories
        expected_columns = CategoricalIndex(label_order,
                                            categories=categories,
                                            ordered=True)

        df = DataFrame(data=data, columns=cat_columns)
        result = df.groupby(axis=1, level=0, observed=observed).sum()
        expected = DataFrame(data=expected_data, columns=expected_columns)
        assert_frame_equal(result, expected)

        # test transposed version
        df = DataFrame(data.T, index=cat_columns)
        result = df.groupby(axis=0, level=0, observed=observed).sum()
        expected = DataFrame(data=expected_data.T, index=expected_columns)
        assert_frame_equal(result, expected)
Example #26
0
def test_ops_general():
    # Each named groupby reduction must match agg() with the
    # equivalent numpy/scalar function.
    ops = [('mean', np.mean),
           ('median', np.median),
           ('std', np.std),
           ('var', np.var),
           ('sum', np.sum),
           ('prod', np.prod),
           ('min', np.min),
           ('max', np.max),
           ('first', lambda x: x.iloc[0]),
           ('last', lambda x: x.iloc[-1]),
           ('count', np.size), ]
    try:
        from scipy.stats import sem
    except ImportError:
        # scipy is optional; skip the 'sem' comparison without it
        pass
    else:
        ops.append(('sem', sem))

    frame = DataFrame(np.random.randn(1000))
    keys = np.random.randint(0, 50, size=1000).astype(float)

    for name, reference in ops:
        result = getattr(frame.groupby(keys), name)().astype(float)
        expected = frame.groupby(keys).agg(reference)
        try:
            tm.assert_frame_equal(result, expected)
        except BaseException as exc:
            # tag the failing operation onto the exception for debugging
            exc.args += ('operation: %s' % name, )
            raise
Example #27
0
    def test_timegrouper_with_reg_groups_freq(self, freq):
        """GH 6764: grouping by [Grouper(freq), 'user_id'] must match a
        per-user resample, whether or not the frame is pre-sorted."""
        # GH 6764 multiple grouping with/without sort
        df = DataFrame({
            'date': pd.to_datetime([
                '20121002', '20121007', '20130130', '20130202', '20130305',
                '20121002', '20121207', '20130130', '20130202', '20130305',
                '20130202', '20130305'
            ]),
            'user_id': [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
            'whole_cost': [1790, 364, 280, 259, 201, 623, 90, 312, 359, 301,
                           359, 801],
            'cost1': [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12]
        }).set_index('date')

        # reference result: per-user resample; min_count=1 leaves empty
        # bins NaN so dropna() removes them before the comparison
        expected = (
            df.groupby('user_id')['whole_cost']
              .resample(freq)
              .sum(min_count=1)  # XXX
              .dropna()
              .reorder_levels(['date', 'user_id'])
              .sort_index()
              .astype('int64')
        )
        expected.name = 'whole_cost'

        # with a pre-sorted index
        result1 = df.sort_index().groupby([pd.Grouper(freq=freq),
                                           'user_id'])['whole_cost'].sum()
        assert_series_equal(result1, expected)

        # without sorting first
        result2 = df.groupby([pd.Grouper(freq=freq), 'user_id'])[
            'whole_cost'].sum()
        assert_series_equal(result2, expected)
Example #28
0
def test_aggregate_normal(resample_method):
    """Check TimeGrouper's aggregation is identical as normal groupby."""

    if resample_method == 'ohlc':
        pytest.xfail(reason='DataError: No numeric types to aggregate')

    data = np.random.randn(20, 4)
    normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
    normal_df['key'] = [1, 2, 3, 4, 5] * 4

    dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
    dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2),
                    datetime(2013, 1, 3), datetime(2013, 1, 4),
                    datetime(2013, 1, 5)] * 4

    normal_grouped = normal_df.groupby('key')
    dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D'))

    # re-index the normal result onto the daily DatetimeIndex the
    # TimeGrouper produces so the two can be compared directly
    expected = getattr(normal_grouped, resample_method)()
    dt_result = getattr(dt_grouped, resample_method)()
    expected.index = date_range(start='2013-01-01', freq='D',
                                periods=5, name='key')
    tm.assert_equal(expected, dt_result)

    # if TimeGrouper is used included, 'nth' doesn't work yet
    # BUG fix: a dangling triple-quote (left over from a commented-out
    # section) made this function an unterminated string literal /
    # syntax error; it has been removed.
Example #29
0
 def test_groupby_max_datetime64(self):
     # GH 5869
     # datetimelike dtype conversion from int
     df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5)))
     expected = df.groupby('A')['A'].apply(lambda x: x.max())
     result = df.groupby('A')['A'].max()
     assert_series_equal(result, expected)
Example #30
0
class LogAggregate:
    """Convenience wrapper computing summary statistics over a dataset,
    optionally grouped by a column.

    Every getter takes ``key='column'`` selecting the column to report,
    and an optional ``group_by='column'`` to aggregate per group.
    """

    def __init__(self, dataset):
        # dataset is anything DataFrame() accepts (dict of columns, ...)
        self.dataset = DataFrame(dataset)

    def _aggregate(self, stat, kwarg):
        # Shared dispatch: apply the named reduction either per group or
        # over the whole dataset, then select the requested column.
        # BUG fix: dict.has_key() was removed in Python 3; use `in`.
        if 'group_by' in kwarg:
            grouped = self.dataset.groupby(kwarg['group_by'])
            return getattr(grouped, stat)()[kwarg['key']]
        return getattr(self.dataset, stat)()[kwarg['key']]

    def get_median(self, *arg, **kwarg):
        return self._aggregate('median', kwarg)

    def get_average(self, *arg, **kwarg):
        return self._aggregate('mean', kwarg)

    def get_min(self, *arg, **kwarg):
        return self._aggregate('min', kwarg)

    def get_max(self, *arg, **kwarg):
        return self._aggregate('max', kwarg)

    def get_count(self, *arg, **kwarg):
        return self._aggregate('count', kwarg)