def test_groupby_categorical_no_compress(self):
    # Group means keyed by a Categorical are compared against group means
    # keyed by the raw codes, re-labelled with the categorical's categories.
    values = Series(np.random.randn(9))

    # case 1: every one of the three categories occurs in the codes
    labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
    grouper = Categorical.from_codes(labels, [0, 1, 2], ordered=True)
    observed = values.groupby(grouper).mean()
    wanted = values.groupby(labels).mean()
    wanted.index = CategoricalIndex(
        wanted.index,
        categories=grouper.categories,
        ordered=grouper.ordered)
    assert_series_equal(observed, wanted)

    # case 2: category 2 is declared but never used by the codes, so the
    # code-based means are reindexed over all categories first
    labels = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3])
    grouper = Categorical.from_codes(labels, [0, 1, 2, 3], ordered=True)
    observed = values.groupby(grouper).mean()
    code_means = values.groupby(labels).mean()
    wanted = code_means.reindex(grouper.categories)
    wanted.index = CategoricalIndex(
        wanted.index,
        categories=grouper.categories,
        ordered=grouper.ordered)
    assert_series_equal(observed, wanted)

    # case 3: grouping a frame on a categorical column; category "d" has
    # no rows and its mean is expected to be NaN
    grouper = Categorical(
        ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
        categories=["a", "b", "c", "d"],
        ordered=True)
    frame = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": grouper})
    frame_means = frame.groupby("b").mean()
    column_means = frame_means["a"].values
    wanted_values = np.array([1, 2, 4, np.nan])
    self.assert_numpy_array_equal(column_means, wanted_values)
def test_groupby_categorical_index(self):
    # Summing after grouping on a categorical key is expected to produce a
    # CategoricalIndex named 'cats', both when the key is the frame's index
    # and when it is an ordinary column.
    levels = ['foo', 'bar', 'baz', 'qux']

    # Shuffle a balanced set of codes so that each of the four levels is
    # guaranteed to occur.  Independent random draws occasionally leave a
    # level unobserved; the code-based expectation below then has fewer
    # than four rows and assigning the four-entry index to it raises.
    codes = np.random.permutation(np.arange(20) % 4)
    cats = Categorical.from_codes(codes, levels, ordered=True)
    df = DataFrame(
        np.repeat(np.arange(20), 4).reshape(-1, 4),
        columns=list('abcd'))
    df['cats'] = cats

    # with a cat index
    result = df.set_index('cats').groupby(level=0).sum()
    expected = df[list('abcd')].groupby(cats.codes).sum()
    expected.index = CategoricalIndex(
        Categorical.from_codes([0, 1, 2, 3], levels, ordered=True),
        name='cats')
    assert_frame_equal(result, expected)

    # with a cat column, should produce a cat index
    result = df.groupby('cats').sum()
    expected = df[list('abcd')].groupby(cats.codes).sum()
    expected.index = CategoricalIndex(
        Categorical.from_codes([0, 1, 2, 3], levels, ordered=True),
        name='cats')
    assert_frame_equal(result, expected)
def test_groupby_categorical_index(self):
    # Checks that grouping on a categorical key and summing gives the same
    # values as grouping on the underlying integer codes, with the result
    # indexed by a CategoricalIndex named 'cats'.
    levels = ['foo', 'bar', 'baz', 'qux']

    # Every level must be observed: the expectation is built from the
    # integer codes and then given a fixed four-entry index, which fails
    # with a length mismatch if any level is missing.  A shuffled,
    # evenly repeated 0..3 pattern keeps the order random while
    # guaranteeing that all four codes are present.
    balanced_codes = np.arange(20) % 4
    codes = np.random.permutation(balanced_codes)
    cats = Categorical.from_codes(codes, levels, ordered=True)

    block = np.repeat(np.arange(20), 4).reshape(-1, 4)
    df = DataFrame(block, columns=list('abcd'))
    df['cats'] = cats

    # with a cat index
    result = df.set_index('cats').groupby(level=0).sum()
    expected = df[list('abcd')].groupby(cats.codes).sum()
    all_levels = Categorical.from_codes([0, 1, 2, 3], levels, ordered=True)
    expected.index = CategoricalIndex(all_levels, name='cats')
    assert_frame_equal(result, expected)

    # with a cat column, should produce a cat index
    result = df.groupby('cats').sum()
    expected = df[list('abcd')].groupby(cats.codes).sum()
    all_levels = Categorical.from_codes([0, 1, 2, 3], levels, ordered=True)
    expected.index = CategoricalIndex(all_levels, name='cats')
    assert_frame_equal(result, expected)
def test_apply_categorical_data(self):
    # GH 10138
    for ordered in (True, False):
        dense_cat = Categorical(list('abc'), ordered=ordered)
        # 'b' is in the categories but not in the list
        sparse_cat = Categorical(
            list('aaa'), categories=['a', 'b'], ordered=ordered)
        payload = np.arange(len(dense_cat))
        frame = DataFrame({
            'missing': sparse_cat,
            'dense': dense_cat,
            'values': payload
        })
        by_both = frame.groupby(['missing', 'dense'])

        # missing category 'b' should still exist in the output index
        outer_level = Categorical(['a', 'b'], ordered=ordered)
        inner_level = Categorical(['a', 'b', 'c'], ordered=ordered)
        full_index = MultiIndex.from_product(
            [outer_level, inner_level], names=['missing', 'dense'])
        full_expected = DataFrame(
            [0, 1, 2, np.nan, np.nan, np.nan],
            index=full_index,
            columns=['values'])

        # the same frame is expected from apply, mean and agg
        via_apply = by_both.apply(lambda x: np.mean(x))
        assert_frame_equal(via_apply, full_expected)
        via_mean = by_both.mean()
        assert_frame_equal(via_mean, full_expected)
        via_agg = by_both.agg(np.mean)
        assert_frame_equal(via_agg, full_expected)

        # but for transform we should still get back the original index
        observed_index = MultiIndex.from_product(
            [['a'], ['a', 'b', 'c']], names=['missing', 'dense'])
        constant_expected = Series(1, index=observed_index)
        constant_result = by_both.apply(lambda x: 1)
        assert_series_equal(constant_result, constant_expected)
def test_groupby_datetime_categorical(self):
    # GH9049: ensure backward compatibility
    date_levels = pd.date_range("2014-01-01", periods=4)
    raw_codes = np.random.randint(0, 4, size=100)
    grouper = Categorical.from_codes(raw_codes, date_levels, ordered=True)
    frame = DataFrame(np.random.randn(100, 4))

    # means keyed by the categorical vs. means keyed by its plain values,
    # the latter re-labelled with an ordered CategoricalIndex
    cat_means = frame.groupby(grouper).mean()
    plain_means = frame.groupby(np.asarray(grouper)).mean()
    plain_means = plain_means.reindex(date_levels)
    plain_means.index = CategoricalIndex(
        plain_means.index, categories=plain_means.index, ordered=True)
    assert_frame_equal(cat_means, plain_means)

    # describe() on the grouped frame vs. describe() on the same rows
    # after arranging rows and labels by category code
    described = frame.groupby(grouper).describe()
    by_code = grouper.codes.argsort()
    sorted_labels = grouper.take_nd(by_code)
    sorted_frame = frame.take(by_code)
    described_sorted = sorted_frame.groupby(sorted_labels).describe()
    described_sorted.index.names = [None, None]
    assert_frame_equal(described, described_sorted)
    tm.assert_index_equal(described.index, described_sorted.index)
    tm.assert_index_equal(
        described.index.get_level_values(0),
        described_sorted.index.get_level_values(0))

    # GH 10460
    # outer level: each of the four levels repeated eight times
    repeated = Categorical.from_codes(
        np.arange(4).repeat(8), date_levels, ordered=True)
    outer_expected = CategoricalIndex(repeated)
    self.assert_index_equal(
        described.index.get_level_values(0), outer_expected)
    # inner level: the eight statistic names, once per level
    stat_names = ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
    inner_expected = Index(stat_names * 4)
    self.assert_index_equal(
        described.index.get_level_values(1), inner_expected)
def test_groupby_categorical(self):
    # Compares mean() and describe() of a frame grouped by an ordered
    # Categorical against the same reductions keyed by plain labels.
    level_names = ["foo", "bar", "baz", "qux"]
    raw_codes = np.random.randint(0, 4, size=100)
    grouper = Categorical.from_codes(raw_codes, level_names, ordered=True)
    frame = DataFrame(np.random.randn(100, 4))

    cat_means = frame.groupby(grouper).mean()
    plain_means = frame.groupby(np.asarray(grouper)).mean()
    # bring the plain-label means into the categorical's level order
    target_index = CategoricalIndex(
        level_names, categories=grouper.categories, ordered=True)
    plain_means = plain_means.reindex(target_index)
    assert_frame_equal(cat_means, plain_means)

    described = frame.groupby(grouper).describe()

    # rebuild the grouping from rows arranged by category code
    by_code = grouper.codes.argsort()
    sorted_labels = np.asarray(grouper).take(by_code)
    sorted_frame = frame.take(by_code)
    sorted_grouper = Categorical(
        sorted_labels,
        ordered=True,
        categories=["foo", "bar", "baz", "qux"])
    described_sorted = sorted_frame.groupby(
        sorted_grouper, sort=False).describe()
    described_sorted.index.names = [None, None]
    assert_frame_equal(described, described_sorted)

    # GH 10460
    # outer level: each of the four levels repeated eight times
    repeated = Categorical.from_codes(
        np.arange(4).repeat(8), level_names, ordered=True)
    outer_expected = CategoricalIndex(repeated)
    self.assert_index_equal(
        described.index.get_level_values(0), outer_expected)
    # inner level: the eight statistic names, once per level
    stat_names = ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
    inner_expected = Index(stat_names * 4)
    self.assert_index_equal(
        described.index.get_level_values(1), inner_expected)
def test_groupby_categorical(self):
    # mean() and describe() on a frame grouped by an ordered Categorical
    # are checked against equivalents built from the plain label values.
    names = ['foo', 'bar', 'baz', 'qux']
    drawn = np.random.randint(0, 4, size=100)
    key = Categorical.from_codes(drawn, names, ordered=True)
    table = DataFrame(np.random.randn(100, 4))

    got_means = table.groupby(key).mean()
    want_means = table.groupby(np.asarray(key)).mean()
    ordered_index = CategoricalIndex(
        names, categories=key.categories, ordered=True)
    want_means = want_means.reindex(ordered_index)
    assert_frame_equal(got_means, want_means)

    by_key = table.groupby(key)
    got_desc = by_key.describe()

    # same rows, arranged by category code and regrouped without sorting
    order = key.codes.argsort()
    labels_in_order = np.asarray(key).take(order)
    rows_in_order = table.take(order)
    key_in_order = Categorical(
        labels_in_order,
        ordered=True,
        categories=['foo', 'bar', 'baz', 'qux'])
    want_desc = rows_in_order.groupby(key_in_order, sort=False).describe()
    want_desc.index.names = [None, None]
    assert_frame_equal(got_desc, want_desc)

    # GH 10460
    level0_cats = Categorical.from_codes(
        np.arange(4).repeat(8), names, ordered=True)
    want_level0 = CategoricalIndex(level0_cats)
    self.assert_index_equal(got_desc.index.get_level_values(0), want_level0)

    want_level1 = Index(
        ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'] * 4)
    self.assert_index_equal(got_desc.index.get_level_values(1), want_level1)
def test_groupby_datetime_categorical(self):
    # GH9049: ensure backward compatibility
    days = pd.date_range('2014-01-01', periods=4)
    drawn = np.random.randint(0, 4, size=100)
    key = Categorical.from_codes(drawn, days, ordered=True)
    table = DataFrame(np.random.randn(100, 4))

    got_means = table.groupby(key).mean()
    want_means = table.groupby(np.asarray(key)).mean()
    want_means = want_means.reindex(days)
    # relabel the reindexed means with an ordered CategoricalIndex whose
    # categories are the reindexed labels themselves
    want_means.index = CategoricalIndex(
        want_means.index, categories=want_means.index, ordered=True)
    assert_frame_equal(got_means, want_means)

    by_key = table.groupby(key)
    got_desc = by_key.describe()

    # same rows and labels, arranged by category code
    order = key.codes.argsort()
    key_in_order = key.take_nd(order)
    rows_in_order = table.take(order)
    want_desc = rows_in_order.groupby(key_in_order).describe()
    want_desc.index.names = [None, None]
    assert_frame_equal(got_desc, want_desc)
    tm.assert_index_equal(got_desc.index, want_desc.index)
    tm.assert_index_equal(
        got_desc.index.get_level_values(0),
        want_desc.index.get_level_values(0))

    # GH 10460
    level0_cats = Categorical.from_codes(
        np.arange(4).repeat(8), days, ordered=True)
    want_level0 = CategoricalIndex(level0_cats)
    self.assert_index_equal(got_desc.index.get_level_values(0), want_level0)

    want_level1 = Index(
        ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'] * 4)
    self.assert_index_equal(got_desc.index.get_level_values(1), want_level1)
def test_groupby_preserve_categorical_dtype(self):
    # GH13743, GH13854
    df = DataFrame({
        'A': [1, 2, 1, 1, 2],
        'B': [10, 16, 22, 28, 34],
        'C1': Categorical(
            list("abaab"), categories=list("bac"), ordered=False),
        'C2': Categorical(
            list("abaab"), categories=list("bac"), ordered=True)
    })

    def check_grouping(keys, full_frame):
        # as_index=False and as_index=True followed by reset_index() are
        # both compared with ``full_frame`` restricted to the result columns
        flat = df.groupby(by=keys, as_index=False).mean()
        indexed = df.groupby(by=keys, as_index=True).mean()
        flattened = indexed.reset_index()
        wanted = full_frame.reindex(columns=flat.columns)
        tm.assert_frame_equal(flat, wanted)
        tm.assert_frame_equal(flattened, wanted)

    # single grouper
    single_full = DataFrame({
        'A': [2.0, 1.0, np.nan],
        'B': [25.0, 20.0, np.nan],
        'C1': Categorical(
            list("bac"), categories=list("bac"), ordered=False),
        'C2': Categorical(
            list("bac"), categories=list("bac"), ordered=True)
    })
    for col in ['C1', 'C2']:
        check_grouping(col, single_full)

    # multiple grouper
    multi_full = DataFrame({
        'A': [1, 1, 1, 2, 2, 2],
        'B': [np.nan, 20.0, np.nan, 25.0, np.nan, np.nan],
        'C1': Categorical(
            list("bacbac"), categories=list("bac"), ordered=False),
        'C2': Categorical(
            list("bacbac"), categories=list("bac"), ordered=True)
    })
    for cols in [['A', 'C1'], ['A', 'C2']]:
        check_grouping(cols, multi_full)
def test_groupby_multi_categorical_as_index(self):
    # GH13204
    df = DataFrame({
        'cat': Categorical([1, 2, 2], [1, 2, 3]),
        'A': [10, 11, 11],
        'B': [101, 102, 103]
    })

    def expected_frame(a_values):
        # every expectation in this test shares the 'cat' and 'B' columns;
        # only the 'A' column differs between scenarios
        return DataFrame(
            {
                'cat': Categorical([1, 1, 2, 2, 3, 3]),
                'A': a_values,
                'B': [101.0, nan, nan, 205.0, nan, nan]
            },
            columns=['cat', 'A', 'B'])

    summed = df.groupby(['cat', 'A'], as_index=False).sum()
    tm.assert_frame_equal(summed, expected_frame([10, 11, 10, 11, 10, 11]))

    # function grouper
    by_function = df.groupby(
        ['cat', lambda r: df.loc[r, 'A']], as_index=False)
    summed = by_function.sum()
    tm.assert_frame_equal(
        summed, expected_frame([10.0, nan, nan, 22.0, nan, nan]))

    # another not in-axis grouper (conflicting names in index)
    named_like_column = Series(['a', 'b', 'b'], name='cat')
    summed = df.groupby(['cat', named_like_column], as_index=False).sum()
    tm.assert_frame_equal(
        summed, expected_frame([10.0, nan, nan, 22.0, nan, nan]))

    # is original index dropped?
    wanted = expected_frame([10, 11, 10, 11, 10, 11])
    group_columns = ['cat', 'A']

    for index_name in [None, 'X', 'B', 'cat']:
        df.index = Index(list("abc"), name=index_name)

        # a FutureWarning is expected only when the index name is also
        # one of the grouping columns
        clashes = (index_name in group_columns and
                   index_name in df.index.names)
        if not clashes:
            summed = df.groupby(group_columns, as_index=False).sum()
        else:
            with tm.assert_produces_warning(FutureWarning,
                                            check_stacklevel=False):
                summed = df.groupby(group_columns, as_index=False).sum()

        tm.assert_frame_equal(summed, wanted, check_index_type=True)
def test_groupby_categorical_two_columns(self):
    # https://github.com/pandas-dev/pandas/issues/8138
    raw = {
        'cat': pd.Categorical(
            ["a", "b", "a", "b"], categories=["a", "b", "c"], ordered=True),
        'ints': [1, 1, 2, 2],
        'val': [10, 20, 30, 40]
    }
    frame = pd.DataFrame(raw)

    # Grouping on a single column
    by_cat = frame.groupby("cat")
    single_means = by_cat.agg('mean')
    single_index = pd.CategoricalIndex(
        ["a", "b", "c"], name="cat", ordered=True)
    single_expected = DataFrame({
        "ints": [1.5, 1.5, np.nan],
        "val": [20, 30, np.nan]
    }, index=single_index)
    tm.assert_frame_equal(single_means, single_expected)

    # Grouping on two columns
    by_cat_and_int = frame.groupby(["cat", "ints"])
    double_means = by_cat_and_int.agg('mean')
    double_flat = DataFrame({
        "val": [10, 30, 20, 40, np.nan, np.nan],
        "cat": pd.Categorical(
            ["a", "a", "b", "b", "c", "c"], ordered=True),
        "ints": [1, 2, 1, 2, 1, 2]
    })
    double_expected = double_flat.set_index(["cat", "ints"])
    tm.assert_frame_equal(double_means, double_expected)

    # GH 10132
    # get_group on a (cat, ints) key must match boolean-mask selection
    for cat_label, int_value in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]:
        picked = by_cat_and_int.get_group((cat_label, int_value))
        mask = (frame['cat'] == cat_label) & (frame['ints'] == int_value)
        assert_frame_equal(picked, frame[mask])

    # second scenario: the categorical key comes from binning a column
    raw = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]}
    frame = pd.DataFrame(raw)
    bins = pd.cut(frame['C1'], [1, 2, 3, 6])
    bins.name = "cat"
    binned_means = frame.groupby([bins, 'C2']).agg('mean')

    bin_labels = Categorical(["(1, 2]", "(2, 3]", "(3, 6]"], ordered=True)
    product_index = MultiIndex.from_product(
        [bin_labels, [1, 2, 3, 4]], names=["cat", "C2"])
    na = np.nan
    binned_expected = DataFrame(
        {
            "C1": [na, na, na, na, 3, 3, na, na, na, na, 4, 5],
            "C3": [na, na, na, na, 10, 100, na, na, na, na, 200, 34]
        },
        index=product_index)
    tm.assert_frame_equal(binned_means, binned_expected)