class TestSeriesAggregate:
    """Checks for ``Series.agg`` and the aggregation-style uses of ``Series.apply``."""

    def test_transform(self, string_series):
        """Shape-preserving functions given as ufunc, list or dict to ``apply``."""
        # numpy floating-point warnings are silenced for the whole check
        with np.errstate(all="ignore"):
            roots = np.sqrt(string_series)
            magnitudes = np.abs(string_series)

            # a bare ufunc returns a Series
            tm.assert_series_equal(string_series.apply(np.sqrt), roots.copy())

            # a one-element list returns a frame with one column per function,
            # whether the function is passed as a callable or by name
            sqrt_frame = roots.to_frame().copy()
            sqrt_frame.columns = ["sqrt"]
            for spec in ([np.sqrt], ["sqrt"]):
                tm.assert_frame_equal(string_series.apply(spec), sqrt_frame)

            # several functions in a list: the columns come out in the order
            # obtained by applying each function to the series and then
            # concatenating the pieces
            side_by_side = pd.concat([roots, magnitudes], axis=1)
            side_by_side.columns = ["sqrt", "absolute"]
            tm.assert_frame_equal(
                string_series.apply([np.sqrt, np.abs]), side_by_side
            )

            # a dict supplies the output labels
            relabelled = pd.concat([roots, magnitudes], axis=1)
            relabelled.columns = ["foo", "bar"]
            relabelled = relabelled.unstack().rename("series")

            outcome = string_series.apply({"foo": np.sqrt, "bar": np.abs})
            tm.assert_series_equal(outcome.reindex_like(relabelled), relabelled)

    def test_transform_and_agg_error(self, string_series):
        """Mixing a transformer with a reducer in one ``agg`` call raises."""
        # each spec pairs a transforming function with an aggregating one
        failing_specs = [
            (["sqrt", "max"], "cannot combine transform and aggregation"),
            (
                {"foo": np.sqrt, "bar": "sum"},
                "cannot perform both aggregation and transformation",
            ),
        ]
        for spec, msg in failing_specs:
            with pytest.raises(ValueError, match=msg), np.errstate(all="ignore"):
                string_series.agg(spec)

    def test_demo(self):
        """Demonstrate the list and dict spellings of ``Series.agg``."""
        ser = Series(range(6), dtype="int64", name="series")

        extremes = ser.agg(["min", "max"])
        tm.assert_series_equal(
            extremes, Series([0, 5], index=["min", "max"], name="series")
        )

        renamed = ser.agg({"foo": "min"})
        tm.assert_series_equal(
            renamed, Series([0], index=["foo"], name="series")
        )

        # nested renaming (dict of lists) is rejected
        msg = "nested renamer is not supported"
        with pytest.raises(SpecificationError, match=msg):
            ser.agg({"foo": ["min", "max"]})

    def test_multiple_aggregators_with_dict_api(self):
        """A dict mapping several labels to lists of reducers is rejected."""
        ser = Series(range(6), dtype="int64", name="series")

        # nested renaming
        nested_spec = {"foo": ["min", "max"], "bar": ["sum", "mean"]}
        msg = "nested renamer is not supported"
        with pytest.raises(SpecificationError, match=msg):
            ser.agg(nested_spec)

    def test_agg_apply_evaluate_lambdas_the_same(self, string_series):
        """``apply`` and ``agg`` agree for element-wise callables."""
        # test that we are evaluating row-by-row first
        # before vectorized evaluation
        for to_text in (lambda value: str(value), str):
            via_apply = string_series.apply(to_text)
            via_agg = string_series.agg(to_text)
            tm.assert_series_equal(via_apply, via_agg)

    def test_with_nested_series(self, datetime_series):
        """A callable returning a Series per element produces a DataFrame."""
        # GH 2316
        # .agg with a reducer and a transform, what to do

        def expand(value):
            return Series([value, value**2], index=["x", "x^2"])

        via_apply = datetime_series.apply(expand)
        expected = DataFrame({"x": datetime_series, "x^2": datetime_series**2})
        tm.assert_frame_equal(via_apply, expected)

        via_agg = datetime_series.agg(expand)
        tm.assert_frame_equal(via_agg, expected)

    def test_replicate_describe(self, string_series):
        """A dict of reducers reproduces ``Series.describe`` entry by entry."""
        # this also tests a result set that is all scalars
        describe_spec = {
            "count": "count",
            "mean": "mean",
            "std": "std",
            "min": "min",
            "25%": lambda x: x.quantile(0.25),
            "50%": "median",
            "75%": lambda x: x.quantile(0.75),
            "max": "max",
        }
        expected = string_series.describe()
        result = string_series.apply(describe_spec)
        tm.assert_series_equal(result, expected)

    def test_reduce(self, string_series):
        """A list of reducer names yields a Series labelled by those names."""
        # reductions with named functions
        reducers = ("sum", "mean")
        result = string_series.agg(list(reducers))
        expected = Series(
            [getattr(string_series, name)() for name in reducers],
            list(reducers),
            name=string_series.name,
        )
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("how", ["agg", "apply"])
    def test_non_callable_aggregates(self, how):
        """Non-callable Series attributes can be requested by name."""
        # test agg using non-callable series attributes
        # GH 39116 - expand to apply
        ser = Series([1, 2, None])
        dispatch = getattr(ser, how)

        # a lone string argument gives the same value as reading ser.<name>
        assert dispatch("size") == ser.size

        # attribute names mixed with callable reducers
        mixed = dispatch(["size", "count", "mean"])
        tm.assert_series_equal(
            mixed, Series({"size": 3.0, "count": 2.0, "mean": 1.5})
        )

    @pytest.mark.parametrize(
        "series, func, expected",
        chain(
            tm.get_cython_table_params(
                Series(dtype=np.float64),
                [
                    ("sum", 0),
                    ("max", np.nan),
                    ("min", np.nan),
                    ("all", True),
                    ("any", False),
                    ("mean", np.nan),
                    ("prod", 1),
                    ("std", np.nan),
                    ("var", np.nan),
                    ("median", np.nan),
                ],
            ),
            tm.get_cython_table_params(
                Series([np.nan, 1, 2, 3]),
                [
                    ("sum", 6),
                    ("max", 3),
                    ("min", 1),
                    ("all", True),
                    ("any", True),
                    ("mean", 2),
                    ("prod", 6),
                    ("std", 1),
                    ("var", 1),
                    ("median", 2),
                ],
            ),
            tm.get_cython_table_params(
                Series(["a", "b", "c"]),
                [
                    ("sum", "abc"),
                    ("max", "c"),
                    ("min", "a"),
                    ("all", "c"),  # see GH12863
                    ("any", "a"),
                ],
            ),
        ),
    )
    def test_agg_cython_table(self, series, func, expected):
        """Reducers from the cython table return the expected scalar."""
        # GH21224
        # test reducing functions in
        # pandas.core.base.SelectionMixin._cython_table
        outcome = series.agg(func)
        if not is_number(expected):
            assert outcome == expected
            return
        # numeric expectations are compared approximately, NaN matching NaN
        assert np.isclose(outcome, expected, equal_nan=True)

    @pytest.mark.parametrize(
        "series, func, expected",
        chain(
            tm.get_cython_table_params(
                Series(dtype=np.float64),
                [
                    ("cumprod", Series([], Index([]), dtype=np.float64)),
                    ("cumsum", Series([], Index([]), dtype=np.float64)),
                ],
            ),
            tm.get_cython_table_params(
                Series([np.nan, 1, 2, 3]),
                [
                    ("cumprod", Series([np.nan, 1, 2, 6])),
                    ("cumsum", Series([np.nan, 1, 3, 6])),
                ],
            ),
            tm.get_cython_table_params(
                Series(["a", "b", "c"]),
                [("cumsum", Series(["a", "ab", "abc"]))],
            ),
        ),
    )
    def test_agg_cython_table_transform(self, series, func, expected):
        """Cumulative functions from the cython table return a Series."""
        # GH21224
        # test transforming functions in
        # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
        outcome = series.agg(func)
        tm.assert_series_equal(outcome, expected)

    @pytest.mark.parametrize(
        "series, func, expected",
        tm.get_cython_table_params(
            Series(["a", "b", "c"]),
            [
                ("mean", TypeError),  # mean raises TypeError
                ("prod", TypeError),
                ("std", TypeError),
                ("var", TypeError),
                ("median", TypeError),
                ("cumprod", TypeError),
            ],
        ),
    )
    def test_agg_cython_table_raises(self, series, func, expected):
        """Numeric-only functions raise on a Series of strings."""
        # GH21224
        msg = r"[Cc]ould not convert|can't multiply sequence by non-int of type"
        # e.g. Series('a b'.split()).cumprod() will raise
        with pytest.raises(expected, match=msg):
            series.agg(func)

    def test_series_apply_no_suffix_index(self):
        """Repeated lambdas keep the plain ``<lambda>`` label in the result."""
        # GH36189
        ser = Series([4] * 3)
        outcome = ser.apply(["sum", lambda x: x.sum(), lambda x: x.sum()])
        labels = ["sum", "<lambda>", "<lambda>"]
        tm.assert_series_equal(outcome, Series([12, 12, 12], index=labels))
return row def transform2(row): if notna(row["C"]) and row["C"].startswith("shin") and row["A"] == "foo": row["D"] = 7 return row msg = "'float' object has no attribute 'startswith'" with pytest.raises(AttributeError, match=msg): data.apply(transform, axis=1) @pytest.mark.parametrize( "df, func, expected", tm.get_cython_table_params( DataFrame([["a", "b"], ["b", "a"]]), [["cumprod", TypeError]] ), ) def test_agg_cython_table_raises_frame(df, func, expected, axis): # GH 21224 msg = "can't multiply sequence by non-int of type 'str'" with pytest.raises(expected, match=msg): df.agg(func, axis=axis) @pytest.mark.parametrize( "series, func, expected", chain( tm.get_cython_table_params( Series("a b c".split()), [
result = getattr(float_frame, how)(op) expected = getattr(np, op)(float_frame) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "series, func, expected", chain( tm.get_cython_table_params( Series(dtype=np.float64), [ ("sum", 0), ("max", np.nan), ("min", np.nan), ("all", True), ("any", False), ("mean", np.nan), ("prod", 1), ("std", np.nan), ("var", np.nan), ("median", np.nan), ], ), tm.get_cython_table_params( Series([np.nan, 1, 2, 3]), [ ("sum", 6), ("max", 3), ("min", 1), ("all", True), ("any", True),
result = df.agg([func]) expected = expected.to_frame("func").T tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "df, func, expected", chain( tm.get_cython_table_params( DataFrame(), [ ("sum", Series(dtype="float64")), ("max", Series(dtype="float64")), ("min", Series(dtype="float64")), ("all", Series(dtype=bool)), ("any", Series(dtype=bool)), ("mean", Series(dtype="float64")), ("prod", Series(dtype="float64")), ("std", Series(dtype="float64")), ("var", Series(dtype="float64")), ("median", Series(dtype="float64")), ], ), tm.get_cython_table_params( DataFrame([[np.nan, 1], [1, 2]]), [ ("sum", Series([1.0, 3])), ("max", Series([1.0, 2])), ("min", Series([1.0, 1])), ("all", Series([True, True])), ("any", Series([True, True])),
result = getattr(s, how)(["size", "count", "mean"]) expected = Series({"size": 3.0, "count": 2.0, "mean": 1.5}) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "series, func, expected", chain( tm.get_cython_table_params( Series(dtype=np.float64), [ ("sum", 0), ("max", np.nan), ("min", np.nan), ("all", True), ("any", False), ("mean", np.nan), ("prod", 1), ("std", np.nan), ("var", np.nan), ("median", np.nan), ], ), tm.get_cython_table_params( Series([np.nan, 1, 2, 3]), [ ("sum", 6), ("max", 3), ("min", 1), ("all", True), ("any", True),
return row def transform2(row): if notna(row["C"]) and row["C"].startswith( "shin") and row["A"] == "foo": row["D"] = 7 return row msg = "'float' object has no attribute 'startswith'" with pytest.raises(AttributeError, match=msg): data.apply(transform, axis=1) @pytest.mark.parametrize( "df, func, expected", tm.get_cython_table_params(DataFrame([["a", "b"], ["b", "a"]]), [["cumprod", TypeError]]), ) def test_agg_cython_table_raises(df, func, expected, axis): # GH 21224 msg = "can't multiply sequence by non-int of type 'str'" with pytest.raises(expected, match=msg): df.agg(func, axis=axis) def test_transform_none_to_type(): # GH#34377 df = DataFrame({"a": [None]}) msg = "Transform function failed" with pytest.raises(ValueError, match=msg): df.transform({"a": int})
class TestDataFrameAggregate:
    """Checks for ``DataFrame.agg``, ``DataFrame.transform`` and ``DataFrame.apply``."""

    def test_agg_transform(self, axis, float_frame):
        """Transforming functions agree between ``apply`` and ``transform``."""
        by_index = axis in {0, "index"}
        other_axis = 1 if by_index else 0

        def with_function_level(frame, func_names):
            # Relabel ``frame`` with a two-level MultiIndex built from the
            # original labels and ``func_names``: on the columns when
            # operating along the index, otherwise on the index.
            if by_index:
                frame.columns = pd.MultiIndex.from_product(
                    [float_frame.columns, func_names]
                )
            else:
                frame.index = pd.MultiIndex.from_product(
                    [float_frame.index, func_names]
                )
            return frame

        # numpy floating-point warnings are silenced for the whole check
        with np.errstate(all="ignore"):
            f_abs = np.abs(float_frame)
            f_sqrt = np.sqrt(float_frame)

            # ufunc
            expected = f_sqrt.copy()
            result = float_frame.transform(np.sqrt, axis=axis)
            tm.assert_frame_equal(result, expected)

            result = float_frame.apply(np.sqrt, axis=axis)
            tm.assert_frame_equal(result, expected)

            # list-like
            expected = with_function_level(f_sqrt.copy(), ["sqrt"])
            result = float_frame.apply([np.sqrt], axis=axis)
            tm.assert_frame_equal(result, expected)

            result = float_frame.transform([np.sqrt], axis=axis)
            tm.assert_frame_equal(result, expected)

            # multiple items in list
            # these are in the order as if we are applying both
            # functions per series and then concatting
            expected = with_function_level(
                zip_frames([f_abs, f_sqrt], axis=other_axis),
                ["absolute", "sqrt"],
            )
            result = float_frame.apply([np.abs, np.sqrt], axis=axis)
            tm.assert_frame_equal(result, expected)

            result = float_frame.transform([np.abs, "sqrt"], axis=axis)
            tm.assert_frame_equal(result, expected)

    def test_transform_and_agg_err(self, axis, float_frame):
        """Transformations and aggregations cannot be combined in one call."""
        # cannot both transform and agg
        msg = "transforms cannot produce aggregated results"
        with pytest.raises(ValueError, match=msg):
            float_frame.transform(["max", "min"], axis=axis)

        msg = "cannot combine transform and aggregation operations"
        with pytest.raises(ValueError, match=msg):
            with np.errstate(all="ignore"):
                float_frame.agg(["max", "sqrt"], axis=axis)

        with pytest.raises(ValueError, match=msg):
            with np.errstate(all="ignore"):
                float_frame.transform(["max", "sqrt"], axis=axis)

        # NOTE(review): the dict-of-lists spelling that mixes a transformer
        # with reducers, e.g. ``{"A": ["abs", "sum"], "B": ["mean", "max"]}``,
        # is not asserted here. Add an explicit ``pytest.raises`` once the
        # expected exception for each axis is confirmed.

    @pytest.mark.parametrize(
        "method", ["abs", "shift", "pct_change", "cumsum", "rank"]
    )
    def test_transform_method_name(self, method):
        """``transform`` with a method name matches calling that method."""
        # GH 19760
        df = DataFrame({"A": [-1, 2]})
        result = df.transform(method)
        expected = operator.methodcaller(method)(df)
        tm.assert_frame_equal(result, expected)

    def test_demo(self):
        """Demonstrate the list and dict spellings of ``DataFrame.agg``."""
        # demonstration tests
        df = DataFrame({"A": range(5), "B": 5})

        result = df.agg(["min", "max"])
        expected = DataFrame(
            {"A": [0, 4], "B": [5, 5]}, columns=["A", "B"], index=["min", "max"]
        )
        tm.assert_frame_equal(result, expected)

        # per-column function lists: cells with no requested function are NaN
        result = df.agg({"A": ["min", "max"], "B": ["sum", "max"]})
        expected = DataFrame(
            {"A": [4.0, 0.0, np.nan], "B": [5.0, np.nan, 25.0]},
            columns=["A", "B"],
            index=["max", "min", "sum"],
        )
        tm.assert_frame_equal(result.reindex_like(expected), expected)

    def test_agg_multiple_mixed_no_warning(self):
        """Aggregating mixed-dtype columns with a list emits no warning."""
        # GH 20909
        mdf = DataFrame(
            {
                "A": [1, 2, 3],
                "B": [1.0, 2.0, 3.0],
                "C": ["foo", "bar", "baz"],
                "D": pd.date_range("20130101", periods=3),
            }
        )
        expected = DataFrame(
            {
                "A": [1, 6],
                "B": [1.0, 6.0],
                "C": ["bar", "foobarbaz"],
                "D": [pd.Timestamp("2013-01-01"), pd.NaT],
            },
            index=["min", "sum"],
        )
        # sorted index
        with tm.assert_produces_warning(None):
            result = mdf.agg(["min", "sum"])

        tm.assert_frame_equal(result, expected)

        with tm.assert_produces_warning(None):
            result = mdf[["D", "C", "B", "A"]].agg(["sum", "min"])

        # For backwards compatibility, the result's index is
        # still sorted by function name, so it's ['min', 'sum']
        # not ['sum', 'min'].
        expected = expected[["D", "C", "B", "A"]]
        tm.assert_frame_equal(result, expected)

    def test_agg_dict_nested_renaming_depr(self):
        """A dict of dicts (nested renamer) is rejected by ``DataFrame.agg``."""
        df = DataFrame({"A": range(5), "B": 5})

        # nested renaming
        msg = r"nested renamer is not supported"
        with pytest.raises(SpecificationError, match=msg):
            df.agg({"A": {"foo": "min"}, "B": {"bar": "max"}})

    def test_agg_reduce(self, axis, float_frame):
        """Reducers given as a list, a dict of names, or a dict of lists."""
        other_axis = 1 if axis in {0, "index"} else 0
        # the first two labels on the opposite axis are used as dict keys
        name1, name2 = float_frame.axes[other_axis].unique()[:2].sort_values()

        # all reducers
        expected = pd.concat(
            [
                float_frame.mean(axis=axis),
                float_frame.max(axis=axis),
                float_frame.sum(axis=axis),
            ],
            axis=1,
        )
        expected.columns = ["mean", "max", "sum"]
        expected = expected.T if axis in {0, "index"} else expected

        result = float_frame.agg(["mean", "max", "sum"], axis=axis)
        tm.assert_frame_equal(result, expected)

        # dict input with scalars
        func = OrderedDict([(name1, "mean"), (name2, "sum")])
        result = float_frame.agg(func, axis=axis)
        expected = Series(
            [
                float_frame.loc(other_axis)[name1].mean(),
                float_frame.loc(other_axis)[name2].sum(),
            ],
            index=[name1, name2],
        )
        tm.assert_series_equal(result, expected)

        # dict input with lists
        func = OrderedDict([(name1, ["mean"]), (name2, ["sum"])])
        result = float_frame.agg(func, axis=axis)
        expected = DataFrame(
            {
                name1: Series(
                    [float_frame.loc(other_axis)[name1].mean()], index=["mean"]
                ),
                name2: Series(
                    [float_frame.loc(other_axis)[name2].sum()], index=["sum"]
                ),
            }
        )
        expected = expected.T if axis in {1, "columns"} else expected
        tm.assert_frame_equal(result, expected)

        # dict input with lists with multiple
        func = OrderedDict([(name1, ["mean", "sum"]), (name2, ["sum", "max"])])
        result = float_frame.agg(func, axis=axis)
        expected = DataFrame(
            OrderedDict(
                [
                    (
                        name1,
                        Series(
                            [
                                float_frame.loc(other_axis)[name1].mean(),
                                float_frame.loc(other_axis)[name1].sum(),
                            ],
                            index=["mean", "sum"],
                        ),
                    ),
                    (
                        name2,
                        Series(
                            [
                                float_frame.loc(other_axis)[name2].sum(),
                                float_frame.loc(other_axis)[name2].max(),
                            ],
                            index=["sum", "max"],
                        ),
                    ),
                ]
            )
        )
        expected = expected.T if axis in {1, "columns"} else expected
        tm.assert_frame_equal(result, expected)

    def test_nuiscance_columns(self):
        """``min``/``sum`` over a frame mixing numeric, string and date columns."""
        # GH 15015
        df = DataFrame(
            {
                "A": [1, 2, 3],
                "B": [1.0, 2.0, 3.0],
                "C": ["foo", "bar", "baz"],
                "D": pd.date_range("20130101", periods=3),
            }
        )

        result = df.agg("min")
        expected = Series(
            [1, 1.0, "bar", pd.Timestamp("20130101")], index=df.columns
        )
        tm.assert_series_equal(result, expected)

        result = df.agg(["min"])
        expected = DataFrame(
            [[1, 1.0, "bar", pd.Timestamp("20130101")]],
            index=["min"],
            columns=df.columns,
        )
        tm.assert_frame_equal(result, expected)

        # the expected "sum" results carry no entry for the datetime column D
        result = df.agg("sum")
        expected = Series([6, 6.0, "foobarbaz"], index=["A", "B", "C"])
        tm.assert_series_equal(result, expected)

        result = df.agg(["sum"])
        expected = DataFrame(
            [[6, 6.0, "foobarbaz"]], index=["sum"], columns=["A", "B", "C"]
        )
        tm.assert_frame_equal(result, expected)

    def test_non_callable_aggregates(self):
        """Non-callable attributes such as ``size`` can be used with ``agg``."""
        # GH 16405
        # 'size' is a property of frame/series
        # validate that this is working
        df = DataFrame(
            {
                "A": [None, 2, 3],
                "B": [1.0, np.nan, 3.0],
                "C": ["foo", None, "bar"],
            }
        )

        # Function aggregate
        result = df.agg({"A": "count"})
        expected = Series({"A": 2})
        tm.assert_series_equal(result, expected)

        # Non-function aggregate
        result = df.agg({"A": "size"})
        expected = Series({"A": 3})
        tm.assert_series_equal(result, expected)

        # Mix function and non-function aggs
        result1 = df.agg(["count", "size"])
        result2 = df.agg(
            {
                "A": ["count", "size"],
                "B": ["count", "size"],
                "C": ["count", "size"],
            }
        )
        expected = DataFrame(
            {
                "A": {"count": 2, "size": 3},
                "B": {"count": 2, "size": 3},
                "C": {"count": 2, "size": 3},
            }
        )
        tm.assert_frame_equal(result1, result2, check_like=True)
        tm.assert_frame_equal(result2, expected, check_like=True)

        # Just functional string arg is same as calling df.arg()
        result = df.agg("count")
        expected = df.count()
        tm.assert_series_equal(result, expected)

        # Just a string attribute arg same as calling df.arg
        result = df.agg("size")
        expected = df.size
        assert result == expected

    def test_agg_listlike_result(self):
        """A user function returning a list per column is kept as-is."""
        # GH-29587 user defined function returning list-likes
        df = DataFrame(
            {
                "A": [2, 2, 3],
                "B": [1.5, np.nan, 1.5],
                "C": ["foo", None, "bar"],
            }
        )

        # the name ``func`` is the row label expected in the list case below
        def func(group_col):
            return list(group_col.dropna().unique())

        result = df.agg(func)
        expected = Series(
            [[2, 3], [1.5], ["foo", "bar"]], index=["A", "B", "C"]
        )
        tm.assert_series_equal(result, expected)

        result = df.agg([func])
        expected = expected.to_frame("func").T
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "df, func, expected",
        chain(
            tm.get_cython_table_params(
                DataFrame(),
                [
                    ("sum", Series(dtype="float64")),
                    ("max", Series(dtype="float64")),
                    ("min", Series(dtype="float64")),
                    ("all", Series(dtype=bool)),
                    ("any", Series(dtype=bool)),
                    ("mean", Series(dtype="float64")),
                    ("prod", Series(dtype="float64")),
                    ("std", Series(dtype="float64")),
                    ("var", Series(dtype="float64")),
                    ("median", Series(dtype="float64")),
                ],
            ),
            tm.get_cython_table_params(
                DataFrame([[np.nan, 1], [1, 2]]),
                [
                    ("sum", Series([1.0, 3])),
                    ("max", Series([1.0, 2])),
                    ("min", Series([1.0, 1])),
                    ("all", Series([True, True])),
                    ("any", Series([True, True])),
                    ("mean", Series([1, 1.5])),
                    ("prod", Series([1.0, 2])),
                    ("std", Series([np.nan, 0.707107])),
                    ("var", Series([np.nan, 0.5])),
                    ("median", Series([1, 1.5])),
                ],
            ),
        ),
    )
    def test_agg_cython_table(self, df, func, expected, axis):
        """Reducers from the cython table return the expected Series."""
        # GH 21224
        # test reducing functions in
        # pandas.core.base.SelectionMixin._cython_table
        result = df.agg(func, axis=axis)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "df, func, expected",
        chain(
            tm.get_cython_table_params(
                DataFrame(), [("cumprod", DataFrame()), ("cumsum", DataFrame())]
            ),
            tm.get_cython_table_params(
                DataFrame([[np.nan, 1], [1, 2]]),
                [
                    ("cumprod", DataFrame([[np.nan, 1], [1, 2]])),
                    ("cumsum", DataFrame([[np.nan, 1], [1, 3]])),
                ],
            ),
        ),
    )
    def test_agg_cython_table_transform(self, df, func, expected, axis):
        """Cumulative functions from the cython table return a DataFrame."""
        # GH 21224
        # test transforming functions in
        # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
        if axis in {1, "columns"}:
            # operating blockwise doesn't let us preserve dtypes
            expected = expected.astype("float64")

        result = df.agg(func, axis=axis)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "df, func, expected",
        tm.get_cython_table_params(
            DataFrame([["a", "b"], ["b", "a"]]), [["cumprod", TypeError]]
        ),
    )
    def test_agg_cython_table_raises(self, df, func, expected, axis):
        """``cumprod`` on a frame of strings raises ``TypeError``."""
        # GH 21224
        msg = "can't multiply sequence by non-int of type 'str'"
        with pytest.raises(expected, match=msg):
            df.agg(func, axis=axis)

    @pytest.mark.parametrize("num_cols", [2, 3, 5])
    def test_frequency_is_original(self, num_cols):
        """``apply`` leaves the ``freq`` of the frame's DatetimeIndex unchanged."""
        # GH 22150
        index = pd.DatetimeIndex(["1950-06-30", "1952-10-24", "1953-05-29"])
        original = index.copy()
        df = DataFrame(1, index=index, columns=range(num_cols))
        # the result is discarded; only the effect on ``index`` is checked
        df.apply(lambda x: x)
        assert index.freq == original.freq

    def test_apply_datetime_tz_issue(self):
        """Row-wise ``apply`` returns the tz-aware row labels unchanged."""
        # GH 29052
        timestamps = [
            pd.Timestamp("2019-03-15 12:34:31.909000+0000", tz="UTC"),
            pd.Timestamp("2019-03-15 12:34:34.359000+0000", tz="UTC"),
            pd.Timestamp("2019-03-15 12:34:34.660000+0000", tz="UTC"),
        ]
        df = DataFrame(data=[0, 1, 2], index=timestamps)
        result = df.apply(lambda x: x.name, axis=1)
        expected = Series(index=timestamps, data=timestamps)

        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "df", [DataFrame({"A": ["a", None], "B": ["c", "d"]})]
    )
    @pytest.mark.parametrize("method", ["min", "max", "sum"])
    def test_consistency_of_aggregates_of_columns_with_missing_values(
        self, df, method
    ):
        """Column order must not change aggregates when a column holds None."""
        # GH 16832
        none_in_first_column_result = getattr(df[["A", "B"]], method)()
        none_in_second_column_result = getattr(df[["B", "A"]], method)()

        tm.assert_series_equal(
            none_in_first_column_result, none_in_second_column_result
        )