def test_mixed_none_concat():
    """Concatenating Modin frames plus a raw pandas frame, all holding
    None values, must match ``pandas.concat`` on the same inputs."""
    left, right = generate_none_dfs()
    extra = left.copy()
    frames = [from_pandas(left), from_pandas(right), extra]
    df_equals(pd.concat(frames), pandas.concat([left, right, extra]))
def test_ray_concat():
    """Row-wise concat of two Modin frames must equal pandas' result."""
    left, right = generate_dfs()
    modin_left, modin_right = from_pandas(left), from_pandas(right)
    assert modin_df_equals_pandas(
        pd.concat([modin_left, modin_right]),
        pandas.concat([left, right]),
    )
def test_mixed_inner_concat():
    """Inner-join concat over a mix of Modin and pandas frames."""
    left, right = generate_dfs()
    extra = left.copy()
    frames = [from_pandas(left), from_pandas(right), extra]
    assert modin_df_equals_pandas(
        pd.concat(frames, join="inner"),
        pandas.concat([left, right, extra], join="inner"),
    )
def test_mixed_inner_concat():
    """Inner-join concat of 2-partition Modin frames plus a pandas frame.

    NOTE(review): duplicates a function name used earlier in this file —
    only the last definition is collected by pytest; confirm intent.
    """
    first, second = generate_dfs()
    third = first.copy()
    frames = [from_pandas(first, 2), from_pandas(second, 2), third]
    assert ray_df_equals_pandas(
        pd.concat(frames, join="inner"),
        pandas.concat([first, second, third], join="inner"),
    )
def test_ray_concat_on_column():
    """Concat along axis=1 and its alias axis='columns' must match pandas."""
    first, second = generate_dfs()
    ray_first, ray_second = from_pandas(first, 2), from_pandas(second, 2)
    for column_axis in (1, "columns"):
        assert ray_df_equals_pandas(
            pd.concat([ray_first, ray_second], axis=column_axis),
            pandas.concat([first, second], axis=column_axis),
        )
def test_ray_concat_on_column():
    """Column-wise concat: axis=1 and axis='columns' behave identically."""
    left, right = generate_dfs()
    modin_left, modin_right = from_pandas(left), from_pandas(right)
    for column_axis in (1, "columns"):
        assert modin_df_equals_pandas(
            pd.concat([modin_left, modin_right], axis=column_axis),
            pandas.concat([left, right], axis=column_axis),
        )
def test_ray_concat_on_index():
    """All row-axis spellings ('index', 'rows', 0) must give the same result."""
    first, second = generate_dfs()
    ray_first, ray_second = from_pandas(first, 2), from_pandas(second, 2)
    for row_axis in ("index", "rows", 0):
        assert ray_df_equals_pandas(
            pd.concat([ray_first, ray_second], axis=row_axis),
            pandas.concat([first, second], axis=row_axis),
        )
def test_sort_order(sort, join, axis):
    """``concat``'s ``sort`` flag must order columns exactly like pandas.

    ``axis`` is accepted for parametrization compatibility but is not
    used in the body.
    """
    pandas_first = pandas.DataFrame({"c": [3], "d": [4]}, columns=["d", "c"])
    pandas_second = pandas.DataFrame({"a": [1], "b": [2]}, columns=["b", "a"])
    modin_first = from_pandas(pandas_first)
    modin_second = from_pandas(pandas_second)
    expected = pandas.concat([pandas_first, pandas_second], join=join, sort=sort)
    actual = pd.concat([modin_first, modin_second], join=join, sort=sort)
    df_equals(expected, actual)
    # Element-wise equality is not enough: column ORDER must match too.
    assert list(expected.columns) == list(actual.columns)
def test_ray_concat_on_index():
    """Row-axis aliases ('index', 'rows', 0) all match pandas output."""
    left, right = generate_dfs()
    ray_left, ray_right = from_pandas(left), from_pandas(right)
    for row_axis in ("index", "rows", 0):
        assert ray_df_equals_pandas(
            pd.concat([ray_left, ray_right], axis=row_axis),
            pandas.concat([left, right], axis=row_axis),
        )
def test_concat_dictionary(axis):
    """Concatenating a mapping of frames must produce pandas' keyed result."""
    pandas_left, pandas_right = generate_dfs()
    modin_left, modin_right = from_pandas(pandas_left), from_pandas(pandas_right)
    modin_mapping = {"A": modin_left, "B": modin_right}
    pandas_mapping = {"A": pandas_left, "B": pandas_right}
    df_equals(
        pd.concat(modin_mapping, axis=axis),
        pandas.concat(pandas_mapping, axis=axis),
    )
def test_ray_concat_with_series():
    """Concat mixing Modin frames with a raw pandas Series, both axes."""
    left, right = generate_dfs()
    modin_left, modin_right = from_pandas(left), from_pandas(right)
    pandas_series = pandas.Series([1, 2, 3, 4], name="new_col")
    for concat_axis in (0, 1):
        assert modin_df_equals_pandas(
            pd.concat([modin_left, modin_right, pandas_series], axis=concat_axis),
            pandas.concat([left, right, pandas_series], axis=concat_axis),
        )
def test_shift_freq(groupby_axis, shift_axis):
    """``groupby(...).shift(freq=...)`` must match pandas on either axis."""
    pandas_df = pandas.DataFrame(
        {
            "col1": [1, 0, 2, 3],
            "col2": [4, 5, np.NaN, 7],
            "col3": [np.NaN, np.NaN, 12, 10],
            "col4": [17, 13, 16, 15],
        }
    )
    modin_df = from_pandas(pandas_df)
    datetime_index = pandas.date_range("1/12/2020", periods=4, freq="S")
    # Shifting with a freq needs a datetime-like axis; both cases set the index.
    pandas_df.index = modin_df.index = datetime_index
    if groupby_axis == 0 and shift_axis == 0:
        by_variants = [["col2", "col3"], ["col2"], ["col4"], [0, 1, 0, 2]]
    else:
        # Column-wise shifting additionally needs datetime-like column labels.
        pandas_df.columns = modin_df.columns = datetime_index
        by_variants = [[0, 1, 0, 2]]
    for grouping in by_variants:
        pandas_groupby = pandas_df.groupby(by=grouping, axis=groupby_axis)
        modin_groupby = modin_df.groupby(by=grouping, axis=groupby_axis)
        eval_general(
            modin_groupby,
            pandas_groupby,
            lambda groupby: groupby.shift(axis=shift_axis, freq="S"),
        )
def test_multi_column_groupby():
    """Multi-column groupby warns (falls back to pandas) but still works;
    grouping on row labels along axis=1 raises KeyError."""
    pandas_df = pandas.DataFrame(
        {
            "col{}".format(i): np.random.randint(0, 100, size=1000)
            for i in range(1, 6)
        },
        index=["row{}".format(i) for i in range(1000)],
    )
    ray_df = from_pandas(pandas_df)
    by = ["col1", "col2"]
    with pytest.warns(UserWarning):
        ray_df.groupby(by).count()
    with pytest.warns(UserWarning):
        # Iterating the groupby yields (group-key, frame) pairs; multi-column
        # keys come back as tuples.
        for key, _ in ray_df.groupby(by):
            assert isinstance(key, tuple)
    # Row labels are not valid column-axis group keys.
    with pytest.raises(KeyError):
        ray_df.groupby(["row0", "row1"], axis=1).count()
def test_ray_concat_on_index():
    """Row-axis aliases ('index', 'rows', 0) all produce pandas-equal output."""
    left, right = generate_dfs()
    modin_left, modin_right = from_pandas(left), from_pandas(right)
    for row_axis in ("index", "rows", 0):
        df_equals(
            pd.concat([modin_left, modin_right], axis=row_axis),
            pandas.concat([left, right], axis=row_axis),
        )
def test_concat_on_column():
    """Column-wise concat (axis=1 / axis='columns'), plus an
    ``ignore_index`` case whose dtypes must match pandas exactly."""
    left, right = generate_dfs()
    modin_left, modin_right = from_pandas(left), from_pandas(right)
    for column_axis in (1, "columns"):
        df_equals(
            pd.concat([modin_left, modin_right], axis=column_axis),
            pandas.concat([left, right], axis=column_axis),
        )
    # ignore_index=True with Series inputs: labels reset, dtypes preserved.
    modin_result = pd.concat(
        [pd.Series(np.ones(10)), pd.Series(np.ones(10))],
        axis=1,
        ignore_index=True,
    )
    pandas_result = pandas.concat(
        [pandas.Series(np.ones(10)), pandas.Series(np.ones(10))],
        axis=1,
        ignore_index=True,
    )
    df_equals(modin_result, pandas_result)
    assert modin_result.dtypes.equals(pandas_result.dtypes)
def test_agg_func_None_rename():
    """Named aggregation (``agg(name=(col, func))``) must match pandas."""
    pandas_df = pandas.DataFrame(
        {
            "col{}".format(i): np.random.randint(0, 100, size=1000)
            for i in range(1, 5)
        },
        index=["row{}".format(i) for i in range(1000)],
    )
    modin_df = from_pandas(pandas_df)
    named_aggs = {"max": ("col3", np.max), "min": ("col3", np.min)}
    modin_result = modin_df.groupby(["col1", "col2"]).agg(**named_aggs)
    pandas_result = pandas_df.groupby(["col1", "col2"]).agg(**named_aggs)
    df_equals(modin_result, pandas_result)
def test_mixed_dtypes_groupby():
    """Group a frame whose columns alternate int/string dtypes by a string
    column, then run the full battery of groupby comparison helpers
    against pandas.
    """
    frame_data = np.random.randint(97, 198, size=(2 ** 6, 2 ** 4))
    pandas_df = pandas.DataFrame(frame_data).add_prefix("col")
    # Convert every other column to string
    for col in pandas_df.iloc[
        :, [i for i in range(len(pandas_df.columns)) if i % 2 == 0]
    ]:
        pandas_df[col] = [str(chr(i)) for i in pandas_df[col]]
    ray_df = from_pandas(pandas_df)
    n = 1  # row count used by the head/tail checks below
    ray_groupby = ray_df.groupby(by="col1")
    pandas_groupby = pandas_df.groupby(by="col1")
    ray_groupby_equals_pandas(ray_groupby, pandas_groupby)
    test_ngroups(ray_groupby, pandas_groupby)
    test_skew(ray_groupby, pandas_groupby)
    test_ffill(ray_groupby, pandas_groupby)
    test_sem(ray_groupby, pandas_groupby)
    test_mean(ray_groupby, pandas_groupby)
    test_any(ray_groupby, pandas_groupby)
    test_min(ray_groupby, pandas_groupby)
    test_idxmax(ray_groupby, pandas_groupby)
    test_ndim(ray_groupby, pandas_groupby)
    test_cumsum(ray_groupby, pandas_groupby)
    test_pct_change(ray_groupby, pandas_groupby)
    test_cummax(ray_groupby, pandas_groupby)
    # TODO Add more apply functions
    apply_functions = [lambda df: df.sum(), min]
    for func in apply_functions:
        test_apply(ray_groupby, pandas_groupby, func)
    test_dtypes(ray_groupby, pandas_groupby)
    test_first(ray_groupby, pandas_groupby)
    test_backfill(ray_groupby, pandas_groupby)
    test_cummin(ray_groupby, pandas_groupby)
    test_bfill(ray_groupby, pandas_groupby)
    test_idxmin(ray_groupby, pandas_groupby)
    test_prod(ray_groupby, pandas_groupby)
    test_std(ray_groupby, pandas_groupby)
    # Aggregations are run both through .agg and .aggregate spellings.
    agg_functions = ["min", "max"]
    for func in agg_functions:
        test_agg(ray_groupby, pandas_groupby, func)
        test_aggregate(ray_groupby, pandas_groupby, func)
    test_last(ray_groupby, pandas_groupby)
    test_mad(ray_groupby, pandas_groupby)
    test_max(ray_groupby, pandas_groupby)
    test_var(ray_groupby, pandas_groupby)
    test_len(ray_groupby, pandas_groupby)
    test_sum(ray_groupby, pandas_groupby)
    test_ngroup(ray_groupby, pandas_groupby)
    test_nunique(ray_groupby, pandas_groupby)
    test_median(ray_groupby, pandas_groupby)
    test_head(ray_groupby, pandas_groupby, n)
    test_cumprod(ray_groupby, pandas_groupby)
    test_cov(ray_groupby, pandas_groupby)
    transform_functions = [lambda df: df + 4, lambda df: -df - 10]
    for func in transform_functions:
        test_transform(ray_groupby, pandas_groupby, func)
    pipe_functions = [lambda dfgb: dfgb.sum()]
    for func in pipe_functions:
        test_pipe(ray_groupby, pandas_groupby, func)
    test_corr(ray_groupby, pandas_groupby)
    test_fillna(ray_groupby, pandas_groupby)
    test_count(ray_groupby, pandas_groupby)
    test_tail(ray_groupby, pandas_groupby, n)
    test_quantile(ray_groupby, pandas_groupby)
    test_take(ray_groupby, pandas_groupby)
    test___getattr__(ray_groupby, pandas_groupby)
    test_groups(ray_groupby, pandas_groupby)
def test_series_groupby(by, as_index_series_or_dataframe):
    """Compare Series (or single-column DataFrame) groupby against pandas.

    ``as_index_series_or_dataframe``: 0/1 exercise a plain Series (with
    as_index False/True); any other value exercises a DataFrame groupby
    projected down to its "col1" column.
    """
    if as_index_series_or_dataframe <= 1:
        as_index = as_index_series_or_dataframe == 1
        series_data = np.random.randint(97, 198, size=2 ** 8)
        modin_series = pd.Series(series_data)
        pandas_series = pandas.Series(series_data)
    else:
        as_index = True
        pandas_series = pandas.DataFrame(
            {
                "col1": [0, 1, 2, 3],
                "col2": [4, 5, 6, 7],
                "col3": [3, 8, 12, 10],
                "col4": [17, 13, 16, 15],
                "col5": [-4, -5, -6, -7],
            }
        )
        modin_series = from_pandas(pandas_series)
    # An ndarray (or None) "by" is replaced with a fresh random key array
    # sized to the data.
    if isinstance(by, np.ndarray) or by is None:
        by = np.random.randint(0, 100, size=len(pandas_series.index))
    n = 1  # row count used by head/tail checks below
    try:
        pandas_groupby = pandas_series.groupby(by, as_index=as_index)
        if as_index_series_or_dataframe == 2:
            pandas_groupby = pandas_groupby["col1"]
    except Exception as e:
        # If pandas rejects this grouping, Modin must raise the same type.
        with pytest.raises(type(e)):
            modin_series.groupby(by, as_index=as_index)
    else:
        modin_groupby = modin_series.groupby(by, as_index=as_index)
        if as_index_series_or_dataframe == 2:
            modin_groupby = modin_groupby["col1"]
        modin_groupby_equals_pandas(modin_groupby, pandas_groupby)
        eval_ngroups(modin_groupby, pandas_groupby)
        eval_general(
            modin_groupby, pandas_groupby, lambda df: df.ffill(), is_default=True
        )
        eval_general(
            modin_groupby,
            pandas_groupby,
            lambda df: df.sem(),
            modin_df_almost_equals_pandas,
            is_default=True,
        )
        eval_mean(modin_groupby, pandas_groupby)
        eval_any(modin_groupby, pandas_groupby)
        eval_min(modin_groupby, pandas_groupby)
        eval_general(
            modin_groupby, pandas_groupby, lambda df: df.idxmax(), is_default=True
        )
        eval_ndim(modin_groupby, pandas_groupby)
        eval_cumsum(modin_groupby, pandas_groupby)
        eval_general(
            modin_groupby,
            pandas_groupby,
            lambda df: df.pct_change(),
            modin_df_almost_equals_pandas,
            is_default=True,
        )
        eval_cummax(modin_groupby, pandas_groupby)
        apply_functions = [lambda df: df.sum(), min]
        for func in apply_functions:
            eval_apply(modin_groupby, pandas_groupby, func)
        eval_general(
            modin_groupby, pandas_groupby, lambda df: df.first(), is_default=True
        )
        eval_general(
            modin_groupby, pandas_groupby, lambda df: df.backfill(), is_default=True
        )
        eval_cummin(modin_groupby, pandas_groupby)
        eval_general(
            modin_groupby, pandas_groupby, lambda df: df.bfill(), is_default=True
        )
        eval_general(
            modin_groupby, pandas_groupby, lambda df: df.idxmin(), is_default=True
        )
        eval_prod(modin_groupby, pandas_groupby)
        # Spread-style statistics are only checked in the as_index=True case.
        if as_index:
            eval_std(modin_groupby, pandas_groupby)
            eval_var(modin_groupby, pandas_groupby)
            eval_skew(modin_groupby, pandas_groupby)
        agg_functions = ["min", "max"]
        for func in agg_functions:
            eval_agg(modin_groupby, pandas_groupby, func)
            eval_aggregate(modin_groupby, pandas_groupby, func)
        eval_general(
            modin_groupby, pandas_groupby, lambda df: df.last(), is_default=True
        )
        eval_general(
            modin_groupby,
            pandas_groupby,
            lambda df: df.mad(),
            modin_df_almost_equals_pandas,
            is_default=True,
        )
        eval_rank(modin_groupby, pandas_groupby)
        eval_max(modin_groupby, pandas_groupby)
        eval_len(modin_groupby, pandas_groupby)
        eval_sum(modin_groupby, pandas_groupby)
        eval_ngroup(modin_groupby, pandas_groupby)
        eval_nunique(modin_groupby, pandas_groupby)
        eval_median(modin_groupby, pandas_groupby)
        eval_general(
            modin_groupby, pandas_groupby, lambda df: df.head(n), is_default=True
        )
        eval_cumprod(modin_groupby, pandas_groupby)
        transform_functions = [lambda df: df + 4, lambda df: -df - 10]
        for func in transform_functions:
            eval_transform(modin_groupby, pandas_groupby, func)
        pipe_functions = [lambda dfgb: dfgb.sum()]
        for func in pipe_functions:
            eval_pipe(modin_groupby, pandas_groupby, func)
        eval_fillna(modin_groupby, pandas_groupby)
        eval_count(modin_groupby, pandas_groupby)
        eval_general(
            modin_groupby, pandas_groupby, lambda df: df.tail(n), is_default=True
        )
        eval_quantile(modin_groupby, pandas_groupby)
        eval_general(
            modin_groupby, pandas_groupby, lambda df: df.take(), is_default=True
        )
        eval_groups(modin_groupby, pandas_groupby)
def test_simple_col_groupby():
    """Group along the column axis (axis=1) and compare every supported
    groupby operation with pandas.
    """
    pandas_df = pandas.DataFrame(
        {
            "col1": [0, 3, 2, 3],
            "col2": [4, 1, 6, 7],
            "col3": [3, 8, 2, 10],
            "col4": [1, 13, 6, 15],
            "col5": [-4, 5, 6, -7],
        }
    )
    modin_df = from_pandas(pandas_df)
    # One label per column: columns with equal labels form a group.
    by = [1, 2, 3, 2, 1]
    modin_groupby = modin_df.groupby(axis=1, by=by)
    pandas_groupby = pandas_df.groupby(axis=1, by=by)
    modin_groupby_equals_pandas(modin_groupby, pandas_groupby)
    eval_ngroups(modin_groupby, pandas_groupby)
    eval_skew(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.ffill(), is_default=True)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.sem(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_mean(modin_groupby, pandas_groupby)
    eval_any(modin_groupby, pandas_groupby)
    eval_min(modin_groupby, pandas_groupby)
    eval_ndim(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.idxmax(), is_default=True)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.idxmin(), is_default=True)
    eval_quantile(modin_groupby, pandas_groupby)
    # cum* checks are disabled for axis=1 due to an upstream pandas bug:
    # https://github.com/pandas-dev/pandas/issues/21127
    # eval_cumsum(modin_groupby, pandas_groupby)
    # eval_cummax(modin_groupby, pandas_groupby)
    # eval_cummin(modin_groupby, pandas_groupby)
    # eval_cumprod(modin_groupby, pandas_groupby)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.pct_change(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    apply_functions = [lambda df: -df, lambda df: df.sum(axis=1)]
    for func in apply_functions:
        eval_apply(modin_groupby, pandas_groupby, func)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.first(), is_default=True)
    eval_general(
        modin_groupby, pandas_groupby, lambda df: df.backfill(), is_default=True
    )
    eval_general(modin_groupby, pandas_groupby, lambda df: df.bfill(), is_default=True)
    eval_prod(modin_groupby, pandas_groupby)
    eval_std(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.last(), is_default=True)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.mad(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_max(modin_groupby, pandas_groupby)
    eval_var(modin_groupby, pandas_groupby)
    eval_len(modin_groupby, pandas_groupby)
    eval_sum(modin_groupby, pandas_groupby)
    # Pandas fails on this case with ValueError
    # eval_ngroup(modin_groupby, pandas_groupby)
    # eval_nunique(modin_groupby, pandas_groupby)
    eval_median(modin_groupby, pandas_groupby)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.cov(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    transform_functions = [lambda df: df + 4, lambda df: -df - 10]
    for func in transform_functions:
        eval_transform(modin_groupby, pandas_groupby, func)
    pipe_functions = [lambda dfgb: dfgb.sum()]
    for func in pipe_functions:
        eval_pipe(modin_groupby, pandas_groupby, func)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.corr(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_fillna(modin_groupby, pandas_groupby)
    eval_count(modin_groupby, pandas_groupby)
    eval_size(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.take(), is_default=True)
    eval_groups(modin_groupby, pandas_groupby)
def test_mixed_dtypes_groupby(as_index):
    """Group a mixed int/string frame by several kinds of keys (column
    name, callable, Series) and compare every operation with pandas.

    ``by_values`` holds tuples: element 0 is the Modin-side key, element
    -1 is the pandas-side key (they differ only for the Series case).
    """
    frame_data = np.random.randint(97, 198, size=(2 ** 6, 2 ** 4))
    pandas_df = pandas.DataFrame(frame_data).add_prefix("col")
    # Convert every other column to string
    for col in pandas_df.iloc[
        :, [i for i in range(len(pandas_df.columns)) if i % 2 == 0]
    ]:
        pandas_df[col] = [str(chr(i)) for i in pandas_df[col]]
    modin_df = from_pandas(pandas_df)
    n = 1  # row count used by head/tail checks below
    by_values = [
        ("col1",),
        (lambda x: x % 2,),
        (modin_df["col0"].copy(), pandas_df["col0"].copy()),
    ]
    for by in by_values:
        modin_groupby = modin_df.groupby(by=by[0], as_index=as_index)
        pandas_groupby = pandas_df.groupby(by=by[-1], as_index=as_index)
        modin_groupby_equals_pandas(modin_groupby, pandas_groupby)
        eval_ngroups(modin_groupby, pandas_groupby)
        eval_general(
            modin_groupby, pandas_groupby, lambda df: df.ffill(), is_default=True
        )
        eval_general(
            modin_groupby,
            pandas_groupby,
            lambda df: df.sem(),
            modin_df_almost_equals_pandas,
            is_default=True,
        )
        eval_mean(modin_groupby, pandas_groupby)
        eval_any(modin_groupby, pandas_groupby)
        eval_min(modin_groupby, pandas_groupby)
        eval_general(
            modin_groupby, pandas_groupby, lambda df: df.idxmax(), is_default=True
        )
        eval_ndim(modin_groupby, pandas_groupby)
        eval_cumsum(modin_groupby, pandas_groupby)
        eval_general(
            modin_groupby,
            pandas_groupby,
            lambda df: df.pct_change(),
            modin_df_almost_equals_pandas,
            is_default=True,
        )
        eval_cummax(modin_groupby, pandas_groupby)
        # TODO Add more apply functions
        apply_functions = [lambda df: df.sum(), min]
        # Workaround for Pandas bug #34656. Recreate groupby object for Pandas
        pandas_groupby = pandas_df.groupby(by=by[-1], as_index=as_index)
        for func in apply_functions:
            eval_apply(modin_groupby, pandas_groupby, func)
        eval_dtypes(modin_groupby, pandas_groupby)
        eval_general(
            modin_groupby, pandas_groupby, lambda df: df.first(), is_default=True
        )
        eval_general(
            modin_groupby, pandas_groupby, lambda df: df.backfill(), is_default=True
        )
        eval_cummin(modin_groupby, pandas_groupby)
        eval_general(
            modin_groupby, pandas_groupby, lambda df: df.bfill(), is_default=True
        )
        eval_general(
            modin_groupby, pandas_groupby, lambda df: df.idxmin(), is_default=True
        )
        eval_prod(modin_groupby, pandas_groupby)
        # Spread-style statistics are only checked in the as_index=True case.
        if as_index:
            eval_std(modin_groupby, pandas_groupby)
            eval_var(modin_groupby, pandas_groupby)
            eval_skew(modin_groupby, pandas_groupby)
        agg_functions = ["min", "max"]
        for func in agg_functions:
            eval_agg(modin_groupby, pandas_groupby, func)
            eval_aggregate(modin_groupby, pandas_groupby, func)
        eval_general(
            modin_groupby, pandas_groupby, lambda df: df.last(), is_default=True
        )
        eval_general(
            modin_groupby,
            pandas_groupby,
            lambda df: df.mad(),
            modin_df_almost_equals_pandas,
            is_default=True,
        )
        eval_max(modin_groupby, pandas_groupby)
        eval_len(modin_groupby, pandas_groupby)
        eval_sum(modin_groupby, pandas_groupby)
        eval_ngroup(modin_groupby, pandas_groupby)
        eval_nunique(modin_groupby, pandas_groupby)
        eval_median(modin_groupby, pandas_groupby)
        eval_general(
            modin_groupby, pandas_groupby, lambda df: df.head(n), is_default=True
        )
        eval_cumprod(modin_groupby, pandas_groupby)
        eval_general(
            modin_groupby,
            pandas_groupby,
            lambda df: df.cov(),
            modin_df_almost_equals_pandas,
            is_default=True,
        )
        transform_functions = [lambda df: df, lambda df: df + df]
        for func in transform_functions:
            eval_transform(modin_groupby, pandas_groupby, func)
        pipe_functions = [lambda dfgb: dfgb.sum()]
        for func in pipe_functions:
            eval_pipe(modin_groupby, pandas_groupby, func)
        eval_general(
            modin_groupby,
            pandas_groupby,
            lambda df: df.corr(),
            modin_df_almost_equals_pandas,
        )
        eval_fillna(modin_groupby, pandas_groupby)
        eval_count(modin_groupby, pandas_groupby)
        eval_general(
            modin_groupby, pandas_groupby, lambda df: df.tail(n), is_default=True
        )
        eval_quantile(modin_groupby, pandas_groupby)
        eval_general(
            modin_groupby, pandas_groupby, lambda df: df.take(), is_default=True
        )
        eval___getattr__(modin_groupby, pandas_groupby, "col2")
        eval_groups(modin_groupby, pandas_groupby)
def test_large_row_groupby():
    """Row groupby on a 100x4 random integer frame, keyed by stringified
    values of column A; runs the full groupby comparison battery.
    """
    pandas_df = pandas.DataFrame(
        np.random.randint(0, 8, size=(100, 4)), columns=list("ABCD")
    )
    modin_df = from_pandas(pandas_df)
    by = [str(i) for i in pandas_df["A"].tolist()]
    n = 4  # row count used by head/tail checks below
    modin_groupby = modin_df.groupby(by=by)
    pandas_groupby = pandas_df.groupby(by=by)
    modin_groupby_equals_pandas(modin_groupby, pandas_groupby)
    eval_ngroups(modin_groupby, pandas_groupby)
    eval_skew(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.ffill(), is_default=True)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.sem(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_mean(modin_groupby, pandas_groupby)
    eval_any(modin_groupby, pandas_groupby)
    eval_min(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.idxmax(), is_default=True)
    eval_ndim(modin_groupby, pandas_groupby)
    eval_cumsum(modin_groupby, pandas_groupby)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.pct_change(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_cummax(modin_groupby, pandas_groupby)
    apply_functions = [lambda df: df.sum(), lambda df: -df]
    for func in apply_functions:
        eval_apply(modin_groupby, pandas_groupby, func)
    eval_dtypes(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.first(), is_default=True)
    eval_general(
        modin_groupby, pandas_groupby, lambda df: df.backfill(), is_default=True
    )
    eval_cummin(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.bfill(), is_default=True)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.idxmin(), is_default=True)
    # eval_prod(modin_groupby, pandas_groupby) causes overflows
    eval_std(modin_groupby, pandas_groupby)
    agg_functions = ["min", "max"]
    for func in agg_functions:
        eval_agg(modin_groupby, pandas_groupby, func)
        eval_aggregate(modin_groupby, pandas_groupby, func)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.last(), is_default=True)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.mad(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_rank(modin_groupby, pandas_groupby)
    eval_max(modin_groupby, pandas_groupby)
    eval_var(modin_groupby, pandas_groupby)
    eval_len(modin_groupby, pandas_groupby)
    eval_sum(modin_groupby, pandas_groupby)
    eval_ngroup(modin_groupby, pandas_groupby)
    eval_nunique(modin_groupby, pandas_groupby)
    eval_median(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.head(n), is_default=True)
    # eval_cumprod(modin_groupby, pandas_groupby) causes overflows
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.cov(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    transform_functions = [lambda df: df + 4, lambda df: -df - 10]
    for func in transform_functions:
        eval_transform(modin_groupby, pandas_groupby, func)
    pipe_functions = [lambda dfgb: dfgb.sum()]
    for func in pipe_functions:
        eval_pipe(modin_groupby, pandas_groupby, func)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.corr(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_fillna(modin_groupby, pandas_groupby)
    eval_count(modin_groupby, pandas_groupby)
    eval_size(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.tail(n), is_default=True)
    eval_quantile(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.take(), is_default=True)
    eval_groups(modin_groupby, pandas_groupby)
def test_single_group_row_groupby():
    """Degenerate case: every row falls into one group; all groupby
    operations must still agree with pandas.
    """
    pandas_df = pandas.DataFrame(
        {
            "col1": [0, 1, 2, 3],
            "col2": [4, 5, 36, 7],
            "col3": [3, 8, 12, 10],
            "col4": [17, 3, 16, 15],
            "col5": [-4, 5, -6, -7],
        }
    )
    modin_df = from_pandas(pandas_df)
    # Identical labels -> a single group containing the whole frame.
    by = ["1", "1", "1", "1"]
    n = 6  # row count used by head/tail checks below
    modin_groupby = modin_df.groupby(by=by)
    pandas_groupby = pandas_df.groupby(by=by)
    modin_groupby_equals_pandas(modin_groupby, pandas_groupby)
    eval_ngroups(modin_groupby, pandas_groupby)
    eval_skew(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.ffill(), is_default=True)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.sem(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_mean(modin_groupby, pandas_groupby)
    eval_any(modin_groupby, pandas_groupby)
    eval_min(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.idxmax(), is_default=True)
    eval_ndim(modin_groupby, pandas_groupby)
    eval_cumsum(modin_groupby, pandas_groupby)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.pct_change(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_cummax(modin_groupby, pandas_groupby)
    apply_functions = [lambda df: df.sum(), lambda df: -df]
    for func in apply_functions:
        eval_apply(modin_groupby, pandas_groupby, func)
    eval_dtypes(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.first(), is_default=True)
    eval_general(
        modin_groupby, pandas_groupby, lambda df: df.backfill(), is_default=True
    )
    eval_cummin(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.bfill(), is_default=True)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.idxmin(), is_default=True)
    eval_prod(modin_groupby, pandas_groupby)
    eval_std(modin_groupby, pandas_groupby)
    agg_functions = ["min", "max"]
    for func in agg_functions:
        eval_agg(modin_groupby, pandas_groupby, func)
        eval_aggregate(modin_groupby, pandas_groupby, func)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.last(), is_default=True)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.mad(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_rank(modin_groupby, pandas_groupby)
    eval_max(modin_groupby, pandas_groupby)
    eval_var(modin_groupby, pandas_groupby)
    eval_len(modin_groupby, pandas_groupby)
    eval_sum(modin_groupby, pandas_groupby)
    eval_ngroup(modin_groupby, pandas_groupby)
    eval_nunique(modin_groupby, pandas_groupby)
    eval_median(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.head(n), is_default=True)
    eval_cumprod(modin_groupby, pandas_groupby)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.cov(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    transform_functions = [lambda df: df + 4, lambda df: -df - 10]
    for func in transform_functions:
        eval_transform(modin_groupby, pandas_groupby, func)
    pipe_functions = [lambda dfgb: dfgb.sum()]
    for func in pipe_functions:
        eval_pipe(modin_groupby, pandas_groupby, func)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.corr(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_fillna(modin_groupby, pandas_groupby)
    eval_count(modin_groupby, pandas_groupby)
    eval_size(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.tail(n), is_default=True)
    eval_quantile(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.take(), is_default=True)
    eval___getattr__(modin_groupby, pandas_groupby, "col2")
    eval_groups(modin_groupby, pandas_groupby)
def test_simple_row_groupby(by, as_index):
    """Row groupby over a small frame that contains NaNs, for an arbitrary
    parametrized ``by`` key; NaN-sensitive operations (cum*, transform)
    are skipped when the grouping columns themselves contain NaNs.
    """
    pandas_df = pandas.DataFrame(
        {
            "col1": [0, 1, 2, 3],
            "col2": [4, 5, np.NaN, 7],
            "col3": [np.NaN, np.NaN, 12, 10],
            "col4": [17, 13, 16, 15],
            "col5": [-4, -5, -6, -7],
        }
    )
    modin_df = from_pandas(pandas_df)
    n = 1  # row count used by head/tail checks below
    modin_groupby = modin_df.groupby(by=by, as_index=as_index)
    pandas_groupby = pandas_df.groupby(by=by, as_index=as_index)
    modin_groupby_equals_pandas(modin_groupby, pandas_groupby)
    eval_ngroups(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.ffill(), is_default=True)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.sem(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_mean(modin_groupby, pandas_groupby)
    eval_any(modin_groupby, pandas_groupby)
    eval_min(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.idxmax(), is_default=True)
    eval_ndim(modin_groupby, pandas_groupby)
    if not check_df_columns_have_nans(modin_df, by):
        # cum* functions produce undefined results for columns with NaNs so we run them only when "by" columns contain no NaNs
        eval_cumsum(modin_groupby, pandas_groupby)
        eval_cummax(modin_groupby, pandas_groupby)
        eval_cummin(modin_groupby, pandas_groupby)
        eval_cumprod(modin_groupby, pandas_groupby)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.pct_change(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    # Workaround for Pandas bug #34656. Recreate groupby object for Pandas
    pandas_groupby = pandas_df.groupby(by=by, as_index=as_index)
    apply_functions = [lambda df: df.sum(), min]
    for func in apply_functions:
        eval_apply(modin_groupby, pandas_groupby, func)
    eval_dtypes(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.first(), is_default=True)
    eval_general(
        modin_groupby, pandas_groupby, lambda df: df.backfill(), is_default=True
    )
    eval_general(modin_groupby, pandas_groupby, lambda df: df.bfill(), is_default=True)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.idxmin(), is_default=True)
    eval_prod(modin_groupby, pandas_groupby)
    # Spread-style statistics are only checked in the as_index=True case.
    if as_index:
        eval_std(modin_groupby, pandas_groupby)
        eval_var(modin_groupby, pandas_groupby)
        eval_skew(modin_groupby, pandas_groupby)
    agg_functions = ["min", "max"]
    for func in agg_functions:
        eval_agg(modin_groupby, pandas_groupby, func)
        eval_aggregate(modin_groupby, pandas_groupby, func)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.last(), is_default=True)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.mad(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_rank(modin_groupby, pandas_groupby)
    eval_max(modin_groupby, pandas_groupby)
    eval_len(modin_groupby, pandas_groupby)
    eval_sum(modin_groupby, pandas_groupby)
    eval_ngroup(modin_groupby, pandas_groupby)
    eval_nunique(modin_groupby, pandas_groupby)
    eval_median(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.head(n), is_default=True)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.cov(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    if not check_df_columns_have_nans(modin_df, by):
        # Pandas groupby.transform does not work correctly with NaN values in grouping columns. See Pandas bug 17093.
        transform_functions = [lambda df: df + 4, lambda df: -df - 10]
        for func in transform_functions:
            eval_transform(modin_groupby, pandas_groupby, func)
    pipe_functions = [lambda dfgb: dfgb.sum()]
    for func in pipe_functions:
        eval_pipe(modin_groupby, pandas_groupby, func)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.corr(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_fillna(modin_groupby, pandas_groupby)
    eval_count(modin_groupby, pandas_groupby)
    eval_size(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.tail(n), is_default=True)
    eval_quantile(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.take(), is_default=True)
    eval___getattr__(modin_groupby, pandas_groupby, "col3")
    eval_groups(modin_groupby, pandas_groupby)
def test_ray_concat():
    """Row-wise concat of two 2-partition Modin frames matches pandas.

    NOTE(review): duplicates a function name used earlier in this file —
    only the last definition is collected by pytest; confirm intent.
    """
    first, second = generate_dfs()
    ray_first, ray_second = from_pandas(first, 2), from_pandas(second, 2)
    assert ray_df_equals_pandas(
        pd.concat([ray_first, ray_second]),
        pandas.concat([first, second]),
    )
def test_invalid_axis_errors():
    """``concat`` must reject an out-of-range axis with ValueError."""
    left, right = generate_dfs()
    modin_left, modin_right = from_pandas(left), from_pandas(right)
    with pytest.raises(ValueError):
        pd.concat([modin_left, modin_right], axis=2)
def test_single_group_row_groupby():
    """Exercise the groupby API when every row falls into a single group."""
    pandas_df = pandas.DataFrame(
        {
            "col1": [0, 1, 2, 3],
            "col2": [4, 5, 36, 7],
            "col3": [3, 8, 12, 10],
            "col4": [17, 3, 16, 15],
            "col5": [-4, 5, -6, -7],
        }
    )
    ray_df = from_pandas(pandas_df)

    # One constant key: all four rows land in the same group.
    by = ["1", "1", "1", "1"]
    n = 6

    ray_groupby = ray_df.groupby(by=by)
    pandas_groupby = pandas_df.groupby(by=by)

    ray_groupby_equals_pandas(ray_groupby, pandas_groupby)
    test_ngroups(ray_groupby, pandas_groupby)
    test_skew(ray_groupby, pandas_groupby)
    test_ffill(ray_groupby, pandas_groupby)
    test_sem(ray_groupby, pandas_groupby)
    test_mean(ray_groupby, pandas_groupby)
    test_any(ray_groupby, pandas_groupby)
    test_min(ray_groupby, pandas_groupby)
    test_idxmax(ray_groupby, pandas_groupby)
    test_ndim(ray_groupby, pandas_groupby)
    test_cumsum(ray_groupby, pandas_groupby)
    test_pct_change(ray_groupby, pandas_groupby)
    test_cummax(ray_groupby, pandas_groupby)

    for fn in (lambda df: df.sum(), lambda df: -df):
        test_apply(ray_groupby, pandas_groupby, fn)

    test_dtypes(ray_groupby, pandas_groupby)
    test_first(ray_groupby, pandas_groupby)
    test_backfill(ray_groupby, pandas_groupby)
    test_cummin(ray_groupby, pandas_groupby)
    test_bfill(ray_groupby, pandas_groupby)
    test_idxmin(ray_groupby, pandas_groupby)
    test_prod(ray_groupby, pandas_groupby)
    test_std(ray_groupby, pandas_groupby)

    for agg_fn in ("min", "max"):
        test_agg(ray_groupby, pandas_groupby, agg_fn)
        test_aggregate(ray_groupby, pandas_groupby, agg_fn)

    test_last(ray_groupby, pandas_groupby)
    test_mad(ray_groupby, pandas_groupby)
    test_rank(ray_groupby, pandas_groupby)
    test_max(ray_groupby, pandas_groupby)
    test_var(ray_groupby, pandas_groupby)
    test_len(ray_groupby, pandas_groupby)
    test_sum(ray_groupby, pandas_groupby)
    test_ngroup(ray_groupby, pandas_groupby)
    test_nunique(ray_groupby, pandas_groupby)
    test_median(ray_groupby, pandas_groupby)
    test_head(ray_groupby, pandas_groupby, n)
    test_cumprod(ray_groupby, pandas_groupby)
    test_cov(ray_groupby, pandas_groupby)

    for fn in (lambda df: df + 4, lambda df: -df - 10):
        test_transform(ray_groupby, pandas_groupby, fn)

    for fn in (lambda dfgb: dfgb.sum(),):
        test_pipe(ray_groupby, pandas_groupby, fn)

    test_corr(ray_groupby, pandas_groupby)
    test_fillna(ray_groupby, pandas_groupby)
    test_count(ray_groupby, pandas_groupby)
    test_tail(ray_groupby, pandas_groupby, n)
    test_quantile(ray_groupby, pandas_groupby)
    test_take(ray_groupby, pandas_groupby)
    test___getattr__(ray_groupby, pandas_groupby)
    test_groups(ray_groupby, pandas_groupby)
def test_large_row_groupby():
    """Groupby coverage on a larger random (100x4) frame with many groups."""
    pandas_df = pandas.DataFrame(
        np.random.randint(0, 8, size=(100, 4)), columns=list("ABCD")
    )
    ray_df = from_pandas(pandas_df)

    # Group rows by the string form of column "A" values (8 possible groups).
    by = [str(i) for i in pandas_df["A"].tolist()]
    n = 4

    ray_groupby = ray_df.groupby(by=by)
    pandas_groupby = pandas_df.groupby(by=by)

    ray_groupby_equals_pandas(ray_groupby, pandas_groupby)
    test_ngroups(ray_groupby, pandas_groupby)
    test_skew(ray_groupby, pandas_groupby)
    test_ffill(ray_groupby, pandas_groupby)
    test_sem(ray_groupby, pandas_groupby)
    test_mean(ray_groupby, pandas_groupby)
    test_any(ray_groupby, pandas_groupby)
    test_min(ray_groupby, pandas_groupby)
    test_idxmax(ray_groupby, pandas_groupby)
    test_ndim(ray_groupby, pandas_groupby)
    test_cumsum(ray_groupby, pandas_groupby)
    test_pct_change(ray_groupby, pandas_groupby)
    test_cummax(ray_groupby, pandas_groupby)

    for fn in (lambda df: df.sum(), lambda df: -df):
        test_apply(ray_groupby, pandas_groupby, fn)

    test_dtypes(ray_groupby, pandas_groupby)
    test_first(ray_groupby, pandas_groupby)
    test_backfill(ray_groupby, pandas_groupby)
    test_cummin(ray_groupby, pandas_groupby)
    test_bfill(ray_groupby, pandas_groupby)
    test_idxmin(ray_groupby, pandas_groupby)
    # test_prod(ray_groupby, pandas_groupby) causes overflows
    test_std(ray_groupby, pandas_groupby)

    for agg_fn in ("min", "max"):
        test_agg(ray_groupby, pandas_groupby, agg_fn)
        test_aggregate(ray_groupby, pandas_groupby, agg_fn)

    test_last(ray_groupby, pandas_groupby)
    test_mad(ray_groupby, pandas_groupby)
    test_rank(ray_groupby, pandas_groupby)
    test_max(ray_groupby, pandas_groupby)
    test_var(ray_groupby, pandas_groupby)
    test_len(ray_groupby, pandas_groupby)
    test_sum(ray_groupby, pandas_groupby)
    test_ngroup(ray_groupby, pandas_groupby)
    test_nunique(ray_groupby, pandas_groupby)
    test_median(ray_groupby, pandas_groupby)
    test_head(ray_groupby, pandas_groupby, n)
    # test_cumprod(ray_groupby, pandas_groupby) causes overflows
    test_cov(ray_groupby, pandas_groupby)

    for fn in (lambda df: df + 4, lambda df: -df - 10):
        test_transform(ray_groupby, pandas_groupby, fn)

    for fn in (lambda dfgb: dfgb.sum(),):
        test_pipe(ray_groupby, pandas_groupby, fn)

    test_corr(ray_groupby, pandas_groupby)
    test_fillna(ray_groupby, pandas_groupby)
    test_count(ray_groupby, pandas_groupby)
    test_tail(ray_groupby, pandas_groupby, n)
    test_quantile(ray_groupby, pandas_groupby)
    test_take(ray_groupby, pandas_groupby)
    test_groups(ray_groupby, pandas_groupby)
def test_simple_row_groupby(by, as_index, col1_category):
    """Run the groupby battery on a small frame containing NaNs.

    Parameters
    ----------
    by : grouping spec — a label, or a list of labels / Series / GetColumn
        placeholders resolved against the concrete frame.
    as_index : bool forwarded to DataFrame.groupby.
    col1_category : when True, cast "col1" to a categorical dtype first.
    """
    pandas_df = pandas.DataFrame(
        {
            "col1": [0, 1, 2, 3],
            "col2": [4, 5, np.NaN, 7],
            "col3": [np.NaN, np.NaN, 12, 10],
            "col4": [17, 13, 16, 15],
            "col5": [-4, -5, -6, -7],
        }
    )
    if col1_category:
        pandas_df = pandas_df.astype({"col1": "category"})

    modin_df = from_pandas(pandas_df)
    n = 1

    def maybe_get_columns(df, by):
        # Resolve GetColumn placeholders into actual columns of `df`.
        if isinstance(by, list):
            return [o(df) if isinstance(o, GetColumn) else o for o in by]
        return by

    modin_groupby = modin_df.groupby(
        by=maybe_get_columns(modin_df, by), as_index=as_index
    )
    pandas_by = maybe_get_columns(pandas_df, try_cast_to_pandas(by))
    pandas_groupby = pandas_df.groupby(by=pandas_by, as_index=as_index)

    modin_groupby_equals_pandas(modin_groupby, pandas_groupby)
    eval_ngroups(modin_groupby, pandas_groupby)
    eval_shift(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.ffill(), is_default=True)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.sem(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_mean(modin_groupby, pandas_groupby)
    eval_any(modin_groupby, pandas_groupby)
    eval_min(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.idxmax(), is_default=True)
    eval_ndim(modin_groupby, pandas_groupby)

    if not check_df_columns_have_nans(modin_df, by):
        # cum* functions produce undefined results for columns with NaN values
        # in grouping columns, so run them only when "by" columns have no NaNs.
        eval_general(modin_groupby, pandas_groupby, lambda df: df.cumsum(axis=0))
        eval_general(modin_groupby, pandas_groupby, lambda df: df.cummax(axis=0))
        eval_general(modin_groupby, pandas_groupby, lambda df: df.cummin(axis=0))
        eval_general(modin_groupby, pandas_groupby, lambda df: df.cumprod(axis=0))

    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.pct_change(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )

    # Workaround for Pandas bug #34656. Recreate groupby object for Pandas.
    pandas_groupby = pandas_df.groupby(by=pandas_by, as_index=as_index)

    for fn in (lambda df: df.sum(), min):
        eval_apply(modin_groupby, pandas_groupby, fn)

    eval_dtypes(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.first(), is_default=True)
    eval_general(
        modin_groupby, pandas_groupby, lambda df: df.backfill(), is_default=True
    )
    eval_general(modin_groupby, pandas_groupby, lambda df: df.bfill(), is_default=True)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.idxmin(), is_default=True)
    eval_prod(modin_groupby, pandas_groupby)
    if as_index:
        eval_std(modin_groupby, pandas_groupby)
        eval_var(modin_groupby, pandas_groupby)
        eval_skew(modin_groupby, pandas_groupby)

    for agg_fn in ("min", "max"):
        eval_agg(modin_groupby, pandas_groupby, agg_fn)
        eval_aggregate(modin_groupby, pandas_groupby, agg_fn)

    eval_general(modin_groupby, pandas_groupby, lambda df: df.last(), is_default=True)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.mad(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_general(modin_groupby, pandas_groupby, lambda df: df.rank())
    eval_max(modin_groupby, pandas_groupby)
    eval_len(modin_groupby, pandas_groupby)
    eval_sum(modin_groupby, pandas_groupby)
    eval_ngroup(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.nunique())
    eval_median(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.head(n), is_default=True)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.cov(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )

    if not check_df_columns_have_nans(modin_df, by):
        # Pandas groupby.transform does not work correctly with NaN values in
        # grouping columns. See Pandas bug 17093.
        for fn in (lambda df: df + 4, lambda df: -df - 10):
            eval_general(
                modin_groupby,
                pandas_groupby,
                lambda df: df.transform(fn),
                check_exception_type=None,
            )

    for fn in (lambda dfgb: dfgb.sum(),):
        eval_pipe(modin_groupby, pandas_groupby, fn)

    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.corr(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_fillna(modin_groupby, pandas_groupby)
    eval_count(modin_groupby, pandas_groupby)
    if get_current_backend() != "BaseOnPython":
        eval_general(
            modin_groupby,
            pandas_groupby,
            lambda df: df.size(),
            check_exception_type=None,
        )
    eval_general(modin_groupby, pandas_groupby, lambda df: df.tail(n), is_default=True)
    eval_quantile(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.take(), is_default=True)
    if isinstance(by, list) and not any(
        isinstance(o, (pd.Series, pandas.Series)) for o in by
    ):
        # Not yet supported for non-original-column-from-dataframe Series in by:
        eval___getattr__(modin_groupby, pandas_groupby, "col3")
    eval_groups(modin_groupby, pandas_groupby)
def test_invalid_axis_errors():
    """Concatenating along nonexistent axis 2 must raise ValueError."""
    # NOTE(review): duplicate of an earlier test_invalid_axis_errors in this
    # file; this later definition is the one pytest actually runs.
    first, second = generate_dfs()
    ray_first = from_pandas(first, 2)
    ray_second = from_pandas(second, 2)
    with pytest.raises(ValueError):
        pd.concat([ray_first, ray_second], axis=2)
def test_simple_col_groupby():
    """Exercise groupby along axis=1 (grouping columns rather than rows)."""
    pandas_df = pandas.DataFrame(
        {
            "col1": [0, 3, 2, 3],
            "col2": [4, 1, 6, 7],
            "col3": [3, 8, 2, 10],
            "col4": [1, 13, 6, 15],
            "col5": [-4, 5, 6, -7],
        }
    )
    ray_df = from_pandas(pandas_df)

    by = [1, 2, 3, 2, 1]

    ray_groupby = ray_df.groupby(axis=1, by=by)
    pandas_groupby = pandas_df.groupby(axis=1, by=by)

    ray_groupby_equals_pandas(ray_groupby, pandas_groupby)
    test_ngroups(ray_groupby, pandas_groupby)
    test_skew(ray_groupby, pandas_groupby)
    test_ffill(ray_groupby, pandas_groupby)
    test_sem(ray_groupby, pandas_groupby)
    test_mean(ray_groupby, pandas_groupby)
    test_any(ray_groupby, pandas_groupby)
    test_min(ray_groupby, pandas_groupby)
    test_ndim(ray_groupby, pandas_groupby)

    if not PY2:
        # idxmax and idxmin fail on column groupby in pandas with python2
        test_idxmax(ray_groupby, pandas_groupby)
        test_idxmin(ray_groupby, pandas_groupby)
        test_quantile(ray_groupby, pandas_groupby)

    # https://github.com/pandas-dev/pandas/issues/21127
    # test_cumsum(ray_groupby, pandas_groupby)
    # test_cummax(ray_groupby, pandas_groupby)
    # test_cummin(ray_groupby, pandas_groupby)
    # test_cumprod(ray_groupby, pandas_groupby)

    test_pct_change(ray_groupby, pandas_groupby)

    for fn in (lambda df: -df, lambda df: df.sum(axis=1)):
        test_apply(ray_groupby, pandas_groupby, fn)

    test_first(ray_groupby, pandas_groupby)
    test_backfill(ray_groupby, pandas_groupby)
    test_bfill(ray_groupby, pandas_groupby)
    test_prod(ray_groupby, pandas_groupby)
    test_std(ray_groupby, pandas_groupby)
    test_last(ray_groupby, pandas_groupby)
    test_mad(ray_groupby, pandas_groupby)
    test_max(ray_groupby, pandas_groupby)
    test_var(ray_groupby, pandas_groupby)
    test_len(ray_groupby, pandas_groupby)
    test_sum(ray_groupby, pandas_groupby)

    # Pandas fails on this case with ValueError
    # test_ngroup(ray_groupby, pandas_groupby)
    # test_nunique(ray_groupby, pandas_groupby)

    test_median(ray_groupby, pandas_groupby)
    test_cov(ray_groupby, pandas_groupby)

    for fn in (lambda df: df + 4, lambda df: -df - 10):
        test_transform(ray_groupby, pandas_groupby, fn)

    for fn in (lambda dfgb: dfgb.sum(),):
        test_pipe(ray_groupby, pandas_groupby, fn)

    test_corr(ray_groupby, pandas_groupby)
    test_fillna(ray_groupby, pandas_groupby)
    test_count(ray_groupby, pandas_groupby)
    test_take(ray_groupby, pandas_groupby)
    test___getattr__(ray_groupby, pandas_groupby)
    test_groups(ray_groupby, pandas_groupby)