def test_string_groupby_non_key(str_data, str_data_raise, num_cols):
    """Check count/max/min on string columns grouped by an integer key.

    ``num_cols`` string columns labelled ``0 .. num_cols - 1`` are filled
    from ``str_data`` in a pandas frame and in a ``DataFrame`` (imported
    elsewhere in the file); an integer column ``'a'`` is the group key.
    Every comparison runs inside the context manager returned by
    ``raise_builder([str_data_raise], GDFError)``.

    NOTE(review): the arguments presumably come from pytest
    parametrization/fixtures declared outside this view -- confirm.
    """

    def ordered(frame):
        # Sort on the key and renumber the rows so that both results are
        # compared in the same row order.
        return frame.sort_values(['a']).reset_index(drop=True)

    key_values = [1, 2, 3, 4, 5][:len(str_data)]

    pandas_frame = pd.DataFrame()
    tested_frame = DataFrame()
    for col in range(num_cols):
        pandas_frame[col] = pd.Series(str_data, dtype='str')
        tested_frame[col] = Series(str_data, dtype='str')
    pandas_frame['a'] = key_values
    tested_frame['a'] = key_values

    with raise_builder([str_data_raise], GDFError):
        expect = pandas_frame.groupby('a', as_index=False).count()
        got = tested_frame.groupby('a', as_index=False).count()
        assert_eq(ordered(expect), ordered(got))

        for reducer in ('max', 'min'):
            expect = getattr(
                pandas_frame.groupby('a', as_index=False), reducer
            )()
            got = getattr(
                tested_frame.groupby('a', as_index=False), reducer
            )()
            expect = ordered(expect)
            got = ordered(got)

            if len(expect) == 0 and len(got) == 0:
                # With no rows on either side, cast the pandas columns to
                # 'str' before comparing.
                for col in range(num_cols):
                    expect[col] = expect[col].astype('str')

            assert_eq(expect, got)
def test_groupby_apply_basic_agg_single_column():
    """Sum one selected column after grouping on two key columns.

    A ``DataFrame`` with ``key``, ``val`` and ``mult = key * val`` is
    converted with ``to_pandas()``; ``groupby(["key", "val"]).mult.sum()``
    is evaluated on both frames and the results are compared with
    ``assert_eq``.
    """
    source = DataFrame()
    source["key"] = [0, 0, 1, 1, 2, 2, 0]
    source["val"] = [0, 1, 2, 3, 4, 5, 6]
    source["mult"] = source["key"] * source["val"]

    pandas_copy = source.to_pandas()

    actual = source.groupby(["key", "val"]).mult.sum()
    expected = pandas_copy.groupby(["key", "val"]).mult.sum()

    assert_eq(expected, actual)
def test_groupby_apply_basic_agg_single_column():
    """Compare a two-key groupby sum of the ``mult`` column with pandas.

    The frame holds ``key``, ``val`` and their element-wise product
    ``mult``.  The same ``groupby(['key', 'val']).mult.sum()`` expression is
    run on the ``DataFrame`` and on its ``to_pandas()`` conversion.
    """
    frame = DataFrame()
    frame['key'] = [0, 0, 1, 1, 2, 2, 0]
    frame['val'] = [0, 1, 2, 3, 4, 5, 6]
    frame['mult'] = frame['key'] * frame['val']
    frame_pd = frame.to_pandas()

    # Select the single value column on each grouped object, then reduce.
    selected = frame.groupby(['key', 'val']).mult
    result = selected.sum()

    selected_pd = frame_pd.groupby(['key', 'val']).mult
    reference = selected_pd.sum()

    assert_eq(reference, result)
def test_string_groupby_non_key(str_data, num_cols):
    """Check count/max/min on string columns grouped by an integer key.

    ``num_cols`` string columns labelled ``0 .. num_cols - 1`` are built
    from ``str_data`` in a pandas frame and in a ``DataFrame`` (imported
    elsewhere in the file), next to an integer key column ``"a"``.  Each
    aggregation is compared with ``assert_eq(..., check_dtype=False)``.

    NOTE(review): the arguments presumably come from pytest
    parametrization/fixtures declared outside this view -- confirm.
    """

    def ordered(frame):
        # Sort on the key and renumber the rows so that both results are
        # compared in the same row order.
        return frame.sort_values(["a"]).reset_index(drop=True)

    key_values = [1, 2, 3, 4, 5][:len(str_data)]

    pandas_frame = pd.DataFrame()
    tested_frame = DataFrame()
    for col in range(num_cols):
        pandas_frame[col] = pd.Series(str_data, dtype="str")
        tested_frame[col] = Series(str_data, dtype="str")
    pandas_frame["a"] = key_values
    tested_frame["a"] = key_values

    for reducer in ("count", "max", "min"):
        expect = getattr(pandas_frame.groupby("a", as_index=False), reducer)()
        got = getattr(tested_frame.groupby("a", as_index=False), reducer)()
        expect = ordered(expect)
        got = ordered(got)

        # The empty-result cast applies to max and min only, not to count.
        if reducer != "count" and len(expect) == 0 and len(got) == 0:
            for col in range(num_cols):
                expect[col] = expect[col].astype("str")

        assert_eq(expect, got, check_dtype=False)
def test_string_groupby_key_index():
    """Compare ``groupby("a").count()`` on a string key column with pandas.

    Column ``"a"`` holds five distinct strings and column ``"b"`` five
    integers.  The grouped counts of the ``DataFrame`` and of the pandas
    frame are compared with ``assert_eq(..., check_dtype=False)``.
    """
    letters = ["a", "b", "c", "d", "e"]
    numbers = [1, 2, 3, 4, 5]

    pandas_frame = pd.DataFrame()
    pandas_frame["a"] = pd.Series(letters, dtype="str")
    pandas_frame["b"] = numbers

    tested_frame = DataFrame()
    tested_frame["a"] = Series(letters, dtype="str")
    tested_frame["b"] = numbers

    expected_counts = pandas_frame.groupby("a").count()
    actual_counts = tested_frame.groupby("a").count()

    assert_eq(expected_counts, actual_counts, check_dtype=False)
def test_string_groupby_key_index():
    """Compare ``groupby('a').count()`` on a string key column with pandas.

    Column ``'a'`` holds five distinct strings and column ``'b'`` five
    integers.  The grouped counts of the ``DataFrame`` and of the pandas
    frame are compared with ``assert_eq``.
    """
    letters = ['a', 'b', 'c', 'd', 'e']
    numbers = [1, 2, 3, 4, 5]

    pandas_frame = pd.DataFrame()
    pandas_frame['a'] = pd.Series(letters, dtype="str")
    pandas_frame['b'] = numbers

    tested_frame = DataFrame()
    tested_frame['a'] = Series(letters, dtype="str")
    tested_frame['b'] = numbers

    expected_counts = pandas_frame.groupby('a').count()
    actual_counts = tested_frame.groupby('a').count()

    assert_eq(expected_counts, actual_counts)
def test_groupby_iterate_groups():
    """Iterate over ``groupby`` and check the keys are constant per group.

    A 20-row ``DataFrame`` with two integer key columns and two float value
    columns is generated from a generator seeded with 0.  Each object
    yielded by iterating ``groupby(['key1', 'key2'], method="cudf")`` is
    converted with ``to_pandas()`` and both key columns are checked.
    """
    np.random.seed(0)
    nelem = 20

    frame = DataFrame()
    frame['key1'] = np.random.randint(0, 3, nelem)
    frame['key2'] = np.random.randint(0, 2, nelem)
    frame['val1'] = np.random.random(nelem)
    frame['val2'] = np.random.random(nelem)

    key_names = 'key1,key2'.split(',')

    for group in frame.groupby(['key1', 'key2'], method="cudf"):
        group_pd = group.to_pandas()
        for name in key_names:
            column = group_pd[name].values
            # Every entry must equal the first one, i.e. the key column
            # holds a single value within this group.
            np.testing.assert_array_equal(column[0], column)
def test_groupby_cats():
    """Check the grouped mean of ``vals`` for every category in ``cats``.

    ``cats`` is a categorical column built from ``'aabaacaab'`` and ``vals``
    holds one random float per row.  After
    ``groupby(['cats'], method="cudf").mean()`` each returned
    (category, mean) pair is compared with the mean computed by NumPy over
    the rows of that category.
    """
    # Seed the generator so the random values (and any failure) are
    # reproducible, like the other randomized groupby tests in this file.
    np.random.seed(0)

    df = DataFrame()
    df['cats'] = pd.Categorical(list('aabaacaab'))
    df['vals'] = np.random.random(len(df))

    # Host-side copies used to compute the reference means.
    cats = np.asarray(list(df['cats']))
    vals = df['vals'].to_array()

    grouped = df.groupby(['cats'], method="cudf").mean()

    got_vals = grouped['vals']
    got_cats = grouped['cats']

    for c, v in zip(got_cats, got_vals):
        expect = vals[cats == c].mean()
        # Name the category in the failure message instead of printing
        # every pair unconditionally.
        np.testing.assert_almost_equal(
            v, expect, err_msg="mean mismatch for category {!r}".format(c)
        )
def test_string_groupby_key_index():
    """Expect ``groupby('a').count()`` on a string key column to raise.

    The ``DataFrame`` call is wrapped in ``pytest.raises`` for
    ``NotImplementedError`` with the message pattern
    "Strings are not yet supported in the index".  The pandas result is
    still computed beforehand.
    """
    letters = ['a', 'b', 'c', 'd', 'e']
    numbers = [1, 2, 3, 4, 5]

    pandas_frame = pd.DataFrame()
    pandas_frame['a'] = pd.Series(letters, dtype="str")
    pandas_frame['b'] = numbers

    tested_frame = DataFrame()
    tested_frame['a'] = Series(letters, dtype="str")
    tested_frame['b'] = numbers

    expect = pandas_frame.groupby('a').count()

    rejects_string_index = pytest.raises(
        NotImplementedError,
        match="Strings are not yet supported in the index",
    )
    with rejects_string_index:
        got = tested_frame.groupby('a').count()
        # Reached only if the call above did not raise.
        assert_eq(expect, got)
def test_string_groupby_key(str_data, num_keys):
    """Compare ``count()`` when grouping on ``num_keys`` string columns.

    The string key columns are labelled ``0 .. num_keys - 1`` and all hold
    ``str_data``; column ``"a"`` holds integers truncated to the same
    length.  Results are sorted on column ``0`` and compared with
    ``assert_eq(..., check_dtype=False)``.

    NOTE(review): the arguments presumably come from pytest
    parametrization/fixtures declared outside this view -- confirm.
    """

    def ordered(frame):
        # Sort on the first key column and renumber the rows so that both
        # results are compared in the same row order.
        return frame.sort_values([0]).reset_index(drop=True)

    counted_values = [1, 2, 3, 4, 5][:len(str_data)]

    pandas_frame = pd.DataFrame()
    tested_frame = DataFrame()
    for key in range(num_keys):
        pandas_frame[key] = pd.Series(str_data, dtype="str")
        tested_frame[key] = Series(str_data, dtype="str")
    pandas_frame["a"] = counted_values
    tested_frame["a"] = counted_values

    pandas_groups = pandas_frame.groupby(
        list(range(num_keys)), as_index=False
    )
    expect = pandas_groups.count()

    tested_groups = tested_frame.groupby(
        list(range(num_keys)), as_index=False
    )
    got = tested_groups.count()

    assert_eq(ordered(expect), ordered(got), check_dtype=False)
def test_groupby_as_df():
    """Check that ``as_df()`` segments each hold a single key pair.

    ``groupby(['key1', 'key2'], method="cudf").as_df()`` returns a frame and
    a sequence of segment start offsets.  Each slice between consecutive
    offsets (the last one running to the end) is converted with
    ``to_pandas()`` and both key columns are checked to be constant.
    """
    np.random.seed(0)
    nelem = 20

    frame = DataFrame()
    frame['key1'] = np.random.randint(0, 3, nelem)
    frame['key2'] = np.random.randint(0, 2, nelem)
    frame['val1'] = np.random.random(nelem)
    frame['val2'] = np.random.random(nelem)

    grouped_frame, segs = frame.groupby(
        ['key1', 'key2'], method="cudf"
    ).as_df()

    key_names = 'key1,key2'.split(',')

    # Pair every start offset with the next one; None leaves the final
    # slice open-ended.
    stops = list(segs[1:]) + [None]
    for start, stop in zip(segs, stops):
        segment_pd = grouped_frame[start:stop].to_pandas()
        for name in key_names:
            column = segment_pd[name].values
            # Every entry must equal the first one within the segment.
            np.testing.assert_array_equal(column[0], column)
def test_groupby_cats(method):
    """Check the grouped mean of ``vals`` for every category in ``cats``.

    ``cats`` is a categorical column built from ``"aabaacaab"`` and ``vals``
    holds one random float per row.  After
    ``groupby(["cats"], method=method, as_index=False).mean()`` each
    returned (category, mean) pair is compared with the mean computed by
    NumPy over the rows of that category.

    NOTE(review): ``method`` is forwarded unchanged to ``groupby``; it is
    presumably supplied by pytest parametrization outside this view.
    """
    # Seed the generator so the random values (and any failure) are
    # reproducible, like the other randomized groupby tests in this file.
    np.random.seed(0)

    df = DataFrame()
    df["cats"] = pd.Categorical(list("aabaacaab"))
    df["vals"] = np.random.random(len(df))

    # Host-side copies used to compute the reference means.
    cats = np.asarray(list(df["cats"]))
    vals = df["vals"].to_array()

    grouped = df.groupby(["cats"], method=method, as_index=False).mean()

    got_vals = grouped["vals"]
    got_cats = grouped["cats"]

    for c, v in zip(got_cats, got_vals):
        expect = vals[cats == c].mean()
        # Name the category in the failure message instead of printing
        # every pair unconditionally.
        np.testing.assert_almost_equal(
            v, expect, err_msg="mean mismatch for category {!r}".format(c)
        )
def test_groupby_apply_grouped():
    """Check ``apply_grouped`` against the same computation done in pandas.

    A 20-row ``DataFrame`` is grouped on ``key1``/``key2`` with
    ``method="cudf"`` and the kernel ``foo`` is applied per group with 8
    threads per block (``tpb=8``), producing the columns ``com1``
    (float64) and ``com2`` (int32).  The expected frame is built by applying
    ``emulate`` to the pandas groupby of the same data.
    """
    from numba import cuda

    np.random.seed(0)
    df = DataFrame()
    nelem = 20
    df["key1"] = np.random.randint(0, 3, nelem)
    df["key2"] = np.random.randint(0, 2, nelem)
    df["val1"] = np.random.random(nelem)
    df["val2"] = np.random.random(nelem)

    expect_grpby = df.to_pandas().groupby(["key1", "key2"], as_index=False)
    got_grpby = df.groupby(["key1", "key2"], method="cudf")

    def foo(key1, val1, com1, com2):
        # The index starts at this thread's threadIdx.x and advances by
        # blockDim.x until the end of the input arrays.
        for i in range(cuda.threadIdx.x, len(key1), cuda.blockDim.x):
            com1[i] = key1[i] * 10000 + val1[i]
            com2[i] = i

    got = got_grpby.apply_grouped(
        foo,
        incols=["key1", "val1"],
        outcols={"com1": np.float64, "com2": np.int32},
        tpb=8,
    )
    got = got.to_pandas()

    # Get expected result by emulating the operation in pandas
    def emulate(df):
        df["com1"] = df.key1 * 10000 + df.val1
        # Row position within the frame passed in, mirroring com2[i] = i.
        df["com2"] = np.arange(len(df), dtype=np.int32)
        return df

    expect = expect_grpby.apply(emulate)
    expect = expect.sort_values(["key1", "key2"]).reset_index(drop=True)

    # pd.testing is the public home of assert_frame_equal; the
    # pd.util.testing alias is deprecated.
    pd.testing.assert_frame_equal(expect, got)
def test_string_groupby_key(str_data, str_data_raise, num_keys):
    """Compare ``count()`` when grouping on ``num_keys`` string columns.

    The string key columns are labelled ``0 .. num_keys - 1`` and all hold
    ``str_data``; column ``'a'`` holds integers truncated to the same
    length.  The grouping, sorting on column ``0`` and comparison all run
    inside the context manager returned by
    ``raise_builder([str_data_raise], GDFError)``.

    NOTE(review): the arguments presumably come from pytest
    parametrization/fixtures declared outside this view -- confirm.
    """

    def ordered(frame):
        # Sort on the first key column and renumber the rows so that both
        # results are compared in the same row order.
        return frame.sort_values([0]).reset_index(drop=True)

    counted_values = [1, 2, 3, 4, 5][:len(str_data)]

    pandas_frame = pd.DataFrame()
    tested_frame = DataFrame()
    for key in range(num_keys):
        pandas_frame[key] = pd.Series(str_data, dtype='str')
        tested_frame[key] = Series(str_data, dtype='str')
    pandas_frame['a'] = counted_values
    tested_frame['a'] = counted_values

    with raise_builder([str_data_raise], GDFError):
        pandas_groups = pandas_frame.groupby(
            list(range(num_keys)), as_index=False
        )
        expect = pandas_groups.count()

        tested_groups = tested_frame.groupby(
            list(range(num_keys)), as_index=False
        )
        got = tested_groups.count()

        assert_eq(ordered(expect), ordered(got))
def test_groupby_apply():
    """Check ``groupby(...).apply`` against the same call on pandas.

    A 20-row ``DataFrame`` is grouped on ``key1``/``key2`` with
    ``method="cudf"``; ``foo`` adds an ``out`` column equal to
    ``val1 + val2``.  The pandas result of the same function, sorted on the
    keys with a fresh index, is the expected frame.
    """
    np.random.seed(0)
    df = DataFrame()
    nelem = 20
    df['key1'] = np.random.randint(0, 3, nelem)
    df['key2'] = np.random.randint(0, 2, nelem)
    df['val1'] = np.random.random(nelem)
    df['val2'] = np.random.random(nelem)

    expect_grpby = df.to_pandas().groupby(['key1', 'key2'], as_index=False)
    got_grpby = df.groupby(['key1', 'key2'], method="cudf")

    def foo(df):
        # The same function is applied to the groups of both libraries.
        df['out'] = df['val1'] + df['val2']
        return df

    expect = expect_grpby.apply(foo)
    expect = expect.sort_values(['key1', 'key2']).reset_index(drop=True)

    got = got_grpby.apply(foo).to_pandas()

    # pd.testing is the public home of assert_frame_equal; the
    # pd.util.testing alias is deprecated.
    pd.testing.assert_frame_equal(expect, got)