def test_groupby_agg_decimal(num_groups, nelem_per_group, func):
    # The number of digits after the decimal to use.
    decimal_digits = 2
    # The number of digits before the decimal to use.
    whole_digits = 2

    scale = 10 ** whole_digits
    nelem = num_groups * nelem_per_group

    # The unique is necessary because otherwise if there are duplicates idxmin
    # and idxmax may return different results than pandas (see
    # https://github.com/rapidsai/cudf/issues/7756). This is not relevant to
    # the current version of the test, because idxmin and idxmax simply don't
    # work with pandas Series composed of Decimal objects (see
    # https://github.com/pandas-dev/pandas/issues/40685). However, if that is
    # ever enabled, then this issue will crop up again so we may as well have
    # it fixed now.
    x = np.unique((np.random.rand(nelem) * scale).round(decimal_digits))
    y = np.unique((np.random.rand(nelem) * scale).round(decimal_digits))

    if x.size < y.size:
        total_elements = x.size
        y = y[: x.size]
    else:
        total_elements = y.size
        x = x[: y.size]

    # Note that this filtering can lead to one group with fewer elements, but
    # that shouldn't be a problem and is probably useful to test.
    idx_col = np.tile(np.arange(num_groups), nelem_per_group)[:total_elements]

    decimal_x = pd.Series([Decimal(str(d)) for d in x])
    decimal_y = pd.Series([Decimal(str(d)) for d in y])

    pdf = pd.DataFrame({"idx": idx_col, "x": decimal_x, "y": decimal_y})
    gdf = DataFrame(
        {
            "idx": idx_col,
            "x": cudf.Series(decimal_x),
            "y": cudf.Series(decimal_y),
        }
    )

    expect_df = pdf.groupby("idx", sort=True).agg(func)
    if rmm._cuda.gpu.runtimeGetVersion() < 11000:
        with pytest.raises(RuntimeError):
            got_df = gdf.groupby("idx", sort=True).agg(func)
    else:
        got_df = gdf.groupby("idx", sort=True).agg(func)
        assert_eq(expect_df["x"], got_df["x"], check_dtype=False)
        assert_eq(expect_df["y"], got_df["y"], check_dtype=False)
def test_groupby_apply_basic_agg_single_column():
    gdf = DataFrame()
    gdf["key"] = [0, 0, 1, 1, 2, 2, 0]
    gdf["val"] = [0, 1, 2, 3, 4, 5, 6]
    gdf["mult"] = gdf["key"] * gdf["val"]

    pdf = gdf.to_pandas()

    gdg = gdf.groupby(["key", "val"]).mult.sum()
    pdg = pdf.groupby(["key", "val"]).mult.sum()
    assert_eq(pdg, gdg)
def test_string_groupby_key_index():
    str_data = ["a", "b", "c", "d", "e"]
    other_data = [1, 2, 3, 4, 5]

    pdf = pd.DataFrame()
    gdf = DataFrame()
    pdf["a"] = pd.Series(str_data, dtype="str")
    gdf["a"] = Series(str_data, dtype="str")
    pdf["b"] = other_data
    gdf["b"] = other_data

    expect = pdf.groupby("a").count()
    got = gdf.groupby("a").count()

    assert_eq(expect, got, check_dtype=False)
def test_groupby_iterate_groups():
    np.random.seed(0)
    df = DataFrame()
    nelem = 20
    df["key1"] = np.random.randint(0, 3, nelem)
    df["key2"] = np.random.randint(0, 2, nelem)
    df["val1"] = np.random.random(nelem)
    df["val2"] = np.random.random(nelem)

    def assert_values_equal(arr):
        np.testing.assert_array_equal(arr[0], arr)

    for name, grp in df.groupby(["key1", "key2"]):
        pddf = grp.to_pandas()
        for k in "key1,key2".split(","):
            assert_values_equal(pddf[k].values)
def test_groupby_cats():
    df = DataFrame()
    df["cats"] = pd.Categorical(list("aabaacaab"))
    df["vals"] = np.random.random(len(df))

    cats = df["cats"].values_host
    vals = df["vals"].to_array()

    grouped = df.groupby(["cats"], as_index=False).mean()

    got_vals = grouped["vals"]
    got_cats = grouped["cats"]

    for i in range(len(got_vals)):
        expect = vals[cats == got_cats[i]].mean()
        np.testing.assert_almost_equal(got_vals[i], expect)
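# The next test expects ``str_data`` and ``num_keys`` to be supplied by pytest
# parametrization, but the decorators are not part of this section. The sketch
# below shows one plausible way to provide them; the specific values are
# assumptions for illustration, not the suite's actual parameters.
@pytest.mark.parametrize(
    "str_data", [[], ["a", "b", "c", "d", "e"]]  # assumed example inputs
)
@pytest.mark.parametrize("num_keys", [1, 2, 3])  # assumed example key counts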
def test_string_groupby_key(str_data, num_keys):
    other_data = [1, 2, 3, 4, 5][: len(str_data)]

    pdf = pd.DataFrame()
    gdf = DataFrame()
    for i in range(num_keys):
        pdf[i] = pd.Series(str_data, dtype="str")
        gdf[i] = Series(str_data, dtype="str")
    pdf["a"] = other_data
    gdf["a"] = other_data

    expect = pdf.groupby(list(range(num_keys)), as_index=False).count()
    got = gdf.groupby(list(range(num_keys)), as_index=False).count()

    expect = expect.sort_values([0]).reset_index(drop=True)
    got = got.sort_values([0]).reset_index(drop=True)

    assert_eq(expect, got, check_dtype=False)
def test_groupby_as_df():
    np.random.seed(0)
    df = DataFrame()
    nelem = 20
    df["key1"] = np.random.randint(0, 3, nelem)
    df["key2"] = np.random.randint(0, 2, nelem)
    df["val1"] = np.random.random(nelem)
    df["val2"] = np.random.random(nelem)

    def assert_values_equal(arr):
        np.testing.assert_array_equal(arr[0], arr)

    df, segs = df.groupby(["key1", "key2"], method="cudf").as_df()
    for s, e in zip(segs, list(segs[1:]) + [None]):
        grp = df[s:e]
        pddf = grp.to_pandas()
        for k in "key1,key2".split(","):
            assert_values_equal(pddf[k].values)
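# ``method`` in the test below is injected by parametrization that is outside
# this section. A hedged sketch, assuming the legacy backend names accepted by
# the old groupby ``method=`` argument; the exact list is an assumption.
@pytest.mark.parametrize("method", ["hash", "cudf"])  # assumed example methods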
def test_groupby_cats(method):
    df = DataFrame()
    df["cats"] = pd.Categorical(list("aabaacaab"))
    df["vals"] = np.random.random(len(df))

    cats = np.asarray(list(df["cats"]))
    vals = df["vals"].to_array()

    grouped = df.groupby(["cats"], method=method, as_index=False).mean()

    got_vals = grouped["vals"]
    got_cats = grouped["cats"]

    for c, v in zip(got_cats, got_vals):
        expect = vals[cats == c].mean()
        np.testing.assert_almost_equal(v, expect)
def test_groupby_apply():
    np.random.seed(0)
    df = DataFrame()
    nelem = 20
    df["key1"] = np.random.randint(0, 3, nelem)
    df["key2"] = np.random.randint(0, 2, nelem)
    df["val1"] = np.random.random(nelem)
    df["val2"] = np.random.random(nelem)

    expect_grpby = df.to_pandas().groupby(["key1", "key2"], as_index=False)
    got_grpby = df.groupby(["key1", "key2"])

    def foo(df):
        df["out"] = df["val1"] + df["val2"]
        return df

    expect = expect_grpby.apply(foo)
    got = got_grpby.apply(foo)

    assert_eq(expect, got)
def test_groupby_apply_grouped():
    from numba import cuda

    np.random.seed(0)
    df = DataFrame()
    nelem = 20
    df["key1"] = np.random.randint(0, 3, nelem)
    df["key2"] = np.random.randint(0, 2, nelem)
    df["val1"] = np.random.random(nelem)
    df["val2"] = np.random.random(nelem)

    expect_grpby = df.to_pandas().groupby(
        ["key1", "key2"], as_index=False, sort=True
    )
    got_grpby = df.groupby(["key1", "key2"], sort=True)

    def foo(key1, val1, com1, com2):
        for i in range(cuda.threadIdx.x, len(key1), cuda.blockDim.x):
            com1[i] = key1[i] * 10000 + val1[i]
            com2[i] = i

    got = got_grpby.apply_grouped(
        foo,
        incols=["key1", "val1"],
        outcols={"com1": np.float64, "com2": np.int32},
        tpb=8,
    )

    got = got.to_pandas()

    # Get expected result by emulating the operation in pandas
    def emulate(df):
        df["com1"] = df.key1 * 10000 + df.val1
        df["com2"] = np.arange(len(df), dtype=np.int32)
        return df

    expect = expect_grpby.apply(emulate)
    expect = expect.sort_values(["key1", "key2"])

    assert_eq(expect, got)
def test_groupby_apply():
    np.random.seed(0)
    df = DataFrame()
    nelem = 20
    df["key1"] = np.random.randint(0, 3, nelem)
    df["key2"] = np.random.randint(0, 2, nelem)
    df["val1"] = np.random.random(nelem)
    df["val2"] = np.random.random(nelem)

    expect_grpby = df.to_pandas().groupby(["key1", "key2"], as_index=False)
    got_grpby = df.groupby(["key1", "key2"], method="cudf")

    def foo(df):
        df["out"] = df["val1"] + df["val2"]
        return df

    expect = expect_grpby.apply(foo)
    expect = expect.sort_values(["key1", "key2"]).reset_index(drop=True)

    got = got_grpby.apply(foo).to_pandas()

    pd.util.testing.assert_frame_equal(expect, got)
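# ``str_data``, ``num_cols``, and ``agg`` in the following test come from
# parametrization outside this section. The decorators below are a hedged
# sketch with assumed example values, chosen only to make the test callable;
# they are not taken from the actual suite.
@pytest.mark.parametrize(
    "str_data", [[], ["a", "b", "c", "d", "e"]]  # assumed example inputs
)
@pytest.mark.parametrize("num_cols", [1, 2, 3])  # assumed column counts
@pytest.mark.parametrize("agg", ["count", "max", "min"])  # assumed aggregations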
def test_string_groupby_non_key(str_data, num_cols, agg):
    other_data = [1, 2, 3, 4, 5][: len(str_data)]

    pdf = pd.DataFrame()
    gdf = DataFrame()
    for i in range(num_cols):
        pdf[i] = pd.Series(str_data, dtype="str")
        gdf[i] = Series(str_data, dtype="str")
    pdf["a"] = other_data
    gdf["a"] = other_data

    expect = getattr(pdf.groupby("a", as_index=False), agg)()
    got = getattr(gdf.groupby("a", as_index=False), agg)()

    expect = expect.sort_values(["a"]).reset_index(drop=True)
    got = got.sort_values(["a"]).reset_index(drop=True)

    if agg in ["min", "max"] and len(expect) == 0 and len(got) == 0:
        for i in range(num_cols):
            expect[i] = expect[i].astype("str")

    assert_eq(expect, got, check_dtype=False)