Exemple #1
0
def test_string_groupby_non_key(str_data, str_data_raise, num_cols):
    """Check ``count``, ``max`` and ``min`` of string columns grouped by ``'a'``.

    Matching pandas and cudf frames are built with ``num_cols`` string
    columns (labelled ``0 .. num_cols - 1``) made from ``str_data`` plus an
    integer key column ``'a'``. Each aggregation result is sorted on the key
    and re-indexed before the two libraries are compared.

    All comparisons run inside the context manager returned by
    ``raise_builder([str_data_raise], GDFError)``.
    NOTE(review): presumably that context expects ``GDFError`` when
    ``str_data_raise`` is truthy -- confirm against ``raise_builder``.
    """
    # One integer key per string row.
    int_keys = [1, 2, 3, 4, 5][:len(str_data)]

    pdf = pd.DataFrame()
    gdf = DataFrame()
    for col in range(num_cols):
        pdf[col] = pd.Series(str_data, dtype='str')
        gdf[col] = Series(str_data, dtype='str')
    pdf['a'] = int_keys
    gdf['a'] = int_keys

    def _order_by_key(frame):
        # Group order is not compared, so normalise on the key column.
        return frame.sort_values(['a']).reset_index(drop=True)

    with raise_builder([str_data_raise], GDFError):
        expect = pdf.groupby('a', as_index=False).count()
        got = gdf.groupby('a', as_index=False).count()
        expect, got = _order_by_key(expect), _order_by_key(got)
        assert_eq(expect, got)

        for agg in ('max', 'min'):
            expect = getattr(pdf.groupby('a', as_index=False), agg)()
            got = getattr(gdf.groupby('a', as_index=False), agg)()
            expect, got = _order_by_key(expect), _order_by_key(got)

            # When both results are empty the pandas columns are cast to
            # 'str' before comparing (presumably to line the dtypes up with
            # cudf -- confirm).
            if not len(expect) and not len(got):
                for col in range(num_cols):
                    expect[col] = expect[col].astype('str')

            assert_eq(expect, got)
Exemple #2
0
def test_groupby_apply_basic_agg_single_column():
    """Sum one column selected from a two-key groupby and compare with pandas.

    The pandas frame is derived from the cudf frame via ``to_pandas()`` so
    both libraries aggregate identical data.
    """
    key_values = [0, 0, 1, 1, 2, 2, 0]
    val_values = [0, 1, 2, 3, 4, 5, 6]

    gdf = DataFrame()
    gdf["key"] = key_values
    gdf["val"] = val_values
    gdf["mult"] = gdf["key"] * gdf["val"]
    pdf = gdf.to_pandas()

    group_keys = ["key", "val"]
    got = gdf.groupby(group_keys).mult.sum()
    expect = pdf.groupby(group_keys).mult.sum()
    assert_eq(expect, got)
Exemple #3
0
def test_groupby_apply_basic_agg_single_column():
    """Compare cudf and pandas when summing ``mult`` grouped by two keys.

    ``mult`` is the element-wise product of ``key`` and ``val``; the pandas
    frame is obtained from the cudf one with ``to_pandas()``.
    """
    gdf = DataFrame()
    for name, values in (('key', [0, 0, 1, 1, 2, 2, 0]),
                         ('val', [0, 1, 2, 3, 4, 5, 6])):
        gdf[name] = values
    gdf['mult'] = gdf['key'] * gdf['val']
    pdf = gdf.to_pandas()

    by = ['key', 'val']
    cudf_sum = gdf.groupby(by).mult.sum()
    pandas_sum = pdf.groupby(by).mult.sum()
    assert_eq(pandas_sum, cudf_sum)
Exemple #4
0
def test_string_groupby_non_key(str_data, num_cols):
    """Check ``count``, ``max`` and ``min`` of string columns grouped by ``"a"``.

    Matching pandas and cudf frames are built with ``num_cols`` string
    columns (labelled ``0 .. num_cols - 1``) made from ``str_data`` plus an
    integer key column ``"a"``. Each aggregation result is sorted on the key
    and re-indexed, then compared with ``check_dtype=False``.
    """
    # One integer key per string row.
    int_keys = [1, 2, 3, 4, 5][:len(str_data)]

    pdf = pd.DataFrame()
    gdf = DataFrame()
    for col in range(num_cols):
        pdf[col] = pd.Series(str_data, dtype="str")
        gdf[col] = Series(str_data, dtype="str")
    pdf["a"] = int_keys
    gdf["a"] = int_keys

    def _order_by_key(frame):
        # Group order is not compared, so normalise on the key column.
        return frame.sort_values(["a"]).reset_index(drop=True)

    expect = pdf.groupby("a", as_index=False).count()
    got = gdf.groupby("a", as_index=False).count()
    expect, got = _order_by_key(expect), _order_by_key(got)
    assert_eq(expect, got, check_dtype=False)

    for agg in ("max", "min"):
        expect = getattr(pdf.groupby("a", as_index=False), agg)()
        got = getattr(gdf.groupby("a", as_index=False), agg)()
        expect, got = _order_by_key(expect), _order_by_key(got)

        # When both results are empty the pandas columns are cast to "str"
        # before comparing (presumably to line up with cudf -- confirm).
        if not len(expect) and not len(got):
            for col in range(num_cols):
                expect[col] = expect[col].astype("str")

        assert_eq(expect, got, check_dtype=False)
Exemple #5
0
def test_string_groupby_key_index():
    """Group by a string column left as the index and compare ``count``.

    The cudf result is compared against pandas with ``check_dtype=False``.
    """
    labels = ["a", "b", "c", "d", "e"]
    numbers = [1, 2, 3, 4, 5]

    pdf = pd.DataFrame()
    gdf = DataFrame()

    # Column "a": the string group key.
    pdf["a"] = pd.Series(labels, dtype="str")
    gdf["a"] = Series(labels, dtype="str")

    # Column "b": the values that get counted.
    pdf["b"] = numbers
    gdf["b"] = numbers

    pandas_counts = pdf.groupby("a").count()
    cudf_counts = gdf.groupby("a").count()

    assert_eq(pandas_counts, cudf_counts, check_dtype=False)
Exemple #6
0
def test_string_groupby_key_index():
    """Group by a string column left as the index and compare ``count``.

    The cudf result must match pandas under the default ``assert_eq``
    settings.
    """
    labels = ['a', 'b', 'c', 'd', 'e']
    numbers = [1, 2, 3, 4, 5]

    pdf = pd.DataFrame()
    gdf = DataFrame()

    # Column 'a': the string group key.
    pdf['a'] = pd.Series(labels, dtype="str")
    gdf['a'] = Series(labels, dtype="str")

    # Column 'b': the values that get counted.
    pdf['b'] = numbers
    gdf['b'] = numbers

    pandas_counts = pdf.groupby('a').count()
    cudf_counts = gdf.groupby('a').count()

    assert_eq(pandas_counts, cudf_counts)
Exemple #7
0
def test_groupby_iterate_groups():
    """Iterate over a two-key groupby and check each group is homogeneous.

    For every group yielded by the ``method="cudf"`` groupby, each key column
    must hold a single repeated value (every element equals the first one).
    """
    np.random.seed(0)
    nelem = 20

    # The random draws keep their original order so the seeded data is
    # unchanged.
    df = DataFrame()
    df['key1'] = np.random.randint(0, 3, nelem)
    df['key2'] = np.random.randint(0, 2, nelem)
    df['val1'] = np.random.random(nelem)
    df['val2'] = np.random.random(nelem)

    for group in df.groupby(['key1', 'key2'], method="cudf"):
        group_pdf = group.to_pandas()
        for key_name in ['key1', 'key2']:
            key_values = group_pdf[key_name].values
            # Broadcasting the first element against the whole array checks
            # that the key is constant within the group.
            np.testing.assert_array_equal(key_values[0], key_values)
Exemple #8
0
def test_groupby_cats():
    """Group by a categorical column and verify the per-category mean.

    The values are unseeded random numbers; the expected mean for each
    category is recomputed with NumPy from those same values.
    """
    df = DataFrame()
    df['cats'] = pd.Categorical(list('aabaacaab'))
    df['vals'] = np.random.random(len(df))

    # Host-side copies used to compute the reference means.
    cat_labels = np.asarray(list(df['cats']))
    host_vals = df['vals'].to_array()

    result = df.groupby(['cats'], method="cudf").mean()

    mean_col = result['vals']
    cat_col = result['cats']

    for cat, mean_val in zip(cat_col, mean_col):
        print(cat, mean_val)
        reference = host_vals[cat_labels == cat].mean()
        np.testing.assert_almost_equal(mean_val, reference)
Exemple #9
0
def test_string_groupby_key_index():
    """Grouping by a string key that ends up in the index must raise.

    ``gdf.groupby('a').count()`` is expected to raise
    ``NotImplementedError`` with a message matching
    "Strings are not yet supported in the index".
    """
    str_data = ['a', 'b', 'c', 'd', 'e']
    other_data = [1, 2, 3, 4, 5]

    gdf = DataFrame()
    gdf['a'] = Series(str_data, dtype="str")
    gdf['b'] = other_data

    # Only the raising call belongs inside the block. Statements placed after
    # it would never run once the exception fires, so the former pandas
    # comparison was dead code and has been dropped.
    with pytest.raises(NotImplementedError,
                       match="Strings are not yet supported in the index"):
        gdf.groupby('a').count()
Exemple #10
0
def test_string_groupby_key(str_data, num_keys):
    """Group by ``num_keys`` string columns and compare ``count`` with pandas.

    The key columns are labelled ``0 .. num_keys - 1`` and all hold
    ``str_data``; column ``"a"`` holds the integers being counted. Results
    are sorted on key ``0`` and re-indexed, then compared with
    ``check_dtype=False``.
    """
    counted = [1, 2, 3, 4, 5][:len(str_data)]

    pdf = pd.DataFrame()
    gdf = DataFrame()
    for key in range(num_keys):
        pdf[key] = pd.Series(str_data, dtype="str")
        gdf[key] = Series(str_data, dtype="str")
    pdf["a"] = counted
    gdf["a"] = counted

    def _order_by_first_key(frame):
        # Group order is not compared, so normalise on the first key column.
        return frame.sort_values([0]).reset_index(drop=True)

    expect = pdf.groupby(list(range(num_keys)), as_index=False).count()
    got = gdf.groupby(list(range(num_keys)), as_index=False).count()
    expect, got = _order_by_first_key(expect), _order_by_first_key(got)

    assert_eq(expect, got, check_dtype=False)
Exemple #11
0
def test_groupby_as_df():
    """Check that ``as_df()`` segments each hold a single key combination.

    ``as_df()`` returns a frame and a sequence ``segs``. Each entry of
    ``segs`` is used as a slice start, with the following entry (or the end
    of the frame for the last one) as the slice stop. Within every such
    slice both key columns must be constant.
    """
    np.random.seed(0)
    nelem = 20

    # The random draws keep their original order so the seeded data is
    # unchanged.
    df = DataFrame()
    df['key1'] = np.random.randint(0, 3, nelem)
    df['key2'] = np.random.randint(0, 2, nelem)
    df['val1'] = np.random.random(nelem)
    df['val2'] = np.random.random(nelem)

    grouped_df, segs = df.groupby(['key1', 'key2'], method="cudf").as_df()

    # Pair each segment start with the next start; None closes the last one.
    seg_stops = list(segs[1:]) + [None]
    for start, stop in zip(segs, seg_stops):
        segment_pdf = grouped_df[start:stop].to_pandas()
        for key_name in ['key1', 'key2']:
            key_values = segment_pdf[key_name].values
            # Every element must equal the first one.
            np.testing.assert_array_equal(key_values[0], key_values)
Exemple #12
0
def test_groupby_cats(method):
    """Group by a categorical column and verify the per-category mean.

    ``method`` is forwarded to ``DataFrame.groupby``. The values are unseeded
    random numbers; the expected mean for each category is recomputed with
    NumPy from those same values.
    """
    df = DataFrame()
    df["cats"] = pd.Categorical(list("aabaacaab"))
    df["vals"] = np.random.random(len(df))

    # Host-side copies used to compute the reference means.
    cat_labels = np.asarray(list(df["cats"]))
    host_vals = df["vals"].to_array()

    result = df.groupby(["cats"], method=method, as_index=False).mean()

    mean_col = result["vals"]
    cat_col = result["cats"]

    for cat, mean_val in zip(cat_col, mean_col):
        print(cat, mean_val)
        reference = host_vals[cat_labels == cat].mean()
        np.testing.assert_almost_equal(mean_val, reference)
Exemple #13
0
def test_groupby_apply_grouped():
    """Run a CUDA kernel per group via ``apply_grouped`` and compare to pandas.

    The kernel writes ``com1 = key1 * 10000 + val1`` and ``com2 = i`` (the
    row position within the group). The same computation is emulated with a
    pandas ``groupby(...).apply`` and the two frames must be equal.
    """
    from numba import cuda

    np.random.seed(0)
    df = DataFrame()
    nelem = 20
    df["key1"] = np.random.randint(0, 3, nelem)
    df["key2"] = np.random.randint(0, 2, nelem)
    df["val1"] = np.random.random(nelem)
    df["val2"] = np.random.random(nelem)

    expect_grpby = df.to_pandas().groupby(["key1", "key2"], as_index=False)
    got_grpby = df.groupby(["key1", "key2"], method="cudf")

    def foo(key1, val1, com1, com2):
        # Each thread starts at its own thread index and steps by the block
        # size until the end of the group's rows.
        for i in range(cuda.threadIdx.x, len(key1), cuda.blockDim.x):
            com1[i] = key1[i] * 10000 + val1[i]
            com2[i] = i

    got = got_grpby.apply_grouped(
        foo,
        incols=["key1", "val1"],
        outcols={
            "com1": np.float64,
            "com2": np.int32
        },
        tpb=8,
    )

    got = got.to_pandas()

    # Get expected result by emulating the operation in pandas
    def emulate(df):
        df["com1"] = df.key1 * 10000 + df.val1
        df["com2"] = np.arange(len(df), dtype=np.int32)
        return df

    expect = expect_grpby.apply(emulate)
    expect = expect.sort_values(["key1", "key2"]).reset_index(drop=True)

    # pandas.util.testing was deprecated in pandas 1.0 and later removed;
    # pandas.testing is the supported public location of this helper.
    pd.testing.assert_frame_equal(expect, got)
Exemple #14
0
def test_string_groupby_key(str_data, str_data_raise, num_keys):
    """Group by ``num_keys`` string columns and compare ``count`` with pandas.

    The key columns are labelled ``0 .. num_keys - 1`` and all hold
    ``str_data``; column ``'a'`` holds the integers being counted. Results
    are sorted on key ``0`` and re-indexed before the comparison.

    The comparison runs inside the context manager returned by
    ``raise_builder([str_data_raise], GDFError)``.
    NOTE(review): presumably that context expects ``GDFError`` when
    ``str_data_raise`` is truthy -- confirm against ``raise_builder``.
    """
    counted = [1, 2, 3, 4, 5][:len(str_data)]

    pdf = pd.DataFrame()
    gdf = DataFrame()
    for key in range(num_keys):
        pdf[key] = pd.Series(str_data, dtype='str')
        gdf[key] = Series(str_data, dtype='str')
    pdf['a'] = counted
    gdf['a'] = counted

    def _order_by_first_key(frame):
        # Group order is not compared, so normalise on the first key column.
        return frame.sort_values([0]).reset_index(drop=True)

    with raise_builder([str_data_raise], GDFError):
        expect = pdf.groupby(list(range(num_keys)), as_index=False).count()
        got = gdf.groupby(list(range(num_keys)), as_index=False).count()
        expect, got = _order_by_first_key(expect), _order_by_first_key(got)

        assert_eq(expect, got)
Exemple #15
0
def test_groupby_apply():
    """Apply a frame-level function per group and compare cudf with pandas.

    The applied function adds an ``out`` column equal to ``val1 + val2``.
    The pandas result is sorted on both keys and re-indexed before being
    compared with the cudf result converted via ``to_pandas()``.
    """
    np.random.seed(0)
    df = DataFrame()
    nelem = 20
    df['key1'] = np.random.randint(0, 3, nelem)
    df['key2'] = np.random.randint(0, 2, nelem)
    df['val1'] = np.random.random(nelem)
    df['val2'] = np.random.random(nelem)

    expect_grpby = df.to_pandas().groupby(['key1', 'key2'], as_index=False)
    got_grpby = df.groupby(['key1', 'key2'], method="cudf")

    def foo(df):
        # Used for both libraries: relies only on column get/set by name.
        df['out'] = df['val1'] + df['val2']
        return df

    expect = expect_grpby.apply(foo)
    expect = expect.sort_values(['key1', 'key2']).reset_index(drop=True)

    got = got_grpby.apply(foo).to_pandas()

    # pandas.util.testing was deprecated in pandas 1.0 and later removed;
    # pandas.testing is the supported public location of this helper.
    pd.testing.assert_frame_equal(expect, got)