Python group_byの例、datar.dplyr.group_by Pythonの例

コード例 #1

0

ファイルを表示

def test_errors():
    """Verify that ``filter()`` raises on malformed conditions.

    Covers a condition of the wrong type, conditions of the wrong length
    (on plain, grouped and rowwise data), wrongly sized data-frame
    conditions, and keyword ("named") arguments. The final statement only
    checks that ``across()`` inside ``filter()`` runs; nothing is asserted
    about its result.
    """
    # wrong type
    with pytest.raises(ValueError):
        iris >> group_by(f.Species) >> filter(range(1, 10))
    with pytest.raises(ValueError):
        iris >> filter(range(1, 10))

    # wrong size
    with pytest.raises(ValueError):
        iris >> group_by(f.Species) >> filter([True, False])
    with pytest.raises(ValueError):
        iris >> rowwise(f.Species) >> filter([True, False])
    with pytest.raises(ValueError):
        iris >> filter([True, False])

    # wrong size in column
    with pytest.raises(ValueError):
        iris >> group_by(f.Species) >> filter(tibble([True, False]))
    with pytest.raises(ValueError):
        iris >> rowwise() >> filter(tibble([True, False]))
    with pytest.raises(ValueError):
        iris >> filter(tibble([True, False]))
    with pytest.raises(ValueError):
        tibble(x=1) >> filter([True, False])

    # named inputs
    # Keyword arguments are rejected with TypeError, whether given alone or
    # after a positional condition.
    with pytest.raises(TypeError):
        mtcars >> filter(x=1)
    with pytest.raises(TypeError):
        mtcars >> filter(f.y > 2, z=3)
    with pytest.raises(TypeError):
        mtcars >> filter(True, x=1)

    # across() in filter() does not warn yet
    tibble(x=1, y=2) >> filter(across(everything(), lambda x: x > 0))

コード例 #2

0

ファイルを表示

ファイル: test_group_by.py プロジェクト: pwwang/datar

def test_compound_ungroup():
    """Exercise ``ungroup()`` on a scalar, a series groupby and tibbles."""
    # Ungrouping a plain scalar compares equal to the scalar itself.
    assert ungroup(1) == 1

    grouped_series = Series([1, 2, 3]).groupby([1, 1, 2])
    # Ungrouping a series groupby returns the object it wraps.
    assert ungroup(grouped_series) is grouped_series.obj

    # Passing a column to a series groupby is an error.
    with pytest.raises(ValueError):
        ungroup(grouped_series, "abc")

    by_xy = tibble(x=1, y=2) >> group_by(f.x, f.y)

    # No columns given: every grouping variable is removed.
    fully_ungrouped = ungroup(by_xy)
    assert group_vars(fully_ungrouped) == []

    # Removing one grouping variable leaves the other in place.
    without_x = ungroup(by_xy, f.x)
    assert group_vars(without_x) == ["y"]

    without_y = ungroup(by_xy, f.y)
    assert group_vars(without_y) == ["x"]

    # Re-adding an existing grouping variable keeps the original order.
    regrouped = group_by(by_xy, f.y, _add=True)
    assert group_vars(regrouped) == ["x", "y"]

    # Partial ungrouping of a rowwise frame is rejected.
    rowwise_frame = by_xy >> rowwise()
    with pytest.raises(ValueError):
        ungroup(rowwise_frame, f.x)

    # Grouping by a column that does not exist is rejected.
    with pytest.raises(KeyError):
        group_by(by_xy, f.w)

コード例 #3

0

ファイルを表示

def test_slice_works_with_grouped_data():
    """Check ``slice()`` on grouped frames, including invalid indices."""
    by_cyl = mtcars >> arrange(f.cyl) >> group_by(f.cyl)

    # slice(f[:2]) must agree with filtering on row_number() < 3.
    leading = slice(by_cyl, f[:2])
    leading_expected = filter(by_cyl, row_number() < 3)
    assert_frame_equal(leading, leading_expected)

    # The negated selection must agree with row_number() >= 3.
    trailing = slice(by_cyl, ~f[:2])
    trailing_expected = filter(by_cyl, row_number() >= 3)
    assert_tibble_equal(trailing, trailing_expected)

    by_x = group_by(tibble(x=c(1, 1, 2, 2, 2)), f.x)
    # Disabled in the original test (the _preserve=True variant):
    # out = group_keys(slice(g, 3, _preserve=True))
    # assert out.x.tolist() == [1, 2]
    remaining_keys = group_keys(slice(by_x, 2, _preserve=False))
    assert remaining_keys.x.tolist() == [2]

    # Grouped by a categorical that has an unused level, with _drop=False.
    cat_grouped = tibble(x=f[1:4]) >> group_by(
        g=Categorical([1, 1, 2], categories=[1, 2, 3]),
        _drop=False,
    )

    # A string index is a type error; mixing negated and plain indices
    # is a value error.
    with pytest.raises(TypeError):
        cat_grouped >> slice("a")
    with pytest.raises(ValueError):
        cat_grouped >> slice(~f[:2], 1)

    first_rows = cat_grouped >> slice(0)
    assert first_rows.shape[0] == 2

    # Index supplied as a grouped Series keyed by the frame's group index.
    group_index = cat_grouped._datar["grouped"].grouper.result_index
    grouped_positions = Series([1, 0, 0]).groupby(group_index)
    picked = cat_grouped >> slice(grouped_positions)
    assert_iterable_equal(picked.x.obj, [2, 3])

コード例 #4

0

ファイルを表示

ファイル: test_group_by.py プロジェクト: pwwang/datar

def test_summarise_maintains_drop():
    """Check group counts under ``_drop`` and that ``summarise()`` keeps it."""
    factors = tibble(
        f1=factor("a", levels=c("a", "b", "c")),
        f2=factor("d", levels=c("d", "e", "f", "g")),
        x=42,
    )

    # Two factors, unused levels dropped: a single group remains.
    dropped = factors >> group_by(f.f1, f.f2, _drop=True)
    n_found = n_groups(dropped)
    assert n_found == 1
    assert group_by_drop_default(dropped)

    # DataFrame.groupby(..., observed=False) doesn't support
    # multiple categoricals, so this case stays disabled:
    # res1 = df >> group_by(f.f1, f.f2, _drop=False)
    # ng = n_groups(res1)
    # assert ng == 12

    by_f1_dropped = factors >> group_by(f.f1, _drop=True)
    n_found = n_groups(by_f1_dropped)
    assert n_found == 1

    # With _drop=False the group count equals the number of levels.
    by_f1_kept = factors >> group_by(f.f1, _drop=False)
    n_found = n_groups(by_f1_kept)
    assert n_found == 3

    by_f2_kept = factors >> group_by(f.f2, _drop=False)
    n_found = n_groups(by_f2_kept)
    assert n_found == 4

    # Summarising the dropped grouping keeps the drop setting.
    summarised = dropped >> summarise(x=sum(f.x), _groups="drop_last")
    n_found = n_groups(summarised)
    assert n_found == 1
    assert group_by_drop_default(summarised)

コード例 #5

0

ファイルを表示

ファイル: test_distinct.py プロジェクト: pwwang/datar

def test_switch_groupby_distinct_equal():
    """``distinct()`` and ``group_by()`` give equal results in either order."""
    source = tibble(g=c(1, 2), x=c(1, 2))

    distinct_first = source >> distinct() >> group_by(f.g)
    grouped_first = source >> group_by(f.g) >> distinct()

    assert distinct_first.equals(grouped_first)

コード例 #6

0

ファイルを表示

ファイル: test_group_by.py プロジェクト: pwwang/datar

def test_auto_splicing():
    """Data-frame and ``across()`` arguments to ``group_by()`` are spliced.

    Each case compares grouping by explicit columns against grouping by
    an equivalent data frame or ``across()`` selection.
    """
    # A one-column tibble behaves like the column it contains.
    by_column = iris >> group_by(f.Species)
    by_frame = iris >> group_by(tibble(Species=iris.Species))
    assert by_column.equals(by_frame)

    # across() over a single column behaves like that column.
    by_column = iris >> group_by(f.Species)
    by_across = iris >> group_by(across(f.Species))
    assert by_column.equals(by_across)

    # across() with a function matches mutate-then-group.
    rounded = iris >> mutate(across(starts_with("Sepal"), round))
    explicit = rounded >> group_by(f.Sepal_Length, f.Sepal_Width)
    spliced = iris >> group_by(across(starts_with("Sepal"), round))
    assert explicit.equals(spliced)

    # across(character()), across(NULL) not supported

    # across() followed by a plain column.
    rounded = iris >> mutate(across(starts_with("Sepal"), round))
    explicit = rounded >> group_by(f.Sepal_Length, f.Sepal_Width, f.Species)
    spliced = iris >> group_by(
        across(starts_with("Sepal"), round),
        f.Species,
    )
    assert explicit.equals(spliced)

    # A plain column followed by across().
    rounded = iris >> mutate(across(starts_with("Sepal"), round))
    explicit = rounded >> group_by(f.Species, f.Sepal_Length, f.Sepal_Width)
    spliced = iris >> group_by(
        f.Species,
        across(starts_with("Sepal"), round),
    )
    assert explicit.equals(spliced)

コード例 #7

0

ファイルを表示

ファイル: test_group_by.py プロジェクト: pwwang/datar

def test_mutate_semantics():
    """Named ``group_by()`` expressions match mutate-then-group.

    The second expression (``d``) refers to the first (``c``), as it
    would inside ``mutate()``.
    """
    grouped_directly = tibble(a=1, b=2) >> group_by(c=f.a * f.b, d=f.c + 1)

    mutated = tibble(a=1, b=2) >> mutate(c=f.a * f.b, d=f.c + 1)
    grouped_after_mutate = mutated >> group_by(f.c, f.d)

    assert grouped_directly.equals(grouped_after_mutate)

コード例 #8

0

ファイルを表示

def test_group_map_errors():
    """Verify the errors raised by ``group_modify()`` for bad callbacks."""
    # head1 = lambda df: head(df, 1)

    # group_modify()
    # A callback whose result contains the grouping column ("cyl") is
    # rejected with a message mentioning "grouping variables".
    with pytest.raises(ValueError, match="grouping variables"):
        mtcars >> group_by(f.cyl) >> group_modify(lambda df: tibble(cyl=19))
    # A callback returning a non-data-frame value (here an int) is rejected.
    with pytest.raises(ValueError, match="should be a data frame"):
        mtcars >> group_by(f.cyl) >> group_modify(lambda df: 10)

コード例 #9

0

ファイルを表示

ファイル: test_group_by.py プロジェクト: pwwang/datar

def test_add(df):
    """``group_by(..., _add=True)`` accumulates grouping variables in order.

    ``df`` is supplied by a fixture defined outside this block.
    """
    # Both variables in a single call.
    single_call = df >> group_by(f.x, f.y, _add=True)
    found_vars = group_vars(single_call)
    assert found_vars == ["x", "y"]

    # One variable per call, chained.
    by_x = df >> group_by(f.x, _add=True)
    chained = by_x >> group_by(f.y, _add=True)
    found_vars = group_vars(chained)
    assert found_vars == ["x", "y"]

コード例 #10

0

ファイルを表示

def test_add_tally_can_be_given_a_weighting_variable():
    """``add_tally(wt=...)`` sums the weight within each group."""
    weighted = tibble(a=c(1, 1, 2, 2, 2), w=c(1, 1, 2, 3, 4))
    by_a = weighted >> group_by(f.a)

    # Weight given as a plain column: per-group sums of w.
    totals = by_a >> add_tally(wt=f.w) >> pull(f.n, to="list")
    assert totals == [2, 2, 9, 9, 9]

    # Weight given as an expression: per-group sums of w + 1.
    shifted_totals = by_a >> add_tally(wt=f.w + 1) >> pull(f.n, to="list")
    assert shifted_totals == [4, 4, 12, 12, 12]

コード例 #11

0

ファイルを表示

def test_handles_scalar_results():
    """``filter()`` accepts a condition that reduces to a single value."""
    # Ungrouped: a condition that holds keeps the frame unchanged.
    unfiltered = mtcars >> filter(min(f.mpg) > 0)
    assert unfiltered.equals(mtcars)

    # Grouped: filtering then sorting must match sorting alone.
    by_cyl = mtcars >> group_by(f.cyl)
    filtered_sorted = by_cyl >> filter(min(f.mpg) > 0) >> arrange(f.cyl, f.mpg)
    # See TibbleGrouped's Known issues
    sorted_only = mtcars >> group_by(f.cyl) >> arrange(f.cyl, f.mpg)
    assert_frame_equal(filtered_sorted, sorted_only)

コード例 #12

0

ファイルを表示

ファイル: test_group_by.py プロジェクト: pwwang/datar

def test_mutate_does_not_loose_variables():
    """Grouping by a new named expression keeps the existing columns."""
    source = tibble(
        a=rep([1, 2, 3, 4], 2),
        b=rep([1, 2, 3, 4], each=2),
        x=runif(8),
    )
    summed = (
        source
        >> group_by(f.a, f.b)
        >> summarise(x=sum(f.x), _groups="drop_last")
    )
    binned = summed >> group_by(quantile=ntile(f.x, 4))

    # The new grouping column is appended after the existing ones.
    column_names = binned.columns.tolist()
    assert column_names == ["a", "b", "x", "quantile"]

コード例 #13

0

ファイルを表示

ファイル: test_group_by.py プロジェクト: pwwang/datar

def test_remember_drop_False():
    """A frame grouped with ``_drop=False`` keeps that setting on regroup."""
    setosa_only = iris >> filter(f.Species == "setosa")
    keep_levels = setosa_only >> group_by(f.Species, _drop=False)
    assert not group_by_drop_default(keep_levels)

    # Regrouping without an explicit _drop must not reset the flag.
    regrouped = keep_levels >> group_by(f.Species)
    assert not group_by_drop_default(regrouped)

コード例 #14

0

ファイルを表示

ファイル: test_group_by.py プロジェクト: pwwang/datar

def test_0_vars(df):
    """``group_by()`` without variables yields one group with every row.

    The ``df`` fixture is requested but not used in the body.
    """
    # No arguments at all.
    no_args = group_data(group_by(iris))
    assert names(no_args) == ["_rows"]
    assert_iterable_equal(no_args._rows[0], range(nrow(iris)))

    # An empty keyword expansion behaves the same way.
    empty_kwargs = group_data(group_by(iris, **{}))
    assert names(empty_kwargs) == ["_rows"]
    assert_iterable_equal(empty_kwargs._rows[0], range(nrow(iris)))

コード例 #15

0

ファイルを表示

ファイル: test_group_by.py プロジェクト: pwwang/datar

def test_add_passes_drop():
    """``_add=True`` carries over the ``_drop`` setting of the input."""
    factors = tibble(
        f1=factor("b", levels=c("a", "b", "c")),
        f2=factor("g", levels=c("e", "f", "g")),
        x=48,
    )
    by_f1 = group_by(factors, f.f1, _drop=True)
    by_f1_f2 = group_by(by_f1, f.f2, _add=True)

    n_found = n_groups(by_f1_f2)
    assert n_found == 1
    assert group_by_drop_default(by_f1_f2)

コード例 #16

0

ファイルを表示

def test_filter_false_handles_indices(caplog):
    """``filter(False)`` on grouped data with and without ``_preserve``."""
    # With _preserve=True only the logged message is checked; the result
    # itself is not inspected.
    _preserved = mtcars >> group_by(f.cyl) >> filter(False, _preserve=True)
    assert "support" in caplog.text
    # Disabled in the original test:
    # out = group_rows(out)
    # assert out == [[], [], []]

    # With _preserve=False no group rows remain.
    not_preserved = mtcars >> group_by(f.cyl) >> filter(False, _preserve=False)
    remaining_rows = group_rows(not_preserved)
    assert remaining_rows == []

コード例 #17

0

ファイルを表示

def test_deals_with_0_groups():
    """``mutate()`` works on a grouped frame that has no rows."""
    empty_grouped = tibble(x=[]) >> group_by(f.x)

    # Element-wise expression.
    shifted = mutate(empty_grouped, y=f.x + 1)
    expected = tibble(x=[], y=[]) >> group_by(f.x)
    assert_iterable_equal(shifted, expected)
    assert group_vars(shifted) == group_vars(expected)

    # Aggregating expression.
    with_max = mutate(empty_grouped, y=max(f.x))
    assert with_max.shape == (0, 2)
    assert group_vars(with_max) == ["x"]

コード例 #18

0

ファイルを表示

def test_output_preserves_grouping():
    """``add_count()`` gives the same counts and keeps existing grouping."""
    source = tibble(g=c(1, 2, 2, 2))
    expected = tibble(g=c(1, 2, 2, 2), n=c(1, 3, 3, 3))

    # Ungrouped input, counting by an explicit column.
    counted = source >> add_count(f.g)
    assert counted.equals(expected)

    # Grouped input, counting by the existing groups.
    counted_grouped = source >> group_by(f.g) >> add_count()
    expected >>= group_by(f.g)
    assert counted_grouped.equals(expected)
    assert group_vars(counted_grouped) == group_vars(expected)

コード例 #19

0

ファイルを表示

def test_preserve_grouping():
    """``count()`` gives the same counts and keeps existing grouping."""
    source = tibble(g=c(1, 2, 2, 2))
    expected = tibble(g=c(1, 2), n=c(1, 3))

    # Ungrouped input, counting by an explicit column.
    counted = source >> count(f.g)
    assert counted.equals(expected)

    # Grouped input, counting by the existing groups.
    counted_grouped = source >> group_by(f.g) >> count()
    expected_grouped = expected >> group_by(f.g)
    assert counted_grouped.equals(expected_grouped)
    assert group_vars(counted_grouped) == group_vars(expected_grouped)

コード例 #20

0

ファイルを表示

ファイル: test_group_by.py プロジェクト: pwwang/datar

def test_remember_drop_True():
    """The ``_drop=True`` setting survives ``filter()`` and regrouping."""
    by_species = iris >> group_by(f.Species, _drop=True)
    assert group_by_drop_default(by_species)

    # filter() with its default _preserve.
    filtered = by_species >> filter(f.Sepal_Length > 5)
    assert group_by_drop_default(filtered)

    # filter() with _preserve=False.
    filtered_unpreserved = by_species >> filter(
        f.Sepal_Length > 5,
        _preserve=False,
    )
    assert group_by_drop_default(filtered_unpreserved)

    # Regrouping the filtered result.
    regrouped = filtered_unpreserved >> group_by(f.Species)
    assert group_by_drop_default(regrouped)

コード例 #21

0

ファイルを表示

def test_can_add_tallies_of_a_variable():
    """``add_tally()`` on grouped data, with and without ``sort=True``."""
    source = tibble(a=c(2, 1, 1))

    # Default: row order is unchanged, n holds the group size.
    tallied = source >> group_by(f.a) >> add_tally()
    expected = tibble(a=c(2, 1, 1), n=c(1, 2, 2)) >> group_by(f.a)
    assert_frame_equal(tallied, expected)
    assert group_vars(tallied) == group_vars(expected)

    # sort
    tallied_sorted = source >> group_by(f.a) >> add_tally(sort=True)
    expected_sorted = tibble(a=c(1, 1, 2), n=c(2, 2, 1)) >> group_by(f.a)
    assert tallied_sorted.equals(expected_sorted)
    # assert_frame_equal(out, exp) is disabled in the original test.
    assert group_vars(tallied_sorted) == group_vars(expected_sorted)

コード例 #22

0

ファイルを表示

def test_errors(caplog):
    """Check ``summarise()`` log messages and error conditions.

    The first half checks the grouping message captured by ``caplog``
    (cleared after every case, so order matters); the second half checks
    the exceptions raised for bad input.
    """
    frame = tibble(x=1, y=2)

    # Grouped by two variables, no summaries.
    summarised = frame >> group_by(f.x, f.y) >> summarise()
    assert "`summarise()` has grouped output by ['x']" in caplog.text
    assert summarised.equals(frame)
    caplog.clear()

    # A summary of length two per group.
    summarised = tibble(x=1, y=2) >> group_by(f.x, f.y) >> summarise(z=[2, 2])
    assert "`summarise()` has grouped output by ['x', 'y']" in caplog.text
    expected = tibble(x=[1, 1], y=[2, 2], z=[2, 2])
    assert summarised.equals(expected)
    caplog.clear()

    # Rowwise with explicit variables.
    summarised = frame >> rowwise(f.x, f.y) >> summarise()
    assert "`summarise()` has grouped output by ['x', 'y']" in caplog.text
    assert summarised.equals(frame)
    caplog.clear()

    # Rowwise without variables.
    summarised = frame >> rowwise() >> summarise()
    assert "`summarise()` has ungrouped output" in caplog.text
    shape = dim(summarised)
    assert shape == (1, 0)
    caplog.clear()

    # unsupported type (but python objects are supported by pandas)
    # not testing for types further
    # tibble(x=1, y=c(1, 2, 2), z=runif(3)) >> summarise(a=object())

    # incompatible size
    with pytest.raises(ValueError):
        tibble(z=1) >> summarise(x=[1, 2, 3], y=[1, 2])
    with pytest.raises(ValueError):
        tibble(z=[1, 2]) >> group_by(f.z) >> summarise(x=[1, 2, 3], y=[1, 2])
    with pytest.raises(ValueError):
        (
            tibble(z=c(1, 3))
            >> group_by(f.z)
            >> summarise(x=seq_len(f.z), y=[1, 2])
        )

    # Missing variable
    with pytest.raises(KeyError):
        summarise(mtcars, a=mean(f.not_there))

    with pytest.raises(KeyError):
        summarise(group_by(mtcars, f.cyl), a=mean(f.not_there))

    # Duplicate column names
    # NOTE(review): the local is deliberately still called ``x``; the
    # column name presumably comes from the variable name -- confirm
    # before renaming.
    x = 1
    duplicated = tibble(x, x, _name_repair="minimal")
    with pytest.raises(NameNonUniqueError):
        duplicated >> summarise(f.x)

コード例 #23

0

ファイルを表示

def test_group_map_respects_empty_groups():
    """``group_map()`` yields one result per group that is present."""
    # mtcars grouped by cyl: three results.
    per_cyl = group_by(mtcars, f.cyl) >> group_map(lambda df: head(df, 2))
    assert len(list(per_cyl)) == 3

    # Filtering down to a single species leaves a single result.
    setosa_tallies = (
        iris
        >> group_by(f.Species)
        >> filter(f.Species == "setosa")
        >> group_map(tally)
    )
    assert len(list(setosa_tallies)) == 1

    # Same with _drop=False, using the list-returning variant.
    setosa_tally_list = (
        iris
        >> group_by(f.Species, _drop=False)
        >> filter(f.Species == "setosa")
        >> group_map.list(tally)
    )
    # filter unable to keep the structure
    # assert len(res) == 3
    assert len(setosa_tally_list) == 1

コード例 #24

0

ファイルを表示

def test_joins_preserve_groups():
    """Joins keep the grouping of the left-hand frame."""
    left = tibble(a=[1, 2, 3]) >> group_by(f.a)
    right = tibble(a=rep([1, 2, 3, 4], 2), b=1) >> group_by(f.b)

    joined = inner_join(left, right, by="a")
    assert group_vars(joined) == ["a"]

    joined = semi_join(left, right, by="a")
    assert group_vars(joined) == ["a"]

    # See comment in nest_join
    joined = nest_join(left, right, by="a")
    assert group_vars(joined) == ["a"]

コード例 #25

0

ファイルを表示

ファイル: test_group_by.py プロジェクト: pwwang/datar

def test_errors():
    """Verify the exceptions raised by ``group_by()`` and ``ungroup()``."""
    df = tibble(x=1, y=2)

    # Grouping by a column that is not in the frame.
    with pytest.raises(KeyError):
        df >> group_by(f.unknown)

    # Naming a column to ungroup on a frame that is not grouped.
    with pytest.raises(ValueError):
        df >> ungroup(f.x)

    # Ungrouping a column that is not one of the grouping variables.
    with pytest.raises(KeyError):
        df >> group_by(f.x, f.y) >> ungroup(f.z)

    # A named grouping expression that refers to a missing column.
    with pytest.raises(KeyError):
        df >> group_by(z=f.a + 1)

コード例 #26

0

ファイルを表示

def test_slice_handles_df_columns():
    """``slice()`` works on frames that contain data-frame columns."""
    nested = tibble(
        x=[1, 2],
        y=tibble(a=[1, 2], b=[3, 4]),
        z=tibble(A=[1, 2], B=[3, 4]),
    )

    # Ungrouped: slice(0) matches positional selection of the first row.
    first_row = slice(nested, 0)
    assert first_row.equals(nested.iloc[[0], :])

    # Grouped by x: slice(0) returns a frame equal to the grouped input.
    by_x = group_by(nested, f.x)
    assert slice(by_x, 0).equals(by_x)

    # TODO: group_by a stacked df is not supported yet
    by_y = group_by(nested, f["y$a"], f["y$b"])
    assert slice(by_y, 0).equals(by_y)

    by_z = group_by(nested, f["z$A"], f["z$B"])
    assert slice(by_z, 0).equals(by_z)

コード例 #27

0

ファイルを表示

def test_preserve_order_across_groups():
    """Filtering gives the same rows wherever it sits in the pipeline.

    All three pipelines end with ``ungroup()`` and the same ``arrange()``;
    indices are reset before comparing so only row content matters.
    """
    source = tibble(g=c(1, 2, 1, 2, 1), time=[5, 4, 3, 2, 1], x=f.time)

    # Group, then filter.
    group_then_filter = (
        source
        >> group_by(f.g)
        >> filter(f.x <= 4)
        >> ungroup()
        >> arrange(f.g, f.time)
    )

    # Sort by group first, then group and filter.
    sort_group_filter = (
        source
        >> arrange(f.g)
        >> group_by(f.g)
        >> filter(f.x <= 4)
        >> ungroup()
        >> arrange(f.g, f.time)
    )

    # Filter before grouping.
    filter_then_group = (
        source
        >> filter(f.x <= 4)
        >> group_by(f.g)
        >> ungroup()
        >> arrange(f.g, f.time)
    )

    group_then_filter.reset_index(drop=True, inplace=True)
    sort_group_filter.reset_index(drop=True, inplace=True)
    filter_then_group.reset_index(drop=True, inplace=True)

    assert group_then_filter.equals(sort_group_filter)
    assert group_then_filter.equals(filter_then_group)

コード例 #28

0

ファイルを表示

def test_group_column_names_reflect_renamed_duplicate_columns():
    """Grouping after a join omits a column that clashed with the right side.

    The original comment references the R test
    "group column names reflect renamed duplicate columns (#2330)".
    """
    grouped_left = tibble(x=range(1, 6), y=range(1, 6)) >> group_by(f.x, f.y)
    plain_right = tibble(x=range(1, 6), y=range(1, 6))

    # Both sides have a "y" column; after joining on "x" only "x" is
    # reported as a grouping variable.
    joined = inner_join(grouped_left, plain_right, by="x")
    assert group_vars(joined) == ["x"]

コード例 #29

0

ファイルを表示

ファイル: test_distinct.py プロジェクト: pwwang/datar

def test_not_duplicating_cols():
    """``distinct()`` does not repeat a column that is selected twice."""
    source = tibble(a=[1, 2, 3], b=[4, 5, 6])

    # The same column named twice.
    repeated = source >> distinct(f.a, f.a)
    assert repeated.columns.tolist() == ["a"]

    # The column is both the grouping variable and the selection.
    grouped_distinct = source >> group_by(f.a) >> distinct(f.a)
    assert grouped_distinct.columns.tolist() == ["a"]

コード例 #30

0

ファイルを表示

def test_input_recycled():
    """``summarise()`` recycles length-one results to the longest one."""
    # Ungrouped: scalars are recycled against a length-three result.
    actual = tibble() >> summarise(x=1, y=[1, 2, 3], z=1)
    expected = tibble(x=1, y=[1, 2, 3], z=1)
    assert actual.equals(expected)

    by_a = group_by(tibble(a=[1, 2]), f.a)

    # Grouped: recycling happens within each group.
    actual = by_a >> summarise(x=1, y=[1, 2, 3], z=1)
    expected = tibble(
        a=rep([1, 2], each=3),
        x=1,
        y=rep([1, 2, 3], 2),
        z=1,
    ) >> group_by(f.a)
    assert_tibble_equal(actual, expected)

    # Grouped: the result length depends on the group's own value.
    actual = by_a >> summarise(x=seq_len(f.a), y=1)
    expected = tibble(a=c(1, 2, 2), x=c(1, 1, 2), y=1) >> group_by(f.a)
    # assert df1.equals(df2) is disabled in the original test.
    assert_tibble_equal(actual, expected)