def test_data_as_first_argument():
    """Check that verbs accept the dataframe as their first argument.

    For each verb, the result of calling it with the dataframe as the
    first positional argument is compared with the result of piping the
    dataframe into the verb with ``>>``.
    """
    def equals(df1, df2):
        # Thin wrapper so every comparison below reads the same way.
        return df1.equals(df2)

    df = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5], 'y': [0, 0, 1, 1, 2, 3]})

    # define and rename are given copies of df on both sides
    # (presumably because they may modify their input -- TODO confirm).
    assert equals(define(df.copy(), 'x*2'), df.copy() >> define('x*2'))
    assert equals(create(df, 'x*2'), df >> create('x*2'))
    # Only the lengths are compared for the sampling verbs.
    assert len(sample_n(df, 5)) == len(df >> sample_n(5))
    assert len(sample_frac(df, .3)) == len(df >> sample_frac(.3))
    assert equals(select(df, 'x'), df >> select('x'))
    assert equals(rename(df.copy(), z='x'), df.copy() >> rename(z='x'))
    assert equals(distinct(df), df >> distinct())
    assert equals(arrange(df, 'np.sin(x)'), df >> arrange('np.sin(x)'))
    assert equals(group_by(df, 'x'), df >> group_by('x'))
    assert equals(ungroup(group_by(df, 'x')), df >> group_by('x') >> ungroup())
    assert equals(summarize(df, 'sum(x)'), df >> summarize('sum(x)'))
    assert equals(query(df, 'x % 2'), df >> query('x % 2'))
    assert equals(tally(df, 'x'), df >> tally('x'))

    def xsum(gdf):
        return [gdf['x'].sum()]

    assert equals(do(group_by(df, 'y'), xsum=xsum),
                  df >> group_by('y') >> do(xsum=xsum))

    # The comparison with 4 must sit outside len(); with it inside,
    # len() measured the boolean comparison frame and the assertion
    # passed no matter how many rows head/tail returned.
    assert len(head(df, 4)) == 4
    assert len(tail(df, 4)) == 4
# Example #2
# 0
def summarize_fd_by_subject(df):
    """Summarize ``FramewiseDisplacement`` per group in long format.

    The pipeline groups ``df`` by ``subject_id``, ``condition``,
    ``data_id`` and ``headcase``, computes five summary columns from
    ``FramewiseDisplacement``, melts those columns into ``measure`` /
    ``val`` pairs, arranges the result by ``subject_id`` and finally
    resets the index with ``drop=True``.
    """

    def to_long(frame):
        # Wide -> long: every summary column becomes (measure, val) rows.
        return frame.melt(
            id_vars=["subject_id", "data_id", "condition", "headcase"],
            value_vars=[
                "fd_mean",
                "fd_median",
                "fd_mean_filter",
                "fd_median_filter",
                "perc_spikes",
            ],
            var_name="measure",
            value_name="val",
        )

    grouped = df >> p.group_by(
        "subject_id", "condition", "data_id", "headcase"
    )
    summarized = grouped >> p.summarize(
        fd_mean="mean(FramewiseDisplacement)",
        fd_median="median(FramewiseDisplacement)",
        fd_mean_filter="filter_mean(FramewiseDisplacement)",
        fd_median_filter="filter_median(FramewiseDisplacement)",
        perc_spikes="perc_high_motion(FramewiseDisplacement)",
    )
    long_form = summarized >> p.do(to_long)
    ordered = long_form >> p.arrange("subject_id")
    return ordered >> p.call(".reset_index", drop=True)
def test_do():
    """Exercise the ``do`` verb on grouped and ungrouped dataframes.

    Covers a function that returns a dataframe versus keyword functions
    that return scalars, grouping on object and categorical columns,
    ungrouped pass-through functions, groups whose rows are not
    adjacent, a dataframe with a duplicated index, string expressions,
    and the error branches.
    """
    df = pd.DataFrame({
        'x': [1, 2, 2, 3],
        'y': [2, 3, 4, 3],
        'z': list('aabb'),
        'w': pd.Categorical(list('aabb')),
    })

    def least_squares(gdf):
        # Fit y = m*x + c; the column of ones provides the intercept term.
        X = np.vstack([gdf.x, np.ones(len(gdf))]).T
        (m, c), _, _, _ = np.linalg.lstsq(X, gdf.y, None)
        return pd.DataFrame({'slope': [m], 'intercept': c})

    def slope(x, y):
        # Slope taken from the first two points only.
        return np.diff(y)[0] / np.diff(x)[0]

    def intercept(x, y):
        return y.values[0] - slope(x, y) * x.values[0]

    df1 = df >> group_by('z') >> do(least_squares)
    df2 = df >> group_by('z') >> do(
        slope=lambda gdf: slope(gdf.x, gdf.y),
        intercept=lambda gdf: intercept(gdf.x, gdf.y))

    df3 = df >> group_by('w') >> do(least_squares)
    df4 = df >> group_by('w') >> do(
        slope=lambda gdf: slope(gdf.x, gdf.y),
        intercept=lambda gdf: intercept(gdf.x, gdf.y))

    assert df1.plydata_groups == ['z']
    assert df2.plydata_groups == ['z']
    assert df1['z'].dtype == object
    assert df2['z'].dtype == object
    assert df3['w'].dtype == 'category'
    assert df4['w'].dtype == 'category'

    npt.assert_array_equal(df1['z'], df2['z'])
    npt.assert_array_almost_equal(df1['intercept'], df2['intercept'])
    npt.assert_array_almost_equal(df1['slope'], df2['slope'])

    # No groups (Test with pass-through functions)
    df1 = df >> do(lambda gdf: gdf)
    df2 = df >> do(x=lambda gdf: gdf.x,
                   y=lambda gdf: gdf.y,
                   z=lambda gdf: gdf.z,
                   w=lambda gdf: gdf.w)

    cols = list('xyzw')
    # Reduce the element-wise comparison explicitly. The builtin all()
    # applied to a DataFrame iterates over its column labels, which are
    # non-empty strings, so it is true regardless of the cell values.
    assert (df[cols] == df1[cols]).all().all()
    assert (df[cols] == df2[cols]).all().all()

    # Reordered data so that the groups are not all
    # bunched together
    df = pd.DataFrame(
        {
            'x': [2, 1, 2, 3],
            'y': [4, 2, 3, 3],
            'z': list('baab'),
            'w': pd.Categorical(list('baab')),
        },
        index=[3, 1, 0, 2]  # good index
    )

    dfi = pd.DataFrame(
        {
            'x': [2, 1, 2, 3],
            'y': [4, 2, 3, 3],
            'z': list('baab'),
            'w': pd.Categorical(list('baab')),
        },
        index=[3, 1, 0, 0]  # bad index
    )

    # Reuse group dataframe
    def sum_x(gdf):
        gdf['sum_x'] = gdf['x'].sum()
        return gdf

    # When the group dataframe is reused and the
    # index is good (no duplicates) the rows
    # in the result should not be reordered
    res = df >> group_by('z') >> do(sum_x)
    assert df['x'].equals(res['x'])
    assert all(res['sum_x'] == [5, 3, 3, 5])

    # Can use string evaluation
    res = df >> group_by('z') >> do(n='len(x)')
    assert all(res['z'] == ['b', 'a'])
    assert all(res['n'] == [2, 2])

    # bad index is handled correctly
    res = dfi >> group_by('z') >> do(sum_x)
    assert dfi.index.equals(res.index)
    assert dfi['x'].equals(res['x'])
    assert all(res['sum_x'] == [5, 3, 3, 5])

    # Branches
    with pytest.raises(ValueError):
        # args and kwargs
        df >> group_by('w') >> do(
            least_squares,
            slope=lambda gdf: slope(gdf.x, gdf.y),
            intercept=lambda gdf: intercept(gdf.x, gdf.y))

    with pytest.raises(TypeError):
        # a string expression passed positionally
        df >> group_by('w') >> do('len(x)')

    # Potentially problematic index
    def non_range_index_func(gdf):
        return pd.Series([11, 12, 13], index=[21, 22, 23])

    result = df >> do(r=non_range_index_func)
    assert all(result['r'] == [11, 12, 13])