Ejemplo n.º 1
0
def test_Q():
    """Exercise ``Q()`` with column names that are awkward in expressions.

    The frame has one column whose name contains a dot and one column
    named after a Python keyword. The test first pins the exceptions
    raised when such names are used without ``Q()``, then runs the
    expressions that are expected to evaluate without raising.
    """
    frame = pd.DataFrame({'var.name': [1, 2, 3], 'class': [1, 2, 3]})
    column_makers = (define, create)

    # The dotted name used bare is expected to raise NameError
    for verb in column_makers:
        with pytest.raises(NameError):
            frame >> verb(y='var.name')

    # The keyword used inside a larger expression is expected to raise
    # SyntaxError
    for verb in column_makers:
        with pytest.raises(SyntaxError):
            frame >> verb(y='class+1')

    with pytest.raises(SyntaxError):
        frame >> arrange('class+1')

    # None of the following is expected to raise
    for expression in ('Q("var.name")', 'Q("class")', 'class'):
        for verb in column_makers:
            frame >> verb(y=expression)

    for expression in ('class', 'Q("class")+1'):
        frame >> arrange(expression)
Ejemplo n.º 2
0
def test_data_as_first_argument():
    """Check that verbs accept the dataframe as first positional argument.

    Every verb is called directly with the data and the result is compared
    with the same verb applied through the ``>>`` operator.
    """
    def equals(df1, df2):
        # Thin wrapper so each comparison below reads as one expression
        return df1.equals(df2)

    df = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5], 'y': [0, 0, 1, 1, 2, 3]})

    # Copies are passed to define and rename so that the two sides of the
    # comparison never share the same input object
    assert equals(define(df.copy(), 'x*2'), df.copy() >> define('x*2'))
    assert equals(create(df, 'x*2'), df >> create('x*2'))
    # Sampling results are only compared by length
    assert len(sample_n(df, 5)) == len(df >> sample_n(5))
    assert len(sample_frac(df, .3)) == len(df >> sample_frac(.3))
    assert equals(select(df, 'x'), df >> select('x'))
    assert equals(rename(df.copy(), z='x'), df.copy() >> rename(z='x'))
    assert equals(distinct(df), df >> distinct())
    assert equals(arrange(df, 'np.sin(x)'), df >> arrange('np.sin(x)'))
    assert equals(group_by(df, 'x'), df >> group_by('x'))
    assert equals(ungroup(group_by(df, 'x')), df >> group_by('x') >> ungroup())
    assert equals(summarize(df, 'sum(x)'), df >> summarize('sum(x)'))
    assert equals(query(df, 'x % 2'), df >> query('x % 2'))
    assert equals(tally(df, 'x'), df >> tally('x'))

    def xsum(gdf):
        return [gdf['x'].sum()]

    assert equals(do(group_by(df, 'y'), xsum=xsum),
                  df >> group_by('y') >> do(xsum=xsum))

    # The comparison belongs outside ``len()``. With it inside, ``len`` was
    # applied to a boolean frame and the assert could not detect a wrong
    # number of rows.
    assert len(head(df, 4)) == 4
    assert len(tail(df, 4)) == 4
Ejemplo n.º 3
0
def test_arrange():
    """Check the row order produced by ``arrange`` through the result index.

    Covers a single sort key, two keys with one descending, a computed
    key, the no-argument call, re-sorting, and a frame whose index has
    repeated labels.
    """
    def has_index(frame, labels):
        # True when the frame's index equals the given labels, in order
        return frame.index.equals(pd.Index(labels))

    # Index                     0, 1, 2, 3, 4, 5
    data = pd.DataFrame({'x': [1, 5, 2, 2, 4, 0], 'y': [1, 2, 3, 4, 5, 6]})

    assert has_index(data >> arrange('x'), [5, 0, 2, 3, 4, 1])
    assert has_index(data >> arrange('x', '-y'), [5, 0, 3, 2, 4, 1])
    assert has_index(data >> arrange('np.sin(y)'), [4, 3, 5, 2, 0, 1])

    # Branches: with no sort keys the very same object comes back
    untouched = data >> arrange()
    assert untouched is data

    # Sorting by y after sorting by x restores the original order
    resorted = data >> arrange('x') >> arrange('y')
    assert resorted.index.equals(data.index)

    # Bad index: labels are repeated
    repeated = data.copy()
    repeated.index = [0, 1, 0, 1, 0, 1]
    assert has_index(repeated >> arrange('x'), [1, 0, 0, 1, 0, 1])
    assert has_index(repeated >> arrange('x', '-y'), [1, 0, 1, 0, 0, 1])
Ejemplo n.º 4
0
def summarize_fd_by_subject(df):
    """Summarize ``FramewiseDisplacement`` per group and return long-form rows.

    The data is grouped by subject_id, condition, data_id and headcase,
    five summary columns are computed per group, and the summary columns
    are melted into ``measure``/``val`` pairs. The result is sorted by
    subject_id and its index is reset.
    """
    # Output column name -> expression evaluated by ``p.summarize``.
    # The dict order is also the order of ``value_vars`` in the melt.
    fd_summaries = {
        "fd_mean": "mean(FramewiseDisplacement)",
        "fd_median": "median(FramewiseDisplacement)",
        "fd_mean_filter": "filter_mean(FramewiseDisplacement)",
        "fd_median_filter": "filter_median(FramewiseDisplacement)",
        "perc_spikes": "perc_high_motion(FramewiseDisplacement)",
    }
    id_columns = ["subject_id", "data_id", "condition", "headcase"]

    def to_long(summary):
        # One row per (id columns, measure) combination
        return summary.melt(
            id_vars=id_columns,
            value_vars=list(fd_summaries),
            var_name="measure",
            value_name="val",
        )

    grouped = df >> p.group_by("subject_id", "condition", "data_id", "headcase")
    summarized = grouped >> p.summarize(**fd_summaries)
    melted = summarized >> p.do(to_long)
    ordered = melted >> p.arrange("subject_id")
    return ordered >> p.call(".reset_index", drop=True)
Ejemplo n.º 5
0
def test_arrange():
    """Check the row order produced by ``arrange`` through column values.

    Covers a single sort key, two keys with one descending, a computed
    key, the no-argument call, re-sorting, ``reset_index=False``, a frame
    with repeated index labels and a frame whose index is not increasing.
    """
    def matches(column, expected):
        # True when every value of the column equals the expected one
        return all(column == expected)

    # Index                     0, 1, 2, 3, 4, 5
    data = pd.DataFrame({'x': [1, 5, 2, 2, 4, 0], 'y': [1, 2, 3, 4, 5, 6]})

    by_x = data >> arrange('x')
    assert matches(by_x.x, [0, 1, 2, 2, 4, 5])
    assert matches(by_x.y, [6, 1, 3, 4, 5, 2])

    by_x_then_y_desc = data >> arrange('x', '-y')
    assert matches(by_x_then_y_desc.x, [0, 1, 2, 2, 4, 5])
    assert matches(by_x_then_y_desc.y, [6, 1, 4, 3, 5, 2])

    by_sine = data >> arrange('np.sin(y)')
    assert matches(by_sine.x, [4, 2, 0, 2, 1, 5])
    assert matches(by_sine.y, [5, 4, 6, 3, 1, 2])

    # Branches: with no sort keys the very same object comes back
    untouched = data >> arrange()
    assert untouched is data

    # Sorting by y after sorting by x restores the original order
    resorted = data >> arrange('x') >> arrange('y')
    assert resorted.index.equals(data.index)

    # Do not reset index
    kept_index = data >> arrange('x', reset_index=False)
    assert kept_index.index.equals(pd.Index([5, 0, 2, 3, 4, 1]))

    # Bad index: labels are repeated
    repeated = data.copy()
    repeated.index = [0, 1, 0, 1, 0, 1]
    repeated_by_x = repeated >> arrange('x')
    assert matches(repeated_by_x.x, [0, 1, 2, 2, 4, 5])

    repeated_by_both = repeated >> arrange('x', '-y')
    assert matches(repeated_by_both.x, [0, 1, 2, 2, 4, 5])
    assert matches(repeated_by_both.y, [6, 1, 4, 3, 5, 2])

    # A computation on a non-increasing index
    shuffled = pd.DataFrame(
        {'x': [0, 1, 2, 2, 4, 5], 'y': [6, 1, 3, 4, 5, 2]},
        index=[5, 0, 2, 3, 4, 1],
    )
    by_y_desc = shuffled >> arrange('-y')
    assert matches(by_y_desc.y, [6, 5, 4, 3, 2, 1])
Ejemplo n.º 6
0
def summarize_mpars_by_subject(df):
    """Summarize six columns per group and return long-form rows.

    The data is grouped by subject_id, condition, data_id and headcase.
    For each of x, y, z, pitch, roll and yaw the mean, median and std are
    computed, and the 18 summary columns are melted into
    ``measure``/``val`` pairs. The result is sorted by subject_id and its
    index is reset.
    """
    xyz_columns = ("x", "y", "z")
    pry_columns = ("pitch", "roll", "yaw")
    stat_names = ("mean", "median", "std")

    # "<column>_<stat>" -> "<stat>(<column>)", column-major:
    # x_mean, x_median, x_std, y_mean, ...
    summaries = {
        "{}_{}".format(column, stat): "{}({})".format(stat, column)
        for column in xyz_columns + pry_columns
        for stat in stat_names
    }

    # The melt lists the summary columns in a different order: within each
    # column triple, all means first, then all medians, then all stds
    melt_order = [
        "{}_{}".format(column, stat)
        for triple in (xyz_columns, pry_columns)
        for stat in stat_names
        for column in triple
    ]

    grouped = df >> p.group_by("subject_id", "condition", "data_id", "headcase")
    summarized = grouped >> p.summarize(**summaries)
    melted = summarized >> p.call(
        ".melt",
        id_vars=["subject_id", "data_id", "condition", "headcase"],
        value_vars=melt_order,
        var_name="measure",
        value_name="val",
    )
    ordered = melted >> p.arrange("subject_id")
    return ordered >> p.call(".reset_index", drop=True)
Ejemplo n.º 7
0
def test_data_mutability():
    """Check which verbs modify their input, with and without the
    ``modify_input_data`` option switched on.
    """
    # These tests affirm that we know the consequences of the verbs.
    # A test in the Mutable section should not fail without a change
    # in implementation. That change should be triggered when Pandas
    # implements a consistent copy-on-write policy.
    #
    # When a test in the mutable section fails, it is bad news. There
    # should be no memory usage gains by reusing the original data,
    # except for the case of `rename`.
    df = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5], 'y': [0, 0, 1, 1, 2, 3]})

    # Default to not mutable
    df >> define(z='x**2')
    assert 'z' not in df

    df >> group_by(z='x**2')
    assert 'z' not in df

    arr = df >> pull('x')
    arr[0] = 99
    assert df.loc[0, 'x'] != 99

    df2 = df >> slice_rows(3)
    df2.loc[0, 'x'] = 999
    assert df.loc[0, 'x'] != 999

    set_option('modify_input_data', True)
    try:
        df2 = df.copy()
        df2 >> define(z='x**2')
        assert 'z' in df2

        df2 = df.copy()
        df2 >> group_by(z='x**2')
        assert 'z' in df2

        df2 = df.copy()
        arr = df2 >> pull('x')
        arr[0] = 99
        assert df2.loc[0, 'x'] == 99

        # Not mutable
        df2 = df.copy()
        df2 >> create(z='x**2')
        assert 'z' not in df2

        df2 >> sample_n(3) >> define(z='x**2')
        assert 'z' not in df2

        df2 >> sample_frac(.5) >> define(z='x**2')
        assert 'z' not in df2

        df2 >> select('x') >> define(z='x**2')
        assert 'z' not in df2

        df2 >> select('x', 'y') >> define(z='x**2')
        assert 'z' not in df2

        # dataframe.rename has copy-on-write (if copy=False) that affects
        # only the new frame. This creates possibility for "action at a
        # distance" effects on the new frame when the original is modified
        result = df2 >> rename(x='z')
        df2['y'] = 3
        result['x'] = 4
        assert 'z' not in df2
        assert df2.loc[0, 'y'] != 4
        assert result.loc[0, 'x'] != 3
        assert result is df2

        df2 >> arrange('x') >> define(z='x**2')
        assert 'z' not in df2

        df2 >> query('x%2') >> define(z='x**2')
        assert 'z' not in df2

        df2 >> group_indices(z='x%2')
        assert 'z' not in df2
    finally:
        # Switch the option back even when an assertion above fails, so
        # the setting does not leak into tests that run afterwards
        set_option('modify_input_data', False)
Ejemplo n.º 8
0
 def test_arrange(self):
     """Assert that ``define`` followed by ``arrange`` on ``self.df``
     yields a ``GroupedDataFrame``.
     """
     with_sine = self.df >> define(z='np.sin(x)')
     ordered = with_sine >> arrange('z')
     assert isinstance(ordered, GroupedDataFrame)