Beispiel #1
0
def test_mutate_smart_nesting(dfs):
    # y and z both use x, so should create only 1 extra query
    lazy_tbl = dfs >> mutate(x=_.a + 1, y=_.x + 1, z=_.x + 1)

    query = lazy_tbl.last_op.fromclause

    assert query is lazy_tbl.ops[0]
    assert isinstance(query.fromclause, sqlalchemy.Table)
Beispiel #2
0
def test_mutate_using_agg_expr(backend):
    data = data_frame(x = range(1, 5), g = [1,1,2,2])
    dfs = backend.load_df(data)
    assert_equal_query(
            dfs,
            group_by(_.g) >> mutate(y = _.x - _.x.mean()),
            data.assign(y = [-.5, .5, -.5, .5])
            )
Beispiel #3
0
def test_raw_sql_mutate_refer_previous_raise_dberror(backend, df):
    # Note: unlikely will be able to support this case. Normally we analyze
    # the expression to know whether we need to create a subquery.
    with pytest.raises(sqlalchemy.exc.DatabaseError):
        assert_equal_query(
            df,
            group_by("x") >> mutate(z1=sql_raw("y + 1"), z2=sql_raw("z1 + 1")),
            DATA.assign(z1=lambda d: d.y + 1, z2=lambda d: d.z1 + 1))
Beispiel #4
0
def test_mutate_window_funcs(backend):
    data = data_frame(x = range(1, 5), g = [1,1,2,2])
    dfs = backend.load_df(data)
    assert_equal_query(
            dfs,
            group_by(_.g) >> mutate(row_num = row_number(_).astype(float)),
            data.assign(row_num = [1.0, 2, 1, 2])
            )
Beispiel #5
0
def test_mutate_using_cuml_agg(backend):
    data = data_frame(x=range(1, 5), g=[1, 1, 2, 2])
    dfs = backend.load_df(data)

    # cuml window without arrange before generates warning
    with pytest.warns(None):
        assert_equal_query(dfs,
                           group_by(_.g) >> mutate(y=_.x.cumsum()),
                           data.assign(y=[1.0, 3, 3, 7]))
Beispiel #6
0
def test_mutate_vector(backend, func, simple_data):
    if backend.name == 'sqlite':
        pytest.skip()

    df = backend.load_cached_df(simple_data)

    assert_equal_query(df,
                       mutate(y=func),
                       simple_data.assign(y=func),
                       check_dtype=False)

    # grouped
    assert_equal_query(
        df,
        group_by(_.g) >> mutate(y=func),
        simple_data.groupby('g').apply(lambda d: d.assign(y=func)).reset_index(
            drop=True),
        check_dtype=False)
Beispiel #7
0
def test_summarize_subquery_group_vars(backend, df):
    query = mutate(g2 = _.g.str.upper()) >> group_by(_.g2) >> summarize(low = _.x.min())
    assert_equal_query(
            df,
            query,
            data_frame(g2 = ['A', 'B'], low = [1, 3])
            )

    # check that is uses a subquery, since g2 is defined in first query
    text = str(query(df).last_op)
    assert text.count('FROM') == 2
Beispiel #8
0
def test_summarize_subquery_op_vars(backend, df):
    query = mutate(x2 = _.x + 1) >> group_by(_.g) >> summarize(low = _.x2.min())
    assert_equal_query(
            df,
            query,
            data_frame(g = ['a', 'b'], low = [2, 4])
            )

    # check that is uses a subquery, since x2 is defined in first query
    text = str(query(df).last_op)
    assert text.count('FROM') == 2
Beispiel #9
0
def test_hypothesis_mutate_vector_funcs(backend, data):
    if backend.name == 'sqlite':
        pytest.skip()

    df = backend.load_df(data)

    for func in OMNIBUS_VECTOR_FUNCS:
        assert_equal_query(df,
                           mutate(y=func),
                           data.assign(y=func),
                           check_dtype=False)
def plot_large_error_percentage(data, threshold):
    """
    calculate overall high error percentage based on 2 std devations from the mean 
        abs(resid_mean - resid) > 2 * resid.std()
        
    gather long form data of the columns, filter to only large errors and their percentages
    
    setup facet grid and pass in axes plotting function
    """
    resid_mean = data.final_residuals.mean()

    # Calculate distance from average and apply a threshold to indicate large errors
    data['high_error'] = (resid_mean - data.final_residuals).abs() > threshold

    # Percentage of large errors in the overall data set
    overall_large_error_percent = data.high_error.mean()

    # long form plotting data
    # Use siuba library to help with R/dplyr style aggregations
    data_melted = (data.loc[:, [
        'jobType', 'industry', 'degree', 'major', 'milesFromMetropolis',
        'yearsExperience', 'high_error'
    ]].melt(id_vars='high_error', var_name='column') >> group_by(
        'column', 'value', 'high_error') >> summarize(total=_.value_counts())
                   >> group_by('column', 'value') >>
                   mutate(percent=_.total / _.total.sum()) >> ungroup())

    # Filter for only large error percentages and apply a grouped sort to prepare for plotting
    data_melted = (
        data_melted.loc[data_melted.high_error, :].groupby('column').apply(
            lambda x: x.sort_values('percent', ascending=False)).reset_index(
                drop=True))

    plot_title = f"Percentage of large errors per category\nLarge error threshold = {round(threshold, 1)}"
    plotGrid = sns.FacetGrid(data_melted,
                             col='column',
                             sharey=False,
                             col_wrap=3,
                             height=4,
                             aspect=1.2)
    plotGrid.map(_large_error_axes_plot,
                 'percent',
                 'value',
                 error_threshold=overall_large_error_percent)
    plotGrid.set_xlabels('Percentage of large errors')
    plotGrid.set_ylabels('')
    plotGrid.add_legend(loc='upper center',
                        bbox_to_anchor=(0.5, 0),
                        fancybox=True,
                        frameon=True)
    plt.suptitle(plot_title, y=1.05)
    plt.show()
def test_frame_mutate(skip_backend, backend, entry):
    test_missing_implementation(entry, backend)

    # Prepare input data ------------------------------------------------------
    # case: inputs must be boolean
    crnt_data = get_data(entry, DATA, backend)
    df = backend.load_df(crnt_data)

    # Execute mutate ----------------------------------------------------------
    str_expr, call_expr = get_df_expr(entry)

    # Run test for equality w/ ungrouped pandas ----
    dst = crnt_data.assign(result=call_expr(crnt_data))
    dst['result'] = cast_result_type(entry, backend, dst['result'])

    assert_equal_query(df, mutate(result=call_expr), dst)

    # Run test for equality w/ grouped pandas ----
    g_dst = crnt_data.groupby('g').apply(
        lambda d: d.assign(result=call_expr)).reset_index(drop=True)
    g_dst['result'] = cast_result_type(entry, backend, g_dst['result'])
    assert_equal_query(df, group_by(_.g) >> mutate(result=call_expr), g_dst)
Beispiel #12
0
def test_mutate_smart_nesting(backend, dfs):
    # y and z both use x, so should create only 1 extra query
    lazy_tbl = dfs >> mutate(x = _.a + 1, y = _.x + 1, z = _.x + 1)

    # should have form
    # SELECT ..., x + 1 as y, x + 1 as z FROM (
    # SELECT ..., a + 1 as x FROM
    # <TABLENAME>) some_alias
    inner_alias = lazy_tbl.last_op.froms[0]
    inner_select = inner_alias.element
    orig_table = inner_select.froms[0]

    assert orig_table is lazy_tbl.tbl
def test_pandas_grouped_frame_fast_mutate(entry):
    from siuba.experimental.pd_groups.dialect import fast_mutate, DataFrameGroupBy
    gdf = get_data(entry, DATA).groupby('g')

    # Execute mutate ----------------------------------------------------------
    str_expr, call_expr = get_df_expr(entry)

    res = fast_mutate(gdf, result=call_expr)
    dst = mutate(gdf, result=call_expr)

    # TODO: apply mark to skip failing tests, rather than downcast
    # pandas grouped aggs, when not using cython, _try_cast back to original type
    # but since mutate uses apply, it doesn't :/. Currently only affects median func.
    dst_obj = dst.obj
    if str_expr == '_.x.median()':
        dst_obj['result'] = gdf._try_cast(dst_obj['result'], gdf.x.obj)

    assert isinstance(dst, DataFrameGroupBy)
    assert_frame_equal(res.obj, dst_obj)
Beispiel #14
0
DATA = data_frame(a=1, b=2, c=3)


@pytest.fixture(scope="module")
def dfs(backend):
    return backend.load_df(DATA)


@pytest.mark.parametrize("query, output", [
    (select(_.c), data_frame(c=3)),
    (select(_.b == _.c), data_frame(b=3)),
    (select(_["a":"c"]), data_frame(a=1, b=2, c=3)),
    (select(_[_.a:_.c]), data_frame(a=1, b=2, c=3)),
    (select(_.a, _.b) >> select(_.b), data_frame(b=2)),
    (mutate(a=_.b + _.c) >> select(_.a), data_frame(a=5)),
    pytest.param(group_by(_.a) >> select(_.b),
                 data_frame(b=2, a=1),
                 marks=pytest.mark.xfail),
])
def test_select_siu(dfs, query, output):
    assert_equal_query(dfs, query, output)


@pytest.mark.skip("TODO: #63")
def test_select_kwargs(dfs):
    assert_equal_query(dfs, select(x=_.a), data_frame(x=1))


# Rename ----------------------------------------------------------------------
Beispiel #15
0
def test_raw_sql_summarize(backend, df):
    assert_equal_query(
        df,
        summarize(z=sql_raw("SUM(y)")) >> mutate(z=_.z.astype(int)),
        data_frame(z=[3]))
Beispiel #16
0
def test_pandas_only_vector_funcs(func, data):
    res = mutate(data, y=func)
    dst = data.assign(y=func)

    assert_frame_equal(res, dst)
Beispiel #17
0
def test_summarize_after_mutate_cuml_win(backend, df_float):
    assert_equal_query(df_float,
                       mutate(y=_.x.cumsum()) >> summarize(z=_.y.max()),
                       data_frame(z=[10.]))

# -

top250 = data_top250()
game_goals = data_game_goals()
# top250 =     pd.read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-03/top_250.csv') \
#         .rename(columns = {'total_games': 'total_goals'})
# game_goals = pd.read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-03/game_goals.csv')

# +
# filter top 8 scorers

top8 = (
    top250 >>
    mutate(dense_rank=_.total_goals.rank(method="dense", ascending=False)) >>
    filter(_.dense_rank < 10)
    #     >> select(_.raw_rank, _.dense_rank, _.total_goals, _.player)
)

top8_games = game_goals >> inner_join(_, top8, "player")

# +
st.write("Goals by month")

st.write("Top 8 players not in our data")
top8 >> filter(_.yr_start < 1979)

# +
from pandas.tseries.offsets import MonthBegin
from siuba.experimental.pd_groups import fast_summarize
Beispiel #19
0
def test_mutate_reassign_all_cols_keeps_rowsize(dfs):
    assert_equal_query(
            dfs,
            mutate(a = 1, b = 2),
            data_frame(a = [1,1,1], b = [2,2,2])
            )
Beispiel #20
0
def test_raw_sql_mutate_grouped(backend, df):
    assert_equal_query(df,
                       group_by("x") >> mutate(z=sql_raw("y + 1")),
                       DATA.assign(z=lambda d: d.y + 1))
def test_distinct_of_mutate_col(df):
    query = mutate(z = _.x + 1) >>  distinct(_.z)

    assert_equal_query(df, query, pd.DataFrame({'z': [2,3,4,5,6]}))
Beispiel #22
0
def test_no_arrange_before_cuml_window_warning(backend):
    data = data_frame(x=range(1, 5), g=[1, 1, 2, 2])
    dfs = backend.load_df(data)
    with pytest.warns(RuntimeWarning):
        dfs >> mutate(y=_.x.cumsum())
Beispiel #23
0
    
from siuba import _, mutate, select, group_by, summarize, filter
from siuba.dply.vector import row_number

import pytest
from .helpers import assert_equal_query, data_frame, backend_notimpl, backend_sql
from string import ascii_lowercase 

DATA = data_frame(a = [1,2,3], b = [9,8,7])

@pytest.fixture(scope = "module")
def dfs(backend):
    return backend.load_df(DATA)

@pytest.mark.parametrize("query, output", [
    (mutate(x = _.a + _.b), DATA.assign(x = [10, 10, 10])),
    pytest.param( mutate(x = _.a + _.b) >> summarize(ttl = _.x.sum()), data_frame(ttl = 30.0), marks = pytest.mark.skip("TODO: failing sqlite?")),
    (mutate(x = _.a + 1, y = _.b - 1), DATA.assign(x = [2,3,4], y = [8,7,6])),
    (mutate(x = _.a + 1) >> mutate(y = _.b - 1), DATA.assign(x = [2,3,4], y = [8,7,6])),
    (mutate(x = _.a + 1, y = _.x + 1), DATA.assign(x = [2,3,4], y = [3,4,5]))
    ])
def test_mutate_basic(dfs, query, output):
    assert_equal_query(dfs, query, output)

@pytest.mark.parametrize("query, output", [
    (mutate(x = 1), DATA.assign(x = 1)),
    (mutate(x = "a"), DATA.assign(x = "a")),
    (mutate(x = 1.2), DATA.assign(x = 1.2))
    ])
def test_mutate_literal(dfs, query, output):
    assert_equal_query(dfs, query, output)
Beispiel #24
0
import pytest
from .helpers import assert_equal_query, data_frame, backend_notimpl, backend_sql
from string import ascii_lowercase

DATA = data_frame(a=[1, 2, 3], b=[9, 8, 7])


@pytest.fixture(scope="module")
def dfs(backend):
    return backend.load_df(DATA)


@pytest.mark.parametrize(
    "query, output",
    [(mutate(x=_.a + _.b), DATA.assign(x=[10, 10, 10])),
     pytest.param(mutate(x=_.a + _.b) >> summarize(ttl=_.x.sum()),
                  data_frame(ttl=30.0),
                  marks=pytest.mark.skip("TODO: failing sqlite?")),
     (mutate(x=_.a + 1, y=_.b - 1), DATA.assign(x=[2, 3, 4], y=[8, 7, 6])),
     (mutate(x=_.a + 1) >> mutate(y=_.b - 1),
      DATA.assign(x=[2, 3, 4], y=[8, 7, 6])),
     (mutate(x=_.a + 1, y=_.x + 1), DATA.assign(x=[2, 3, 4], y=[3, 4, 5]))])
def test_mutate_basic(dfs, query, output):
    assert_equal_query(dfs, query, output)


@pytest.mark.parametrize("query, output",
                         [(mutate(x=1), DATA.assign(x=1)),
                          (mutate(x="a"), DATA.assign(x="a")),
                          (mutate(x=1.2), DATA.assign(x=1.2))])
Beispiel #25
0
def test_select_mutate_filter(dfs):
    assert_equal_query(
        dfs,
        select(_.x == _.a) >> mutate(y=_.x * 2) >> filter(_.y == 2),
        data_frame(x=1, y=2))
Beispiel #26
0
def test_mutate_reassign_column_ordering(dfs):
    assert_equal_query(
            dfs,
            mutate(c = 3, a = 1),
            data_frame(a = [1,1,1], b = [9,8,7], c = [3,3,3])
            )
Beispiel #27
0
def test_mutate_reassign_column_ordering(dfs):
    assert_equal_query(dfs, mutate(c=3, a=1, b=2), data_frame(a=1, b=2, c=3))
Beispiel #28
0
    return [dst[k] for k in src] != list(src.values())


# -

air_methods = Airtable("app11UN7CECnqwlGY", "tblWsICYtRPlLYJak",
                       os.environ["AIRTABLE_API_KEY"])
air_backends = Airtable("app11UN7CECnqwlGY", "tblbmJgafdxiJLZz4",
                        os.environ["AIRTABLE_API_KEY"])

# +

raw_methods = air_methods.get_all()

tbl_methods = (pd.DataFrame(raw_methods) >> mutate(
    fields=_.fields.apply(pd.DataFrame, index=[0])) >> unnest("fields"))

# +
raw_spec = pd.json_normalize([{
    "method": k,
    **v
} for k, v in series.spec.items()])

spec_methods = (raw_spec.siu_select(
    -_.startswith('backend'), -_.expr_frame,
    -_.expr_series).rename(columns=lambda s: s.replace('.', '_')))

joined_methods = (spec_methods >> left_join(
    _, select(tbl_methods, _.id, _.method), ['method']))

methods = joined_methods >> pipe(_.to_dict(orient="records"))
Beispiel #29
0
def test_mutate_overwrites_prev(backend):
    # TODO: check that query doesn't generate a CTE
    dfs = backend.load_df(data_frame(x=range(1, 5), g=[1, 1, 2, 2]))
    assert_equal_query(dfs,
                       mutate(x=_.x + 1) >> mutate(x=_.x + 1),
                       data_frame(x=[3, 4, 5, 6], g=[1, 1, 2, 2]))
Beispiel #30
0
def test_raw_sql_mutate(backend, df):
    assert_equal_query(df, mutate(z=sql_raw("y + 1")),
                       DATA.assign(z=lambda d: d.y + 1))