def test_mutate_smart_nesting(dfs): # y and z both use x, so should create only 1 extra query lazy_tbl = dfs >> mutate(x=_.a + 1, y=_.x + 1, z=_.x + 1) query = lazy_tbl.last_op.fromclause assert query is lazy_tbl.ops[0] assert isinstance(query.fromclause, sqlalchemy.Table)
def test_mutate_using_agg_expr(backend): data = data_frame(x = range(1, 5), g = [1,1,2,2]) dfs = backend.load_df(data) assert_equal_query( dfs, group_by(_.g) >> mutate(y = _.x - _.x.mean()), data.assign(y = [-.5, .5, -.5, .5]) )
def test_raw_sql_mutate_refer_previous_raise_dberror(backend, df): # Note: unlikely will be able to support this case. Normally we analyze # the expression to know whether we need to create a subquery. with pytest.raises(sqlalchemy.exc.DatabaseError): assert_equal_query( df, group_by("x") >> mutate(z1=sql_raw("y + 1"), z2=sql_raw("z1 + 1")), DATA.assign(z1=lambda d: d.y + 1, z2=lambda d: d.z1 + 1))
def test_mutate_window_funcs(backend): data = data_frame(x = range(1, 5), g = [1,1,2,2]) dfs = backend.load_df(data) assert_equal_query( dfs, group_by(_.g) >> mutate(row_num = row_number(_).astype(float)), data.assign(row_num = [1.0, 2, 1, 2]) )
def test_mutate_using_cuml_agg(backend): data = data_frame(x=range(1, 5), g=[1, 1, 2, 2]) dfs = backend.load_df(data) # cuml window without arrange before generates warning with pytest.warns(None): assert_equal_query(dfs, group_by(_.g) >> mutate(y=_.x.cumsum()), data.assign(y=[1.0, 3, 3, 7]))
def test_mutate_vector(backend, func, simple_data): if backend.name == 'sqlite': pytest.skip() df = backend.load_cached_df(simple_data) assert_equal_query(df, mutate(y=func), simple_data.assign(y=func), check_dtype=False) # grouped assert_equal_query( df, group_by(_.g) >> mutate(y=func), simple_data.groupby('g').apply(lambda d: d.assign(y=func)).reset_index( drop=True), check_dtype=False)
def test_summarize_subquery_group_vars(backend, df): query = mutate(g2 = _.g.str.upper()) >> group_by(_.g2) >> summarize(low = _.x.min()) assert_equal_query( df, query, data_frame(g2 = ['A', 'B'], low = [1, 3]) ) # check that is uses a subquery, since g2 is defined in first query text = str(query(df).last_op) assert text.count('FROM') == 2
def test_summarize_subquery_op_vars(backend, df): query = mutate(x2 = _.x + 1) >> group_by(_.g) >> summarize(low = _.x2.min()) assert_equal_query( df, query, data_frame(g = ['a', 'b'], low = [2, 4]) ) # check that is uses a subquery, since x2 is defined in first query text = str(query(df).last_op) assert text.count('FROM') == 2
def test_hypothesis_mutate_vector_funcs(backend, data): if backend.name == 'sqlite': pytest.skip() df = backend.load_df(data) for func in OMNIBUS_VECTOR_FUNCS: assert_equal_query(df, mutate(y=func), data.assign(y=func), check_dtype=False)
def plot_large_error_percentage(data, threshold): """ calculate overall high error percentage based on 2 std devations from the mean abs(resid_mean - resid) > 2 * resid.std() gather long form data of the columns, filter to only large errors and their percentages setup facet grid and pass in axes plotting function """ resid_mean = data.final_residuals.mean() # Calculate distance from average and apply a threshold to indicate large errors data['high_error'] = (resid_mean - data.final_residuals).abs() > threshold # Percentage of large errors in the overall data set overall_large_error_percent = data.high_error.mean() # long form plotting data # Use siuba library to help with R/dplyr style aggregations data_melted = (data.loc[:, [ 'jobType', 'industry', 'degree', 'major', 'milesFromMetropolis', 'yearsExperience', 'high_error' ]].melt(id_vars='high_error', var_name='column') >> group_by( 'column', 'value', 'high_error') >> summarize(total=_.value_counts()) >> group_by('column', 'value') >> mutate(percent=_.total / _.total.sum()) >> ungroup()) # Filter for only large error percentages and apply a grouped sort to prepare for plotting data_melted = ( data_melted.loc[data_melted.high_error, :].groupby('column').apply( lambda x: x.sort_values('percent', ascending=False)).reset_index( drop=True)) plot_title = f"Percentage of large errors per category\nLarge error threshold = {round(threshold, 1)}" plotGrid = sns.FacetGrid(data_melted, col='column', sharey=False, col_wrap=3, height=4, aspect=1.2) plotGrid.map(_large_error_axes_plot, 'percent', 'value', error_threshold=overall_large_error_percent) plotGrid.set_xlabels('Percentage of large errors') plotGrid.set_ylabels('') plotGrid.add_legend(loc='upper center', bbox_to_anchor=(0.5, 0), fancybox=True, frameon=True) plt.suptitle(plot_title, y=1.05) plt.show()
def test_frame_mutate(skip_backend, backend, entry): test_missing_implementation(entry, backend) # Prepare input data ------------------------------------------------------ # case: inputs must be boolean crnt_data = get_data(entry, DATA, backend) df = backend.load_df(crnt_data) # Execute mutate ---------------------------------------------------------- str_expr, call_expr = get_df_expr(entry) # Run test for equality w/ ungrouped pandas ---- dst = crnt_data.assign(result=call_expr(crnt_data)) dst['result'] = cast_result_type(entry, backend, dst['result']) assert_equal_query(df, mutate(result=call_expr), dst) # Run test for equality w/ grouped pandas ---- g_dst = crnt_data.groupby('g').apply( lambda d: d.assign(result=call_expr)).reset_index(drop=True) g_dst['result'] = cast_result_type(entry, backend, g_dst['result']) assert_equal_query(df, group_by(_.g) >> mutate(result=call_expr), g_dst)
def test_mutate_smart_nesting(backend, dfs): # y and z both use x, so should create only 1 extra query lazy_tbl = dfs >> mutate(x = _.a + 1, y = _.x + 1, z = _.x + 1) # should have form # SELECT ..., x + 1 as y, x + 1 as z FROM ( # SELECT ..., a + 1 as x FROM # <TABLENAME>) some_alias inner_alias = lazy_tbl.last_op.froms[0] inner_select = inner_alias.element orig_table = inner_select.froms[0] assert orig_table is lazy_tbl.tbl
def test_pandas_grouped_frame_fast_mutate(entry): from siuba.experimental.pd_groups.dialect import fast_mutate, DataFrameGroupBy gdf = get_data(entry, DATA).groupby('g') # Execute mutate ---------------------------------------------------------- str_expr, call_expr = get_df_expr(entry) res = fast_mutate(gdf, result=call_expr) dst = mutate(gdf, result=call_expr) # TODO: apply mark to skip failing tests, rather than downcast # pandas grouped aggs, when not using cython, _try_cast back to original type # but since mutate uses apply, it doesn't :/. Currently only affects median func. dst_obj = dst.obj if str_expr == '_.x.median()': dst_obj['result'] = gdf._try_cast(dst_obj['result'], gdf.x.obj) assert isinstance(dst, DataFrameGroupBy) assert_frame_equal(res.obj, dst_obj)
DATA = data_frame(a=1, b=2, c=3) @pytest.fixture(scope="module") def dfs(backend): return backend.load_df(DATA) @pytest.mark.parametrize("query, output", [ (select(_.c), data_frame(c=3)), (select(_.b == _.c), data_frame(b=3)), (select(_["a":"c"]), data_frame(a=1, b=2, c=3)), (select(_[_.a:_.c]), data_frame(a=1, b=2, c=3)), (select(_.a, _.b) >> select(_.b), data_frame(b=2)), (mutate(a=_.b + _.c) >> select(_.a), data_frame(a=5)), pytest.param(group_by(_.a) >> select(_.b), data_frame(b=2, a=1), marks=pytest.mark.xfail), ]) def test_select_siu(dfs, query, output): assert_equal_query(dfs, query, output) @pytest.mark.skip("TODO: #63") def test_select_kwargs(dfs): assert_equal_query(dfs, select(x=_.a), data_frame(x=1)) # Rename ----------------------------------------------------------------------
def test_raw_sql_summarize(backend, df): assert_equal_query( df, summarize(z=sql_raw("SUM(y)")) >> mutate(z=_.z.astype(int)), data_frame(z=[3]))
def test_pandas_only_vector_funcs(func, data): res = mutate(data, y=func) dst = data.assign(y=func) assert_frame_equal(res, dst)
def test_summarize_after_mutate_cuml_win(backend, df_float): assert_equal_query(df_float, mutate(y=_.x.cumsum()) >> summarize(z=_.y.max()), data_frame(z=[10.]))
# - top250 = data_top250() game_goals = data_game_goals() # top250 = pd.read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-03/top_250.csv') \ # .rename(columns = {'total_games': 'total_goals'}) # game_goals = pd.read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-03/game_goals.csv') # + # filter top 8 scorers top8 = ( top250 >> mutate(dense_rank=_.total_goals.rank(method="dense", ascending=False)) >> filter(_.dense_rank < 10) # >> select(_.raw_rank, _.dense_rank, _.total_goals, _.player) ) top8_games = game_goals >> inner_join(_, top8, "player") # + st.write("Goals by month") st.write("Top 8 players not in our data") top8 >> filter(_.yr_start < 1979) # + from pandas.tseries.offsets import MonthBegin from siuba.experimental.pd_groups import fast_summarize
def test_mutate_reassign_all_cols_keeps_rowsize(dfs): assert_equal_query( dfs, mutate(a = 1, b = 2), data_frame(a = [1,1,1], b = [2,2,2]) )
def test_raw_sql_mutate_grouped(backend, df): assert_equal_query(df, group_by("x") >> mutate(z=sql_raw("y + 1")), DATA.assign(z=lambda d: d.y + 1))
def test_distinct_of_mutate_col(df): query = mutate(z = _.x + 1) >> distinct(_.z) assert_equal_query(df, query, pd.DataFrame({'z': [2,3,4,5,6]}))
def test_no_arrange_before_cuml_window_warning(backend): data = data_frame(x=range(1, 5), g=[1, 1, 2, 2]) dfs = backend.load_df(data) with pytest.warns(RuntimeWarning): dfs >> mutate(y=_.x.cumsum())
from siuba import _, mutate, select, group_by, summarize, filter from siuba.dply.vector import row_number import pytest from .helpers import assert_equal_query, data_frame, backend_notimpl, backend_sql from string import ascii_lowercase DATA = data_frame(a = [1,2,3], b = [9,8,7]) @pytest.fixture(scope = "module") def dfs(backend): return backend.load_df(DATA) @pytest.mark.parametrize("query, output", [ (mutate(x = _.a + _.b), DATA.assign(x = [10, 10, 10])), pytest.param( mutate(x = _.a + _.b) >> summarize(ttl = _.x.sum()), data_frame(ttl = 30.0), marks = pytest.mark.skip("TODO: failing sqlite?")), (mutate(x = _.a + 1, y = _.b - 1), DATA.assign(x = [2,3,4], y = [8,7,6])), (mutate(x = _.a + 1) >> mutate(y = _.b - 1), DATA.assign(x = [2,3,4], y = [8,7,6])), (mutate(x = _.a + 1, y = _.x + 1), DATA.assign(x = [2,3,4], y = [3,4,5])) ]) def test_mutate_basic(dfs, query, output): assert_equal_query(dfs, query, output) @pytest.mark.parametrize("query, output", [ (mutate(x = 1), DATA.assign(x = 1)), (mutate(x = "a"), DATA.assign(x = "a")), (mutate(x = 1.2), DATA.assign(x = 1.2)) ]) def test_mutate_literal(dfs, query, output): assert_equal_query(dfs, query, output)
import pytest from .helpers import assert_equal_query, data_frame, backend_notimpl, backend_sql from string import ascii_lowercase DATA = data_frame(a=[1, 2, 3], b=[9, 8, 7]) @pytest.fixture(scope="module") def dfs(backend): return backend.load_df(DATA) @pytest.mark.parametrize( "query, output", [(mutate(x=_.a + _.b), DATA.assign(x=[10, 10, 10])), pytest.param(mutate(x=_.a + _.b) >> summarize(ttl=_.x.sum()), data_frame(ttl=30.0), marks=pytest.mark.skip("TODO: failing sqlite?")), (mutate(x=_.a + 1, y=_.b - 1), DATA.assign(x=[2, 3, 4], y=[8, 7, 6])), (mutate(x=_.a + 1) >> mutate(y=_.b - 1), DATA.assign(x=[2, 3, 4], y=[8, 7, 6])), (mutate(x=_.a + 1, y=_.x + 1), DATA.assign(x=[2, 3, 4], y=[3, 4, 5]))]) def test_mutate_basic(dfs, query, output): assert_equal_query(dfs, query, output) @pytest.mark.parametrize("query, output", [(mutate(x=1), DATA.assign(x=1)), (mutate(x="a"), DATA.assign(x="a")), (mutate(x=1.2), DATA.assign(x=1.2))])
def test_select_mutate_filter(dfs): assert_equal_query( dfs, select(_.x == _.a) >> mutate(y=_.x * 2) >> filter(_.y == 2), data_frame(x=1, y=2))
def test_mutate_reassign_column_ordering(dfs): assert_equal_query( dfs, mutate(c = 3, a = 1), data_frame(a = [1,1,1], b = [9,8,7], c = [3,3,3]) )
def test_mutate_reassign_column_ordering(dfs): assert_equal_query(dfs, mutate(c=3, a=1, b=2), data_frame(a=1, b=2, c=3))
return [dst[k] for k in src] != list(src.values()) # - air_methods = Airtable("app11UN7CECnqwlGY", "tblWsICYtRPlLYJak", os.environ["AIRTABLE_API_KEY"]) air_backends = Airtable("app11UN7CECnqwlGY", "tblbmJgafdxiJLZz4", os.environ["AIRTABLE_API_KEY"]) # + raw_methods = air_methods.get_all() tbl_methods = (pd.DataFrame(raw_methods) >> mutate( fields=_.fields.apply(pd.DataFrame, index=[0])) >> unnest("fields")) # + raw_spec = pd.json_normalize([{ "method": k, **v } for k, v in series.spec.items()]) spec_methods = (raw_spec.siu_select( -_.startswith('backend'), -_.expr_frame, -_.expr_series).rename(columns=lambda s: s.replace('.', '_'))) joined_methods = (spec_methods >> left_join( _, select(tbl_methods, _.id, _.method), ['method'])) methods = joined_methods >> pipe(_.to_dict(orient="records"))
def test_mutate_overwrites_prev(backend): # TODO: check that query doesn't generate a CTE dfs = backend.load_df(data_frame(x=range(1, 5), g=[1, 1, 2, 2])) assert_equal_query(dfs, mutate(x=_.x + 1) >> mutate(x=_.x + 1), data_frame(x=[3, 4, 5, 6], g=[1, 1, 2, 2]))
def test_raw_sql_mutate(backend, df): assert_equal_query(df, mutate(z=sql_raw("y + 1")), DATA.assign(z=lambda d: d.y + 1))