def test_frame_summarize(skip_backend, backend, agg_entry):
    """Parametrized check of a single aggregate expression in summarize().

    Runs the aggregation both ungrouped and grouped by 'g', comparing the
    backend's result against the same computation done directly in pandas.
    """
    entry = agg_entry
    test_missing_implementation(entry, backend)

    # Prepare input data ------------------------------------------------------
    # case: inputs must be boolean
    crnt_data = get_data(entry, DATA, backend)
    df = backend.load_df(crnt_data)

    # Execute mutate ----------------------------------------------------------
    # (call_expr applied to the raw data gives the expected ungrouped result)
    str_expr, call_expr = get_df_expr(entry)
    dst = data_frame(result=call_expr(crnt_data))

    # Process output ----------------------------------------------------------
    # case: output is of a different type than w/ pandas
    dst['result'] = cast_result_type(entry, backend, dst['result'])

    # Run test for equality w/ pandas ----
    # otherwise, verify returns same result as mutate
    assert_equal_query(df, summarize(result=call_expr), dst)

    # grouped variant: expected frame built with a plain pandas groupby-apply,
    # renaming the unnamed result column 0 to 'result'
    dst_g = crnt_data.groupby('g').apply(call_expr).reset_index().rename(
        columns={0: 'result'})

    assert_equal_query(df, group_by(_.g) >> summarize(result=call_expr), dst_g)
def test_summarize_removes_1_grouping(backend):
    """summarize() should peel off the innermost grouping level each call."""
    data = data_frame(a=1, b=2, c=3)
    df = backend.load_df(data)

    q1 = df >> group_by(_.a, _.b) >> summarize(n=n(_))

    # BUG FIX: `("a")` is just the string "a", not a tuple — parentheses alone
    # do not make a tuple. group_by holds a tuple of column names, so after
    # summarize drops the innermost level ("b"), one group ("a",) remains.
    assert q1.group_by == ("a",)

    # a second summarize removes the last grouping level entirely
    q2 = q1 >> summarize(n=n(_))
    assert not len(q2.group_by)
def test_agg_vector(backend, func, simple_data):
    """Vector aggregate `func` should match pandas, ungrouped and grouped."""
    # these aggregates are skipped on the sqlite backend
    if backend.name == 'sqlite':
        pytest.skip()

    lazy_df = backend.load_cached_df(simple_data)

    # ungrouped: expected value comes from applying func to the raw frame
    expected = data_frame(y=func(simple_data))
    assert_equal_query(lazy_df, summarize(y=func), expected)

    # grouped: expected frame built via a plain pandas groupby-apply, with the
    # unnamed result column 0 renamed to 'y'
    expected_grouped = (
        simple_data.groupby('g')
        .apply(func)
        .reset_index()
        .rename(columns={0: 'y'})
    )
    assert_equal_query(lazy_df, group_by(_.g) >> summarize(y=func), expected_grouped)
def test_summarize_removes_series_index():
    """Summarize result should not carry a leftover pandas Series index."""
    # Note: currently wouldn't work in postgresql, since _.x + _.y not an agg func
    df = data_frame(g=['a', 'b', 'c'], x=[1, 2, 3], y=[4, 5, 6])

    query = group_by(_.g) >> summarize(res=_.x + _.y)
    expected = df.assign(res=df.x + df.y).drop(columns=["x", "y"])

    assert_equal_query(df, query, expected)
def test_frame_set_aggregates_postgresql():
    """Grouped quantile aggregation against a postgresql backend."""
    # TODO: probably shouldn't be creating backend here
    backend = SqlBackend("postgresql")
    lazy_df = backend.load_df(data[None])

    # .75 quantile of x within each group g
    agg_expr = _.x.quantile(.75)
    expected = data_frame(g=['a', 'b'], result=[11., 13.])

    assert_equal_query(lazy_df, group_by(_.g) >> summarize(result=agg_expr), expected)
def test_summarize_subquery_op_vars(backend, df):
    """Aggregating a column created by an earlier mutate must nest a subquery."""
    pipeline = mutate(x2 = _.x + 1) >> group_by(_.g) >> summarize(low = _.x2.min())
    expected = data_frame(g = ['a', 'b'], low = [2, 4])

    assert_equal_query(df, pipeline, expected)

    # x2 is defined in the first step, so the generated SQL should contain two
    # FROM clauses (outer query + subquery)
    sql_text = str(pipeline(df).last_op)
    assert sql_text.count('FROM') == 2
def test_summarize_subquery_group_vars(backend, df):
    """Grouping on a column created by an earlier mutate must nest a subquery."""
    pipeline = mutate(g2 = _.g.str.upper()) >> group_by(_.g2) >> summarize(low = _.x.min())
    expected = data_frame(g2 = ['A', 'B'], low = [1, 3])

    assert_equal_query(df, pipeline, expected)

    # g2 is defined in the first step, so the generated SQL should contain two
    # FROM clauses (outer query + subquery)
    sql_text = str(pipeline(df).last_op)
    assert sql_text.count('FROM') == 2
def plot_large_error_percentage(data, threshold):
    """Plot, per category value, the share of rows counted as "large" errors.

    A row is a large error when its distance from the mean residual exceeds
    ``threshold``. The selected columns are melted to long form, per-value
    large-error percentages are computed (siuba, R/dplyr style), and the
    result is drawn on a seaborn FacetGrid — one panel per column — with the
    overall large-error rate passed to the per-axes plotting helper.

    NOTE(review): mutates ``data`` in place by adding a 'high_error' column.
    """
    resid_mean = data.final_residuals.mean()

    # Calculate distance from average and apply a threshold to indicate large errors
    data['high_error'] = (resid_mean - data.final_residuals).abs() > threshold

    # Percentage of large errors in the overall data set (forwarded below as
    # error_threshold to the axes-plotting function)
    overall_large_error_percent = data.high_error.mean()

    # long form plotting data
    # Use siuba library to help with R/dplyr style aggregations:
    # count rows per (column, value, high_error), then turn counts into
    # within-(column, value) percentages
    data_melted = (data.loc[:, [
        'jobType', 'industry', 'degree', 'major', 'milesFromMetropolis',
        'yearsExperience', 'high_error'
    ]].melt(id_vars='high_error', var_name='column') >> group_by(
        'column', 'value', 'high_error') >> summarize(total=_.value_counts())
                   >> group_by('column', 'value') >>
                   mutate(percent=_.total / _.total.sum()) >> ungroup())

    # Filter for only large error percentages and apply a grouped sort to prepare for plotting
    data_melted = (
        data_melted.loc[data_melted.high_error, :].groupby('column').apply(
            lambda x: x.sort_values('percent', ascending=False)).reset_index(
                drop=True))

    plot_title = f"Percentage of large errors per category\nLarge error threshold = {round(threshold, 1)}"

    # one facet per melted source column; y-axes are independent
    plotGrid = sns.FacetGrid(data_melted,
                             col='column',
                             sharey=False,
                             col_wrap=3,
                             height=4,
                             aspect=1.2)
    plotGrid.map(_large_error_axes_plot,
                 'percent',
                 'value',
                 error_threshold=overall_large_error_percent)
    plotGrid.set_xlabels('Percentage of large errors')
    plotGrid.set_ylabels('')
    plotGrid.add_legend(loc='upper center',
                        bbox_to_anchor=(0.5, 0),
                        fancybox=True,
                        frameon=True)
    plt.suptitle(plot_title, y=1.05)
    plt.show()
def test_pandas_grouped_frame_fast_summarize(agg_entry):
    """fast_summarize on a pandas GroupBy must match the generic summarize."""
    from siuba.experimental.pd_groups.dialect import fast_summarize, DataFrameGroupBy

    gdf = get_data(agg_entry, DATA).groupby('g')

    # Execute summarize ----------------------------------------------------------
    str_expr, call_expr = get_df_expr(agg_entry)

    # fast path (res) vs the generic apply-based summarize (dst)
    res = fast_summarize(gdf, result=call_expr)
    dst = summarize(gdf, result=call_expr)

    # TODO: apply mark to skip failing tests, rather than downcast
    # pandas grouped aggs, when not using cython, _try_cast back to original type
    # but since summarize uses apply, it doesn't :/. Currently only affects median func.
    if str_expr == '_.x.median()':
        dst['result'] = gdf._try_cast(dst['result'], gdf.x.obj)

    assert_frame_equal(res, dst)
def test_ungrouped_summarize_literal(df):
    """summarize() with a literal value should yield a one-row frame.

    BUG FIX: dropped the unused ``query``/``output`` parameters — without a
    parametrize decorator pytest would treat them as fixtures and fail at
    collection with "fixture 'query' not found".
    """
    assert_equal_query(df, summarize(y=1), data_frame(y=1))
def test_frame_mode_returns_many():
    """mode() can return multiple values, which summarize must reject."""
    frame = data_frame(x=[1, 2, 3])
    with pytest.raises(ValueError):
        summarize(frame, result=_.x.mode())
def df(backend):
    # NOTE(review): the decorator for this def is outside the visible chunk —
    # presumably a module-scoped pytest fixture like the ones below; confirm.
    return backend.load_df(DATA)

@pytest.fixture(scope="module")
def df_float(backend):
    # same data as df, but with x converted to float
    return backend.load_df(DATA.assign(x=lambda d: d.x.astype(float)))

@pytest.fixture(scope="module")
def gdf(df):
    # df pre-grouped by g
    return df >> group_by(_.g)

@pytest.mark.parametrize("query, output", [
    (summarize(y=n(_)), data_frame(y=4)),
    (summarize(y=_.x.min()), data_frame(y=1)),
])
def test_summarize_ungrouped(df, query, output):
    # each ungrouped summarize collapses the table; result compared to pandas
    assert_equal_query(df, query, output)

@pytest.mark.skip("TODO: should return 1 row (#63)")
def test_ungrouped_summarize_literal(df, query, output):
    # NOTE(review): query/output params are unused and not parametrized here;
    # harmless only because the test is skipped.
    assert_equal_query(df, summarize(y=1), data_frame(y=1))

@backend_notimpl("sqlite")
def test_summarize_after_mutate_cuml_win(backend, df_float):
    # cumulative window fn created in mutate, then aggregated in summarize
    # NOTE(review): this call is truncated in the visible chunk
    assert_equal_query(df_float,
        mutate(y=_.x.cumsum()) >> summarize(z=_.y.max()),
def test_summarize_removes_order_vars(backend, df):
    """Any ordering carried by the lazy table should be dropped by summarize."""
    result = df >> summarize(n=n(_))
    assert len(result.order_by) == 0
from siuba import _, mutate, select, group_by, summarize, filter
from siuba.dply.vector import row_number
import pytest

from .helpers import assert_equal_query, data_frame, backend_notimpl, backend_sql
from string import ascii_lowercase

# shared input frame for the mutate tests in this module
DATA = data_frame(a = [1,2,3], b = [9,8,7])

@pytest.fixture(scope = "module")
def dfs(backend):
    # DATA loaded into whichever backend the module is parametrized with
    return backend.load_df(DATA)

@pytest.mark.parametrize("query, output", [
    (mutate(x = _.a + _.b), DATA.assign(x = [10, 10, 10])),
    pytest.param(
        mutate(x = _.a + _.b) >> summarize(ttl = _.x.sum()),
        data_frame(ttl = 30.0),
        marks = pytest.mark.skip("TODO: failing sqlite?")),
    (mutate(x = _.a + 1, y = _.b - 1), DATA.assign(x = [2,3,4], y = [8,7,6])),
    (mutate(x = _.a + 1) >> mutate(y = _.b - 1), DATA.assign(x = [2,3,4], y = [8,7,6])),
    (mutate(x = _.a + 1, y = _.x + 1), DATA.assign(x = [2,3,4], y = [3,4,5]))
])
def test_mutate_basic(dfs, query, output):
    # each query runs against the backend and is compared to the pandas result
    assert_equal_query(dfs, query, output)

@pytest.mark.parametrize("query, output", [
    (mutate(x = 1), DATA.assign(x = 1)),
    (mutate(x = "a"), DATA.assign(x = "a")),
    (mutate(x = 1.2), DATA.assign(x = 1.2))
])
def test_mutate_literal(dfs, query, output):
    # literal (non-expression) values should broadcast to every row
    assert_equal_query(dfs, query, output)
def test_distinct_after_summarize(df):
    """distinct() should work on a column produced by a grouped summarize."""
    pipeline = group_by(g = _.x) >> summarize(z = (_.y - _.y).min()) >> distinct(_.z)
    expected = pd.DataFrame({'z': [0]})

    assert_equal_query(df, pipeline, expected)
def test_summarize_after_mutate_cuml_win(backend, df_float):
    """Aggregating a cumulative-window column created by mutate."""
    query = mutate(y=_.x.cumsum()) >> summarize(z=_.y.max())
    expected = data_frame(z=[10.])

    assert_equal_query(df_float, query, expected)
def test_summarize_keeps_group_vars(backend, gdf):
    """Grouping columns should remain in the summarize output, before results."""
    result = gdf >> summarize(n=n(_))
    column_names = list(result.last_op.c.keys())
    assert column_names == ["g", "n"]
def test_summarize_validates_length():
    """An elementwise (length > 1) result must be rejected by summarize."""
    frame = data_frame(x=[1, 2])
    with pytest.raises(ValueError):
        summarize(frame, res=_.x + 1)
def test_summarize_no_same_call_var_refs(backend, df):
    """z refers to y, defined in the same summarize call — not supported."""
    with pytest.raises(NotImplementedError):
        df >> summarize(y=_.x.min(), z=_.y + 1)
def test_frame_mode_returns_many():
    # related to length validation above: mode() here yields several values,
    # so summarize should raise rather than return a multi-row result
    with pytest.raises(ValueError):
        summarize(data_frame(x=[1, 2, 3]), result=_.x.mode())
def test_summarize_unnamed_args(df):
    """An unnamed argument gets labeled with its string representation."""
    expected = pd.DataFrame({'n(_)': 4})
    assert_equal_query(df, summarize(n(_)), expected)
def test_group_by_performs_mutate(df):
    """group_by with a kwarg expression should create the column like mutate."""
    query = group_by(z=_.x + _.y) >> summarize(n=n(_))
    expected = data_frame(z=10, n=3)

    assert_equal_query(df, query, expected)
import pytest

from .helpers import assert_equal_query, data_frame, backend_notimpl, backend_sql
from string import ascii_lowercase

# shared input frame for the mutate tests below
# NOTE(review): mutate/summarize/_ are referenced below but not imported in
# the visible chunk; presumably imported earlier in the file — confirm.
DATA = data_frame(a=[1, 2, 3], b=[9, 8, 7])


@pytest.fixture(scope="module")
def dfs(backend):
    # DATA loaded into whichever backend the module is parametrized with
    return backend.load_df(DATA)


@pytest.mark.parametrize(
    "query, output",
    [(mutate(x=_.a + _.b), DATA.assign(x=[10, 10, 10])),
     pytest.param(mutate(x=_.a + _.b) >> summarize(ttl=_.x.sum()),
                  data_frame(ttl=30.0),
                  marks=pytest.mark.skip("TODO: failing sqlite?")),
     (mutate(x=_.a + 1, y=_.b - 1), DATA.assign(x=[2, 3, 4], y=[8, 7, 6])),
     (mutate(x=_.a + 1) >> mutate(y=_.b - 1),
      DATA.assign(x=[2, 3, 4], y=[8, 7, 6])),
     (mutate(x=_.a + 1, y=_.x + 1), DATA.assign(x=[2, 3, 4], y=[3, 4, 5]))])
def test_mutate_basic(dfs, query, output):
    # each query runs against the backend and is compared to the pandas result
    assert_equal_query(dfs, query, output)


@pytest.mark.parametrize("query, output", [(mutate(x=1), DATA.assign(x=1)),
                                           (mutate(x="a"), DATA.assign(x="a")),
                                           (mutate(x=1.2), DATA.assign(x=1.2))])
def test_mutate_literal(dfs, query, output):
    # NOTE(review): the body of this test lies beyond the visible chunk
def test_raw_sql_summarize(backend, df):
    """A raw SQL fragment should be usable as a summarize aggregate."""
    # cast z to int so the result compares cleanly against the integer frame
    query = summarize(z=sql_raw("SUM(y)")) >> mutate(z=_.z.astype(int))
    expected = data_frame(z=[3])

    assert_equal_query(df, query, expected)