def test_udaf_window():
    """Apply a reduction UDF over a grouped trailing window on the pandas backend.

    The windowed result is compared against a pandas group-wise rolling
    mean computed directly on the same frame.
    """

    @udf.reduction([dt.double], dt.double)
    def my_mean(series):
        return series.mean()

    # Each numeric column is four fixed values followed by three random ones.
    fixed_part = np.arange(4, dtype=float).tolist()
    column_a = fixed_part + np.random.rand(3).tolist()
    column_b = fixed_part + np.random.rand(3).tolist()
    df = pd.DataFrame({'a': column_a, 'b': column_b, 'key': list('ddeefff')})

    table = ibis.pandas.connect({'df': df}).table('df')
    trailing = ibis.trailing_window(2, order_by='a', group_by='key')
    rolled_expr = my_mean(table.b).over(trailing)
    result = table.mutate(rolled=rolled_expr).execute().sort_values(['key', 'a'])

    def rolling_mean_per_key(frame):
        # Drop the group level so the index lines up with the source frame.
        per_key = frame.groupby('key').b.rolling(2).mean()
        return per_key.reset_index(level=0, drop=True)

    expected = df.sort_values(['key', 'a']).assign(rolled=rolling_mean_per_key)
    tm.assert_frame_equal(result, expected)
def test_array_collect_rolling_partitioned(t, df):
    """Collect ``plain_float64`` over a trailing window ordered by ``plain_int64``.

    The projected result is compared with a hand-written frame whose
    columns are arranged in the expression's own column order.
    """
    trailing = ibis.trailing_window(2, order_by=t.plain_int64)
    collected = t.plain_float64.collect().over(trailing).name('collected')
    expr = t['dup_strings', 'plain_int64', collected]
    result = expr.execute()

    expected_data = {
        'dup_strings': ['d', 'a', 'd'],
        'plain_int64': [1, 2, 3],
        'collected': [[4.0], [4.0, 5.0], [5.0, 6.0]],
    }
    expected = pd.DataFrame(expected_data)[expr.columns]
    tm.assert_frame_equal(result, expected)
def test_interactive_repr_call_failure(self):
    """Smoke test: ``repr`` of a windowed aggregate in interactive mode.

    There is no assertion; the test passes as long as building the
    expression and calling ``repr`` on it does not raise.
    """
    lineitem = self.con.table("tpch_lineitem").limit(100000)
    receipt_ts = lineitem.l_receiptdate.cast("timestamp").name("date")
    t = lineitem[lineitem, receipt_ts]

    group_keys = [t.date.year().name("year"), "l_linestatus"]
    only_status_f = t.l_linestatus.isin(["F"])
    avg_price = t.l_extendedprice.mean().name("avg_px")
    expr = t[only_status_f].group_by(group_keys).aggregate(avg_price)

    trailing = ibis.trailing_window(
        9, group_by=expr.l_linestatus, order_by=expr.year
    )
    rolling_avg = expr["avg_px"].mean().over(trailing)
    enriched = expr[expr, rolling_avg]

    with config.option_context("interactive", True):
        repr(enriched)
def test_window_frame_specs(self):
    """Check the frame clause compiled for a range of window specifications.

    Each case pairs a window object with the frame clause expected in the
    generated SQL.  The clause is written in lower case here and is
    upper-cased before being substituted into the query template.
    """
    t = self.con.table('alltypes')

    # NOTE(review): the line breaks inside this literal were reconstructed
    # from a whitespace-collapsed source -- confirm against the compiler's
    # actual output layout.
    ex_template = """\
SELECT sum(d) OVER (ORDER BY f {0}) AS `foo`
FROM alltypes"""

    # The original table listed the ``window(preceding=5)`` case twice with
    # the same expectation; the redundant second entry has been dropped.
    cases = [
        (
            window(preceding=0),
            'range between current row and unbounded following',
        ),
        (
            window(following=0),
            'range between unbounded preceding and current row',
        ),
        (
            window(preceding=5),
            'rows between 5 preceding and unbounded following',
        ),
        (
            window(preceding=5, following=0),
            'rows between 5 preceding and current row',
        ),
        (
            window(preceding=5, following=2),
            'rows between 5 preceding and 2 following',
        ),
        (
            window(following=2),
            'rows between unbounded preceding and 2 following',
        ),
        (
            window(following=2, preceding=0),
            'rows between current row and 2 following',
        ),
        (
            window(following=[5, 10]),
            'rows between 5 following and 10 following',
        ),
        (
            window(preceding=[10, 5]),
            'rows between 10 preceding and 5 preceding',
        ),
        # cumulative windows
        (
            ibis.cumulative_window(),
            'range between unbounded preceding and current row',
        ),
        # trailing windows
        (
            ibis.trailing_window(10),
            'rows between 10 preceding and current row',
        ),
    ]

    for w, frame in cases:
        # Every case is ordered by the same column so only the frame varies.
        w2 = w.order_by(t.f)
        expr = t.projection([t.d.sum().over(w2).name('foo')])
        expected = ex_template.format(frame.upper())
        self._check_sql(expr, expected)
def test_interactive_repr_call_failure(self):
    """Exercise ``repr`` on a windowed aggregate while interactive mode is on.

    Nothing is asserted: the test succeeds if constructing the expression
    and taking its ``repr`` completes without an exception.
    """
    base = self.con.table('tpch_lineitem').limit(100000)
    t = base[base, base.l_receiptdate.cast('timestamp').name('date')]

    year = t.date.year().name('year')
    status_is_f = t.l_linestatus.isin(['F'])
    grouped = t[status_is_f].group_by([year, 'l_linestatus'])
    expr = grouped.aggregate(t.l_extendedprice.mean().name('avg_px'))

    w2 = ibis.trailing_window(
        9,
        group_by=expr.l_linestatus,
        order_by=expr.year,
    )
    windowed_mean = expr['avg_px'].mean().over(w2)
    enriched = expr[expr, windowed_mean]

    with config.option_context('interactive', True):
        repr(enriched)
def test_batting_rolling(batting, batting_df, sort_kind):
    """Compare a trailing-window sum of ``G`` with a pandas rolling sum.

    The ibis side orders the window by ``yearID``; the pandas side sorts by
    ``yearID`` using ``sort_kind`` before rolling.
    """

    def trailing_games(t):
        return t.G.sum().over(ibis.trailing_window(5, order_by=t.yearID))

    result = batting.mutate(more_values=trailing_games).execute()

    by_year = batting_df[['G', 'yearID']].sort_values('yearID', kind=sort_kind)
    rolled = by_year.G.rolling(5).sum()
    expected = batting_df.assign(more_values=rolled)

    # Compare only the columns present in the expected frame, in its order.
    tm.assert_frame_equal(result[expected.columns], expected)
def test_window_with_preceding_expr():
    """Use an interval expression as the ``preceding`` bound of a trailing window.

    The mean over a window of three one-day intervals ordered by ``time`` is
    compared with a pandas ``'3d'`` rolling mean computed with
    ``closed='both'``.
    """
    timestamps = pd.date_range('20180101', '20180110')
    first_value = 2
    values = np.arange(first_value, first_value + len(timestamps))
    df = pd.DataFrame({'value': values, 'time': timestamps}, index=timestamps)
    t = ibis.pandas.connect({'df': df}).table('df')

    # pandas reference result, re-indexed positionally and left unnamed.
    rolling = df.set_index('time').value.rolling('3d', closed='both')
    expected = rolling.mean().reset_index(drop=True)
    expected.index.name = None

    three_days = 3 * ibis.interval(days=1)
    trailing = ibis.trailing_window(three_days, order_by=t.time)
    result = t.value.mean().over(trailing).execute()

    tm.assert_series_equal(result, expected)
def test_batting_rolling_partitioned(batting, batting_df, sort_kind):
    """Compare a per-player trailing-window sum of ``G`` with pandas.

    Both sides are indexed by ``(playerID, yearID)`` and sorted before the
    ``rolled`` series are compared.
    """
    player_col, year_col = 'playerID', 'yearID'

    trailing = ibis.trailing_window(
        3, order_by=batting[year_col], group_by=batting[player_col]
    )
    rolled_expr = batting.G.sum().over(trailing)
    result = batting.mutate(rolled=rolled_expr).execute()

    per_player = (
        batting_df[[player_col, year_col, 'G']]
        .set_index(year_col)
        .groupby(player_col)
    )
    expected = per_player.G.rolling(3).sum().rename('rolled')

    actual = result.set_index([player_col, year_col]).sort_index().rolled
    tm.assert_series_equal(actual, expected.sort_index())
( window(following=[5, 10]), 'rows between 5 following and 10 following', ), ( window(preceding=[10, 5]), 'rows between 10 preceding and 5 preceding', ), # # cumulative windows ( ibis.cumulative_window(), 'range between unbounded preceding and current row', ), # # trailing windows ( ibis.trailing_window(10), 'rows between 10 preceding and current row', ), ], ) def test_window_frame_specs(con, window, frame): t = con.table('alltypes') ex_template = """\ SELECT sum(`d`) OVER (ORDER BY `f` {0}) AS `foo` FROM ibis_testing.`alltypes`""" w2 = window.order_by(t.f) expr = t.projection([t.d.sum().over(w2).name('foo')]) expected = ex_template.format(frame.upper()) assert_sql_equal(expr, expected)
@pytest.mark.parametrize( 'window_fn', [ param( lambda t: ibis.window( preceding=2, following=0, group_by=[t.string_col], order_by=[t.id], ), id='preceding-2-following-0', ), param( lambda t: ibis.trailing_window( preceding=2, group_by=[t.string_col], order_by=[t.id] ), id='trailing-2', ), ], ) @pytest.mark.notimpl(["clickhouse", "dask", "datafusion"]) def test_grouped_bounded_preceding_window(backend, alltypes, df, window_fn): window = window_fn(alltypes) expr = alltypes.mutate(val=alltypes.double_col.sum().over(window)) result = expr.execute().set_index('id').sort_index() gdf = df.sort_values('id').groupby('string_col') expected = ( df.assign(
from pyspark.sql.window import Window from pytest import param import ibis pytest.importorskip('pyspark') pytestmark = pytest.mark.pyspark @pytest.mark.parametrize( ('ibis_window', 'spark_range'), [ param( ibis.trailing_window( preceding=ibis.interval(hours=1), order_by='time', group_by='key', ), (-3600, 0), ), param( ibis.trailing_window( preceding=ibis.interval(hours=2), order_by='time', group_by='key', ), (-7200, 0), ), param( ibis.range_window( preceding=0,
# These need to be tz-naive because the timestamp_col in # the test data is tz-naive return pd.Timestamp('20090105'), pd.Timestamp('20090111') def filter_by_time_context(df, context): return df[(df['timestamp_col'] >= context[0]) & (df['timestamp_col'] < context[1])] @pytest.mark.only_on_backends(['pandas', 'pyspark']) @pytest.mark.min_spark_version('3.1') @pytest.mark.parametrize( 'window', [ ibis.trailing_window(ibis.interval(days=3), order_by=ORDERBY_COL), ibis.trailing_window( ibis.interval(days=3), order_by=ORDERBY_COL, group_by=GROUPBY_COL, ), ], ) def test_context_adjustment_window_udf(alltypes, df, context, window): """ This test case aims to test context adjustment of udfs in window method. """ with option_context('context_adjustment.time_col', 'timestamp_col'): expr = alltypes.mutate(v1=calc_mean(alltypes[TARGET_COL]).over(window)) result = expr.execute(timecontext=context)