def test_batting_cumulative_partitioned(batting, batting_df, sort_kind):
    """Cumulative sum of games (G) per player, ordered by year.

    The ibis cumulative window result is compared against pandas'
    per-group expanding sum.
    """
    key, order = 'playerID', 'yearID'
    cum = batting.G.sum().over(
        ibis.cumulative_window(order_by=order, group_by=key)
    )
    result = batting.mutate(cumulative=cum).execute()

    # pandas equivalent: expanding sum within each player, indexed by year
    expected = (
        batting_df[[key, order, 'G']]
        .set_index(order)
        .groupby(key)
        .G.expanding()
        .sum()
        .rename('cumulative')
    )
    tm.assert_series_equal(
        result.set_index([key, order]).sort_index().cumulative,
        expected.sort_index(),
    )
def test_batting_most_hits(players, players_df):
    """Ranking hits descending via a cumulative window matches pandas rank."""
    ranked = players.mutate(
        hits_rank=lambda t: t.H.rank().over(
            ibis.cumulative_window(order_by=ibis.desc(t.H))
        )
    )
    result = ranked.execute()
    # pandas: min-method rank, descending, within each player
    rank_series = players_df.groupby('playerID').H.rank(
        method='min', ascending=False
    )
    expected = players_df.assign(hits_rank=rank_series)
    tm.assert_frame_equal(result[expected.columns], expected)
def test_cumulative_simple_window(alltypes, func, df):
    """An unordered cumulative window agrees with pandas' cum* methods."""
    agg = getattr(alltypes.double_col, func)
    delta = alltypes.double_col - agg().over(ibis.cumulative_window())
    projected = alltypes.projection([delta.name('double_col')])
    result = projected.execute().double_col
    # e.g. func='sum' -> df.double_col.cumsum()
    expected = df.double_col - getattr(df.double_col, 'cum%s' % func)()
    tm.assert_series_equal(result, expected)
def test_combine_preserves_existing_window():
    """A grouped mutate must not clobber a cumulative window's upper bound."""
    table = ibis.table(
        [('one', 'string'), ('two', 'double'), ('three', 'int32')],
        name='my_data',
    )
    cum_win = ibis.cumulative_window(order_by=table.one)
    mutated = table.group_by(table.three).mutate(
        four=table.two.sum().over(cum_win)
    )
    # following == 0 is what makes the frame cumulative; it must survive
    # the combination with the group_by.
    assert mutated.op().selections[1].op().window.following == 0
def test_cumulative_simple_window(self):
    """Each cumulative reduction matches its pandas cum* counterpart."""
    table = self.alltypes
    frame = table.execute()
    for name in ('sum', 'min', 'max'):
        agg = getattr(table.double_col, name)
        col = table.double_col - agg().over(ibis.cumulative_window())
        projected = table.projection([col.name('double_col')])
        result = projected.execute().double_col
        expected = frame.double_col - getattr(frame.double_col, 'cum%s' % name)()
        tm.assert_series_equal(result, expected)
def test_cumulative_ordered_window(alltypes, func, df):
    """An ordered cumulative window agrees with pandas on time-sorted data."""
    frame = df.sort_values('timestamp_col').reset_index(drop=True)
    win = ibis.cumulative_window(order_by=alltypes.timestamp_col)
    agg = getattr(alltypes.double_col, func)
    projected = alltypes.projection(
        [(alltypes.double_col - agg().over(win)).name('double_col')]
    )
    result = projected.execute().double_col
    expected = frame.double_col - getattr(frame.double_col, 'cum%s' % func)()
    tm.assert_series_equal(result, expected)
def _cumulative_to_window(expr, window):
    """Rewrite a cumulative op as a plain reduction over a cumulative window.

    Returns the rewritten expression together with the new window, which
    inherits the original window's grouping and ordering.
    """
    frame = (
        ibis.cumulative_window()
        .group_by(window._group_by)
        .order_by(window._order_by)
    )
    op = expr.op()
    # e.g. CumulativeSum -> Sum, looked up in the module-level mapping
    reduction_cls = _cumulative_to_reduction[type(op)]
    rewritten = reduction_cls(*op.args)
    return expr._factory(rewritten, name=expr._name), frame
def test_cumulative_ordered_window(self):
    """Ordered cumulative reductions match pandas cum* on sorted data."""
    table = self.alltypes
    frame = table.execute().sort_values('timestamp_col').reset_index(drop=True)
    win = ibis.cumulative_window(order_by=table.timestamp_col)
    for name in ('sum', 'min', 'max'):
        agg = getattr(table.double_col, name)
        projected = table.projection(
            [(table.double_col - agg().over(win)).name('double_col')]
        )
        result = projected.execute().double_col
        expected = frame.double_col - getattr(frame.double_col, 'cum%s' % name)()
        tm.assert_series_equal(result, expected)
def test_cumulative_partitioned_ordered_window(self):
    """Partitioned + ordered cumulative reductions match grouped pandas cum*."""
    table = self.alltypes
    frame = (
        table.execute()
        .sort_values(['string_col', 'timestamp_col'])
        .reset_index(drop=True)
    )
    win = ibis.cumulative_window(
        order_by=table.timestamp_col, group_by=table.string_col
    )
    for name in ('sum', 'min', 'max'):
        agg = getattr(table.double_col, name)
        projected = table.projection(
            [(table.double_col - agg().over(win)).name('double_col')]
        )
        result = projected.execute().double_col
        expected = frame.groupby(frame.string_col).double_col.transform(
            lambda c: c - getattr(c, 'cum%s' % name)()
        )
        tm.assert_series_equal(result, expected)
def test_batting_cumulative(batting, batting_df, sort_kind):
    """Year-ordered cumulative sum of G matches a pandas expanding sum."""
    expr = batting.mutate(
        more_values=lambda t: t.G.sum().over(
            ibis.cumulative_window(order_by=t.yearID)
        )
    )
    result = expr.execute()
    # expanding sum over the whole table, sorted by year with the same
    # sort algorithm the backend uses
    running = (
        batting_df[['G', 'yearID']]
        .sort_values('yearID', kind=sort_kind)
        .G.expanding()
        .sum()
        .astype('int64')
    )
    expected = batting_df.assign(more_values=running)
    tm.assert_frame_equal(result[expected.columns], expected)
def _cumulative_to_window(expr, window):
    """Turn a cumulative operation into its underlying reduction plus
    a cumulative window carrying the original grouping and ordering."""
    op = expr.op()
    # map e.g. CumulativeSum -> Sum and rebuild with the same args
    new_op = _cumulative_to_reduction[type(op)](*op.args)
    frame = ibis.cumulative_window()
    frame = frame.group_by(window._group_by)
    frame = frame.order_by(window._order_by)
    return expr._factory(new_op, name=expr._name), frame
def test_cumulative_partitioned_window(alltypes, func, df):
    """A grouped (unordered) cumulative window matches grouped pandas cum*."""
    frame = df.sort_values('string_col').reset_index(drop=True)
    win = ibis.cumulative_window(group_by=alltypes.string_col)
    agg = getattr(alltypes.double_col, func)
    projected = alltypes.projection(
        [(alltypes.double_col - agg().over(win)).name('double_col')]
    )
    result = projected.execute().double_col
    expected = frame.groupby(frame.string_col).double_col.transform(
        lambda c: c - getattr(c, 'cum%s' % func)()
    )
    tm.assert_series_equal(result, expected)
def test_batting_most_hits(players, players_df):
    """Descending-hit ranking via a cumulative window equals pandas rank."""
    expr = players.mutate(
        hits_rank=lambda t: t.H.rank().over(
            ibis.cumulative_window(order_by=ibis.desc(t.H))
        )
    )
    result = expr.execute()
    expected = players_df.assign(
        hits_rank=players_df.groupby('playerID').H.rank(
            method='min', ascending=False
        )
    )
    tm.assert_frame_equal(result[expected.columns], expected)
def test_cumulative_functions(alltypes, cumulative, static):
    """cum* helpers compile to the same SQL as the static reduction over a
    cumulative window."""
    win = ibis.window(order_by=alltypes.d)
    got = cumulative(alltypes, win).name('foo')
    want = static(alltypes, win).over(ibis.cumulative_window()).name('foo')
    assert to_sql(alltypes.projection(got)) == to_sql(alltypes.projection(want))
def test_cumulative_functions(alltypes, cumulative, static):
    """cum* helpers and explicit cumulative windows produce identical
    Impala SQL."""
    win = ibis.window(order_by=alltypes.d)
    got = cumulative(alltypes, win).name('foo')
    want = static(alltypes, win).over(ibis.cumulative_window()).name('foo')
    sql_got = ImpalaCompiler.to_sql(alltypes.projection(got))
    sql_want = ImpalaCompiler.to_sql(alltypes.projection(want))
    assert sql_got == sql_want
def test_cumulative_partitioned_ordered_window(alltypes, func, df):
    """Partitioned + ordered cumulative window matches grouped pandas cum*."""
    frame = df.sort_values(['string_col', 'timestamp_col']).reset_index(
        drop=True
    )
    win = ibis.cumulative_window(
        order_by=alltypes.timestamp_col, group_by=alltypes.string_col
    )
    agg = getattr(alltypes.double_col, func)
    projected = alltypes.projection(
        [(alltypes.double_col - agg().over(win)).name('double_col')]
    )
    result = projected.execute().double_col
    # e.g. func='sum' -> methodcaller('cumsum')
    cum = operator.methodcaller(f'cum{func}')
    expected = frame.groupby(frame.string_col).double_col.transform(
        lambda c: c - cum(c)
    )
    tm.assert_series_equal(result, expected)
def test_group_by_with_window_preserves_range():
    """Compiling a grouped cumulative window keeps the RANGE frame intact."""
    table = ibis.table(
        [('one', 'string'), ('two', 'double'), ('three', 'int32')],
        name='my_data',
    )
    cum_win = ibis.cumulative_window(order_by=table.one)
    expr = table.group_by(table.three).mutate(
        four=table.two.sum().over(cum_win)
    )
    compiled = ibis.impala.compile(expr)
    expected = """\
SELECT *, sum(`two`) OVER (PARTITION BY `three` ORDER BY `one` RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS `four`
FROM my_data"""  # noqa: E501
    assert compiled == expected
def _cumulative_to_window(translator, expr, window):
    """Lower a cumulative op to a windowized reduction.

    The cumulative op is replaced by its plain reduction, any
    translator-registered rewrite for that reduction is applied, and the
    result is windowized over a cumulative frame that reuses the original
    window's grouping and ordering.
    """
    frame = (
        ibis.cumulative_window()
        .group_by(window._group_by)
        .order_by(window._order_by)
    )
    op = expr.op()
    reduction = _cumulative_to_reduction[type(op)](*op.args)
    result = expr._factory(reduction, name=expr._name)
    # honor any backend-specific rewrite registered for the reduction
    if type(reduction) in translator._rewrites:
        result = translator._rewrites[type(reduction)](result)
    return L.windowize_function(result, frame)
def test_window_function(time_data):
    """SQL windowed aggregates equal ibis cumulative-window projections."""
    my_table = query(
        """SELECT count, duration_seconds, SUM(duration_seconds) OVER (PARTITION BY person) AS running_total, COUNT(duration_seconds) OVER (PARTITION BY person) AS running_count, AVG(duration_seconds) OVER (PARTITION BY person) AS running_avg FROM time_data"""
    ).execute()

    ibis_table = time_data.projection(
        [
            time_data.get_column("count"),
            time_data.duration_seconds,
            time_data.duration_seconds.sum()
            .over(ibis.cumulative_window(group_by=time_data.person))
            .name("running_total"),
            time_data.duration_seconds.count()
            .over(ibis.cumulative_window(group_by=time_data.person))
            .name("running_count"),
            time_data.duration_seconds.mean()
            .over(ibis.cumulative_window(group_by=time_data.person))
            .name("running_avg"),
        ]
    ).execute()
    assert_frame_equal(ibis_table, my_table)
def test_batting_cumulative_partitioned(batting, batting_df):
    """Cumulative sum of G per league, ordered by year, vs grouped pandas."""
    expr = batting.mutate(
        more_values=lambda t: t.G.sum().over(
            ibis.cumulative_window(order_by=t.yearID, group_by=t.lgID)
        )
    )
    result = expr.execute().more_values

    key = 'lgID'
    # sort each league by year, then take the per-league cumulative sum
    grouped = (
        batting_df[['G', 'yearID', 'lgID']]
        .groupby(key, sort=False, as_index=False)
        .apply(lambda df: df.sort_values('yearID'))
        .groupby(key, sort=False)
        .G.cumsum()
        .sort_index(level=-1)
    )
    # drop the group levels the double-groupby introduced and realign
    # with the original frame's index
    expected = grouped.reset_index(
        list(range(grouped.index.nlevels - 1)), drop=True
    ).reindex(batting_df.index)
    expected.name = result.name
    tm.assert_series_equal(result, expected)
def test_batting_cumulative_partitioned(batting, batting_df, sort_kind):
    """Expanding per-player sum of G, ordered by year, using string keys."""
    group_key = 'playerID'
    sort_key = 'yearID'
    window = ibis.cumulative_window(order_by=sort_key, group_by=group_key)
    result = batting.mutate(
        cumulative=batting.G.sum().over(window)
    ).execute()

    expected = (
        batting_df[[group_key, sort_key, 'G']]
        .set_index(sort_key)
        .groupby(group_key)
        .G.expanding()
        .sum()
        .rename('cumulative')
    )
    tm.assert_series_equal(
        result.set_index([group_key, sort_key]).sort_index().cumulative,
        expected.sort_index(),
    )
def cumulative_to_window(translator, expr, window):
    """Lower a cumulative op to a windowized reduction expression.

    Builds the plain reduction from the cumulative op, preserves the
    original expression's name, applies any translator rewrite registered
    for the reduction, and windowizes over a cumulative frame carrying the
    source window's grouping and ordering.
    """
    op = expr.op()
    reduction = _cumulative_to_reduction[type(op)](*op.args)
    result = reduction.to_expr()
    # keep the user-visible name, if any
    if expr.has_name():
        result = result.name(expr.get_name())
    if type(reduction) in translator._rewrites:
        result = translator._rewrites[type(reduction)](result)
    frame = (
        ibis.cumulative_window()
        .group_by(window._group_by)
        .order_by(window._order_by)
    )
    return L.windowize_function(result, frame)
def test_window_frame_specs(self):
    """Every window spec renders the expected SQL frame clause."""
    t = self.con.table('alltypes')
    ex_template = """\
SELECT sum(d) OVER (ORDER BY f {0}) AS `foo`
FROM alltypes"""

    cases = [
        (
            window(preceding=0),
            'range between current row and unbounded following',
        ),
        (
            window(following=0),
            'range between unbounded preceding and current row',
        ),
        (
            window(preceding=5),
            'rows between 5 preceding and unbounded following',
        ),
        (
            window(preceding=5, following=0),
            'rows between 5 preceding and current row',
        ),
        (
            window(preceding=5, following=2),
            'rows between 5 preceding and 2 following',
        ),
        (
            window(following=2),
            'rows between unbounded preceding and 2 following',
        ),
        (
            window(following=2, preceding=0),
            'rows between current row and 2 following',
        ),
        (
            window(preceding=5),
            'rows between 5 preceding and unbounded following',
        ),
        (
            window(following=[5, 10]),
            'rows between 5 following and 10 following',
        ),
        (
            window(preceding=[10, 5]),
            'rows between 10 preceding and 5 preceding',
        ),
        # cumulative windows
        (
            ibis.cumulative_window(),
            'range between unbounded preceding and current row',
        ),
        # trailing windows
        (
            ibis.trailing_window(10),
            'rows between 10 preceding and current row',
        ),
    ]
    for spec, frame in cases:
        ordered = spec.order_by(t.f)
        expr = t.projection([t.d.sum().over(ordered).name('foo')])
        self._check_sql(expr, ex_template.format(frame.upper()))
def test_batting_cumulative(batting, batting_df, sort_kind):
    """Year-ordered cumulative sum of G equals pandas' expanding sum."""
    expr = batting.mutate(
        more_values=lambda t: t.G.sum().over(
            ibis.cumulative_window(order_by=t.yearID)
        )
    )
    result = expr.execute()
    running = (
        batting_df[['G', 'yearID']]
        .sort_values('yearID', kind=sort_kind)
        .G.expanding()
        .sum()
    )
    expected = batting_df.assign(more_values=running)
    tm.assert_frame_equal(result[expected.columns], expected)
def test_complex_window(client):
    """Mixed window sizes exercise per-op time-context adjustment.

    Combines trailing (1h, 2h) and cumulative windowed counts — which each
    require their own context adjustment — with a plain grouped count that
    does not, and checks the result against the pandas equivalents.
    """
    table = client.table('time_indexed_table')
    context = (
        pd.Timestamp('20170102 07:00:00', tz='UTC'),
        pd.Timestamp('20170105', tz='UTC'),
    )
    win_1h = ibis.trailing_window(
        preceding=ibis.interval(hours=1), order_by='time', group_by='key'
    )
    win_2h = ibis.trailing_window(
        preceding=ibis.interval(hours=2), order_by='time', group_by='key'
    )
    win_cum = ibis.cumulative_window(order_by='time', group_by='key')

    # each window op should adjust the context it computes over
    result_pd = (
        table.mutate(
            count_1h=table['value'].count().over(win_1h),
            count_2h=table['value'].count().over(win_2h),
            count_cum=table['value'].count().over(win_cum),
        )
        .mutate(count=table['value'].count())
        .execute(timecontext=context)
    )

    df = table.execute()
    expected_win_1h = (
        df.set_index('time')
        .groupby('key')
        .value.rolling('1h', closed='both')
        .count()
        .rename('count_1h')
        .astype(int)
    )
    expected_win_2h = (
        df.set_index('time')
        .groupby('key')
        .value.rolling('2h', closed='both')
        .count()
        .rename('count_2h')
        .astype(int)
    )
    expected_cum_win = (
        df.set_index('time')
        .groupby('key')
        .value.expanding()
        .count()
        .rename('count_cum')
        .astype(int)
    )
    df = df.set_index('time')
    df = df.assign(
        count_1h=expected_win_1h.sort_index(
            level=['time', 'key']
        ).reset_index(level='key', drop=True)
    )
    df = df.assign(
        count_2h=expected_win_2h.sort_index(
            level=['time', 'key']
        ).reset_index(level='key', drop=True)
    )
    df = df.assign(
        count_cum=expected_cum_win.sort_index(
            level=['time', 'key']
        ).reset_index(level='key', drop=True)
    )
    df['count'] = df.groupby(['key'])['value'].transform('count')
    df = df.reset_index()
    # keep only rows inside the requested context window
    expected = (
        df[df.time.between(*(t.tz_convert(None) for t in context))]
        .sort_values(['key'])
        .reset_index(drop=True)
    )
    tm.assert_frame_equal(result_pd, expected)
def test_cumulative_functions(con):
    """Each cum* op compiles to the same SQL as its static reduction over a
    cumulative window."""
    table = con.table('alltypes')
    win = ibis.window(order_by=table.d)
    pairs = [
        (table.f.cumsum().over(win), table.f.sum().over(win)),
        (table.f.cummin().over(win), table.f.min().over(win)),
        (table.f.cummax().over(win), table.f.max().over(win)),
        (table.f.cummean().over(win), table.f.mean().over(win)),
    ]
    for cum_expr, plain_expr in pairs:
        got = cum_expr.name('foo')
        want = plain_expr.over(ibis.cumulative_window()).name('foo')
        assert to_sql(table.projection(got)) == to_sql(
            table.projection(want)
        )
def test_cumulative_functions(self):
    """cum* ops and static reductions over cumulative windows emit the
    same SQL."""
    table = self.con.table('alltypes')
    win = ibis.window(order_by=table.d)
    pairs = [
        (table.f.cumsum().over(win), table.f.sum().over(win)),
        (table.f.cummin().over(win), table.f.min().over(win)),
        (table.f.cummax().over(win), table.f.max().over(win)),
        (table.f.cummean().over(win), table.f.mean().over(win)),
    ]
    for cum_expr, plain_expr in pairs:
        got = cum_expr.name('foo')
        want = plain_expr.over(ibis.cumulative_window()).name('foo')
        self._compare_sql(table.projection(got), table.projection(want))
), ( window(preceding=5), 'rows between 5 preceding and unbounded following', ), ( window(following=[5, 10]), 'rows between 5 following and 10 following', ), ( window(preceding=[10, 5]), 'rows between 10 preceding and 5 preceding', ), # # cumulative windows ( ibis.cumulative_window(), 'range between unbounded preceding and current row', ), # # trailing windows ( ibis.trailing_window(10), 'rows between 10 preceding and current row', ), ], ) def test_window_frame_specs(con, window, frame): t = con.table('alltypes') ex_template = """\ SELECT sum(`d`) OVER (ORDER BY `f` {0}) AS `foo` FROM ibis_testing.`alltypes`"""