def setup(self):
    """Create the pandas-backed ibis table and store query expressions on ``self``.

    A frame of 30 * 200,000 rows is registered as table ``df`` through
    ``ibis.pandas.connect``.  Group-by, cast, sort and trailing-range-window
    expressions over that table are built and kept as attributes; none of
    them is executed here.
    """
    per_day = int(2e5)
    n = 30 * per_day

    # Entries are evaluated top to bottom, so the np.random draws happen in
    # the same order as the columns are listed.
    columns = {
        'key': np.random.choice(16000, size=n),
        'low_card_key': np.random.choice(30, size=n),
        'value': np.random.rand(n),
        'timestamps': pd.date_range(start='now', periods=n, freq='s').values,
        'timestamp_strings': pd.date_range(
            start='now', periods=n, freq='s'
        ).values.astype(str),
        # 30 distinct days, each repeated n / 30 times.
        'repeated_timestamps': pd.date_range(
            start='2018-09-01', periods=30
        ).repeat(per_day),
    }
    frame = pd.DataFrame(columns)
    con = ibis.pandas.connect({'df': frame})
    t = con.table('df')

    # Aggregations.
    by_key = t.groupby(t.key)
    self.high_card_group_by = by_key.aggregate(avg_value=t.value.mean())

    # Casts to date, from timestamps and from their string form.
    self.cast_to_dates = t.timestamps.cast(dt.date)
    self.cast_to_dates_from_strings = t.timestamp_strings.cast(dt.date)

    with_dates = t.mutate(dates=t.timestamps.cast('date'))
    by_card_and_date = with_dates.groupby(['low_card_key', 'dates'])
    self.multikey_group_by_with_mutate = by_card_and_date.aggregate(
        avg_value=lambda tbl: tbl.value.mean()
    )

    # Sorts, on the full table and on column subsets.
    self.simple_sort = t.sort_by([t.key])
    key_value = t[['key', 'value']]
    self.simple_sort_projection = key_value.sort_by(['key'])
    self.multikey_sort = t.sort_by(['low_card_key', 'key'])
    three_cols = t[['low_card_key', 'key', 'value']]
    self.multikey_sort_projection = three_cols.sort_by(['low_card_key', 'key'])

    def two_day_window(group_col):
        # Two-day trailing range window ordered by repeated_timestamps.
        return ibis.trailing_range_window(
            ibis.interval(days=2),
            order_by=t.repeated_timestamps,
            group_by=group_col,
        )

    self.low_card_grouped_rolling = t.value.mean().over(
        two_day_window(t.low_card_key)
    )
    self.high_card_grouped_rolling = t.value.mean().over(
        two_day_window(t.key)
    )
def setup(self):
    """Build the in-memory table ``df`` and attach ibis expressions to ``self``.

    The table has 30 * 200,000 rows and is served by the ibis pandas
    backend.  The attributes set here are unexecuted expressions:
    aggregations, date casts, sorts and two-day rolling means.
    """
    n = 30 * int(2e5)
    repeats = int(n / 30)

    # Columns are produced one by one, in the order they appear in the frame,
    # so the sequence of np.random calls is unchanged.
    key = np.random.choice(16000, size=n)
    low_card_key = np.random.choice(30, size=n)
    value = np.random.rand(n)
    timestamps = pd.date_range(start='now', periods=n, freq='s').values
    timestamp_strings = pd.date_range(
        start='now', periods=n, freq='s'
    ).values.astype(str)
    repeated_timestamps = pd.date_range(
        start='2018-09-01', periods=30
    ).repeat(repeats)

    data = pd.DataFrame({
        'key': key,
        'low_card_key': low_card_key,
        'value': value,
        'timestamps': timestamps,
        'timestamp_strings': timestamp_strings,
        'repeated_timestamps': repeated_timestamps,
    })
    t = ibis.pandas.connect({'df': data}).table('df')

    self.high_card_group_by = t.groupby(t.key).aggregate(
        avg_value=t.value.mean()
    )

    self.cast_to_dates = t.timestamps.cast(dt.date)
    self.cast_to_dates_from_strings = t.timestamp_strings.cast(dt.date)

    dated = t.mutate(dates=t.timestamps.cast('date'))
    self.multikey_group_by_with_mutate = dated.groupby(
        ['low_card_key', 'dates']
    ).aggregate(avg_value=lambda tbl: tbl.value.mean())

    self.simple_sort = t.sort_by([t.key])
    self.simple_sort_projection = t[['key', 'value']].sort_by(['key'])

    sort_keys = ['low_card_key', 'key']
    self.multikey_sort = t.sort_by(list(sort_keys))
    self.multikey_sort_projection = t[sort_keys + ['value']].sort_by(
        list(sort_keys)
    )

    # Both windows look back two days along repeated_timestamps; they differ
    # only in the grouping column.
    two_days = ibis.interval(days=2)
    low_card_window = ibis.trailing_range_window(
        two_days,
        order_by=t.repeated_timestamps,
        group_by=t.low_card_key,
    )
    high_card_window = ibis.trailing_range_window(
        ibis.interval(days=2),
        order_by=t.repeated_timestamps,
        group_by=t.key,
    )
    self.low_card_grouped_rolling = t.value.mean().over(low_card_window)
    self.high_card_grouped_rolling = t.value.mean().over(high_card_window)
def test_trailing_range_window_unsupported(alltypes, preceding, value):
    """Compiling a mean over this trailing range window must raise ValueError.

    ``alltypes`` and ``preceding`` are supplied from outside the function
    (presumably pytest fixtures/parametrization); ``value`` is accepted but
    not used here.
    """
    window = ibis.trailing_range_window(
        preceding=preceding,
        order_by=alltypes.timestamp_col,
    )
    rolling_avg = alltypes.float_col.mean().over(window)
    expr = alltypes.mutate(win_avg=rolling_avg)

    # Only compilation is expected to fail; building the expression is not.
    with pytest.raises(ValueError):
        expr.compile()
def test_udaf_window_interval():
    """Check ``my_mean`` over a two-day trailing window against pandas rolling.

    The ibis result, sorted by ``time`` and ``key``, must equal a per-key
    pandas ``rolling('2D', closed='both')`` mean computed on the same frame.
    """
    # Five daily timestamps in descending order (2019-01-05 down to 01-01).
    times = pd.date_range(start='20190105', end='20190101', freq='-1D')
    df = pd.DataFrame(
        collections.OrderedDict([
            ("time", times),
            ("key", [1, 2, 1, 2, 1]),
            ("value", np.arange(5)),
        ])
    )

    t = connect({'df': df}).table('df')
    window = ibis.trailing_range_window(
        ibis.interval(days=2),
        order_by='time',
        group_by='key',
    )
    expr = t.mutate(rolled=my_mean(t.value).over(window))

    executed = expr.execute()
    result = executed.sort_values(['time', 'key']).reset_index(drop=True)

    def rolling_mean_per_key(frame):
        # Rolling mean within each key; drop the group level so the result
        # aligns with the time index of ``frame``.
        rolled = frame.groupby('key').value.rolling('2D', closed='both').mean()
        return rolled.reset_index(level=0, drop=True)

    expected = (
        df.sort_values(['time', 'key'])
        .set_index('time')
        .assign(rolled=rolling_mean_per_key)
        .reset_index(drop=False)
    )

    tm.assert_frame_equal(result, expected)
def test_combine_window_with_interval_offset(alltypes):
    """Combining two interval windows keeps the first bound and merges order_by.

    Checked for trailing (preceding) windows and for windows with a
    ``following`` interval.
    """
    t = alltypes

    # Trailing windows: 3 days on e combined with 4 days on f.
    trailing_e = ibis.trailing_range_window(
        preceding=ibis.interval(days=3), order_by=t.e
    )
    trailing_f = ibis.trailing_range_window(
        preceding=ibis.interval(days=4), order_by=t.f
    )
    combined_trailing = trailing_e.combine(trailing_f)
    expected_trailing = ibis.trailing_range_window(
        preceding=ibis.interval(days=3), order_by=[t.e, t.f]
    )
    assert_equal(combined_trailing, expected_trailing)

    # Following windows: 5 days on e combined with 7 days on f.
    following_e = ibis.range_window(
        following=ibis.interval(days=5), order_by=t.e
    )
    following_f = ibis.range_window(
        following=ibis.interval(days=7), order_by=t.f
    )
    expected_following = ibis.range_window(
        following=ibis.interval(days=5), order_by=[t.e, t.f]
    )
    combined_following = following_e.combine(following_f)
    assert_equal(combined_following, expected_following)
def test_trailing_range_window_unsupported(alltypes, preceding, value):
    """Expect ``ValueError`` when compiling a mean over the given trailing window.

    ``value`` is part of the signature but is not read in the body.
    """
    ts_window = ibis.trailing_range_window(
        preceding=preceding, order_by=alltypes.timestamp_col
    )
    windowed_mean = alltypes.float_col.mean().over(ts_window)
    expr = alltypes.mutate(win_avg=windowed_mean)

    # The expression is built outside the context manager so that only the
    # compile step is covered by the ValueError expectation.
    with pytest.raises(ValueError):
        expr.compile()
def test_trailing_range_window_unsupported(alltypes, preceding, value):
    """Expect ``ValueError`` on compile; skipped unless ibis is newer than 1.x.

    ``value`` is accepted but unused in the body.
    """
    if IBIS_VERSION <= IBIS_1_VERSION:
        pytest.skip("requires ibis 2.x")

    window = ibis.trailing_range_window(
        preceding=preceding,
        order_by=alltypes.timestamp_col,
    )
    averaged = alltypes.float_col.mean().over(window)
    expr = alltypes.mutate(win_avg=averaged)

    # Only the compile call is expected to raise.
    with pytest.raises(ValueError):
        expr.compile()
def test_trailing_range_window(alltypes, preceding, value, project_id):
    """Check the SQL compiled for a mean over a trailing range window.

    The window is ordered by ``timestamp_col`` with ``preceding`` as its
    bound; ``value`` is the text expected in the ``RANGE BETWEEN`` clause
    and ``project_id`` the project prefix of the table name.
    """
    t = alltypes
    w = ibis.trailing_range_window(preceding=preceding,
                                   order_by=t.timestamp_col)
    expr = t.mutate(win_avg=t.float_col.mean().over(w))
    result = expr.compile()
    # NOTE(review): the comparison below is whitespace-sensitive; confirm the
    # line breaks and indentation of the expected text match the compiler's
    # actual output.
    expected = f"""\
SELECT *,
       avg(`float_col`) OVER (ORDER BY UNIX_MICROS(`timestamp_col`) RANGE BETWEEN {value} PRECEDING AND CURRENT ROW) AS `win_avg`
FROM `{project_id}.testing.functional_alltypes`"""  # noqa: E501
    assert result == expected
def test_trailing_range_window(alltypes, preceding, value, project_id):
    """Check the SQL compiled for a mean over a trailing range window.

    ``value`` and ``project_id`` are substituted, in that order, into the
    expected SQL text before it is compared with the compiled result.
    """
    t = alltypes
    w = ibis.trailing_range_window(
        preceding=preceding, order_by=t.timestamp_col
    )
    expr = t.mutate(win_avg=t.float_col.mean().over(w))
    result = expr.compile()
    # NOTE(review): the comparison below is whitespace-sensitive; confirm the
    # line breaks and indentation of the expected text match the compiler's
    # actual output.
    expected = """\
SELECT *,
       avg(`float_col`) OVER (ORDER BY UNIX_MICROS(`timestamp_col`) RANGE BETWEEN {} PRECEDING AND CURRENT ROW) AS `win_avg`
FROM `{}.testing.functional_alltypes`""".format(  # noqa: E501
        value, project_id
    )
    assert result == expected
def setup(self):
    """Create the pandas-backed ibis table and store query expressions on ``self``.

    A frame of 30 * 200,000 rows is kept as ``self.data`` and registered as
    table ``df`` through ``ibis.pandas.connect``.  The attributes set
    afterwards are unexecuted ibis expressions: aggregations, date casts,
    sorts, two-day rolling means, and UDF-based window expressions.

    Each ``low_card_*`` / ``high_card_*`` pair is meant to differ only in the
    grouping column (``low_card_key`` with 30 possible values versus ``key``
    with 16000).
    """
    n = 30 * int(2e5)
    self.data = pd.DataFrame({
        'key': np.random.choice(16000, size=n),
        'low_card_key': np.random.choice(30, size=n),
        'value': np.random.rand(n),
        'timestamps': pd.date_range(
            start='now', periods=n, freq='s'
        ).values,
        'timestamp_strings': pd.date_range(
            start='now', periods=n, freq='s'
        ).values.astype(str),
        # 30 distinct days, each repeated n / 30 times.
        'repeated_timestamps': pd.date_range(
            start='2018-09-01', periods=30
        ).repeat(int(n / 30)),
    })

    t = ibis.pandas.connect({'df': self.data}).table('df')

    self.high_card_group_by = t.groupby(t.key).aggregate(
        avg_value=t.value.mean()
    )

    self.cast_to_dates = t.timestamps.cast(dt.date)
    self.cast_to_dates_from_strings = t.timestamp_strings.cast(dt.date)

    self.multikey_group_by_with_mutate = (
        t.mutate(dates=t.timestamps.cast('date'))
        .groupby(['low_card_key', 'dates'])
        .aggregate(avg_value=lambda t: t.value.mean())
    )

    self.simple_sort = t.sort_by([t.key])
    self.simple_sort_projection = t[['key', 'value']].sort_by(['key'])
    self.multikey_sort = t.sort_by(['low_card_key', 'key'])
    self.multikey_sort_projection = t[
        ['low_card_key', 'key', 'value']
    ].sort_by(['low_card_key', 'key'])

    # Two-day trailing windows ordered by repeated_timestamps; they differ
    # only in the grouping column.
    low_card_rolling_window = ibis.trailing_range_window(
        ibis.interval(days=2),
        order_by=t.repeated_timestamps,
        group_by=t.low_card_key,
    )
    self.low_card_grouped_rolling = t.value.mean().over(
        low_card_rolling_window
    )

    high_card_rolling_window = ibis.trailing_range_window(
        ibis.interval(days=2),
        order_by=t.repeated_timestamps,
        group_by=t.key,
    )
    self.high_card_grouped_rolling = t.value.mean().over(
        high_card_rolling_window
    )

    # Reduction UDF: one double column in, one double out.
    @udf.reduction(['double'], 'double')
    def my_mean(series):
        return series.mean()

    self.low_card_grouped_rolling_udf_mean = my_mean(t.value).over(
        low_card_rolling_window
    )
    self.high_card_grouped_rolling_udf_mean = my_mean(t.value).over(
        high_card_rolling_window
    )

    # Analytic UDF: z-score of each value within its window partition.
    @udf.analytic(['double'], 'double')
    def my_zscore(series):
        return (series - series.mean()) / series.std()

    # Unbounded windows, partitioned only by the grouping column.
    low_card_window = ibis.window(group_by=t.low_card_key)
    high_card_window = ibis.window(group_by=t.key)

    self.low_card_window_analytics_udf = my_zscore(t.value).over(
        low_card_window
    )
    self.high_card_window_analytics_udf = my_zscore(t.value).over(
        high_card_window
    )

    # Two-argument reduction UDF: weighted mean of v with weights w.
    @udf.reduction(['double', 'double'], 'double')
    def my_wm(v, w):
        return np.average(v, weights=w)

    self.low_card_grouped_rolling_udf_wm = my_wm(t.value, t.value).over(
        low_card_rolling_window
    )
    # The high-cardinality variant must use the window grouped by ``key``;
    # using the low-cardinality window here would just duplicate the
    # expression above.
    self.high_card_grouped_rolling_udf_wm = my_wm(t.value, t.value).over(
        high_card_rolling_window
    )
def high_card_rolling_window(t):
    """Return a two-day trailing range window over ``t``.

    The window is ordered by ``t.repeated_timestamps`` and grouped by
    ``t.key``.
    """
    lookback = ibis.interval(days=2)
    window = ibis.trailing_range_window(
        lookback,
        order_by=t.repeated_timestamps,
        group_by=t.key,
    )
    return window