def setup(self):
    """Create the pandas-backed ibis table and store the expressions on self.

    The frame has ``30 * int(2e5)`` rows with a high-cardinality key
    (16000 distinct values), a low-cardinality key (30 distinct values),
    random float values, per-second timestamps (as datetimes and as
    strings), and 30 daily timestamps each repeated ``int(2e5)`` times.
    Group-by, cast, sort and grouped rolling-window expressions over that
    table are built once here and kept as attributes.
    """
    rows_per_day = int(2e5)
    n = 30 * rows_per_day

    # Columns are generated in the order they appear in the frame so the
    # sequence of random draws stays the same.
    key = np.random.choice(16000, size=n)
    low_card_key = np.random.choice(30, size=n)
    value = np.random.rand(n)
    timestamps = pd.date_range(start='now', periods=n, freq='s').values
    timestamp_strings = pd.date_range(
        start='now', periods=n, freq='s').values.astype(str)
    repeated_timestamps = pd.date_range(
        start='2018-09-01', periods=30).repeat(rows_per_day)

    data = pd.DataFrame({
        'key': key,
        'low_card_key': low_card_key,
        'value': value,
        'timestamps': timestamps,
        'timestamp_strings': timestamp_strings,
        'repeated_timestamps': repeated_timestamps,
    })

    client = ibis.pandas.connect({'df': data})
    t = client.table('df')

    # Aggregations.
    self.high_card_group_by = t.groupby(t.key).aggregate(
        avg_value=t.value.mean())

    # Casts to date, from timestamps and from their string form.
    self.cast_to_dates = t.timestamps.cast(dt.date)
    self.cast_to_dates_from_strings = t.timestamp_strings.cast(dt.date)

    with_dates = t.mutate(dates=t.timestamps.cast('date'))
    by_key_and_date = with_dates.groupby(['low_card_key', 'dates'])
    self.multikey_group_by_with_mutate = by_key_and_date.aggregate(
        avg_value=lambda tbl: tbl.value.mean())

    # Sorts, with and without a preceding projection.
    self.simple_sort = t.sort_by([t.key])
    self.simple_sort_projection = t[['key', 'value']].sort_by(['key'])
    sort_keys = ['low_card_key', 'key']
    self.multikey_sort = t.sort_by(['low_card_key', 'key'])
    self.multikey_sort_projection = t[
        ['low_card_key', 'key', 'value']].sort_by(sort_keys)

    def grouped_rolling_mean(group_key):
        # Mean of `value` over a trailing two-day range window, ordered by
        # the repeated daily timestamps and partitioned by `group_key`.
        window = ibis.trailing_range_window(
            2 * ibis.day(),
            order_by=t.repeated_timestamps,
            group_by=group_key)
        return t.value.mean().over(window)

    self.low_card_grouped_rolling = grouped_rolling_mean(t.low_card_key)
    self.high_card_grouped_rolling = grouped_rolling_mean(t.key)
def test_timestamp_scalar_in_filter(alltypes, translate):
    """Execute a count over a filter that compares against timestamp scalars.

    Both predicates compare ``timestamp_col`` with a scalar timestamp plus
    an interval; the test passes as long as execution does not raise.
    """
    fixed_cutoff = ibis.timestamp('2010-01-01') + ibis.week(3)
    before_fixed_cutoff = alltypes.timestamp_col < fixed_cutoff

    relative_cutoff = ibis.now() + ibis.day(10)
    before_relative_cutoff = alltypes.timestamp_col < relative_cutoff

    filtered = alltypes.filter([before_fixed_cutoff, before_relative_cutoff])
    filtered.count().execute()
def test_keyed_asof_join_with_tolerance(
        time_keyed_left, time_keyed_right, time_keyed_df1, time_keyed_df2):
    """Compare a keyed asof join with a two-day tolerance to pd.merge_asof."""
    two_days = 2 * ibis.day()
    joined = time_keyed_left.asof_join(
        time_keyed_right, 'time', by='key', tolerance=two_days)
    actual = joined.execute()

    reference = pd.merge_asof(
        time_keyed_df1,
        time_keyed_df2,
        on='time',
        by='key',
        tolerance=pd.Timedelta('2D'),
    )

    # Only the columns present in the pandas result are compared.
    tm.assert_frame_equal(actual[reference.columns], reference)
def test_timestamp_scalar_in_filter(self):
    """Execute a count over a filter that compares against timestamp scalars.

    The test passes as long as execution does not raise.
    """
    # #310
    alltypes = self.alltypes

    fixed_cutoff = ibis.timestamp('2010-01-01') + ibis.month(3)
    before_fixed_cutoff = alltypes.timestamp_col < fixed_cutoff

    relative_cutoff = ibis.now() + ibis.day(10)
    before_relative_cutoff = alltypes.timestamp_col < relative_cutoff

    filtered = alltypes.filter([before_fixed_cutoff, before_relative_cutoff])
    filtered.count().execute()
def test_window_with_preceding_expr():
    """Check a trailing window of three days against pandas' rolling('3d')."""
    dates = pd.date_range('20180101', '20180110')
    first_value = 2
    values = np.arange(first_value, first_value + len(dates))
    frame = pd.DataFrame({'value': values, 'time': dates}, index=dates)

    table = ibis.pandas.connect({'df': frame}).table('df')

    # Reference result computed directly with pandas; the index name is
    # cleared before comparison.
    expected = frame.set_index('time').value.rolling('3d').mean()
    expected.index.name = None

    three_days = 3 * ibis.day()
    trailing = ibis.trailing_window(three_days, order_by=table.time)
    actual = table.value.mean().over(trailing).execute()

    tm.assert_series_equal(actual, expected)
# Regression test for the root cause of issue #310: a filter on
# `functional_alltypes` whose two predicates compare `timestamp_col` against
# timestamp-plus-interval scalars is compiled with to_sql and must equal the
# expected SELECT count(*) statement, whose WHERE clause uses months_add and
# days_add.
# NOTE(review): this definition is collapsed onto one line in this view, so the
# line breaks inside the expected SQL literal are not visible; the code is left
# untouched rather than guessing at that string's layout.
def test_where_analyze_scalar_op(self): # root cause of #310 table = self.con.table('functional_alltypes') expr = (table.filter([ table.timestamp_col < (ibis.timestamp('2010-01-01') + ibis.month(3)), table.timestamp_col < (ibis.now() + ibis.day(10)) ]).count()) result = to_sql(expr) expected = """\ SELECT count(*) AS `tmp` FROM functional_alltypes WHERE `timestamp_col` < months_add('2010-01-01 00:00:00', 3) AND `timestamp_col` < days_add(now(), 10)""" assert result == expected
# Regression test for the root cause of issue #310: builds a count over
# `functional_alltypes` filtered by two `timestamp_col` comparisons against
# timestamp-plus-interval scalars, compiles it with to_sql, and asserts the
# result equals the expected statement using months_add and days_add.
# NOTE(review): this definition is collapsed onto one line in this view, so the
# line breaks inside the expected SQL literal are not visible; the code is left
# untouched rather than guessing at that string's layout.
def test_where_analyze_scalar_op(self): # root cause of #310 table = self.con.table('functional_alltypes') expr = (table.filter([table.timestamp_col < (ibis.timestamp('2010-01-01') + ibis.month(3)), table.timestamp_col < (ibis.now() + ibis.day(10))]) .count()) result = to_sql(expr) expected = """\ SELECT count(*) AS `tmp` FROM functional_alltypes WHERE `timestamp_col` < months_add('2010-01-01 00:00:00', 3) AND `timestamp_col` < days_add(now(), 10)""" assert result == expected
def test_comparison_timestamp(self):
    """Comparing the column with its minimum plus three days is boolean."""
    column = self.col
    threshold = column.min() + ibis.day(3)
    comparison = column > threshold
    assert isinstance(comparison, ir.BooleanArray)
# This line holds two pieces. First, the tail of a test whose `def` lies
# outside this view: it compiles an expression and compares it to SQL with a
# RANGE BETWEEN 4 PRECEDING AND 2 PRECEDING window partitioned by `year`.
# Second, the complete parametrized test test_trailing_range_window: for each
# (preceding, value) pair it builds ibis.trailing_range_window(preceding=...,
# order_by=t.timestamp_col), compiles a mutate that averages `float_col` over
# that window, and asserts the SQL contains
# RANGE BETWEEN {value} PRECEDING AND CURRENT ROW ordered by
# UNIX_MICROS(`timestamp_col`). The parametrization maps a plain 5 to 5,
# ibis.microsecond() to 1 and ibis.second() to 1000000, so interval values
# appear to be expressed in microseconds.
# NOTE(review): the definitions are collapsed onto one line in this view, so
# the line breaks inside the expected SQL literals are not visible; the code is
# left untouched.
result = expr.compile() expected = """\ SELECT *, avg(`float_col`) OVER (PARTITION BY `year` ORDER BY UNIX_MICROS(`timestamp_col`) RANGE BETWEEN 4 PRECEDING AND 2 PRECEDING) AS `win_avg` FROM `{}.testing.functional_alltypes`""".format(project_id) # noqa: E501 assert result == expected @pytest.mark.parametrize(('preceding', 'value'), [ (5, 5), (ibis.nanosecond(), 0.001), (ibis.microsecond(), 1), (ibis.second(), 1000000), (ibis.minute(), 1000000 * 60), (ibis.hour(), 1000000 * 60 * 60), (ibis.day(), 1000000 * 60 * 60 * 24), (2 * ibis.day(), 1000000 * 60 * 60 * 24 * 2), (ibis.week(), 1000000 * 60 * 60 * 24 * 7), ]) def test_trailing_range_window(alltypes, preceding, value, project_id): t = alltypes w = ibis.trailing_range_window(preceding=preceding, order_by=t.timestamp_col) expr = t.mutate(win_avg=t.float_col.mean().over(w)) result = expr.compile() expected = """\ SELECT *, avg(`float_col`) OVER (ORDER BY UNIX_MICROS(`timestamp_col`) RANGE BETWEEN {} PRECEDING AND CURRENT ROW) AS `win_avg` FROM `{}.testing.functional_alltypes`""".format( # noqa: E501 value, project_id) assert result == expected
def test_comparison_timestamp(alltypes):
    """Comparing column `i` with its minimum plus three days is boolean."""
    column = alltypes.i
    threshold = column.min() + ibis.day(3)
    comparison = column > threshold
    assert isinstance(comparison, ir.BooleanColumn)
# Module-level setup followed by test_lead.
# - `execute` is an alias for ibis.pandas.execute; `pytestmark` is set to
#   pytest.mark.pandas.
# - The session-scoped `sort_kind` fixture returns 'mergesort'.
# - `default`, `row_offset` and `delta_offset` are reusable parametrize
#   markers: defaults of ibis.NA or the literal 'a', literal row offsets of
#   -1, 1 and 0, and interval offsets of one day, two days and minus two days.
# - test_lead executes t.dup_strings.lead(row_offset, default=default) and
#   compares it with df.dup_strings.shift by the negated offset, filling
#   missing values with the executed default unless the default is ibis.NA.
# The trailing `@default @row_offset` decorators belong to a definition that
# continues past this view.
execute = ibis.pandas.execute pytestmark = pytest.mark.pandas @pytest.fixture(scope='session') def sort_kind(): return 'mergesort' default = pytest.mark.parametrize('default', [ibis.NA, ibis.literal('a')]) row_offset = pytest.mark.parametrize('row_offset', list(map(ibis.literal, [-1, 1, 0]))) delta_offset = pytest.mark.parametrize( 'delta_offset', [ibis.day(), 2 * ibis.day(), -2 * ibis.day()]) @default @row_offset def test_lead(t, df, row_offset, default): expr = t.dup_strings.lead(row_offset, default=default) result = expr.execute() expected = df.dup_strings.shift(-execute(row_offset)) if default is not ibis.NA: expected = expected.fillna(execute(default)) tm.assert_series_equal(result, expected) @default @row_offset
# Module-level setup followed by two window fixtures.
# - `execute` is an alias for ibis.pandas.execute; `pytestmark` is set to
#   pytest.mark.pandas.
# - The session-scoped `sort_kind` fixture returns 'mergesort'.
# - `default`, `row_offset` and `range_offset` are reusable parametrize
#   markers: defaults of ibis.NA or the literal 'a', literal row offsets of
#   -1, 1 and 0, and interval offsets of one day, two days and minus two days.
# - `row_window` returns ibis.window(following=0, order_by='plain_int64');
#   `range_window` returns the same kind of window ordered by
#   'plain_datetimes_naive'.
# The trailing `@default @row_offset` decorators belong to a definition that
# continues past this view.
execute = ibis.pandas.execute pytestmark = pytest.mark.pandas @pytest.fixture(scope='session') def sort_kind(): return 'mergesort' default = pytest.mark.parametrize('default', [ibis.NA, ibis.literal('a')]) row_offset = pytest.mark.parametrize( 'row_offset', list(map(ibis.literal, [-1, 1, 0]))) range_offset = pytest.mark.parametrize( 'range_offset', [ibis.day(), 2 * ibis.day(), -2 * ibis.day()] ) @pytest.fixture def row_window(): return ibis.window(following=0, order_by='plain_int64') @pytest.fixture def range_window(): return ibis.window(following=0, order_by='plain_datetimes_naive') @default @row_offset