def test_window_with_mlb():
    """Sum over a trailing window built from ``rows_with_max_lookback``.

    The ibis result for ``rows_with_max_lookback(5, interval(days=10))``
    (ordered by ``time``, grouped by ``b``) is compared against a pandas
    ``rolling('10d', closed='both')`` that sums the last five rows of each
    window.  A second call with a plain integer (``10``) as the lookback is
    expected to raise ``com.IbisInputError``.
    """
    dates = pd.date_range('20170501', '20170507')
    values = np.random.randn(len(dates), 3)
    frame = pd.DataFrame(values, columns=list('abc'), index=dates)
    frame = frame.rename_axis('time').reset_index(drop=False)

    table = Backend().connect({'df': frame}).table('df')

    rows_with_mlb = rows_with_max_lookback(5, ibis.interval(days=10))
    # The window is created inside the lambda, i.e. only once mutate()
    # resolves the callable against the table.
    windowed = table.mutate(
        sum=lambda tbl: tbl.a.sum().over(
            ibis.trailing_window(
                rows_with_mlb, order_by='time', group_by='b'
            )
        )
    )
    result = windowed.execute()

    indexed = frame.set_index('time')
    rolled = (
        indexed.groupby(['b'])['a']
        .rolling('10d', closed='both')
        .apply(lambda s: s.iloc[-5:].sum(), raw=False)
        .sort_index(level=['time'])
        .reset_index(drop=True)
    )
    expected = indexed.reset_index(drop=False).assign(sum=rolled)
    tm.assert_frame_equal(result, expected)

    # A non-interval lookback must be rejected.
    rows_with_mlb = rows_with_max_lookback(5, 10)
    with pytest.raises(com.IbisInputError):
        table.mutate(
            sum=lambda tbl: tbl.a.sum().over(
                ibis.trailing_window(rows_with_mlb, order_by='time')
            )
        )
def test_udaf_window_interval():
    """``my_mean`` over a 2-day trailing range window grouped by ``key``.

    The ibis result is compared against a pandas per-key
    ``rolling('2D', closed='both').mean()`` on the time-indexed frame.
    """
    timestamps = pd.date_range(start='20190105', end='20190101', freq='-1D')
    df = pd.DataFrame(
        collections.OrderedDict(
            [
                ("time", timestamps),
                ("key", [1, 2, 1, 2, 1]),
                ("value", np.arange(5)),
            ]
        )
    )

    table = Backend().connect({'df': df}).table('df')
    window = ibis.trailing_range_window(
        ibis.interval(days=2), order_by='time', group_by='key'
    )
    expr = table.mutate(rolled=my_mean(table.value).over(window))

    result = expr.execute()
    result = result.sort_values(['time', 'key']).reset_index(drop=True)

    # Build the expected frame step by step on the time-indexed data.
    indexed = df.sort_values(['time', 'key']).set_index('time')
    rolled = (
        indexed.groupby('key')
        .value.rolling('2D', closed='both')
        .mean()
        .reset_index(level=0, drop=True)
    )
    expected = indexed.assign(rolled=rolled).reset_index(drop=False)

    tm.assert_frame_equal(result, expected)
def test_udaf_groupby():
    """Group-by aggregation with ``my_corr`` matches per-key ``Series.corr``."""
    # Column 'a' draws its random tail before column 'b', as dict values are
    # evaluated in order.
    df = pd.DataFrame(
        {
            'a': np.arange(4, dtype=float).tolist()
            + np.random.rand(3).tolist(),
            'b': np.arange(4, dtype=float).tolist()
            + np.random.rand(3).tolist(),
            'key': list('ddeefff'),
        }
    )
    table = Backend().connect({'df': df}).table('df')

    expr = table.groupby(table.key).aggregate(
        my_corr=my_corr(table.a, table.b)
    )
    assert isinstance(expr, ir.TableExpr)

    result = expr.execute().sort_values('key')

    by_key = df.set_index('key')
    correlations = []
    for group in 'def':
        correlations.append(
            by_key.loc[group, 'a'].corr(by_key.loc[group, 'b'])
        )
    expected = pd.DataFrame({'key': list('def'), 'my_corr': correlations})

    columns = ['key', 'my_corr']
    tm.assert_frame_equal(result[columns], expected[columns])
def test_nullif_inf():
    """Chained ``nullif`` on +inf and -inf yields NaN at those positions."""
    frame = pd.DataFrame({'a': [np.inf, 3.14, -np.inf, 42.0]})
    table = Backend().connect({'t': frame}).table('t')

    no_pos_inf = table.a.nullif(np.inf)
    expr = no_pos_inf.nullif(-np.inf)
    result = expr.execute()

    expected = pd.Series([np.nan, 3.14, np.nan, 42.0], name='a')
    tm.assert_series_equal(result, expected)
def test_project_list_scalar():
    """A list-valued quantile added via ``mutate`` repeats on every row."""
    frame = pd.DataFrame({'ints': range(3)})
    table = Backend().connect({'df': frame}).table('df')

    quantiles = table.ints.quantile([0.5, 0.95])
    result = table.mutate(res=quantiles).execute()

    # One independent [1.0, 1.9] list per input row.
    expected = pd.Series([[1.0, 1.9] for _ in range(3)], name='res')
    tm.assert_series_equal(result.res, expected)
def test_interval_arithmetic(op, expected):
    """Apply ``op`` to a timedelta column with itself and compare to pandas.

    ``op`` is called with the ``td`` column of table ``df1`` on both sides;
    ``expected`` is called with the raw timedelta range on both sides and its
    result is wrapped in a Series named ``'td'``.  A second table ``df2`` is
    registered with the same data but is not referenced by the expression.
    """
    deltas = pd.timedelta_range('0 days', '10 days', freq='D')
    tables = {
        'df1': pd.DataFrame({'td': deltas}),
        'df2': pd.DataFrame({'td': deltas}),
    }
    con = Backend().connect(tables)
    t1 = con.table('df1')

    result = op(t1.td, t1.td).execute()

    expected_series = pd.Series(expected(deltas, deltas), name='td')
    tm.assert_series_equal(result, expected_series)
def test_multiple_argument_udaf_window():
    """Two-argument reduction UDF evaluated over several trailing windows.

    The weight column ``d`` is all ones, so the weighted average computed by
    the UDF is compared against a plain pandas rolling mean.
    """
    # PR 2035

    @udf.reduction(['double', 'double'], 'double')
    def my_wm(v, w):
        return np.average(v, weights=w)

    df = pd.DataFrame(
        {
            'a': np.arange(4, 0, dtype=float, step=-1).tolist()
            + np.random.rand(3).tolist(),
            'b': np.arange(4, dtype=float).tolist()
            + np.random.rand(3).tolist(),
            'c': np.arange(4, dtype=float).tolist()
            + np.random.rand(3).tolist(),
            'd': np.repeat(1, 7),
            'key': list('deefefd'),
        }
    )
    table = Backend().connect({'df': df}).table('df')

    window_by_a = ibis.trailing_window(2, order_by='a', group_by='key')
    window_by_b = ibis.trailing_window(1, order_by='b', group_by='key')
    expr = table.mutate(
        wm_b=my_wm(table.b, table.d).over(window_by_a),
        wm_c=my_wm(table.c, table.d).over(window_by_a),
        wm_c2=my_wm(table.c, table.d).over(window_by_b),
    )
    result = expr.execute().sort_values(['key', 'a'])

    def rolling_group_mean(frame, column, periods):
        # Per-key rolling mean of `column`, re-aligned to the frame's index.
        grouped = frame.groupby('key')[column]
        rolled = grouped.rolling(periods, min_periods=1).mean()
        return rolled.reset_index(level=0, drop=True)

    by_a = df.sort_values(['key', 'a'])
    by_a = by_a.assign(wm_b=rolling_group_mean(by_a, 'b', 3))
    by_a = by_a.assign(wm_c=rolling_group_mean(by_a, 'c', 3))

    # wm_c2 is ordered by 'b', so compute it on a frame sorted that way.
    by_b = by_a.sort_values(['key', 'b'])
    by_b = by_b.assign(wm_c2=rolling_group_mean(by_b, 'c', 2))

    expected = by_b.sort_values(['key', 'a'])
    tm.assert_frame_equal(result, expected)
def test_select_on_unambiguous_join(how, func):
    """Selecting ``a0``/``a1`` from a ``{how}_join`` matches ``pd.merge``.

    ``func`` receives the join expression and returns the expression to
    execute; the result is compared to the ``["a0", "a1"]`` columns of the
    equivalent pandas merge.
    """
    df_t = pd.DataFrame({'a0': [1, 2, 3], 'b1': list("aab")})
    df_s = pd.DataFrame({'a1': [2, 3, 4], 'b2': list("abc")})
    con = Backend().connect({"t": df_t, "s": df_s})
    t, s = con.table("t"), con.table("s")

    join_method = getattr(t, f"{how}_join")
    join = join_method(s, t.b1 == s.b2)

    merged = pd.merge(df_t, df_s, left_on=["b1"], right_on=["b2"], how=how)
    expected = merged[["a0", "a1"]]
    # Guard against a vacuous comparison of two empty frames.
    assert not expected.empty

    result = func(join).execute()
    tm.assert_frame_equal(result, expected)
def test_udaf_window_nan():
    """``my_mean`` over a trailing window when the input column has NaNs.

    Column ``b`` alternates between ``3.0`` and NaN.  The ibis result for
    ``trailing_window(2, order_by='a', group_by='key')`` is compared against
    a pandas per-key ``rolling(3, min_periods=1)`` that applies ``mean`` to
    the raw window values.
    """
    df = pd.DataFrame(
        {
            'a': np.arange(10, dtype=float),
            # np.nan is the canonical spelling; the np.NaN alias was removed
            # in NumPy 2.0.
            'b': [3.0, np.nan] * 5,
            'key': list('ddeefffggh'),
        }
    )
    con = Backend().connect({'df': df})
    t = con.table('df')

    window = ibis.trailing_window(2, order_by='a', group_by='key')
    expr = t.mutate(rolled=my_mean(t.b).over(window))
    result = expr.execute().sort_values(['key', 'a'])

    expected = df.sort_values(['key', 'a']).assign(
        rolled=lambda d: d.groupby('key')
        .b.rolling(3, min_periods=1)
        .apply(lambda x: x.mean(), raw=True)
        .reset_index(level=0, drop=True)
    )
    tm.assert_frame_equal(result, expected)
def client(
    df,
    df1,
    df2,
    df3,
    time_df1,
    time_df2,
    time_df3,
    time_keyed_df1,
    time_keyed_df2,
    intersect_df2,
):
    """Connect a ``Backend`` to all of the given frames, keyed by name.

    ``df1`` and ``df2`` are additionally registered under the aliases
    ``'left'`` and ``'right'``.
    """
    tables = {
        'df': df,
        'df1': df1,
        'df2': df2,
        'df3': df3,
        # Aliases for the join tests.
        'left': df1,
        'right': df2,
        'time_df1': time_df1,
        'time_df2': time_df2,
        'time_df3': time_df3,
        'time_keyed_df1': time_keyed_df1,
        'time_keyed_df2': time_keyed_df2,
        'intersect_df2': intersect_df2,
    }
    return Backend().connect(tables)
def test_mutate_with_window_after_join(sort_kind):
    """Grouped ``mutate`` with a sum on a projection of an outer join.

    ``sort_kind`` is forwarded to ``Series.sort_values(kind=...)`` when
    building the expected ``dates`` column.
    """
    left_df = pd.DataFrame(
        {
            'ints': [0, 1, 2],
            'strings': ['a', 'b', 'c'],
            'dates': pd.date_range('20170101', periods=3),
        }
    )
    right_df = pd.DataFrame(
        {
            'group': [0, 1, 2] * 3,
            'value': [0, 1, np.nan, 3, 4, np.nan, 6, 7, 8],
        }
    )
    con = Backend().connect({'left': left_df, 'right': right_df})
    left = con.table('left')
    right = con.table('right')

    joined = left.outer_join(right, left.ints == right.group)
    proj = joined[left, right.value]
    expr = proj.groupby('ints').mutate(sum=proj.value.sum())
    result = expr.execute()

    # Each left row matches three right rows, so every date appears 3 times.
    tripled_dates = pd.concat([left_df.dates] * 3)
    expected_dates = tripled_dates.sort_values(kind=sort_kind).reset_index(
        drop=True
    )
    expected = pd.DataFrame(
        {
            'dates': expected_dates,
            'ints': [0] * 3 + [1] * 3 + [2] * 3,
            'strings': ['a'] * 3 + ['b'] * 3 + ['c'] * 3,
            'value': [0.0, 3.0, 6.0, 1.0, 4.0, 7.0, np.nan, np.nan, 8.0],
            'sum': [9.0] * 3 + [12.0] * 3 + [8.0] * 3,
        }
    )
    tm.assert_frame_equal(result[expected.columns], expected)
def test_window_with_preceding_expr(index):
    """Mean over a trailing window whose size is an interval expression.

    ``index`` is called with the date range to produce the index of the
    input frame.  The window ``3 * interval(days=1)`` ordered by ``time`` is
    compared against pandas ``rolling('3d', closed='both').mean()``.
    """
    time = pd.date_range('20180101', '20180110')
    first_value = 2
    values = np.arange(first_value, first_value + len(time))
    frame = pd.DataFrame({'value': values, 'time': time}, index=index(time))

    table = Backend().connect({'df': frame}).table('df')

    rolled = frame.set_index('time').value.rolling('3d', closed='both')
    expected = rolled.mean().reset_index(drop=True)
    expected.index.name = None

    one_day = ibis.interval(days=1)
    window = ibis.trailing_window(3 * one_day, order_by=table.time)
    result = table.value.mean().over(window).execute()

    tm.assert_series_equal(result, expected)
def test_select_on_unambiguous_asof_join(func):
    """Selecting ``a0``/``a1`` from an as-of join matches ``pd.merge_asof``.

    ``func`` receives the join expression and returns the expression to
    execute.
    """
    df_t = pd.DataFrame(
        {'a0': [1, 2, 3], 'b1': pd.date_range("20180101", periods=3)}
    )
    df_s = pd.DataFrame(
        {'a1': [2, 3, 4], 'b2': pd.date_range("20171230", periods=3)}
    )
    con = Backend().connect({"t": df_t, "s": df_s})
    t, s = con.table("t"), con.table("s")

    join = t.asof_join(s, t.b1 == s.b2)

    merged = pd.merge_asof(df_t, df_s, left_on=["b1"], right_on=["b2"])
    expected = merged[["a0", "a1"]]
    # Guard against a vacuous comparison of two empty frames.
    assert not expected.empty

    result = func(join).execute()
    tm.assert_frame_equal(result, expected)
def struct_client(value):
    """Connect a ``Backend`` to table ``t`` holding a struct-like column.

    Column ``s`` contains two fixed ``OrderedDict`` rows with ``value``
    placed between them; ``key`` and ``value`` are plain columns.
    """
    struct_rows = [
        OrderedDict([("fruit", "apple"), ("weight", None)]),
        value,
        OrderedDict([("fruit", "pear"), ("weight", 1)]),
    ]
    frame = pd.DataFrame(
        {
            "s": struct_rows,
            "key": list("aab"),
            "value": [1, 2, 3],
        }
    )
    return Backend().connect({"t": frame})
def test_window_has_pre_execute_scope():
    """Count how often a ``pre_execute`` rule for ``ops.Lag`` is invoked.

    A rule is registered for ``(ops.Lag, Backend)`` that increments a counter
    and returns an empty ``Scope``; executing a windowed ``lag`` expression
    is expected to trigger it three times.

    NOTE(review): the rule registered here is not unregistered when the test
    finishes -- confirm it cannot leak into other tests.
    """
    signature = ops.Lag, Backend
    call_count = 0

    @pre_execute.register(*signature)
    def test_pre_execute(op, client, **kwargs):
        nonlocal call_count
        call_count += 1
        return Scope()

    frame = pd.DataFrame(
        {'key': list('abc'), 'value': [1, 2, 3], 'dup': list('ggh')},
        columns=['key', 'value', 'dup'],
    )
    table = Backend().connect({'df': frame}).table('df')

    window = ibis.window(order_by='value')
    expr = table.key.lag(1).over(window).name('foo')
    result = expr.execute()
    assert result is not None

    # once in window op at the top to pickup any scope changes before computing
    # twice in window op when calling execute on the ops.Lag node at the
    # beginning of execute and once before the actual computation
    assert call_count == 3
def test_from_dataframe(dataframe, ibis_table, core_client):
    """``from_dataframe`` yields a table that executes like ``ibis_table``.

    Three call forms are checked: no extra arguments, an explicit ``name``,
    and an explicit ``name`` together with an existing ``client``.  Each
    resulting table is executed and compared with ``ibis_table.execute()``.
    """
    # Default name and client.
    t = Backend().from_dataframe(dataframe)
    result = t.execute()
    expected = ibis_table.execute()
    tm.assert_frame_equal(result, expected)

    # Explicit name.  The table must be executed again here: otherwise the
    # assertion would only re-check the result of the first call.
    t = Backend().from_dataframe(dataframe, name='foo')
    result = t.execute()
    expected = ibis_table.execute()
    tm.assert_frame_equal(result, expected)

    # Explicit name and a pre-existing client.
    client = core_client
    t = Backend().from_dataframe(dataframe, name='foo', client=client)
    result = t.execute()
    expected = ibis_table.execute()
    tm.assert_frame_equal(result, expected)
def test_project_scalar_after_join():
    """Project a scalar sum and a literal on top of an outer-join projection.

    Both scalars are expected to repeat on each of the nine joined rows.
    """
    left_df = pd.DataFrame({'ints': range(3)})
    right_df = pd.DataFrame(
        {
            'group': [0, 1, 2] * 3,
            'value': [0, 1, np.nan, 3, 4, np.nan, 6, 7, 8],
        }
    )
    con = Backend().connect({'left': left_df, 'right': right_df})
    left = con.table('left')
    right = con.table('right')

    joined = left.outer_join(right, left.ints == right.group)
    proj = joined[left, right.value]

    total = proj.value.sum().name('sum')
    const = ibis.literal(1).name('const')
    expr = proj[total, const]
    result = expr.execute()

    expected = pd.DataFrame({'sum': [29.0] * 9, 'const': [1] * 9})
    tm.assert_frame_equal(result[expected.columns], expected)
def test_mutate_scalar_with_window_after_join():
    """``mutate`` with a scalar sum and a constant after an outer join.

    The sum and the constant are expected to repeat on each of the nine
    joined rows alongside the original ``ints`` and ``value`` columns.
    """
    left_df = pd.DataFrame({'ints': range(3)})
    right_df = pd.DataFrame(
        {
            'group': [0, 1, 2] * 3,
            'value': [0, 1, np.nan, 3, 4, np.nan, 6, 7, 8],
        }
    )
    con = Backend().connect({'left': left_df, 'right': right_df})
    left = con.table('left')
    right = con.table('right')

    joined = left.outer_join(right, left.ints == right.group)
    proj = joined[left, right.value]
    result = proj.mutate(sum=proj.value.sum(), const=1).execute()

    expected = pd.DataFrame(
        {
            'ints': [0] * 3 + [1] * 3 + [2] * 3,
            'value': [0.0, 3.0, 6.0, 1.0, 4.0, 7.0, np.nan, np.nan, 8.0],
            'sum': [29.0] * 9,
            'const': [1] * 9,
        }
    )
    tm.assert_frame_equal(result[expected.columns], expected)
def con(df, df2):
    """Connect a ``Backend`` to ``df`` and ``df2`` under those same names."""
    tables = {'df': df, 'df2': df2}
    return Backend().connect(tables)
def lahman(batting_df, awards_players_df):
    """Connect a ``Backend`` exposing ``batting`` and ``awards_players``."""
    tables = {
        'batting': batting_df,
        'awards_players': awards_players_df,
    }
    return Backend().connect(tables)
def core_client(dataframe):
    """Connect a ``Backend`` with ``dataframe`` registered as table ``df``."""
    tables = {'df': dataframe}
    return Backend().connect(tables)