Esempio n. 1
0
def test_window_with_mlb():
    """Check a trailing window limited by row count and max lookback.

    The window keeps at most 5 rows within a 10-day lookback, ordered by
    ``time`` and grouped by ``b``. The expected frame is rebuilt with a
    pandas time-based rolling window whose per-window sum only considers
    the last 5 rows. A non-interval max lookback must raise
    ``IbisInputError``.
    """
    timestamps = pd.date_range('20170501', '20170507')
    values = np.random.randn(len(timestamps), 3)
    frame = pd.DataFrame(values, columns=list('abc'), index=timestamps)
    df = frame.rename_axis('time').reset_index(drop=False)

    t = Backend().connect({'df': df}).table('df')

    bounded_rows = rows_with_max_lookback(5, ibis.interval(days=10))
    window = ibis.trailing_window(bounded_rows, order_by='time', group_by='b')
    result = t.mutate(sum=lambda tbl: tbl.a.sum().over(window)).execute()

    # Reference computation: 10-day rolling window per group, then keep only
    # the trailing 5 rows of each window before summing.
    by_time = df.set_index('time')
    rolling = by_time.groupby(['b'])['a'].rolling('10d', closed='both')
    capped_sums = rolling.apply(lambda s: s.iloc[-5:].sum(), raw=False)
    gb_df = capped_sums.sort_index(level=['time']).reset_index(drop=True)
    expected = by_time.reset_index(drop=False).assign(sum=gb_df)
    tm.assert_frame_equal(result, expected)

    # An integer max lookback (instead of an interval) is rejected.
    invalid_rows = rows_with_max_lookback(5, 10)
    with pytest.raises(com.IbisInputError):
        t.mutate(
            sum=lambda tbl: tbl.a.sum().over(
                ibis.trailing_window(invalid_rows, order_by='time')
            )
        )
Esempio n. 2
0
def test_udaf_groupby():
    """Check ``my_corr`` used as an aggregate under a group-by on ``key``.

    The expected frame holds, for each of the keys 'd', 'e' and 'f', the
    pandas correlation between that group's ``a`` and ``b`` columns.
    """
    col_a = np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist()
    col_b = np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist()
    df = pd.DataFrame({'a': col_a, 'b': col_b, 'key': list('ddeefff')})

    t = Backend().connect({'df': df}).table('df')

    expr = t.groupby(t.key).aggregate(my_corr=my_corr(t.a, t.b))
    assert isinstance(expr, ir.TableExpr)

    result = expr.execute().sort_values('key')

    # Compute the reference correlation one group at a time.
    by_key = df.set_index('key')
    correlations = []
    for value in 'def':
        group_a = by_key.loc[value, 'a']
        group_b = by_key.loc[value, 'b']
        correlations.append(group_a.corr(group_b))
    expected = pd.DataFrame({'key': list('def'), 'my_corr': correlations})

    columns = ['key', 'my_corr']
    tm.assert_frame_equal(result[columns], expected[columns])
Esempio n. 3
0
def test_udaf_window_interval():
    """Check ``my_mean`` over a 2-day trailing range window grouped by key.

    The input rows are in descending time order. Both the result and the
    expected frame are sorted by ``time`` and ``key`` before comparison;
    the expected values come from a pandas '2D' rolling mean per key with
    both window edges closed.
    """
    columns = collections.OrderedDict()
    columns["time"] = pd.date_range(
        start='20190105', end='20190101', freq='-1D'
    )
    columns["key"] = [1, 2, 1, 2, 1]
    columns["value"] = np.arange(5)
    df = pd.DataFrame(columns)

    t = Backend().connect({'df': df}).table('df')
    window = ibis.trailing_range_window(
        ibis.interval(days=2), order_by='time', group_by='key'
    )
    expr = t.mutate(rolled=my_mean(t.value).over(window))

    sort_keys = ['time', 'key']
    result = expr.execute().sort_values(sort_keys).reset_index(drop=True)

    indexed = df.sort_values(sort_keys).set_index('time')
    rolled = (
        indexed.groupby('key')
        .value.rolling('2D', closed='both')
        .mean()
        # Drop the group level so the series aligns on the time index.
        .reset_index(level=0, drop=True)
    )
    expected = indexed.assign(rolled=rolled).reset_index(drop=False)

    tm.assert_frame_equal(result, expected)
Esempio n. 4
0
def test_nullif_inf():
    """Check that chained ``nullif`` calls turn +inf and -inf into NaN."""
    frame = pd.DataFrame({'a': [np.inf, 3.14, -np.inf, 42.0]})
    t = Backend().connect({'t': frame}).table('t')

    without_pos_inf = t.a.nullif(np.inf)
    expr = without_pos_inf.nullif(-np.inf)

    expected = pd.Series([np.nan, 3.14, np.nan, 42.0], name='a')
    tm.assert_series_equal(expr.execute(), expected)
Esempio n. 5
0
def test_project_list_scalar():
    """Check that a multi-quantile result is repeated on every row.

    The quantiles 0.5 and 0.95 of ``ints`` (0, 1, 2) are projected as a new
    column ``res``; each of the three rows is expected to hold the list
    ``[1.0, 1.9]``.
    """
    frame = pd.DataFrame({'ints': range(3)})
    table = Backend().connect({'df': frame}).table('df')

    quantiles = table.ints.quantile([0.5, 0.95])
    result = table.mutate(res=quantiles).execute()

    expected = pd.Series([[1.0, 1.9] for _ in range(3)], name='res')
    tm.assert_series_equal(result.res, expected)
Esempio n. 6
0
def test_interval_arithmetic(op, expected):
    """Check a binary operation applied to a timedelta column and itself.

    ``op`` is applied to the ibis column expression, and ``expected`` is
    applied to the raw pandas timedelta data to build the reference series.
    """
    data = pd.timedelta_range('0 days', '10 days', freq='D')
    tables = {
        'df1': pd.DataFrame({'td': data}),
        'df2': pd.DataFrame({'td': data}),
    }
    t1 = Backend().connect(tables).table('df1')

    result = op(t1.td, t1.td).execute()

    expected_series = pd.Series(expected(data, data), name='td')
    tm.assert_series_equal(result, expected_series)
Esempio n. 7
0
def test_multiple_argument_udaf_window():
    """Check a two-argument reduction UDF over trailing windows (PR 2035).

    The weights column ``d`` is all ones, so the weighted mean is compared
    against a plain pandas rolling mean. Two windows are used: one ordered
    by ``a`` with 2 preceding rows, one ordered by ``b`` with 1 preceding
    row; both are grouped by ``key``.
    """

    @udf.reduction(['double', 'double'], 'double')
    def my_wm(v, w):
        return np.average(v, weights=w)

    def grouped_rolling_mean(frame, column, size):
        # Rolling mean of ``column`` within each key, re-aligned to the
        # original row index by dropping the group level.
        return (
            frame.groupby('key')[column]
            .rolling(size, min_periods=1)
            .mean()
            .reset_index(level=0, drop=True)
        )

    col_a = (
        np.arange(4, 0, dtype=float, step=-1).tolist()
        + np.random.rand(3).tolist()
    )
    col_b = np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist()
    col_c = np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist()
    df = pd.DataFrame(
        {
            'a': col_a,
            'b': col_b,
            'c': col_c,
            'd': np.repeat(1, 7),
            'key': list('deefefd'),
        }
    )

    t = Backend().connect({'df': df}).table('df')
    window = ibis.trailing_window(2, order_by='a', group_by='key')
    window2 = ibis.trailing_window(1, order_by='b', group_by='key')
    expr = t.mutate(
        wm_b=my_wm(t.b, t.d).over(window),
        wm_c=my_wm(t.c, t.d).over(window),
        wm_c2=my_wm(t.c, t.d).over(window2),
    )
    result = expr.execute().sort_values(['key', 'a'])

    # Columns computed over the ``a``-ordered window (3 rows wide).
    by_a = df.sort_values(['key', 'a'])
    expected = by_a.assign(wm_b=grouped_rolling_mean(by_a, 'b', 3))
    expected = expected.assign(wm_c=grouped_rolling_mean(expected, 'c', 3))

    # Column computed over the ``b``-ordered window (2 rows wide), then the
    # frame is put back in the same order as ``result``.
    by_b = expected.sort_values(['key', 'b'])
    expected = by_b.assign(wm_c2=grouped_rolling_mean(by_b, 'c', 2))
    expected = expected.sort_values(['key', 'a'])

    tm.assert_frame_equal(result, expected)
Esempio n. 8
0
def test_select_on_unambiguous_join(how, func):
    """Check selecting non-overlapping columns from a ``how`` join.

    ``how`` names the join type (used both for the ``<how>_join`` table
    method and for ``pd.merge``); ``func`` builds the selection expression
    from the join. The expected frame is the pandas merge on ``b1 == b2``
    restricted to columns ``a0`` and ``a1``.
    """
    left = pd.DataFrame({'a0': [1, 2, 3], 'b1': list("aab")})
    right = pd.DataFrame({'a1': [2, 3, 4], 'b2': list("abc")})
    con = Backend().connect({"t": left, "s": right})
    t, s = con.table("t"), con.table("s")

    join = getattr(t, f"{how}_join")(s, t.b1 == s.b2)

    merged = pd.merge(left, right, left_on=["b1"], right_on=["b2"], how=how)
    expected = merged[["a0", "a1"]]
    # Guard against a vacuous comparison of two empty frames.
    assert not expected.empty

    result = func(join).execute()
    tm.assert_frame_equal(result, expected)
Esempio n. 9
0
def test_udaf_window_nan():
    """Check ``my_mean`` over a trailing window when the input has NaNs.

    Column ``b`` alternates between 3.0 and NaN. The window covers the
    current row plus 2 preceding rows, ordered by ``a`` and grouped by
    ``key``. The expected values come from a pandas rolling window of
    size 3 whose mean is computed on the raw ndarray for each window.
    """
    df = pd.DataFrame({
        'a': np.arange(10, dtype=float),
        # ``np.nan`` rather than the ``np.NaN`` alias, which was removed in
        # NumPy 2.0.
        'b': [3.0, np.nan] * 5,
        'key': list('ddeefffggh'),
    })
    con = Backend().connect({'df': df})
    t = con.table('df')
    window = ibis.trailing_window(2, order_by='a', group_by='key')
    expr = t.mutate(rolled=my_mean(t.b).over(window))
    result = expr.execute().sort_values(['key', 'a'])
    expected = df.sort_values(['key', 'a']).assign(
        rolled=lambda d: d.groupby('key').b.rolling(3, min_periods=1).apply(
            lambda x: x.mean(), raw=True).reset_index(level=0, drop=True))
    tm.assert_frame_equal(result, expected)
Esempio n. 10
0
def test_select_on_unambiguous_asof_join(func):
    """Check selecting non-overlapping columns from an as-of join.

    ``func`` builds the selection expression from the join. The expected
    frame is ``pd.merge_asof`` on ``b1``/``b2`` restricted to columns
    ``a0`` and ``a1``.
    """
    left = pd.DataFrame(
        {'a0': [1, 2, 3], 'b1': pd.date_range("20180101", periods=3)}
    )
    right = pd.DataFrame(
        {'a1': [2, 3, 4], 'b2': pd.date_range("20171230", periods=3)}
    )
    con = Backend().connect({"t": left, "s": right})
    t, s = con.table("t"), con.table("s")

    join = t.asof_join(s, t.b1 == s.b2)

    merged = pd.merge_asof(left, right, left_on=["b1"], right_on=["b2"])
    expected = merged[["a0", "a1"]]
    # Guard against a vacuous comparison of two empty frames.
    assert not expected.empty

    result = func(join).execute()
    tm.assert_frame_equal(result, expected)
Esempio n. 11
0
def test_window_with_preceding_expr(index):
    """Check a trailing window whose size is an interval expression.

    ``index`` is a callable that builds the DataFrame index from the time
    values. The mean of ``value`` over a trailing ``3 * interval(days=1)``
    window ordered by ``time`` is compared with a pandas '3d' rolling mean
    with both window edges closed.
    """
    time = pd.date_range('20180101', '20180110')
    first_value = 2
    values = np.arange(first_value, first_value + len(time))
    df = pd.DataFrame({'value': values, 'time': time}, index=index(time))

    t = Backend().connect({'df': df}).table('df')

    rolling = df.set_index('time').value.rolling('3d', closed='both')
    expected = rolling.mean().reset_index(drop=True)
    expected.index.name = None

    day = ibis.interval(days=1)
    window = ibis.trailing_window(3 * day, order_by=t.time)
    result = t.value.mean().over(window).execute()

    tm.assert_series_equal(result, expected)
Esempio n. 12
0
def test_window_has_pre_execute_scope():
    """Check how often ``pre_execute`` runs for a windowed ``Lag``.

    A counting ``pre_execute`` handler is registered for
    ``(ops.Lag, Backend)`` and a lag-over-window expression is executed;
    the handler is expected to have been called exactly three times.
    """
    calls = 0

    @pre_execute.register(ops.Lag, Backend)
    def test_pre_execute(op, client, **kwargs):
        nonlocal calls
        calls += 1
        return Scope()

    df = pd.DataFrame(
        {'key': list('abc'), 'value': [1, 2, 3], 'dup': list('ggh')},
        columns=['key', 'value', 'dup'],
    )
    t = Backend().connect({'df': df}).table('df')

    lagged = t.key.lag(1).over(ibis.window(order_by='value'))
    result = lagged.name('foo').execute()
    assert result is not None

    # Expected call count:
    #   - once in the window op at the top, to pick up any scope changes
    #     before computing;
    #   - twice in the window op when calling execute on the ops.Lag node:
    #     at the beginning of execute and once more before the actual
    #     computation.
    assert calls == 3