Ejemplo n.º 1
0
def test_window_with_mlb():
    """Check trailing windows limited by a row count plus a max lookback.

    The first part compares a grouped, time-ordered trailing sum with an
    equivalent pandas rolling computation.  The second part checks that a
    max lookback given as a plain integer raises ``IbisInputError``.
    """
    dates = pd.date_range('20170501', '20170507')
    values = np.random.randn(len(dates), 3)
    frame = pd.DataFrame(values, columns=list('abc'), index=dates)
    df = frame.rename_axis('time').reset_index(drop=False)

    client = Backend().connect({'df': df})
    t = client.table('df')

    # 5 rows, with an interval of 10 days as the max lookback.
    rows_with_mlb = rows_with_max_lookback(5, ibis.interval(days=10))
    window = ibis.trailing_window(
        rows_with_mlb, order_by='time', group_by='b'
    )
    expr = t.mutate(sum=lambda tbl: tbl.a.sum().over(window))
    result = expr.execute()

    # pandas reference: 10-day rolling window, keeping only the last 5 rows.
    indexed = df.set_index('time')
    rolled = (
        indexed.groupby(['b'])['a']
        .rolling('10d', closed='both')
        .apply(lambda s: s.iloc[-5:].sum(), raw=False)
    )
    gb_df = rolled.sort_index(level=['time']).reset_index(drop=True)
    expected = indexed.reset_index(drop=False).assign(sum=gb_df)
    tm.assert_frame_equal(result, expected)

    # An integer max lookback must be rejected.
    rows_with_mlb = rows_with_max_lookback(5, 10)
    with pytest.raises(com.IbisInputError):
        t.mutate(
            sum=lambda tbl: tbl.a.sum().over(
                ibis.trailing_window(rows_with_mlb, order_by='time')
            )
        )
Ejemplo n.º 2
0
def test_udaf_window_interval():
    """Compare a UDAF over a 2-day trailing range window with pandas rolling.

    The window is ordered by ``time`` and grouped by ``key``; the pandas
    reference uses a ``'2D'`` rolling mean closed on both ends.
    """
    columns = collections.OrderedDict()
    columns["time"] = pd.date_range(
        start='20190105', end='20190101', freq='-1D'
    )
    columns["key"] = [1, 2, 1, 2, 1]
    columns["value"] = np.arange(5)
    df = pd.DataFrame(columns)

    con = Backend().connect({'df': df})
    t = con.table('df')
    window = ibis.trailing_range_window(
        ibis.interval(days=2), order_by='time', group_by='key'
    )
    expr = t.mutate(rolled=my_mean(t.value).over(window))

    sort_keys = ['time', 'key']
    result = expr.execute().sort_values(sort_keys).reset_index(drop=True)

    def rolling_mean(frame):
        # Per-key rolling mean, with the group level dropped from the index
        # so it aligns with the time-indexed frame.
        rolled = frame.groupby('key').value.rolling('2D', closed='both').mean()
        return rolled.reset_index(level=0, drop=True)

    expected = (
        df.sort_values(sort_keys)
        .set_index('time')
        .assign(rolled=rolling_mean)
        .reset_index(drop=False)
    )

    tm.assert_frame_equal(result, expected)
Ejemplo n.º 3
0
def test_udaf_groupby():
    """Check a two-argument UDAF in a grouped aggregation against ``Series.corr``."""
    fixed = np.arange(4, dtype=float).tolist()
    df = pd.DataFrame({
        'a': fixed + np.random.rand(3).tolist(),
        'b': fixed + np.random.rand(3).tolist(),
        'key': list('ddeefff'),
    })
    con = Backend().connect({'df': df})
    t = con.table('df')

    expr = t.groupby(t.key).aggregate(my_corr=my_corr(t.a, t.b))
    assert isinstance(expr, ir.TableExpr)

    result = expr.execute().sort_values('key')

    # Reference: per-key correlation of columns ``a`` and ``b`` in pandas.
    keys = list('def')
    by_key = df.set_index('key')
    correlations = [
        by_key.loc[k, 'a'].corr(by_key.loc[k, 'b']) for k in keys
    ]
    expected = pd.DataFrame({'key': keys, 'my_corr': correlations})

    columns = ['key', 'my_corr']
    tm.assert_frame_equal(result[columns], expected[columns])
Ejemplo n.º 4
0
def test_nullif_inf():
    """``nullif`` on both infinities yields NaN and keeps the finite values."""
    values = [np.inf, 3.14, -np.inf, 42.0]
    con = Backend().connect({'t': pd.DataFrame({'a': values})})
    column = con.table('t').a

    without_pos_inf = column.nullif(np.inf)
    expr = without_pos_inf.nullif(-np.inf)

    expected = pd.Series([np.nan, 3.14, np.nan, 42.0], name='a')
    tm.assert_series_equal(expr.execute(), expected)
Ejemplo n.º 5
0
def test_project_list_scalar():
    """A list-valued quantile added via ``mutate`` appears on every row."""
    n_rows = 3
    con = Backend().connect({'df': pd.DataFrame({'ints': range(n_rows)})})
    expr = con.table('df')

    quantiles = expr.ints.quantile([0.5, 0.95])
    result = expr.mutate(res=quantiles).execute()

    expected = pd.Series([[1.0, 1.9] for _ in range(n_rows)], name='res')
    tm.assert_series_equal(result.res, expected)
Ejemplo n.º 6
0
def test_interval_arithmetic(op, expected):
    """Apply ``op`` to a timedelta column with itself and compare to pandas.

    ``op`` is called with two ibis column expressions.  ``expected`` is
    called with the raw timedelta data twice, and its result is wrapped in
    a Series named ``td``.
    """
    data = pd.timedelta_range('0 days', '10 days', freq='D')
    tables = {name: pd.DataFrame({'td': data}) for name in ('df1', 'df2')}
    con = Backend().connect(tables)

    td = con.table('df1').td
    result = op(td, td).execute()

    expected_series = pd.Series(expected(data, data), name='td')
    tm.assert_series_equal(result, expected_series)
Ejemplo n.º 7
0
def test_multiple_argument_udaf_window():
    """Check a two-argument reduction UDF over trailing windows (PR 2035).

    The weight column ``d`` is all ones, so the weighted average is compared
    against a plain pandas rolling mean.
    """

    @udf.reduction(['double', 'double'], 'double')
    def my_wm(v, w):
        return np.average(v, weights=w)

    def group_rolling_mean(column, size):
        """Build an ``assign`` callable: per-key rolling mean of ``column``."""

        def compute(frame):
            rolled = (
                frame.groupby('key')[column]
                .rolling(size, min_periods=1)
                .mean()
            )
            # Drop the group level so the result aligns with ``frame``.
            return rolled.reset_index(level=0, drop=True)

        return compute

    df = pd.DataFrame(
        {
            'a': np.arange(4, 0, dtype=float, step=-1).tolist()
            + np.random.rand(3).tolist(),
            'b': np.arange(4, dtype=float).tolist()
            + np.random.rand(3).tolist(),
            'c': np.arange(4, dtype=float).tolist()
            + np.random.rand(3).tolist(),
            'd': np.repeat(1, 7),
            'key': list('deefefd'),
        }
    )
    con = Backend().connect({'df': df})
    t = con.table('df')

    window = ibis.trailing_window(2, order_by='a', group_by='key')
    window2 = ibis.trailing_window(1, order_by='b', group_by='key')
    expr = t.mutate(
        wm_b=my_wm(t.b, t.d).over(window),
        wm_c=my_wm(t.c, t.d).over(window),
        wm_c2=my_wm(t.c, t.d).over(window2),
    )

    order_a = ['key', 'a']
    order_b = ['key', 'b']
    result = expr.execute().sort_values(order_a)

    # trailing_window(2) is compared against rolling(3), and
    # trailing_window(1) against rolling(2).
    expected = df.sort_values(order_a)
    expected = expected.assign(wm_b=group_rolling_mean('b', 3))
    expected = expected.assign(wm_c=group_rolling_mean('c', 3))
    # ``wm_c2`` is ordered by ``b``, so re-sort before rolling ...
    expected = expected.sort_values(order_b)
    expected = expected.assign(wm_c2=group_rolling_mean('c', 2))
    # ... and restore the ordering used for ``result``.
    expected = expected.sort_values(order_a)

    tm.assert_frame_equal(result, expected)
Ejemplo n.º 8
0
def test_select_on_unambiguous_join(how, func):
    """Select columns from a join whose column names do not collide.

    ``how`` picks the ibis join method (``<how>_join``) and is also passed to
    ``pd.merge``.  ``func`` turns the join expression into the expression
    under test, whose result must equal the merged ``a0``/``a1`` columns.
    """
    df_t = pd.DataFrame({'a0': [1, 2, 3], 'b1': list("aab")})
    df_s = pd.DataFrame({'a1': [2, 3, 4], 'b2': list("abc")})
    con = Backend().connect({"t": df_t, "s": df_s})
    t = con.table("t")
    s = con.table("s")

    join = getattr(t, f"{how}_join")(s, t.b1 == s.b2)

    merged = pd.merge(
        df_t, df_s, left_on=["b1"], right_on=["b2"], how=how
    )
    expected = merged[["a0", "a1"]]
    # Guard against a vacuous comparison of two empty frames.
    assert not expected.empty

    result = func(join).execute()
    tm.assert_frame_equal(result, expected)
Ejemplo n.º 9
0
def test_udaf_window_nan():
    """Check a UDAF over a trailing window when the input contains NaNs.

    Column ``b`` alternates between ``3.0`` and NaN.  The ibis result for a
    ``trailing_window(2)`` grouped by ``key`` and ordered by ``a`` is compared
    with a pandas ``rolling(3, min_periods=1)`` mean computed on raw arrays.
    """
    df = pd.DataFrame({
        'a': np.arange(10, dtype=float),
        # Use ``np.nan``: the ``np.NaN`` alias was removed in NumPy 2.0.
        'b': [3.0, np.nan] * 5,
        'key': list('ddeefffggh'),
    })
    con = Backend().connect({'df': df})
    t = con.table('df')
    window = ibis.trailing_window(2, order_by='a', group_by='key')
    expr = t.mutate(rolled=my_mean(t.b).over(window))
    result = expr.execute().sort_values(['key', 'a'])
    expected = df.sort_values(['key', 'a']).assign(
        # ``raw=True`` passes ndarrays, so ``x.mean()`` is the NumPy mean.
        rolled=lambda d: d.groupby('key').b.rolling(3, min_periods=1).apply(
            lambda x: x.mean(), raw=True).reset_index(level=0, drop=True))
    tm.assert_frame_equal(result, expected)
Ejemplo n.º 10
0
def client(
    df,
    df1,
    df2,
    df3,
    time_df1,
    time_df2,
    time_df3,
    time_keyed_df1,
    time_keyed_df2,
    intersect_df2,
):
    """Connect a ``Backend`` to all given frames, keyed by table name.

    ``df1`` and ``df2`` are additionally registered under the names ``left``
    and ``right``.  Presumably used as a pytest fixture; the decorator is not
    visible in this excerpt.
    """
    tables = dict(
        df=df,
        df1=df1,
        df2=df2,
        df3=df3,
        left=df1,
        right=df2,
        time_df1=time_df1,
        time_df2=time_df2,
        time_df3=time_df3,
        time_keyed_df1=time_keyed_df1,
        time_keyed_df2=time_keyed_df2,
        intersect_df2=intersect_df2,
    )
    return Backend().connect(tables)
Ejemplo n.º 11
0
def test_mutate_with_window_after_join(sort_kind):
    """Check a grouped ``mutate`` applied to a projection of an outer join.

    ``sort_kind`` is passed to ``Series.sort_values`` when building the
    expected ``dates`` column.
    """
    left_df = pd.DataFrame({
        'ints': [0, 1, 2],
        'strings': ['a', 'b', 'c'],
        'dates': pd.date_range('20170101', periods=3),
    })
    right_df = pd.DataFrame({
        'group': [0, 1, 2] * 3,
        'value': [0, 1, np.nan, 3, 4, np.nan, 6, 7, 8],
    })
    con = Backend().connect({'left': left_df, 'right': right_df})
    left = con.table('left')
    right = con.table('right')

    joined = left.outer_join(right, left.ints == right.group)
    proj = joined[left, right.value]
    expr = proj.groupby('ints').mutate(sum=proj.value.sum())
    result = expr.execute()

    # Each left row appears three times in the expected frame.
    repeated_dates = pd.concat([left_df.dates] * 3)
    expected_dates = repeated_dates.sort_values(kind=sort_kind)
    expected = pd.DataFrame({
        'dates': expected_dates.reset_index(drop=True),
        'ints': [i for i in (0, 1, 2) for _ in range(3)],
        'strings': [s for s in ('a', 'b', 'c') for _ in range(3)],
        'value': [0.0, 3.0, 6.0, 1.0, 4.0, 7.0, np.nan, np.nan, 8.0],
        'sum': [total for total in (9.0, 12.0, 8.0) for _ in range(3)],
    })
    tm.assert_frame_equal(result[expected.columns], expected)
Ejemplo n.º 12
0
def test_window_with_preceding_expr(index):
    """Check a trailing window whose size is an interval expression.

    ``index`` is called with the date range to build the frame's index.  The
    mean over ``3 * ibis.interval(days=1)`` ordered by ``time`` is compared
    with a pandas ``'3d'`` rolling mean closed on both ends.
    """
    time = pd.date_range('20180101', '20180110')
    first = 2
    values = np.arange(first, first + len(time))
    df = pd.DataFrame({'value': values, 'time': time}, index=index(time))

    client = Backend().connect({'df': df})
    t = client.table('df')

    rolling = df.set_index('time').value.rolling('3d', closed='both')
    expected = rolling.mean().reset_index(drop=True)
    expected.index.name = None

    day = ibis.interval(days=1)
    window = ibis.trailing_window(3 * day, order_by=t.time)
    result = t.value.mean().over(window).execute()

    tm.assert_series_equal(result, expected)
Ejemplo n.º 13
0
def test_select_on_unambiguous_asof_join(func):
    """Select columns from an as-of join whose column names do not collide.

    ``func`` turns the join expression into the expression under test, whose
    result must equal the ``a0``/``a1`` columns of ``pd.merge_asof``.
    """
    left_dates = pd.date_range("20180101", periods=3)
    right_dates = pd.date_range("20171230", periods=3)
    df_t = pd.DataFrame({'a0': [1, 2, 3], 'b1': left_dates})
    df_s = pd.DataFrame({'a1': [2, 3, 4], 'b2': right_dates})

    con = Backend().connect({"t": df_t, "s": df_s})
    t = con.table("t")
    s = con.table("s")
    join = t.asof_join(s, t.b1 == s.b2)

    merged = pd.merge_asof(df_t, df_s, left_on=["b1"], right_on=["b2"])
    expected = merged[["a0", "a1"]]
    # Guard against a vacuous comparison of two empty frames.
    assert not expected.empty

    result = func(join).execute()
    tm.assert_frame_equal(result, expected)
Ejemplo n.º 14
0
def struct_client(value):
    """Connect a ``Backend`` to a table ``t`` holding a struct-like column.

    Column ``s`` contains two ``OrderedDict`` rows with ``fruit``/``weight``
    entries and ``value`` as its middle row.  Presumably used as a pytest
    fixture; the decorator is not visible in this excerpt.
    """
    apple = OrderedDict([("fruit", "apple"), ("weight", None)])
    pear = OrderedDict([("fruit", "pear"), ("weight", 1)])
    df = pd.DataFrame({
        "s": [apple, value, pear],
        "key": list("aab"),
        "value": [1, 2, 3],
    })
    return Backend().connect({"t": df})
Ejemplo n.º 15
0
def test_window_has_pre_execute_scope():
    """Count how often ``pre_execute`` runs for ``ops.Lag`` inside a window.

    A ``pre_execute`` handler is registered for ``(ops.Lag, Backend)`` that
    increments a counter and returns an empty ``Scope``; executing a lagged
    column over a window must invoke it exactly three times.
    """
    calls = 0

    @pre_execute.register(ops.Lag, Backend)
    def test_pre_execute(op, client, **kwargs):
        nonlocal calls
        calls += 1
        return Scope()

    df = pd.DataFrame(
        {'key': list('abc'), 'value': [1, 2, 3], 'dup': list('ggh')},
        columns=['key', 'value', 'dup'],
    )
    client = Backend().connect({'df': df})
    t = client.table('df')

    window = ibis.window(order_by='value')
    expr = t.key.lag(1).over(window).name('foo')
    result = expr.execute()
    assert result is not None

    # Expected calls: once in the window op at the top to pick up any scope
    # changes before computing, then twice in the window op when calling
    # execute on the ops.Lag node (at the beginning of execute and once
    # before the actual computation).
    assert calls == 3
Ejemplo n.º 16
0
def test_from_dataframe(dataframe, ibis_table, core_client):
    """Check that ``Backend.from_dataframe`` round-trips a frame.

    Three call forms are exercised: defaults only, an explicit ``name``, and
    an explicit ``name`` together with an existing ``client``.  Each table is
    executed and compared with ``ibis_table.execute()``.
    """
    call_variants = (
        {},
        {'name': 'foo'},
        {'name': 'foo', 'client': core_client},
    )
    for kwargs in call_variants:
        t = Backend().from_dataframe(dataframe, **kwargs)
        # Execute every table: comparing a result cached from the first call
        # would leave the ``name``/``client`` variants untested.
        result = t.execute()
        expected = ibis_table.execute()
        tm.assert_frame_equal(result, expected)
Ejemplo n.º 17
0
def test_project_scalar_after_join():
    """Project a reduction and a literal from a projection of an outer join.

    The expected frame has nine rows, each holding the total ``29.0`` and the
    constant ``1``.
    """
    left_df = pd.DataFrame({'ints': range(3)})
    right_df = pd.DataFrame({
        'group': [0, 1, 2] * 3,
        'value': [0, 1, np.nan, 3, 4, np.nan, 6, 7, 8],
    })
    con = Backend().connect({'left': left_df, 'right': right_df})
    left = con.table('left')
    right = con.table('right')

    joined = left.outer_join(right, left.ints == right.group)
    proj = joined[left, right.value]

    total = proj.value.sum().name('sum')
    const = ibis.literal(1).name('const')
    result = proj[total, const].execute()

    n_rows = 9
    expected = pd.DataFrame({
        'sum': [29.0] * n_rows,
        'const': [1] * n_rows,
    })
    tm.assert_frame_equal(result[expected.columns], expected)
Ejemplo n.º 18
0
def test_mutate_scalar_with_window_after_join():
    """Add a reduction and a constant via ``mutate`` after an outer join.

    The expected frame keeps the joined ``ints``/``value`` columns and adds
    the total ``29.0`` and the constant ``1`` on each of its nine rows.
    """
    left_df = pd.DataFrame({'ints': range(3)})
    right_df = pd.DataFrame({
        'group': [0, 1, 2] * 3,
        'value': [0, 1, np.nan, 3, 4, np.nan, 6, 7, 8],
    })
    con = Backend().connect({'left': left_df, 'right': right_df})
    left = con.table('left')
    right = con.table('right')

    joined = left.outer_join(right, left.ints == right.group)
    proj = joined[left, right.value]
    result = proj.mutate(sum=proj.value.sum(), const=1).execute()

    n_rows = 9
    expected = pd.DataFrame({
        'ints': [i for i in (0, 1, 2) for _ in range(3)],
        'value': [0.0, 3.0, 6.0, 1.0, 4.0, 7.0, np.nan, np.nan, 8.0],
        'sum': [29.0] * n_rows,
        'const': [1] * n_rows,
    })
    tm.assert_frame_equal(result[expected.columns], expected)
Ejemplo n.º 19
0
def con(df, df2):
    """Connect a ``Backend`` to ``df`` and ``df2`` under those table names.

    Presumably used as a pytest fixture; the decorator is not visible in this
    excerpt.
    """
    tables = {'df': df, 'df2': df2}
    return Backend().connect(tables)
Ejemplo n.º 20
0
def lahman(batting_df, awards_players_df):
    """Connect a ``Backend`` to the ``batting`` and ``awards_players`` tables.

    Presumably used as a pytest fixture; the decorator is not visible in this
    excerpt.
    """
    tables = dict(batting=batting_df, awards_players=awards_players_df)
    return Backend().connect(tables)
Ejemplo n.º 21
0
def core_client(dataframe):
    """Connect a ``Backend`` exposing ``dataframe`` as the table ``df``.

    Presumably used as a pytest fixture; the decorator is not visible in this
    excerpt.
    """
    tables = {'df': dataframe}
    return Backend().connect(tables)