Beispiel #1
0
def test_udaf_window():
    """Apply a reduction UDF over a grouped, ordered trailing window.

    The ibis result (sorted by ``key`` then ``a``) is compared with a pandas
    ``groupby('key').b.rolling(2).mean()`` computed on the same frame.
    """

    @udf.reduction([dt.double], dt.double)
    def my_mean(series):
        return series.mean()

    # Each numeric column is four fixed values followed by three random ones.
    # 'a' is drawn before 'b' so the random stream is consumed in that order.
    col_a = np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist()
    col_b = np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist()
    df = pd.DataFrame({'a': col_a, 'b': col_b, 'key': list('ddeefff')})

    table = ibis.pandas.connect({'df': df}).table('df')
    win = ibis.trailing_window(2, order_by='a', group_by='key')
    rolled_expr = my_mean(table.b).over(win)
    result = table.mutate(rolled=rolled_expr).execute()
    result = result.sort_values(['key', 'a'])

    def rolling_mean_of_b(frame):
        # Drop the group level so the values align with the frame's own index.
        per_key = frame.groupby('key').b.rolling(2).mean()
        return per_key.reset_index(level=0, drop=True)

    expected = df.sort_values(['key', 'a']).assign(rolled=rolling_mean_of_b)
    tm.assert_frame_equal(result, expected)
Beispiel #2
0
def test_array_collect_rolling_partitioned(t, df):
    """Collect ``plain_float64`` over a trailing window ordered by ``plain_int64``.

    The expected frame is reordered to the expression's column order before
    the comparison.
    """
    trailing = ibis.trailing_window(2, order_by=t.plain_int64)
    collected = t.plain_float64.collect().over(trailing).name('collected')
    expr = t['dup_strings', 'plain_int64', collected]
    result = expr.execute()

    expected_data = {
        'dup_strings': ['d', 'a', 'd'],
        'plain_int64': [1, 2, 3],
        'collected': [[4.0], [4.0, 5.0], [5.0, 6.0]],
    }
    expected = pd.DataFrame(expected_data)[expr.columns]
    tm.assert_frame_equal(result, expected)
Beispiel #3
0
    def test_interactive_repr_call_failure(self):
        """``repr`` a windowed aggregate in interactive mode.

        There is no assertion: the test only requires that building the
        expression and calling ``repr`` on it does not raise.
        """
        lineitem = self.con.table("tpch_lineitem").limit(100000)
        receipt_ts = lineitem.l_receiptdate.cast("timestamp").name("date")
        t = lineitem[lineitem, receipt_ts]

        group_keys = [t.date.year().name("year"), "l_linestatus"]
        status_is_f = t.l_linestatus.isin(["F"])
        grouped = t[status_is_f].group_by(group_keys)
        expr = grouped.aggregate(t.l_extendedprice.mean().name("avg_px"))

        trailing = ibis.trailing_window(
            9,
            group_by=expr.l_linestatus,
            order_by=expr.year,
        )
        windowed_avg = expr["avg_px"].mean().over(trailing)
        enriched = expr[expr, windowed_avg]

        with config.option_context("interactive", True):
            repr(enriched)
Beispiel #4
0
    def test_window_frame_specs(self):
        t = self.con.table('alltypes')

        ex_template = """\
SELECT sum(d) OVER (ORDER BY f {0}) AS `foo`
FROM alltypes"""

        cases = [
            (window(preceding=0),
             'range between current row and unbounded following'),

            (window(following=0),
             'range between unbounded preceding and current row'),

            (window(preceding=5),
             'rows between 5 preceding and unbounded following'),
            (window(preceding=5, following=0),
             'rows between 5 preceding and current row'),
            (window(preceding=5, following=2),
             'rows between 5 preceding and 2 following'),
            (window(following=2),
             'rows between unbounded preceding and 2 following'),
            (window(following=2, preceding=0),
             'rows between current row and 2 following'),
            (window(preceding=5),
             'rows between 5 preceding and unbounded following'),
            (window(following=[5, 10]),
             'rows between 5 following and 10 following'),
            (window(preceding=[10, 5]),
             'rows between 10 preceding and 5 preceding'),

            # # cumulative windows
            (ibis.cumulative_window(),
             'range between unbounded preceding and current row'),

            # # trailing windows
            (ibis.trailing_window(10),
             'rows between 10 preceding and current row'),
        ]

        for w, frame in cases:
            w2 = w.order_by(t.f)
            expr = t.projection([t.d.sum().over(w2).name('foo')])
            expected = ex_template.format(frame.upper())
            self._check_sql(expr, expected)
Beispiel #5
0
    def test_interactive_repr_call_failure(self):
        """Smoke-test ``repr`` of a windowed aggregate with interactive mode on.

        Nothing is asserted; the test passes as long as no exception is
        raised while the expression is built and ``repr`` is called on it.
        """
        source = self.con.table('tpch_lineitem').limit(100000)
        date_col = source.l_receiptdate.cast('timestamp').name('date')
        t = source[source, date_col]

        keys = [t.date.year().name('year'), 'l_linestatus']
        is_status_f = t.l_linestatus.isin(['F'])
        filtered = t[is_status_f]
        avg_px = t.l_extendedprice.mean().name('avg_px')
        expr = filtered.group_by(keys).aggregate(avg_px)

        window_spec = ibis.trailing_window(
            9, group_by=expr.l_linestatus, order_by=expr.year
        )
        rolling_avg = expr['avg_px'].mean().over(window_spec)
        enriched = expr[expr, rolling_avg]

        with config.option_context('interactive', True):
            repr(enriched)
Beispiel #6
0
def test_batting_rolling(batting, batting_df, sort_kind):
    """Compare a ``trailing_window(5)`` sum of ``G`` with pandas ``rolling(5)``.

    The window is ordered by ``yearID``; the pandas side sorts by the same
    column using the ``sort_kind`` algorithm before rolling.
    """

    def trailing_games(tbl):
        win = ibis.trailing_window(5, order_by=tbl.yearID)
        return tbl.G.sum().over(win)

    result = batting.mutate(more_values=trailing_games).execute()

    by_year = batting_df[['G', 'yearID']].sort_values(
        'yearID', kind=sort_kind
    )
    rolled = by_year.G.rolling(5).sum()
    expected = batting_df.assign(more_values=rolled)

    # Compare only the columns present in the expected frame, in its order.
    tm.assert_frame_equal(result[expected.columns], expected)
Beispiel #7
0
def test_window_with_preceding_expr():
    """Use an interval expression (``3 * interval(days=1)``) as ``preceding``.

    The ibis mean over that trailing window is compared with a pandas
    ``rolling('3d', closed='both')`` mean on the same data.
    """
    dates = pd.date_range('20180101', '20180110')
    first_value = 2
    values = np.arange(first_value, first_value + len(dates))
    df = pd.DataFrame({'value': values, 'time': dates}, index=dates)

    table = ibis.pandas.connect({'df': df}).table('df')

    rolling = df.set_index('time').value.rolling('3d', closed='both')
    expected = rolling.mean().reset_index(drop=True)
    expected.index.name = None

    one_day = ibis.interval(days=1)
    win = ibis.trailing_window(3 * one_day, order_by=table.time)
    result = table.value.mean().over(win).execute()

    tm.assert_series_equal(result, expected)
Beispiel #8
0
def test_batting_rolling_partitioned(batting, batting_df, sort_kind):
    """Compare a per-player ``trailing_window(3)`` sum of ``G`` with pandas.

    Both sides are indexed by (``playerID``, ``yearID``) and sorted by index
    before the series comparison.
    """
    partition_col = 'playerID'
    ordering_col = 'yearID'

    win = ibis.trailing_window(
        3,
        order_by=batting[ordering_col],
        group_by=batting[partition_col],
    )
    rolled_expr = batting.G.sum().over(win)
    result = batting.mutate(rolled=rolled_expr).execute()

    subset = batting_df[[partition_col, ordering_col, 'G']]
    per_player = subset.set_index(ordering_col).groupby(partition_col)
    expected = per_player.G.rolling(3).sum().rename('rolled')

    actual = result.set_index([partition_col, ordering_col]).sort_index()
    tm.assert_series_equal(actual.rolled, expected.sort_index())
Beispiel #9
0
        (
            window(following=[5, 10]),
            'rows between 5 following and 10 following',
        ),
        (
            window(preceding=[10, 5]),
            'rows between 10 preceding and 5 preceding',
        ),
        # # cumulative windows
        (
            ibis.cumulative_window(),
            'range between unbounded preceding and current row',
        ),
        # # trailing windows
        (
            ibis.trailing_window(10),
            'rows between 10 preceding and current row',
        ),
    ],
)
def test_window_frame_specs(con, window, frame):
    """Check the SQL frame clause generated for one window spec.

    ``frame`` is upper-cased and substituted into the expected SQL, which is
    compared with the compiled expression through ``assert_sql_equal``.
    """
    ex_template = """\
SELECT sum(`d`) OVER (ORDER BY `f` {0}) AS `foo`
FROM ibis_testing.`alltypes`"""

    alltypes = con.table('alltypes')
    ordered_window = window.order_by(alltypes.f)
    windowed_sum = alltypes.d.sum().over(ordered_window).name('foo')
    expr = alltypes.projection([windowed_sum])

    expected = ex_template.format(frame.upper())
    assert_sql_equal(expr, expected)
Beispiel #10
0
@pytest.mark.parametrize(
    'window_fn',
    [
        param(
            lambda t: ibis.window(
                preceding=2,
                following=0,
                group_by=[t.string_col],
                order_by=[t.id],
            ),
            id='preceding-2-following-0',
        ),
        param(
            lambda t: ibis.trailing_window(
                preceding=2, group_by=[t.string_col], order_by=[t.id]
            ),
            id='trailing-2',
        ),
    ],
)
@pytest.mark.notimpl(["clickhouse", "dask", "datafusion"])
def test_grouped_bounded_preceding_window(backend, alltypes, df, window_fn):
    window = window_fn(alltypes)

    expr = alltypes.mutate(val=alltypes.double_col.sum().over(window))

    result = expr.execute().set_index('id').sort_index()
    gdf = df.sort_values('id').groupby('string_col')
    expected = (
        df.assign(
Beispiel #11
0
from pyspark.sql.window import Window
from pytest import param

import ibis

pytest.importorskip('pyspark')
pytestmark = pytest.mark.pyspark


@pytest.mark.parametrize(
    ('ibis_window', 'spark_range'),
    [
        param(
            ibis.trailing_window(
                preceding=ibis.interval(hours=1),
                order_by='time',
                group_by='key',
            ),
            (-3600, 0),
        ),
        param(
            ibis.trailing_window(
                preceding=ibis.interval(hours=2),
                order_by='time',
                group_by='key',
            ),
            (-7200, 0),
        ),
        param(
            ibis.range_window(
                preceding=0,
Beispiel #12
0
        (
            window(following=[5, 10]),
            'rows between 5 following and 10 following',
        ),
        (
            window(preceding=[10, 5]),
            'rows between 10 preceding and 5 preceding',
        ),
        # # cumulative windows
        (
            ibis.cumulative_window(),
            'range between unbounded preceding and current row',
        ),
        # # trailing windows
        (
            ibis.trailing_window(10),
            'rows between 10 preceding and current row',
        ),
    ],
)
def test_window_frame_specs(con, window, frame):
    """Verify the frame clause emitted for a single window spec.

    The window is ordered by column ``f``, applied to ``sum(d)``, and the
    compiled SQL is checked against the template filled with the
    upper-cased ``frame``.
    """
    table = con.table('alltypes')

    sql_template = """\
SELECT sum(`d`) OVER (ORDER BY `f` {0}) AS `foo`
FROM ibis_testing.`alltypes`"""

    foo = table.d.sum().over(window.order_by(table.f)).name('foo')
    expr = table.projection([foo])
    assert_sql_equal(expr, sql_template.format(frame.upper()))
Beispiel #13
0
    # These need to be tz-naive because the timestamp_col in
    # the test data is tz-naive
    return pd.Timestamp('20090105'), pd.Timestamp('20090111')


def filter_by_time_context(df, context):
    """Return the rows of ``df`` whose ``timestamp_col`` is inside ``context``.

    ``context[0]`` is an inclusive lower bound and ``context[1]`` an
    exclusive upper bound.
    """
    timestamps = df['timestamp_col']
    at_or_after_begin = timestamps >= context[0]
    before_end = timestamps < context[1]
    return df[at_or_after_begin & before_end]


@pytest.mark.only_on_backends(['pandas', 'pyspark'])
@pytest.mark.min_spark_version('3.1')
@pytest.mark.parametrize(
    'window',
    [
        ibis.trailing_window(ibis.interval(days=3), order_by=ORDERBY_COL),
        ibis.trailing_window(
            ibis.interval(days=3),
            order_by=ORDERBY_COL,
            group_by=GROUPBY_COL,
        ),
    ],
)
def test_context_adjustment_window_udf(alltypes, df, context, window):
    """ This test case aims to test context adjustment of
        udfs in window method.
    """
    with option_context('context_adjustment.time_col', 'timestamp_col'):
        expr = alltypes.mutate(v1=calc_mean(alltypes[TARGET_COL]).over(window))
        result = expr.execute(timecontext=context)