Ejemplo n.º 1
0
def test_batting_cumulative_partitioned(batting, batting_df, sort_kind):
    """Per-player cumulative sum of games (G), ordered by year, must match
    pandas' expanding sum within each playerID group."""
    key = 'playerID'
    order = 'yearID'

    cumulative = batting.G.sum().over(
        ibis.cumulative_window(order_by=order, group_by=key)
    )
    result = batting.mutate(cumulative=cumulative).execute()

    # Reference: expanding sum of G per player over the year-indexed frame.
    expected = (
        batting_df[[key, order, 'G']]
        .set_index(order)
        .groupby(key)
        .G.expanding()
        .sum()
        .rename('cumulative')
    )

    tm.assert_series_equal(
        result.set_index([key, order]).sort_index().cumulative,
        expected.sort_index(),
    )
Ejemplo n.º 2
0
def test_batting_most_hits(players, players_df):
    """Ranking hits (H) descending via a cumulative window must agree with
    pandas' min-method descending rank per player group."""
    ranked = players.mutate(
        hits_rank=lambda t: t.H.rank().over(
            ibis.cumulative_window(order_by=ibis.desc(t.H))
        )
    )
    result = ranked.execute()
    expected = players_df.assign(
        hits_rank=players_df.groupby('playerID').H.rank(
            method='min', ascending=False
        )
    )
    tm.assert_frame_equal(result[expected.columns], expected)
Ejemplo n.º 3
0
def test_cumulative_simple_window(alltypes, func, df):
    """double_col minus its cumulative <func> must equal the pandas
    cum<func> difference on the executed frame."""
    agg = getattr(alltypes.double_col, func)
    shifted = alltypes.double_col - agg().over(ibis.cumulative_window())
    expr = alltypes.projection([shifted.name('double_col')])
    result = expr.execute().double_col
    expected = df.double_col - getattr(df.double_col, 'cum' + func)()
    tm.assert_series_equal(result, expected)
Ejemplo n.º 4
0
def test_combine_preserves_existing_window():
    """A table-level group_by must not clobber an explicit cumulative
    window's frame: the resulting window still ends at the current row
    (following == 0)."""
    table = ibis.table(
        [('one', 'string'), ('two', 'double'), ('three', 'int32')],
        name='my_data',
    )
    win = ibis.cumulative_window(order_by=table.one)
    mutated = table.group_by(table.three).mutate(
        four=table.two.sum().over(win)
    )
    assert mutated.op().selections[1].op().window.following == 0
Ejemplo n.º 5
0
 def test_cumulative_simple_window(self):
     """Cumulative sum/min/max over an unordered cumulative window must
     equal pandas' cumsum/cummin/cummax on the executed frame."""
     t = self.alltypes
     df = t.execute()
     for func in 'sum min max'.split():
         f = getattr(t.double_col, func)
         # double_col minus its own running aggregate, named to keep the
         # output column comparable to the pandas series below.
         expr = t.projection([(t.double_col - f().over(ibis.cumulative_window())).name('double_col')])
         result = expr.execute().double_col
         expected = df.double_col - getattr(df.double_col, 'cum%s' % func)()
         tm.assert_series_equal(result, expected)
Ejemplo n.º 6
0
 def test_cumulative_simple_window(self):
     """For sum/min/max, a cumulative window aggregate subtracted from the
     column must match the corresponding pandas cum* difference."""
     t = self.alltypes
     df = t.execute()
     for func in 'sum min max'.split():
         f = getattr(t.double_col, func)
         # Running aggregate over an unbounded-preceding-to-current-row frame.
         expr = t.projection([(t.double_col - f().over(ibis.cumulative_window())).name('double_col')])
         result = expr.execute().double_col
         expected = df.double_col - getattr(df.double_col, 'cum%s' % func)()
         tm.assert_series_equal(result, expected)
Ejemplo n.º 7
0
def test_cumulative_ordered_window(alltypes, func, df):
    """A cumulative <func> ordered by timestamp_col must match pandas
    cum<func> applied to the timestamp-sorted frame."""
    frame = df.sort_values('timestamp_col').reset_index(drop=True)
    win = ibis.cumulative_window(order_by=alltypes.timestamp_col)
    agg = getattr(alltypes.double_col, func)
    col = (alltypes.double_col - agg().over(win)).name('double_col')
    result = alltypes.projection([col]).execute().double_col
    expected = frame.double_col - getattr(frame.double_col, 'cum' + func)()
    tm.assert_series_equal(result, expected)
Ejemplo n.º 8
0
def _cumulative_to_window(expr, window):
    """Rewrite a cumulative aggregation as a plain reduction plus an
    equivalent cumulative window.

    The returned window carries over the grouping and ordering of
    ``window``; the returned expression uses the reduction class mapped in
    ``_cumulative_to_reduction`` and keeps the original expression name.
    """
    new_window = (
        ibis.cumulative_window()
        .group_by(window._group_by)
        .order_by(window._order_by)
    )

    operation = expr.op()

    # Map the cumulative op type to its non-cumulative reduction class.
    reduction = _cumulative_to_reduction[type(operation)](*operation.args)
    return expr._factory(reduction, name=expr._name), new_window
Ejemplo n.º 9
0
 def test_cumulative_ordered_window(self):
     """Cumulative sum/min/max ordered by timestamp_col must equal pandas
     cum* on the timestamp-sorted executed frame."""
     t = self.alltypes
     df = t.execute().sort_values('timestamp_col').reset_index(drop=True)
     window = ibis.cumulative_window(order_by=t.timestamp_col)
     for func in 'sum min max'.split():
         f = getattr(t.double_col, func)
         expr = t.projection([(t.double_col - f().over(window)).name('double_col')])
         result = expr.execute().double_col
         expected = df.double_col - getattr(df.double_col, 'cum%s' % func)()
         tm.assert_series_equal(result, expected)
Ejemplo n.º 10
0
 def test_cumulative_partitioned_ordered_window(self):
     """Grouped (string_col) and ordered (timestamp_col) cumulative
     aggregates must match pandas' per-group cum* via transform."""
     t = self.alltypes
     df = t.execute().sort_values(['string_col', 'timestamp_col']).reset_index(drop=True)
     window = ibis.cumulative_window(order_by=t.timestamp_col, group_by=t.string_col)
     for func in 'sum min max'.split():
         f = getattr(t.double_col, func)
         expr = t.projection([(t.double_col - f().over(window)).name('double_col')])
         result = expr.execute().double_col
         # transform keeps the original index so the series align positionally
         expected = df.groupby(df.string_col).double_col.transform(lambda c: c - getattr(c, 'cum%s' % func)())
         tm.assert_series_equal(result, expected)
Ejemplo n.º 11
0
def test_batting_cumulative(batting, batting_df, sort_kind):
    """Cumulative sum of games ordered by year must match pandas' expanding
    sum over the year-sorted frame (cast back to int64)."""
    result = batting.mutate(
        more_values=lambda t: t.G.sum().over(
            ibis.cumulative_window(order_by=t.yearID)
        )
    ).execute()

    more_values = (
        batting_df[['G', 'yearID']]
        .sort_values('yearID', kind=sort_kind)
        .G.expanding()
        .sum()
        .astype('int64')
    )
    expected = batting_df.assign(more_values=more_values)
    tm.assert_frame_equal(result[expected.columns], expected)
Ejemplo n.º 12
0
def _cumulative_to_window(expr, window):
    """Convert a cumulative aggregation into an equivalent plain reduction
    paired with a cumulative window.

    The returned window inherits the grouping/ordering of *window*; the
    returned expression wraps the reduction class registered for the op in
    ``_cumulative_to_reduction``, preserving the original name.
    """
    win = ibis.cumulative_window()
    win = (win.group_by(window._group_by)
           .order_by(window._order_by))

    op = expr.op()

    # Look up the non-cumulative reduction equivalent of this cumulative op.
    klass = _cumulative_to_reduction[type(op)]
    new_op = klass(*op.args)
    return expr._factory(new_op, name=expr._name), win
Ejemplo n.º 13
0
def test_cumulative_partitioned_window(alltypes, func, df):
    """A group_by-only cumulative <func> must match pandas' per-group
    cum<func> computed via transform on the string_col-sorted frame."""
    frame = df.sort_values('string_col').reset_index(drop=True)
    win = ibis.cumulative_window(group_by=alltypes.string_col)
    agg = getattr(alltypes.double_col, func)
    col = (alltypes.double_col - agg().over(win)).name('double_col')
    result = alltypes.projection([col]).execute().double_col
    expected = frame.groupby(frame.string_col).double_col.transform(
        lambda c: c - getattr(c, 'cum' + func)()
    )
    tm.assert_series_equal(result, expected)
Ejemplo n.º 14
0
 def test_cumulative_partitioned_ordered_window(self):
     """Partitioned + ordered cumulative sum/min/max must equal pandas'
     per-group cum* on the (string_col, timestamp_col)-sorted frame."""
     t = self.alltypes
     df = t.execute().sort_values(['string_col', 'timestamp_col']).reset_index(drop=True)
     window = ibis.cumulative_window(order_by=t.timestamp_col, group_by=t.string_col)
     for func in 'sum min max'.split():
         f = getattr(t.double_col, func)
         expr = t.projection([(t.double_col - f().over(window)).name('double_col')])
         result = expr.execute().double_col
         # transform preserves the row index so both series align positionally
         expected = df.groupby(df.string_col).double_col.transform(lambda c: c - getattr(c, 'cum%s' % func)())
         tm.assert_series_equal(result, expected)
Ejemplo n.º 15
0
def test_batting_most_hits(players, players_df):
    """Descending rank on hits (H) computed via a cumulative window must
    agree with pandas' min-method descending rank per playerID."""
    result = players.mutate(
        hits_rank=lambda t: t.H.rank().over(
            ibis.cumulative_window(order_by=ibis.desc(t.H))
        )
    ).execute()

    reference_rank = players_df.groupby('playerID').H.rank(
        method='min', ascending=False
    )
    expected = players_df.assign(hits_rank=reference_rank)
    tm.assert_frame_equal(result[expected.columns], expected)
Ejemplo n.º 16
0
def test_cumulative_functions(alltypes, cumulative, static):
    """The cum* shorthand must compile to the same SQL as the explicit
    static aggregate evaluated over a cumulative window."""
    win = ibis.window(order_by=alltypes.d)

    shorthand = cumulative(alltypes, win).name('foo')
    explicit = static(alltypes, win).over(ibis.cumulative_window()).name('foo')

    lhs = alltypes.projection(shorthand)
    rhs = alltypes.projection(explicit)

    assert to_sql(lhs) == to_sql(rhs)
Ejemplo n.º 17
0
def test_cumulative_functions(alltypes, cumulative, static):
    """The cum* shorthand and the explicit static aggregate over a
    cumulative window must produce identical Impala SQL."""
    win = ibis.window(order_by=alltypes.d)

    shorthand = cumulative(alltypes, win).name('foo')
    explicit = static(alltypes, win).over(ibis.cumulative_window()).name('foo')

    lhs = alltypes.projection(shorthand)
    rhs = alltypes.projection(explicit)

    assert ImpalaCompiler.to_sql(lhs) == ImpalaCompiler.to_sql(rhs)
Ejemplo n.º 18
0
def test_cumulative_partitioned_ordered_window(alltypes, func, df):
    """Grouped and ordered cumulative <func> must match pandas' per-group
    cum<func> on the (string_col, timestamp_col)-sorted frame."""
    frame = df.sort_values(['string_col', 'timestamp_col']).reset_index(drop=True)
    win = ibis.cumulative_window(
        order_by=alltypes.timestamp_col, group_by=alltypes.string_col
    )
    agg = getattr(alltypes.double_col, func)
    col = (alltypes.double_col - agg().over(win)).name('double_col')
    result = alltypes.projection([col]).execute().double_col
    cum = operator.methodcaller(f'cum{func}')
    expected = frame.groupby(frame.string_col).double_col.transform(
        lambda c: c - cum(c)
    )
    tm.assert_series_equal(result, expected)
Ejemplo n.º 19
0
def test_group_by_with_window_preserves_range():
    """Compiling a grouped cumulative window must emit PARTITION BY for the
    group key while keeping the cumulative RANGE frame (UNBOUNDED PRECEDING
    to CURRENT ROW) intact."""
    t = ibis.table(
        [('one', 'string'), ('two', 'double'), ('three', 'int32')],
        name='my_data',
    )
    w = ibis.cumulative_window(order_by=t.one)
    # group_by is applied after the window is built; the compiled SQL below
    # shows the partition is merged into the existing frame, not replacing it.
    expr = t.group_by(t.three).mutate(four=t.two.sum().over(w))
    result = ibis.impala.compile(expr)
    expected = """\
SELECT *,
       sum(`two`) OVER (PARTITION BY `three` ORDER BY `one` RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS `four`
FROM my_data"""  # noqa: E501
    assert result == expected
Ejemplo n.º 20
0
def _cumulative_to_window(translator, expr, window):
    """Lower a cumulative aggregation to its plain reduction evaluated over
    an equivalent cumulative window, applying any translator-registered
    rewrite for the reduction op before windowizing."""
    new_win = (
        ibis.cumulative_window()
        .group_by(window._group_by)
        .order_by(window._order_by)
    )

    operation = expr.op()

    # Map the cumulative op type to its non-cumulative reduction class.
    reduction = _cumulative_to_reduction[type(operation)](*operation.args)
    rewritten = expr._factory(reduction, name=expr._name)

    # Backend-specific rewrites take precedence over the default translation.
    key = type(reduction)
    if key in translator._rewrites:
        rewritten = translator._rewrites[key](rewritten)

    return L.windowize_function(rewritten, new_win)
Ejemplo n.º 21
0
def _cumulative_to_window(translator, expr, window):
    """Lower a cumulative aggregation to its plain reduction evaluated over
    a cumulative window, honoring any translator op rewrites.
    """
    win = ibis.cumulative_window()
    win = win.group_by(window._group_by).order_by(window._order_by)

    op = expr.op()

    # Map the cumulative op (e.g. a running sum) to its reduction class.
    klass = _cumulative_to_reduction[type(op)]
    new_op = klass(*op.args)
    new_expr = expr._factory(new_op, name=expr._name)

    # Apply a backend-specific rewrite for the reduction when one exists.
    if type(new_op) in translator._rewrites:
        new_expr = translator._rewrites[type(new_op)](new_expr)

    new_expr = L.windowize_function(new_expr, win)
    return new_expr
Ejemplo n.º 22
0
def test_window_function(time_data):
    """SQL OVER (PARTITION BY person) aggregates must match the equivalent
    ibis cumulative-window projections for sum, count, and mean."""
    # Reference result computed via raw SQL window functions.
    my_table = query("""SELECT count,
       duration_seconds,
       SUM(duration_seconds) OVER
         (PARTITION BY person) AS running_total,
       COUNT(duration_seconds) OVER
         (PARTITION BY person) AS running_count,
       AVG(duration_seconds) OVER
         (PARTITION BY person) AS running_avg
  FROM time_data""").execute()
    # Same computation expressed through the ibis expression API; note the
    # SQL AVG corresponds to ibis' mean().
    ibis_table = time_data.projection([
        time_data.get_column("count"),
        time_data.duration_seconds,
        time_data.duration_seconds.sum().over(
            ibis.cumulative_window(
                group_by=time_data.person)).name("running_total"),
        time_data.duration_seconds.count().over(
            ibis.cumulative_window(
                group_by=time_data.person)).name("running_count"),
        time_data.duration_seconds.mean().over(
            ibis.cumulative_window(
                group_by=time_data.person)).name("running_avg"),
    ]).execute()
    assert_frame_equal(ibis_table, my_table)
Ejemplo n.º 23
0
def test_batting_cumulative_partitioned(batting, batting_df):
    """Cumulative sum of G ordered by yearID and partitioned by lgID must
    match a pandas groupby-cumsum realigned to the original row order."""
    expr = batting.mutate(more_values=lambda t: t.G.sum().over(
        ibis.cumulative_window(order_by=t.yearID, group_by=t.lgID)))
    result = expr.execute().more_values

    columns = ['G', 'yearID', 'lgID']
    key = 'lgID'
    # Sort within each league by year, then cumsum per league; the final
    # sort_index(level=-1) restores the original row positions.
    expected_result = batting_df[columns].groupby(
        key, sort=False,
        as_index=False).apply(lambda df: df.sort_values('yearID')).groupby(
            key, sort=False).G.cumsum().sort_index(level=-1)
    # Drop the grouping index levels added by apply/groupby and realign to
    # batting_df's index so the series are comparable element-wise.
    expected = expected_result.reset_index(list(
        range(expected_result.index.nlevels - 1)),
                                           drop=True).reindex(batting_df.index)
    expected.name = result.name

    tm.assert_series_equal(result, expected)
Ejemplo n.º 24
0
def test_batting_cumulative_partitioned(batting, batting_df, sort_kind):
    """Cumulative games per player, ordered by year, must equal pandas'
    expanding sum within each playerID group."""
    group_key = 'playerID'
    sort_key = 'yearID'

    running_total = batting.G.sum().over(
        ibis.cumulative_window(order_by=sort_key, group_by=group_key)
    )
    result = batting.mutate(cumulative=running_total).execute()

    expected = (
        batting_df[[group_key, sort_key, 'G']]
        .set_index(sort_key)
        .groupby(group_key)
        .G.expanding()
        .sum()
        .rename('cumulative')
    )

    tm.assert_series_equal(
        result.set_index([group_key, sort_key]).sort_index().cumulative,
        expected.sort_index(),
    )
Ejemplo n.º 25
0
def cumulative_to_window(translator, expr, window):
    """Lower a cumulative aggregation to a plain reduction evaluated over an
    equivalent cumulative window, preserving the expression name and
    applying any translator-registered rewrite first."""
    new_win = (
        ibis.cumulative_window()
        .group_by(window._group_by)
        .order_by(window._order_by)
    )

    operation = expr.op()

    # Build the non-cumulative reduction registered for this cumulative op.
    reduction = _cumulative_to_reduction[type(operation)](*operation.args)
    lowered = reduction.to_expr()
    if expr.has_name():
        # Carry over the user-visible name from the original expression.
        lowered = lowered.name(expr.get_name())

    if type(reduction) in translator._rewrites:
        lowered = translator._rewrites[type(reduction)](lowered)

    return L.windowize_function(lowered, new_win)
Ejemplo n.º 26
0
    def test_window_frame_specs(self):
        """Each window configuration must compile to the expected SQL frame
        clause (ROWS/RANGE BETWEEN ... AND ...)."""
        t = self.con.table('alltypes')

        ex_template = """\
SELECT sum(d) OVER (ORDER BY f {0}) AS `foo`
FROM alltypes"""

        # (window spec, expected frame clause in lowercase) pairs; the
        # clause is upper-cased before comparison below.
        cases = [
            (window(preceding=0),
             'range between current row and unbounded following'),

            (window(following=0),
             'range between unbounded preceding and current row'),

            (window(preceding=5),
             'rows between 5 preceding and unbounded following'),
            (window(preceding=5, following=0),
             'rows between 5 preceding and current row'),
            (window(preceding=5, following=2),
             'rows between 5 preceding and 2 following'),
            (window(following=2),
             'rows between unbounded preceding and 2 following'),
            (window(following=2, preceding=0),
             'rows between current row and 2 following'),
            (window(preceding=5),
             'rows between 5 preceding and unbounded following'),
            (window(following=[5, 10]),
             'rows between 5 following and 10 following'),
            (window(preceding=[10, 5]),
             'rows between 10 preceding and 5 preceding'),

            # # cumulative windows
            (ibis.cumulative_window(),
             'range between unbounded preceding and current row'),

            # # trailing windows
            (ibis.trailing_window(10),
             'rows between 10 preceding and current row'),
        ]

        for w, frame in cases:
            w2 = w.order_by(t.f)
            expr = t.projection([t.d.sum().over(w2).name('foo')])
            expected = ex_template.format(frame.upper())
            self._check_sql(expr, expected)
Ejemplo n.º 27
0
def test_batting_cumulative(batting, batting_df, sort_kind):
    """Cumulative sum of games ordered by year must equal pandas' expanding
    sum over the year-sorted frame."""
    result = batting.mutate(
        more_values=lambda t: t.G.sum().over(
            ibis.cumulative_window(order_by=t.yearID)
        )
    ).execute()

    more_values = (
        batting_df[['G', 'yearID']]
        .sort_values('yearID', kind=sort_kind)
        .G.expanding()
        .sum()
    )
    expected = batting_df.assign(more_values=more_values)

    tm.assert_frame_equal(result[expected.columns], expected)
Ejemplo n.º 28
0
def test_complex_window(client):
    """Mix trailing windows of different sizes with a cumulative window and
    a plain (non-windowed) aggregate under a time context.

    Windowed ops need their execution context widened by the window size,
    while non-window ops keep the original context; the expected frame is
    built with pandas rolling/expanding/transform and trimmed to *context*.
    """
    table = client.table('time_indexed_table')
    # Time context restricting which rows appear in the final result.
    context = (
        pd.Timestamp('20170102 07:00:00', tz='UTC'),
        pd.Timestamp('20170105', tz='UTC'),
    )
    window = ibis.trailing_window(preceding=ibis.interval(hours=1),
                                  order_by='time',
                                  group_by='key')
    window2 = ibis.trailing_window(preceding=ibis.interval(hours=2),
                                   order_by='time',
                                   group_by='key')
    window_cum = ibis.cumulative_window(order_by='time', group_by='key')
    # context should be adjusted accordingly for each window
    result_pd = (table.mutate(
        count_1h=table['value'].count().over(window),
        count_2h=table['value'].count().over(window2),
        count_cum=table['value'].count().over(window_cum),
    ).mutate(count=table['value'].count()).execute(timecontext=context))

    df = table.execute()
    # Pandas references: rolling counts for the trailing windows, expanding
    # count for the cumulative window, all per key over the time index.
    expected_win_1h = (df.set_index('time').groupby('key').value.rolling(
        '1h', closed='both').count().rename('count_1h').astype(int))
    expected_win_2h = (df.set_index('time').groupby('key').value.rolling(
        '2h', closed='both').count().rename('count_2h').astype(int))
    expected_cum_win = (df.set_index('time').groupby(
        'key').value.expanding().count().rename('count_cum').astype(int))
    df = df.set_index('time')
    # Re-sort each expected series and drop the 'key' index level so it
    # aligns with df's time index for assignment.
    df = df.assign(count_1h=expected_win_1h.sort_index(
        level=['time', 'key']).reset_index(level='key', drop=True))
    df = df.assign(count_2h=expected_win_2h.sort_index(
        level=['time', 'key']).reset_index(level='key', drop=True))
    df = df.assign(count_cum=expected_cum_win.sort_index(
        level=['time', 'key']).reset_index(level='key', drop=True))
    df['count'] = df.groupby(['key'])['value'].transform('count')
    df = df.reset_index()
    # Trim to the requested context window (tz-naive comparison) and match
    # the result's ordering.
    expected = (df[df.time.between(*(t.tz_convert(None)
                                     for t in context))].sort_values(
                                         ['key']).reset_index(drop=True))
    tm.assert_frame_equal(result_pd, expected)
Ejemplo n.º 29
0
def test_cumulative_functions(con):
    """Each cum* shorthand must compile to the same SQL as its static
    counterpart evaluated over a cumulative window."""
    t = con.table('alltypes')

    win = ibis.window(order_by=t.d)
    pairs = [
        (t.f.cumsum().over(win), t.f.sum().over(win)),
        (t.f.cummin().over(win), t.f.min().over(win)),
        (t.f.cummax().over(win), t.f.max().over(win)),
        (t.f.cummean().over(win), t.f.mean().over(win)),
    ]

    for shorthand, static in pairs:
        lhs = t.projection(shorthand.name('foo'))
        rhs = t.projection(
            static.over(ibis.cumulative_window()).name('foo')
        )
        assert to_sql(lhs) == to_sql(rhs)
Ejemplo n.º 30
0
    def test_window_frame_specs(self):
        """Every window configuration must compile to the expected
        ROWS/RANGE frame clause in the generated SQL."""
        t = self.con.table('alltypes')

        ex_template = """\
SELECT sum(d) OVER (ORDER BY f {0}) AS `foo`
FROM alltypes"""

        # (window spec, expected frame clause) pairs; upper-cased on compare.
        cases = [
            (window(preceding=0),
             'range between current row and unbounded following'),
            (window(following=0),
             'range between unbounded preceding and current row'),
            (window(preceding=5),
             'rows between 5 preceding and unbounded following'),
            (window(preceding=5,
                    following=0), 'rows between 5 preceding and current row'),
            (window(preceding=5,
                    following=2), 'rows between 5 preceding and 2 following'),
            (window(following=2),
             'rows between unbounded preceding and 2 following'),
            (window(following=2,
                    preceding=0), 'rows between current row and 2 following'),
            (window(preceding=5),
             'rows between 5 preceding and unbounded following'),
            (window(following=[5, 10]),
             'rows between 5 following and 10 following'),
            (window(preceding=[10, 5]),
             'rows between 10 preceding and 5 preceding'),

            # # cumulative windows
            (ibis.cumulative_window(),
             'range between unbounded preceding and current row'),

            # # trailing windows
            (ibis.trailing_window(10),
             'rows between 10 preceding and current row'),
        ]

        for w, frame in cases:
            w2 = w.order_by(t.f)
            expr = t.projection([t.d.sum().over(w2).name('foo')])
            expected = ex_template.format(frame.upper())
            self._check_sql(expr, expected)
Ejemplo n.º 31
0
    def test_cumulative_functions(self):
        """Each cum* shorthand must compile to the same SQL as the explicit
        static aggregate over a cumulative window."""
        t = self.con.table('alltypes')

        w = ibis.window(order_by=t.d)
        # (cumulative shorthand, equivalent static aggregate) pairs.
        exprs = [
            (t.f.cumsum().over(w), t.f.sum().over(w)),
            (t.f.cummin().over(w), t.f.min().over(w)),
            (t.f.cummax().over(w), t.f.max().over(w)),
            (t.f.cummean().over(w), t.f.mean().over(w)),
        ]

        for cumulative, static in exprs:
            actual = cumulative.name('foo')
            expected = static.over(ibis.cumulative_window()).name('foo')

            expr1 = t.projection(actual)
            expr2 = t.projection(expected)

            self._compare_sql(expr1, expr2)
Ejemplo n.º 32
0
        ),
        (
            window(preceding=5),
            'rows between 5 preceding and unbounded following',
        ),
        (
            window(following=[5, 10]),
            'rows between 5 following and 10 following',
        ),
        (
            window(preceding=[10, 5]),
            'rows between 10 preceding and 5 preceding',
        ),
        # # cumulative windows
        (
            ibis.cumulative_window(),
            'range between unbounded preceding and current row',
        ),
        # # trailing windows
        (
            ibis.trailing_window(10),
            'rows between 10 preceding and current row',
        ),
    ],
)
def test_window_frame_specs(con, window, frame):
    t = con.table('alltypes')

    ex_template = """\
SELECT sum(`d`) OVER (ORDER BY `f` {0}) AS `foo`
FROM ibis_testing.`alltypes`"""
Ejemplo n.º 33
0
        ),
        (
            window(preceding=5),
            'rows between 5 preceding and unbounded following',
        ),
        (
            window(following=[5, 10]),
            'rows between 5 following and 10 following',
        ),
        (
            window(preceding=[10, 5]),
            'rows between 10 preceding and 5 preceding',
        ),
        # # cumulative windows
        (
            ibis.cumulative_window(),
            'range between unbounded preceding and current row',
        ),
        # # trailing windows
        (
            ibis.trailing_window(10),
            'rows between 10 preceding and current row',
        ),
    ],
)
def test_window_frame_specs(con, window, frame):
    t = con.table('alltypes')

    ex_template = """\
SELECT sum(`d`) OVER (ORDER BY `f` {0}) AS `foo`
FROM ibis_testing.`alltypes`"""