Example #1
def test_combine_window_with_max_lookback():
    w1 = ibis.trailing_window(rows_with_max_lookback(3, ibis.interval(days=5)))
    w2 = ibis.trailing_window(rows_with_max_lookback(5, ibis.interval(days=7)))
    w3 = w1.combine(w2)
    expected = ibis.trailing_window(
        rows_with_max_lookback(3, ibis.interval(days=5)))
    assert_equal(w3, expected)
Example #2
def test_scope_look_up():
    # test that Scope looks up items properly
    scope = Scope()
    one_day = ibis.interval(days=1).op()
    one_hour = ibis.interval(hours=1).op()
    scope = scope.merge_scope(Scope({one_day: 1}, None))
    assert scope.get_value(one_hour) is None
    assert scope.get_value(one_day) is not None
Example #3
def test_timestamp_scalar_in_filter(alltypes, translate):
    table = alltypes

    expr = table.filter([
        table.timestamp_col <
        (ibis.timestamp('2010-01-01') + ibis.interval(weeks=3)),
        table.timestamp_col < (ibis.now() + ibis.interval(days=10)),
    ]).count()
    expr.execute()
Example #4
    def setup(self):
        n = 30 * int(2e5)
        data = pd.DataFrame(
            {
                'key': np.random.choice(16000, size=n),
                'low_card_key': np.random.choice(30, size=n),
                'value': np.random.rand(n),
                'timestamps': pd.date_range(
                    start='now', periods=n, freq='s'
                ).values,
                'timestamp_strings': pd.date_range(
                    start='now', periods=n, freq='s'
                ).values.astype(str),
                'repeated_timestamps': pd.date_range(
                    start='2018-09-01', periods=30
                ).repeat(int(n / 30)),
            }
        )

        t = ibis.pandas.connect({'df': data}).table('df')

        self.high_card_group_by = t.groupby(t.key).aggregate(
            avg_value=t.value.mean()
        )

        self.cast_to_dates = t.timestamps.cast(dt.date)
        self.cast_to_dates_from_strings = t.timestamp_strings.cast(dt.date)

        self.multikey_group_by_with_mutate = (
            t.mutate(dates=t.timestamps.cast('date'))
            .groupby(['low_card_key', 'dates'])
            .aggregate(avg_value=lambda t: t.value.mean())
        )

        self.simple_sort = t.sort_by([t.key])

        self.simple_sort_projection = t[['key', 'value']].sort_by(['key'])

        self.multikey_sort = t.sort_by(['low_card_key', 'key'])

        self.multikey_sort_projection = t[
            ['low_card_key', 'key', 'value']
        ].sort_by(['low_card_key', 'key'])

        low_card_window = ibis.trailing_range_window(
            ibis.interval(days=2),
            order_by=t.repeated_timestamps,
            group_by=t.low_card_key,
        )
        self.low_card_grouped_rolling = t.value.mean().over(low_card_window)

        high_card_window = ibis.trailing_range_window(
            ibis.interval(days=2),
            order_by=t.repeated_timestamps,
            group_by=t.key,
        )
        self.high_card_grouped_rolling = t.value.mean().over(high_card_window)
Example #5
def test_timestamp_scalar_in_filter(alltypes):
    # #310
    table = alltypes

    expr = table.filter([
        table.timestamp_col <
        (ibis.timestamp('2010-01-01') + ibis.interval(months=3)),
        table.timestamp_col < (ibis.now() + ibis.interval(days=10)),
    ]).count()
    expr.execute()
Example #6
def test_timestamp_scalar_in_filter(alltypes, translate):
    table = alltypes

    expr = table.filter(
        [
            table.timestamp_col
            < (ibis.timestamp('2010-01-01') + ibis.interval(weeks=3)),
            table.timestamp_col < (ibis.now() + ibis.interval(days=10)),
        ]
    ).count()
    expr.execute()
Example #7
def test_literal_equality_interval():
    a = ibis.interval(seconds=1).op()
    b = ibis.interval(minutes=1).op()

    assert a != b

    # Currently these aren't equal, but perhaps they should be?
    c = ibis.interval(seconds=60).op()
    d = ibis.interval(minutes=1).op()

    assert c != d
Example #8
def test_decimal_timestamp_builtins(con):
    table = con.table('tpch_lineitem')

    dc = table.l_quantity
    ts = table.l_receiptdate.cast('timestamp')

    exprs = [
        dc % 10,
        dc + 5,
        dc + dc,
        dc / 2,
        dc * 2,
        dc**2,
        dc.cast('double'),
        api.where(table.l_discount > 0, dc * table.l_discount, api.NA),
        dc.fillna(0),
        ts < (ibis.now() + ibis.interval(months=3)),
        ts < (ibis.timestamp('2005-01-01') + ibis.interval(months=3)),
        # hashing
        dc.hash(),
        ts.hash(),
        # truncate
        ts.truncate('y'),
        ts.truncate('q'),
        ts.truncate('month'),
        ts.truncate('d'),
        ts.truncate('w'),
        ts.truncate('h'),
        ts.truncate('minute'),
    ]

    timestamp_fields = [
        'years',
        'months',
        'days',
        'hours',
        'minutes',
        'seconds',
        'weeks',
    ]
    for field in timestamp_fields:
        if hasattr(ts, field):
            exprs.append(getattr(ts, field)())

        offset = ibis.interval(**{field: 2})
        exprs.append(ts + offset)
        exprs.append(ts - offset)

    proj_exprs = [expr.name('e%d' % i) for i, expr in enumerate(exprs)]

    projection = table[proj_exprs].limit(10)
    projection.execute()
Example #9
    def setup(self):
        n = 30 * int(2e5)
        data = pd.DataFrame({
            'key':
            np.random.choice(16000, size=n),
            'low_card_key':
            np.random.choice(30, size=n),
            'value':
            np.random.rand(n),
            'timestamps':
            pd.date_range(start='now', periods=n, freq='s').values,
            'timestamp_strings':
            pd.date_range(start='now', periods=n, freq='s').values.astype(str),
            'repeated_timestamps':
            pd.date_range(start='2018-09-01', periods=30).repeat(int(n / 30)),
        })

        t = ibis.pandas.connect({'df': data}).table('df')

        self.high_card_group_by = t.groupby(
            t.key).aggregate(avg_value=t.value.mean())

        self.cast_to_dates = t.timestamps.cast(dt.date)
        self.cast_to_dates_from_strings = t.timestamp_strings.cast(dt.date)

        self.multikey_group_by_with_mutate = (
            t.mutate(dates=t.timestamps.cast('date'))
            .groupby(['low_card_key', 'dates'])
            .aggregate(avg_value=lambda t: t.value.mean())
        )

        self.simple_sort = t.sort_by([t.key])

        self.simple_sort_projection = t[['key', 'value']].sort_by(['key'])

        self.multikey_sort = t.sort_by(['low_card_key', 'key'])

        self.multikey_sort_projection = t[
            ['low_card_key', 'key', 'value']
        ].sort_by(['low_card_key', 'key'])

        low_card_window = ibis.trailing_range_window(
            ibis.interval(days=2),
            order_by=t.repeated_timestamps,
            group_by=t.low_card_key,
        )
        self.low_card_grouped_rolling = t.value.mean().over(low_card_window)

        high_card_window = ibis.trailing_range_window(
            ibis.interval(days=2),
            order_by=t.repeated_timestamps,
            group_by=t.key,
        )
        self.high_card_grouped_rolling = t.value.mean().over(high_card_window)
Example #10
def test_setting_timecontext_in_scope(time_table, time_df3):
    expected_win_1 = (
        time_df3.compute()
        .set_index('time')
        .value.rolling('3d', closed='both')
        .mean()
    )
    expected_win_1 = expected_win_1[
        expected_win_1.index >= Timestamp('20170105')
    ].reset_index(drop=True)

    context = Timestamp('20170105'), Timestamp('20170111')
    window1 = ibis.trailing_window(
        3 * ibis.interval(days=1), order_by=time_table.time
    )
    """
    In the following expression, Selection node will be executed first and
    get table in context ('20170105', '20170101'). Then in window execution
    table will be executed again with a larger context adjusted by window
    preceeding days ('20170102', '20170111'). To get the correct result,
    the cached table result with a smaller context must be discard and updated
    to a larger time range.
    """
    expr = time_table.mutate(value=time_table['value'].mean().over(window1))
    result = expr.execute(timecontext=context)
    tm.assert_series_equal(result["value"], expected_win_1)
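
A minimal sketch (plain pandas, not ibis internals) of the context arithmetic the docstring describes: a 3-day trailing window widens the begin of the requested context by its preceding length, while the end is unchanged.

from pandas import Timedelta, Timestamp

requested = (Timestamp('20170105'), Timestamp('20170111'))
preceding = Timedelta(days=3)  # width of the trailing window above
# the result cached for `requested` must be recomputed over `adjusted`
adjusted = (requested[0] - preceding, requested[1])
assert adjusted == (Timestamp('20170102'), Timestamp('20170111'))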
Example #11
def test_window_rows_with_max_lookback(con):
    t = con.table('alltypes')
    mlb = rows_with_max_lookback(3, ibis.interval(days=3))
    w = ibis.trailing_window(mlb, order_by=t.i)
    expr = t.a.sum().over(w)
    with pytest.raises(NotImplementedError):
        ImpalaCompiler.to_sql(expr)
Example #12
def test_context_adjustment_asof_join(time_keyed_left, time_keyed_right,
                                      time_keyed_df1, time_keyed_df2):
    expr = time_keyed_left.asof_join(
        time_keyed_right,
        'time',
        by='key',
        tolerance=4 * ibis.interval(days=1),
    )[time_keyed_left, time_keyed_right.other_value]
    context = (Timestamp('20170105'), Timestamp('20170111'))
    result = expr.execute(timecontext=context)

    # compare with asof_join of manually trimmed tables
    trimmed_df1 = time_keyed_df1[time_keyed_df1['time'] >= context[0]][
        time_keyed_df1['time'] < context[1]
    ]
    trimmed_df2 = time_keyed_df2[
        time_keyed_df2['time'] >= context[0] - Timedelta(days=4)
    ][time_keyed_df2['time'] < context[1]]
    expected = dd.merge_asof(
        trimmed_df1,
        trimmed_df2,
        on='time',
        by='key',
        tolerance=Timedelta('4D'),
    ).compute()
    tm.assert_frame_equal(result, expected)
Example #13
def test_context_adjustment_window_groupby_id(time_table, time_df3):
    """This test case is meant to test trim_window_result method
    in dask/execution/window.py to see if it could trim Series
    correctly with groupby params
    """
    expected = (
        time_df3.compute()
        .set_index('time')
        .groupby('id')
        .value.rolling('3d', closed='both')
        .mean()
    )
    # This is a MultiIndexed Series
    expected = expected.reset_index()
    expected = expected[expected.time >= Timestamp('20170105')].reset_index(
        drop=True
    )['value']

    context = Timestamp('20170105'), Timestamp('20170111')

    # expected.index.name = None
    window = ibis.trailing_window(
        3 * ibis.interval(days=1), group_by='id', order_by=time_table.time
    )
    expr = time_table['value'].mean().over(window)
    # result should adjust time context accordingly
    result = expr.execute(timecontext=context)
    tm.assert_series_equal(result, expected)
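
For reference, the trimming this test exercises can be sketched in plain pandas (an illustration of the expected behaviour, not the dask backend's code): rows of the grouped, MultiIndexed rolling result whose time falls outside the context are dropped.

import pandas as pd

idx = pd.MultiIndex.from_tuples(
    [
        (1, pd.Timestamp('20170104')),
        (1, pd.Timestamp('20170105')),
        (2, pd.Timestamp('20170106')),
    ],
    names=['id', 'time'],
)
rolled = pd.Series([1.0, 2.0, 3.0], index=idx)
begin, end = pd.Timestamp('20170105'), pd.Timestamp('20170111')
times = rolled.index.get_level_values('time')
trimmed = rolled[(times >= begin) & (times < end)]  # drops the 20170104 row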
Example #14
def test_udaf_window_interval():
    df = pd.DataFrame(
        collections.OrderedDict([
            (
                "time",
                pd.date_range(start='20190105', end='20190101', freq='-1D'),
            ),
            ("key", [1, 2, 1, 2, 1]),
            ("value", np.arange(5)),
        ]))

    con = connect({'df': df})
    t = con.table('df')
    window = ibis.trailing_range_window(ibis.interval(days=2),
                                        order_by='time',
                                        group_by='key')

    expr = t.mutate(rolled=my_mean(t.value).over(window))

    result = expr.execute().sort_values(['time', 'key']).reset_index(drop=True)
    expected = (
        df.sort_values(['time', 'key'])
        .set_index('time')
        .assign(
            rolled=lambda df: df.groupby('key')
            .value.rolling('2D', closed='both')
            .mean()
            .reset_index(level=0, drop=True)
        )
        .reset_index(drop=False)
    )

    tm.assert_frame_equal(result, expected)
Example #15
def test_window_with_mlb():
    index = pd.date_range('20170501', '20170507')
    data = np.random.randn(len(index), 3)
    df = (
        pd.DataFrame(data, columns=list('abc'), index=index)
        .rename_axis('time')
        .reset_index(drop=False)
    )
    client = Backend().connect({'df': df})
    t = client.table('df')
    rows_with_mlb = rows_with_max_lookback(5, ibis.interval(days=10))
    expr = t.mutate(
        sum=lambda df: df.a.sum().over(
            ibis.trailing_window(rows_with_mlb, order_by='time', group_by='b')
        )
    )
    result = expr.execute()
    expected = df.set_index('time')
    gb_df = (
        expected.groupby(['b'])['a']
        .rolling('10d', closed='both')
        .apply(lambda s: s.iloc[-5:].sum(), raw=False)
        .sort_index(level=['time'])
        .reset_index(drop=True)
    )
    expected = expected.reset_index(drop=False).assign(sum=gb_df)
    tm.assert_frame_equal(result, expected)

    rows_with_mlb = rows_with_max_lookback(5, 10)
    with pytest.raises(com.IbisInputError):
        t.mutate(
            sum=lambda df: df.a.sum().over(
                ibis.trailing_window(rows_with_mlb, order_by='time')
            )
        )
Example #16
def prep_311_data(file):
    catalog = intake_civis.open_redshift_catalog()
    expr = catalog.public.import311.to_ibis()
    recent_srs = expr[
        (expr.createddate > (ibis.now() - ibis.interval(months=6)))
        & (expr.requesttype != "Homeless Encampment")
    ]
    df = recent_srs.execute()
    df.to_csv(file, index=False)
Example #17
def test_where_analyze_scalar_op(functional_alltypes):
    # root cause of #310
    table = functional_alltypes

    expr = table.filter([
        table.timestamp_col <
        (ibis.timestamp('2010-01-01') + ibis.interval(months=3)),
        table.timestamp_col < (ibis.now() + ibis.interval(days=10)),
    ]).count()

    result = Compiler.to_sql(expr)
    expected = """\
SELECT count(*) AS `count`
FROM functional_alltypes
WHERE (`timestamp_col` < date_add(cast({} as timestamp), INTERVAL 3 MONTH)) AND
      (`timestamp_col` < date_add(cast(now() as timestamp), INTERVAL 10 DAY))"""  # noqa: E501
    assert result == expected.format("'2010-01-01 00:00:00'")
Example #18
def test_adjust_context_complete_shift(
    time_keyed_left,
    time_keyed_right,
    time_keyed_df1,
    time_keyed_df2,
):
    """Test `adjust_context` function that completely shifts the context.

    This results in an adjusted context that is NOT a subset of the
    original context. This is unlike an `adjust_context` function
    that only expands the context.

    See #3104
    """

    # Create a contrived `adjust_context` function for
    # CustomAsOfJoin to mock this.

    @adjust_context.register(CustomAsOfJoin)
    def adjust_context_custom_asof_join(
        op: ops.AsOfJoin,
        timecontext: TimeContext,
        scope: Optional[Scope] = None,
    ) -> TimeContext:
        """Shifts both the begin and end in the same direction."""

        begin, end = timecontext
        timedelta = execute(op.tolerance)
        return (begin - timedelta, end - timedelta)

    expr = CustomAsOfJoin(
        left=time_keyed_left,
        right=time_keyed_right,
        predicates='time',
        by='key',
        tolerance=ibis.interval(days=4),
    ).to_expr()
    expr = expr[time_keyed_left, time_keyed_right.other_value]
    context = (pd.Timestamp('20170101'), pd.Timestamp('20170111'))
    result = expr.execute(timecontext=context)

    # Compare with asof_join of manually trimmed tables
    # Left table: No shift for context
    # Right table: Shift both begin and end of context by 4 days
    trimmed_df1 = time_keyed_df1[time_keyed_df1['time'] >= context[0]][
        time_keyed_df1['time'] < context[1]
    ]
    trimmed_df2 = time_keyed_df2[
        time_keyed_df2['time'] >= context[0] - pd.Timedelta(days=4)
    ][time_keyed_df2['time'] < context[1] - pd.Timedelta(days=4)]
    expected = pd.merge_asof(
        trimmed_df1,
        trimmed_df2,
        on='time',
        by='key',
        tolerance=pd.Timedelta('4D'),
    )

    tm.assert_frame_equal(result, expected)
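
Worked with the test's own numbers, the registered adjust_context shifts both endpoints by the 4-day tolerance, so the adjusted context is not a subset of the original:

import pandas as pd

begin, end = pd.Timestamp('20170101'), pd.Timestamp('20170111')
shift = pd.Timedelta(days=4)  # the join tolerance
assert (begin - shift, end - shift) == (
    pd.Timestamp('20161228'),
    pd.Timestamp('20170107'),
)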
Example #19
def test_rolling_window_with_mlb(alltypes):
    t = alltypes
    window = ibis.trailing_window(
        preceding=rows_with_max_lookback(3, ibis.interval(days=5)),
        order_by=t.timestamp_col,
    )
    expr = t['double_col'].sum().over(window)
    with pytest.raises(NotImplementedError):
        expr.execute()
Example #20
def test_complex_window(client):
    """ Test window with different sizes
        mix context adjustment for window op that require context
        adjustment and non window op that doesn't adjust context
    """
    table = client.table('time_indexed_table')
    context = (
        pd.Timestamp('20170102 07:00:00', tz='UTC'),
        pd.Timestamp('20170105', tz='UTC'),
    )
    window = ibis.trailing_window(preceding=ibis.interval(hours=1),
                                  order_by='time',
                                  group_by='key')
    window2 = ibis.trailing_window(preceding=ibis.interval(hours=2),
                                   order_by='time',
                                   group_by='key')
    window_cum = ibis.cumulative_window(order_by='time', group_by='key')
    # context should be adjusted accordingly for each window
    result_pd = (
        table.mutate(
            count_1h=table['value'].count().over(window),
            count_2h=table['value'].count().over(window2),
            count_cum=table['value'].count().over(window_cum),
        )
        .mutate(count=table['value'].count())
        .execute(timecontext=context)
    )

    df = table.execute()
    expected_win_1h = (
        df.set_index('time')
        .groupby('key')
        .value.rolling('1h', closed='both')
        .count()
        .rename('count_1h')
        .astype(int)
    )
    expected_win_2h = (
        df.set_index('time')
        .groupby('key')
        .value.rolling('2h', closed='both')
        .count()
        .rename('count_2h')
        .astype(int)
    )
    expected_cum_win = (
        df.set_index('time')
        .groupby('key')
        .value.expanding()
        .count()
        .rename('count_cum')
        .astype(int)
    )
    df = df.set_index('time')
    df = df.assign(
        count_1h=expected_win_1h.sort_index(level=['time', 'key']).reset_index(
            level='key', drop=True
        )
    )
    df = df.assign(
        count_2h=expected_win_2h.sort_index(level=['time', 'key']).reset_index(
            level='key', drop=True
        )
    )
    df = df.assign(
        count_cum=expected_cum_win.sort_index(level=['time', 'key']).reset_index(
            level='key', drop=True
        )
    )
    df['count'] = df.groupby(['key'])['value'].transform('count')
    df = df.reset_index()
    expected = (
        df[df.time.between(*(t.tz_convert(None) for t in context))]
        .sort_values(['key'])
        .reset_index(drop=True)
    )
    tm.assert_frame_equal(result_pd, expected)
Example #21
def timeunit(transform: dict, expr: ibis.Expr) -> ibis.Expr:
    """
    Apply a vega time unit transform to an ibis expression.
    https://vega.github.io/vega/docs/transforms/timeunit/

    It is translated into Ibis's truncate expression.
    https://docs.ibis-project.org/generated/ibis.expr.api.TimestampValue.truncate.html

    Parameters
    ----------
    transform: dict
        A JSON-able dictionary representing the vega transform.
    expr: ibis.Expr
        The expression to which to apply the transform.

    Returns
    -------
    transformed_expr: the transformed expression
    """
    assert transform.pop("type") == "timeunit"
    field = expr[transform.pop("field")]
    as_start, as_end = transform.pop("as")
    units = transform.pop("units")
    if transform:
        raise NotImplementedError(
            f"timeunit transform: {list(transform)} keys are not supported")
    if units == ["year"]:
        start = field.truncate("Y")
        delta = ibis.interval(years=1)
    elif units == ["year", "month"]:
        start = field.truncate("M")
        delta = ibis.interval(months=1)
    elif units == ["year", "month", "date"]:
        start = field.truncate("D")
        delta = ibis.interval(days=1)
    elif units == ["year", "month", "date", "hours"]:
        start = field.truncate("h")
        delta = ibis.interval(hours=1)
    elif units == ["year", "month", "date", "hours", "minutes"]:
        start = field.truncate("m")
        delta = ibis.interval(minutes=1)
    elif units == ["year", "month", "date", "hours", "minutes", "seconds"]:
        start = field.truncate("s")
        delta = ibis.interval(seconds=1)
    elif units == [
            "year",
            "month",
            "date",
            "hours",
            "minutes",
            "seconds",
            "milliseconds",
    ]:
        start = field.truncate("ms")
        delta = ibis.interval(milliseconds=1)
    else:
        raise NotImplementedError(
            f"timeunit transform: {units} units are not supported")
    return expr.mutate([start.name(as_start), (start + delta).name(as_end)])
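
A usage sketch for the function above, assuming an unbound table (the table and column names here are illustrative, and the transform dict mirrors the vega spec):

import ibis

t = ibis.table([('time', 'timestamp'), ('value', 'double')], name='events')
transform = {
    "type": "timeunit",
    "field": "time",
    "as": ["unit_start", "unit_end"],
    "units": ["year", "month"],
}
# adds unit_start = time truncated to month, unit_end = unit_start + 1 month
expr = timeunit(transform, t)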
Example #22
def test_multiple_windows(client):
    table = client.table('time_indexed_table')
    window1 = ibis.trailing_window(preceding=ibis.interval(hours=1),
                                   order_by='time',
                                   group_by='key')
    window2 = ibis.trailing_window(preceding=ibis.interval(hours=2),
                                   order_by='time',
                                   group_by='key')
    result = table.mutate(
        mean_1h=table['value'].mean().over(window1),
        mean_2h=table['value'].mean().over(window2),
    ).compile()
    result_pd = result.toPandas()

    df = table.compile().toPandas()
    expected_win_1 = (
        df.set_index('time')
        .groupby('key')
        .value.rolling('1h', closed='both')
        .mean()
        .rename('mean_1h')
    ).reset_index(drop=True)
    expected_win_2 = (
        df.set_index('time')
        .groupby('key')
        .value.rolling('2h', closed='both')
        .mean()
        .rename('mean_2h')
    ).reset_index(drop=True)
    tm.assert_series_equal(result_pd['mean_1h'], expected_win_1)
    tm.assert_series_equal(result_pd['mean_2h'], expected_win_2)
Example #23
def test_window_equals(alltypes):
    t = alltypes
    w1 = ibis.window(preceding=1, following=2, group_by=t.a, order_by=t.b)
    w2 = ibis.window(preceding=1, following=2, group_by=t.a, order_by=t.b)
    assert w1.equals(w2)

    w3 = ibis.window(preceding=1, following=2, group_by=t.a, order_by=t.c)
    assert not w1.equals(w3)

    w4 = ibis.range_window(preceding=ibis.interval(hours=3), group_by=t.d)
    w5 = ibis.range_window(preceding=ibis.interval(hours=3), group_by=t.d)
    assert w4.equals(w5)

    w6 = ibis.range_window(preceding=ibis.interval(hours=1), group_by=t.d)
    assert not w4.equals(w6)

    w7 = ibis.trailing_window(
        rows_with_max_lookback(3, ibis.interval(days=5)),
        group_by=t.a,
        order_by=t.b,
    )
    w8 = ibis.trailing_window(
        rows_with_max_lookback(3, ibis.interval(days=5)),
        group_by=t.a,
        order_by=t.b,
    )
    assert w7.equals(w8)

    w9 = ibis.trailing_window(
        rows_with_max_lookback(3, ibis.interval(months=5)),
        group_by=t.a,
        order_by=t.b,
    )
    assert not w7.equals(w9)
Example #24
def test_keyed_asof_join_with_tolerance(
        time_keyed_left, time_keyed_right, time_keyed_df1, time_keyed_df2):
    expr = time_keyed_left.asof_join(
        time_keyed_right,
        'time',
        by='key',
        tolerance=2 * ibis.interval(days=1)
    )[time_keyed_left, time_keyed_right.other_value]
    result = expr.execute()
    expected = pd.merge_asof(
        time_keyed_df1, time_keyed_df2,
        on='time', by='key', tolerance=pd.Timedelta('2D'))
    tm.assert_frame_equal(result[expected.columns], expected)
Example #25
def test_context_adjustment_filter_before_window(alltypes, context, ctx_col):
    window = ibis.trailing_window(ibis.interval(days=3), order_by=ORDER_BY_COL)

    expr = alltypes[alltypes['bool_col']]
    expr = expr.mutate(v1=expr[TARGET_COL].count().over(window))

    result = expr.execute(timecontext=context)

    expected = expr.execute()
    expected = filter_by_time_context(expected, context)
    expected = expected.reset_index(drop=True)

    tm.assert_frame_equal(result, expected)
Example #26
def test_context_adjustment_multi_window(time_table, time_df3):
    expected_win_1 = (
        time_df3.compute()
        .set_index('time')
        .rename(columns={'value': 'v1'})['v1']
        .rolling('3d', closed='both')
        .mean()
    )
    expected_win_1 = expected_win_1[
        expected_win_1.index >= Timestamp('20170105')
    ].reset_index(drop=True)

    expected_win_2 = (
        time_df3.compute()
        .set_index('time')
        .rename(columns={'value': 'v2'})['v2']
        .rolling('2d', closed='both')
        .mean()
    )
    expected_win_2 = expected_win_2[
        expected_win_2.index >= Timestamp('20170105')
    ].reset_index(drop=True)

    context = Timestamp('20170105'), Timestamp('20170111')
    window1 = ibis.trailing_window(
        3 * ibis.interval(days=1), order_by=time_table.time
    )
    window2 = ibis.trailing_window(
        2 * ibis.interval(days=1), order_by=time_table.time
    )
    expr = time_table.mutate(
        v1=time_table['value'].mean().over(window1),
        v2=time_table['value'].mean().over(window2),
    )
    result = expr.execute(timecontext=context)

    tm.assert_series_equal(result["v1"], expected_win_1)
    tm.assert_series_equal(result["v2"], expected_win_2)
Example #27
def test_timestamp_deltas(table, unit, compiled_unit):
    f = '`i`'

    K = 5

    offset = ibis.interval(**{unit: K})

    add_expr = table.i + offset
    result = translate(add_expr)
    assert result == f'date_add({f}, INTERVAL {K} {compiled_unit})'

    sub_expr = table.i - offset
    result = translate(sub_expr)
    assert result == f'date_sub({f}, INTERVAL {K} {compiled_unit})'
Example #28
def test_replace_window(alltypes):
    t = alltypes
    w1 = ibis.window(preceding=5, following=1, group_by=t.a, order_by=t.b)
    w2 = w1.group_by(t.c)
    expected = ibis.window(
        preceding=5, following=1, group_by=[t.a, t.c], order_by=t.b
    )
    assert_equal(w2, expected)

    w3 = w1.order_by(t.d)
    expected = ibis.window(
        preceding=5, following=1, group_by=t.a, order_by=[t.b, t.d]
    )
    assert_equal(w3, expected)

    w4 = ibis.trailing_window(
        rows_with_max_lookback(3, ibis.interval(months=3))
    )
    w5 = w4.group_by(t.a)
    expected = ibis.trailing_window(
        rows_with_max_lookback(3, ibis.interval(months=3)), group_by=t.a
    )
    assert_equal(w5, expected)
Example #29
def test_keyed_asof_join_with_tolerance(
    time_keyed_left, time_keyed_right, time_keyed_df1, time_keyed_df2
):
    expr = time_keyed_left.asof_join(
        time_keyed_right, 'time', by='key', tolerance=2 * ibis.interval(days=1)
    )[time_keyed_left, time_keyed_right.other_value]
    result = expr.execute()
    expected = pd.merge_asof(
        time_keyed_df1,
        time_keyed_df2,
        on='time',
        by='key',
        tolerance=pd.Timedelta('2D'),
    )
    tm.assert_frame_equal(result[expected.columns], expected)
Example #30
def test_window_with_preceding_expr(index):
    time = pd.date_range('20180101', '20180110')
    start = 2
    data = np.arange(start, start + len(time))
    df = pd.DataFrame({'value': data, 'time': time}, index=index(time))
    client = ibis.pandas.connect({'df': df})
    t = client.table('df')
    expected = (
        df.set_index('time')
        .value.rolling('3d', closed='both')
        .mean()
        .reset_index(drop=True)
    )
    expected.index.name = None
    day = ibis.interval(days=1)
    window = ibis.trailing_window(3 * day, order_by=t.time)
    expr = t.value.mean().over(window)
    result = expr.execute()
    tm.assert_series_equal(result, expected)
Example #31
def test_context_adjustment_window(time_table, time_df3):
    # trim data manually
    expected = (
        time_df3.set_index('time').value.rolling('3d', closed='both').mean()
    )
    expected = expected[
        expected.index >= pd.Timestamp('20170105')
    ].reset_index(drop=True)

    context = pd.Timestamp('20170105'), pd.Timestamp('20170111')

    # expected.index.name = None
    window = ibis.trailing_window(3 * ibis.interval(days=1),
                                  order_by=time_table.time)
    expr = time_table['value'].mean().over(window)
    # result should adjust time context accordingly
    result = expr.execute(timecontext=context)
    tm.assert_series_equal(result, expected)
Example #32
def test_adjust_context_scope(client):
    """Test that `adjust_context` has access to `scope` by default."""
    table = client.table('time_indexed_table')

    # WindowOp is the only context-adjusted node that the PySpark backend
    # can compile. Ideally we would test the context adjustment logic for
    # WindowOp itself, but building this test like that would unfortunately
    # affect other tests that involve WindowOp.
    # To avoid that, we'll create a dummy subclass of WindowOp and build the
    # test around that.

    class CustomWindowOp(ops.WindowOp):
        pass

    # Tell the Spark backend compiler it should compile CustomWindowOp just
    # like WindowOp
    compiles(CustomWindowOp)(compile_window_op)

    # Create an `adjust_context` function for this subclass that simply checks
    # that `scope` is passed in.
    @adjust_context.register(CustomWindowOp)
    def adjust_context_window_check_scope(
        op: CustomWindowOp,
        scope: Scope,
        timecontext: TimeContext,
    ) -> TimeContext:
        """Confirms that `scope` is passed in."""
        assert scope is not None
        return timecontext

    # Do an operation that will trigger context adjustment
    # on a CustomWindowOp
    value_count = table['value'].count()
    win = ibis.window(
        ibis.interval(hours=1),
        0,
        order_by='time',
        group_by='key',
    )
    # the argument needs to be pulled out from the alias;
    # any extensions must do the same
    value_count_over_win = CustomWindowOp(value_count.op().arg, win).to_expr()

    expr = table.mutate(value_count_over_win.name('value_count_over_win'))

    context = (pd.Timestamp('20170105'), pd.Timestamp('20170111'))
    expr.execute(timecontext=context)
Example #33
def test_max_rows_with_lookback_validate(alltypes):
    t = alltypes
    mlb = rows_with_max_lookback(3, ibis.interval(days=5))
    window = ibis.trailing_window(mlb, order_by=t.i)
    t.f.lag().over(window)

    window = ibis.trailing_window(mlb)
    with pytest.raises(com.IbisInputError):
        t.f.lag().over(window)

    window = ibis.trailing_window(mlb, order_by=t.a)
    with pytest.raises(com.IbisInputError):
        t.f.lag().over(window)

    window = ibis.trailing_window(mlb, order_by=[t.i, t.a])
    with pytest.raises(com.IbisInputError):
        t.f.lag().over(window)
Example #34
def test_window_with_preceding_expr():
    index = pd.date_range('20180101', '20180110')
    start = 2
    data = np.arange(start, start + len(index))
    df = pd.DataFrame({'value': data, 'time': index}, index=index)
    client = ibis.pandas.connect({'df': df})
    t = client.table('df')
    expected = (
        df.set_index('time')
        .value.rolling('3d', closed='both')
        .mean()
        .reset_index(drop=True)
    )
    expected.index.name = None
    day = ibis.interval(days=1)
    window = ibis.trailing_window(3 * day, order_by=t.time)
    expr = t.value.mean().over(window)
    result = expr.execute()
    tm.assert_series_equal(result, expected)
Example #35
    expr = t.mutate(win_avg=t.float_col.mean().over(w3))
    result = expr.compile()
    expected = """\
SELECT *,
       avg(`float_col`) OVER (PARTITION BY `year` ORDER BY UNIX_MICROS(`timestamp_col`) RANGE BETWEEN 4 PRECEDING AND 2 PRECEDING) AS `win_avg`
FROM `{}.testing.functional_alltypes`""".format(  # noqa: E501
        project_id
    )
    assert result == expected


@pytest.mark.parametrize(
    ('preceding', 'value'),
    [
        (5, 5),
        (ibis.interval(nanoseconds=1), 0.001),
        (ibis.interval(microseconds=1), 1),
        (ibis.interval(seconds=1), 1000000),
        (ibis.interval(minutes=1), 1000000 * 60),
        (ibis.interval(hours=1), 1000000 * 60 * 60),
        (ibis.interval(days=1), 1000000 * 60 * 60 * 24),
        (2 * ibis.interval(days=1), 1000000 * 60 * 60 * 24 * 2),
        (ibis.interval(weeks=1), 1000000 * 60 * 60 * 24 * 7),
    ],
)
def test_trailing_range_window(alltypes, preceding, value, project_id):
    t = alltypes
    w = ibis.trailing_range_window(
        preceding=preceding, order_by=t.timestamp_col
    )
    expr = t.mutate(win_avg=t.float_col.mean().over(w))
Example #36
    [
        (rlz.list_of(rlz.double, min_length=2), [1]),
        (rlz.list_of(rlz.integer), 1.1),
        (rlz.list_of(rlz.string), 'asd'),
        (rlz.list_of(identity), 3),
    ],
)
def test_invalid_list_of(validator, values):
    with pytest.raises(IbisTypeError):
        validator(values)


@pytest.mark.parametrize(
    ('units', 'value', 'expected'),
    [
        ({'H', 'D'}, ibis.interval(days=3), ibis.interval(days=3)),
        (['Y'], ibis.interval(years=3), ibis.interval(years=3)),
    ],
)
def test_valid_interval(units, value, expected):
    result = rlz.interval(value, units=units)
    assert result.equals(expected)


@pytest.mark.parametrize(
    ('units', 'value', 'expected'),
    [
        ({'Y'}, ibis.interval(hours=1), IbisTypeError),
        ({'Y', 'M', 'D'}, ibis.interval(hours=1), IbisTypeError),
        ({'Q', 'W', 'D'}, ibis.interval(seconds=1), IbisTypeError),
    ],
Example #37
pytestmark = pytest.mark.pandas


@pytest.fixture(scope='session')
def sort_kind():
    return 'mergesort'


default = pytest.mark.parametrize('default', [ibis.NA, ibis.literal('a')])
row_offset = pytest.mark.parametrize(
    'row_offset', list(map(ibis.literal, [-1, 1, 0]))
)
range_offset = pytest.mark.parametrize(
    'range_offset',
    [
        ibis.interval(days=1),
        2 * ibis.interval(days=1),
        -2 * ibis.interval(days=1),
    ],
)


@pytest.fixture
def row_window():
    return ibis.window(following=0, order_by='plain_int64')


@pytest.fixture
def range_window():
    return ibis.window(following=0, order_by='plain_datetimes_naive')
Example #38
    array = alltypes.date_string_col.split('/')
    month, day, year = array[0], array[1], array[2]
    date_col = ibis.literal('-').join(['20' + year, month, day]).cast('date')
    with pytest.raises(TypeError):
        date_col + interval


date_value = pd.Timestamp('2017-12-31')
timestamp_value = pd.Timestamp('2018-01-01 18:18:18')


@pytest.mark.parametrize(
    ('expr_fn', 'expected_fn'),
    [
        param(
            lambda t, be: t.timestamp_col + ibis.interval(days=4),
            lambda t, be: t.timestamp_col + pd.Timedelta(days=4),
            id='timestamp-add-interval',
        ),
        param(
            lambda t, be: t.timestamp_col - ibis.interval(days=17),
            lambda t, be: t.timestamp_col - pd.Timedelta(days=17),
            id='timestamp-subtract-interval',
        ),
        param(
            lambda t, be: t.timestamp_col.date() + ibis.interval(days=4),
            lambda t, be: t.timestamp_col.dt.floor('d') + pd.Timedelta(days=4),
            id='date-add-interval',
        ),
        param(
            lambda t, be: t.timestamp_col.date() - ibis.interval(days=14),
Example #39
def test_comparison_timestamp(alltypes):
    expr = alltypes.i > alltypes.i.min() + ibis.interval(days=3)
    assert isinstance(expr, ir.BooleanColumn)
Example #40
    s = t.a + t.d
    assert s.type().nullable is True

    s = t.b + t.d
    assert s.type().nullable is True

    s = t.b + t.f
    assert s.type().nullable is False


@pytest.mark.parametrize(
    'base_expr',
    [
        ibis.table([('interval_col', dt.Interval(unit='D'))]).interval_col,
        ibis.interval(seconds=42),
    ],
)
def test_interval_negate(base_expr):
    expr = -base_expr
    expr2 = base_expr.negate()
    expr3 = ibis.negate(base_expr)
    assert isinstance(expr.op(), ops.Negate)
    assert expr.equals(expr2)
    assert expr.equals(expr3)


def test_large_timestamp():
    expr = ibis.timestamp('4567-02-03')
    expected = datetime(year=4567, month=2, day=3)
    result = expr.op().value