Esempio n. 1
0
def test_map_length_expr(t):
    """Compiled ``length()`` of the map column must equal ``[0, None, 2]``."""
    compiled = t.map_of_integers_strings.length().compile()
    lengths = pd.Series([0, None, 2], name='map_of_integers_strings')
    expected = dd.from_pandas(lengths, npartitions=1)
    tm.assert_series_equal(compiled.compute(), expected.compute())
Esempio n. 2
0
def test_context_adjustment_window_groupby_id(time_table, time_df3):
    """Exercise window-result trimming when the window has group-by keys.

    Per the original author's note, this targets the ``trim_window_result``
    method in dask/execution/window.py and checks that a Series is trimmed
    correctly when groupby parameters are present.
    """
    # Reference: per-id 3-day rolling mean computed over the full data.
    indexed = time_df3.compute().set_index('time')
    rolling_mean = (
        indexed.groupby('id').value.rolling('3d', closed='both').mean()
    )
    # The rolling result is a MultiIndexed Series; flatten it so the rows
    # can be filtered on the ``time`` column.
    flattened = rolling_mean.reset_index()
    from_context_start = flattened[flattened.time >= Timestamp('20170105')]
    expected = from_context_start.reset_index(drop=True)['value']

    context = Timestamp('20170105'), Timestamp('20170111')

    three_days = 3 * ibis.interval(days=1)
    window = ibis.trailing_window(
        three_days, group_by='id', order_by=time_table.time
    )
    windowed_mean = time_table['value'].mean().over(window)
    # The executed result should already be adjusted to the time context.
    result = windowed_mean.execute(timecontext=context)
    tm.assert_series_equal(result, expected)
Esempio n. 3
0
def test_setting_timecontext_in_scope(time_table, time_df3):
    """Check a windowed ``mutate`` executed under a time context.

    The expected series is the 3-day rolling mean (``closed='both'``) of
    ``value`` indexed by ``time``, restricted to rows at or after
    2017-01-05 and re-indexed from zero. It is compared against the
    ``value`` column of the executed mutate expression.
    """
    expected_win_1 = (
        time_df3.compute()
        .set_index('time')
        .value.rolling('3d', closed='both')
        .mean()
    )
    expected_win_1 = expected_win_1[
        expected_win_1.index >= Timestamp('20170105')
    ].reset_index(drop=True)

    context = Timestamp('20170105'), Timestamp('20170111')
    window1 = ibis.trailing_window(
        3 * ibis.interval(days=1), order_by=time_table.time
    )
    # In the following expression, the Selection node is executed first and
    # gets the table in context ('20170105', '20170111'). Then, during window
    # execution, the table is executed again with a larger context adjusted
    # by the window's preceding days ('20170102', '20170111'). To get the
    # correct result, the cached table result with the smaller context must
    # be discarded and replaced by one covering the larger time range.
    expr = time_table.mutate(value=time_table['value'].mean().over(window1))
    result = expr.execute(timecontext=context)
    tm.assert_series_equal(result["value"], expected_win_1)
Esempio n. 4
0
def test_times_ops(t, df):
    """Check ``time().between`` on the naive datetime column.

    The '10:00'-'10:00' range is expected to match no row (all ``False``)
    and the '01:00'-'02:00' range every row (all ``True``).
    """
    cases = (
        ('10:00', '10:00', np.zeros),
        ('01:00', '02:00', np.ones),
    )
    for lower, upper, make_mask in cases:
        time_expr = t.plain_datetimes_naive.time()
        result = time_expr.between(lower, upper).compile()
        expected = dd.from_array(make_mask(len(df), dtype=bool))
        tm.assert_series_equal(result.compute(), expected.compute())
Esempio n. 5
0
def test_where_long(batting, batting_df):
    """``ibis.where`` on ``AB`` must match ``Series.where`` with 0.0 fill."""
    ab_expr = batting['AB']
    above_mean_expr = ab_expr > ab_expr.mean()
    result = ibis.where(above_mean_expr, ab_expr, 0.0).compile()

    ab = batting_df['AB']
    above_mean = ab > ab.mean()
    expected = ab.where(above_mean, other=0.0)

    tm.assert_series_equal(result.compute(), expected.compute())
Esempio n. 6
0
def test_struct_field_series(struct_table):
    """Selecting struct field ``fruit`` must give apple, pear, pear."""
    result = struct_table.s['fruit'].execute()
    fruits = pd.Series(["apple", "pear", "pear"], name="fruit")
    expected = dd.from_pandas(fruits, npartitions=1)
    tm.assert_series_equal(result.compute(), expected.compute())
Esempio n. 7
0
def test_map_value_for_key_literal_broadcast(t):
    """Look up a column's values in a literal map; expect ``[4, 1, 4]``."""
    mapping = {'a': 1, 'b': 2, 'c': 3, 'd': 4}
    lookup_table = ibis.literal(mapping)
    result = lookup_table.get(t.dup_strings).compile()
    values = pd.Series([4, 1, 4], name='dup_strings')
    expected = dd.from_pandas(values, npartitions=1)
    tm.assert_series_equal(result.compute(), expected.compute())
Esempio n. 8
0
def test_string_ops(t, df, case_func, expected_func):
    """Compare ``case_func`` on the ibis column with ``expected_func``.

    ``case_func`` is applied to ``t.strings_with_space`` and executed;
    ``expected_func`` is applied to ``df.strings_with_space``. Both results
    are computed and must be equal.
    """
    # Warnings raised inside this block are recorded instead of shown.
    with catch_warnings(record=True):
        result = case_func(t.strings_with_space).execute()
        reference = expected_func(df.strings_with_space)
        tm.assert_series_equal(result.compute(), reference.compute())
Esempio n. 9
0
def test_where_series(t, df):
    """``ibis.where`` on ``plain_int64`` must match ``Series.where``."""
    int_expr = t['plain_int64']
    above_mean_expr = int_expr > int_expr.mean()
    result = ibis.where(above_mean_expr, int_expr, 0.0).compile()

    ints = df['plain_int64']
    above_mean = ints > ints.mean()
    expected = ints.where(above_mean, other=0.0)

    tm.assert_series_equal(result.compute(), expected.compute())
Esempio n. 10
0
def test_map_value_for_key_expr(t):
    """Indexing the map column with key ``1`` must give None, None, 'a'."""
    result = t.map_of_integers_strings[1].compile()
    looked_up = pd.Series([None, None, 'a'], name='map_of_integers_strings')
    expected = dd.from_pandas(looked_up, npartitions=1)
    tm.assert_series_equal(result.compute(), expected.compute())
Esempio n. 11
0
def test_array_concat(t, df, op):
    """Apply ``op`` to a string-cast float array column and a string array.

    The reference applies the same ``op`` to the dask columns after
    converting every float in ``array_of_float64`` to ``str``.
    """
    floats_as_strings = t.array_of_float64.cast('array<string>')
    result = op(floats_as_strings, t.array_of_strings).compile()

    stringified = df.array_of_float64.apply(
        lambda values: [str(value) for value in values]
    )
    expected = op(stringified, df.array_of_strings)
    tm.assert_series_equal(result.compute(), expected.compute())
Esempio n. 12
0
def test_cast_datetime_strings_to_timestamp(t, df, column):
    """Casting a string column to 'timestamp' must match ``pd.to_datetime``.

    If the parsed reference carries a timezone, it is converted to a
    timezone-naive series before the comparison.
    """
    result = t[column].cast('timestamp').compile()

    strings = df.compute()[column]
    parsed = pd.to_datetime(strings, infer_datetime_format=True)
    expected = dd.from_pandas(parsed, npartitions=1)

    has_timezone = getattr(expected.dtype, 'tz', None) is not None
    if has_timezone:
        expected = expected.dt.tz_convert(None)
    tm.assert_series_equal(result.compute(), expected.compute())
Esempio n. 13
0
def test_cast_datetime_strings_to_date(t, df, column):
    """Casting a string column to 'date' must match normalized datetimes."""
    result = t[column].cast('date').compile()

    strings = df.compute()[column]
    parsed = pd.to_datetime(strings, infer_datetime_format=True)
    expected = dd.from_pandas(parsed.dt.normalize(), npartitions=1)
    tm.assert_series_equal(result.compute(), expected.compute())
Esempio n. 14
0
def test_round_decimal_with_negative_places(t, df):
    """Round a ``Decimal(12, 3)`` cast of ``float64_as_strings`` to -1 places.

    The compiled result must equal the decimals ``1.0E+2``, ``2.3E+2`` and
    ``-1.00E+3``. The ``df`` argument is not used by the body.
    """
    # Named ``decimal_type`` rather than ``type`` so the builtin stays usable.
    decimal_type = dt.Decimal(12, 3)
    expr = t.float64_as_strings.cast(decimal_type).round(-1)
    result = expr.compile()
    expected = dd.from_pandas(
        pd.Series(
            list(map(decimal.Decimal, ['1.0E+2', '2.3E+2', '-1.00E+3'])),
            name='float64_as_strings',
        ),
        npartitions=1,
    )
    tm.assert_series_equal(result.compute(), expected.compute())
Esempio n. 15
0
def test_map_values_expr(t):
    """``values()`` of the complex-valued map column, passed through
    ``safe_sorter``, must equal ``[None, [[], [1, 2, 3]], []]``.
    """
    compiled = t.map_of_complex_values.values().compile()
    # NOTE(review): safe_sorter is defined elsewhere; presumably it puts
    # each row's values in a deterministic order - confirm.
    result = compiled.map(safe_sorter)

    rows = [None, [[], [1, 2, 3]], []]
    reference = pd.Series(rows, dtype='object', name='map_of_complex_values')
    expected = dd.from_pandas(reference, npartitions=1)
    tm.assert_series_equal(result.compute(), expected.compute())
Esempio n. 16
0
def test_map_keys_expr(t):
    """``keys()`` of the string-keyed map column, passed through
    ``safe_sorter``, must equal ``[['a', 'b'], None, []]``.
    """
    compiled = t.map_of_strings_integers.keys().compile()
    # NOTE(review): safe_sorter is defined elsewhere; presumably it puts
    # each row's keys in a deterministic order - confirm.
    result = compiled.map(safe_sorter)

    rows = [['a', 'b'], None, []]
    reference = pd.Series(
        rows, dtype='object', name='map_of_strings_integers'
    )
    expected = dd.from_pandas(reference, npartitions=1)
    tm.assert_series_equal(result.compute(), expected.compute())
Esempio n. 17
0
def test_times_ops_with_tz(t, df, tz, rconstruct, column):
    """Check ``time().between`` with a timezone given two different ways.

    The expected boolean mask is built by ``rconstruct(len(df), dtype=bool)``
    and is compared first against ``between(..., timezone=tz)`` and then
    against ``between`` applied after casting the column to a timestamp
    type with that timezone.
    """
    expected = dd.from_array(rconstruct(len(df), dtype=bool))

    via_kwarg = t[column].time().between('01:00', '02:00', timezone=tz)
    result = via_kwarg.compile()
    tm.assert_series_equal(result.compute(), expected.compute())

    # Casting to a tz-aware timestamp must behave the same as passing the
    # timezone keyword to ``between``.
    tz_aware_type = dt.Timestamp(timezone=tz)
    via_cast = t[column].cast(tz_aware_type).time().between('01:00', '02:00')
    result = via_cast.compile()
    tm.assert_series_equal(result.compute(), expected.compute())
Esempio n. 18
0
def test_map_value_or_default_for_key_expr(t):
    """``get('a')`` on the complex map column gives None, [1, 2, 3], None."""
    result = t.map_of_complex_values.get('a').compile()

    rows = [None, [1, 2, 3], None]
    reference = pd.Series(rows, dtype='object', name='map_of_complex_values')
    expected = dd.from_pandas(reference, npartitions=1)
    tm.assert_series_equal(result.compute(), expected.compute())
Esempio n. 19
0
def test_quantile_groupby(batting, batting_df):
    """Multi-quantile of ``RBI`` per ``teamID`` via a grouped mutate.

    The reference uses ``groupby('teamID').RBI.transform`` with a helper
    that repeats the group's quantile list once for every row of the group.
    """
    frac = 0.2
    intp = 'linear'
    upper = 1 - frac

    def q_fun(x, quantile, interpolation):
        # One copy of the quantile list per row in the group.
        res = x.quantile(quantile, interpolation=interpolation).tolist()
        return [res] * len(x)

    grouped_expr = batting.groupby('teamID')
    mutated = grouped_expr.mutate(
        res=lambda tbl: tbl.RBI.quantile([frac, upper], intp)
    )
    result = mutated.res.execute()

    grouped_rbi = batting_df.groupby('teamID').RBI
    transformed = grouped_rbi.transform(
        q_fun, quantile=[frac, upper], interpolation=intp
    )
    expected = transformed.rename('res')
    tm.assert_series_equal(result.compute(), expected.compute())
Esempio n. 20
0
def test_simple_case_column(batting, batting_df):
    """Simple ``case`` on ``RBI`` must match an equivalent ``np.select``."""
    case_builder = batting.RBI.case()
    for value, label in ((5, 'five'), (4, 'four'), (3, 'three')):
        case_builder = case_builder.when(value, label)
    result = case_builder.else_('could be good?').end().execute()

    conditions = [
        batting_df.RBI == 5,
        batting_df.RBI == 4,
        batting_df.RBI == 3,
    ]
    labels = ['five', 'four', 'three']
    selected = np.select(conditions, labels, 'could be good?')
    expected = dd.from_array(selected)
    tm.assert_series_equal(result.compute(), expected.compute())
Esempio n. 21
0
def test_searched_case_column(batting, batting_df):
    """Searched ``case`` must match an equivalent ``np.select``.

    Branches: ``RBI < 5`` gives 'really bad team', ``teamID == 'PH1'`` gives
    'ph1 team', and everything else falls back to ``teamID``.
    """
    case_builder = ibis.case()
    case_builder = case_builder.when(batting.RBI < 5, 'really bad team')
    case_builder = case_builder.when(batting.teamID == 'PH1', 'ph1 team')
    result = case_builder.else_(batting.teamID).end().execute()

    conditions = [batting_df.RBI < 5, batting_df.teamID == 'PH1']
    labels = ['really bad team', 'ph1 team']
    selected = np.select(conditions, labels, batting_df.teamID)
    expected = dd.from_array(selected)
    tm.assert_series_equal(result.compute(), expected.compute())
Esempio n. 22
0
def test_cast_integer_to_date(t, df):
    """Casting ``plain_int64`` to 'date' must match day-unit datetimes."""
    result = t.plain_int64.cast('date').compile()

    frame = df.compute()
    # The reference treats the integers as day counts (``unit='D'``).
    as_days = pd.to_datetime(frame.plain_int64.values, unit='D').values
    reference = pd.Series(as_days, index=frame.index, name='plain_int64')
    expected = dd.from_pandas(reference, npartitions=1)
    tm.assert_series_equal(result.compute(), expected.compute())
Esempio n. 23
0
def test_cast_datetime_strings_to_date(t, df, column):
    """Executing a 'date' cast must match normalized parsed datetimes."""
    # TODO - this is changed from the pandas test, double check
    result = t[column].cast('date').execute()

    strings = df.compute()[column]
    parsed = pd.to_datetime(strings, infer_datetime_format=True)
    expected = dd.from_pandas(parsed.dt.normalize(), npartitions=1)
    tm.assert_series_equal(result.compute(), expected.compute())
Esempio n. 24
0
def test_nullif_inf(npartitions):
    """Chained ``nullif`` on +inf and -inf must turn both into NaN."""
    source = pd.DataFrame({'a': [np.inf, 3.14, -np.inf, 42.0]})
    df = dd.from_pandas(source, npartitions=npartitions)
    con = ibis.dask.connect({'t': df})
    column = con.table('t').a
    result = column.nullif(np.inf).nullif(-np.inf).compile()

    reference = pd.Series([np.nan, 3.14, np.nan, 42.0], name='a')
    expected = dd.from_pandas(reference, npartitions=npartitions)
    # match dask reset index behavior
    expected = expected.reset_index(drop=True)
    tm.assert_series_equal(result.compute(), expected.compute())
Esempio n. 25
0
def test_cast_integer_to_temporal_type(t, df, column):
    """Cast ``plain_int64`` to the type of ``column`` and compare.

    The reference interprets the integers as nanoseconds and localizes the
    resulting series to the target type's timezone.
    """
    target_type = t[column].type()
    result = t.plain_int64.cast(target_type).compile()

    frame = df.compute()
    as_nanos = pd.to_datetime(frame.plain_int64.values, unit='ns').values
    naive = pd.Series(as_nanos, index=frame.index, name='plain_int64')
    localized = naive.dt.tz_localize(target_type.timezone)
    expected = dd.from_pandas(localized, npartitions=1)
    tm.assert_series_equal(result.compute(), expected.compute())
Esempio n. 26
0
def test_nullif_inf():
    """Executed chained ``nullif`` on +inf and -inf must yield NaN."""
    source = pd.DataFrame({'a': [np.inf, 3.14, -np.inf, 42.0]})
    df = dd.from_pandas(source, npartitions=1)
    con = connect({'t': df})
    column = con.table('t').a
    result = column.nullif(np.inf).nullif(-np.inf).execute()

    reference = pd.Series([np.nan, 3.14, np.nan, 42.0], name='a')
    expected = dd.from_pandas(reference, npartitions=1)
    tm.assert_series_equal(result.compute(), expected.compute())
Esempio n. 27
0
def test_interval_arithmetic(op, expected):
    """Apply ``op`` to a timedelta column with itself and compare.

    ``expected`` is a callable that produces the reference values from the
    raw timedelta range; its output is wrapped in a series named ``td``.
    """
    data = pd.timedelta_range('0 days', '10 days', freq='D')
    pandas_df = pd.DataFrame({'td': data})

    backend = Backend()
    tables = {
        name: dd.from_pandas(pandas_df, npartitions=1)
        for name in ('df1', 'df2')
    }
    con = backend.connect(tables)

    td_column = con.table('df1').td
    result = op(td_column, td_column).compile()

    reference = pd.Series(expected(data, data), name='td')
    expected_series = dd.from_pandas(reference, npartitions=1)
    tm.assert_series_equal(result.compute(), expected_series.compute())
Esempio n. 28
0
def test_context_adjustment_window(
    time_table, time_df3, interval_ibis, interval_pd
):
    """Trailing-window mean executed under a time context.

    The reference is the rolling mean of ``value`` over ``interval_pd``
    (``closed='both'``), manually trimmed to rows at or after 2017-01-05.
    """
    # Build the reference over the full data, then trim it by hand.
    by_time = time_df3.set_index('time')
    rolling_mean = by_time.value.rolling(interval_pd, closed='both').mean()
    within_context = rolling_mean.index >= Timestamp('20170105')
    expected = rolling_mean[within_context].reset_index(drop=True)

    context = Timestamp('20170105'), Timestamp('20170111')

    window = ibis.trailing_window(interval_ibis, order_by=time_table.time)
    windowed_mean = time_table['value'].mean().over(window)
    # The executed result should already be adjusted to the time context.
    result = windowed_mean.execute(timecontext=context)
    tm.assert_series_equal(result, expected)
Esempio n. 29
0
def test_map_concat_expr(t):
    """Adding a literal dict to the complex map column merges the maps.

    Expected rows: ``None``, then two dicts holding keys 'a', 'b' and 'c'
    with values ``[]``, ``[4, 5, 6]`` and ``[]`` respectively.
    """
    addition = {'b': [4, 5, 6], 'c': [], 'a': []}
    result = (t.map_of_complex_values + addition).compile()

    rows = [
        None,
        {'a': [], 'b': [4, 5, 6], 'c': []},
        {'b': [4, 5, 6], 'c': [], 'a': []},
    ]
    reference = pd.Series(rows, dtype='object', name='map_of_complex_values')
    expected = dd.from_pandas(reference, npartitions=1)
    tm.assert_series_equal(result.compute(), expected.compute())
Esempio n. 30
0
def test_cast_to_decimal(t, df, type):
    """Cast ``float64_as_strings`` to a decimal type and validate the values.

    Checks that the compiled result equals a reference built with the
    ``decimal`` module, that every value's exponent magnitude equals
    ``type.scale``, and that every value has between 1 and
    ``type.precision`` digits.

    ``type`` shadows the builtin, but it is part of the test's signature
    and is therefore left unchanged.
    """
    expr = t.float64_as_strings.cast(type)
    result = expr.compile()
    context = decimal.Context(prec=type.precision)
    # Quantization template built once instead of per element, e.g.
    # precision 5 / scale 2 -> Decimal('000.00').
    quantum = decimal.Decimal(
        '{}.{}'.format(
            '0' * (type.precision - type.scale), '0' * type.scale
        )
    )
    expected = df.float64_as_strings.apply(
        lambda x: context.create_decimal(x).quantize(quantum),
        meta=("float64_as_strings", "object"),
    )
    # Compute the result a single time and reuse it for every assertion.
    computed = result.compute()
    tm.assert_series_equal(computed, expected.compute())
    values = computed.values
    assert all(
        abs(element.as_tuple().exponent) == type.scale for element in values
    )
    assert all(
        1 <= len(element.as_tuple().digits) <= type.precision
        for element in values
    )