def test_map_length_expr(t):
    """Compile ``length()`` on a map column and compare to ``[0, None, 2]``."""
    compiled = t.map_of_integers_strings.length().compile()
    lengths = pd.Series([0, None, 2], name='map_of_integers_strings')
    expected = dd.from_pandas(lengths, npartitions=1)
    tm.assert_series_equal(compiled.compute(), expected.compute())
def test_context_adjustment_window_groupby_id(time_table, time_df3):
    """Check window results under a time context when grouping by ``id``.

    This test case is meant to exercise the ``trim_window_result`` method
    in ``dask/execution/window.py`` and see whether it trims a Series
    correctly when group-by parameters are present.
    """
    rolling_mean = (
        time_df3.compute()
        .set_index('time')
        .groupby('id')
        .value.rolling('3d', closed='both')
        .mean()
    )
    # The rolling result is a MultiIndexed Series; flatten it to columns
    # so rows can be filtered on ``time``.
    flattened = rolling_mean.reset_index()
    kept_rows = flattened[flattened.time >= Timestamp('20170105')]
    expected = kept_rows.reset_index(drop=True)['value']

    context = Timestamp('20170105'), Timestamp('20170111')

    three_days = 3 * ibis.interval(days=1)
    window = ibis.trailing_window(
        three_days, group_by='id', order_by=time_table.time
    )
    windowed_mean = time_table['value'].mean().over(window)
    # The result should be adjusted to the time context accordingly.
    result = windowed_mean.execute(timecontext=context)
    tm.assert_series_equal(result, expected)
def test_setting_timecontext_in_scope(time_table, time_df3):
    """Check a windowed ``mutate`` executed under a time context.

    The expected values are a 3-day rolling mean (``closed='both'``) of
    ``value`` over the computed ``time_df3`` indexed by ``time``, restricted
    to rows at or after 2017-01-05 and re-indexed from zero.
    """
    expected_win_1 = (
        time_df3.compute()
        .set_index('time')
        .value.rolling('3d', closed='both')
        .mean()
    )
    expected_win_1 = expected_win_1[
        expected_win_1.index >= Timestamp('20170105')
    ].reset_index(drop=True)

    context = Timestamp('20170105'), Timestamp('20170111')

    window1 = ibis.trailing_window(
        3 * ibis.interval(days=1), order_by=time_table.time
    )
    # In the following expression, the Selection node is executed first and
    # gets the table in context ('20170105', '20170111'). Then, during window
    # execution, the table is executed again with a larger context adjusted
    # by the window's preceding days: ('20170102', '20170111'). To get the
    # correct result, the cached table result for the smaller context must
    # be discarded and replaced with the one for the larger time range.
    expr = time_table.mutate(value=time_table['value'].mean().over(window1))
    result = expr.execute(timecontext=context)

    tm.assert_series_equal(result["value"], expected_win_1)
def test_times_ops(t, df):
    """Compile ``time().between`` for two ranges and compare to constants.

    The '10:00'-'10:00' range is expected to be all False and the
    '01:00'-'02:00' range all True, one value per row of ``df``.
    """
    # (lower bound, upper bound, constructor of the expected boolean array)
    cases = (
        ('10:00', '10:00', np.zeros),
        ('01:00', '02:00', np.ones),
    )
    for lower, upper, make_array in cases:
        times = t.plain_datetimes_naive.time()
        result = times.between(lower, upper).compile()
        expected = dd.from_array(make_array(len(df), dtype=bool))
        tm.assert_series_equal(result.compute(), expected.compute())
def test_where_long(batting, batting_df):
    """Compare ``ibis.where`` on ``AB`` against ``Series.where``.

    Values above the column mean are kept; all others become ``0.0``.
    """
    ab_expr = batting['AB']
    above_mean = ab_expr > ab_expr.mean()
    result = ibis.where(above_mean, ab_expr, 0.0).compile()

    ab = batting_df['AB']
    expected = ab.where(ab > ab.mean(), other=0.0)

    tm.assert_series_equal(result.compute(), expected.compute())
def test_struct_field_series(struct_table):
    """Extract the ``fruit`` field from struct column ``s`` and compare."""
    result = struct_table.s['fruit'].execute()
    fruits = pd.Series(["apple", "pear", "pear"], name="fruit")
    expected = dd.from_pandas(fruits, npartitions=1)
    tm.assert_series_equal(result.compute(), expected.compute())
def test_map_value_for_key_literal_broadcast(t):
    """Look up each ``dup_strings`` value in a literal map; expect [4, 1, 4]."""
    mapping = ibis.literal({'a': 1, 'b': 2, 'c': 3, 'd': 4})
    result = mapping.get(t.dup_strings).compile()
    looked_up = pd.Series([4, 1, 4], name='dup_strings')
    expected = dd.from_pandas(looked_up, npartitions=1)
    tm.assert_series_equal(result.compute(), expected.compute())
def test_string_ops(t, df, case_func, expected_func):
    """Apply ``case_func`` to the ibis column and ``expected_func`` to the
    dataframe column, then compare the two results.
    """
    # Ignore matching UserWarnings: with record=True, warnings raised inside
    # the block are captured rather than emitted.
    with catch_warnings(record=True):
        result = case_func(t.strings_with_space).execute()
        series = expected_func(df.strings_with_space)
        tm.assert_series_equal(result.compute(), series.compute())
def test_where_series(t, df):
    """Compare ``ibis.where`` on ``plain_int64`` against ``Series.where``.

    Values above the column mean are kept; all others become ``0.0``.
    """
    int_expr = t['plain_int64']
    above_mean = int_expr > int_expr.mean()
    result = ibis.where(above_mean, int_expr, 0.0).compile()

    ints = df['plain_int64']
    expected = ints.where(ints > ints.mean(), other=0.0)

    tm.assert_series_equal(result.compute(), expected.compute())
def test_map_value_for_key_expr(t):
    """Index the map column with key ``1``; expect ``[None, None, 'a']``."""
    result = t.map_of_integers_strings[1].compile()
    values = pd.Series([None, None, 'a'], name='map_of_integers_strings')
    expected = dd.from_pandas(values, npartitions=1)
    tm.assert_series_equal(result.compute(), expected.compute())
def test_array_concat(t, df, op):
    """Apply ``op`` to two string-array columns in ibis and on the dataframe.

    The float array column is cast to ``array<string>`` on the ibis side and
    stringified element-wise on the dataframe side before ``op`` is applied.
    """
    floats_as_strings = t.array_of_float64.cast('array<string>')
    strings = t.array_of_strings
    result = op(floats_as_strings, strings).compile()

    def stringify(values):
        # Dataframe-side counterpart of the ``array<string>`` cast above.
        return [str(value) for value in values]

    expected = op(
        df.array_of_float64.apply(stringify),
        df.array_of_strings,
    )
    tm.assert_series_equal(result.compute(), expected.compute())
def test_cast_datetime_strings_to_timestamp(t, df, column):
    """Cast a datetime-string column to ``timestamp`` and compare with
    ``pd.to_datetime`` applied to the computed dataframe column.
    """
    result = t[column].cast('timestamp').compile()

    strings = df.compute()[column]
    parsed = pd.to_datetime(strings, infer_datetime_format=True)
    expected = dd.from_pandas(parsed, npartitions=1)

    # If parsing produced a tz-aware dtype, drop the timezone via
    # ``tz_convert(None)`` before comparing.
    tz = getattr(expected.dtype, 'tz', None)
    if tz is not None:
        expected = expected.dt.tz_convert(None)

    tm.assert_series_equal(result.compute(), expected.compute())
def test_cast_datetime_strings_to_date(t, df, column):
    """Cast a datetime-string column to ``date`` and compare with the
    normalized ``pd.to_datetime`` parse of the computed dataframe column.
    """
    result = t[column].cast('date').compile()

    strings = df.compute()[column]
    parsed = pd.to_datetime(strings, infer_datetime_format=True)
    expected = dd.from_pandas(parsed.dt.normalize(), npartitions=1)

    tm.assert_series_equal(result.compute(), expected.compute())
def test_round_decimal_with_negative_places(t, df):
    """Cast to ``Decimal(12, 3)``, round with ``-1`` places, and compare
    with the expected ``decimal.Decimal`` values.
    """
    decimal_type = dt.Decimal(12, 3)
    result = t.float64_as_strings.cast(decimal_type).round(-1).compile()

    rounded = [
        decimal.Decimal(text) for text in ['1.0E+2', '2.3E+2', '-1.00E+3']
    ]
    expected = dd.from_pandas(
        pd.Series(rounded, name='float64_as_strings'),
        npartitions=1,
    )
    tm.assert_series_equal(result.compute(), expected.compute())
def test_map_values_expr(t):
    """Compile ``values()`` on the map column, normalise each row with
    ``safe_sorter``, and compare to the expected object series.
    """
    compiled = t.map_of_complex_values.values().compile()
    result = compiled.map(safe_sorter)

    values = pd.Series(
        [None, [[], [1, 2, 3]], []],
        dtype='object',
        name='map_of_complex_values',
    )
    expected = dd.from_pandas(values, npartitions=1)
    tm.assert_series_equal(result.compute(), expected.compute())
def test_map_keys_expr(t):
    """Compile ``keys()`` on the map column, normalise each row with
    ``safe_sorter``, and compare to the expected object series.
    """
    compiled = t.map_of_strings_integers.keys().compile()
    result = compiled.map(safe_sorter)

    keys = pd.Series(
        [['a', 'b'], None, []],
        dtype='object',
        name='map_of_strings_integers',
    )
    expected = dd.from_pandas(keys, npartitions=1)
    tm.assert_series_equal(result.compute(), expected.compute())
def test_times_ops_with_tz(t, df, tz, rconstruct, column):
    """Check ``time().between`` with a timezone, given two ways.

    The timezone is supplied once through the ``timezone`` keyword and once
    by casting the column to a tz-aware timestamp type; both results are
    compared to the boolean array built by ``rconstruct``.
    """
    expected = dd.from_array(rconstruct(len(df), dtype=bool))

    via_kwarg = t[column].time().between('01:00', '02:00', timezone=tz)
    result = via_kwarg.compile()
    tm.assert_series_equal(result.compute(), expected.compute())

    # Test that casting behavior is the same as using the timezone kwarg
    localized = t[column].cast(dt.Timestamp(timezone=tz))
    via_cast = localized.time().between('01:00', '02:00')
    result = via_cast.compile()
    tm.assert_series_equal(result.compute(), expected.compute())
def test_map_value_or_default_for_key_expr(t):
    """Call ``get('a')`` on the map column; expect ``[None, [1, 2, 3], None]``."""
    result = t.map_of_complex_values.get('a').compile()

    values = pd.Series(
        [None, [1, 2, 3], None],
        dtype='object',
        name='map_of_complex_values',
    )
    expected = dd.from_pandas(values, npartitions=1)
    tm.assert_series_equal(result.compute(), expected.compute())
def test_quantile_groupby(batting, batting_df):
    """Compare a grouped multi-quantile ``mutate`` with a groupby transform.

    Both sides compute the ``frac`` and ``1 - frac`` quantiles of ``RBI``
    per ``teamID`` using linear interpolation.
    """
    frac = 0.2
    intp = 'linear'

    def q_fun(x, quantile, interpolation):
        # Repeat the group's quantile list once per row of the group.
        res = x.quantile(quantile, interpolation=interpolation).tolist()
        return [res] * len(x)

    def rbi_quantiles(table):
        return table.RBI.quantile([frac, 1 - frac], intp)

    grouped_expr = batting.groupby('teamID').mutate(res=rbi_quantiles)
    result = grouped_expr.res.execute()

    transformed = batting_df.groupby('teamID').RBI.transform(
        q_fun, quantile=[frac, 1 - frac], interpolation=intp
    )
    expected = transformed.rename('res')

    tm.assert_series_equal(result.compute(), expected.compute())
def test_simple_case_column(batting, batting_df):
    """Compare a simple ``case()`` on ``RBI`` with an ``np.select`` result."""
    # (matched RBI value, label) pairs shared by both sides of the test.
    branches = ((5, 'five'), (4, 'four'), (3, 'three'))
    fallback = 'could be good?'

    builder = batting.RBI.case()
    for value, label in branches:
        builder = builder.when(value, label)
    result = builder.else_(fallback).end().execute()

    conditions = [batting_df.RBI == value for value, _ in branches]
    labels = [label for _, label in branches]
    expected = dd.from_array(np.select(conditions, labels, fallback))

    tm.assert_series_equal(result.compute(), expected.compute())
def test_searched_case_column(batting, batting_df):
    """Compare a searched ``ibis.case()`` with an ``np.select`` result.

    Branches: ``RBI < 5``, then ``teamID == 'PH1'``, otherwise ``teamID``.
    """
    builder = ibis.case()
    builder = builder.when(batting.RBI < 5, 'really bad team')
    builder = builder.when(batting.teamID == 'PH1', 'ph1 team')
    result = builder.else_(batting.teamID).end().execute()

    conditions = [batting_df.RBI < 5, batting_df.teamID == 'PH1']
    labels = ['really bad team', 'ph1 team']
    selected = np.select(conditions, labels, batting_df.teamID)
    expected = dd.from_array(selected)

    tm.assert_series_equal(result.compute(), expected.compute())
def test_cast_integer_to_date(t, df):
    """Cast ``plain_int64`` to ``date`` and compare with the integers
    converted by ``pd.to_datetime(..., unit='D')``.
    """
    result = t.plain_int64.cast('date').compile()

    df_computed = df.compute()
    dates = pd.to_datetime(df_computed.plain_int64.values, unit='D').values
    as_series = pd.Series(
        dates, index=df_computed.index, name='plain_int64'
    )
    expected = dd.from_pandas(as_series, npartitions=1)

    tm.assert_series_equal(result.compute(), expected.compute())
def test_cast_datetime_strings_to_date(t, df, column):
    """Cast a datetime-string column to ``date`` via ``execute()`` and
    compare with the normalized ``pd.to_datetime`` parse.
    """
    # NOTE(review): another test with this exact name appears in this
    # source; if both live in the same module, the later definition
    # replaces the earlier one -- confirm.
    # TODO - this is changed from the pandas test, double check
    result = t[column].cast('date').execute()

    computed_column = df.compute()[column]
    parsed = pd.to_datetime(computed_column, infer_datetime_format=True)
    normalized = parsed.dt.normalize()
    expected = dd.from_pandas(normalized, npartitions=1)

    tm.assert_series_equal(result.compute(), expected.compute())
def test_nullif_inf(npartitions):
    """Check that chained ``nullif`` calls replace both ``inf`` and ``-inf``
    with NaN, for a table split into ``npartitions`` partitions.
    """
    frame = pd.DataFrame({'a': [np.inf, 3.14, -np.inf, 42.0]})
    df = dd.from_pandas(frame, npartitions=npartitions)
    t = ibis.dask.connect({'t': df}).table('t')

    result = t.a.nullif(np.inf).nullif(-np.inf).compile()

    values = pd.Series([np.nan, 3.14, np.nan, 42.0], name='a')
    expected = dd.from_pandas(values, npartitions=npartitions)
    # match dask reset index behavior
    expected = expected.reset_index(drop=True)

    tm.assert_series_equal(result.compute(), expected.compute())
def test_cast_integer_to_temporal_type(t, df, column):
    """Cast ``plain_int64`` to the type of ``column`` and compare with the
    integers converted as nanoseconds and localized to that type's timezone.
    """
    column_type = t[column].type()
    result = t.plain_int64.cast(column_type).compile()

    df_computed = df.compute()
    stamps = pd.to_datetime(df_computed.plain_int64.values, unit='ns').values
    naive = pd.Series(stamps, index=df_computed.index, name='plain_int64')
    localized = naive.dt.tz_localize(column_type.timezone)
    expected = dd.from_pandas(localized, npartitions=1)

    tm.assert_series_equal(result.compute(), expected.compute())
def test_nullif_inf():
    """Check that chained ``nullif`` calls replace both ``inf`` and ``-inf``
    with NaN on a single-partition table.
    """
    # NOTE(review): another test with this exact name appears in this
    # source; if both live in the same module, the later definition
    # replaces the earlier one -- confirm.
    frame = pd.DataFrame({'a': [np.inf, 3.14, -np.inf, 42.0]})
    df = dd.from_pandas(frame, npartitions=1)
    t = connect({'t': df}).table('t')

    result = t.a.nullif(np.inf).nullif(-np.inf).execute()

    values = pd.Series([np.nan, 3.14, np.nan, 42.0], name='a')
    expected = dd.from_pandas(values, npartitions=1)

    tm.assert_series_equal(result.compute(), expected.compute())
def test_interval_arithmetic(op, expected):
    """Apply ``op`` to a timedelta column with itself and compare with
    ``expected`` applied to the underlying timedelta range.
    """
    data = pd.timedelta_range('0 days', '10 days', freq='D')
    pandas_df = pd.DataFrame({'td': data})

    backend = Backend()
    tables = {
        'df1': dd.from_pandas(pandas_df, npartitions=1),
        'df2': dd.from_pandas(pandas_df, npartitions=1),
    }
    con = backend.connect(tables)

    t1 = con.table('df1')
    result = op(t1.td, t1.td).compile()

    # ``expected`` is a callable; keep it distinct from the series it builds.
    expected_values = pd.Series(expected(data, data), name='td')
    expected_series = dd.from_pandas(expected_values, npartitions=1)

    tm.assert_series_equal(result.compute(), expected_series.compute())
def test_context_adjustment_window(time_table, time_df3, interval_ibis,
                                   interval_pd):
    """Check a trailing-window mean executed under a time context.

    The expected values are a rolling mean over ``interval_pd`` with
    ``closed='both'``, restricted to rows at or after 2017-01-05.
    """
    # trim data manually
    rolling_mean = (
        time_df3.set_index('time')
        .value.rolling(interval_pd, closed='both')
        .mean()
    )
    in_context = rolling_mean[rolling_mean.index >= Timestamp('20170105')]
    expected = in_context.reset_index(drop=True)

    context = Timestamp('20170105'), Timestamp('20170111')

    window = ibis.trailing_window(interval_ibis, order_by=time_table.time)
    windowed_mean = time_table['value'].mean().over(window)
    # The result should be adjusted to the time context accordingly.
    result = windowed_mean.execute(timecontext=context)
    tm.assert_series_equal(result, expected)
def test_map_concat_expr(t):
    """Add a literal dict to the map column and compare the compiled
    result with the expected merged maps (``None`` for the first row).
    """
    addition = {'b': [4, 5, 6], 'c': [], 'a': []}
    result = (t.map_of_complex_values + addition).compile()

    merged_rows = [
        None,
        {'a': [], 'b': [4, 5, 6], 'c': []},
        {'b': [4, 5, 6], 'c': [], 'a': []},
    ]
    merged = pd.Series(
        merged_rows, dtype='object', name='map_of_complex_values'
    )
    expected = dd.from_pandas(merged, npartitions=1)

    tm.assert_series_equal(result.compute(), expected.compute())
def test_cast_to_decimal(t, df, type):
    """Cast ``float64_as_strings`` to a decimal type and verify the values.

    Checks that the compiled result equals the column converted with
    ``decimal.Context(prec=type.precision)`` and quantized to ``type.scale``
    fractional digits, that every element's exponent magnitude equals
    ``type.scale``, and that every element has between 1 and
    ``type.precision`` digits.

    ``type`` shadows the builtin; the name is kept because it is part of
    the test's signature.
    """
    expr = t.float64_as_strings.cast(type)
    # Materialise the result once; all three checks below reuse it instead
    # of recomputing the dask graph for each assertion.
    result = expr.compile().compute()

    context = decimal.Context(prec=type.precision)
    # Quantize template: (precision - scale) zeros, a dot, then scale zeros.
    # It is the same for every element, so build it once outside the lambda.
    template = decimal.Decimal(
        '{}.{}'.format(
            '0' * (type.precision - type.scale), '0' * type.scale
        )
    )
    expected = df.float64_as_strings.apply(
        lambda x: context.create_decimal(x).quantize(template),
        meta=("float64_as_strings", "object"),
    )

    tm.assert_series_equal(result, expected.compute())

    values = result.values
    assert all(
        abs(element.as_tuple().exponent) == type.scale for element in values
    )
    assert all(
        1 <= len(element.as_tuple().digits) <= type.precision
        for element in values
    )