def test_frame_limit(t, df, offset):
    """Check ``limit(n, offset=offset)`` against a slice of the source frame.

    Parameters
    ----------
    t
        Table expression fixture.
    df
        Frame fixture that the expected rows are sliced from.
    offset
        Number of leading rows to skip before taking ``n`` rows.
    """
    n = 5
    df_expr = t.limit(n, offset=offset)
    result = df_expr.execute()

    # ``.loc`` slices by label and includes BOTH endpoints, so the stop label
    # has to be ``offset + n - 1`` for the window to hold at most ``n`` rows.
    # NOTE(review): assumes ``df`` carries a default 0..N-1 integer index --
    # confirm against the fixture.
    expected = df.loc[offset:offset + n - 1].reset_index(drop=True)

    tm.assert_frame_equal(
        result[expected.columns].compute(), expected.compute()
    )
def test_outer_join(npartitions):
    """Outer-join two small dask-backed tables and compare with ``dd.merge``.

    Both inputs are built with ``npartitions`` partitions; the compiled ibis
    join is compared with an outer merge on ``test`` / ``test_2``.
    """
    left_frame = dd.from_pandas(
        pd.DataFrame({"test": [1, 2, 3], "name": ["a", "b", "c"]}),
        npartitions=npartitions,
    )
    right_frame = dd.from_pandas(
        pd.DataFrame({"test_2": [1, 5, 6], "name_2": ["d", "e", "f"]}),
        npartitions=npartitions,
    )

    backend = ibis.dask.connect({"df": left_frame, "df_2": right_frame})
    left_table = backend.table("df")
    right_table = backend.table("df_2")

    actual = left_table.outer_join(
        right_table,
        predicates=left_table["test"] == right_table["test_2"],
    ).compile()

    wanted = dd.merge(
        left_frame,
        right_frame,
        left_on="test",
        right_on="test_2",
        how="outer",
    )

    tm.assert_frame_equal(
        actual.compute(scheduler='single-threaded'),
        wanted.compute(scheduler='single-threaded'),
    )
def test_apply_to_schema_with_timezone():
    """Compare ``Schema.apply_to`` with an explicit ``astype`` to an EST dtype.

    The schema declares ``time`` as ``timestamp("EST")``; the expected frame
    casts the same column to ``datetime64[ns, EST]`` directly.
    """
    raw = pd.DataFrame(
        {'time': pd.date_range('2018-01-01', '2018-01-02', freq='H')}
    )
    df = dd.from_pandas(raw, npartitions=1)
    target_schema = ibis.schema([('time', 'timestamp("EST")')])

    wanted = df.assign(time=df.time.astype('datetime64[ns, EST]'))
    # A copy of the frame is handed to ``apply_to`` rather than ``df`` itself.
    actual = target_schema.apply_to(df.copy())

    tm.assert_frame_equal(actual.compute(), wanted.compute())
def test_join_with_window_function(players_base, players_df, batting, batting_df):
    """Grouped ``mutate`` on top of a left join versus a dask merge + transform.

    The ibis side left-joins ``batting`` to the players table on ``playerID``,
    keeps ``G``, ``playerID`` and ``teamID``, then adds the per-team mean of
    ``G`` and each row's deviation from it. The expected side builds the same
    columns with ``dd.merge`` and ``groupby(...).transform('mean')``.
    """
    players = players_base

    # this should be semi_join
    joined = batting.left_join(players, ['playerID'])
    projected = joined[batting.G, batting.playerID, batting.teamID]
    by_team = projected.groupby(projected.teamID)
    expr = by_team.mutate(
        team_avg=lambda d: d.G.mean(),
        demeaned_by_player=lambda d: d.G - d.G.mean(),
    )
    result = expr.compile()

    merged = dd.merge(
        batting_df, players_df[['playerID']], on='playerID', how='left'
    )
    base = merged[['G', 'playerID', 'teamID']]
    team_avg = base.groupby('teamID').G.transform('mean')
    expected = base.assign(
        team_avg=team_avg,
        demeaned_by_player=lambda frame: frame.G - team_avg,
    )

    tm.assert_frame_equal(
        result[expected.columns].compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
def test_context_adjustment_asof_join(time_keyed_left, time_keyed_right,
                                      time_keyed_df1, time_keyed_df2):
    """Execute an as-of join under a time context and compare with a manual trim.

    The expression joins on ``time`` by ``key`` with a four-day tolerance and
    is executed with ``timecontext=context``. The expected frame is a
    ``dd.merge_asof`` of the two source frames trimmed by hand: the left frame
    to ``[begin, end)``, the right frame to ``[begin - 4 days, end)``.
    """
    expr = time_keyed_left.asof_join(
        time_keyed_right,
        'time',
        by='key',
        tolerance=4 * ibis.interval(days=1),
    )[time_keyed_left, time_keyed_right.other_value]
    context = (Timestamp('20170105'), Timestamp('20170111'))
    result = expr.execute(timecontext=context)

    # compare with asof_join of manually trimmed tables
    # Each frame is filtered with ONE combined mask. Chaining ``df[m1][m2]``
    # would apply a mask built from the unfiltered frame to an already
    # filtered frame and depend on implicit index re-alignment.
    begin, end = context
    left_time = time_keyed_df1['time']
    trimmed_df1 = time_keyed_df1[(left_time >= begin) & (left_time < end)]

    right_time = time_keyed_df2['time']
    trimmed_df2 = time_keyed_df2[
        (right_time >= begin - Timedelta(days=4)) & (right_time < end)
    ]

    expected = dd.merge_asof(
        trimmed_df1,
        trimmed_df2,
        on='time',
        by='key',
        tolerance=Timedelta('4D'),
    ).compute()
    tm.assert_frame_equal(result, expected)
def test_intersect(client, df1, intersect_df2):
    """Compare ``intersect`` of two client tables with a merge on all columns."""
    left_table = client.table('df1')
    right_table = client.table('intersect_df2')
    actual = left_table.intersect(right_table).compile()

    # Merging on every column of ``df1`` is the reference result here.
    join_columns = list(df1.columns)
    wanted = df1.merge(intersect_df2, on=join_columns)

    tm.assert_frame_equal(actual.compute(), wanted.compute())
def test_select_on_unambiguous_join(how, func, npartitions):
    """Project columns out of a join whose inputs share no column names.

    ``how`` selects the ``<how>_join`` method on the left table and the
    ``how=`` argument of the reference ``dd.merge``. ``func`` turns the join
    expression into the expression that is compiled and compared with the
    ``a0`` / ``a1`` columns of the merge.
    """
    left_frame = dd.from_pandas(
        pd.DataFrame({'a0': [1, 2, 3], 'b1': list("aab")}),
        npartitions=npartitions,
    )
    right_frame = dd.from_pandas(
        pd.DataFrame({'a1': [2, 3, 4], 'b2': list("abc")}),
        npartitions=npartitions,
    )
    con = ibis.dask.connect({"t": left_frame, "s": right_frame})
    t = con.table("t")
    s = con.table("s")

    join_method = getattr(t, f"{how}_join")
    join = join_method(s, t.b1 == s.b2)

    merged = dd.merge(
        left_frame, right_frame, left_on=["b1"], right_on=["b2"], how=how
    )
    expected = merged[["a0", "a1"]]
    # Make sure the comparison below is not between two empty frames.
    assert not expected.compute(scheduler='single-threaded').empty

    result = func(join).compile()
    tm.assert_frame_equal(
        result.compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
def test_select_on_unambiguous_asof_join(func, npartitions):
    """Project columns out of an as-of join whose inputs share no column names.

    ``func`` turns the join expression into the expression that is compiled
    and compared with the ``a0`` / ``a1`` columns of ``dd.merge_asof``.
    """
    left_frame = dd.from_pandas(
        pd.DataFrame(
            {'a0': [1, 2, 3], 'b1': date_range("20180101", periods=3)}
        ),
        npartitions=npartitions,
    )
    right_frame = dd.from_pandas(
        pd.DataFrame(
            {'a1': [2, 3, 4], 'b2': date_range("20171230", periods=3)}
        ),
        npartitions=npartitions,
    )
    con = ibis.dask.connect({"t": left_frame, "s": right_frame})
    t = con.table("t")
    s = con.table("s")

    join = t.asof_join(s, t.b1 == s.b2)

    merged = dd.merge_asof(
        left_frame, right_frame, left_on=["b1"], right_on=["b2"]
    )
    expected = merged[["a0", "a1"]]
    # Make sure the comparison below is not between two empty frames.
    assert not expected.compute(scheduler='single-threaded').empty

    result = func(join).compile()
    tm.assert_frame_equal(
        result.compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
def test_summary_numeric_group_by(batting, batting_df):
    """Compare grouped ``summary()`` of ``G`` with hand-computed statistics.

    The reference applies a per-group function that builds a one-row frame
    holding count, null count, min, max, sum, mean and number of distinct
    values, then computes it and flattens the index back to ``teamID``.
    """

    def summarize(s):
        # One-row frame of statistics for a single group's ``G`` values.
        stats = {
            'count': s.count(),
            'nulls': s.isnull().sum(),
            'min': s.min(),
            'max': s.max(),
            'sum': s.sum(),
            'mean': s.mean(),
            'approx_nunique': s.nunique(),
        }
        return pd.DataFrame(stats, index=[0])

    result = batting.groupby('teamID').G.summary().execute()

    per_team = batting_df.groupby('teamID').G.apply(summarize)
    expected = (
        per_team.compute()
        .reset_index(level=1, drop=True)
        .reset_index()
    )

    tm.assert_frame_equal(result[expected.columns], expected)
def test_union_with_list_types(t, df, distinct):
    """Union a table with itself, with or without ``distinct``.

    When ``distinct`` is truthy the reference is ``df`` itself; otherwise it
    is ``df`` concatenated with itself under a fresh index.
    """
    result = t.union(t, distinct=distinct).compile()

    if distinct:
        expected = df
    else:
        expected = dd.concat([df, df], axis=0, ignore_index=True)

    tm.assert_frame_equal(result.compute(), expected.compute())
def test_mutate(t, df):
    """Check ``mutate`` with two derived columns against ``DataFrame.assign``."""
    plus_one = t.plain_int64 + 1
    doubled = t.plain_int64 * 2
    result = t.mutate(x=plus_one, y=doubled).compile()

    expected = df.assign(x=df.plain_int64 + 1, y=df.plain_int64 * 2)

    # Compare in the reference column order.
    actual = result[expected.columns]
    tm.assert_frame_equal(actual.compute(), expected.compute())
def test_union(client, df1, distinct):
    """Union the ``df1`` table with itself, with or without ``distinct``.

    When ``distinct`` is truthy the reference is ``df1`` itself; otherwise it
    is ``df1`` concatenated with itself under a fresh index.
    """
    table = client.table('df1')
    result = table.union(table, distinct=distinct).execute()

    if distinct:
        expected = df1
    else:
        expected = dd.concat([df1, df1], axis=0, ignore_index=True)

    tm.assert_frame_equal(result.compute(), expected.compute())
def test_multi_join_with_post_expression_filter(how, left, df1):
    """Two chained joins, each followed by a filter, versus chained merges.

    Three projections of ``left`` are joined on ``key2``: the first join uses
    ``how`` and is filtered on ``value == 4``; the second is filtered on
    ``value2 == 3``. The reference repeats those steps with ``dd.merge`` on
    the compiled projections.
    """
    lhs = left[['key', 'key2']]
    rhs = left[['key2', 'value']]
    rhs2 = left[['key2', 'value']].relabel({'value': 'value2'})

    # First join + filter.
    first_join = lhs.join(rhs, 'key2', how=how)
    first_projection = first_join[lhs, rhs.value]
    filtered = first_projection[first_projection.value == 4]

    # Second join + filter.
    second_join = filtered.join(rhs2, 'key2')
    second_projection = second_join[filtered.key, rhs2.value2]
    expr = second_projection[second_projection.value2 == 3]
    result = expr.compile()

    # Reference frames come from compiling the three projections.
    lhs_frame = lhs.compile()
    rhs_frame = rhs.compile()
    rhs2_frame = rhs2.compile()

    step_one = dd.merge(lhs_frame, rhs_frame, on='key2', how=how)
    step_one = step_one.loc[step_one.value == 4].reset_index(drop=True)
    step_two = dd.merge(step_one, rhs2_frame, on='key2')[['key', 'value2']]
    expected = step_two.loc[step_two.value2 == 3].reset_index(drop=True)

    tm.assert_frame_equal(
        result.compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
def test_summary_numeric_group_by(batting, batting_df):
    """Compare grouped ``summary()`` of ``G`` with per-group statistics.

    The reference applies a per-group function that wraps a one-row frame of
    count, null count, min, max, sum, mean and number of distinct values in a
    single-partition dask frame, then flattens the index back to ``teamID``.
    """

    def summarize(s):
        # One-row statistics frame for a group, wrapped as a dask frame.
        stats = pd.DataFrame(
            {
                'count': s.count(),
                'nulls': s.isnull().sum(),
                'min': s.min(),
                'max': s.max(),
                'sum': s.sum(),
                'mean': s.mean(),
                'approx_nunique': s.nunique(),
            },
            index=[0],
        )
        return dd.from_pandas(stats, npartitions=1)

    result = batting.groupby('teamID').G.summary().execute()

    per_team = batting_df.groupby('teamID').G.apply(summarize)
    expected = per_team.reset_index(level=1, drop=True).reset_index()

    tm.assert_frame_equal(result[expected.columns], expected)
def test_array_collect(t, df):
    """Check ``collect()`` per ``dup_strings`` group against ``apply(list)``."""
    collected = t.float64_with_zeros.collect()
    result = t.group_by(t.dup_strings).aggregate(collected=collected).compile()

    per_group = df.groupby('dup_strings').float64_with_zeros.apply(list)
    expected = per_group.reset_index().rename(
        columns={'float64_with_zeros': 'collected'}
    )

    tm.assert_frame_equal(result.compute(), expected.compute())
def test_array_repeat(t, df, n, mul):
    """Check repeating ``array_of_strings`` via ``mul`` against ``column * n``.

    ``mul`` is called as ``mul(t.array_of_strings, n)`` to build the
    expression; the reference multiplies the frame's column by ``n``.
    """
    repeated = mul(t.array_of_strings, n).name('repeated')
    result = t.projection([repeated]).compile()

    reference = pd.DataFrame({'repeated': df.array_of_strings * n})
    expected = dd.from_pandas(reference, npartitions=1)

    tm.assert_frame_equal(result.compute(), expected.compute())
def test_group_concat(t, df):
    """Check ``group_concat(',')`` per group against a manual ``','.join``."""
    concatenated = t.plain_int64.group_concat(',')
    result = t.groupby(t.dup_strings).aggregate(foo=concatenated).execute()

    # Per group: stringify ``plain_int64`` and join the values with commas.
    joined = df.groupby('dup_strings').apply(
        lambda group: ','.join(group.plain_int64.astype(str))
    )
    expected = joined.reset_index().rename(columns={0: 'foo'})

    tm.assert_frame_equal(
        result[expected.columns].compute(), expected.compute()
    )
def test_left_binary_op_gb(t, df, op, argfunc):
    """Grouped sum of ``op(*argfunc(column))`` on ``float64_with_zeros``.

    ``argfunc`` produces the arguments for ``op`` from the column; the same
    pair is applied to the ibis column and, per group, to the frame column.
    """
    aggregated = op(*argfunc(t.float64_with_zeros)).sum()
    result = t.groupby('dup_strings').aggregate(foo=aggregated).execute()

    per_group = df.groupby('dup_strings').float64_with_zeros.apply(
        lambda s: op(*argfunc(s)).sum()
    )
    expected = per_group.reset_index().rename(
        columns={'float64_with_zeros': 'foo'}
    )

    tm.assert_frame_equal(result.compute(), expected.compute())
def test_cast_on_group_by(t, df):
    """Grouped sum of ``(float64_with_zeros == 0)`` cast to ``int64``."""
    is_zero = (t.float64_with_zeros == 0).cast('int64')
    result = t.groupby(t.dup_strings).aggregate(casted=is_zero.sum()).execute()

    # The reference is computed on the materialized frame.
    materialized = df.compute()
    per_group = materialized.groupby('dup_strings').float64_with_zeros.apply(
        lambda s: (s == 0).astype('int64').sum()
    )
    expected = per_group.reset_index().rename(
        columns={'float64_with_zeros': 'casted'}
    )

    tm.assert_frame_equal(result.compute(), expected)
def test_selection(t, df):
    """Filter rows with a compound predicate and compare with frame masking.

    Predicate: (``plain_strings == 'a'`` OR ``plain_int64 == 3``) AND
    ``dup_strings == 'd'``.
    """
    table_predicate = (
        (t.plain_strings == 'a') | (t.plain_int64 == 3)
    ) & (t.dup_strings == 'd')
    result = t[table_predicate].execute()

    frame_mask = (
        (df.plain_strings == 'a') | (df.plain_int64 == 3)
    ) & (df.dup_strings == 'd')
    expected = df[frame_mask].reset_index(drop=True)

    tm.assert_frame_equal(
        result[expected.columns].compute(), expected.compute()
    )
def test_adjust_context_complete_shift(
    time_keyed_left,
    time_keyed_right,
    time_keyed_df1,
    time_keyed_df2,
):
    """Test `adjust_context` function that completely shifts the context.

    This results in an adjusted context that is NOT a subset of the
    original context. This is unlike an `adjust_context` function
    that only expands the context.

    See #3104
    """

    # Create a contrived `adjust_context` function for
    # CustomAsOfJoin to mock this.
    @adjust_context.register(CustomAsOfJoin)
    def adjust_context_custom_asof_join(
        op: ops.AsOfJoin,
        scope: Scope,
        timecontext: TimeContext,
    ) -> TimeContext:
        """Shifts both the begin and end in the same direction."""
        begin, end = timecontext
        timedelta = execute(op.tolerance)
        return (begin - timedelta, end - timedelta)

    expr = CustomAsOfJoin(
        left=time_keyed_left,
        right=time_keyed_right,
        predicates='time',
        by='key',
        tolerance=ibis.interval(days=4),
    ).to_expr()
    expr = expr[time_keyed_left, time_keyed_right.other_value]
    context = (Timestamp('20170101'), Timestamp('20170111'))
    result = expr.execute(timecontext=context)

    # Compare with asof_join of manually trimmed tables
    # Left table: No shift for context
    # Right table: Shift both begin and end of context by 4 days
    #
    # Each frame is filtered with ONE combined mask. Chaining ``df[m1][m2]``
    # would apply a mask built from the unfiltered frame to an already
    # filtered frame and depend on implicit index re-alignment.
    begin, end = context
    shift = Timedelta(days=4)

    left_time = time_keyed_df1['time']
    trimmed_df1 = time_keyed_df1[(left_time >= begin) & (left_time < end)]

    right_time = time_keyed_df2['time']
    trimmed_df2 = time_keyed_df2[
        (right_time >= begin - shift) & (right_time < end - shift)
    ]

    expected = dd.merge_asof(
        trimmed_df1,
        trimmed_df2,
        on='time',
        by='key',
        tolerance=Timedelta('4D'),
    ).compute()
    tm.assert_frame_equal(result, expected)
def test_struct_field_series_group_by_key(struct_table):
    """Group by the ``fruit`` field of struct column ``s`` and sum ``value``."""
    fruit = struct_table.s['fruit']
    total = struct_table.value.sum()
    result = struct_table.groupby(fruit).aggregate(total=total).compile()

    reference = pd.DataFrame(
        [("apple", 1), ("pear", 5)], columns=["fruit", "total"]
    )
    expected = dd.from_pandas(reference, npartitions=1)

    tm.assert_frame_equal(result.compute(), expected.compute())
def test_join_project_left_table(how, left, right, df1, df2):
    """Join on ``key``, keeping every left column plus ``right.key3``."""
    joined = left.join(right, left.key == right.key, how=how)
    result = joined[left, right.key3].compile()

    kept_columns = list(left.columns) + ['key3']
    expected = dd.merge(df1, df2, how=how, on='key')[kept_columns]

    tm.assert_frame_equal(
        result[expected.columns].compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
def test_sort_by(t, df, column, key, dask_by, dask_ascending):
    """Check ``sort_by`` against ``sort_values`` on the materialized frame.

    ``key(t, column)`` builds the ibis sort key; ``dask_by(column)`` and
    ``dask_ascending`` supply the matching ``sort_values`` arguments.
    """
    result = t.sort_by(key(t, column)).compile()

    materialized = df.compute()
    ordered = materialized.sort_values(
        dask_by(column), ascending=dask_ascending
    )
    expected = ordered.reset_index(drop=True)

    tm.assert_frame_equal(result[expected.columns].compute(), expected)
def test_join(how, left, right, df1, df2):
    """Join on ``key`` and compare with ``dd.merge`` for the given ``how``."""
    joined = left.join(right, left.key == right.key, how=how)
    result = joined[left, right.other_value, right.key3].compile()

    expected = dd.merge(df1, df2, how=how, on='key')

    # Compare in the reference column order.
    actual = result[expected.columns]
    tm.assert_frame_equal(
        actual.compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
def test_asof_join(time_left, time_right, time_df1, time_df2):
    """As-of join on ``time`` compared with ``dd.merge_asof``."""
    joined = time_left.asof_join(time_right, 'time')
    result = joined[time_left, time_right.other_value].compile()

    expected = dd.merge_asof(time_df1, time_df2, on='time')

    # Compare in the reference column order.
    actual = result[expected.columns]
    tm.assert_frame_equal(
        actual.compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
def test_weighted_average(t, df):
    """Grouped ``sum(float * int) / sum(int)`` versus a per-group apply."""
    weighted_sum = (t.plain_float64 * t.plain_int64).sum()
    avg = weighted_sum / t.plain_int64.sum()
    result = t.groupby(t.dup_strings).aggregate(avg=avg).execute()

    def group_average(group):
        # Same ratio, computed on a single group's rows.
        return (
            (group.plain_int64 * group.plain_float64).sum()
            / group.plain_int64.sum()
        )

    per_group = df.groupby('dup_strings').apply(group_average)
    expected = per_group.reset_index().rename(columns={0: 'avg'})

    tm.assert_frame_equal(
        result[expected.columns].compute(), expected.compute()
    )
def test_cross_join_project_left_table(left, right, df1, df2):
    """Cross join, keeping every left column plus ``right.key3``.

    The reference builds the cross product by inner-merging both frames on a
    constant ``dummy`` column, renames ``key_x`` back to ``key`` and keeps
    the left columns plus ``key3``.
    """
    result = left.cross_join(right)[left, right.key3].compile()

    left_keyed = df1.assign(dummy=1)
    right_keyed = df2.assign(dummy=1)
    product = dd.merge(left_keyed, right_keyed, how='inner', on='dummy')
    kept_columns = list(left.columns) + ['key3']
    expected = product.rename(columns={'key_x': 'key'})[kept_columns]

    tm.assert_frame_equal(
        result[expected.columns].compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
def test_struct_field_series_group_by_value(struct_table):
    """Group by ``key`` and sum the ``weight`` field of struct column ``s``."""
    weight_total = struct_table.s['weight'].sum()
    grouped = struct_table.groupby(struct_table.key)
    result = grouped.aggregate(total=weight_total).compile()

    # these are floats because we have a NULL value in the input data
    reference = pd.DataFrame(
        [("a", 0.0), ("b", 1.0)], columns=["key", "total"]
    )
    expected = dd.from_pandas(reference, npartitions=1)

    tm.assert_frame_equal(result.compute(), expected.compute())
def test_join_with_post_expression_selection(how, left, right, df1, df2):
    """Join on ``key`` and select ``key``, ``value`` and ``other_value``."""
    joined = left.join(right, left.key == right.key, how=how)
    result = joined[left.key, left.value, right.other_value].compile()

    merged = dd.merge(df1, df2, on='key', how=how)
    expected = merged[['key', 'value', 'other_value']]

    tm.assert_frame_equal(
        result[expected.columns].compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )