Example #1
def test_frame_limit(t, df, offset):
    n = 5
    df_expr = t.limit(n, offset=offset)
    result = df_expr.compile()
    # dask .loc slicing is label-based and inclusive, so the expected
    # window must stop at offset + n - 1 to match limit(n, offset)
    expected = df.loc[offset:offset + n - 1].reset_index(drop=True)
    tm.assert_frame_equal(result[expected.columns].compute(),
                          expected.compute())
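
The snippets in this listing share a common set of imports that are not shown. A minimal sketch; the commented-out ibis-internal imports are best-effort assumptions whose module paths vary between ibis releases:

import dask.dataframe as dd
import pandas as pd
import pandas.testing as tm
from pandas import Timedelta, Timestamp, date_range

import ibis
import ibis.expr.operations as ops

# The timecontext examples additionally rely on ibis internals roughly
# like the following (module paths are assumptions and differ between
# releases), plus a test-local CustomAsOfJoin operation defined in the
# suite itself:
# from ibis.expr.scope import Scope
# from ibis.expr.timecontext import adjust_context
# from ibis.expr.typing import TimeContext
# from ibis.backends.dask.core import execute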
Example #2
def test_outer_join(npartitions):
    df = dd.from_pandas(
        pd.DataFrame({"test": [1, 2, 3], "name": ["a", "b", "c"]}),
        npartitions=npartitions,
    )
    df_2 = dd.from_pandas(
        pd.DataFrame({"test_2": [1, 5, 6], "name_2": ["d", "e", "f"]}),
        npartitions=npartitions,
    )

    conn = ibis.dask.connect({"df": df, "df_2": df_2})

    ibis_table_1 = conn.table("df")
    ibis_table_2 = conn.table("df_2")

    joined = ibis_table_1.outer_join(
        ibis_table_2,
        predicates=ibis_table_1["test"] == ibis_table_2["test_2"],
    )
    result = joined.compile()
    expected = dd.merge(
        df,
        df_2,
        left_on="test",
        right_on="test_2",
        how="outer",
    )
    tm.assert_frame_equal(
        result.compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
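
Most of the remaining tests receive `t`, `df`, and friends as pytest fixtures rather than building tables inline as above. A hypothetical sketch of those two fixtures, with the column names inferred from the tests that use them (the real definitions live in the suite's conftest.py):

import pytest

@pytest.fixture
def df(npartitions):
    # columns referenced by the operation tests in this listing
    return dd.from_pandas(
        pd.DataFrame(
            {
                'plain_int64': [1, 2, 3],
                'plain_float64': [4.0, 5.0, 6.0],
                'plain_strings': ['a', 'b', 'c'],
                'dup_strings': ['d', 'a', 'd'],
                'float64_with_zeros': [0.0, 1.0, 0.0],
            }
        ),
        npartitions=npartitions,
    )

@pytest.fixture
def t(df):
    # expose the dask frame to ibis under the name 'df'
    return ibis.dask.connect({'df': df}).table('df')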
Example #3
def test_apply_to_schema_with_timezone():
    data = {'time': pd.date_range('2018-01-01', '2018-01-02', freq='H')}
    df = dd.from_pandas(pd.DataFrame(data), npartitions=1)
    expected = df.assign(time=df.time.astype('datetime64[ns, EST]'))
    desired_schema = ibis.schema([('time', 'timestamp("EST")')])
    result = desired_schema.apply_to(df.copy())
    tm.assert_frame_equal(result.compute(), expected.compute())
Example #4
def test_join_with_window_function(players_base, players_df, batting,
                                   batting_df):
    players = players_base

    # this should be semi_join
    tbl = batting.left_join(players, ['playerID'])
    t = tbl[batting.G, batting.playerID, batting.teamID]
    expr = t.groupby(t.teamID).mutate(
        team_avg=lambda d: d.G.mean(),
        demeaned_by_player=lambda d: d.G - d.G.mean(),
    )
    result = expr.compile()

    expected = dd.merge(batting_df,
                        players_df[['playerID']],
                        on='playerID',
                        how='left')[['G', 'playerID', 'teamID']]
    team_avg = expected.groupby('teamID').G.transform('mean')
    expected = expected.assign(team_avg=team_avg,
                               demeaned_by_player=lambda df: df.G - team_avg)

    tm.assert_frame_equal(
        result[expected.columns].compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
Example #5
def test_context_adjustment_asof_join(time_keyed_left, time_keyed_right,
                                      time_keyed_df1, time_keyed_df2):
    expr = time_keyed_left.asof_join(
        time_keyed_right,
        'time',
        by='key',
        tolerance=4 * ibis.interval(days=1))[time_keyed_left,
                                             time_keyed_right.other_value]
    context = (Timestamp('20170105'), Timestamp('20170111'))
    result = expr.execute(timecontext=context)

    # compare with asof_join of manually trimmed tables
    trimmed_df1 = time_keyed_df1[time_keyed_df1['time'] >= context[0]][
        time_keyed_df1['time'] < context[1]]
    trimmed_df2 = time_keyed_df2[time_keyed_df2['time'] >= context[0] -
                                 Timedelta(days=4)][
                                     time_keyed_df2['time'] < context[1]]
    expected = dd.merge_asof(
        trimmed_df1,
        trimmed_df2,
        on='time',
        by='key',
        tolerance=Timedelta('4D'),
    ).compute()
    tm.assert_frame_equal(result, expected)
Example #6
def test_intersect(client, df1, intersect_df2):
    t1 = client.table('df1')
    t2 = client.table('intersect_df2')
    expr = t1.intersect(t2)
    result = expr.compile()
    expected = df1.merge(intersect_df2, on=list(df1.columns))
    tm.assert_frame_equal(result.compute(), expected.compute())
Example #7
def test_select_on_unambiguous_join(how, func, npartitions):
    df_t = dd.from_pandas(
        pd.DataFrame({
            'a0': [1, 2, 3],
            'b1': list("aab")
        }),
        npartitions=npartitions,
    )
    df_s = dd.from_pandas(
        pd.DataFrame({
            'a1': [2, 3, 4],
            'b2': list("abc")
        }),
        npartitions=npartitions,
    )
    con = ibis.dask.connect({"t": df_t, "s": df_s})
    t = con.table("t")
    s = con.table("s")
    method = getattr(t, f"{how}_join")
    join = method(s, t.b1 == s.b2)
    expected = dd.merge(df_t, df_s, left_on=["b1"], right_on=["b2"],
                        how=how)[["a0", "a1"]]
    assert not expected.compute(scheduler='single-threaded').empty
    expr = func(join)
    result = expr.compile()
    tm.assert_frame_equal(
        result.compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
Example #8
def test_select_on_unambiguous_asof_join(func, npartitions):
    df_t = dd.from_pandas(
        pd.DataFrame({
            'a0': [1, 2, 3],
            'b1': date_range("20180101", periods=3)
        }),
        npartitions=npartitions,
    )
    df_s = dd.from_pandas(
        pd.DataFrame({
            'a1': [2, 3, 4],
            'b2': date_range("20171230", periods=3)
        }),
        npartitions=npartitions,
    )
    con = ibis.dask.connect({"t": df_t, "s": df_s})
    t = con.table("t")
    s = con.table("s")
    join = t.asof_join(s, t.b1 == s.b2)
    expected = dd.merge_asof(df_t, df_s, left_on=["b1"],
                             right_on=["b2"])[["a0", "a1"]]
    assert not expected.compute(scheduler='single-threaded').empty
    expr = func(join)
    result = expr.compile()
    tm.assert_frame_equal(
        result.compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
Example #9
def test_summary_numeric_group_by(batting, batting_df):
    expr = batting.groupby('teamID').G.summary()
    result = expr.execute()
    expected = (
        batting_df.groupby('teamID')
        .G.apply(
            lambda s: pd.DataFrame(
                {
                    'count': s.count(),
                    'nulls': s.isnull().sum(),
                    'min': s.min(),
                    'max': s.max(),
                    'sum': s.sum(),
                    'mean': s.mean(),
                    'approx_nunique': s.nunique(),
                },
                index=[0],
            )
        )
        .compute()
        .reset_index(level=1, drop=True)
        .reset_index()
    )
    columns = expected.columns

    tm.assert_frame_equal(result[columns], expected)
Example #10
def test_union_with_list_types(t, df, distinct):
    expr = t.union(t, distinct=distinct)
    result = expr.compile()
    expected = (
        df if distinct else dd.concat([df, df], axis=0, ignore_index=True)
    )
    tm.assert_frame_equal(result.compute(), expected.compute())
Example #11
def test_mutate(t, df):
    expr = t.mutate(x=t.plain_int64 + 1, y=t.plain_int64 * 2)
    result = expr.compile()
    expected = df.assign(x=df.plain_int64 + 1, y=df.plain_int64 * 2)
    tm.assert_frame_equal(
        result[expected.columns].compute(), expected.compute()
    )
Example #12
def test_union(client, df1, distinct):
    t = client.table('df1')
    expr = t.union(t, distinct=distinct)
    result = expr.compile()
    expected = (df1 if distinct else dd.concat(
        [df1, df1], axis=0, ignore_index=True))
    tm.assert_frame_equal(result.compute(), expected.compute())
Example #13
def test_multi_join_with_post_expression_filter(how, left, df1):
    lhs = left[['key', 'key2']]
    rhs = left[['key2', 'value']]
    rhs2 = left[['key2', 'value']].relabel({'value': 'value2'})

    joined = lhs.join(rhs, 'key2', how=how)
    projected = joined[lhs, rhs.value]
    filtered = projected[projected.value == 4]

    joined2 = filtered.join(rhs2, 'key2')
    projected2 = joined2[filtered.key, rhs2.value2]
    expr = projected2[projected2.value2 == 3]

    result = expr.compile()

    # note: this rebinds the df1 fixture argument to the compiled lhs table
    df1 = lhs.compile()
    df2 = rhs.compile()
    df3 = rhs2.compile()
    expected = dd.merge(df1, df2, on='key2', how=how)
    expected = expected.loc[expected.value == 4].reset_index(drop=True)
    expected = dd.merge(expected, df3, on='key2')[['key', 'value2']]
    expected = expected.loc[expected.value2 == 3].reset_index(drop=True)

    tm.assert_frame_equal(
        result.compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
Example #14
def test_summary_numeric_group_by(batting, batting_df):
    expr = batting.groupby('teamID').G.summary()
    result = expr.execute()
    expected = (
        batting_df.groupby('teamID')
        .G.apply(
            # dask's groupby-apply runs the function on pandas partitions,
            # so it must return a pandas object, not a dask collection
            lambda s: pd.DataFrame(
                dict(
                    count=s.count(),
                    nulls=s.isnull().sum(),
                    min=s.min(),
                    max=s.max(),
                    sum=s.sum(),
                    mean=s.mean(),
                    approx_nunique=s.nunique(),
                ),
                index=[0],
            )
        )
        .compute()
        .reset_index(level=1, drop=True)
        .reset_index()
    )
    columns = expected.columns

    tm.assert_frame_equal(result[columns], expected)
Example #15
def test_array_collect(t, df):
    expr = t.group_by(t.dup_strings).aggregate(
        collected=t.float64_with_zeros.collect()
    )
    result = expr.compile()
    expected = (
        df.groupby('dup_strings')
        .float64_with_zeros.apply(list)
        .reset_index()
        .rename(columns={'float64_with_zeros': 'collected'})
    )
    tm.assert_frame_equal(result.compute(), expected.compute())
Example #16
def test_array_repeat(t, df, n, mul):
    expr = t.projection([mul(t.array_of_strings, n).name('repeated')])
    result = expr.compile()
    expected = dd.from_pandas(
        pd.DataFrame({'repeated': df.array_of_strings * n}),
        npartitions=1,
    )
    tm.assert_frame_equal(result.compute(), expected.compute())
Example #17
def test_group_concat(t, df):
    expr = t.groupby(t.dup_strings).aggregate(
        foo=t.plain_int64.group_concat(',')
    )
    result = expr.compile()
    expected = (
        df.groupby('dup_strings')
        .apply(lambda df: ','.join(df.plain_int64.astype(str)))
        .reset_index()
        .rename(columns={0: 'foo'})
    )
    tm.assert_frame_equal(result[expected.columns].compute(),
                          expected.compute())
Example #18
def test_left_binary_op_gb(t, df, op, argfunc):
    expr = t.groupby('dup_strings').aggregate(
        foo=op(*argfunc(t.float64_with_zeros)).sum()
    )
    result = expr.compile()
    expected = (
        df.groupby('dup_strings')
        .float64_with_zeros.apply(lambda s: op(*argfunc(s)).sum())
        .reset_index()
        .rename(columns={'float64_with_zeros': 'foo'})
    )
    tm.assert_frame_equal(result.compute(), expected.compute())
Example #19
def test_cast_on_group_by(t, df):
    expr = t.groupby(t.dup_strings).aggregate(
        casted=(t.float64_with_zeros == 0).cast('int64').sum()
    )
    result = expr.compile()
    expected = (
        df.compute()
        .groupby('dup_strings')
        .float64_with_zeros.apply(lambda s: (s == 0).astype('int64').sum())
        .reset_index()
        .rename(columns={'float64_with_zeros': 'casted'})
    )
    tm.assert_frame_equal(result.compute(), expected)
Example #20
def test_selection(t, df):
    expr = t[((t.plain_strings == 'a') | (t.plain_int64 == 3))
             & (t.dup_strings == 'd')]
    result = expr.compile()
    expected = df[((df.plain_strings == 'a') | (df.plain_int64 == 3))
                  & (df.dup_strings == 'd')].reset_index(drop=True)
    tm.assert_frame_equal(result[expected.columns].compute(),
                          expected.compute())
Example #21
def test_adjust_context_complete_shift(
    time_keyed_left,
    time_keyed_right,
    time_keyed_df1,
    time_keyed_df2,
):
    """Test `adjust_context` function that completely shifts the context.

    This results in an adjusted context that is NOT a subset of the
    original context. This is unlike an `adjust_context` function
    that only expands the context.

    See #3104
    """

    # Create a contrived `adjust_context` function for
    # CustomAsOfJoin to mock this.

    @adjust_context.register(CustomAsOfJoin)
    def adjust_context_custom_asof_join(
        op: ops.AsOfJoin,
        scope: Scope,
        timecontext: TimeContext,
    ) -> TimeContext:
        """Shifts both the begin and end in the same direction."""
        begin, end = timecontext
        timedelta = execute(op.tolerance)
        return (begin - timedelta, end - timedelta)

    expr = CustomAsOfJoin(
        left=time_keyed_left,
        right=time_keyed_right,
        predicates='time',
        by='key',
        tolerance=ibis.interval(days=4),
    ).to_expr()
    expr = expr[time_keyed_left, time_keyed_right.other_value]
    context = (Timestamp('20170101'), Timestamp('20170111'))
    result = expr.execute(timecontext=context)

    # Compare with asof_join of manually trimmed tables
    # Left table: No shift for context
    # Right table: Shift both begin and end of context by 4 days
    trimmed_df1 = time_keyed_df1[time_keyed_df1['time'] >= context[0]][
        time_keyed_df1['time'] < context[1]
    ]
    trimmed_df2 = time_keyed_df2[
        time_keyed_df2['time'] >= context[0] - Timedelta(days=4)
    ][time_keyed_df2['time'] < context[1] - Timedelta(days=4)]
    expected = dd.merge_asof(
        trimmed_df1,
        trimmed_df2,
        on='time',
        by='key',
        tolerance=Timedelta('4D'),
    ).compute()

    tm.assert_frame_equal(result, expected)
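
For concreteness, the endpoint arithmetic the custom `adjust_context` performs on the right table's context in this test works out as follows (a worked sketch, not part of the suite):

context = (Timestamp('20170101'), Timestamp('20170111'))
tolerance = Timedelta(days=4)
# both endpoints move back by the tolerance, so the adjusted context is
# shifted rather than expanded and is not a subset of the original
adjusted = (context[0] - tolerance, context[1] - tolerance)
assert adjusted == (Timestamp('20161228'), Timestamp('20170107'))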
Example #22
def test_struct_field_series_group_by_key(struct_table):
    t = struct_table
    expr = t.groupby(t.s['fruit']).aggregate(total=t.value.sum())
    result = expr.compile()
    expected = dd.from_pandas(
        pd.DataFrame([("apple", 1), ("pear", 5)], columns=["fruit", "total"]),
        npartitions=1,
    )
    tm.assert_frame_equal(result.compute(), expected.compute())
Example #23
def test_join_project_left_table(how, left, right, df1, df2):
    expr = left.join(right, left.key == right.key, how=how)[left, right.key3]
    result = expr.compile()
    expected = dd.merge(df1, df2, how=how,
                        on='key')[list(left.columns) + ['key3']]
    tm.assert_frame_equal(
        result[expected.columns].compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
Example #24
def test_sort_by(t, df, column, key, dask_by, dask_ascending):
    expr = t.sort_by(key(t, column))
    result = expr.compile()
    expected = (
        df.compute()
        .sort_values(dask_by(column), ascending=dask_ascending)
        .reset_index(drop=True)
    )
    tm.assert_frame_equal(result[expected.columns].compute(), expected)
Example #25
def test_join(how, left, right, df1, df2):
    expr = left.join(right, left.key == right.key,
                     how=how)[left, right.other_value, right.key3]
    result = expr.compile()
    expected = dd.merge(df1, df2, how=how, on='key')
    tm.assert_frame_equal(
        result[expected.columns].compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
Example #26
def test_asof_join(time_left, time_right, time_df1, time_df2):
    expr = time_left.asof_join(time_right, 'time')[time_left,
                                                   time_right.other_value]
    result = expr.compile()
    expected = dd.merge_asof(time_df1, time_df2, on='time')
    tm.assert_frame_equal(
        result[expected.columns].compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
Example #27
def test_weighted_average(t, df):
    expr = t.groupby(t.dup_strings).aggregate(
        avg=(t.plain_float64 * t.plain_int64).sum() / t.plain_int64.sum()
    )
    result = expr.compile()
    expected = (
        df.groupby('dup_strings')
        .apply(
            lambda df: (df.plain_int64 * df.plain_float64).sum()
            / df.plain_int64.sum()
        )
        .reset_index()
        .rename(columns={0: 'avg'})
    )
    tm.assert_frame_equal(result[expected.columns].compute(),
                          expected.compute())
Example #28
def test_cross_join_project_left_table(left, right, df1, df2):
    expr = left.cross_join(right)[left, right.key3]
    result = expr.compile()
    expected = dd.merge(
        df1.assign(dummy=1), df2.assign(dummy=1), how='inner', on='dummy'
    ).rename(columns={'key_x': 'key'})[list(left.columns) + ['key3']]
    tm.assert_frame_equal(
        result[expected.columns].compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
Example #29
def test_struct_field_series_group_by_value(struct_table):
    t = struct_table
    expr = t.groupby(t.key).aggregate(total=t.s['weight'].sum())
    result = expr.compile()
    # these are floats because we have a NULL value in the input data
    expected = dd.from_pandas(
        pd.DataFrame([("a", 0.0), ("b", 1.0)], columns=["key", "total"]),
        npartitions=1,
    )
    tm.assert_frame_equal(result.compute(), expected.compute())
Example #30
def test_join_with_post_expression_selection(how, left, right, df1, df2):
    join = left.join(right, left.key == right.key, how=how)
    expr = join[left.key, left.value, right.other_value]
    result = expr.compile()
    expected = dd.merge(df1, df2, on='key',
                        how=how)[['key', 'value', 'other_value']]
    tm.assert_frame_equal(
        result[expected.columns].compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )