Esempio n. 1
0
def test_join_on_list_arg(backend):
    """Inner join accepts a list of column names as its ``on`` argument.

    Both inputs get a ``jj`` column copied from ``ii`` so that the join can
    be requested on ``["ii", "jj"]``. The expected result is the first two
    rows of the left-hand data with a ``y`` column of ``["a", "b"]``.
    """
    # TODO: how to validate how cols are being matched up?
    left_data = DF1.assign(jj=lambda frame: frame.ii)
    left_tbl = backend.load_df(left_data)

    right_data = DF2.assign(jj=lambda frame: frame.ii)
    right_tbl = backend.load_df(right_data)

    joined = inner_join(left_tbl, right_tbl, ["ii", "jj"])
    result = joined >> collect()

    expected = left_data.iloc[:2, :].assign(y=["a", "b"])
    assert_frame_sort_equal(result, expected)
Esempio n. 2
0
def test_join_on_same_col_multiple_times(backend):
    """Inner join where two left-hand columns are both keyed to one right column.

    The ``on`` mapping ``{("ii", "jj"): "ii"}`` pairs both ``ii`` and ``jj``
    from the left table with ``ii`` from the right table. Only the rows where
    the left ``ii`` and ``jj`` agree (the first two) are expected to survive.

    ``backend`` is requested as a fixture, as in the other tests in this file
    that call ``backend.load_df``; without it the name is not bound inside
    the test body.
    """
    data = data_frame(ii=[1, 2, 3], jj=[1, 2, 9])
    df_a = backend.load_df(data)
    df_b = backend.load_df(data_frame(ii=[1, 2, 3]))

    out = inner_join(df_a, df_b, {("ii", "jj"): "ii"}) >> collect()
    # keeps all but last row
    assert_frame_sort_equal(out, data.iloc[:2, ])
Esempio n. 3
0
def test_join_suffixes_dupe_names(df1):
    """Self-joining on ``ii`` yields ``_x`` and ``_y`` copies of other columns.

    Every column of ``DF1`` other than the join key must appear in the
    collected result once with each suffix.
    """
    joined = inner_join(df1, df1, {"ii": "ii"}) >> collect()

    is_not_key = DF1.columns != "ii"
    other_cols = DF1.columns[is_not_key]

    # Left-hand duplicates are checked first, then right-hand ones.
    for col_suffix in ("_x", "_y"):
        assert all((other_cols + col_suffix).isin(joined))
Esempio n. 4
0
def test_join_on_missing_col(df1, df2):
    """Joining on a column name that does not exist raises ``KeyError``.

    The unknown name ``"ABCDEF"`` is tried first as the key of the ``on``
    mapping and then as its value.
    """
    bad_mappings = (
        {"ABCDEF": "ii"},
        {"ii": "ABCDEF"},
    )
    for on_cols in bad_mappings:
        with pytest.raises(KeyError):
            inner_join(df1, df2, on_cols)
Esempio n. 5
0
def test_join_on_str_arg(df1, df2):
    """Inner join accepts a single column name passed as a plain string.

    The expected result is the first two rows of ``DF1`` with a ``y`` column
    of ``["a", "b"]``.
    """
    expected = DF1.iloc[:2, :].assign(y=["a", "b"])

    result = inner_join(df1, df2, "ii") >> collect()

    assert_frame_sort_equal(result, expected)
Esempio n. 6
0
def test_join_diff_vars_keeps_left(backend, df1, df2_jj):
    """Joining ``ii`` to ``jj`` keeps only the left-hand key column name.

    The collected result must have exactly the columns ``ii``, ``x`` and
    ``y``, in that order, with no ``jj`` column.
    """
    joined = inner_join(df1, df2_jj, {"ii": "jj"})
    result = joined >> collect()

    expected_columns = ["ii", "x", "y"]
    assert result.columns.tolist() == expected_columns
Esempio n. 7
0
def test_basic_inner_join(df1, df2):
    """Inner join on ``ii`` given as a dict mapping matches the expected rows.

    The expected result is the first two rows of ``DF1`` with a ``y`` column
    of ``["a", "b"]``.
    """
    expected = DF1.iloc[:2, :].assign(y=["a", "b"])

    joined = inner_join(df1, df2, {"ii": "ii"})
    result = joined >> collect()

    assert_frame_sort_equal(result, expected)
Esempio n. 8
0
def test_inner_join_arrange(backend, df1, df2):
    """A join result has an empty ``order_by`` even if its input was arranged."""
    # NOTE: joins are free to scramble order in SQL. TODO: check dplyr
    sorted_left = arrange(df1, _.ii)
    joined = inner_join(sorted_left, df2, on="ii")

    assert joined.order_by == ()
Esempio n. 9
0
def after_join(
        lhs, rhs,
        by_time, by_user,
        mode = "inner",
        type = "first-firstafter",
        max_gap = None,
        min_gap = None,
        gap_col = None,
        suffix = ("_x", "_y")
        ):
    """Join rows of ``lhs`` to rows of ``rhs`` that occur at or after them.

    Rows are paired within the same user (``by_user``) whenever the left
    time value is less than or equal to the right time value (``by_time``).
    The pairs are then used to join the two tables according to ``mode``.

    Parameters
    ----------
    lhs, rhs
        Tables to join. They are piped through ``arrange``, ``mutate`` and
        the join verbs, so they must be objects those verbs accept.
    by_time, by_user
        Join keys for the time and user columns. Each is converted by
        ``_get_key_tuple`` into a (left name, right name) pair.
    mode
        One of "inner", "left", "right", "full", "outer" (passed to ``join``
        as ``how``), or "semi" / "anti" (dispatched to ``semi_join`` /
        ``anti_join``).
    type
        String of the form "<lhs type>-<rhs type>", e.g. "first-firstafter".
        Each half is passed on to ``distinct_events``. A right-hand type of
        "firstafter" keeps only the earliest matching right-hand row for
        each left-hand row.
    max_gap, min_gap, gap_col
        Not implemented; each must be left as ``None``.
    suffix
        Currently unused (see the TODO on the final join).

    Raises
    ------
    NotImplementedError
        If any of ``max_gap``, ``min_gap``, ``gap_col`` is given, or if the
        left-hand type is "firstwithin" or "lastbefore".
    ValueError
        If ``mode`` is not one of the recognized values.
    """
    unsupported_opts = (max_gap, min_gap, gap_col)
    if any(opt is not None for opt in unsupported_opts):
        raise NotImplementedError("max_gap, min_gap, gap_col not implemented")

    # e.g. "first-firstafter" -> "first" for the left table, "firstafter" for the right
    lhs_kind, rhs_kind = type.split("-")

    # Resolve each join key into its left-hand and right-hand column name
    time_lhs, time_rhs = _get_key_tuple(by_time)
    user_lhs, user_rhs = _get_key_tuple(by_user)

    # Tag every row with a row number so matched pairs can be tracked by id
    lhs_events = (
        lhs
        >> arrange(_[user_lhs], _[time_lhs])
        >> mutate(__idx = row_number(_))
        >> distinct_events(time_lhs, user_lhs, lhs_kind)
    )
    rhs_events = (
        rhs
        >> arrange(_[user_rhs], _[time_rhs])
        >> mutate(__idy = row_number(_))
        >> distinct_events(time_rhs, user_rhs, rhs_kind)
    )

    # When both tables use the same time column name, the joined result is
    # expected to carry suffixed copies of it.
    # TODO: don't use implicit join suffix below
    shared_time_name = time_lhs == time_rhs
    paired_time_lhs = time_lhs + "_x" if shared_time_name else time_lhs
    paired_time_rhs = time_rhs + "_y" if shared_time_name else time_rhs

    # Pair rows per user, then keep pairs where the left event is not later
    user_matches = inner_join(lhs_events, rhs_events, by_user)
    pairs = filter(user_matches, _[paired_time_lhs] <= _[paired_time_rhs])

    # TODO: firstwithin
    if lhs_kind in ("firstwithin", "lastbefore"):
        raise NotImplementedError("Can't currently handle lhs type %s" % lhs_kind)

    # firstafter: for each left row keep only its earliest right-hand match
    if rhs_kind == "firstafter":
        pairs = (
            pairs
            >> arrange(_[paired_time_rhs])
            >> group_by(_.__idx)
            >> filter(row_number(_) == 1)
            >> ungroup()
        )

    id_pairs = select(pairs, _.__idx, _.__idy)

    if mode in ("semi", "anti"):
        filtering_join = anti_join if mode != "semi" else semi_join
        return (
            lhs_events
            >> filtering_join(_, id_pairs, "__idx")
            >> select(-_["__idx", "__idy"])
        )

    if mode not in ("inner", "left", "right", "full", "outer"):
        raise ValueError("mode not recognized: %s" %mode)

    # Attach right-hand rows through the id pairs, then drop the helper ids
    rhs_keys = {user_lhs: user_rhs, "__idy": "__idy"}
    return (
        lhs_events
        >> join(_, id_pairs, on = "__idx", how = mode)
        # TODO: suffix arg
        >> join(_, rhs_events, on = rhs_keys, how = mode)
        >> select(-_["__idx", "__idy"])
    )