Esempio n. 1
0
def _distinct(__data, *args, _keep_all=False, **kwargs):
    if (args or kwargs) and _keep_all:
        raise NotImplementedError(
            "Distinct with variables specified in sql requires _keep_all = False"
        )

    inner_sel = mutate(__data, **kwargs).last_op if kwargs else __data.last_op

    # TODO: this is copied from the df distinct version
    # cols dict below is used as ordered set
    cols = {simple_varname(x): True for x in args}
    cols.update(kwargs)

    if None in cols:
        raise KeyError("positional arguments must be simple column, "
                       "e.g. _.colname or _['colname']")

    # use all columns by default
    if not cols:
        cols = list(inner_sel.columns.keys())

    if not len(inner_sel._order_by_clause):
        # select distinct has to include any columns in the order by clause,
        # so can only safely modify existing statement when there's no order by
        sel_cols = lift_inner_cols(inner_sel)
        distinct_cols = [sel_cols[k] for k in cols]
        sel = inner_sel.with_only_columns(distinct_cols).distinct()
    else:
        # fallback to cte
        cte = inner_sel.alias()
        distinct_cols = [cte.columns[k] for k in cols]
        sel = sql.select(distinct_cols, from_obj=cte).distinct()

    return __data.append_op(sel)
Esempio n. 2
0
def test_arranges_back_to_back(backend):
    """Two chained arrange calls keep both sort keys, in call order."""
    frame = data_frame(x=range(1, 5), g=[1, 1, 2, 2])
    tbl = backend.load_df(frame)

    arranged = tbl >> arrange(_.x) >> arrange(_.g)

    # names tracked on the lazy table itself
    tracked_names = tuple(map(simple_varname, arranged.order_by))
    assert tracked_names == ("x", "g")

    # names present in the generated statement's ORDER BY clause
    clause_names = [col.name for col in arranged.last_op._order_by_clause]
    assert clause_names == ["x", "g"]
Esempio n. 3
0
def _count(__data, *args, sort = False, wt = None, **kwargs):
    # TODO: if already col named n, use name nn, etc.. get logic from tidy.py
    if wt is not None:
        raise NotImplementedError("TODO")

    res_name = "n"
    # similar to filter verb, we need two select statements,
    # an inner one for derived cols, and outer to group by them

    # inner select ----
    # holds any mutation style columns
    arg_names = []
    for arg in args:
        name = simple_varname(arg)
        if name is None:
            raise NotImplementedError(
                    "Count positional arguments must be single column name. "
                    "Use a named argument to count using complex expressions."
                    )
        arg_names.append(name)

    tbl_inner = mutate(__data, **kwargs)
    sel_inner = tbl_inner.last_op
    group_cols = arg_names + list(kwargs)

    # outer select ----
    # holds selected columns and tally (n)
    sel_inner_cte = sel_inner.alias()
    inner_cols = sel_inner_cte.columns
    sel_outer = sql.select(from_obj = sel_inner_cte)

    # apply any group vars from a group_by verb call first
    prev_group_cols = [inner_cols[k] for k in tbl_inner.group_by]
    if prev_group_cols:
        sel_outer.append_group_by(*prev_group_cols)
        sel_outer.append_column(*prev_group_cols)

    # now any defined in the count verb call
    for k in group_cols:
        sel_outer.append_group_by(inner_cols[k])
        sel_outer.append_column(inner_cols[k])

    count_col = sql.functions.count().label(res_name)
    sel_outer.append_column(count_col)

    # count is like summarize, so removes order_by
    return tbl_inner.append_op(
            sel_outer.order_by(count_col.desc()),
            order_by = tuple()
            )
Esempio n. 4
0
def _count(__data, *args, sort=False, wt=None, **kwargs):
    # TODO: if already col named n, use name nn, etc.. get logic from tidy.py
    if kwargs:
        raise NotImplementedError("TODO")

    if wt is not None:
        raise NotImplementedError("TODO")

    # similar to filter verb, we need two select statements,
    # an inner one for derived cols, and outer to group by them
    sel = __data.last_op.alias()
    sel_inner = sql.select([sel], from_obj=sel)

    # inner select ----
    # holds any mutation style columns
    group_cols = []
    for arg in args:
        col_name = simple_varname(arg)
        if col_name is None:
            # evaluate call
            col_expr = arg(sel.columns) if callable(arg) else arg

            # compile, so we can use the expr as its name (e.g. "id + 1")
            col_name = str(compile_el(__data, col_expr))
            label = col_expr.label(col_name)
            sel_inner.append_column(label)

        group_cols.append(col_name)

    # outer select ----
    # holds selected columns and tally (n)
    sel_inner_cte = sel_inner.alias()
    inner_cols = sel_inner_cte.columns
    sel_outer = sql.select(from_obj=sel_inner_cte)

    # apply any group vars from a group_by verb call first
    prev_group_cols = [inner_cols[k] for k in __data.group_by]
    if prev_group_cols:
        sel_outer.append_group_by(*prev_group_cols)
        sel_outer.append_column(*prev_group_cols)

    # now any defined in the count verb call
    for k in group_cols:
        sel_outer.append_group_by(inner_cols[k])
        sel_outer.append_column(inner_cols[k])

    sel_outer.append_column(sql.functions.count().label("n"))

    return __data.append_op(sel_outer)
Esempio n. 5
0
def _rename(__data, **kwargs):
    """Relabel columns of the current select.

    Each keyword is the new name and its value is the column to rename
    (reduced to a name via ``simple_varname``). Columns not mentioned are
    passed through unchanged.

    Raises:
        KeyError: ``simple_varname`` returned None for one of the values.
    """
    sel = __data.last_op
    columns = lift_inner_cols(sel)

    # map each existing column name to its replacement name
    new_names = {}
    for new_name, col_ref in kwargs.items():
        new_names[simple_varname(col_ref)] = new_name

    if None in new_names:
        raise KeyError("positional arguments must be simple column, "
                        "e.g. _.colname or _['colname']"
                        )

    relabeled = []
    for name, col in columns.items():
        if name in new_names:
            relabeled.append(col.label(new_names[name]))
        else:
            relabeled.append(col)

    return __data.append_op(sel.with_only_columns(relabeled))
Esempio n. 6
0
def _group_by(__data, *args, add = False, **kwargs):
    if kwargs:
        data = mutate(__data, **kwargs)
    else:
        data = __data

    cols = data.last_op.columns

    # put kwarg grouping vars last, so similar order to function call
    groups =  tuple(simple_varname(arg) for arg in args) + tuple(kwargs)
    if None in groups:
        raise NotImplementedError("Complex expressions not supported in sql group_by")

    unmatched = set(groups) - set(cols.keys())
    if unmatched:
        raise KeyError("group_by specifies columns missing from table: %s" %unmatched)

    if add:
        groups = ordered_union(data.group_by, groups)

    return data.copy(group_by = groups)