def _distinct(__data, *args, _keep_all=False, **kwargs): if (args or kwargs) and _keep_all: raise NotImplementedError( "Distinct with variables specified in sql requires _keep_all = False" ) inner_sel = mutate(__data, **kwargs).last_op if kwargs else __data.last_op # TODO: this is copied from the df distinct version # cols dict below is used as ordered set cols = {simple_varname(x): True for x in args} cols.update(kwargs) if None in cols: raise KeyError("positional arguments must be simple column, " "e.g. _.colname or _['colname']") # use all columns by default if not cols: cols = list(inner_sel.columns.keys()) if not len(inner_sel._order_by_clause): # select distinct has to include any columns in the order by clause, # so can only safely modify existing statement when there's no order by sel_cols = lift_inner_cols(inner_sel) distinct_cols = [sel_cols[k] for k in cols] sel = inner_sel.with_only_columns(distinct_cols).distinct() else: # fallback to cte cte = inner_sel.alias() distinct_cols = [cte.columns[k] for k in cols] sel = sql.select(distinct_cols, from_obj=cte).distinct() return __data.append_op(sel)
def test_dply_grouped_mutate_of_agg_order():
    """Grouped mutate with an aggregate lines results up with the input rows."""
    # see issue #139
    frame = pd.DataFrame({'g': ['b', 'a', 'b'], 'x': [0, 1, 2]})

    # per-group minimum of x, computed on the grouped frame
    result = mutate(frame.groupby('g'), g_min = lambda d: d.x.min())

    ungrouped = ungroup(result)
    expected = frame.assign(g_min = [0, 1, 0])
    assert_frame_equal(ungrouped, expected)
def _count(__data, *args, sort = False, wt = None, **kwargs): # TODO: if already col named n, use name nn, etc.. get logic from tidy.py if wt is not None: raise NotImplementedError("TODO") res_name = "n" # similar to filter verb, we need two select statements, # an inner one for derived cols, and outer to group by them # inner select ---- # holds any mutation style columns arg_names = [] for arg in args: name = simple_varname(arg) if name is None: raise NotImplementedError( "Count positional arguments must be single column name. " "Use a named argument to count using complex expressions." ) arg_names.append(name) tbl_inner = mutate(__data, **kwargs) sel_inner = tbl_inner.last_op group_cols = arg_names + list(kwargs) # outer select ---- # holds selected columns and tally (n) sel_inner_cte = sel_inner.alias() inner_cols = sel_inner_cte.columns sel_outer = sql.select(from_obj = sel_inner_cte) # apply any group vars from a group_by verb call first prev_group_cols = [inner_cols[k] for k in tbl_inner.group_by] if prev_group_cols: sel_outer.append_group_by(*prev_group_cols) sel_outer.append_column(*prev_group_cols) # now any defined in the count verb call for k in group_cols: sel_outer.append_group_by(inner_cols[k]) sel_outer.append_column(inner_cols[k]) count_col = sql.functions.count().label(res_name) sel_outer.append_column(count_col) # count is like summarize, so removes order_by return tbl_inner.append_op( sel_outer.order_by(count_col.desc()), order_by = tuple() )
def fast_mutate(__data, **kwargs):
    """Warning: this function is experimental

    Try to evaluate each named expression with ``grouped_eval`` and assign the
    results onto a copy of ``__data.obj``. When ``_transform_args`` returns
    ``None`` the call is delegated to the regular ``mutate``.

    Returns the copied object re-grouped by ``__data.grouper.groupings``, or
    whatever ``mutate`` returns on the fallback path.
    """
    # transform call trees; None signals "cannot use the fast path"
    translated = _transform_args(kwargs.values())
    if translated is None:
        return mutate(__data, **kwargs)

    # fast path: write each result onto a copy, leaving the input untouched
    result = __data.obj.copy()
    groupings = __data.grouper.groupings

    for col_name, call in zip(kwargs, translated):
        result[col_name] = grouped_eval(__data, call)

    return result.groupby(groupings)
def _group_by(__data, *args, add = False, **kwargs): if kwargs: data = mutate(__data, **kwargs) else: data = __data cols = data.last_op.columns # put kwarg grouping vars last, so similar order to function call groups = tuple(simple_varname(arg) for arg in args) + tuple(kwargs) if None in groups: raise NotImplementedError("Complex expressions not supported in sql group_by") unmatched = set(groups) - set(cols.keys()) if unmatched: raise KeyError("group_by specifies columns missing from table: %s" %unmatched) if add: groups = ordered_union(data.group_by, groups) return data.copy(group_by = groups)
def test_dply_mutate_sym(df1):
    """mutate with a symbolic expression matches DataFrame.assign with the same expression."""
    stars_times_1k = _.stars * 1000

    via_mutate = mutate(df1, stars_1k=stars_times_1k)
    via_assign = df1.assign(stars_1k=stars_times_1k)

    assert_frame_equal(via_mutate, via_assign)