def _summarise_build(
    _data: DataFrame,
    *args: Any,
    **kwargs: Any,
) -> Tuple[Tibble, bool]:
    """Build summarise result"""
    if isinstance(_data, TibbleRowwise):
        outframe = _data.loc[:, _data.group_vars]
    else:
        outframe = regcall(group_keys, _data)

    if isinstance(_data, TibbleGrouped):
        grouped = _data._datar["grouped"]
        outframe = outframe.group_by(
            grouped.grouper.names,
            drop=grouped.observed,
            dropna=grouped.dropna,
            sort=grouped.sort,
        )

    all_ones = True
    context = ContextEvalRefCounts({"input_data": _data})
    for key, val in chain(enumerate(args), kwargs.items()):
        try:
            val = evaluate_expr(val, outframe, context)
        except KeyError:
            val = evaluate_expr(val, _data, context)

        if val is None:
            continue

        if isinstance(key, int):
            if isinstance(val, (DataFrame, Series)) and len(val) == 0:
                continue
            key = name_of(val)

        newframe = add_to_tibble(outframe, key, val, broadcast_tbl=True)
        if newframe is not outframe:
            # If the value had to be broadcast, the result cannot be
            # all ones; all-ones values never need broadcasting.
            all_ones = False

        outframe = newframe

    gvars = regcall(group_vars, _data)
    tmp_cols = [
        mcol
        for mcol in outframe.columns
        if mcol.startswith("_")
        and mcol in context.used_refs
        and mcol not in gvars
    ]
    outframe = regcall(ungroup, outframe)
    outframe = outframe[regcall(setdiff, outframe.columns, tmp_cols)]
    return outframe.reset_index(drop=True), all_ones
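# A hedged usage sketch (assumed `datar.all` imports, not from this file):
# `_summarise_build` is the internal builder behind the public `summarise`
# verb, so a grouped call like the one below exercises it. The returned
# `all_ones` flag records whether every expression evaluated to a single
# value per group (i.e. no broadcasting was needed).
#
# >>> from datar.all import f, tibble, group_by, summarise, mean
# >>> df = tibble(g=[1, 1, 2], x=[1.0, 2.0, 3.0])
# >>> df >> group_by(f.g) >> summarise(avg=mean(f.x))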
def itemgetter(x, subscr, __args_raw=None):
    """Itemgetter as a function for verb

    In a datar expression, we can do:
    >>> arr = [1, 2, 3]
    >>> tibble(x=2) >> mutate(y=arr[f.x])

    Since `arr[f.x]` won't compile, we need to use `itemgetter`:
    >>> tibble(x=2) >> mutate(y=itemgetter(arr, f.x))

    Args:
        x: The data to get items from
        subscr: The subscripts
    """
    # allow f[:2] to work
    subscr = evaluate_expr(subscr, x, Context.EVAL)
    if isinstance(subscr, Collection):
        subscr.expand(pool=x.size)

    if isinstance(subscr, Series):
        subscr = subscr.values

    out = x.iloc[subscr]
    if isinstance(__args_raw["x"], Series):
        return out
    return out.values
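# A minimal usage sketch (assumed `datar.all` imports): `itemgetter` defers
# the subscript until `f.x` is evaluated against the frame, which a plain
# `arr[f.x]` cannot do:
#
# >>> from datar.all import f, tibble, mutate, itemgetter
# >>> arr = [10, 20, 30]
# >>> tibble(x=[0, 2]) >> mutate(y=itemgetter(arr, f.x))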
def evaluate(self, context=None):
    """Evaluate object with context"""
    if isinstance(context, Context):
        context = context.value

    if not self.fns:
        self.fns = [{"fn": lambda x: x}]

    ret = None
    # Instead of df.apply(), we can recycle groupby values and more
    for column in self.cols:
        for fn_info in self.fns:
            render_data = fn_info.copy()
            render_data["_col"] = column
            fn = render_data.pop("fn")
            name_format = self.names
            if not name_format:
                name_format = (
                    "{_col}_{_fn}" if "_fn" in render_data else "{_col}"
                )

            name = name_format.format(**render_data)
            args = CurColumn.replace_args(self.args, column)
            kwargs = CurColumn.replace_kwargs(self.kwargs, column)
            if functype(fn) == "plain":
                value = fn(
                    self.data[column],
                    *evaluate_expr(args, self.data, context),
                    **evaluate_expr(kwargs, self.data, context),
                )
            else:
                # use fn's own context
                value = regcall(
                    fn,
                    self.data[column],
                    *args,
                    **kwargs,
                )

            # Fast evaluation was tried above; it fails when the first
            # argument cannot be evaluated, in which case we fall back
            # to pipda's evaluation.
            if isinstance(value, Function):  # pragma: no cover
                value = value._pipda_eval(self.data, context)

            ret = add_to_tibble(ret, name, value, broadcast_tbl=True)

    return Tibble() if ret is None else ret
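# A hedged illustration of the naming logic above (assumed `datar.all`
# imports): with a list of anonymous functions, the default "{_col}_{_fn}"
# format applies, with `_fn` as the 0-based function index, per the
# `across()` docstring.
#
# >>> from datar.all import f, tibble, mutate, across
# >>> df = tibble(a=[1.0, 4.0])
# >>> df >> mutate(across(f.a, [lambda x: x - 1, lambda x: x + 1]))
# # expected to add columns named a_0 and a_1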
def test_user_context():
    class MyContext(ContextEval):
        def eval_symbolic(self, data):
            return data * 2

    ce = MyContext()
    f = Symbolic()
    out = evaluate_expr(f.__len__(), [1, 2], ce)
    assert out == 4
def from_pairs(
    cls,
    names: Sequence[str],
    data: Sequence,
    _name_repair: Union[str, Callable] = "check_unique",
    _dtypes: Union["Dtype", Mapping[str, "Dtype"]] = None,
) -> "Tibble":
    """Construct a tibble from name-value pairs

    Unlike `**kwargs`, this allows duplicated names.

    Args:
        names: The names of the data used to construct the tibble
        data: The data used to construct the tibble; must have the same
            length as `names`
        _name_repair: How to repair names
        _dtypes: The dtypes for post conversion
    """
    from .broadcast import add_to_tibble

    if len(names) != len(data):
        raise ValueError(
            "Lengths of `names` and `values` are not the same."
        )

    names = repair_names(names, _name_repair)

    out = None
    for name, value in zip(names, data):
        value = evaluate_expr(value, out, Context.EVAL)
        # value = regcall(ungroup, value)
        if isinstance(value, Collection):
            value.expand()

        out = add_to_tibble(
            out,
            name,
            value,
            allow_dup_names=True,
            broadcast_tbl=True,
        )

    out = Tibble() if out is None else out

    if _dtypes in (None, False):
        return out
    if _dtypes is True:
        return out.convert_dtypes()

    apply_dtypes(out, _dtypes)
    return out
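# A minimal usage sketch (hedged; the import path is an assumption, and
# "unique" is assumed to deduplicate the repeated names rather than raise,
# as "check_unique" would):
#
# >>> from datar.tibble import Tibble
# >>> Tibble.from_pairs(
# ...     ["x", "x"],
# ...     [[1, 2], [3, 4]],
# ...     _name_repair="unique",
# ... )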
def across(
    _data,
    *args,
    _names=None,
    _fn_context=Context.EVAL,
    _context=None,
    **kwargs,
):
    """Apply the same transformation to multiple columns

    The original API:
    https://dplyr.tidyverse.org/reference/across.html

    Examples:
        >>> iris >> mutate(across(c(f.Sepal_Length, f.Sepal_Width), round))
            Sepal_Length  Sepal_Width  Petal_Length  Petal_Width   Species
               <float64>    <float64>     <float64>    <float64>  <object>
        0            5.0          4.0           1.4          0.2    setosa
        1            5.0          3.0           1.4          0.2    setosa
        ..           ...          ...           ...          ...       ...

        >>> iris >> group_by(f.Species) >> summarise(
        >>>     across(starts_with("Sepal"), mean)
        >>> )
              Species  Sepal_Length  Sepal_Width
             <object>     <float64>    <float64>
        0      setosa         5.006        3.428
        1  versicolor         5.936        2.770
        2   virginica         6.588        2.974

    Args:
        _data: The dataframe.
        *args: If given, the first 2 elements should be the columns and the
            functions applied to each of the selected columns. The rest of
            them will be the arguments for the functions.
        _names: A glue specification that describes how to name the output
            columns. This can use `{_col}` to stand for the selected column
            name, and `{_fn}` to stand for the name of the function being
            applied. The default (None) is equivalent to `{_col}` for the
            single-function case and `{_col}_{_fn}` for the case where a
            list is used for `_fns`. In such a case, `{_fn}` is 0-based.
            To use a 1-based index, use `{_fn1}`.
        _fn_context: Defines the context to evaluate the arguments for
            functions if they are plain functions.
            Note that registered functions will use their own context.
        **kwargs: Keyword arguments for the functions

    Returns:
        A dataframe with one column for each column and each function.
    """
    _data = _context.meta.get("input_data", _data)

    if not args:
        args = (None, None)
    elif len(args) == 1:
        args = (args[0], None)

    _cols, _fns, *args = args
    _cols = evaluate_expr(_cols, _data, Context.SELECT)

    return Across(
        _data,
        _cols,
        _fns,
        _names,
        args,
        kwargs,
    ).evaluate(_fn_context)
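# A hedged sketch of forwarding extra arguments (assumed `datar.all`
# imports; `round` here is assumed to be datar's registered rounding
# function): anything in `*args` after the columns and functions is passed
# on to each function call:
#
# >>> from datar.all import f, c, tibble, mutate, across, round
# >>> tibble(a=[1.234], b=[5.678]) >> mutate(across(c(f.a, f.b), round, 1))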
def mutate(
    _data,
    *args,
    _keep="all",
    _before=None,
    _after=None,
    **kwargs,
):
    """Add new variables and preserve existing ones

    The original API:
    https://dplyr.tidyverse.org/reference/mutate.html

    Args:
        _data: A data frame
        _keep: Allows you to control which columns from `_data` are
            retained in the output:
            - "all", the default, retains all variables.
            - "used" keeps any variables used to make new variables;
              it's useful for checking your work as it displays inputs
              and outputs side-by-side.
            - "unused" keeps only existing variables not used to make new
              variables.
            - "none" only keeps grouping keys (like `transmute()`).
        _before, _after: Optionally, control where new columns should
            appear (the default is to add to the right-hand side).
            See `relocate()` for more details.
        *args, **kwargs: Name-value pairs. The name gives the name of the
            column in the output. The value can be:
            - A vector of length 1, which will be recycled to the correct
              length.
            - A vector the same length as the current group (or the whole
              data frame if ungrouped).
            - None to remove the column

    Returns:
        An object of the same type as `_data`. The output has the
        following properties:
        - Rows are not affected.
        - Existing columns will be preserved according to the `_keep`
          argument. New columns will be placed according to the `_before`
          and `_after` arguments. If `_keep = "none"` (as in
          `transmute()`), the output order is determined only by `...`,
          not the order of existing columns.
        - Columns given the value None will be removed.
        - Groups will be recomputed if a grouping variable is mutated.
        - Data frame attributes are preserved.
    """
    keep = arg_match(_keep, "_keep", ["all", "unused", "used", "none"])
    gvars = regcall(group_vars, _data)
    data = regcall(as_tibble, _data.copy())
    all_columns = data.columns

    mutated_cols = []
    context = ContextEvalRefCounts()
    for val in args:
        if (
            isinstance(val, (ReferenceItem, ReferenceAttr))
            and val._pipda_level == 1
            and val._pipda_ref in data
        ):
            mutated_cols.append(val._pipda_ref)
            continue

        bkup_name = name_of(val)
        val = evaluate_expr(val, data, context)
        if val is None:
            continue

        if isinstance(val, DataFrame):
            mutated_cols.extend(val.columns)
            data = add_to_tibble(data, None, val, broadcast_tbl=False)
        else:
            key = name_of(val) or bkup_name
            mutated_cols.append(key)
            data = add_to_tibble(data, key, val, broadcast_tbl=False)

    for key, val in kwargs.items():
        val = evaluate_expr(val, data, context)
        if val is None:
            with suppress(KeyError):
                data.drop(columns=[key], inplace=True)
        else:
            data = add_to_tibble(data, key, val, broadcast_tbl=False)
            if isinstance(val, DataFrame):
                mutated_cols.extend({f"{key}${col}" for col in val.columns})
            else:
                mutated_cols.append(key)

    # names starting with "_" are temporary names if they are used
    tmp_cols = [
        mcol
        for mcol in mutated_cols
        if mcol.startswith("_")
        and mcol in context.used_refs
        and mcol not in _data.columns
    ]
    # columns can be removed later
    # df >> mutate(Series(1, name="z"), z=None)
    mutated_cols = regcall(intersect, mutated_cols, data.columns)
    mutated_cols = regcall(setdiff, mutated_cols, tmp_cols)
    # new cols always at last
    # data.columns.difference() does not keep order
    data = data.loc[:, regcall(setdiff, data.columns, tmp_cols)]

    if _before is not None or _after is not None:
        new_cols = regcall(setdiff, mutated_cols, _data.columns)
        data = regcall(
            relocate,
            data,
            *new_cols,
            _before=_before,
            _after=_after,
        )

    if keep == "all":
        keep = data.columns
    elif keep == "unused":
        used = list(context.used_refs)
        unused = regcall(setdiff, all_columns, used)
        keep = regcall(
            intersect,
            data.columns,
            c(gvars, unused, mutated_cols),
        )
    elif keep == "used":
        used = list(context.used_refs)
        keep = regcall(
            intersect,
            data.columns,
            c(gvars, used, mutated_cols),
        )
    else:  # keep == 'none'
        keep = regcall(
            union,
            regcall(setdiff, gvars, mutated_cols),
            regcall(intersect, mutated_cols, data.columns),
        )

    data = data[keep]
    # Redo grouping if original columns changed, so there is no
    # discrepancy on df.x.obj when df is grouped
    if regcall(intersect, _data.columns, mutated_cols).size > 0:
        data = reconstruct_tibble(_data, data)

    # used by group_by
    data._datar["mutated_cols"] = mutated_cols
    return data
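# A hedged sketch of the `_keep` modes (assumed `datar.all` imports):
#
# >>> from datar.all import f, tibble, mutate
# >>> df = tibble(x=[1, 2], y=[3, 4])
# >>> df >> mutate(z=f.x + f.y, _keep="used")  # x and y were used: keeps x, y, z
# >>> df >> mutate(z=f.x + f.y, _keep="none")  # keeps only z (transmute-like)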
def _pipda_eval(self, data: Any, context: ContextAnnoType) -> Any:
    """Define how the object should be evaluated by pipda"""
    self.elems = evaluate_expr(self.elems, data, context)
    return self