コード例 #1
0
ファイル: summarise.py プロジェクト: pwwang/datar
def _summarise_build(
    _data: DataFrame,
    *args: Any,
    **kwargs: Any,
) -> Tuple[Tibble, bool]:
    """Build the result frame of a summarise call.

    The output starts from the grouping columns of `_data` (the group keys,
    or the group variables for a rowwise frame). Every expression is then
    evaluated and appended to that frame as a new column.

    Args:
        _data: The frame being summarised.
        *args: Unnamed expressions. The column name is obtained with
            `name_of()` from the evaluated value; empty frames/series are
            skipped.
        **kwargs: Named expressions; the keyword is used as column name.

    Returns:
        A 2-tuple `(frame, all_ones)`:
        - `frame`: the summarised frame, ungrouped, with temporary columns
          removed and the index reset.
        - `all_ones`: stays True unless `add_to_tibble` had to hand back a
          different frame object for at least one value (i.e. the frame was
          broadcast).
    """
    if isinstance(_data, TibbleRowwise):
        outframe = _data.loc[:, _data.group_vars]
    else:
        outframe = regcall(group_keys, _data)
        if isinstance(_data, TibbleGrouped):
            # re-apply the grouping of the input to the group keys,
            # using the same options as the original grouping
            grouped = _data._datar["grouped"]
            outframe = outframe.group_by(
                grouped.grouper.names,
                drop=grouped.observed,
                dropna=grouped.dropna,
                sort=grouped.sort,
            )

    all_ones = True
    context = ContextEvalRefCounts({"input_data": _data})
    # positional expressions get integer keys from enumerate(),
    # keyword expressions keep their (string) names
    for key, val in chain(enumerate(args), kwargs.items()):
        try:
            # first try against what has been summarised so far ...
            val = evaluate_expr(val, outframe, context)
        except KeyError:
            # ... and fall back to the input data
            val = evaluate_expr(val, _data, context)

        if val is None:
            continue

        if isinstance(key, int):
            if isinstance(val, (DataFrame, Series)) and len(val) == 0:
                continue
            key = name_of(val)

        newframe = add_to_tibble(outframe, key, val, broadcast_tbl=True)
        if newframe is not outframe:
            # if it is broadcasted, then it should not be all ones.
            # since all ones don't need to broadcast
            all_ones = False

        outframe = newframe

    gvars = regcall(group_vars, _data)
    # Columns whose name starts with "_" are temporary when they have been
    # referenced by a later expression and are not grouping variables.
    # Column labels are not guaranteed to be strings (pandas allows any
    # hashable), so check the type before calling `startswith`.
    tmp_cols = [
        mcol
        for mcol in outframe.columns
        if isinstance(mcol, str)
        and mcol.startswith("_")
        and mcol in context.used_refs
        and mcol not in gvars
    ]
    outframe = regcall(ungroup, outframe)
    outframe = outframe[regcall(setdiff, outframe.columns, tmp_cols)]
    return outframe.reset_index(drop=True), all_ones
コード例 #2
0
def itemgetter(x, subscr, __args_raw=None):
    """Itemgetter as a function for verb

    In datar expression, we can do:
    >>> arr = [1,2,3]
    >>> tibble(x=2) >> mutate(y=arr[f.x])

    Since `arr[f.x]` won't compile. We need to use the `itemgetter` operator:
    >>> tibble(x=2) >> mutate(y=itemgetter(arr, f.x))

    Args:
        x: The data to get items from. It is accessed through `.size` and
            `.iloc`.
        subscr: The subscripts
        __args_raw: Mapping holding the raw arguments; only the `"x"` entry
            is read. Presumably supplied by the function-registration
            machinery - verify against the decorator. When it is None,
            `x` itself is used to decide the return type.

    Returns:
        The selected items: as returned by `x.iloc[...]` when the raw `x`
        is a `Series`, otherwise the `.values` of that selection.
    """
    # allow f[:2] to work
    subscr = evaluate_expr(subscr, x, Context.EVAL)
    if isinstance(subscr, Collection):
        subscr.expand(pool=x.size)

    if isinstance(subscr, Series):
        # index by position with the underlying values
        subscr = subscr.values

    out = x.iloc[subscr]
    # `__args_raw` defaults to None; do not subscript it in that case
    raw_x = x if __args_raw is None else __args_raw["x"]
    if isinstance(raw_x, Series):
        return out
    return out.values
コード例 #3
0
    def evaluate(self, context=None):
        """Apply each function to each selected column and collect the results.

        Args:
            context: The context used to evaluate the extra arguments of
                plain functions. A `Context` member is replaced by its
                `.value`.

        Returns:
            The frame assembled from all (column, function) results, or an
            empty `Tibble` when nothing was produced.
        """
        if isinstance(context, Context):
            context = context.value

        if not self.fns:
            # no function given: fall back to the identity function
            self.fns = [{"fn": lambda x: x}]

        result = None
        # Instead of df.apply(), we can recycle groupby values and more
        for column in self.cols:
            for fn_info in self.fns:
                # everything except the function itself is available
                # as a placeholder for the output name
                render_data = fn_info.copy()
                render_data["_col"] = column
                fn = render_data.pop("fn")

                template = self.names or (
                    "{_col}_{_fn}" if "_fn" in render_data else "{_col}"
                )
                name = template.format(**render_data)

                args = CurColumn.replace_args(self.args, column)
                kwargs = CurColumn.replace_kwargs(self.kwargs, column)
                if functype(fn) != "plain":
                    # use fn's own context
                    value = regcall(fn, self.data[column], *args, **kwargs)

                    # fast evaluation tried, if failed:
                    # will this happen? it fails when first argument
                    # cannot be evaluated
                    if isinstance(value, Function):  # pragma: no cover
                        value = value._pipda_eval(self.data, context)
                else:
                    # plain function: evaluate its extra arguments ourselves
                    value = fn(
                        self.data[column],
                        *evaluate_expr(args, self.data, context),
                        **evaluate_expr(kwargs, self.data, context),
                    )

                result = add_to_tibble(result, name, value, broadcast_tbl=True)

        if result is None:
            return Tibble()
        return result
コード例 #4
0
def test_user_context():
    """A user-defined context can override how the symbolic is evaluated.

    The custom context evaluates the symbolic to `data * 2`, so the length
    of `[1, 2]` evaluated through it is 4.
    """

    class DoublingContext(ContextEval):
        """Context whose symbolic evaluates to the data multiplied by 2."""

        def eval_symbolic(self, data):
            return data * 2

    user_context = DoublingContext()
    f = Symbolic()
    length_expr = f.__len__()
    result = evaluate_expr(length_expr, [1, 2], user_context)
    assert result == 4
コード例 #5
0
    def from_pairs(
        cls,
        names: Sequence[str],
        data: Sequence,
        _name_repair: Union[str, Callable] = "check_unique",
        _dtypes: Union["Dtype", Mapping[str, "Dtype"]] = None,
    ) -> "Tibble":
        """Build a tibble from parallel sequences of names and values.

        Unlike passing `**kwargs`, this form allows duplicated names.

        Args:
            names: The column names, one per element of `data`
            data: The column values; must be as long as `names`
            _name_repair: How to repair names
            _dtypes: The dtypes for post conversion. `None`/`False` leaves
                the result untouched, `True` calls `convert_dtypes()`,
                anything else is passed to `apply_dtypes()`.

        Returns:
            The constructed tibble (an empty `Tibble` when there are no
            pairs).

        Raises:
            ValueError: When `names` and `data` differ in length.
        """
        from .broadcast import add_to_tibble

        if len(names) != len(data):
            raise ValueError("Lengths of `names` and `values` are not the same.")
        names = repair_names(names, _name_repair)

        tbl = None
        for colname, colvalue in zip(names, data):
            # each value is evaluated against the columns added so far
            colvalue = evaluate_expr(colvalue, tbl, Context.EVAL)
            if isinstance(colvalue, Collection):
                colvalue.expand()

            tbl = add_to_tibble(
                tbl,
                colname,
                colvalue,
                allow_dup_names=True,
                broadcast_tbl=True,
            )

        if tbl is None:
            tbl = Tibble()

        if _dtypes is True:
            return tbl.convert_dtypes()
        if _dtypes not in (None, False):
            apply_dtypes(tbl, _dtypes)
        return tbl
コード例 #6
0
def across(
    _data,
    *args,
    _names=None,
    _fn_context=Context.EVAL,
    _context=None,
    **kwargs,
):
    """Apply the same transformation to multiple columns

    The original API:
    https://dplyr.tidyverse.org/reference/across.html

    Examples:
        #
        >>> iris >> mutate(across(c(f.Sepal_Length, f.Sepal_Width), round))
            Sepal_Length  Sepal_Width  Petal_Length  Petal_Width    Species
               <float64>    <float64>     <float64>    <float64>   <object>
        0            5.0          4.0           1.4          0.2     setosa
        1            5.0          3.0           1.4          0.2     setosa
        ..           ...          ...           ...          ...        ...

        >>> iris >> group_by(f.Species) >> summarise(
        >>>     across(starts_with("Sepal"), mean)
        >>> )
              Species  Sepal_Length  Sepal_Width
             <object>     <float64>    <float64>
        0      setosa         5.006        3.428
        1  versicolor         5.936        2.770
        2   virginica         6.588        2.974

    Args:
        _data: The dataframe.
        *args: If given, the first 2 elements should be columns and functions
            apply to each of the selected columns. The rest of them will be
            the arguments for the functions.
        _names: A glue specification that describes how to name
            the output columns. This can use `{_col}` to stand for the
            selected column name, and `{_fn}` to stand for the name of
            the function being applied.
            The default (None) is equivalent to `{_col}` for the
            single function case and `{_col}_{_fn}` for the case where
            a list is used for _fns. In such a case, `{_fn}` is 0-based.
            To use 1-based index, use `{_fn1}`
        _fn_context: Defines the context to evaluate the arguments for functions
            if they are plain functions.
            Note that registered functions will use its own context
        _context: The context this call is evaluated with. When given and
            its `meta` mapping has an `"input_data"` entry, that entry is
            used instead of `_data`. With None, `_data` is used as is.
        **kwargs: Keyword arguments for the functions

    Returns:
        A dataframe with one column for each column and each function.
    """
    # `_context` defaults to None, so only consult its metadata when present
    if _context is not None:
        _data = _context.meta.get("input_data", _data)

    # normalise the positional arguments to (columns, functions, *rest)
    if not args:
        args = (None, None)
    elif len(args) == 1:
        args = (args[0], None)
    _cols, _fns, *args = args
    _cols = evaluate_expr(_cols, _data, Context.SELECT)

    return Across(
        _data,
        _cols,
        _fns,
        _names,
        args,
        kwargs,
    ).evaluate(_fn_context)
コード例 #7
0
ファイル: mutate.py プロジェクト: pwwang/datar
def mutate(
    _data,
    *args,
    _keep="all",
    _before=None,
    _after=None,
    **kwargs,
):
    """Adds new variables and preserves existing ones

    The original API:
    https://dplyr.tidyverse.org/reference/mutate.html

    Args:
        _data: A data frame
        _keep: allows you to control which columns from _data are retained
            in the output:
            - "all", the default, retains all variables.
            - "used" keeps any variables used to make new variables;
              it's useful for checking your work as it displays inputs and
              outputs side-by-side.
            - "unused" keeps only existing variables not used to make new
                variables.
            - "none", only keeps grouping keys (like transmute()).
        _before: and
        _after: Optionally, control where new columns should appear
            (the default is to add to the right hand side).
            See relocate() for more details.
        *args: and
        **kwargs: Name-value pairs. The name gives the name of the column
            in the output. The value can be:
            - A vector of length 1, which will be recycled to the correct
                length.
            - A vector the same length as the current group (or the whole
                data frame if ungrouped).
            - None to remove the column

    Returns:
        An object of the same type as _data. The output has the following
        properties:
        - Rows are not affected.
        - Existing columns will be preserved according to the _keep
            argument. New columns will be placed according to the
            _before and _after arguments. If _keep = "none"
            (as in transmute()), the output order is determined only
            by ..., not the order of existing columns.
        - Columns given value None will be removed
        - Groups will be recomputed if a grouping variable is mutated.
        - Data frame attributes are preserved.
    """
    keep = arg_match(_keep, "_keep", ["all", "unused", "used", "none"])
    gvars = regcall(group_vars, _data)
    # work on a copy so the input frame is left untouched
    data = regcall(as_tibble, _data.copy())
    all_columns = data.columns

    mutated_cols = []
    context = ContextEvalRefCounts()
    for val in args:
        if (isinstance(val, (ReferenceItem, ReferenceAttr))
                and val._pipda_level == 1 and val._pipda_ref in data):
            # a direct reference to an existing column: nothing to compute,
            # just record it as mutated
            mutated_cols.append(val._pipda_ref)
            continue

        # name of the unevaluated expression, used when the evaluated
        # value has no name of its own
        bkup_name = name_of(val)
        val = evaluate_expr(val, data, context)
        if val is None:
            continue

        if isinstance(val, DataFrame):
            mutated_cols.extend(val.columns)
            data = add_to_tibble(data, None, val, broadcast_tbl=False)
        else:
            key = name_of(val) or bkup_name
            mutated_cols.append(key)
            data = add_to_tibble(data, key, val, broadcast_tbl=False)

    for key, val in kwargs.items():
        val = evaluate_expr(val, data, context)
        if val is None:
            # None removes the column; a missing column is not an error
            with suppress(KeyError):
                data.drop(columns=[key], inplace=True)
        else:
            data = add_to_tibble(data, key, val, broadcast_tbl=False)
            if isinstance(val, DataFrame):
                # De-duplicate while keeping the column order of `val`.
                # A set would make the order of `mutated_cols` (and thus
                # of relocated/kept columns) depend on string hashing.
                mutated_cols.extend(
                    dict.fromkeys(f"{key}${col}" for col in val.columns)
                )
            else:
                mutated_cols.append(key)

    # names start with "_" are temporary names if they are used
    # (labels are not necessarily strings, so check before `startswith`)
    tmp_cols = [
        mcol for mcol in mutated_cols
        if isinstance(mcol, str)
        and mcol.startswith("_")
        and mcol in context.used_refs
        and mcol not in _data.columns
    ]
    # columns can be removed later
    # df >> mutate(Series(1, name="z"), z=None)
    mutated_cols = regcall(intersect, mutated_cols, data.columns)
    mutated_cols = regcall(setdiff, mutated_cols, tmp_cols)
    # new cols always at last
    # data.columns.difference() does not keep order

    data = data.loc[:, regcall(setdiff, data.columns, tmp_cols)]

    if _before is not None or _after is not None:
        new_cols = regcall(setdiff, mutated_cols, _data.columns)
        data = regcall(
            relocate,
            data,
            *new_cols,
            _before=_before,
            _after=_after,
        )

    if keep == "all":
        keep_cols = data.columns
    elif keep == "unused":
        used = list(context.used_refs)
        unused = regcall(setdiff, all_columns, used)
        keep_cols = regcall(
            intersect,
            data.columns,
            c(gvars, unused, mutated_cols),
        )
    elif keep == "used":
        used = list(context.used_refs)
        keep_cols = regcall(
            intersect,
            data.columns,
            c(gvars, used, mutated_cols),
        )
    else:  # keep == 'none':
        keep_cols = regcall(
            union,
            regcall(setdiff, gvars, mutated_cols),
            regcall(intersect, mutated_cols, data.columns),
        )

    data = data[keep_cols]
    # redo grouping if original columns changed
    # so we don't have discrepancy on
    # df.x.obj when df is grouped
    if regcall(intersect, _data.columns, mutated_cols).size > 0:
        data = reconstruct_tibble(_data, data)

    # used for group_by
    data._datar["mutated_cols"] = mutated_cols
    return data
コード例 #8
0
ファイル: collections.py プロジェクト: pwwang/datar
 def _pipda_eval(self, data: Any, context: ContextAnnoType) -> Any:
     """Evaluate this object in place for pipda's evaluation.

     The stored elements are replaced by the result of evaluating them
     against `data` under `context`, and the object itself is returned.
     """
     evaluated = evaluate_expr(self.elems, data, context)
     self.elems = evaluated
     return self