コード例 #1
0
ファイル: testing.py プロジェクト: pola-rs/polars
def assert_frame_equal_local_categoricals(df_a: pli.DataFrame,
                                          df_b: pli.DataFrame) -> None:
    """
    Assert that two DataFrames are equal, comparing Categorical columns two ways.

    The schemas must match. The frames are then compared once with every
    Categorical column cast to ``str`` and once with every Categorical column
    converted via ``to_physical()``.

    Parameters
    ----------
    df_a
        First DataFrame.
    df_b
        Second DataFrame.

    Raises
    ------
    AssertionError
        If the schemas differ or either comparison fails.
    """
    assert df_a.schema == df_b.schema

    # First pass: compare the Categorical columns by their string values.
    as_strings = pli.col(Categorical).cast(str)
    left_strings = df_a.with_column(as_strings)
    right_strings = df_b.with_column(as_strings)
    assert left_strings.frame_equal(right_strings)

    # Second pass: compare the Categorical columns by their physical values.
    as_physical = pli.col(Categorical).to_physical()
    left_physical = df_a.with_column(as_physical)
    right_physical = df_b.with_column(as_physical)
    assert left_physical.frame_equal(right_physical)
コード例 #2
0
ファイル: lazy_functions.py プロジェクト: pola-rs/polars
def arg_where(condition: pli.Expr | pli.Series,
              eager: bool = False) -> pli.Expr | pli.Series:
    """
    Return indices where `condition` evaluates `True`.

    Parameters
    ----------
    condition
        Boolean expression to evaluate
    eager
        If truthy, `condition` must be a Series; it is evaluated immediately
        and a Series is returned. Otherwise an expression is returned.

    Raises
    ------
    ValueError
        If `eager` is truthy and `condition` is not a Series.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
    >>> df.select(
    ...     [
    ...         pl.arg_where(pl.col("a") % 2 == 0),
    ...     ]
    ... ).to_series()
    shape: (2,)
    Series: 'a' [u32]
    [
        1
        3
    ]
    """
    if not eager:
        # Lazy path: wrap the condition in an expression and defer evaluation.
        as_expr = pli.expr_to_lit_or_expr(condition, str_to_lit=True)
        return pli.wrap_expr(py_arg_where(as_expr._pyexpr))

    if not isinstance(condition, pli.Series):
        raise ValueError(
            f"expected 'Series' in 'arg_where' if 'eager=True', got {type(condition)}"
        )

    # Eager path: run the lazy variant on a single-column frame built from the Series.
    frame = condition.to_frame()
    selected = frame.select(arg_where(pli.col(condition.name)))
    return selected.to_series()
コード例 #3
0
ファイル: lazy_frame.py プロジェクト: ghuls/polars
def _prepare_groupby_inputs(
    by: Optional[Union[str, List[str], "pli.Expr", List["pli.Expr"]]],
) -> List["PyExpr"]:
    if isinstance(by, list):
        new_by = []
        for e in by:
            if isinstance(e, str):
                e = pli.col(e)
            new_by.append(e._pyexpr)
    elif isinstance(by, str):
        new_by = [pli.col(by)._pyexpr]
    elif isinstance(by, pli.Expr):
        new_by = [by._pyexpr]
    elif by is None:
        return []
    return new_by
コード例 #4
0
ファイル: lazy_frame.py プロジェクト: ghuls/polars
    def filter(self, predicate: Union["pli.Expr", str]) -> "LazyFrame":
        """
        Keep only the rows for which a predicate expression holds.

        Parameters
        ----------
        predicate
            Expression that evaluates to a boolean Series. A string is
            interpreted as a column name and converted with `pli.col`.

        Examples
        --------

        >>> lf = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6, 7, 8],
        ...         "ham": ["a", "b", "c"],
        ...     }
        ... ).lazy()

        Filter on one condition:

        >>> lf.filter(pl.col("foo") < 3).collect()
        shape: (2, 3)
        ┌─────┬─────┬─────┐
        │ foo ┆ bar ┆ ham │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ str │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 6   ┆ a   │
        ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
        │ 2   ┆ 7   ┆ b   │
        └─────┴─────┴─────┘

        Filter on multiple conditions:

        >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect()
        shape: (1, 3)
        ┌─────┬─────┬─────┐
        │ foo ┆ bar ┆ ham │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ str │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 6   ┆ a   │
        └─────┴─────┴─────┘

        """
        # A bare string names a column; everything else is used as given.
        predicate_expr = pli.col(predicate) if isinstance(predicate, str) else predicate
        return wrap_ldf(self._ldf.filter(predicate_expr._pyexpr))
コード例 #5
0
def _post_apply_columns(pydf: PyDataFrame,
                        columns: ColumnsType) -> PyDataFrame:
    """
    Apply the 'columns' param to an already-created PyDataFrame.

    The column names are replaced when they differ from the current ones, and
    every column whose requested dtype differs from its current dtype is cast.
    If `columns` is falsy, the frame's existing column names are used.

    Returns the frame itself when no cast is needed, otherwise the result of
    collecting the lazily applied casts.
    """
    current_names = pydf.columns()
    current_dtypes = pydf.dtypes()
    columns, dtypes = _unpack_columns(columns or current_names)

    if columns != current_names:
        pydf.set_column_names(columns)

    # Build cast expressions only for columns whose requested dtype is new.
    column_casts = []
    for position, name in enumerate(columns):
        if name in dtypes and dtypes[name] != current_dtypes[position]:
            column_casts.append(pli.col(name).cast(dtypes[name])._pyexpr)

    if not column_casts:
        return pydf
    return pydf.lazy().with_columns(column_casts).collect()
コード例 #6
0
ファイル: testing.py プロジェクト: pola-rs/polars
def verify_series_and_expr_api(input: pli.Series, expected: pli.Series | None,
                               op: str, *args: Any, **kwargs: Any) -> None:
    """
    Check that an operation gives the same result via the Series and expression APIs.

    The operation named by `op` is looked up with `_getattr_multi` on both
    `pli.col("*")` and `input`, and called with `*args` and `**kwargs`.

    Parameters
    ----------
    input
        Series the operation is applied to.
    expected
        Expected result. If None, the two results are only compared with
        each other.
    op
        Name of the operation, resolved through `_getattr_multi`.
    *args, **kwargs
        Forwarded to the operation.

    Examples
    --------
    >>> s = pl.Series([1, 3, 2])
    >>> expected = pl.Series([1, 2, 3])
    >>> verify_series_and_expr_api(s, expected, "sort")
    """
    # Expression API: apply the op to all columns of a one-column frame.
    expr_op = _getattr_multi(pli.col("*"), op)
    expr = expr_op(*args, **kwargs)
    selected = input.to_frame().select(expr)
    result_expr: pli.Series = selected[:, 0]  # type: ignore[assignment]

    # Series API: apply the same op directly on the input.
    series_op = _getattr_multi(input, op)
    result_series = series_op(*args, **kwargs)

    if expected is not None:
        assert_series_equal(result_expr, expected)
        assert_series_equal(result_series, expected)
        return
    assert_series_equal(result_series, result_expr)
コード例 #7
0
ファイル: functions.py プロジェクト: pola-rs/polars
def cut(
    s: pli.Series,
    bins: list[float],
    labels: Optional[list[str]] = None,
    break_point_label: str = "break_point",
    category_label: str = "category",
) -> pli.DataFrame:
    """
    Bin values into discrete values

    .. warning::
        This function is experimental and might change without it being considered a breaking change.

    Parameters
    ----------
    s
        Series to bin.
    bins
        Bins to create.
    labels
        Labels to assign to the bins. If given the length of labels must be len(bins) + 1.
    break_point_label
        Name given to the breakpoint column.
    category_label
        Name given to the category column.

    Returns
    -------
    DataFrame

    Raises
    ------
    ValueError
        If `labels` is given and its length is not len(bins) + 1.

    Examples
    --------
    >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)])
    >>> pl.cut(a, bins=[-1, 1])
    shape: (12, 3)
    ┌──────┬─────────────┬──────────────┐
    │ a    ┆ break_point ┆ category     │
    │ ---  ┆ ---         ┆ ---          │
    │ f64  ┆ f64         ┆ cat          │
    ╞══════╪═════════════╪══════════════╡
    │ -3.0 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ -2.5 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ -2.0 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ -1.5 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ ...  ┆ ...         ┆ ...          │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 1.0  ┆ 1.0         ┆ (-1.0, 1.0]  │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 1.5  ┆ inf         ┆ (1.0, inf]   │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2.0  ┆ inf         ┆ (1.0, inf]   │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2.5  ┆ inf         ┆ (1.0, inf]   │
    └──────┴─────────────┴──────────────┘

    """
    # Validate up front so a bad call fails before any frame is built, and
    # report both counts: the mismatch can be too many labels as well as too few.
    if labels and len(labels) != len(bins) + 1:
        raise ValueError(
            f"expected {len(bins) + 1} labels (len(bins) + 1), got {len(labels)}"
        )

    var_nm = s.name

    # One row per bin edge plus a trailing +inf edge that closes the last bin.
    cuts_df = pli.DataFrame([
        pli.Series(name=break_point_label, values=bins,
                   dtype=Float64).extend_constant(float("inf"), 1)
    ])

    if labels:
        cuts_df = cuts_df.with_column(
            pli.Series(name=category_label, values=labels))
    else:
        # Default labels are "(lower, upper]" where the lower edge is the
        # previous break point (-inf for the first bin).
        cuts_df = cuts_df.with_column(
            pli.format(
                "({}, {}]",
                pli.col(break_point_label).shift_and_fill(1, float("-inf")),
                pli.col(break_point_label),
            ).alias(category_label))

    cuts_df = cuts_df.with_column(pli.col(category_label).cast(Categorical))

    # Sort the values, then forward-asof-join each one onto the break points.
    result = (s.sort().to_frame().join_asof(
        cuts_df,
        left_on=var_nm,
        right_on=break_point_label,
        strategy="forward",
    ))
    return result
コード例 #8
0
ファイル: lazy_frame.py プロジェクト: ghuls/polars
    def join(
        self,
        ldf: "LazyFrame",
        left_on: Optional[Union[str, "pli.Expr", List[Union[str, "pli.Expr"]]]] = None,
        right_on: Optional[Union[str, "pli.Expr", List[Union[str, "pli.Expr"]]]] = None,
        on: Optional[Union[str, "pli.Expr", List[Union[str, "pli.Expr"]]]] = None,
        how: str = "inner",
        suffix: str = "_right",
        allow_parallel: bool = True,
        force_parallel: bool = False,
        asof_by: Optional[Union[str, List[str]]] = None,
        asof_by_left: Optional[Union[str, List[str]]] = None,
        asof_by_right: Optional[Union[str, List[str]]] = None,
    ) -> "LazyFrame":
        """
        Add a join operation to the Logical Plan.

        Parameters
        ----------
        ldf
            Lazy DataFrame to join with.
        left_on
            Join column of the left DataFrame.
        right_on
            Join column of the right DataFrame.
        on
            Join column of both DataFrames. If set, it takes precedence over
            `left_on` and `right_on`, which should be None.
        how
            one of:
                "inner"
                "left"
                "outer"
                "asof"
                "cross"
        suffix
            Suffix to append to columns with a duplicate name.
        allow_parallel
            Allow the physical plan to optionally evaluate the computation of both DataFrames up to the join in parallel.
        force_parallel
            Force the physical plan to evaluate the computation of both DataFrames up to the join in parallel.
        asof_by
            join on these columns before doing asof join
        asof_by_left
            join on these columns before doing asof join
        asof_by_right
            join on these columns before doing asof join

        Raises
        ------
        ValueError
            If `how` is not "cross" and no join column could be determined
            for the left or the right side.

        # Asof joins
        This is similar to a left-join except that we match on nearest key rather than equal keys.
        The keys must be sorted to perform an asof join

        """
        if how == "cross":
            # A cross join takes no join keys and no asof keys.
            return wrap_ldf(
                self._ldf.join(
                    ldf._ldf,
                    [],
                    [],
                    allow_parallel,
                    force_parallel,
                    how,
                    suffix,
                    [],
                    [],
                )
            )

        left_on_: Optional[List[Union[str, pli.Expr]]]
        right_on_: Optional[List[Union[str, pli.Expr]]]
        if isinstance(on, (str, pli.Expr)):
            # A single expression is a valid `on` per the signature and must
            # be wrapped just like a single column name; it was previously
            # ignored.
            left_on_ = [on]
            right_on_ = [on]
        elif isinstance(on, list):
            left_on_ = on
            right_on_ = on
        else:
            left_on_ = [left_on] if isinstance(left_on, (str, pli.Expr)) else left_on
            right_on_ = [right_on] if isinstance(right_on, (str, pli.Expr)) else right_on

        if left_on_ is None or right_on_ is None:
            raise ValueError("You should pass the column to join on as an argument.")

        # Column names become expressions; everything is passed on as `_pyexpr`.
        new_left_on = [
            (pli.col(column) if isinstance(column, str) else column)._pyexpr
            for column in left_on_
        ]
        new_right_on = [
            (pli.col(column) if isinstance(column, str) else column)._pyexpr
            for column in right_on_
        ]

        # set asof_by

        left_asof_by_: Union[List[str], None]
        if isinstance(asof_by_left, str):
            left_asof_by_ = [asof_by_left]
        else:
            left_asof_by_ = asof_by_left

        # NOTE(review): the right side also wraps a bare Expr while the left
        # side only wraps str; kept as-is to preserve behavior - confirm intent.
        right_asof_by_: Union[List[str], None]
        if isinstance(asof_by_right, (str, pli.Expr)):
            right_asof_by_ = [asof_by_right]
        else:
            right_asof_by_ = asof_by_right

        # `asof_by` overrides both side-specific arguments.
        if isinstance(asof_by, str):
            left_asof_by_ = [asof_by]
            right_asof_by_ = [asof_by]
        elif isinstance(asof_by, list):
            left_asof_by_ = asof_by
            right_asof_by_ = asof_by

        if left_asof_by_ is None:
            left_asof_by_ = []
        if right_asof_by_ is None:
            right_asof_by_ = []

        return wrap_ldf(
            self._ldf.join(
                ldf._ldf,
                new_left_on,
                new_right_on,
                allow_parallel,
                force_parallel,
                how,
                suffix,
                left_asof_by_,
                right_asof_by_,
            )
        )
コード例 #9
0
ファイル: lazy_frame.py プロジェクト: ghuls/polars
 def interpolate(self) -> "LazyFrame":
     """
     Fill in intermediate values by interpolation. The interpolation method is linear.

     The interpolation expression is applied to every column (`"*"`).
     """
     every_column = pli.col("*")
     return self.select(every_column.interpolate())