def assert_frame_equal_local_categoricals(
    df_a: pli.DataFrame, df_b: pli.DataFrame
) -> None:
    """
    Assert that two frames are equal, comparing Categorical columns by content.

    The schemas must be identical. The frames are then compared twice: once
    with every Categorical column cast to `str`, and once with every
    Categorical column replaced by its physical representation.
    """
    assert df_a.schema == df_b.schema

    def _equal_under(projection: pli.Expr) -> bool:
        # Apply the same projection to both frames before comparing them.
        return df_a.with_column(projection).frame_equal(df_b.with_column(projection))

    assert _equal_under(pli.col(Categorical).cast(str))
    assert _equal_under(pli.col(Categorical).to_physical())
def arg_where(
    condition: pli.Expr | pli.Series, eager: bool = False
) -> pli.Expr | pli.Series:
    """
    Return indices where `condition` evaluates `True`.

    Parameters
    ----------
    condition
        Boolean expression to evaluate
    eager
        If True, `condition` must be a Series; it is evaluated immediately and
        a Series is returned. Otherwise an expression is returned.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
    >>> df.select(
    ...     [
    ...         pl.arg_where(pl.col("a") % 2 == 0),
    ...     ]
    ... ).to_series()
    shape: (2,)
    Series: 'a' [u32]
    [
        1
        3
    ]

    """
    if not eager:
        lazy_condition = pli.expr_to_lit_or_expr(condition, str_to_lit=True)
        return pli.wrap_expr(py_arg_where(lazy_condition._pyexpr))

    if not isinstance(condition, pli.Series):
        raise ValueError(
            f"expected 'Series' in 'arg_where' if 'eager=True', got {type(condition)}"
        )

    # Eager mode reuses the lazy path: the Series is placed in a one-column
    # frame and selected through the expression built on its own name.
    as_frame = condition.to_frame()
    indices_expr = arg_where(pli.col(condition.name))
    return as_frame.select(indices_expr).to_series()
def _prepare_groupby_inputs(
    by: Optional[Union[str, List[str], "pli.Expr", List["pli.Expr"]]],
) -> List["PyExpr"]:
    """
    Normalize a groupby `by` argument into a list of `PyExpr` objects.

    Parameters
    ----------
    by
        A column name, an expression, a list of column names and/or
        expressions, or None.

    Returns
    -------
    List with the `_pyexpr` of every key; empty when `by` is None.

    Raises
    ------
    TypeError
        If `by` is not None, a str, an Expr or a list.
    """
    if by is None:
        return []
    if isinstance(by, str):
        return [pli.col(by)._pyexpr]
    if isinstance(by, list):
        # Strings are column names; every other element is used as an
        # expression as-is.
        return [(pli.col(e) if isinstance(e, str) else e)._pyexpr for e in by]
    if isinstance(by, pli.Expr):
        return [by._pyexpr]

    # An unsupported type used to fall through every branch and surface as an
    # UnboundLocalError on the result variable; report the real problem instead.
    raise TypeError(
        f"expected 'by' to be a str, an Expr, or a list of those, got {type(by)}"
    )
def filter(self, predicate: Union["pli.Expr", str]) -> "LazyFrame":
    """
    Keep only the rows of this LazyFrame for which a predicate holds.

    Parameters
    ----------
    predicate
        Expression that evaluates to a boolean Series. A string is turned
        into a column expression via `pli.col`.

    Examples
    --------
    >>> lf = pl.DataFrame(
    ...     {
    ...         "foo": [1, 2, 3],
    ...         "bar": [6, 7, 8],
    ...         "ham": ["a", "b", "c"],
    ...     }
    ... ).lazy()

    Filter on one condition:

    >>> lf.filter(pl.col("foo") < 3).collect()
    shape: (2, 3)
    ┌─────┬─────┬─────┐
    │ foo ┆ bar ┆ ham │
    │ --- ┆ --- ┆ --- │
    │ i64 ┆ i64 ┆ str │
    ╞═════╪═════╪═════╡
    │ 1   ┆ 6   ┆ a   │
    ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
    │ 2   ┆ 7   ┆ b   │
    └─────┴─────┴─────┘

    Filter on multiple conditions:

    >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect()
    shape: (1, 3)
    ┌─────┬─────┬─────┐
    │ foo ┆ bar ┆ ham │
    │ --- ┆ --- ┆ --- │
    │ i64 ┆ i64 ┆ str │
    ╞═════╪═════╪═════╡
    │ 1   ┆ 6   ┆ a   │
    └─────┴─────┴─────┘

    """
    predicate_expr = pli.col(predicate) if isinstance(predicate, str) else predicate
    filtered = self._ldf.filter(predicate_expr._pyexpr)
    return wrap_ldf(filtered)
def _post_apply_columns(pydf: PyDataFrame, columns: ColumnsType) -> PyDataFrame:
    """
    Apply the 'columns' param _after_ PyDataFrame creation (if no alternative).

    Renames the frame's columns when the requested names differ from the
    current ones, then casts every column whose requested dtype differs from
    its current dtype.
    """
    current_names, current_dtypes = pydf.columns(), pydf.dtypes()
    # Fall back to the existing names when no 'columns' value was supplied.
    columns, dtypes = _unpack_columns(columns or current_names)
    if columns != current_names:
        pydf.set_column_names(columns)

    column_casts = []
    for position, name in enumerate(columns):
        # Only columns with a requested dtype that differs from the current
        # dtype at the same position need a cast.
        if name in dtypes and dtypes[name] != current_dtypes[position]:
            column_casts.append(pli.col(name).cast(dtypes[name])._pyexpr)

    if not column_casts:
        return pydf
    return pydf.lazy().with_columns(column_casts).collect()
def verify_series_and_expr_api(
    input: pli.Series,
    expected: pli.Series | None,
    op: str,
    *args: Any,
    **kwargs: Any,
) -> None:
    """
    Check that an operation gives the same result via the Series and Expr APIs.

    The operation named `op` is invoked with `*args` / `**kwargs` both on an
    expression (evaluated on `input` wrapped in a frame) and directly on
    `input`. When `expected` is given, both results are compared against it;
    when it is None, the two results are compared with each other.

    Examples
    --------
    >>> s = pl.Series([1, 3, 2])
    >>> expected = pl.Series([1, 2, 3])
    >>> verify_series_and_expr_api(s, expected, "sort")

    """
    op_as_expr = _getattr_multi(pli.col("*"), op)(*args, **kwargs)
    via_expr: pli.Series = input.to_frame().select(op_as_expr)[
        :, 0
    ]  # type: ignore[assignment]
    via_series = _getattr_multi(input, op)(*args, **kwargs)

    if expected is not None:
        assert_series_equal(via_expr, expected)
        assert_series_equal(via_series, expected)
        return
    assert_series_equal(via_series, via_expr)
def cut(
    s: pli.Series,
    bins: list[float],
    labels: Optional[list[str]] = None,
    break_point_label: str = "break_point",
    category_label: str = "category",
) -> pli.DataFrame:
    """
    Bin values into discrete values

    .. warning::
        This function is experimental and might change without it being considered a
        breaking change.

    Parameters
    ----------
    s
        Series to bin.
    bins
        Bins to create.
    labels
        Labels to assign to the bins. If given the length of labels must be
        len(bins) + 1.
    break_point_label
        Name given to the breakpoint column.
    category_label
        Name given to the category column.

    Returns
    -------
    DataFrame
        The values of `s` in sorted order, together with the break point and
        category assigned to each value.

    Raises
    ------
    ValueError
        If `labels` is given and its length is not len(bins) + 1.

    Examples
    --------
    >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)])
    >>> pl.cut(a, bins=[-1, 1])
    shape: (12, 3)
    ┌──────┬─────────────┬──────────────┐
    │ a    ┆ break_point ┆ category     │
    │ ---  ┆ ---         ┆ ---          │
    │ f64  ┆ f64         ┆ cat          │
    ╞══════╪═════════════╪══════════════╡
    │ -3.0 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ -2.5 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ -2.0 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ -1.5 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ ...  ┆ ...         ┆ ...          │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 1.0  ┆ 1.0         ┆ (-1.0, 1.0]  │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 1.5  ┆ inf         ┆ (1.0, inf]   │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2.0  ┆ inf         ┆ (1.0, inf]   │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2.5  ┆ inf         ┆ (1.0, inf]   │
    └──────┴─────────────┴──────────────┘

    """
    var_nm = s.name

    # One row per bin: the given break points plus a trailing +inf that closes
    # the last bin.
    # NOTE(review): presumably `bins` must be sorted ascending — confirm.
    cuts_df = pli.DataFrame(
        [
            pli.Series(
                name=break_point_label, values=bins, dtype=Float64
            ).extend_constant(float("inf"), 1)
        ]
    )

    if labels:
        expected_n_labels = len(bins) + 1
        if len(labels) != expected_n_labels:
            # The mismatch can go either way, so report both counts rather
            # than claiming that labels are missing.
            raise ValueError(
                f"expected {expected_n_labels} labels (len(bins) + 1), "
                f"got {len(labels)}"
            )
        cuts_df = cuts_df.with_column(pli.Series(name=category_label, values=labels))
    else:
        # Default labels have the form "(lower, upper]": the lower edge is the
        # previous break point, with -inf for the first bin.
        cuts_df = cuts_df.with_column(
            pli.format(
                "({}, {}]",
                pli.col(break_point_label).shift_and_fill(1, float("-inf")),
                pli.col(break_point_label),
            ).alias(category_label)
        )

    cuts_df = cuts_df.with_column(pli.col(category_label).cast(Categorical))

    # Each value is matched with the first break point at or after it
    # (strategy="forward"); the series is sorted before the asof join.
    result = (
        s.sort()
        .to_frame()
        .join_asof(
            cuts_df,
            left_on=var_nm,
            right_on=break_point_label,
            strategy="forward",
        )
    )
    return result
def join(
    self,
    ldf: "LazyFrame",
    left_on: Optional[Union[str, "pli.Expr", List[Union[str, "pli.Expr"]]]] = None,
    right_on: Optional[Union[str, "pli.Expr", List[Union[str, "pli.Expr"]]]] = None,
    on: Optional[Union[str, "pli.Expr", List[Union[str, "pli.Expr"]]]] = None,
    how: str = "inner",
    suffix: str = "_right",
    allow_parallel: bool = True,
    force_parallel: bool = False,
    asof_by: Optional[Union[str, List[str]]] = None,
    asof_by_left: Optional[Union[str, List[str]]] = None,
    asof_by_right: Optional[Union[str, List[str]]] = None,
) -> "LazyFrame":
    """
    Add a join operation to the Logical Plan.

    Parameters
    ----------
    ldf
        Lazy DataFrame to join with.
    left_on
        Join column(s) of the left DataFrame.
    right_on
        Join column(s) of the right DataFrame.
    on
        Join column(s) of both DataFrames. If set, `left_on` and `right_on`
        should be None; when given, `on` takes precedence over them.
    how
        one of: "inner", "left", "outer", "asof", "cross"
    suffix
        Suffix to append to columns with a duplicate name.
    allow_parallel
        Allow the physical plan to optionally evaluate the computation of both
        DataFrames up to the join in parallel.
    force_parallel
        Force the physical plan to evaluate the computation of both DataFrames
        up to the join in parallel.
    asof_by
        join on these columns before doing asof join
    asof_by_left
        join on these columns before doing asof join
    asof_by_right
        join on these columns before doing asof join

    Raises
    ------
    ValueError
        If `how` is not "cross" and no join columns were supplied for either
        side.

    Notes
    -----
    Asof joins are similar to a left-join except that we match on nearest key
    rather than equal keys. The keys must be sorted to perform an asof join.

    For ``how="cross"`` all key arguments are ignored.
    """
    if how == "cross":
        # A cross join has no keys: pass empty key and asof-by lists.
        return wrap_ldf(
            self._ldf.join(
                ldf._ldf,
                [],
                [],
                allow_parallel,
                force_parallel,
                how,
                suffix,
                [],
                [],
            )
        )

    def _listify(value, scalar_types):
        # Wrap a single key in a list; lists and None pass through unchanged.
        return [value] if isinstance(value, scalar_types) else value

    def _to_pyexprs(columns):
        # Strings are column names; anything else is used as an expression.
        return [(pli.col(c) if isinstance(c, str) else c)._pyexpr for c in columns]

    key_types = (str, pli.Expr)

    left_on_: Optional[List[Union[str, pli.Expr]]] = _listify(left_on, key_types)
    right_on_: Optional[List[Union[str, pli.Expr]]] = _listify(right_on, key_types)

    # `on` applies to both sides. A single expression is accepted here just
    # like a single column name (it used to be ignored, which led to the
    # misleading "pass the column to join on" error below).
    if isinstance(on, key_types):
        left_on_, right_on_ = [on], [on]
    elif isinstance(on, list):
        left_on_, right_on_ = on, on

    if left_on_ is None or right_on_ is None:
        raise ValueError("You should pass the column to join on as an argument.")

    new_left_on = _to_pyexprs(left_on_)
    new_right_on = _to_pyexprs(right_on_)

    # set asof_by: these are column names only, on both sides.
    left_asof_by_: Union[List[str], None] = _listify(asof_by_left, str)
    right_asof_by_: Union[List[str], None] = _listify(asof_by_right, str)

    if isinstance(asof_by, str):
        left_asof_by_, right_asof_by_ = [asof_by], [asof_by]
    elif isinstance(asof_by, list):
        left_asof_by_, right_asof_by_ = asof_by, asof_by

    if left_asof_by_ is None:
        left_asof_by_ = []
    if right_asof_by_ is None:
        right_asof_by_ = []

    return wrap_ldf(
        self._ldf.join(
            ldf._ldf,
            new_left_on,
            new_right_on,
            allow_parallel,
            force_parallel,
            how,
            suffix,
            left_asof_by_,
            right_asof_by_,
        )
    )
def interpolate(self) -> "LazyFrame":
    """
    Interpolate intermediate values using linear interpolation.

    The interpolation is applied through a select on the `"*"` column
    expression.
    """
    interpolated = pli.col("*").interpolate()
    return self.select(interpolated)