def apply(
    exprs: List[Union[str, "pli.Expr"]],
    f: Callable[[List["pli.Series"]], Union["pli.Series", Any]],
    return_dtype: Optional[Type[DataType]] = None,
) -> "pli.Expr":
    """
    Apply a custom function in a GroupBy context.

    Depending on the context it has the following behavior:

    ## Context

    * Select/Project
        Don't do this, use `map`
    * GroupBy
        expected type `f`: Callable[[Series], Series]
        Applies a python function over each group.

    Parameters
    ----------
    exprs
        Input Series to f
    f
        Function to apply over the input
    return_dtype
        dtype of the output Series

    Returns
    -------
    Expr
    """
    pyexprs = pli.selection_to_pyexpr_list(exprs)
    # apply_groups=True is the only difference from the `map` code path.
    mapped = _map_mul(pyexprs, f, return_dtype, apply_groups=True)
    return pli.wrap_expr(mapped)
def sort(
    self,
    by: Union[str, "pli.Expr", List[str], List["pli.Expr"]],
    reverse: Union[bool, List[bool]] = False,
) -> "LazyFrame":
    """
    Sort the DataFrame by:

        - A single column name
        - An expression
        - Multiple expressions

    Parameters
    ----------
    by
        Column (expressions) to sort by.
    reverse
        Whether or not to sort in reverse order.

    Returns
    -------
    LazyFrame
    """
    # A plain column name (including str subclasses) is forwarded directly
    # to the column-name sort.
    if isinstance(by, str):
        return wrap_ldf(self._ldf.sort(by, reverse))

    # The expression-based sort is given a list of flags, so wrap a lone bool.
    if isinstance(reverse, bool):
        reverse = [reverse]
    by = pli.selection_to_pyexpr_list(by)
    return wrap_ldf(self._ldf.sort_by_exprs(by, reverse))
def apply(
    exprs: list[str | pli.Expr],
    f: Callable[[list[pli.Series]], pli.Series | Any],
    return_dtype: type[DataType] | None = None,
) -> pli.Expr:
    """
    Apply a custom function in a GroupBy context.

    Depending on the context it has the following behavior:

    * Select
        Don't use apply, use `map`
    * GroupBy
        expected type `f`: Callable[[Series], Series]
        Applies a python function over each group.

    Parameters
    ----------
    exprs
        Input Series to f
    f
        Function to apply over the input
    return_dtype
        dtype of the output Series

    Returns
    -------
    Expr
    """
    pyexprs = pli.selection_to_pyexpr_list(exprs)
    # apply_groups=True is the only difference from the `map` code path.
    mapped = _map_mul(pyexprs, f, return_dtype, apply_groups=True)
    return pli.wrap_expr(mapped)
def agg(self, aggs: Union[List["pli.Expr"], "pli.Expr"]) -> "LazyFrame":
    """
    Describe the aggregation(s) that need to be done on a group.

    Parameters
    ----------
    aggs
        Single / multiple aggregation expression(s).

    Examples
    --------
    >>> (
    ...     pl.scan_csv("data.csv")
    ...     .groupby("groups")
    ...     .agg(
    ...         [
    ...             pl.col("name").n_unique().alias("unique_names"),
    ...             pl.max("values"),
    ...         ]
    ...     )
    ... )  # doctest: +SKIP

    """
    pyexprs = pli.selection_to_pyexpr_list(aggs)
    aggregated = self.lgb.agg(pyexprs)
    return wrap_ldf(aggregated)
def fold(
    acc: pli.IntoExpr,
    f: Callable[[pli.Series, pli.Series], pli.Series],
    exprs: Sequence[pli.Expr | str] | pli.Expr,
) -> pli.Expr:
    """
    Accumulate over multiple columns horizontally/ row wise with a left fold.

    Parameters
    ----------
    acc
        Accumulator Expression. This is the value that will be initialized when
        the fold starts. For a sum this could for instance be lit(0).
    f
        Function to apply over the accumulator and the value.
        Fn(acc, value) -> new_value
    exprs
        Expressions to aggregate over. May also be a wildcard expression.

    Returns
    -------
    Expr
    """
    acc_expr = pli.expr_to_lit_or_expr(acc, str_to_lit=True)
    # A single expression, e.g. pl.col("*"), is wrapped in a one-element list.
    selection = [exprs] if isinstance(exprs, pli.Expr) else exprs
    pyexprs = pli.selection_to_pyexpr_list(selection)
    return pli.wrap_expr(pyfold(acc_expr._pyexpr, f, pyexprs))
def struct(
    exprs: Union[Sequence[Union["pli.Expr", str]], "pli.Expr"]
) -> "pli.Expr":
    """
    Collect several columns into a Series of dtype Struct

    Parameters
    ----------
    exprs
        Columns/Expressions to collect into a Struct

    Returns
    -------
    Expr

    Examples
    --------
    >>> pl.DataFrame(
    ...     {
    ...         "int": [1, 2],
    ...         "str": ["a", "b"],
    ...         "bool": [True, None],
    ...         "list": [[1, 2], [3]],
    ...     }
    ... ).select([pl.struct(pl.all()).alias("my_struct")])
    shape: (2, 1)
    ┌───────────────────────┐
    │ my_struct             │
    │ ---                   │
    │ struct{int, ... list} │
    ╞═══════════════════════╡
    │ {1,"a",true,[1, 2]}   │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ {2,"b",null,[3]}      │
    └───────────────────────┘

    Only collect specific columns as a struct:

    >>> df = pl.DataFrame(
    ...     {"a": [1, 2, 3, 4], "b": ["one", "two", "three", "four"], "c": [9, 8, 7, 6]}
    ... )
    >>> df.with_column(pl.struct(pl.col(["a", "b"])).alias("a_and_b"))
    shape: (4, 4)
    ┌─────┬───────┬─────┬───────────────────────────────┐
    │ a   ┆ b     ┆ c   ┆ a_and_b                       │
    │ --- ┆ ---   ┆ --- ┆ ---                           │
    │ i64 ┆ str   ┆ i64 ┆ struct[2]{'a': i64, 'b': str} │
    ╞═════╪═══════╪═════╪═══════════════════════════════╡
    │ 1   ┆ one   ┆ 9   ┆ {1,"one"}                     │
    ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2   ┆ two   ┆ 8   ┆ {2,"two"}                     │
    ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 3   ┆ three ┆ 7   ┆ {3,"three"}                   │
    ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 4   ┆ four  ┆ 6   ┆ {4,"four"}                    │
    └─────┴───────┴─────┴───────────────────────────────┘

    """
    pyexprs = pli.selection_to_pyexpr_list(exprs)
    as_struct = _as_struct(pyexprs)
    return pli.wrap_expr(as_struct)
def concat_list(
    exprs: Union[Sequence[Union[str, "pli.Expr", "pli.Series"]], "pli.Expr"]
) -> "pli.Expr":
    """
    Concat the arrays in a Series dtype List in linear time.

    Parameters
    ----------
    exprs
        Columns to concat into a List Series

    Returns
    -------
    Expr

    Examples
    --------
    Create lagged columns and collect them into a list. This mimics a rolling window.

    >>> df = pl.DataFrame(
    ...     {
    ...         "A": [1.0, 2.0, 9.0, 2.0, 13.0],
    ...     }
    ... )
    >>> (
    ...     df.with_columns(
    ...         [pl.col("A").shift(i).alias(f"A_lag_{i}") for i in range(3)]
    ...     ).select(
    ...         [
    ...             pl.concat_list([f"A_lag_{i}" for i in range(3)][::-1]).alias(
    ...                 "A_rolling"
    ...             )
    ...         ]
    ...     )
    ... )
    shape: (5, 1)
    ┌─────────────────┐
    │ A_rolling       │
    │ ---             │
    │ list [f64]      │
    ╞═════════════════╡
    │ [null, null, 1] │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ [null, 1, 2]    │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ [1, 2, 9]       │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ [2, 9, 2]       │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ [9, 2, 13]      │
    └─────────────────┘

    """
    pyexprs = pli.selection_to_pyexpr_list(exprs)
    concatenated = _concat_lst(pyexprs)
    return pli.wrap_expr(concatenated)
def concat_str(
    exprs: Union[Sequence[Union["pli.Expr", str]], "pli.Expr"], sep: str = ""
) -> "pli.Expr":
    """
    Horizontally Concat Utf8 Series in linear time. Non utf8 columns are cast to utf8.

    Parameters
    ----------
    exprs
        Columns to concat into a Utf8 Series
    sep
        String value that will be used to separate the values.

    Returns
    -------
    Expr
    """
    pyexprs = pli.selection_to_pyexpr_list(exprs)
    joined = _concat_str(pyexprs, sep)
    return pli.wrap_expr(joined)
def concat_str(exprs: Sequence[pli.Expr | str] | pli.Expr, sep: str = "") -> pli.Expr:
    """
    Horizontally concat Utf8 Series in linear time. Non-Utf8 columns are cast to Utf8.

    Parameters
    ----------
    exprs
        Columns to concat into a Utf8 Series.
    sep
        String value that will be used to separate the values.

    Returns
    -------
    Expr

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [1, 2, 3],
    ...         "b": ["dogs", "cats", None],
    ...         "c": ["play", "swim", "walk"],
    ...     }
    ... )
    >>> df.with_columns(
    ...     [
    ...         pl.concat_str(
    ...             [
    ...                 pl.col("a") * 2,
    ...                 pl.col("b"),
    ...                 pl.col("c"),
    ...             ],
    ...             sep=" ",
    ...         ).alias("full_sentence"),
    ...     ]
    ... )
    shape: (3, 4)
    ┌─────┬──────┬──────┬───────────────┐
    │ a   ┆ b    ┆ c    ┆ full_sentence │
    │ --- ┆ ---  ┆ ---  ┆ ---           │
    │ i64 ┆ str  ┆ str  ┆ str           │
    ╞═════╪══════╪══════╪═══════════════╡
    │ 1   ┆ dogs ┆ play ┆ 2 dogs play   │
    ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2   ┆ cats ┆ swim ┆ 4 cats swim   │
    ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 3   ┆ null ┆ walk ┆ null          │
    └─────┴──────┴──────┴───────────────┘

    """
    pyexprs = pli.selection_to_pyexpr_list(exprs)
    joined = _concat_str(pyexprs, sep)
    return pli.wrap_expr(joined)
def select(
    self,
    exprs: Union[
        str, "pli.Expr", Sequence[str], Sequence["pli.Expr"], "pli.Series"
    ],
) -> "LazyFrame":
    """
    Select columns from this DataFrame.

    Parameters
    ----------
    exprs
        Column or columns to select.

    Returns
    -------
    LazyFrame
    """
    pyexprs = pli.selection_to_pyexpr_list(exprs)
    selected = self._ldf.select(pyexprs)
    return wrap_ldf(selected)
def min(column: str | list[pli.Expr | str] | pli.Series) -> pli.Expr | Any:
    """
    Get the minimum value.

    Parameters
    ----------
    column
        Column(s) to be used in aggregation. Will lead to different behavior based on
        the input.
        input:
            - Union[str, Series] -> aggregate the minimum value of that column.
            - List[Expr] -> aggregate the minimum value horizontally.
    """
    # Series: return the result of its own `min()` directly.
    if isinstance(column, pli.Series):
        return column.min()

    # List of expressions / names: build a single expression over all of them.
    if isinstance(column, list):
        pyexprs = pli.selection_to_pyexpr_list(column)
        return pli.wrap_expr(_min_exprs(pyexprs))

    # Anything else is treated as a column name.
    return col(column).min()
def argsort_by(
    exprs: Union["pli.Expr", str, List[Union["pli.Expr", str]]],
    reverse: Union[List[bool], bool] = False,
) -> "pli.Expr":
    """
    Find the indexes that would sort the columns.

    Argsort by multiple columns. The first column will be used for the ordering.
    If there are duplicates in the first column, the second column will be used to
    determine the ordering and so on.

    Parameters
    ----------
    exprs
        Columns use to determine the ordering. A single column name or
        expression is accepted and treated as a one-element list.
    reverse
        Default is ascending. A single value is repeated once per entry in
        `exprs`.

    Returns
    -------
    Expr
    """
    # Wrap a lone name/expression first: otherwise `len(exprs)` below would
    # count the characters of a column name, or fail for a single expression.
    if isinstance(exprs, (str, pli.Expr)):
        exprs = [exprs]
    if not isinstance(reverse, list):
        reverse = [reverse] * len(exprs)
    exprs = pli.selection_to_pyexpr_list(exprs)
    return pli.wrap_expr(pyargsort_by(exprs, reverse))
def max(
    column: Union[str, List[Union["pli.Expr", str]], "pli.Series"]
) -> Union["pli.Expr", Any]:
    """
    Get the maximum value. Can be used horizontally or vertically.

    Parameters
    ----------
    column
        Column(s) to be used in aggregation. Will lead to different behavior based on
        the input.
        input:
            - Union[str, Series] -> aggregate the maximum value of that column.
            - List[Expr] -> aggregate the maximum value horizontally.
    """
    # Series: return the result of its own `max()` directly.
    if isinstance(column, pli.Series):
        return column.max()

    # List of expressions / names: build a single expression over all of them.
    if isinstance(column, list):
        pyexprs = pli.selection_to_pyexpr_list(column)
        return pli.wrap_expr(_max_exprs(pyexprs))

    # Anything else is treated as a column name.
    return col(column).max()
def map(
    exprs: list[str] | list[pli.Expr],
    f: Callable[[list[pli.Series]], pli.Series],
    return_dtype: type[DataType] | None = None,
) -> pli.Expr:
    """
    Map a custom function over multiple columns/expressions and produce a single
    Series result.

    Parameters
    ----------
    exprs
        Input Series to f
    f
        Function to apply over the input
    return_dtype
        dtype of the output Series

    Returns
    -------
    Expr
    """
    pyexprs = pli.selection_to_pyexpr_list(exprs)
    # apply_groups=False is the only difference from the `apply` code path.
    mapped = _map_mul(pyexprs, f, return_dtype, apply_groups=False)
    return pli.wrap_expr(mapped)
def argsort_by(
    exprs: pli.Expr | str | Sequence[pli.Expr | str],
    reverse: list[bool] | bool = False,
) -> pli.Expr:
    """
    Find the indexes that would sort the columns.

    Argsort by multiple columns. The first column will be used for the ordering.
    If there are duplicates in the first column, the second column will be used to
    determine the ordering and so on.

    Parameters
    ----------
    exprs
        Columns use to determine the ordering.
    reverse
        Default is ascending.

    Returns
    -------
    Expr
    """
    # A str is itself a Sequence, so it is excluded explicitly; any other
    # non-sequence is wrapped in a one-element list.
    is_collection = isinstance(exprs, Sequence) and not isinstance(exprs, str)
    if not is_collection:
        exprs = [exprs]

    # A single flag is repeated once per column.
    flags = [reverse] * len(exprs) if isinstance(reverse, bool) else reverse

    pyexprs = pli.selection_to_pyexpr_list(exprs)
    return pli.wrap_expr(pyargsort_by(pyexprs, flags))
def explode(
    self, columns: Union[str, List[str], "pli.Expr", List["pli.Expr"]]
) -> "LazyFrame":
    """
    Explode lists to long format.

    Parameters
    ----------
    columns
        Column name(s) or expression(s) to explode.

    Returns
    -------
    LazyFrame

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "letters": ["c", "c", "a", "c", "a", "b"],
    ...         "nrs": [[1, 2], [1, 3], [4, 3], [5, 5, 5], [6], [2, 1, 2]],
    ...     }
    ... )
    >>> df
    shape: (6, 2)
    ┌─────────┬────────────┐
    │ letters ┆ nrs        │
    │ ---     ┆ ---        │
    │ str     ┆ list [i64] │
    ╞═════════╪════════════╡
    │ c       ┆ [1, 2]     │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ c       ┆ [1, 3]     │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ a       ┆ [4, 3]     │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ c       ┆ [5, 5, 5]  │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ a       ┆ [6]        │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ b       ┆ [2, 1, 2]  │
    └─────────┴────────────┘
    >>> df.explode("nrs")
    shape: (13, 2)
    ┌─────────┬─────┐
    │ letters ┆ nrs │
    │ ---     ┆ --- │
    │ str     ┆ i64 │
    ╞═════════╪═════╡
    │ c       ┆ 1   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ c       ┆ 2   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ c       ┆ 1   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ c       ┆ 3   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ ...     ┆ ... │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ a       ┆ 6   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ b       ┆ 2   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ b       ┆ 1   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ b       ┆ 2   │
    └─────────┴─────┘

    """
    pyexprs = pli.selection_to_pyexpr_list(columns)
    exploded = self._ldf.explode(pyexprs)
    return wrap_ldf(exploded)
def struct(
    exprs: Sequence[pli.Expr | str | pli.Series] | pli.Expr | pli.Series,
    eager: bool = False,
) -> pli.Expr | pli.Series:
    """
    Collect several columns into a Series of dtype Struct

    Parameters
    ----------
    exprs
        Columns/Expressions to collect into a Struct
    eager
        Evaluate immediately

    Returns
    -------
    Expr, or a Series when `eager` is set.

    Examples
    --------
    >>> pl.DataFrame(
    ...     {
    ...         "int": [1, 2],
    ...         "str": ["a", "b"],
    ...         "bool": [True, None],
    ...         "list": [[1, 2], [3]],
    ...     }
    ... ).select([pl.struct(pl.all()).alias("my_struct")])
    shape: (2, 1)
    ┌─────────────────────┐
    │ my_struct           │
    │ ---                 │
    │ struct[4]           │
    ╞═════════════════════╡
    │ {1,"a",true,[1, 2]} │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ {2,"b",null,[3]}    │
    └─────────────────────┘

    Only collect specific columns as a struct:

    >>> df = pl.DataFrame(
    ...     {"a": [1, 2, 3, 4], "b": ["one", "two", "three", "four"], "c": [9, 8, 7, 6]}
    ... )
    >>> df.with_column(pl.struct(pl.col(["a", "b"])).alias("a_and_b"))
    shape: (4, 4)
    ┌─────┬───────┬─────┬─────────────┐
    │ a   ┆ b     ┆ c   ┆ a_and_b     │
    │ --- ┆ ---   ┆ --- ┆ ---         │
    │ i64 ┆ str   ┆ i64 ┆ struct[2]   │
    ╞═════╪═══════╪═════╪═════════════╡
    │ 1   ┆ one   ┆ 9   ┆ {1,"one"}   │
    ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2   ┆ two   ┆ 8   ┆ {2,"two"}   │
    ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 3   ┆ three ┆ 7   ┆ {3,"three"} │
    ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 4   ┆ four  ┆ 6   ┆ {4,"four"}  │
    └─────┴───────┴─────┴─────────────┘

    """
    # Build the lazy struct expression once; the eager path only adds a final
    # select + to_series on top of it.
    pyexprs = pli.selection_to_pyexpr_list(exprs)
    struct_expr = pli.wrap_expr(_as_struct(pyexprs))
    if not eager:
        return struct_expr
    return pli.select(struct_expr).to_series()