def arange(
    low: int | pli.Expr | pli.Series,
    high: int | pli.Expr | pli.Series,
    step: int = 1,
    *,
    eager: bool = False,
) -> pli.Expr | pli.Series:
    """
    Build a range expression.

    The expression can be used in a `select`, `with_column`, etc. Make sure the
    size of the range matches the DataFrame you are collecting.

    Parameters
    ----------
    low
        Lower bound of range.
    high
        Upper bound of range.
    step
        Step size of the range.
    eager
        If `True`, the range is evaluated right away and a Series is returned
        instead of an Expr.

    Examples
    --------
    >>> df.lazy().filter(pl.col("foo") < pl.arange(0, 100)).collect()  # doctest: +SKIP

    """
    low_expr = pli.expr_to_lit_or_expr(low, str_to_lit=False)
    high_expr = pli.expr_to_lit_or_expr(high, str_to_lit=False)

    if not eager:
        return pli.wrap_expr(pyarange(low_expr._pyexpr, high_expr._pyexpr, step))

    # Eager mode: run the lazy range through `select` on a throwaway one-row
    # frame and pull the resulting column out as a Series.
    anchor = pli.DataFrame({"a": [1]})
    evaluated = anchor.select(arange(low_expr, high_expr, step).alias("arange"))
    return evaluated["arange"]
def draw_frames(draw: Callable) -> pli.DataFrame | pli.LazyFrame:
    """
    Draw one frame according to the settings of the surrounding scope.

    The column spec, size bounds, null probability and laziness flag (`cols`,
    `include_cols`, `size`, `min_size`, `max_size`, `null_probability`,
    `lazy`, ...) are free variables here; they are not parameters of this
    function and must be provided by the enclosing scope.
    """
    # without an explicit column spec, generate 'n' columns with random dtypes
    if cols is None:
        n_cols = between(draw, int, min_=(min_cols or 0), max_=(max_cols or MAX_COLS))
        random_dtypes = [draw(sampled_from(selectable_dtypes)) for _ in range(n_cols)]
        coldefs = columns(cols=n_cols, dtype=random_dtypes)
    elif isinstance(cols, column):
        coldefs = [cols]
    else:
        coldefs = list(cols)  # type: ignore[arg-type]

    # explicitly provided columns are appended after the base definitions
    coldefs.extend(include_cols or ())

    # an explicit size wins; otherwise draw one within the allowed bounds
    if size is None:
        series_size = between(
            draw, int, min_=(min_size or 0), max_=(max_size or MAX_DATA_SIZE)
        )
    else:
        series_size = size

    # fill in missing names and null probabilities on the column definitions
    for position, coldef in enumerate(coldefs):
        if coldef.name is None:
            coldef.name = f"col{position}"
        if coldef.null_probability is None:
            coldef.null_probability = (
                null_probability.get(coldef.name, 0.0)
                if isinstance(null_probability, dict)
                else null_probability
            )

    # a bare name when no dtype is set, otherwise a (name, dtype) pair
    frame_columns = [
        (coldef.name, coldef.dtype) if coldef.dtype is not None else coldef.name
        for coldef in coldefs
    ]

    # draw the series data column by column, in definition order; series data
    # is given as a python-native sequence (TODO: or as an arrow array).
    series_data = {}
    for coldef in coldefs:
        series_data[coldef.name] = draw(
            series(
                name=coldef.name,
                dtype=coldef.dtype,
                size=series_size,
                null_probability=(coldef.null_probability or 0.0),
                strategy=coldef.strategy,
                unique=coldef.unique,
            )
        )

    df = pli.DataFrame(
        data=series_data,
        columns=frame_columns,  # type: ignore[arg-type]
    )

    # hand back a lazy frame when requested
    if lazy:
        return df.lazy()
    return df
def arrow_to_pydf(
    data: "pa.Table", columns: Optional[Sequence[str]] = None, rechunk: bool = True
) -> "PyDataFrame":
    """
    Build a PyDataFrame from an Arrow Table.

    Parameters
    ----------
    data
        Arrow table to convert.
    columns
        Optional column names; when given, the table's columns are renamed to
        these before conversion.
    rechunk
        Whether to rechunk the resulting frame. The flag is also forwarded to
        the per-column conversion used for dictionary columns.

    Raises
    ------
    ImportError
        If pyarrow is not available.
    ValueError
        If renaming the table with `columns` is rejected by Arrow.
    """
    if not _PYARROW_AVAILABLE:  # pragma: no cover
        raise ImportError(
            "'pyarrow' is required when constructing a PyDataFrame from an Arrow Table."
        )

    if columns is not None:
        try:
            data = data.rename_columns(columns)
        except pa.lib.ArrowInvalid as e:
            raise ValueError(
                "Dimensions of columns arg must match data dimensions."
            ) from e

    # Dictionary columns cannot be built in separate batches (categoricals do
    # not allow that), so they are converted one by one as Series and attached
    # to the frame afterwards.
    names = []
    plain_columns = {}
    dictionary_series = {}
    for idx, arrow_col in enumerate(data):
        # take the name before coercing the column
        name = f"column_{idx}" if arrow_col._name is None else arrow_col._name
        names.append(name)

        coerced = coerce_arrow(arrow_col)
        if pa.types.is_dictionary(coerced.type):
            dictionary_series[idx] = pli.wrap_s(
                arrow_to_pyseries(name, coerced, rechunk)
            )
        else:
            plain_columns[name] = coerced

    if plain_columns:
        tbl = pa.table(plain_columns)
        if tbl.shape[0] == 0:
            # a table without rows goes through pandas so the dtypes are kept
            pydf = pli.DataFrame._from_pandas(tbl.to_pandas())._df
        else:
            pydf = PyDataFrame.from_arrow_record_batches(tbl.to_batches())
    else:
        pydf = pli.DataFrame([])._df

    if rechunk:
        pydf = pydf.rechunk()

    if dictionary_series:
        df = pli.wrap_df(pydf)
        for s in dictionary_series.values():
            df[s.name] = s
        # restore the column order of the incoming table
        pydf = df[names]._df

    return pydf
def select(
    exprs: Union[str, "pli.Expr", Sequence[str], Sequence["pli.Expr"], "pli.Series"]
) -> "pli.DataFrame":
    """
    Evaluate polars expressions without a context.

    Shorthand for calling `df.select` on an empty DataFrame.

    Parameters
    ----------
    exprs
        Expressions to run

    Returns
    -------
    DataFrame

    Examples
    --------
    >>> foo = pl.Series("foo", [1, 2, 3])
    >>> bar = pl.Series("bar", [3, 2, 1])
    >>> pl.select(
    ...     [
    ...         pl.min([foo, bar]),
    ...     ]
    ... )
    shape: (3, 1)
    ┌─────┐
    │ min │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 1   │
    ├╌╌╌╌╌┤
    │ 2   │
    ├╌╌╌╌╌┤
    │ 1   │
    └─────┘

    """
    # an empty frame serves as the context the expressions are evaluated in
    empty_frame = pli.DataFrame([])
    return empty_frame.select(exprs)
def cut(
    s: pli.Series,
    bins: list[float],
    labels: Optional[list[str]] = None,
    break_point_label: str = "break_point",
    category_label: str = "category",
) -> pli.DataFrame:
    """
    Bin values into discrete values

    .. warning::
        This function is experimental and might change without it being considered a
        breaking change.

    Parameters
    ----------
    s
        Series to bin.
    bins
        Bins to create.
    labels
        Labels to assign to the bins. If given the length of labels must be
        len(bins) + 1. An empty or missing `labels` falls back to generated
        interval labels of the form ``(lower, upper]``.
    break_point_label
        Name given to the breakpoint column.
    category_label
        Name given to the category column.

    Returns
    -------
    DataFrame
        The values of `s` in sorted order, joined with the break point and
        category they fall into.

    Raises
    ------
    ValueError
        If `labels` is given and its length is not len(bins) + 1.

    Examples
    --------
    >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)])
    >>> pl.cut(a, bins=[-1, 1])
    shape: (12, 3)
    ┌──────┬─────────────┬──────────────┐
    │ a    ┆ break_point ┆ category     │
    │ ---  ┆ ---         ┆ ---          │
    │ f64  ┆ f64         ┆ cat          │
    ╞══════╪═════════════╪══════════════╡
    │ -3.0 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ -2.5 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ -2.0 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ -1.5 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ ...  ┆ ...         ┆ ...          │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 1.0  ┆ 1.0         ┆ (-1.0, 1.0]  │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 1.5  ┆ inf         ┆ (1.0, inf]   │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2.0  ┆ inf         ┆ (1.0, inf]   │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2.5  ┆ inf         ┆ (1.0, inf]   │
    └──────┴─────────────┴──────────────┘

    """
    # Validate up front so a bad `labels` argument fails before any frame is
    # built. The count can be off in either direction, so report both numbers
    # rather than claiming that more labels are needed.
    if labels:
        expected = len(bins) + 1
        if len(labels) != expected:
            raise ValueError(
                f"expected {expected} labels (len(bins) + 1), got {len(labels)}"
            )

    var_nm = s.name

    # One row per break point, plus a trailing +inf row that catches every
    # value above the last bin edge.
    cuts_df = pli.DataFrame(
        [
            pli.Series(
                name=break_point_label, values=bins, dtype=Float64
            ).extend_constant(float("inf"), 1)
        ]
    )

    if labels:
        cuts_df = cuts_df.with_column(pli.Series(name=category_label, values=labels))
    else:
        # Generate "(lower, upper]" labels; the lower bound of the first
        # interval is -inf.
        cuts_df = cuts_df.with_column(
            pli.format(
                "({}, {}]",
                pli.col(break_point_label).shift_and_fill(1, float("-inf")),
                pli.col(break_point_label),
            ).alias(category_label)
        )

    cuts_df = cuts_df.with_column(pli.col(category_label).cast(Categorical))

    # Forward as-of join: each (sorted) value is matched with the first break
    # point that is greater than or equal to it.
    result = (
        s.sort()
        .to_frame()
        .join_asof(
            cuts_df,
            left_on=var_nm,
            right_on=break_point_label,
            strategy="forward",
        )
    )
    return result