Esempio n. 1
0
def arange(
    low: int | pli.Expr | pli.Series,
    high: int | pli.Expr | pli.Series,
    step: int = 1,
    *,
    eager: bool = False,
) -> pli.Expr | pli.Series:
    """
    Build a range expression usable in `select`, `with_column` and similar contexts.

    The size of the range must match the DataFrame you are collecting.

    Parameters
    ----------
    low
        Lower bound of range.
    high
        Upper bound of range.
    step
        Step size of the range.
    eager
        When `True`, the range is evaluated immediately and a Series is
        returned instead of an Expr.

    Returns
    -------
    Expr, or a Series named "arange" when ``eager`` is `True`.

    Examples
    --------
    >>> df.lazy().filter(pl.col("foo") < pl.arange(0, 100)).collect()  # doctest: +SKIP
    """
    # Normalise both bounds to expressions; strings are not turned into literals.
    lower = pli.expr_to_lit_or_expr(low, str_to_lit=False)
    upper = pli.expr_to_lit_or_expr(high, str_to_lit=False)

    if not eager:
        return pli.wrap_expr(pyarange(lower._pyexpr, upper._pyexpr, step))

    # Eager path: evaluate the lazy expression against a throwaway one-row
    # frame and pull the resulting column out as a Series.
    context = pli.DataFrame({"a": [1]})
    range_expr = arange(lower, upper, step).alias("arange")
    return context.select(range_expr)["arange"]
Esempio n. 2
0
        def draw_frames(draw: Callable) -> pli.DataFrame | pli.LazyFrame:
            """
            Draw one DataFrame (or LazyFrame) from the enclosing configuration.

            Column definitions are taken from ``cols`` (or generated at random
            when ``cols`` is None), extended with ``include_cols``, given default
            names / null probabilities where missing, and then each column's
            data is drawn through the ``series`` strategy.

            Parameters
            ----------
            draw
                Callable used to draw values from strategies (presumably the one
                supplied by the surrounding strategy decorator -- confirm
                against the enclosing function).
            """
            # --- resolve the list of column definitions -----------------------
            if cols is None:
                # nothing specified: pick a column count, then a random dtype
                # for each of those columns
                n_cols = between(
                    draw, int, min_=(min_cols or 0), max_=(max_cols or MAX_COLS)
                )
                random_dtypes = []
                for _ in range(n_cols):
                    random_dtypes.append(draw(sampled_from(selectable_dtypes)))
                column_defs = columns(cols=n_cols, dtype=random_dtypes)
            elif isinstance(cols, column):
                column_defs = [cols]
            else:
                column_defs = list(cols)  # type: ignore[arg-type]

            # explicitly requested columns always come last
            column_defs.extend(include_cols or ())

            # --- resolve the number of rows -----------------------------------
            if size is None:
                n_rows = between(
                    draw,
                    int,
                    min_=(min_size or 0),
                    max_=(max_size or MAX_DATA_SIZE),
                )
            else:
                n_rows = size

            # --- fill in per-column defaults (mutates the definitions) --------
            for position, coldef in enumerate(column_defs):
                if coldef.name is None:
                    coldef.name = f"col{position}"
                if coldef.null_probability is None:
                    coldef.null_probability = (
                        null_probability.get(coldef.name, 0.0)
                        if isinstance(null_probability, dict)
                        else null_probability
                    )

            # a bare name when no dtype is set, otherwise a (name, dtype) pair
            schema = [
                (coldef.name, coldef.dtype) if coldef.dtype is not None else coldef.name
                for coldef in column_defs
            ]

            # --- draw the data; series data is given as a python-native -------
            # --- sequence (TODO: or as an arrow array) ------------------------
            generated = {}
            for coldef in column_defs:
                generated[coldef.name] = draw(
                    series(
                        name=coldef.name,
                        dtype=coldef.dtype,
                        size=n_rows,
                        null_probability=(coldef.null_probability or 0.0),
                        strategy=coldef.strategy,
                        unique=coldef.unique,
                    )
                )

            frame = pli.DataFrame(
                data=generated,
                columns=schema,  # type: ignore[arg-type]
            )

            # hand back a lazy frame only when requested
            if lazy:
                return frame.lazy()
            return frame
Esempio n. 3
0
def arrow_to_pydf(data: "pa.Table",
                  columns: Optional[Sequence[str]] = None,
                  rechunk: bool = True) -> "PyDataFrame":
    """
    Build a PyDataFrame out of an Arrow Table.

    Parameters
    ----------
    data
        The Arrow table to convert.
    columns
        Optional replacement column names; applied with ``rename_columns``
        before conversion.
    rechunk
        Whether to rechunk the resulting PyDataFrame (also forwarded to
        ``arrow_to_pyseries`` for dictionary-typed columns).

    Raises
    ------
    ImportError
        If pyarrow is not available.
    ValueError
        If renaming with ``columns`` is rejected by Arrow (``ArrowInvalid``).
    """
    if not _PYARROW_AVAILABLE:  # pragma: no cover
        raise ImportError(
            "'pyarrow' is required when constructing a PyDataFrame from an Arrow Table."
        )
    if columns is not None:
        try:
            data = data.rename_columns(columns)
        except pa.lib.ArrowInvalid as e:
            raise ValueError(
                "Dimensions of columns arg must match data dimensions.") from e

    # Dictionary (categorical) columns cannot be built from separate batches,
    # so they are converted one by one into Series and attached afterwards;
    # every other column is collected for a single bulk conversion.
    plain_columns = {}
    categorical_series = []
    ordered_names = []
    for position, arrow_col in enumerate(data):
        # the name has to be read before the column is coerced
        name = f"column_{position}" if arrow_col._name is None else arrow_col._name
        ordered_names.append(name)

        coerced = coerce_arrow(arrow_col)
        if not pa.types.is_dictionary(coerced.type):
            plain_columns[name] = coerced
            continue
        categorical_series.append(
            pli.wrap_s(arrow_to_pyseries(name, coerced, rechunk)))

    if plain_columns:
        table = pa.table(plain_columns)
        if table.shape[0] > 0:
            pydf = PyDataFrame.from_arrow_record_batches(table.to_batches())
        else:
            # a table without rows goes through this path to keep its datatypes
            pydf = pli.DataFrame._from_pandas(table.to_pandas())._df
    else:
        pydf = pli.DataFrame([])._df

    if rechunk:
        pydf = pydf.rechunk()

    if not categorical_series:
        return pydf

    # Attach the separately built columns, then restore the original order.
    frame = pli.wrap_df(pydf)
    for series in categorical_series:
        frame[series.name] = series
    return frame[ordered_names]._df
Esempio n. 4
0
def select(
    exprs: Union[str, "pli.Expr", Sequence[str], Sequence["pli.Expr"],
                 "pli.Series"]
) -> "pli.DataFrame":
    """
    Evaluate polars expressions without an existing DataFrame context.

    Shorthand for calling `select` on a newly created, empty DataFrame.

    Parameters
    ----------
    exprs
        Expressions to run

    Returns
    -------
    DataFrame

    Examples
    --------

    >>> foo = pl.Series("foo", [1, 2, 3])
    >>> bar = pl.Series("bar", [3, 2, 1])
    >>> pl.select(
    ...     [
    ...         pl.min([foo, bar]),
    ...     ]
    ... )
    shape: (3, 1)
    ┌─────┐
    │ min │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 1   │
    ├╌╌╌╌╌┤
    │ 2   │
    ├╌╌╌╌╌┤
    │ 1   │
    └─────┘

    """
    empty_frame = pli.DataFrame([])
    return empty_frame.select(exprs)
Esempio n. 5
0
def cut(
    s: pli.Series,
    bins: list[float],
    labels: Optional[list[str]] = None,
    break_point_label: str = "break_point",
    category_label: str = "category",
) -> pli.DataFrame:
    """
    Bin values into discrete values

    .. warning::
        This function is experimental and might change without it being considered a breaking change.

    Parameters
    ----------
    s
        Series to bin.
    bins
        Bins to create.
    labels
        Labels to assign to the bins. If given the length of labels must be len(bins) + 1.
        When omitted (or empty), labels of the form ``(lower, upper]`` are generated.
    break_point_label
        Name given to the breakpoint column.
    category_label
        Name given to the category column.

    Returns
    -------
    DataFrame
        The values of ``s`` in sorted order, joined with the break point and
        category of the bin each value falls into.

    Raises
    ------
    ValueError
        If ``labels`` is given and its length is not ``len(bins) + 1``.

    Examples
    --------
    >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)])
    >>> pl.cut(a, bins=[-1, 1])
    shape: (12, 3)
    ┌──────┬─────────────┬──────────────┐
    │ a    ┆ break_point ┆ category     │
    │ ---  ┆ ---         ┆ ---          │
    │ f64  ┆ f64         ┆ cat          │
    ╞══════╪═════════════╪══════════════╡
    │ -3.0 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ -2.5 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ -2.0 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ -1.5 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ ...  ┆ ...         ┆ ...          │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 1.0  ┆ 1.0         ┆ (-1.0, 1.0]  │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 1.5  ┆ inf         ┆ (1.0, inf]   │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2.0  ┆ inf         ┆ (1.0, inf]   │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2.5  ┆ inf         ┆ (1.0, inf]   │
    └──────┴─────────────┴──────────────┘

    """
    # Validate first so a bad call fails before any frame is built. One label
    # is needed per bin plus one for the trailing (last_bin, inf] interval.
    if labels and len(labels) != len(bins) + 1:
        raise ValueError(
            f"expected {len(bins) + 1} labels (len(bins) + 1), got {len(labels)}"
        )

    var_nm = s.name

    # One row per break point, with +inf appended as the final upper bound.
    cuts_df = pli.DataFrame([
        pli.Series(name=break_point_label, values=bins,
                   dtype=Float64).extend_constant(float("inf"), 1)
    ])

    if labels:
        cuts_df = cuts_df.with_column(
            pli.Series(name=category_label, values=labels))
    else:
        # Generate "(lower, upper]" labels; the first interval starts at -inf.
        cuts_df = cuts_df.with_column(
            pli.format(
                "({}, {}]",
                pli.col(break_point_label).shift_and_fill(1, float("-inf")),
                pli.col(break_point_label),
            ).alias(category_label))

    cuts_df = cuts_df.with_column(pli.col(category_label).cast(Categorical))

    # Forward as-of join: each (sorted) value is matched with the first break
    # point that is greater than or equal to it.
    result = (s.sort().to_frame().join_asof(
        cuts_df,
        left_on=var_nm,
        right_on=break_point_label,
        strategy="forward",
    ))
    return result