Beispiel #1
0
def _assert_series_inner(
    left: Series,
    right: Series,
    check_dtype: bool,
    check_exact: bool,
    atol: float,
    rtol: float,
    obj: str,
) -> None:
    """
    Assert that two Series agree on dtype (optionally) and on values.

    Values are compared exactly when ``check_exact`` is set, when the left
    dtype is Boolean, or when the Python type mapped from the left dtype has
    no ``__sub__`` (or no mapping exists at all). Otherwise every element
    must satisfy ``abs(left - right) <= atol + rtol * abs(right)``.

    Failures are reported through ``raise_assert_detail`` using ``obj`` as
    the subject of the message.
    """
    # Assume "not subtractable" unless the dtype maps to a Python type
    # that defines subtraction.
    supports_subtraction = False
    try:
        supports_subtraction = hasattr(dtype_to_py_type(left.dtype), "__sub__")
    except NotImplementedError:
        pass

    exact = check_exact or not supports_subtraction or left.dtype == Boolean

    if check_dtype and left.dtype != right.dtype:
        raise_assert_detail(obj, "Dtype mismatch", left.dtype, right.dtype)

    if exact:
        n_mismatched = (left != right).sum()
        message = "Exact value mismatch"
    else:
        deviation = (left - right).abs()
        tolerance = atol + rtol * right.abs()
        n_mismatched = (deviation > tolerance).sum()
        message = "Value mismatch"

    if n_mismatched != 0:
        raise_assert_detail(obj, message, left=list(left), right=list(right))
Beispiel #2
0
def _assert_series_inner(
    left: pli.Series,
    right: pli.Series,
    check_dtype: bool,
    check_exact: bool,
    nans_compare_equal: bool,
    atol: float,
    rtol: float,
    obj: str,
) -> None:
    """
    Compare the dtype and values of two Series, reporting any mismatch.

    Parameters
    ----------
    left, right
        The Series to compare.
    check_dtype
        When true, a dtype difference is reported as "Dtype mismatch".
    check_exact
        When true, values must be exactly equal. Exact comparison is also
        forced when the left dtype is Boolean or when the Python type mapped
        from the left dtype has no ``__sub__`` (or no mapping exists).
    nans_compare_equal
        When true and the left dtype is Float32/Float64, positions where both
        sides are NaN are treated as equal.
    atol, rtol
        Absolute / relative tolerance; an element passes when
        ``abs(left - right) <= atol + rtol * abs(right)``.
    obj
        Label passed to ``raise_assert_detail`` as the subject of the message.

    Notes
    -----
    Mismatches are reported through ``raise_assert_detail``. In the tolerance
    branch only the elements that compared unequal are checked, and only those
    elements are included in the reported ``left`` / ``right`` values.
    """
    try:
        can_be_subtracted = hasattr(dtype_to_py_type(left.dtype), "__sub__")
    except NotImplementedError:
        can_be_subtracted = False

    check_exact = check_exact or not can_be_subtracted or left.dtype == Boolean
    if check_dtype and left.dtype != right.dtype:
        raise_assert_detail(obj, "Dtype mismatch", left.dtype, right.dtype)

    float_dtypes = (Float32, Float64)

    # create mask of which (if any) values are unequal
    unequal = left != right
    if unequal.any() and nans_compare_equal and left.dtype in float_dtypes:
        # handle NaN values (which compare unequal to themselves)
        unequal = unequal & ~(
            (left.is_nan() & right.is_nan()).fill_null(pli.lit(False)))

    # assert exact, or with tolerance
    if unequal.any():
        if check_exact:
            raise_assert_detail(obj,
                                "Exact value mismatch",
                                left=list(left),
                                right=list(right))
        else:
            # apply check with tolerance, but only to the known-unequal matches
            left, right = left.filter(unequal), right.filter(unequal)
            mismatch = ((left - right).abs() >
                        (atol + rtol * right.abs())).sum() != 0
            if not mismatch:
                # A NaN on either side makes the `>` comparison above evaluate
                # to False, so it would slip through as "within tolerance".
                # NaN/NaN pairs that are allowed to match were already removed
                # from the mask, so any NaN left here is a genuine mismatch.
                mismatch = any(
                    side.dtype in float_dtypes and side.is_nan().any()
                    for side in (left, right))
            if mismatch:
                raise_assert_detail(obj,
                                    "Value mismatch",
                                    left=list(left),
                                    right=list(right))
Beispiel #3
0
def series_to_pydf(data: pli.Series,
                   columns: ColumnsType | None = None) -> PyDataFrame:
    """
    Build a single-column PyDataFrame from a Polars Series.

    ``columns`` is unpacked with ``_unpack_columns`` (exactly one entry is
    expected); when omitted, the name of the Series is used. If a dtype is
    supplied for the column and it differs from the dtype of ``data``, the
    underlying series is cast to it before the frame is constructed.
    """
    py_series = data.inner()
    default_names = [py_series.name()]
    columns, dtypes = _unpack_columns(columns or default_names, n_expected=1)

    if dtypes:
        # only one column is expected, so only the first dtype is relevant
        requested_dtype = next(iter(dtypes.values()))
        if requested_dtype != data.dtype:
            py_series = py_series.cast(requested_dtype, True)

    return PyDataFrame(_handle_columns_arg([py_series], columns=columns))
Beispiel #4
0
def verify_series_and_expr_api(input: pli.Series, expected: pli.Series | None,
                               op: str, *args: Any, **kwargs: Any) -> None:
    """
    Check that an operation gives the same result via the Series and expression APIs.

    The operation named by ``op`` is looked up with ``_getattr_multi`` on both
    ``pli.col("*")`` and on ``input`` and called with ``*args`` / ``**kwargs``.
    If ``expected`` is given, both results are compared against it; otherwise
    the two results are compared with each other.

    Examples
    --------
    >>> s = pl.Series([1, 3, 2])
    >>> expected = pl.Series([1, 2, 3])
    >>> verify_series_and_expr_api(s, expected, "sort")
    """
    # expression route: apply the op to every column of a one-column frame
    expr_op = _getattr_multi(pli.col("*"), op)
    selected = input.to_frame().select(expr_op(*args, **kwargs))
    result_expr: pli.Series = selected[:, 0]  # type: ignore[assignment]

    # series route: apply the op directly
    series_op = _getattr_multi(input, op)
    result_series = series_op(*args, **kwargs)

    if expected is not None:
        assert_series_equal(result_expr, expected)
        assert_series_equal(result_series, expected)
        return

    assert_series_equal(result_series, result_expr)
Beispiel #5
0
def cut(
    s: pli.Series,
    bins: list[float],
    labels: Optional[list[str]] = None,
    break_point_label: str = "break_point",
    category_label: str = "category",
) -> pli.DataFrame:
    """
    Bin values into discrete values

    .. warning::
        This function is experimental and might change without it being considered a breaking change.

    Parameters
    ----------
    s
        Series to bin.
    bins
        Bins to create.
    labels
        Labels to assign to the bins. If given the length of labels must be len(bins) + 1.
    break_point_label
        Name given to the breakpoint column.
    category_label
        Name given to the category column.

    Returns
    -------
    DataFrame
        The values of ``s`` in sorted order, joined with the breakpoint and
        category columns.

    Raises
    ------
    ValueError
        If ``labels`` is given and its length is not ``len(bins) + 1``.

    Examples
    --------
    >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)])
    >>> pl.cut(a, bins=[-1, 1])
    shape: (12, 3)
    ┌──────┬─────────────┬──────────────┐
    │ a    ┆ break_point ┆ category     │
    │ ---  ┆ ---         ┆ ---          │
    │ f64  ┆ f64         ┆ cat          │
    ╞══════╪═════════════╪══════════════╡
    │ -3.0 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ -2.5 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ -2.0 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ -1.5 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ ...  ┆ ...         ┆ ...          │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 1.0  ┆ 1.0         ┆ (-1.0, 1.0]  │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 1.5  ┆ inf         ┆ (1.0, inf]   │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2.0  ┆ inf         ┆ (1.0, inf]   │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2.5  ┆ inf         ┆ (1.0, inf]   │
    └──────┴─────────────┴──────────────┘

    """
    var_nm = s.name

    # One row per bin edge plus a trailing +inf edge, so values above the
    # last bin still find a breakpoint in the forward as-of join below.
    cuts_df = pli.DataFrame([
        pli.Series(name=break_point_label, values=bins,
                   dtype=Float64).extend_constant(float("inf"), 1)
    ])

    if labels:
        expected_count = len(bins) + 1
        if len(labels) != expected_count:
            # Report both counts: the mismatch can be too few OR too many.
            raise ValueError(
                f"expected {expected_count} labels (len(bins) + 1), "
                f"got {len(labels)}")
        cuts_df = cuts_df.with_column(
            pli.Series(name=category_label, values=labels))
    else:
        # Derive "(lower, upper]" labels from consecutive breakpoints; the
        # first interval is open towards -inf.
        cuts_df = cuts_df.with_column(
            pli.format(
                "({}, {}]",
                pli.col(break_point_label).shift_and_fill(1, float("-inf")),
                pli.col(break_point_label),
            ).alias(category_label))

    cuts_df = cuts_df.with_column(pli.col(category_label).cast(Categorical))

    # The input is sorted before the as-of join; each value is matched with
    # the next breakpoint at or above it ("forward" strategy).
    result = (s.sort().to_frame().join_asof(
        cuts_df,
        left_on=var_nm,
        right_on=break_point_label,
        strategy="forward",
    ))
    return result
Beispiel #6
0
def from_pandas(
    df: Union["pd.DataFrame", "pd.Series", "pd.DatetimeIndex"],
    rechunk: bool = True,
    nan_to_none: bool = True,
) -> Union[DataFrame, Series]:
    """
    Convert a pandas DataFrame, Series or DatetimeIndex into its Polars counterpart.

    Requires the pandas package to be installed.

    Parameters
    ----------
    df : pandas DataFrame, Series, or DatetimeIndex
        The pandas object to convert.
    rechunk : bool, default True
        Make sure that all data is contiguous. Only forwarded when ``df`` is
        a pandas DataFrame.
    nan_to_none : bool, default True
        If data contains NaN values PyArrow will convert the NaN to None

    Returns
    -------
    DataFrame or Series
        A DataFrame for a pandas DataFrame; a Series for a pandas Series or
        DatetimeIndex.

    Raises
    ------
    ImportError
        If pandas cannot be imported.
    ValueError
        If ``df`` is not a pandas DataFrame, Series or DatetimeIndex.

    Examples
    --------
    Constructing a DataFrame from a pandas DataFrame:

    >>> import pandas as pd
    >>> pd_df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
    >>> df = pl.from_pandas(pd_df)
    >>> df
    shape: (2, 3)
    ┌─────┬─────┬─────┐
    │ a   ┆ b   ┆ c   │
    │ --- ┆ --- ┆ --- │
    │ i64 ┆ i64 ┆ i64 │
    ╞═════╪═════╪═════╡
    │ 1   ┆ 2   ┆ 3   │
    ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
    │ 4   ┆ 5   ┆ 6   │
    └─────┴─────┴─────┘

    Constructing a Series from a pandas Series:

    >>> import pandas as pd
    >>> pd_series = pd.Series([1, 2, 3], name="pd")
    >>> df = pl.from_pandas(pd_series)
    >>> df
    shape: (3,)
    Series: 'pd' [i64]
    [
        1
        2
        3
    ]

    """
    try:
        import pandas as pd
    except ImportError as e:  # pragma: no cover
        raise ImportError(
            "'pandas' is required when using from_pandas().") from e

    series_like = (pd.Series, pd.DatetimeIndex)
    if isinstance(df, series_like):
        return Series._from_pandas("", df, nan_to_none=nan_to_none)

    if isinstance(df, pd.DataFrame):
        return DataFrame._from_pandas(df,
                                      rechunk=rechunk,
                                      nan_to_none=nan_to_none)

    raise ValueError(f"Expected pandas DataFrame or Series, got {type(df)}.")
Beispiel #7
0
def from_arrow(a: Union["pa.Table", "pa.Array", "pa.ChunkedArray"],
               rechunk: bool = True) -> Union[DataFrame, Series]:
    """
    Convert an Arrow Table or Array into a Polars DataFrame or Series.

    This operation will be zero copy for the most part. Types that are not
    supported by Polars may be cast to the closest supported type.

    Parameters
    ----------
    a : Arrow Table or Array
        The Arrow Table, Array or ChunkedArray to convert.
    rechunk : bool, default True
        Make sure that all data is contiguous.

    Returns
    -------
    DataFrame or Series
        A DataFrame for an Arrow Table; a Series (with an empty name) for an
        Arrow Array or ChunkedArray.

    Raises
    ------
    ImportError
        If pyarrow is not available.
    ValueError
        If ``a`` is not an Arrow Table, Array or ChunkedArray.

    Examples
    --------
    Constructing a DataFrame from an Arrow Table:

    >>> import pyarrow as pa
    >>> data = pa.table({"a": [1, 2, 3], "b": [4, 5, 6]})
    >>> df = pl.from_arrow(data)
    >>> df
    shape: (3, 2)
    ┌─────┬─────┐
    │ a   ┆ b   │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 1   ┆ 4   │
    ├╌╌╌╌╌┼╌╌╌╌╌┤
    │ 2   ┆ 5   │
    ├╌╌╌╌╌┼╌╌╌╌╌┤
    │ 3   ┆ 6   │
    └─────┴─────┘

    Constructing a Series from an Arrow Array:

    >>> import pyarrow as pa
    >>> data = pa.array([1, 2, 3])
    >>> series = pl.from_arrow(data)
    >>> series
    shape: (3,)
    Series: '' [i64]
    [
        1
        2
        3
    ]

    """
    if not _PYARROW_AVAILABLE:
        raise ImportError("'pyarrow' is required when using from_arrow()."
                          )  # pragma: no cover

    if isinstance(a, pa.Table):
        return DataFrame._from_arrow(a, rechunk=rechunk)

    array_types = (pa.Array, pa.ChunkedArray)
    if isinstance(a, array_types):
        return Series._from_arrow("", a, rechunk)

    raise ValueError(f"Expected Arrow Table or Array, got {type(a)}.")
Beispiel #8
0
def series_to_pyseries(name: str, values: pli.Series) -> PySeries:
    """
    Construct a PySeries from a Polars Series.

    Parameters
    ----------
    name
        Name to give the resulting PySeries.
    values
        Series to take the data from. It is not renamed.

    Returns
    -------
    PySeries
        The underlying PySeries of a copy of ``values`` renamed to ``name``.
    """
    # Rename a copy (in_place=False) rather than the argument itself, so the
    # caller's Series does not silently change its name.
    return values.rename(name, in_place=False).inner()