def _assert_series_inner(
    left: Series,
    right: Series,
    check_dtype: bool,
    check_exact: bool,
    atol: float,
    rtol: float,
    obj: str,
) -> None:
    """
    Compare two Series on dtype and values, raising an assertion detail on
    any mismatch.

    Exact comparison is forced when the values cannot be subtracted (no
    tolerance arithmetic possible) or when the dtype is Boolean.
    """
    # Tolerance-based comparison needs subtraction on the underlying Python type.
    try:
        subtractable = hasattr(dtype_to_py_type(left.dtype), "__sub__")
    except NotImplementedError:
        subtractable = False

    # Fall back to exact matching for non-subtractable or boolean data.
    exact = check_exact or not subtractable or left.dtype == Boolean

    if check_dtype and left.dtype != right.dtype:
        raise_assert_detail(obj, "Dtype mismatch", left.dtype, right.dtype)

    if exact:
        # Any element-wise inequality is a failure.
        if (left != right).sum() != 0:
            raise_assert_detail(
                obj, "Exact value mismatch", left=list(left), right=list(right)
            )
    else:
        # Standard "isclose"-style check: |l - r| <= atol + rtol * |r|.
        out_of_tolerance = (left - right).abs() > (atol + rtol * right.abs())
        if out_of_tolerance.sum() != 0:
            raise_assert_detail(
                obj, "Value mismatch", left=list(left), right=list(right)
            )
def _assert_series_inner(
    left: pli.Series,
    right: pli.Series,
    check_dtype: bool,
    check_exact: bool,
    nans_compare_equal: bool,
    atol: float,
    rtol: float,
    obj: str,
) -> None:
    """
    Compares Series dtype + values.

    Raises (via ``raise_assert_detail``) when dtypes differ (if
    ``check_dtype``) or when values are unequal — exactly, or beyond the
    ``atol``/``rtol`` tolerance. With ``nans_compare_equal`` set, paired NaN
    values in float Series are treated as equal.
    """
    # Tolerance-based comparison is only possible for types that support
    # subtraction; otherwise we must fall back to exact comparison.
    try:
        can_be_subtracted = hasattr(dtype_to_py_type(left.dtype), "__sub__")
    except NotImplementedError:
        can_be_subtracted = False
    # Booleans are compared exactly even though they technically subtract.
    check_exact = check_exact or not can_be_subtracted or left.dtype == Boolean
    if check_dtype:
        if left.dtype != right.dtype:
            raise_assert_detail(obj, "Dtype mismatch", left.dtype, right.dtype)
    # create mask of which (if any) values are unequal
    unequal = left != right
    if unequal.any() and nans_compare_equal and left.dtype in (Float32, Float64):
        # handle NaN values (which compare unequal to themselves); clear the
        # mask where BOTH sides are NaN, filling nulls so the mask stays boolean
        unequal = unequal & ~(
            (left.is_nan() & right.is_nan()).fill_null(pli.lit(False)))
    # assert exact, or with tolerance
    if unequal.any():
        if check_exact:
            raise_assert_detail(obj, "Exact value mismatch", left=list(left), right=list(right))
        else:
            # apply check with tolerance, but only to the known-unequal matches
            left, right = left.filter(unequal), right.filter(unequal)
            if ((left - right).abs() > (atol + rtol * right.abs())).sum() != 0:
                raise_assert_detail(obj, "Value mismatch", left=list(left), right=list(right))
def series_to_pydf(data: pli.Series, columns: ColumnsType | None = None) -> PyDataFrame:
    """
    Construct a PyDataFrame from a Polars Series.

    An explicit ``columns`` spec may rename the single column and/or request
    a dtype cast; otherwise the Series' own name is used.
    """
    inner = data.inner()
    series_list = [inner]
    # Default to the Series' own name when no columns spec is given.
    default_names = [inner.name()]
    columns, dtypes = _unpack_columns(columns or default_names, n_expected=1)
    if dtypes:
        # Exactly one column is expected, so take the single requested dtype.
        target_dtype = next(iter(dtypes.values()))
        if target_dtype != data.dtype:
            series_list[0] = series_list[0].cast(target_dtype, True)
    series_list = _handle_columns_arg(series_list, columns=columns)
    return PyDataFrame(series_list)
def verify_series_and_expr_api(input: pli.Series, expected: pli.Series | None, op: str, *args: Any, **kwargs: Any) -> None:
    """
    Small helper function to test element-wise functions for both the series
    and expressions api.

    Examples
    --------
    >>> s = pl.Series([1, 3, 2])
    >>> expected = pl.Series([1, 2, 3])
    >>> verify_series_and_expr_api(s, expected, "sort")
    """
    # Run `op` through the expression API (select on a one-column frame) ...
    expression = _getattr_multi(pli.col("*"), op)(*args, **kwargs)
    result_expr: pli.Series = input.to_frame().select(
        expression)[:, 0]  # type: ignore[assignment]
    # ... and through the Series API directly.
    result_series = _getattr_multi(input, op)(*args, **kwargs)
    if expected is None:
        # No explicit expectation: the two code paths must agree with each other.
        assert_series_equal(result_series, result_expr)
    else:
        # Both code paths must match the expected result.
        assert_series_equal(result_expr, expected)
        assert_series_equal(result_series, expected)
def cut(
    s: pli.Series,
    bins: list[float],
    labels: Optional[list[str]] = None,
    break_point_label: str = "break_point",
    category_label: str = "category",
) -> pli.DataFrame:
    """
    Bin values into discrete values

    .. warning::
        This function is experimental and might change without it being
        considered a breaking change.

    Parameters
    ----------
    s
        Series to bin.
    bins
        Bins to create.
    labels
        Labels to assign to the bins. If given the length of labels must be
        len(bins) + 1.
    break_point_label
        Name given to the breakpoint column.
    category_label
        Name given to the category column.

    Returns
    -------
    DataFrame
        Note: the rows are ordered by the sorted input values, not by the
        original order of ``s``.

    Examples
    --------
    >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)])
    >>> pl.cut(a, bins=[-1, 1])
    shape: (12, 3)
    ┌──────┬─────────────┬──────────────┐
    │ a    ┆ break_point ┆ category     │
    │ ---  ┆ ---         ┆ ---          │
    │ f64  ┆ f64         ┆ cat          │
    ╞══════╪═════════════╪══════════════╡
    │ -3.0 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ -2.5 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ -2.0 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ -1.5 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ ...  ┆ ...         ┆ ...          │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 1.0  ┆ 1.0         ┆ (-1.0, 1.0]  │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 1.5  ┆ inf         ┆ (1.0, inf]   │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2.0  ┆ inf         ┆ (1.0, inf]   │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2.5  ┆ inf         ┆ (1.0, inf]   │
    └──────┴─────────────┴──────────────┘
    """
    var_nm = s.name

    # Breakpoints plus a trailing +inf so the last bin is open-ended above.
    cuts_df = pli.DataFrame([
        pli.Series(name=break_point_label, values=bins,
                   dtype=Float64).extend_constant(float("inf"), 1)
    ])

    if labels:
        # One label per interval: len(bins) breakpoints create len(bins) + 1 bins.
        if len(labels) != len(bins) + 1:
            raise ValueError(
                f"expected {len(bins) + 1} labels (len(bins) + 1), "
                f"got {len(labels)}"
            )
        cuts_df = cuts_df.with_column(
            pli.Series(name=category_label, values=labels))
    else:
        # Auto-generate "(lower, upper]" labels; shifting the breakpoints
        # (filling with -inf) yields each interval's lower bound.
        cuts_df = cuts_df.with_column(
            pli.format(
                "({}, {}]",
                pli.col(break_point_label).shift_and_fill(1, float("-inf")),
                pli.col(break_point_label),
            ).alias(category_label))

    cuts_df = cuts_df.with_column(pli.col(category_label).cast(Categorical))

    # join_asof with "forward" maps each value to the smallest breakpoint
    # that is >= the value, i.e. the upper bound of its bin. Requires the
    # left side to be sorted, hence s.sort().
    result = (s.sort().to_frame().join_asof(
        cuts_df,
        left_on=var_nm,
        right_on=break_point_label,
        strategy="forward",
    ))
    return result
def from_pandas(
    df: Union["pd.DataFrame", "pd.Series", "pd.DatetimeIndex"],
    rechunk: bool = True,
    nan_to_none: bool = True,
) -> Union[DataFrame, Series]:
    """
    Construct a Polars DataFrame or Series from a pandas DataFrame or Series.

    Requires the pandas package to be installed.

    Parameters
    ----------
    df : pandas DataFrame, Series, or DatetimeIndex
        Data represented as a pandas DataFrame, Series, or DatetimeIndex.
    rechunk : bool, default True
        Make sure that all data is contiguous.
    nan_to_none : bool, default True
        If data contains NaN values PyArrow will convert the NaN to None

    Returns
    -------
    DataFrame

    Examples
    --------
    Constructing a DataFrame from a pandas DataFrame:

    >>> import pandas as pd
    >>> pd_df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
    >>> df = pl.from_pandas(pd_df)
    >>> df
    shape: (2, 3)
    ┌─────┬─────┬─────┐
    │ a   ┆ b   ┆ c   │
    │ --- ┆ --- ┆ --- │
    │ i64 ┆ i64 ┆ i64 │
    ╞═════╪═════╪═════╡
    │ 1   ┆ 2   ┆ 3   │
    ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
    │ 4   ┆ 5   ┆ 6   │
    └─────┴─────┴─────┘

    Constructing a Series from a pandas Series:

    >>> import pandas as pd
    >>> pd_series = pd.Series([1, 2, 3], name="pd")
    >>> df = pl.from_pandas(pd_series)
    >>> df
    shape: (3,)
    Series: 'pd' [i64]
    [
        1
        2
        3
    ]
    """
    # pandas is an optional dependency: import lazily and fail with a clear
    # message if it is missing.
    try:
        import pandas as pd
    except ImportError as e:  # pragma: no cover
        raise ImportError(
            "'pandas' is required when using from_pandas().") from e

    # Dispatch on the concrete pandas type.
    if isinstance(df, pd.DataFrame):
        return DataFrame._from_pandas(df, rechunk=rechunk, nan_to_none=nan_to_none)
    if isinstance(df, (pd.Series, pd.DatetimeIndex)):
        # 1-D inputs (Series / DatetimeIndex) become an unnamed polars Series.
        return Series._from_pandas("", df, nan_to_none=nan_to_none)
    raise ValueError(
        f"Expected pandas DataFrame or Series, got {type(df)}.")
def from_arrow(a: Union["pa.Table", "pa.Array", "pa.ChunkedArray"],
               rechunk: bool = True) -> Union[DataFrame, Series]:
    """
    Create a DataFrame or Series from an Arrow Table or Array.

    This operation will be zero copy for the most part. Types that are not
    supported by Polars may be cast to the closest supported type.

    Parameters
    ----------
    a : Arrow Table or Array
        Data represented as Arrow Table or Array.
    rechunk : bool, default True
        Make sure that all data is contiguous.

    Returns
    -------
    DataFrame or Series

    Examples
    --------
    Constructing a DataFrame from an Arrow Table:

    >>> import pyarrow as pa
    >>> data = pa.table({"a": [1, 2, 3], "b": [4, 5, 6]})
    >>> df = pl.from_arrow(data)
    >>> df
    shape: (3, 2)
    ┌─────┬─────┐
    │ a   ┆ b   │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 1   ┆ 4   │
    ├╌╌╌╌╌┼╌╌╌╌╌┤
    │ 2   ┆ 5   │
    ├╌╌╌╌╌┼╌╌╌╌╌┤
    │ 3   ┆ 6   │
    └─────┴─────┘

    Constructing a Series from an Arrow Array:

    >>> import pyarrow as pa
    >>> data = pa.array([1, 2, 3])
    >>> series = pl.from_arrow(data)
    >>> series
    shape: (3,)
    Series: '' [i64]
    [
        1
        2
        3
    ]
    """
    # pyarrow is optional; the module-level flag records whether its import
    # succeeded, so check it before touching any `pa.` names.
    if not _PYARROW_AVAILABLE:
        raise ImportError("'pyarrow' is required when using from_arrow()."
                          )  # pragma: no cover

    # Dispatch on the concrete Arrow container type.
    if isinstance(a, pa.Table):
        return DataFrame._from_arrow(a, rechunk=rechunk)
    if isinstance(a, (pa.Array, pa.ChunkedArray)):
        # 1-D Arrow data becomes an unnamed polars Series.
        return Series._from_arrow("", a, rechunk)
    raise ValueError(f"Expected Arrow Table or Array, got {type(a)}.")
def series_to_pyseries(name: str, values: pli.Series) -> PySeries:
    """
    Construct a PySeries from a Polars Series.

    Note: renames ``values`` in place as a side effect.
    """
    # Adopt the requested name before unwrapping the underlying PySeries.
    values.rename(name, in_place=True)
    pyseries = values.inner()
    return pyseries