Example #1
    def fill_null(self, fill_value: Union[int, str, "pli.Expr"]) -> "LazyFrame":
        """
        Fill missing values.

        Parameters
        ----------
        fill_value
            Value to fill the missing values with.
        """
        if not isinstance(fill_value, pli.Expr):
            fill_value = pli.lit(fill_value)
        return wrap_ldf(self._ldf.fill_null(fill_value._pyexpr))
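A minimal usage sketch for `fill_null`, assuming the surrounding module is used through the public `polars` package (imported here as `pl`); the column names and fill value are illustrative.

import polars as pl

ldf = pl.DataFrame({"a": [1, None, 3], "b": [None, 5, 6]}).lazy()
# every null in the frame is replaced with the literal 0 before collection
filled = ldf.fill_null(0).collect()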
Example #2
def _assert_series_inner(
    left: pli.Series,
    right: pli.Series,
    check_dtype: bool,
    check_exact: bool,
    nans_compare_equal: bool,
    atol: float,
    rtol: float,
    obj: str,
) -> None:
    """
    Compares Series dtype + values
    """
    try:
        can_be_subtracted = hasattr(dtype_to_py_type(left.dtype), "__sub__")
    except NotImplementedError:
        can_be_subtracted = False

    check_exact = check_exact or not can_be_subtracted or left.dtype == Boolean
    if check_dtype:
        if left.dtype != right.dtype:
            raise_assert_detail(obj, "Dtype mismatch", left.dtype, right.dtype)

    # create mask of which (if any) values are unequal
    unequal = left != right
    if unequal.any() and nans_compare_equal and left.dtype in (Float32,
                                                               Float64):
        # handle NaN values (which compare unequal to themselves)
        unequal = unequal & ~(
            (left.is_nan() & right.is_nan()).fill_null(pli.lit(False)))

    # assert exact, or with tolerance
    if unequal.any():
        if check_exact:
            raise_assert_detail(obj,
                                "Exact value mismatch",
                                left=list(left),
                                right=list(right))
        else:
            # apply check with tolerance, but only to the known-unequal matches
            left, right = left.filter(unequal), right.filter(unequal)
            if ((left - right).abs() > (atol + rtol * right.abs())).sum() != 0:
                raise_assert_detail(obj,
                                    "Value mismatch",
                                    left=list(left),
                                    right=list(right))
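A minimal sketch of how this helper is typically reached, assuming the public `polars.testing.assert_series_equal` wrapper delegates to `_assert_series_inner`; the series values are illustrative.

import polars as pl
from polars.testing import assert_series_equal

left = pl.Series("a", [1.0, 2.0, float("nan")])
right = pl.Series("a", [1.0, 2.0, float("nan")])

# with NaNs treated as equal (the nans_compare_equal path above), the mask of
# unequal values is cleared and the assertion passes; a genuine value mismatch
# would raise an AssertionError via raise_assert_detail
assert_series_equal(left, right, check_exact=True)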
Example #3
    def shift_and_fill(
        self, periods: int, fill_value: Union["pli.Expr", int, str, float]
    ) -> "LazyFrame":
        """
        Shift the values by a given period and fill the parts that will be empty due to this operation
        with the result of the `fill_value` expression.

        Parameters
        ----------
        periods
            Number of places to shift (may be negative).
        fill_value
            Fill the empty slots with the result of this expression.
        """
        if not isinstance(fill_value, pli.Expr):
            fill_value = pli.lit(fill_value)
        return wrap_ldf(self._ldf.shift_and_fill(periods, fill_value._pyexpr))
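A minimal usage sketch for `shift_and_fill`, again assuming the public `polars` package; the data is illustrative.

import polars as pl

ldf = pl.DataFrame({"a": [1, 2, 3]}).lazy()
# shift "a" down by one place and fill the newly empty first slot with 0
out = ldf.shift_and_fill(1, 0).collect()   # "a" == [0, 1, 2]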
Example #4
    def fill_nan(self, fill_value: Union[int, str, float, "pli.Expr"]) -> "LazyFrame":
        """
        Fill floating point NaN values.

        .. warning::

            Note that floating point NaN (Not a Number) values are not missing values!
            To replace missing values, use `fill_null`.

        Parameters
        ----------
        fill_value
            Value to fill the NaN values with.
        """
        if not isinstance(fill_value, pli.Expr):
            fill_value = pli.lit(fill_value)
        return wrap_ldf(self._ldf.fill_nan(fill_value._pyexpr))
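A minimal usage sketch for `fill_nan`, assuming the public `polars` package; the NaN is produced explicitly for illustration.

import polars as pl

ldf = pl.DataFrame({"a": [1.0, float("nan"), 3.0]}).lazy()
# only the NaN is replaced; genuine nulls would need fill_null instead
out = ldf.fill_nan(0.0).collect()   # "a" == [1.0, 0.0, 3.0]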
Example #5
    def with_columns(self, exprs: Union[List["pli.Expr"], "pli.Expr"]) -> "LazyFrame":
        """
        Add or overwrite multiple columns in a DataFrame.

        Parameters
        ----------
        exprs
            List of Expressions that evaluate to columns.
        """
        if isinstance(exprs, pli.Expr):
            return self.with_column(exprs)

        pyexprs = []

        for e in exprs:
            if isinstance(e, pli.Expr):
                pyexprs.append(e._pyexpr)
            elif isinstance(e, pli.Series):
                pyexprs.append(pli.lit(e)._pyexpr)

        return wrap_ldf(self._ldf.with_columns(pyexprs))
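A minimal usage sketch for `with_columns`, assuming the public `polars` package; the column names and expressions are illustrative.

import polars as pl

ldf = pl.DataFrame({"a": [1, 2, 3]}).lazy()
out = ldf.with_columns(
    [
        (pl.col("a") * 2).alias("a_doubled"),       # new column from an expression
        pl.col("a").cast(pl.Float64).alias("a_f"),  # cast copy kept alongside "a"
    ]
).collect()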
Example #6
def arrow_to_pydf(data: "pa.Table",
                  columns: Optional[ColumnsType] = None,
                  rechunk: bool = True) -> "PyDataFrame":
    """
    Construct a PyDataFrame from an Arrow Table.
    """
    if not _PYARROW_AVAILABLE:  # pragma: no cover
        raise ImportError(
            "'pyarrow' is required when constructing a PyDataFrame from an Arrow Table."
        )
    original_columns = columns
    columns, dtypes = _unpack_columns(columns)
    if columns is not None:
        try:
            data = data.rename_columns(columns)
        except pa.lib.ArrowInvalid as e:
            raise ValueError(
                "Dimensions of columns arg must match data dimensions.") from e

    data_dict = {}
    # dictionaries cannot be built in different batches (categorical does not allow that),
    # so we rechunk them and create them separately.
    dictionary_cols = {}
    names = []
    for i, column in enumerate(data):
        # extract the name before casting
        if column._name is None:
            name = f"column_{i}"
        else:
            name = column._name
        names.append(name)

        column = coerce_arrow(column)
        if pa.types.is_dictionary(column.type):
            ps = arrow_to_pyseries(name, column, rechunk)
            dictionary_cols[i] = pli.wrap_s(ps)
        else:
            data_dict[name] = column

    if len(data_dict) > 0:
        tbl = pa.table(data_dict)

        # path for a table without rows that preserves the datatypes
        if tbl.shape[0] == 0:
            pydf = pli.DataFrame._from_pandas(tbl.to_pandas())._df
        else:
            pydf = PyDataFrame.from_arrow_record_batches(tbl.to_batches())
    else:
        pydf = pli.DataFrame([])._df
    if rechunk:
        pydf = pydf.rechunk()

    if len(dictionary_cols) > 0:
        df = pli.wrap_df(pydf)
        df = df.with_columns(
            [pli.lit(s).alias(s.name) for s in dictionary_cols.values()])
        df = df[names]
        pydf = df._df

    if dtypes and original_columns:
        pydf = _post_apply_columns(pydf, original_columns)
    return pydf
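A minimal sketch of the public path that reaches `arrow_to_pydf`; it assumes pyarrow is installed and that `pl.from_arrow` dispatches Arrow Tables through this constructor.

import pyarrow as pa
import polars as pl

tbl = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]})
df = pl.from_arrow(tbl)   # builds the underlying PyDataFrame from the Arrow Table
print(df.shape)           # (3, 2)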