Example #1
def test_validate_bool_kwarg_fail(name, value):
    msg = 'For argument "%s" expected type bool, received type %s' % (
        name,
        type(value).__name__,
    )

    with pytest.raises(ValueError, match=msg):
        validate_bool_kwarg(value, name)
Example #2
def test_validate_bool_kwarg_fail(name, value):
    msg = (
        f'For argument "{name}" expected type bool, '
        f"received type {type(value).__name__}"
    )

    with pytest.raises(ValueError, match=msg):
        validate_bool_kwarg(value, name)
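
For reference, a minimal sketch of a validator consistent with the two tests above (the real implementation lives in pandas.util._validators and uses pandas' is_bool, which also accepts numpy booleans; None is allowed by default):

def validate_bool_kwarg(value, arg_name):
    """Return `value` if it is a bool or None; otherwise raise ValueError."""
    if not (value is None or isinstance(value, bool)):
        raise ValueError(
            f'For argument "{arg_name}" expected type bool, '
            f"received type {type(value).__name__}"
        )
    return value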
Example #3
def argmax(self,
           axis: int = 0,
           skipna: bool = True):  # type:ignore[override]
    # override base class by adding axis keyword
    validate_bool_kwarg(skipna, "skipna")
    if not skipna and self.isna().any():
        raise NotImplementedError
    return nargminmax(self, "argmax", axis=axis)
Example #4
def __init__(self, encoderKey, verbose=False, *args, **kwargs):
    super().__init__()
    if encoderKey in __ScalerDict__:
        Encoder = __ScalerDict__[encoderKey](*args)
    else:
        raise raise_PasoError(
            "paso:scale: No scaler named: {} found.".format(encoderKey))
    self.encoderKey = encoderKey
    self.model = Encoder
    validate_bool_kwarg(verbose, "verbose")
    self.verbose = verbose
Example #5
    def test_validate_bool_kwarg(self):
        arg_names = ['inplace', 'copy']
        invalid_values = [1, "True", [1, 2, 3], 5.0]
        valid_values = [True, False, None]

        for name in arg_names:
            for value in invalid_values:
                with tm.assert_raises_regex(
                        ValueError, "For argument \"%s\" "
                        "expected type bool, "
                        "received type %s" % (name, type(value).__name__)):
                    validate_bool_kwarg(value, name)

            for value in valid_values:
                assert validate_bool_kwarg(value, name) == value
Example #6
    def dropna(self,
               axis=0,
               how="any",
               thresh=None,
               subset=None,
               inplace=False):
        axis = self._get_axis_number(axis, 0)
        inplace = validate_bool_kwarg(inplace, "inplace")

        if axis not in (0, ):
            raise err._unsupported_error("axis", axis)

        if how is None and thresh is None:
            raise TypeError("must specify how or thresh")

        if how is not None and how not in ("any", "all"):
            raise ValueError("invalid how option: %s" % how)

        if subset is not None:
            idxr = self.columns.get_indexer_for(subset)
            mask = idxr == -1
            if mask.any():
                raise KeyError(list(np.compress(mask, subset)))
        else:
            idxr = list(range(len(self.columns)))

        if thresh is None:
            thresh = len(idxr) if how == "any" else 1

        new_frame = self._frame.dropna(axis, idxr, thresh)
        return self._create_or_update_frame(new_frame, inplace)
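
A short usage sketch of the checks above, written against the standard pandas dropna API that this wrapper mirrors (the unsupported-argument errors are specific to the wrapper):

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, np.nan], "b": [np.nan, np.nan]})
df.dropna(how="all")          # drops only rows where every value is NA
df.dropna(subset=["a"])       # a missing subset label would raise KeyError
try:
    df.dropna(inplace="yes")  # non-bool inplace
except ValueError as exc:
    print(exc)                # For argument "inplace" expected type bool, ...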
Example #8
    def consolidate(self, inplace=True):
        """
        Internally consolidate chunks of data

        Parameters
        ----------
        inplace : boolean, default True
            Modify the calling object instead of constructing a new one

        Returns
        -------
        splist : SparseList
            If inplace=False, new object, otherwise reference to existing
            object
        """
        inplace = validate_bool_kwarg(inplace, 'inplace')
        if not inplace:
            result = self.copy()
        else:
            result = self

        if result.is_consolidated:
            return result

        result._consolidate_inplace()
        return result
Example #10
    def fillna(self, value, limit, inplace: bool, downcast) -> ArrayManager:
        # TODO implement downcast
        inplace = validate_bool_kwarg(inplace, "inplace")

        def array_fillna(array, value, limit, inplace):

            mask = isna(array)
            if limit is not None:
                limit = libalgos.validate_limit(None, limit=limit)
                mask[mask.cumsum() > limit] = False

            # TODO could optimize for arrays that cannot hold NAs
            # (like _can_hold_na on Blocks)
            if not inplace:
                array = array.copy()

            # np.putmask(array, mask, value)
            if np.any(mask):
                # TODO allow invalid value if there is nothing to fill?
                array[mask] = value
            return array

        return self.apply(array_fillna,
                          value=value,
                          limit=limit,
                          inplace=inplace)
Example #11
def maybe_convert_objects(values: np.ndarray, convert_numeric: bool = True):
    """
    If we have an object dtype array, try to coerce dates and/or numbers.

    Parameters
    ----------
    values : ndarray
    convert_numeric : bool, default True

    Returns
    -------
    ndarray or DatetimeIndex
    """
    validate_bool_kwarg(convert_numeric, "convert_numeric")

    orig_values = values

    # convert dates
    if is_object_dtype(values.dtype):
        values = lib.maybe_convert_objects(values, convert_datetime=True)

    # convert timedeltas
    if is_object_dtype(values.dtype):
        values = lib.maybe_convert_objects(values, convert_timedelta=True)

    # convert to numeric
    if is_object_dtype(values.dtype):
        if convert_numeric:
            try:
                new_values = lib.maybe_convert_numeric(values,
                                                       set(),
                                                       coerce_numeric=True)

                # if we are all nans then leave me alone
                if not isna(new_values).all():
                    values = new_values

            except Exception:
                pass
        else:
            # soft-conversion
            values = lib.maybe_convert_objects(values)

    if values is orig_values:
        values = values.copy()

    return values
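
The validate_bool_kwarg call at the top of the function above is a fail-fast guard; a hedged sketch of its effect, assuming the function above is in scope:

import numpy as np

values = np.array(["1", "2", "3"], dtype=object)
maybe_convert_objects(values)  # numeric strings coerced via maybe_convert_numeric
try:
    maybe_convert_objects(values, convert_numeric="yes")  # non-bool flag
except ValueError as exc:
    print(exc)  # raised before any conversion is attempted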
Example #12
def replace(self: T, to_replace, value, inplace: bool) -> T:
    inplace = validate_bool_kwarg(inplace, "inplace")
    assert np.ndim(value) == 0, value
    # TODO "replace" is right now implemented on the blocks, we should move
    # it to general array algos so it can be reused here
    return self.apply_with_block(
        "replace", value=value, to_replace=to_replace, inplace=inplace
    )
Example #13
    def _copy_if_else(
        self,
        cond,
        other=None,
        inplace=False,
        axis=None,
        level=None,
        errors="raise",
        try_cast=False,
        negate=False,
    ):
        inplace = validate_bool_kwarg(inplace, "inplace")
        axis = self._get_axis_number(axis, 0)

        if level is not None:
            raise err._unsupported_error("level", level)

        if axis not in (0, ):
            raise err._unsupported_error("axis", axis)

        if try_cast not in (False, ):
            raise err._unsupported_error("try_cast", try_cast)

        # Checks on cond
        cond = self._ensure_valid_frame(cond)

        if self.ndim < cond.ndim:
            raise ValueError(
                "cannot use the higher dimensional dataframe for 'cond'")
        _, cond = self._align_frame(cond, join="left", broadcast_axis=1)

        if any(not is_bool_dtype(dtype) for dtype in cond._get_dtypes()):
            raise ValueError("'cond' must have only boolean values")

        # Checks on other
        if not is_scalar(other):
            other = self._ensure_valid_frame(other)

            if self.ndim < other.ndim:
                raise ValueError(
                    "cannot use the higher dimensional dataframe for 'other'")
            _, other = self._align_frame(other, join="left", broadcast_axis=1)

            for l_dtype, r_dtype in zip(self._get_dtypes(),
                                        other._get_dtypes()):
                if l_dtype != r_dtype:
                    raise ValueError("'other' must have the same type as self")

            other = other._frame

        else:
            other = util.sanitize_scalar(other)

        frame = self._frame.copy_if_else(cond._frame, other, negate=negate)
        return self._create_or_update_frame(frame, inplace)
Example #14
    def drop_duplicates(self, keep='first', inplace=False):
        inplace = validate_bool_kwarg(inplace, 'inplace')
        if isinstance(self, ABCIndexClass):
            if self.is_unique:
                return self._shallow_copy()

        duplicated = self.duplicated(keep=keep)
        result = self[np.logical_not(duplicated)]
        if inplace:
            return self._update_inplace(result)
        else:
            return result
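
For context, the same behavior through the public pandas API (keep='first' is the default, and inplace=True mutates and returns None):

import pandas as pd

s = pd.Series([1, 2, 2, 3])
s.drop_duplicates()              # [1, 2, 3], first occurrences kept
s.drop_duplicates(keep='last')   # the last 2 is retained instead
s.drop_duplicates(inplace=True)  # mutates s in place, returns None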
Example #16
    def argmax(self, skipna: bool = True) -> int:
        """
        Return the index of maximum value.

        In case of multiple occurrences of the maximum value, the index
        corresponding to the first occurrence is returned.

        Parameters
        ----------
        skipna : bool, default True

        Returns
        -------
        int

        See Also
        --------
        ExtensionArray.argmin
        """
        validate_bool_kwarg(skipna, "skipna")
        if not skipna and self.isna().any():
            raise NotImplementedError
        return nargminmax(self, "argmax")
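
A brief usage sketch consistent with the docstring above (pd.array is used here for illustration; any ExtensionArray works):

import pandas as pd

arr = pd.array([1, 3, 2], dtype="Int64")
arr.argmax()  # -> 1, the index of the first occurrence of the maximum

arr = pd.array([1, None, 3], dtype="Int64")
# With skipna=False and missing values present, the implementation above
# raises NotImplementedError instead of guessing.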
Example #17
    def replace_list(
        self: T,
        src_list: list[Any],
        dest_list: list[Any],
        inplace: bool = False,
        regex: bool = False,
    ) -> T:
        """ do a list replace """
        inplace = validate_bool_kwarg(inplace, "inplace")

        return self.apply_with_block(
            "_replace_list",
            src_list=src_list,
            dest_list=dest_list,
            inplace=inplace,
            regex=regex,
        )
Example #18
def soft_convert_objects(
    values: np.ndarray,
    datetime: bool = True,
    numeric: bool = True,
    timedelta: bool = True,
    coerce: bool = False,
    copy: bool = True,
):
    """ if we have an object dtype, try to coerce dates and/or numbers """

    validate_bool_kwarg(datetime, "datetime")
    validate_bool_kwarg(numeric, "numeric")
    validate_bool_kwarg(timedelta, "timedelta")
    validate_bool_kwarg(coerce, "coerce")
    validate_bool_kwarg(copy, "copy")

    conversion_count = sum((datetime, numeric, timedelta))
    if conversion_count == 0:
        raise ValueError(
            "At least one of datetime, numeric or timedelta must be True.")
    elif conversion_count > 1 and coerce:
        raise ValueError("Only one of 'datetime', 'numeric' or "
                         "'timedelta' can be True when coerce=True.")

    if not is_object_dtype(values.dtype):
        # If not object, do not attempt conversion
        values = values.copy() if copy else values
        return values

    # If 1 flag is coerce, ensure 2 others are False
    if coerce:
        # Immediate return if coerce
        if datetime:
            from pandas import to_datetime

            return to_datetime(values, errors="coerce").to_numpy()
        elif timedelta:
            from pandas import to_timedelta

            return to_timedelta(values, errors="coerce").to_numpy()
        elif numeric:
            from pandas import to_numeric

            return to_numeric(values, errors="coerce")

    # Soft conversions
    if datetime:
        # GH 20380, when datetime is beyond year 2262, hence outside
        # bound of nanosecond-resolution 64-bit integers.
        try:
            values = lib.maybe_convert_objects(values, convert_datetime=True)
        except OutOfBoundsDatetime:
            pass

    if timedelta and is_object_dtype(values.dtype):
        # Object check to ensure only run if previous did not convert
        values = lib.maybe_convert_objects(values, convert_timedelta=True)

    if numeric and is_object_dtype(values.dtype):
        try:
            converted = lib.maybe_convert_numeric(values,
                                                  set(),
                                                  coerce_numeric=True)
            # If all NaNs, then do not-alter
            values = converted if not isna(converted).all() else values
            values = values.copy() if copy else values
        except Exception:
            pass

    return values
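
The flag checks above mean coerce=True requires exactly one of the three conversion flags to be set; a hedged sketch, assuming the function above is in scope:

import numpy as np

values = np.array(["2021-01-01", "not a date"], dtype=object)
# datetime-only coercion: the invalid entry becomes NaT via
# to_datetime(values, errors="coerce")
out = soft_convert_objects(values, datetime=True, numeric=False,
                           timedelta=False, coerce=True)
# coerce=True with two flags enabled would instead raise:
#   ValueError: Only one of 'datetime', 'numeric' or 'timedelta' can be
#   True when coerce=True.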
Example #19
    def fillna(
        self,
        value=None,
        method=None,
        axis=None,
        inplace=False,
        limit=None,
        downcast=None,
    ):
        axis = self._get_axis_number(axis, 0)
        inplace = validate_bool_kwarg(inplace, "inplace")

        if axis not in (0, ):
            raise err._unsupported_error("axis", axis)

        if value is None and method is None:
            raise ValueError("must specify a fill method or value")

        if value is not None and method is not None:
            raise ValueError("cannot specify both a fill method and value")

        # Checks on method

        if method is not None:
            raise err._unsupported_error("method", method)

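        # NOTE: any non-None `method` was already rejected just above, so the
        # stricter membership check below is currently unreachable.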
        if method is not None and method not in [
                "backfill",
                "bfill",
                "pad",
                "ffill",
        ]:
            expecting = "pad (ffill) or backfill (bfill)"
            msg = "Invalid fill method. Expecting {expecting}. Got {method}"
            msg = msg.format(expecting=expecting, method=method)
            raise ValueError(msg)

        # Checks on limit

        if limit is not None:
            raise err._unsupported_error("limit", limit)

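        # NOTE: likewise, a non-None `limit` was already rejected above,
        # making the integer/range checks below unreachable.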
        if limit is not None:
            if not isinstance(limit, int):
                raise ValueError("Limit must be an integer")
            elif limit <= 0:
                raise ValueError("Limit must be greater than 0")

        # Checks on value

        if isinstance(value, (list, tuple)):
            raise TypeError("'value' parameter must be a scalar or dict, but "
                            f"you passed a {type(value).__name__}")

        if is_scalar(value):
            values = {}
            for idx in range(len(self._get_columns())):
                values[idx] = util.sanitize_scalar(value)

        elif is_dict_like(value):
            if self._is_series:
                raise err._unsupported_error(
                    "'value' cannot be a dict for series")

            values = {}
            for col, val in value.items():
                if not is_scalar(val):
                    raise err._unsupported_error(
                        "'value' must be a dict of scalars for now")
                idxr = self.columns.get_indexer_for([col])
                if idxr[0] != -1:
                    values[idxr[0]] = util.sanitize_scalar(val)

        new_frame = self._frame.fillna(values)
        return self._create_or_update_frame(new_frame, inplace)
Example #20
def toContinuousCategory(
    oX: pd.DataFrame,
    features: list = [],
    drop: bool = True,
    int_: bool = True,
    float_: bool = True,
    quantile: bool = True,
    nbin: int = 10,
    inplace: bool = True,
    verbose: bool = True,
) -> pd.DataFrame:
    """
    Transforms float and continuous integer values of
    a pandas dataframe to category values.

    Parameters:
        X: dataset

    Keywords:

        features: default: []
            The column names to be transformed from continuous to category.

        drop: default: True
            If True then the original feature/column will be removed.

        int_: Default: True
            Set int_=False if integer values are not continuous and should
            not be transformed into category.

        float_: Default: True
            Set float_=False if float values are not continuous and should
            not be transformed into category.

        quantile: Default: True
            If True, use quantile bins. Quantile binning is similar to
            min-max scaling, v/(maxy-miny), and works on any scale.
            If False, use fixed-width bins.

        nbin: default: 10
            Alternatively, ``nbin`` can be an integer number of bins, an
            array of quantiles, e.g. [0, .25, .5, .75, 1.], or an array of
            fixed-width bin boundaries, e.g. [0., 4., 10., 100.].

        verbose: Default: True
            True: output
            False: silent

        inplace: Default: True
            True: replace the first argument with the resulting dataframe
            False: return a new dataframe, leaving X unchanged

    Returns: pd.DataFrame

    Raises:
        TypeError("requires boolean type.")

    Note:
        Binning, also known as quantization is used for
        transforming continuous numeric features
        (``np.number`` type) into ``category`` type.
        These categories group the continuous values
        into bins. Each bin represents a range of continuous numeric values.
        Specific strategies of binning data include fixed-width
        (``quantile=False``) and adaptive binning (``quantile=True``).

        Datasets that are used as ``train``, ``valid``, and ``test``
        must have same bin widths and labels and thus the
        same categories.

        Assumes **paso** data
        cleaning steps (such as removal of Null and NA values)
        have already been applied.

        Fixed-width binning only works, WITHOUT SCALING, on datasets with
        multiple features for tree-based models such as CART, random forest,
        xgboost, lightgbm, catboost, etc. It will not work for deep learning
        with neural nets. Quantile binning is similar to min-max scaling,
        v/(maxy-miny), and works on any scale.

        **Statistical problems with linear binning.**

        Binning increases type I and type II error (a simple proof is that as
        the number of bins approaches infinity, information loss approaches
        zero). In addition, changing the number of bins will alter the bin
        distribution shape, unless the distribution is uniformly flat.

        **Quantile binning can only be used with a single data set.**

        Transforming a continuous feature into a category feature based on
        percentiles (quantiles) is wrong if you have train and test data
        sets. Quantiles are based on the data set and will differ unless each
        data set's distribution is the same. In the limit there are only two
        bins, and then almost no relationship can be modeled. We are
        essentially doing a t-test.

        **If there are nonlinear or even nonmonotonic relationships between features**

        If you need linear binning, not quantile, use ``quantile=False`` and
        specify fixed bin boundaries of any distribution of cuts you wish
        with ``nbin`` = [ cut-1, cut-2...cut-n ].

        **If you want quantile binning.**

        Despite the above warnings, your use case may require quantile
        binning. Quantile-based binning is a fairly good strategy for
        adaptive binning. Quantiles are specific values or cut-points which
        partition the continuous valued distribution of a feature into
        discrete contiguous bins or intervals. Thus, q-quantiles partition a
        numeric attribute into q equal (percentage-width) partitions.

        Well-known examples of quantiles include the 2-quantile, the median,
        which divides the data distribution into two equal (percentage-width)
        bins; 4-quantiles, the standard quartiles, four equal
        (percentage-width) bins; and 10-quantiles, deciles, ten equal
        (percentage-width) bins.

        **You may want to look for outliers AFTER applying a Gaussian transformation.**

    """
    _fun_name = toContinuousCategory.__name__
    # todo put in decorator
    if inplace:
        X = oX
    else:
        X = oX.copy()

    validate_bool_kwarg(int_, "int_")
    validate_bool_kwarg(float_, "float_")
    # handles float and continuous integer; set int_=False if not continuous.
    # any other dataframe value type is left as is.
    if features == []:
        features = X.columns

    for nth, feature in enumerate(features):
        if (float_ and X[feature].dtype == float) or (int_ and X[feature].dtype
                                                      == int):
            nbin = _must_be_list_tuple_int(nbin)
            # import pdb; pdb.set_trace() # debugging starts here
            if quantile:
                # quantile is similar to min-max scaling:  v/(maxy-miny)
                # works on any any scale
                X[feature + "q"] = pd.qcut(X[feature], nbin, duplicates="drop")
            else:
                # fixed-width bin, only works, WITHOUT SCALING, with datasets with multiple features
                # for tree-based models such as CART, random forest, xgboost, lightgbm,
                X[feature + "fw"] = pd.cut(X[feature], nbin, duplicates="drop")

    # drop original features; if the list is a short subset, an error can occur.
    # no drop for int_=False or float_=False
    if drop:
        X.drop(features, axis=1, inplace=True)

    if verbose:
        logger.info("{} features:: {}".format(_fun_name, features))

    return X
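
A hedged usage sketch of the function above (it assumes the paso helpers it depends on, such as _must_be_list_tuple_int and logger, are importable):

import numpy as np
import pandas as pd

df = pd.DataFrame({"x": np.random.rand(100)})
out = toContinuousCategory(df, quantile=True, nbin=4, inplace=False)
# With drop=True (the default), 'x' is replaced by a categorical
# column 'xq' holding each value's quantile bin.
print(out.dtypes)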
Example #21
File: eval.py Project: Axik/pandas
def eval(expr, parser='pandas', engine=None, truediv=True,
         local_dict=None, global_dict=None, resolvers=(), level=0,
         target=None, inplace=False):
    """Evaluate a Python expression as a string using various backends.

    The following arithmetic operations are supported: ``+``, ``-``, ``*``,
    ``/``, ``**``, ``%``, ``//`` (python engine only) along with the following
    boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not).
    Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`,
    :keyword:`or`, and :keyword:`not` with the same semantics as the
    corresponding bitwise operators.  :class:`~pandas.Series` and
    :class:`~pandas.DataFrame` objects are supported and behave as they would
    with plain ol' Python evaluation.

    Parameters
    ----------
    expr : str or unicode
        The expression to evaluate. This string cannot contain any Python
        `statements
        <http://docs.python.org/2/reference/simple_stmts.html#simple-statements>`__,
        only Python `expressions
        <http://docs.python.org/2/reference/simple_stmts.html#expression-statements>`__.
    parser : string, default 'pandas', {'pandas', 'python'}
        The parser to use to construct the syntax tree from the expression. The
        default of ``'pandas'`` parses code slightly differently than standard
        Python. Alternatively, you can parse an expression using the
        ``'python'`` parser to retain strict Python semantics.  See the
        :ref:`enhancing performance <enhancingperf.eval>` documentation for
        more details.
    engine : string or None, default 'numexpr', {'python', 'numexpr'}

        The engine used to evaluate the expression. Supported engines are

        - None         : tries to use ``numexpr``, falls back to ``python``
        - ``'numexpr'``: This default engine evaluates pandas objects using
                         numexpr for large speed ups in complex expressions
                         with large frames.
        - ``'python'``: Performs operations as if you had ``eval``'d in top
                        level python. This engine is generally not that useful.

        More backends may be available in the future.

    truediv : bool, optional
        Whether to use true division, like in Python >= 3
    local_dict : dict or None, optional
        A dictionary of local variables, taken from locals() by default.
    global_dict : dict or None, optional
        A dictionary of global variables, taken from globals() by default.
    resolvers : list of dict-like or None, optional
        A list of objects implementing the ``__getitem__`` special method that
        you can use to inject an additional collection of namespaces to use for
        variable lookup. For example, this is used in the
        :meth:`~pandas.DataFrame.query` method to inject the
        :attr:`~pandas.DataFrame.index` and :attr:`~pandas.DataFrame.columns`
        variables that refer to their respective :class:`~pandas.DataFrame`
        instance attributes.
    level : int, optional
        The number of prior stack frames to traverse and add to the current
        scope. Most users will **not** need to change this parameter.
    target : object, optional, default None
        This is the target object for assignment. It is used when there is
        variable assignment in the expression. If so, then `target` must
        support item assignment with string keys, and if a copy is being
        returned, it must also support `.copy()`.
    inplace : bool, default False
        If `target` is provided, and the expression mutates `target`, whether
        to modify `target` inplace. Otherwise, return a copy of `target` with
        the mutation.

    Returns
    -------
    ndarray, numeric scalar, DataFrame, Series

    Raises
    ------
    ValueError
        There are many instances where such an error can be raised:

        - `target=None`, but the expression is multiline.
        - The expression is multiline, but not all of them have item assignment.
          An example of such an arrangement is this:

          a = b + 1
          a + 2

          Here, there are expressions on different lines, making it multiline,
          but the last line has no variable assigned to the output of `a + 2`.
        - `inplace=True`, but the expression is missing item assignment.
        - Item assignment is provided, but the `target` does not support
          string item assignment.
        - Item assignment is provided and `inplace=False`, but the `target`
          does not support the `.copy()` method

    Notes
    -----
    The ``dtype`` of any objects involved in an arithmetic ``%`` operation are
    recursively cast to ``float64``.

    See the :ref:`enhancing performance <enhancingperf.eval>` documentation for
    more details.

    See Also
    --------
    pandas.DataFrame.query
    pandas.DataFrame.eval
    """
    from pandas.core.computation.expr import Expr

    inplace = validate_bool_kwarg(inplace, "inplace")

    if isinstance(expr, string_types):
        _check_expression(expr)
        exprs = [e.strip() for e in expr.splitlines() if e.strip() != '']
    else:
        exprs = [expr]
    multi_line = len(exprs) > 1

    if multi_line and target is None:
        raise ValueError("multi-line expressions are only valid in the "
                         "context of data, use DataFrame.eval")

    ret = None
    first_expr = True
    target_modified = False

    for expr in exprs:
        expr = _convert_expression(expr)
        engine = _check_engine(engine)
        _check_parser(parser)
        _check_resolvers(resolvers)
        _check_for_locals(expr, level, parser)

        # get our (possibly passed-in) scope
        env = _ensure_scope(level + 1, global_dict=global_dict,
                            local_dict=local_dict, resolvers=resolvers,
                            target=target)

        parsed_expr = Expr(expr, engine=engine, parser=parser, env=env,
                           truediv=truediv)

        # construct the engine and evaluate the parsed expression
        eng = _engines[engine]
        eng_inst = eng(parsed_expr)
        ret = eng_inst.evaluate()

        if parsed_expr.assigner is None:
            if multi_line:
                raise ValueError("Multi-line expressions are only valid"
                                 " if all expressions contain an assignment")
            elif inplace:
                raise ValueError("Cannot operate inplace "
                                 "if there is no assignment")

        # assign if needed
        if env.target is not None and parsed_expr.assigner is not None:
            target_modified = True

            # if returning a copy, copy only on the first assignment
            if not inplace and first_expr:
                try:
                    target = env.target.copy()
                except AttributeError:
                    raise ValueError("Cannot return a copy of the target")
            else:
                target = env.target

            # TypeError is most commonly raised (e.g. int, list), but you
            # get IndexError if you try to do this assignment on np.ndarray.
            try:
                target[parsed_expr.assigner] = ret
            except (TypeError, IndexError):
                raise ValueError("Cannot assign expression output to target")

            if not resolvers:
                resolvers = ({parsed_expr.assigner: ret},)
            else:
                # existing resolver needs updated to handle
                # case of mutating existing column in copy
                for resolver in resolvers:
                    if parsed_expr.assigner in resolver:
                        resolver[parsed_expr.assigner] = ret
                        break
                else:
                    resolvers += ({parsed_expr.assigner: ret},)

            ret = None
            first_expr = False

    # We want to exclude `inplace=None` as being False.
    if inplace is False:
        return target if target_modified else ret
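
A short usage sketch of assignment with an explicit target, per the docstring above:

import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
# inplace=True mutates df; inplace=False would return a modified copy
# and leave df untouched.
pd.eval("b = a + 1", target=df, inplace=True)
print(df.columns.tolist())  # ['a', 'b']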
Example #22
def eval(expr, parser='pandas', engine=None, truediv=True,
         local_dict=None, global_dict=None, resolvers=(), level=0,
         target=None, inplace=None):
    """Evaluate a Python expression as a string using various backends.

    The following arithmetic operations are supported: ``+``, ``-``, ``*``,
    ``/``, ``**``, ``%``, ``//`` (python engine only) along with the following
    boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not).
    Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`,
    :keyword:`or`, and :keyword:`not` with the same semantics as the
    corresponding bitwise operators.  :class:`~pandas.Series` and
    :class:`~pandas.DataFrame` objects are supported and behave as they would
    with plain ol' Python evaluation.

    Parameters
    ----------
    expr : str or unicode
        The expression to evaluate. This string cannot contain any Python
        `statements
        <http://docs.python.org/2/reference/simple_stmts.html#simple-statements>`__,
        only Python `expressions
        <http://docs.python.org/2/reference/simple_stmts.html#expression-statements>`__.
    parser : string, default 'pandas', {'pandas', 'python'}
        The parser to use to construct the syntax tree from the expression. The
        default of ``'pandas'`` parses code slightly differently than standard
        Python. Alternatively, you can parse an expression using the
        ``'python'`` parser to retain strict Python semantics.  See the
        :ref:`enhancing performance <enhancingperf.eval>` documentation for
        more details.
    engine : string or None, default 'numexpr', {'python', 'numexpr'}

        The engine used to evaluate the expression. Supported engines are

        - None         : tries to use ``numexpr``, falls back to ``python``
        - ``'numexpr'``: This default engine evaluates pandas objects using
                         numexpr for large speed ups in complex expressions
                         with large frames.
        - ``'python'``: Performs operations as if you had ``eval``'d in top
                        level python. This engine is generally not that useful.

        More backends may be available in the future.

    truediv : bool, optional
        Whether to use true division, like in Python >= 3
    local_dict : dict or None, optional
        A dictionary of local variables, taken from locals() by default.
    global_dict : dict or None, optional
        A dictionary of global variables, taken from globals() by default.
    resolvers : list of dict-like or None, optional
        A list of objects implementing the ``__getitem__`` special method that
        you can use to inject an additional collection of namespaces to use for
        variable lookup. For example, this is used in the
        :meth:`~pandas.DataFrame.query` method to inject the
        :attr:`~pandas.DataFrame.index` and :attr:`~pandas.DataFrame.columns`
        variables that refer to their respective :class:`~pandas.DataFrame`
        instance attributes.
    level : int, optional
        The number of prior stack frames to traverse and add to the current
        scope. Most users will **not** need to change this parameter.
    target : a target object for assignment, optional, default is None
        essentially this is a passed in resolver
    inplace : bool, default True
        If expression mutates, whether to modify object inplace or return
        copy with mutation.

        WARNING: inplace=None currently falls back to True, but
        in a future version, will default to False.  Use inplace=True
        explicitly rather than relying on the default.

    Returns
    -------
    ndarray, numeric scalar, DataFrame, Series

    Notes
    -----
    The ``dtype`` of any objects involved in an arithmetic ``%`` operation are
    recursively cast to ``float64``.

    See the :ref:`enhancing performance <enhancingperf.eval>` documentation for
    more details.

    See Also
    --------
    pandas.DataFrame.query
    pandas.DataFrame.eval
    """
    inplace = validate_bool_kwarg(inplace, 'inplace')
    first_expr = True
    if isinstance(expr, string_types):
        _check_expression(expr)
        exprs = [e.strip() for e in expr.splitlines() if e.strip() != '']
    else:
        exprs = [expr]
    multi_line = len(exprs) > 1

    if multi_line and target is None:
        raise ValueError("multi-line expressions are only valid in the "
                         "context of data, use DataFrame.eval")

    first_expr = True
    for expr in exprs:
        expr = _convert_expression(expr)
        engine = _check_engine(engine)
        _check_parser(parser)
        _check_resolvers(resolvers)
        _check_for_locals(expr, level, parser)

        # get our (possibly passed-in) scope
        env = _ensure_scope(level + 1, global_dict=global_dict,
                            local_dict=local_dict, resolvers=resolvers,
                            target=target)

        parsed_expr = Expr(expr, engine=engine, parser=parser, env=env,
                           truediv=truediv)

        # construct the engine and evaluate the parsed expression
        eng = _engines[engine]
        eng_inst = eng(parsed_expr)
        ret = eng_inst.evaluate()

        if parsed_expr.assigner is None and multi_line:
            raise ValueError("Multi-line expressions are only valid"
                             " if all expressions contain an assignment")

        # assign if needed
        if env.target is not None and parsed_expr.assigner is not None:
            if inplace is None:
                warnings.warn(
                    "eval expressions containing an assignment currently "
                    "default to operating inplace.\nThis will change in "
                    "a future version of pandas, use inplace=True to "
                    "avoid this warning.",
                    FutureWarning, stacklevel=3)
                inplace = True

            # if returning a copy, copy only on the first assignment
            if not inplace and first_expr:
                target = env.target.copy()
            else:
                target = env.target

            target[parsed_expr.assigner] = ret

            if not resolvers:
                resolvers = ({parsed_expr.assigner: ret},)
            else:
                # existing resolver needs updated to handle
                # case of mutating existing column in copy
                for resolver in resolvers:
                    if parsed_expr.assigner in resolver:
                        resolver[parsed_expr.assigner] = ret
                        break
                else:
                    resolvers += ({parsed_expr.assigner: ret},)

            ret = None
            first_expr = False

    if not inplace and inplace is not None:
        return target

    return ret
Example #23
def test_validate_bool_kwarg(name, value):
    assert validate_bool_kwarg(value, name) == value
Example #27
def eval(expr,
         parser='pandas',
         engine=None,
         truediv=True,
         local_dict=None,
         global_dict=None,
         resolvers=(),
         level=0,
         target=None,
         inplace=False):
    """Evaluate a Python expression as a string using various backends.

    The following arithmetic operations are supported: ``+``, ``-``, ``*``,
    ``/``, ``**``, ``%``, ``//`` (python engine only) along with the following
    boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not).
    Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`,
    :keyword:`or`, and :keyword:`not` with the same semantics as the
    corresponding bitwise operators.  :class:`~pandas.Series` and
    :class:`~pandas.DataFrame` objects are supported and behave as they would
    with plain ol' Python evaluation.

    Parameters
    ----------
    expr : str or unicode
        The expression to evaluate. This string cannot contain any Python
        `statements
        <https://docs.python.org/3/reference/simple_stmts.html#simple-statements>`__,
        only Python `expressions
        <https://docs.python.org/3/reference/simple_stmts.html#expression-statements>`__.
    parser : string, default 'pandas', {'pandas', 'python'}
        The parser to use to construct the syntax tree from the expression. The
        default of ``'pandas'`` parses code slightly differently than standard
        Python. Alternatively, you can parse an expression using the
        ``'python'`` parser to retain strict Python semantics.  See the
        :ref:`enhancing performance <enhancingperf.eval>` documentation for
        more details.
    engine : string or None, default 'numexpr', {'python', 'numexpr'}

        The engine used to evaluate the expression. Supported engines are

        - None         : tries to use ``numexpr``, falls back to ``python``
        - ``'numexpr'``: This default engine evaluates pandas objects using
                         numexpr for large speed ups in complex expressions
                         with large frames.
        - ``'python'``: Performs operations as if you had ``eval``'d in top
                        level python. This engine is generally not that useful.

        More backends may be available in the future.

    truediv : bool, optional
        Whether to use true division, like in Python >= 3
    local_dict : dict or None, optional
        A dictionary of local variables, taken from locals() by default.
    global_dict : dict or None, optional
        A dictionary of global variables, taken from globals() by default.
    resolvers : list of dict-like or None, optional
        A list of objects implementing the ``__getitem__`` special method that
        you can use to inject an additional collection of namespaces to use for
        variable lookup. For example, this is used in the
        :meth:`~pandas.DataFrame.query` method to inject the
        ``DataFrame.index`` and ``DataFrame.columns``
        variables that refer to their respective :class:`~pandas.DataFrame`
        instance attributes.
    level : int, optional
        The number of prior stack frames to traverse and add to the current
        scope. Most users will **not** need to change this parameter.
    target : object, optional, default None
        This is the target object for assignment. It is used when there is
        variable assignment in the expression. If so, then `target` must
        support item assignment with string keys, and if a copy is being
        returned, it must also support `.copy()`.
    inplace : bool, default False
        If `target` is provided, and the expression mutates `target`, whether
        to modify `target` inplace. Otherwise, return a copy of `target` with
        the mutation.

    Returns
    -------
    ndarray, numeric scalar, DataFrame, Series

    Raises
    ------
    ValueError
        There are many instances where such an error can be raised:

        - `target=None`, but the expression is multiline.
        - The expression is multiline, but not all of them have item assignment.
          An example of such an arrangement is this:

          a = b + 1
          a + 2

          Here, there are expressions on different lines, making it multiline,
          but the last line has no variable assigned to the output of `a + 2`.
        - `inplace=True`, but the expression is missing item assignment.
        - Item assignment is provided, but the `target` does not support
          string item assignment.
        - Item assignment is provided and `inplace=False`, but the `target`
          does not support the `.copy()` method

    Notes
    -----
    The ``dtype`` of any objects involved in an arithmetic ``%`` operation are
    recursively cast to ``float64``.

    See the :ref:`enhancing performance <enhancingperf.eval>` documentation for
    more details.

    See Also
    --------
    pandas.DataFrame.query
    pandas.DataFrame.eval
    """
    from pandas.core.computation.expr import Expr

    inplace = validate_bool_kwarg(inplace, "inplace")

    if isinstance(expr, string_types):
        _check_expression(expr)
        exprs = [e.strip() for e in expr.splitlines() if e.strip() != '']
    else:
        exprs = [expr]
    multi_line = len(exprs) > 1

    if multi_line and target is None:
        raise ValueError("multi-line expressions are only valid in the "
                         "context of data, use DataFrame.eval")

    ret = None
    first_expr = True
    target_modified = False

    for expr in exprs:
        expr = _convert_expression(expr)
        engine = _check_engine(engine)
        _check_parser(parser)
        _check_resolvers(resolvers)
        _check_for_locals(expr, level, parser)

        # get our (possibly passed-in) scope
        env = _ensure_scope(level + 1,
                            global_dict=global_dict,
                            local_dict=local_dict,
                            resolvers=resolvers,
                            target=target)

        parsed_expr = Expr(expr,
                           engine=engine,
                           parser=parser,
                           env=env,
                           truediv=truediv)

        # construct the engine and evaluate the parsed expression
        eng = _engines[engine]
        eng_inst = eng(parsed_expr)
        ret = eng_inst.evaluate()

        if parsed_expr.assigner is None:
            if multi_line:
                raise ValueError("Multi-line expressions are only valid"
                                 " if all expressions contain an assignment")
            elif inplace:
                raise ValueError("Cannot operate inplace "
                                 "if there is no assignment")

        # assign if needed
        assigner = parsed_expr.assigner
        if env.target is not None and assigner is not None:
            target_modified = True

            # if returning a copy, copy only on the first assignment
            if not inplace and first_expr:
                try:
                    target = env.target.copy()
                except AttributeError:
                    raise ValueError("Cannot return a copy of the target")
            else:
                target = env.target

            # TypeError is most commonly raised (e.g. int, list), but you
            # get IndexError if you try to do this assignment on np.ndarray.
            # we will ignore numpy warnings here; e.g. if trying
            # to use a non-numeric indexer
            try:
                with warnings.catch_warnings(record=True):
                    target[assigner] = ret
            except (TypeError, IndexError):
                raise ValueError("Cannot assign expression output to target")

            if not resolvers:
                resolvers = ({assigner: ret}, )
            else:
                # existing resolver needs updated to handle
                # case of mutating existing column in copy
                for resolver in resolvers:
                    if assigner in resolver:
                        resolver[assigner] = ret
                        break
                else:
                    resolvers += ({assigner: ret}, )

            ret = None
            first_expr = False

    # We want to exclude `inplace=None` as being False.
    if inplace is False:
        return target if target_modified else ret
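
Finally, a sketch of the multi-line case described in the docstring: every line must contain an assignment, and DataFrame.eval supplies the target implicitly:

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
# Each line assigns; later lines may reference earlier assignments
# because the resolvers are updated after each expression.
df.eval("b = a + 1\nc = b * 2", inplace=True)
print(df)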