def test_validate_bool_kwarg_fail(name, value):
    """Check that ``validate_bool_kwarg`` rejects ``value`` for ``name``.

    The expected error text names the argument and the type of the
    offending value; it is passed to ``pytest.raises`` as a match pattern.
    """
    received = type(value).__name__
    expected = 'For argument "%s" expected type bool, received type %s' % (
        name,
        received,
    )

    rejection = pytest.raises(ValueError, match=expected)
    with rejection:
        validate_bool_kwarg(value, name)
def test_validate_bool_kwarg_fail(name, value):
    """Check that ``validate_bool_kwarg`` raises ``ValueError`` for ``value``.

    The message must mention the argument name and the type of the value
    that was received.
    """
    expected = (
        f'For argument "{name}" expected type bool, '
        f"received type {type(value).__name__}"
    )

    rejection = pytest.raises(ValueError, match=expected)
    with rejection:
        validate_bool_kwarg(value, name)
def argmax(self, axis: int = 0, skipna: bool = True):  # type:ignore[override]
    """Return the ``"argmax"`` result of ``nargminmax`` along ``axis``.

    Parameters
    ----------
    axis : int, default 0
        Forwarded to ``nargminmax``.
    skipna : bool, default True
        Must be a bool. ``skipna=False`` is only accepted when the array
        reports no missing values.

    Raises
    ------
    NotImplementedError
        If ``skipna`` is False and ``self.isna().any()`` is true.
    """
    # override base class by adding axis keyword
    validate_bool_kwarg(skipna, "skipna")

    if not skipna:
        # only inspect the data when the caller asked not to skip NAs
        if self.isna().any():
            raise NotImplementedError

    return nargminmax(self, "argmax", axis=axis)
def __init__(self, encoderKey, verbose=False, *args, **kwargs):
    """Look up and instantiate the scaler registered under ``encoderKey``.

    Parameters:
        encoderKey: key into ``__ScalerDict__`` selecting the scaler class.
        verbose: must be a bool (checked with ``validate_bool_kwarg``).
        *args: positional arguments forwarded to the scaler constructor.
        **kwargs: accepted but not forwarded to the scaler constructor.

    Raises:
        Whatever ``raise_PasoError`` produces when ``encoderKey`` is not
        registered.
    """
    super().__init__()

    if encoderKey not in __ScalerDict__:
        raise raise_PasoError(
            "paso:scale: No scaler named: {} found.".format(encoderKey))

    scaler_factory = __ScalerDict__[encoderKey]
    scaler = scaler_factory(*args)

    self.encoderKey = encoderKey
    self.model = scaler

    validate_bool_kwarg(verbose, "verbose")
    self.verbose = verbose
def test_validate_bool_kwarg(self):
    """Exercise ``validate_bool_kwarg`` with rejected and accepted values.

    For each argument name, every invalid value must raise ``ValueError``
    with the expected message, and every valid value must be returned
    unchanged.
    """
    rejected = [1, "True", [1, 2, 3], 5.0]
    accepted = [True, False, None]

    for name in ['inplace', 'copy']:
        for bad in rejected:
            expected = ("For argument \"%s\" "
                        "expected type bool, "
                        "received type %s"
                        % (name, type(bad).__name__))
            with tm.assert_raises_regex(ValueError, expected):
                validate_bool_kwarg(bad, name)

        for good in accepted:
            assert validate_bool_kwarg(good, name) == good
def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False):
    """Drop rows with missing values by delegating to ``self._frame.dropna``.

    Parameters
    ----------
    axis : default 0
        Resolved with ``self._get_axis_number``; only axis 0 is supported.
    how : {"any", "all"} or None, default "any"
        Used to derive ``thresh`` when ``thresh`` is not given.
    thresh : int, optional
        Passed to the underlying frame. Defaults to the number of
        considered columns for ``how="any"`` and to 1 otherwise.
    subset : optional
        Column labels to consider; all columns when omitted.
    inplace : bool, default False
        Forwarded to ``self._create_or_update_frame``.

    Raises
    ------
    TypeError
        If both ``how`` and ``thresh`` are None.
    ValueError
        If ``how`` is neither "any" nor "all", or ``inplace`` is not a bool.
    KeyError
        If ``subset`` contains labels that are not columns.
    """
    axis = self._get_axis_number(axis, 0)
    inplace = validate_bool_kwarg(inplace, "inplace")

    if axis not in (0, ):
        raise err._unsupported_error("axis", axis)

    if how is None:
        if thresh is None:
            raise TypeError("must specify how or thresh")
    elif how not in ("any", "all"):
        raise ValueError("invalid how option: %s" % how)

    if subset is None:
        idxr = list(range(len(self.columns)))
    else:
        idxr = self.columns.get_indexer_for(subset)
        # -1 marks labels that could not be located among the columns
        not_found = idxr == -1
        if not_found.any():
            raise KeyError(list(np.compress(not_found, subset)))

    if thresh is None:
        thresh = len(idxr) if how == "any" else 1

    dropped = self._frame.dropna(axis, idxr, thresh)
    return self._create_or_update_frame(dropped, inplace)
def test_validate_bool_kwarg(self):
    """Verify ``validate_bool_kwarg`` for the 'inplace' and 'copy' names.

    Non-boolean inputs must raise ``ValueError`` with a message naming the
    argument and the received type; ``True``, ``False`` and ``None`` must
    be returned as given.
    """
    template = ("For argument \"%s\" "
                "expected type bool, "
                "received type %s")
    arg_names = ['inplace', 'copy']
    invalid_values = [1, "True", [1, 2, 3], 5.0]
    valid_values = [True, False, None]

    for arg_name in arg_names:
        for candidate in invalid_values:
            message = template % (arg_name, type(candidate).__name__)
            with tm.assert_raises_regex(ValueError, message):
                validate_bool_kwarg(candidate, arg_name)

        for candidate in valid_values:
            assert validate_bool_kwarg(candidate, arg_name) == candidate
def consolidate(self, inplace=True):
    """
    Internally consolidate chunks of data

    Parameters
    ----------
    inplace : boolean, default True
        Consolidate the calling object itself rather than a copy of it

    Returns
    -------
    splist : SparseList
        The calling object when ``inplace`` is true, otherwise a
        consolidated copy
    """
    inplace = validate_bool_kwarg(inplace, 'inplace')

    result = self if inplace else self.copy()

    # nothing to do when the chunks are already consolidated
    if not result.is_consolidated:
        result._consolidate_inplace()
    return result
def fillna(self, value, limit, inplace: bool, downcast) -> ArrayManager:
    """Fill missing entries of every array via ``self.apply``.

    Parameters
    ----------
    value
        Replacement assigned to the positions reported by ``isna``.
    limit : int or None
        When given, only positions whose running count of missing values
        does not exceed ``limit`` are filled.
    inplace : bool
        When false, each array is copied before it is modified.
    downcast
        Currently unused.
    """
    # TODO implement downcast
    inplace = validate_bool_kwarg(inplace, "inplace")

    def array_fillna(array, value, limit, inplace):
        missing = isna(array)
        if limit is not None:
            limit = libalgos.validate_limit(None, limit=limit)
            beyond_limit = missing.cumsum() > limit
            missing[beyond_limit] = False

        # TODO could optimize for arrays that cannot hold NAs
        # (like _can_hold_na on Blocks)
        target = array if inplace else array.copy()

        # np.putmask(array, mask, value)
        if np.any(missing):
            # TODO allow invalid value if there is nothing to fill?
            target[missing] = value
        return target

    return self.apply(array_fillna, value=value, limit=limit, inplace=inplace)
def maybe_convert_objects(values: np.ndarray, convert_numeric: bool = True):
    """
    Try to coerce an object-dtype array to dates, timedeltas and/or numbers.

    Parameters
    ----------
    values : ndarray
    convert_numeric : bool, default True
        When true, attempt a coercing numeric conversion; otherwise fall
        back to a soft object conversion.

    Returns
    -------
    ndarray or DatetimeIndex
        Never the input object itself: an unconverted input is copied.
    """
    validate_bool_kwarg(convert_numeric, "convert_numeric")

    original = values

    # convert dates first, then timedeltas; each pass only runs while the
    # data is still object dtype
    for conversion in ("convert_datetime", "convert_timedelta"):
        if is_object_dtype(values.dtype):
            values = lib.maybe_convert_objects(values, **{conversion: True})

    # convert to numeric
    if is_object_dtype(values.dtype):
        if not convert_numeric:
            # soft-conversion
            values = lib.maybe_convert_objects(values)
        else:
            try:
                candidate = lib.maybe_convert_numeric(values, set(),
                                                      coerce_numeric=True)
                # if we are all nans then leave me alone
                if not isna(candidate).all():
                    values = candidate
            except Exception:
                pass

    if values is original:
        return values.copy()
    return values
def replace(self: T, to_replace, value, inplace: bool) -> T:
    """Replace ``to_replace`` with the scalar ``value`` block by block.

    ``inplace`` must be a bool and ``value`` must be zero-dimensional; the
    work is delegated to ``self.apply_with_block("replace", ...)``.
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    assert np.ndim(value) == 0, value
    # TODO "replace" is right now implemented on the blocks, we should move
    # it to general array algos so it can be reused here
    block_kwargs = {
        "value": value,
        "to_replace": to_replace,
        "inplace": inplace,
    }
    return self.apply_with_block("replace", **block_kwargs)
def _copy_if_else(
    self,
    cond,
    other=None,
    inplace=False,
    axis=None,
    level=None,
    errors="raise",
    try_cast=False,
    negate=False,
):
    """Validate ``cond``/``other`` and delegate to ``self._frame.copy_if_else``.

    ``cond`` is aligned to ``self`` and must hold only boolean columns.
    ``other`` is either a scalar (sanitized) or a frame that is aligned to
    ``self`` and must match its dtypes column by column. ``level``, a
    non-zero ``axis`` and a truthy ``try_cast`` are reported as
    unsupported. ``errors`` is not used in this body.
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    axis = self._get_axis_number(axis, 0)

    if level is not None:
        raise err._unsupported_error("level", level)

    if axis not in (0, ):
        raise err._unsupported_error("axis", axis)

    if try_cast not in (False, ):
        raise err._unsupported_error("try_cast", try_cast)

    # Checks on cond
    cond = self._ensure_valid_frame(cond)

    if self.ndim < cond.ndim:
        raise ValueError(
            "cannot use the higher dimensional dataframe for 'cond'")
    _, cond = self._align_frame(cond, join="left", broadcast_axis=1)

    if not all(is_bool_dtype(dtype) for dtype in cond._get_dtypes()):
        raise ValueError("'cond' must have only boolean values")

    # Checks on other
    if is_scalar(other):
        other = util.sanitize_scalar(other)
    else:
        other = self._ensure_valid_frame(other)

        if self.ndim < other.ndim:
            raise ValueError(
                "cannot use the higher dimensional dataframe for 'other'")
        _, other = self._align_frame(other, join="left", broadcast_axis=1)

        dtype_pairs = zip(self._get_dtypes(), other._get_dtypes())
        if any(own != theirs for own, theirs in dtype_pairs):
            raise ValueError("'other' must have the same type as self")

        other = other._frame

    result = self._frame.copy_if_else(cond._frame, other, negate=negate)
    return self._create_or_update_frame(result, inplace)
def drop_duplicates(self, keep='first', inplace=False):
    """Return (or apply in place) the object with duplicate entries removed.

    A unique index short-circuits to ``self._shallow_copy()``. Otherwise
    the entries flagged by ``self.duplicated(keep=keep)`` are filtered out
    and the result is either returned or handed to
    ``self._update_inplace`` when ``inplace`` is true.
    """
    inplace = validate_bool_kwarg(inplace, 'inplace')

    if isinstance(self, ABCIndexClass) and self.is_unique:
        return self._shallow_copy()

    dup_mask = self.duplicated(keep=keep)
    deduplicated = self[np.logical_not(dup_mask)]

    if not inplace:
        return deduplicated
    return self._update_inplace(deduplicated)
def argmax(self, skipna: bool = True) -> int:
    """
    Return the index of maximum value.

    When the maximum occurs more than once, the index of its first
    occurrence is returned.

    Parameters
    ----------
    skipna : bool, default True

    Returns
    -------
    int

    Raises
    ------
    NotImplementedError
        If ``skipna`` is False and the array contains missing values.

    See Also
    --------
    ExtensionArray.argmin
    """
    validate_bool_kwarg(skipna, "skipna")

    if not skipna:
        # only inspect the data when the caller asked not to skip NAs
        if self.isna().any():
            raise NotImplementedError

    return nargminmax(self, "argmax")
def replace_list(
    self: T,
    src_list: list[Any],
    dest_list: list[Any],
    inplace: bool = False,
    regex: bool = False,
) -> T:
    """Replace each entry of ``src_list`` by its ``dest_list`` counterpart.

    ``inplace`` is validated as a bool; everything else is forwarded
    unchanged to ``self.apply_with_block("_replace_list", ...)``.
    """
    inplace = validate_bool_kwarg(inplace, "inplace")

    block_kwargs = {
        "src_list": src_list,
        "dest_list": dest_list,
        "inplace": inplace,
        "regex": regex,
    }
    return self.apply_with_block("_replace_list", **block_kwargs)
def soft_convert_objects(
    values: np.ndarray,
    datetime: bool = True,
    numeric: bool = True,
    timedelta: bool = True,
    coerce: bool = False,
    copy: bool = True,
):
    """
    Try to coerce an object-dtype array to dates, timedeltas and/or numbers.

    Non-object input is returned as is (copied when ``copy`` is true).
    With ``coerce=True`` exactly one of ``datetime``, ``numeric`` or
    ``timedelta`` may be set and the matching ``to_*`` converter is applied
    with ``errors="coerce"``. Otherwise the enabled soft conversions are
    tried in the order datetime, timedelta, numeric, each one only while
    the data is still object dtype.

    Raises
    ------
    ValueError
        If a flag is not a bool, if no conversion is enabled, or if more
        than one conversion is enabled together with ``coerce``.
    """
    flags = (
        ("datetime", datetime),
        ("numeric", numeric),
        ("timedelta", timedelta),
        ("coerce", coerce),
        ("copy", copy),
    )
    for flag_name, flag_value in flags:
        validate_bool_kwarg(flag_value, flag_name)

    requested = sum((datetime, numeric, timedelta))
    if requested == 0:
        raise ValueError(
            "At least one of datetime, numeric or timedelta must be True.")
    if requested > 1 and coerce:
        raise ValueError("Only one of 'datetime', 'numeric' or "
                         "'timedelta' can be True when when coerce=True.")

    if not is_object_dtype(values.dtype):
        # If not object, do not attempt conversion
        return values.copy() if copy else values

    # If 1 flag is coerce, ensure 2 others are False
    if coerce:
        # Immediate return if coerce
        if datetime:
            from pandas import to_datetime

            return to_datetime(values, errors="coerce").to_numpy()
        if timedelta:
            from pandas import to_timedelta

            return to_timedelta(values, errors="coerce").to_numpy()
        if numeric:
            from pandas import to_numeric

            return to_numeric(values, errors="coerce")

    # Soft conversions
    if datetime:
        # GH 20380, when datetime is beyond year 2262, hence outside
        # bound of nanosecond-resolution 64-bit integers.
        try:
            values = lib.maybe_convert_objects(values, convert_datetime=True)
        except OutOfBoundsDatetime:
            pass

    if timedelta and is_object_dtype(values.dtype):
        # Object check to ensure only run if previous did not convert
        values = lib.maybe_convert_objects(values, convert_timedelta=True)

    if numeric and is_object_dtype(values.dtype):
        try:
            candidate = lib.maybe_convert_numeric(values, set(),
                                                  coerce_numeric=True)
            # If all NaNs, then do not-alter
            if not isna(candidate).all():
                values = candidate
            if copy:
                values = values.copy()
        except Exception:
            pass

    return values
def fillna(
    self,
    value=None,
    method=None,
    axis=None,
    inplace=False,
    limit=None,
    downcast=None,
):
    """Fill missing values by delegating to ``self._frame.fillna``.

    Parameters
    ----------
    value : scalar or dict
        A scalar is applied to every column. A dict maps column labels to
        scalars; labels that are not columns are ignored. Dicts are not
        supported for series.
    method : optional
        Not supported yet; any non-None value is reported as unsupported.
    axis : optional
        Resolved with ``self._get_axis_number``; only axis 0 is supported.
    inplace : bool, default False
        Forwarded to ``self._create_or_update_frame``.
    limit : optional
        Not supported yet; any non-None value is reported as unsupported.
    downcast : optional
        Not used in this body.

    Raises
    ------
    ValueError
        If neither or both of ``value`` and ``method`` are given, or
        ``inplace`` is not a bool.
    TypeError
        If ``value`` is not a scalar or dict-like.
    """
    axis = self._get_axis_number(axis, 0)
    inplace = validate_bool_kwarg(inplace, "inplace")

    if axis not in (0, ):
        raise err._unsupported_error("axis", axis)

    if value is None and method is None:
        raise ValueError("must specify a fill method or value")
    if value is not None and method is not None:
        raise ValueError("cannot specify both a fill method and value")

    # NOTE: fill methods and limits are rejected outright. The pandas-style
    # validation that used to follow each of these raises could never run,
    # so it has been removed; reinstate it when support is added.
    if method is not None:
        raise err._unsupported_error("method", method)

    if limit is not None:
        raise err._unsupported_error("limit", limit)

    # Checks on value
    if isinstance(value, (list, tuple)):
        raise TypeError("'value' parameter must be a scalar or dict, but "
                        f"you passed a {type(value).__name__}")

    if is_scalar(value):
        # one fill value per column position
        values = {
            idx: util.sanitize_scalar(value)
            for idx in range(len(self._get_columns()))
        }

    elif is_dict_like(value):
        if self._is_series:
            raise err._unsupported_error(
                "'value' cannot be a dict for series")

        values = {}
        for col, val in value.items():
            if not is_scalar(val):
                raise err._unsupported_error(
                    "'value' must be a dict of scalars for now")
            idxr = self.columns.get_indexer_for([col])
            # -1 means the label is not a column; such entries are skipped
            if idxr[0] != -1:
                values[idxr[0]] = util.sanitize_scalar(val)

    else:
        # Previously fell through with ``values`` unbound and crashed with
        # an UnboundLocalError (e.g. for an ndarray).
        raise TypeError("'value' parameter must be a scalar or dict, but "
                        f"you passed a {type(value).__name__}")

    new_frame = self._frame.fillna(values)
    return self._create_or_update_frame(new_frame, inplace)
def toContinuousCategory(
    oX: pd.DataFrame,
    features: list = [],  # never mutated; default kept for compatibility
    drop: bool = True,
    int_: bool = True,
    float_: bool = True,
    quantile: bool = True,
    nbin: int = 10,
    inplace: bool = True,
    verbose: bool = True,
) -> pd.DataFrame:
    """
    Transform float and continuous integer columns of a pandas dataframe
    into category (bin) columns.

    Parameters:
        oX: dataset

    Keywords:
        features: default: []
            The column names to be transformed from continuous to
            category. Empty (or None) means every column of the dataset.

        drop: default: True
            If True, each column that was binned is removed. Columns that
            were not binned (wrong dtype, or excluded through ``int_`` /
            ``float_``) are left as they are.

        int_: Default: True
            Set ``int_=False`` if integer columns are not continuous and
            must not be transformed into category.

        float_: Default: True
            Set ``float_=False`` if float columns must not be transformed
            into category.

        quantile: Default: True
            True: use quantile bins (``pd.qcut``); the new column is named
            ``<feature>q``. Quantile binning is similar to
            v/(maxy-miny) and works on any scale.
            False: use fixed-width bins (``pd.cut``); the new column is
            named ``<feature>fw``.

        nbin: default: 10
            Integer number of bins, or an array of quantiles, e.g.
            [0, .25, .5, .75, 1.], or an array of fixed-width bin
            boundaries, e.g. [0., 4., 10, 100].

        verbose: Default True
            True: log the processed features. False: silent.

        inplace: Default: True
            True: modify the dataframe passed as first argument.
            False: work on a copy and leave the argument unchanged.

    Returns:
        pd.DataFrame

    Raises:
        ValueError if ``int_`` or ``float_`` is not a boolean.

    Note:
        Binning, also known as quantization, transforms continuous numeric
        features (``np.number`` type) into ``category`` type. Each bin
        represents a range of continuous values. Strategies include
        fixed-width binning (``quantile=False``) and adaptive binning
        (``quantile=True``).

        Datasets that are used as ``train``, ``valid`` and ``test`` must
        have the same bin widths and labels and thus the same categories.

        Assumes **paso** data cleaning steps (such as removal of Null and
        NA values) have already been applied.

        Fixed-width bins only work, WITHOUT SCALING, with datasets with
        multiple features for tree-based models such as CART, random
        forest, xgboost, lightgbm, catboost, etc. Deep learning using
        neural nets will not work.

        **Statistical problems with linear binning.**
        Binning increases type I and type II error (as the number of bins
        approaches infinity the information loss approaches zero). In
        addition, changing the number of bins alters the shape of the bin
        distribution unless the distribution is uniformly flat.

        **Quantile binning can only be used with a single data set.**
        Transforming a continuous feature into a category feature based on
        percentiles (quantiles) is WRONG if you have separate train and
        test data sets: quantiles are computed from the data set and will
        differ unless the distributions are equal. In the limit of only
        two bins almost no relationship can be modeled; we are essentially
        doing a t-test.

        **If there are nonlinear or even nonmonotonic relationships
        between features**, and you need linear rather than quantile
        binning, use ``quantile=False`` and give the fixed bin boundaries
        of any distribution of cuts you wish with
        ``nbin = [cut-1, cut-2 ... cut-n]``.

        **If you want quantile binning.**
        Despite the above warnings, your use case may require quantile
        binning, which is a fairly good strategy for adaptive binning.
        Quantiles are cut-points which partition the continuous
        distribution of a feature into discrete contiguous bins. Thus
        q-quantiles partition a numeric attribute into q equal
        (percentage-width) partitions. Well-known examples are the
        2-quantile (median), the 4-quantiles (quartiles) and the
        10-quantiles (deciles).

        **You should maybe look for outliers AFTER applying a Gaussian
        transformation.**
    """
    _fun_name = toContinuousCategory.__name__  # todo put in decorator

    X = oX if inplace else oX.copy()

    validate_bool_kwarg(int_, "int_")
    validate_bool_kwarg(float_, "float_")

    # ``features == []`` is ambiguous (or raises) for an Index/ndarray, so
    # test for "nothing requested" explicitly.
    if features is None or len(features) == 0:
        features = X.columns

    # dtype predicates instead of ``dtype == int`` / ``dtype == float``:
    # those only match the platform default width (e.g. they miss int64
    # columns where ``int`` maps to int32, and all float32 columns).
    is_float = pd.api.types.is_float_dtype
    is_integer = pd.api.types.is_integer_dtype

    # handles float, continuous integer. set int_=False if not continuous
    # any other dataframe value type is left as is.
    binned = []
    for feature in features:
        dtype = X[feature].dtype
        wanted = (float_ and is_float(dtype)) or (int_ and is_integer(dtype))
        if not wanted:
            continue

        nbin = _must_be_list_tuple_int(nbin)
        if quantile:
            # quantile is similar to min-max scaling: v/(maxy-miny)
            # works on any scale
            X[feature + "q"] = pd.qcut(X[feature], nbin, duplicates="drop")
        else:
            # fixed-width bin, only works, WITHOUT SCALING, with datasets
            # with multiple features for tree-based models such as CART,
            # random forest, xgboost, lightgbm
            X[feature + "fw"] = pd.cut(X[feature], nbin, duplicates="drop")
        binned.append(feature)

    # Drop only what was actually replaced by a binned column; dropping
    # every requested feature would silently delete untouched columns.
    if drop:
        X.drop(binned, axis=1, inplace=True)

    if verbose:
        logger.info("{} features:: {}".format(_fun_name, features))

    return X
def eval(expr, parser='pandas', engine=None, truediv=True,
         local_dict=None, global_dict=None, resolvers=(), level=0,
         target=None, inplace=False):
    """Evaluate a Python expression as a string using various backends.

    Supported arithmetic operations are ``+``, ``-``, ``*``, ``/``, ``**``,
    ``%`` and ``//`` (python engine only); supported boolean operations are
    ``|`` (or), ``&`` (and) and ``~`` (not). The ``'pandas'`` parser also
    accepts :keyword:`and`, :keyword:`or` and :keyword:`not` with the
    semantics of the corresponding bitwise operators.
    :class:`~pandas.Series` and :class:`~pandas.DataFrame` objects are
    supported and behave as they would with plain Python evaluation.

    Parameters
    ----------
    expr : str or unicode
        The expression to evaluate. It may contain only Python
        `expressions
        <http://docs.python.org/2/reference/simple_stmts.html#expression-statements>`__,
        not Python `statements
        <http://docs.python.org/2/reference/simple_stmts.html#simple-statements>`__.
    parser : string, default 'pandas', {'pandas', 'python'}
        The parser used to build the syntax tree. ``'pandas'`` parses code
        slightly differently than standard Python; ``'python'`` retains
        strict Python semantics. See the
        :ref:`enhancing performance <enhancingperf.eval>` documentation.
    engine : string or None, default None, {'python', 'numexpr'}
        The engine used to evaluate the expression.

        - None : tries to use ``numexpr``, falls back to ``python``
        - ``'numexpr'``: evaluates pandas objects using numexpr for large
          speed ups in complex expressions with large frames.
        - ``'python'``: performs operations as if you had ``eval``'d in
          top level python. This engine is generally not that useful.
    truediv : bool, optional
        Whether to use true division, like in Python >= 3
    local_dict : dict or None, optional
        A dictionary of local variables, taken from locals() by default.
    global_dict : dict or None, optional
        A dictionary of global variables, taken from globals() by default.
    resolvers : list of dict-like or None, optional
        Objects implementing ``__getitem__`` that provide additional
        namespaces for variable lookup, e.g. the ones injected by
        :meth:`~pandas.DataFrame.query`.
    level : int, optional
        The number of prior stack frames to traverse and add to the
        current scope. Most users will **not** need to change this.
    target : object, optional, default None
        The target object for assignment. When the expression assigns,
        `target` must support item assignment with string keys, and
        `.copy()` if a copy is being returned.
    inplace : bool, default False
        If `target` is provided and the expression mutates it, whether to
        modify `target` inplace. Otherwise a copy of `target` with the
        mutation is returned.

    Returns
    -------
    ndarray, numeric scalar, DataFrame, Series

    Raises
    ------
    ValueError
        - `target=None`, but the expression is multiline.
        - The expression is multiline, but not every line contains an
          item assignment (e.g. ``a = b + 1`` followed by ``a + 2``).
        - `inplace=True`, but the expression is missing item assignment.
        - Item assignment is provided, but the `target` does not support
          string item assignment.
        - Item assignment is provided and `inplace=False`, but the
          `target` does not support the `.copy()` method

    Notes
    -----
    The ``dtype`` of any objects involved in an arithmetic ``%`` operation
    are recursively cast to ``float64``.

    See Also
    --------
    pandas.DataFrame.query
    pandas.DataFrame.eval
    """
    from pandas.core.computation.expr import Expr

    inplace = validate_bool_kwarg(inplace, "inplace")

    if not isinstance(expr, string_types):
        exprs = [expr]
    else:
        _check_expression(expr)
        stripped_lines = [line.strip() for line in expr.splitlines()]
        exprs = [line for line in stripped_lines if line != '']
    multi_line = len(exprs) > 1

    if target is None and multi_line:
        raise ValueError("multi-line expressions are only valid in the "
                         "context of data, use DataFrame.eval")

    ret = None
    first_expr = True
    target_modified = False

    for expr in exprs:
        expr = _convert_expression(expr)
        engine = _check_engine(engine)
        _check_parser(parser)
        _check_resolvers(resolvers)
        _check_for_locals(expr, level, parser)

        # get our (possibly passed-in) scope; this call must stay in this
        # frame because ``level`` counts stack frames
        env = _ensure_scope(level + 1, global_dict=global_dict,
                            local_dict=local_dict, resolvers=resolvers,
                            target=target)

        parsed_expr = Expr(expr, engine=engine, parser=parser, env=env,
                           truediv=truediv)

        # construct the engine and evaluate the parsed expression
        engine_cls = _engines[engine]
        ret = engine_cls(parsed_expr).evaluate()

        assigner = parsed_expr.assigner
        if assigner is None:
            if multi_line:
                raise ValueError("Multi-line expressions are only valid"
                                 " if all expressions contain an assignment")
            if inplace:
                raise ValueError("Cannot operate inplace "
                                 "if there is no assignment")

        # nothing to assign: move on to the next expression
        if env.target is None or assigner is None:
            continue

        target_modified = True

        # if returning a copy, copy only on the first assignment
        if inplace or not first_expr:
            target = env.target
        else:
            try:
                target = env.target.copy()
            except AttributeError:
                raise ValueError("Cannot return a copy of the target")

        # TypeError is most commonly raised (e.g. int, list), but you
        # get IndexError if you try to do this assignment on np.ndarray.
        try:
            target[assigner] = ret
        except (TypeError, IndexError):
            raise ValueError("Cannot assign expression output to target")

        if resolvers:
            # existing resolver needs updated to handle
            # case of mutating existing column in copy
            for resolver in resolvers:
                if assigner in resolver:
                    resolver[assigner] = ret
                    break
            else:
                resolvers += ({assigner: ret},)
        else:
            resolvers = ({assigner: ret},)

        ret = None
        first_expr = False

    # We want to exclude `inplace=None` as being False.
    if inplace is False:
        return target if target_modified else ret
def eval(expr, parser='pandas', engine=None, truediv=True,
         local_dict=None, global_dict=None, resolvers=(), level=0,
         target=None, inplace=None):
    """Evaluate a Python expression as a string using various backends.

    Supported arithmetic operations are ``+``, ``-``, ``*``, ``/``, ``**``,
    ``%`` and ``//`` (python engine only); supported boolean operations are
    ``|`` (or), ``&`` (and) and ``~`` (not). The ``'pandas'`` parser also
    accepts :keyword:`and`, :keyword:`or` and :keyword:`not` with the
    semantics of the corresponding bitwise operators.
    :class:`~pandas.Series` and :class:`~pandas.DataFrame` objects are
    supported and behave as they would with plain Python evaluation.

    Parameters
    ----------
    expr : str or unicode
        The expression to evaluate. It may contain only Python
        `expressions
        <http://docs.python.org/2/reference/simple_stmts.html#expression-statements>`__,
        not Python `statements
        <http://docs.python.org/2/reference/simple_stmts.html#simple-statements>`__.
    parser : string, default 'pandas', {'pandas', 'python'}
        The parser used to build the syntax tree. ``'pandas'`` parses code
        slightly differently than standard Python; ``'python'`` retains
        strict Python semantics. See the
        :ref:`enhancing performance <enhancingperf.eval>` documentation.
    engine : string or None, default None, {'python', 'numexpr'}
        The engine used to evaluate the expression.

        - None : tries to use ``numexpr``, falls back to ``python``
        - ``'numexpr'``: evaluates pandas objects using numexpr for large
          speed ups in complex expressions with large frames.
        - ``'python'``: performs operations as if you had ``eval``'d in
          top level python. This engine is generally not that useful.
    truediv : bool, optional
        Whether to use true division, like in Python >= 3
    local_dict : dict or None, optional
        A dictionary of local variables, taken from locals() by default.
    global_dict : dict or None, optional
        A dictionary of global variables, taken from globals() by default.
    resolvers : list of dict-like or None, optional
        Objects implementing ``__getitem__`` that provide additional
        namespaces for variable lookup, e.g. the ones injected by
        :meth:`~pandas.DataFrame.query`.
    level : int, optional
        The number of prior stack frames to traverse and add to the
        current scope. Most users will **not** need to change this.
    target : a target object for assignment, optional, default is None
        essentially this is a passed in resolver
    inplace : bool or None, default None
        If the expression mutates, whether to modify the object inplace
        or return a copy with the mutation.

        WARNING: inplace=None currently falls back to True (with a
        FutureWarning when an assignment happens), but in a future
        version will default to False. Use inplace=True explicitly
        rather than relying on the default.

    Returns
    -------
    ndarray, numeric scalar, DataFrame, Series
        With ``inplace=False`` the mutated copy of ``target`` is returned
        when an assignment took place; otherwise the evaluated result.

    Raises
    ------
    ValueError
        If the expression is multiline and ``target`` is None, or if a
        line of a multiline expression contains no assignment.

    Notes
    -----
    The ``dtype`` of any objects involved in an arithmetic ``%`` operation
    are recursively cast to ``float64``.

    See Also
    --------
    pandas.DataFrame.query
    pandas.DataFrame.eval
    """
    inplace = validate_bool_kwarg(inplace, 'inplace')

    if isinstance(expr, string_types):
        _check_expression(expr)
        exprs = [e.strip() for e in expr.splitlines() if e.strip() != '']
    else:
        exprs = [expr]
    multi_line = len(exprs) > 1

    if multi_line and target is None:
        raise ValueError("multi-line expressions are only valid in the "
                         "context of data, use DataFrame.eval")

    ret = None
    first_expr = True
    # tracks whether any expression actually assigned into ``target``
    target_modified = False

    for expr in exprs:
        expr = _convert_expression(expr)
        engine = _check_engine(engine)
        _check_parser(parser)
        _check_resolvers(resolvers)
        _check_for_locals(expr, level, parser)

        # get our (possibly passed-in) scope
        env = _ensure_scope(level + 1, global_dict=global_dict,
                            local_dict=local_dict, resolvers=resolvers,
                            target=target)

        parsed_expr = Expr(expr, engine=engine, parser=parser, env=env,
                           truediv=truediv)

        # construct the engine and evaluate the parsed expression
        eng = _engines[engine]
        eng_inst = eng(parsed_expr)
        ret = eng_inst.evaluate()

        if parsed_expr.assigner is None and multi_line:
            raise ValueError("Multi-line expressions are only valid"
                             " if all expressions contain an assignment")

        # assign if needed
        if env.target is not None and parsed_expr.assigner is not None:
            if inplace is None:
                warnings.warn(
                    "eval expressions containing an assignment currently "
                    "default to operating inplace.\nThis will change in "
                    "a future version of pandas, use inplace=True to "
                    "avoid this warning.",
                    FutureWarning, stacklevel=3)
                inplace = True

            target_modified = True

            # if returning a copy, copy only on the first assignment
            if not inplace and first_expr:
                target = env.target.copy()
            else:
                target = env.target

            target[parsed_expr.assigner] = ret

            if not resolvers:
                resolvers = ({parsed_expr.assigner: ret},)
            else:
                # existing resolver needs updated to handle
                # case of mutating existing column in copy
                for resolver in resolvers:
                    if parsed_expr.assigner in resolver:
                        resolver[parsed_expr.assigner] = ret
                        break
                else:
                    resolvers += ({parsed_expr.assigner: ret},)

            ret = None
            first_expr = False

    # ``inplace=None`` must not be treated as False here. Without an
    # assignment there is no mutated copy to hand back, so return the
    # evaluated result instead of the untouched ``target``.
    if inplace is False:
        return target if target_modified else ret

    return ret
def test_validate_bool_kwarg(name, value):
    """Check that ``validate_bool_kwarg`` returns an accepted value as is."""
    validated = validate_bool_kwarg(value, name)
    assert validated == value
def test_validate_bool_kwarg_fail(name, value):
    """Check that ``validate_bool_kwarg`` rejects a non-boolean ``value``.

    The raised ``ValueError`` must name the argument and the type of the
    value that was received.
    """
    received = type(value).__name__
    expected = ("For argument \"%s\" expected type bool, received type %s"
                % (name, received))

    rejection = pytest.raises(ValueError, match=expected)
    with rejection:
        validate_bool_kwarg(value, name)
def eval(expr, parser='pandas', engine=None, truediv=True,
         local_dict=None, global_dict=None, resolvers=(), level=0,
         target=None, inplace=False):
    """Evaluate a Python expression as a string using various backends.

    Supported arithmetic operations are ``+``, ``-``, ``*``, ``/``, ``**``,
    ``%`` and ``//`` (python engine only); supported boolean operations are
    ``|`` (or), ``&`` (and) and ``~`` (not). The ``'pandas'`` parser also
    accepts :keyword:`and`, :keyword:`or` and :keyword:`not` with the
    semantics of the corresponding bitwise operators.
    :class:`~pandas.Series` and :class:`~pandas.DataFrame` objects are
    supported and behave as they would with plain Python evaluation.

    Parameters
    ----------
    expr : str or unicode
        The expression to evaluate. It may contain only Python
        `expressions
        <https://docs.python.org/3/reference/simple_stmts.html#expression-statements>`__,
        not Python `statements
        <https://docs.python.org/3/reference/simple_stmts.html#simple-statements>`__.
    parser : string, default 'pandas', {'pandas', 'python'}
        The parser used to build the syntax tree. ``'pandas'`` parses code
        slightly differently than standard Python; ``'python'`` retains
        strict Python semantics. See the
        :ref:`enhancing performance <enhancingperf.eval>` documentation.
    engine : string or None, default None, {'python', 'numexpr'}
        The engine used to evaluate the expression.

        - None : tries to use ``numexpr``, falls back to ``python``
        - ``'numexpr'``: evaluates pandas objects using numexpr for large
          speed ups in complex expressions with large frames.
        - ``'python'``: performs operations as if you had ``eval``'d in
          top level python. This engine is generally not that useful.
    truediv : bool, optional
        Whether to use true division, like in Python >= 3
    local_dict : dict or None, optional
        A dictionary of local variables, taken from locals() by default.
    global_dict : dict or None, optional
        A dictionary of global variables, taken from globals() by default.
    resolvers : list of dict-like or None, optional
        Objects implementing ``__getitem__`` that provide additional
        namespaces for variable lookup, e.g. the ``DataFrame.index`` and
        ``DataFrame.columns`` namespaces injected by
        :meth:`~pandas.DataFrame.query`.
    level : int, optional
        The number of prior stack frames to traverse and add to the
        current scope. Most users will **not** need to change this.
    target : object, optional, default None
        The target object for assignment. When the expression assigns,
        `target` must support item assignment with string keys, and
        `.copy()` if a copy is being returned.
    inplace : bool, default False
        If `target` is provided and the expression mutates it, whether to
        modify `target` inplace. Otherwise a copy of `target` with the
        mutation is returned.

    Returns
    -------
    ndarray, numeric scalar, DataFrame, Series

    Raises
    ------
    ValueError
        - `target=None`, but the expression is multiline.
        - The expression is multiline, but not every line contains an
          item assignment (e.g. ``a = b + 1`` followed by ``a + 2``).
        - `inplace=True`, but the expression is missing item assignment.
        - Item assignment is provided, but the `target` does not support
          string item assignment.
        - Item assignment is provided and `inplace=False`, but the
          `target` does not support the `.copy()` method

    Notes
    -----
    The ``dtype`` of any objects involved in an arithmetic ``%`` operation
    are recursively cast to ``float64``.

    See Also
    --------
    pandas.DataFrame.query
    pandas.DataFrame.eval
    """
    from pandas.core.computation.expr import Expr

    inplace = validate_bool_kwarg(inplace, "inplace")

    if isinstance(expr, string_types):
        _check_expression(expr)
        exprs = []
        for raw_line in expr.splitlines():
            statement = raw_line.strip()
            if statement != '':
                exprs.append(statement)
    else:
        exprs = [expr]
    multi_line = len(exprs) > 1

    if multi_line and target is None:
        raise ValueError("multi-line expressions are only valid in the "
                         "context of data, use DataFrame.eval")

    ret = None
    first_expr = True
    target_modified = False

    for expr in exprs:
        expr = _convert_expression(expr)
        engine = _check_engine(engine)
        _check_parser(parser)
        _check_resolvers(resolvers)
        _check_for_locals(expr, level, parser)

        # get our (possibly passed-in) scope; this call must stay in this
        # frame because ``level`` counts stack frames
        env = _ensure_scope(level + 1, global_dict=global_dict,
                            local_dict=local_dict, resolvers=resolvers,
                            target=target)

        parsed_expr = Expr(expr, engine=engine, parser=parser, env=env,
                           truediv=truediv)

        # construct the engine and evaluate the parsed expression
        evaluator = _engines[engine](parsed_expr)
        ret = evaluator.evaluate()

        assigner = parsed_expr.assigner
        has_assignment = assigner is not None

        if not has_assignment:
            if multi_line:
                raise ValueError("Multi-line expressions are only valid"
                                 " if all expressions contain an assignment")
            elif inplace:
                raise ValueError("Cannot operate inplace "
                                 "if there is no assignment")

        # assign if needed
        if env.target is not None and has_assignment:
            target_modified = True

            # if returning a copy, copy only on the first assignment
            copy_needed = not inplace and first_expr
            if copy_needed:
                try:
                    target = env.target.copy()
                except AttributeError:
                    raise ValueError("Cannot return a copy of the target")
            else:
                target = env.target

            # TypeError is most commonly raised (e.g. int, list), but you
            # get IndexError if you try to do this assignment on np.ndarray.
            # we will ignore numpy warnings here; e.g. if trying
            # to use a non-numeric indexer
            try:
                with warnings.catch_warnings(record=True):
                    target[assigner] = ret
            except (TypeError, IndexError):
                raise ValueError("Cannot assign expression output to target")

            new_resolver = {assigner: ret}
            if not resolvers:
                resolvers = (new_resolver, )
            else:
                # existing resolver needs updated to handle
                # case of mutating existing column in copy
                for resolver in resolvers:
                    if assigner in resolver:
                        resolver[assigner] = ret
                        break
                else:
                    resolvers += (new_resolver, )

            ret = None
            first_expr = False

    # We want to exclude `inplace=None` as being False.
    if inplace is False:
        return target if target_modified else ret