def __init__(self, mapping=None, data=None, environment=None):
    """Initialise the plot, tolerating swapped (data, mapping) order."""
    # Callers may write ggplot(df, aes(...)); detect and swap.
    swapped = (isinstance(mapping, pd.DataFrame)
               and (data is None or isinstance(data, aes)))
    if swapped:
        mapping, data = data, mapping

    if mapping is None:
        mapping = aes()

    bad_data = data is not None and not isinstance(data, pd.DataFrame)
    if bad_data:
        raise PlotnineError(
            'data must be a dataframe or None if each '
            'layer will have separate data.')

    # Recognize plydata groups
    if 'group' not in mapping and hasattr(data, 'group_indices'):
        mapping = mapping.copy()
        mapping['group'] = data.group_indices()

    self.data = data
    self.mapping = mapping
    self.facet = facet_null()
    self.labels = make_labels(mapping)
    self.layers = Layers()
    self.guides = guides()
    self.scales = Scales()
    self.theme = None
    self.coordinates = coord_cartesian()
    # environment=None -> capture the caller's frame for expression lookup
    self.environment = environment or EvalEnvironment.capture(1)
    self.layout = None
    self.figure = None
    self.watermarks = []
    self.axs = None
def _evaluate_expressions(self, data):
    """
    Evaluates patsy expressions within the aesthetics.

    For example, 'x + 1', 'factor(x)', or 'pd.cut(price, bins=10)'.

    Parameters
    ----------
    data : DataFrame
        Data in which to look up mapped columns; a new column is added
        in place for every evaluated expression.

    Returns
    -------
    data : DataFrame
    """
    for key, item in self.data.items():
        if item in data:
            continue

        def factor(s, levels=None, labels=None):
            # Minimal stand-in for R's factor(): stringify the series.
            return s.apply(str)

        env = EvalEnvironment.capture(
            eval_env=(self.__eval_env__ or 1)).with_outer_namespace({
                "factor": factor,
                "pd": pd,
                "np": np
            })
        try:
            new_val = env.eval(item, inner_namespace=data)
            data[item] = new_val
        except Exception as e:
            # FIX: a bare `except:` also swallowed KeyboardInterrupt and
            # discarded the original error; catch Exception and chain it.
            msg = "Invalid column: '%s'" % str(item)
            matches = difflib.get_close_matches(item, data.columns)
            msg += "\ndid you mean one of the following:\n"
            for match in matches:
                msg += " - %s\n" % match
            raise Exception(msg) from e
    return data
def test_formula_factor_origin():
    """Factors parsed from a formula string carry their source Origin."""
    from patsy.origin import Origin
    env = EvalEnvironment.capture(0)
    desc = ModelDesc.from_formula("a + b", env)
    first_factor = desc.rhs_termlist[1].factors[0]
    second_factor = desc.rhs_termlist[2].factors[0]
    assert first_factor.origin == Origin("a + b", 0, 1)
    assert second_factor.origin == Origin("a + b", 4, 5)
def dmatrix(formula_like, data={}, eval_env=0, return_type="matrix"):
    """Construct a single design matrix given a formula_like and data.

    :arg formula_like: An object that can be used to construct a design
      matrix. See below.
    :arg data: A dict-like object that can be used to look up variables
      referenced in `formula_like`.
    :arg eval_env: Either a :class:`EvalEnvironment` which will be used to
      look up any variables referenced in `formula_like` that cannot be
      found in `data`, or else a depth represented as an integer which will
      be passed to :meth:`EvalEnvironment.capture`. ``eval_env=0`` means to
      use the context of the function calling :func:`dmatrix` for lookups.
      If calling this function from a library, you probably want
      ``eval_env=1``, which means that variables should be resolved in
      *your* caller's namespace.
    :arg return_type: Either ``"matrix"`` or ``"dataframe"``. See below.

    The `formula_like` can take a variety of forms:

    * A formula string like "x1 + x2" (for :func:`dmatrix`) or "y ~ x1 +
      x2" (for :func:`dmatrices`). For details see :ref:`formulas`.
    * A :class:`ModelDesc`, which is a Python object representation of a
      formula. See :ref:`formulas` and :ref:`expert-model-specification`
      for details.
    * A :class:`DesignMatrixBuilder`.
    * An object that has a method called :meth:`__patsy_get_model_desc__`.
      For details see :ref:`expert-model-specification`.
    * A numpy array_like (for :func:`dmatrix`) or a tuple (array_like,
      array_like) (for :func:`dmatrices`). These will have metadata added,
      representation normalized, and then be returned directly. In this
      case `data` and `eval_env` are ignored. There is special handling for
      two cases:

      * :class:`DesignMatrix` objects will have their :class:`DesignInfo`
        preserved. This allows you to set up custom column names and term
        information even if you aren't using the rest of the patsy
        machinery.
      * :class:`pandas.DataFrame` or :class:`pandas.Series` objects will
        have their (row) indexes checked. If two are passed in, their
        indexes must be aligned. If ``return_type="dataframe"``, then
        their indexes will be preserved on the output.

    Regardless of the input, the return type is always either:

    * A :class:`DesignMatrix`, if ``return_type="matrix"`` (the default)
    * A :class:`pandas.DataFrame`, if ``return_type="dataframe"``.

    The actual contents of the design matrix is identical in both cases,
    and in both cases a :class:`DesignInfo` will be available in a
    ``.design_info`` attribute on the return value. However, for
    ``return_type="dataframe"``, any pandas indexes on the input (either
    in `data` or directly passed through `formula_like`) will be
    preserved, which may be useful for e.g. time-series models.
    """
    # Re-capture relative to our caller so formula variables resolve there.
    eval_env = EvalEnvironment.capture(eval_env, reference=1)
    (lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env,
                                      return_type)
    # A one-sided formula must not produce outcome (LHS) columns.
    if lhs.shape[1] != 0:
        raise PatsyError("encountered outcome variables for a model "
                         "that does not expect them")
    return rhs
def dmatrices(formula_like, data={}, eval_env=0, NA_action="drop",
              return_type="matrix"):
    """Construct two design matrices given a formula_like and data.

    This function is identical to :func:`dmatrix`, except that it requires
    (and returns) two matrices instead of one. By convention, the first
    matrix is the "outcome" or "y" data, and the second is the "predictor"
    or "x" data. It requires the formula to specify both a left-hand side
    outcome matrix and a right-hand side predictors matrix, which are
    returned as a tuple.

    See :func:`dmatrix` for details.
    """
    # Re-capture relative to our caller so formula variables resolve there.
    eval_env = EvalEnvironment.capture(eval_env, reference=1)
    (lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env,
                                      NA_action, return_type)
    # A two-sided formula must produce at least one outcome column.
    if lhs.shape[1] == 0:
        raise PatsyError("model is missing required outcome variables")
    return (lhs, rhs)
def __init__(self, mapping=None, data=None, environment=None):
    """
    Initialise the plot.

    Parameters
    ----------
    mapping : aes, optional
        Default aesthetic mapping for the plot.
    data : dataframe, optional
        Default data for the plot; may be given in either order
        relative to ``mapping``.
    environment : EvalEnvironment, optional
        Namespace in which aesthetic expressions are evaluated.
        Defaults to the caller's environment.
    """
    # Allow some sloppiness
    mapping, data = order_as_mapping_data(mapping, data)

    if mapping is None:
        mapping = aes()

    # Recognize plydata groups
    if hasattr(data, 'group_indices') and 'group' not in mapping:
        mapping = mapping.copy()
        mapping['group'] = data.group_indices()

    self.data = data
    self.mapping = mapping
    self.facet = facet_null()
    self.labels = make_labels(mapping)
    self.layers = Layers()
    self.guides = guides()
    self.scales = Scales()
    self.theme = theme_get()  # current global default theme
    self.coordinates = coord_cartesian()
    # environment=None -> capture the caller's frame for expression lookup
    self.environment = environment or EvalEnvironment.capture(1)
    self.layout = None
    self.figure = None
    self.watermarks = []
    self.axs = None
def _evaluate_expressions(self, data):
    """
    Evaluates patsy expressions within the aesthetics.

    For example, 'x + 1', 'factor(x)', or 'pd.cut(price, bins=10)'.
    ``data`` is modified in place and returned; expressions that fail
    to evaluate are skipped (best effort).
    """
    for key, item in self.data.items():
        if item in data:
            continue

        def factor(s, levels=None, labels=None):
            # Minimal stand-in for R's factor(): stringify the series.
            return s.apply(str)

        env = EvalEnvironment.capture(
            eval_env=(self.__eval_env__ or 1)).with_outer_namespace({
                "factor": factor,
                "pd": pd,
                "np": np
            })
        try:
            new_val = env.eval(item, inner_namespace=data)
            data[item] = new_val
        except Exception:
            # FIX: keep the deliberate best-effort skip, but a bare
            # `except:` also swallowed KeyboardInterrupt/SystemExit.
            pass
    return data
def __init__(self, mapping=None, data=None, environment=None):
    """
    Initialise the plot from an optional mapping and dataframe.

    ``mapping`` and ``data`` may be given in either order; any other
    non-dataframe ``data`` raises :class:`PlotnineError`.
    """
    # Allow some sloppiness: ggplot(df, aes(...)) is accepted too.
    if (isinstance(mapping, pd.DataFrame) and
            (data is None or isinstance(data, aes))):
        mapping, data = data, mapping
    if mapping is None:
        mapping = aes()
    if (data is not None and
            not isinstance(data, pd.DataFrame)):
        raise PlotnineError('data must be a dataframe or None if each '
                            'layer will have separate data.')

    # Recognize plydata groups
    if hasattr(data, 'group_indices') and 'group' not in mapping:
        mapping = mapping.copy()
        mapping['group'] = data.group_indices()

    self.data = data
    self.mapping = mapping
    self.facet = facet_null()
    self.labels = make_labels(mapping)
    self.layers = Layers()
    self.guides = guides()
    self.scales = Scales()
    self.theme = None  # resolved later when the plot is drawn
    self.coordinates = coord_cartesian()
    # environment=None -> capture the caller's frame for expression lookup
    self.environment = environment or EvalEnvironment.capture(1)
    self.layout = None
    self.figure = None
    self.watermarks = []
    self.axs = None
def test_ModelDesc_from_formula():
    """from_formula accepts both formula strings and pre-parsed trees."""
    for formula in ("y ~ x", parse_formula("y ~ x")):
        eval_env = EvalEnvironment.capture(0)
        md = ModelDesc.from_formula(formula, eval_env)
        expected_lhs = [Term([EvalFactor("y", eval_env)])]
        expected_rhs = [INTERCEPT, Term([EvalFactor("x", eval_env)])]
        assert md.lhs_termlist == expected_lhs
        assert md.rhs_termlist == expected_rhs
def __init__(self, *args, **kwargs):
    """Collect aesthetic mappings from positional and keyword args."""
    self.data = dict(zip(self.DEFAULT_ARGS, args)) if args else {}
    self.data.update(kwargs)
    # Accept the British spelling, store under 'color'.
    if 'colour' in self.data:
        self.data['color'] = self.data.pop('colour')
    # Remember the caller's namespace for later expression evaluation.
    self.__eval_env__ = EvalEnvironment.capture(1)
def _apply_transforms(data, aes):
    """Adds columns from the aes included transformations

    Possible transformations are "factor(<col>)" and expressions which
    can be used with eval.

    Parameters
    ----------
    data : DataFrame
        the original dataframe
    aes : aesthetics
        the aesthetic

    Returns
    -------
    data : DataFrame
        Transformed DataFrame
    """
    data = data.copy()
    for ae, name in aes.items():
        if isinstance(name, six.string_types) and name not in data:
            # here we assume that it is a transformation.
            # if the mapping is to a single value (color="red"), this will
            # be handled by pandas and assigned to the whole index. See
            # also the last case in mapping building in get_layer!
            from patsy.eval import EvalEnvironment

            def factor(s, levels=None, labels=None):
                # TODO: This factor implementation needs improvements...
                # probably only gonna happen after
                # https://github.com/pydata/pandas/issues/5313 is
                # implemented in pandas ...
                if levels or labels:
                    print("factor levels or labels are not yet implemented.")
                return s.apply(str)

            # use either the captured eval_env from aes or the env one
            # step up
            env = EvalEnvironment.capture(eval_env=(aes.__eval_env__ or 1))
            # add factor as a special case
            env.add_outer_namespace({"factor": factor})
            try:
                new_val = env.eval(name, inner_namespace=data)
            except Exception as e:
                msg = "Could not evaluate the '%s' mapping: '%s' (original error: %s)"
                raise Exception(msg % (ae, name, str(e)))
            try:
                data[name] = new_val
            except Exception as e:
                # FIX: this previously referenced the undefined name
                # `_new_val`, raising NameError instead of the intended
                # diagnostic message.
                msg = """The '%s' mapping: '%s' produced a value of type '%s', but only single items and lists/arrays can be used. (original error: %s)"""
                raise Exception(msg % (ae, name, str(type(new_val)), str(e)))
    return data
def _do_eval_formula_tests(tests):  # pragma: no cover
    """Check ModelDesc.from_formula against a {formula: expected} table.

    Each expected value is either ``(rhs_intercept, rhs_termlist)`` for a
    one-sided formula, or a 4-tuple that also gives the left-hand side.
    """
    for code, result in six.iteritems(tests):
        if len(result) == 2:
            # One-sided formula: prepend an empty left-hand side.
            result = (False, []) + result
        eval_env = EvalEnvironment.capture(0)
        model_desc = ModelDesc.from_formula(code, eval_env)
        print(repr(code))
        print(result)
        print(model_desc)
        lhs_intercept, lhs_termlist, rhs_intercept, rhs_termlist = result
        _assert_terms_match(model_desc.lhs_termlist,
                            lhs_intercept, lhs_termlist)
        _assert_terms_match(model_desc.rhs_termlist,
                            rhs_intercept, rhs_termlist)
def _do_eval_formula_tests(tests):  # pragma: no cover
    """Drive ModelDesc.from_formula over a table of formula -> expectation."""
    for code, expected in six.iteritems(tests):
        if len(expected) == 2:
            # One-sided formula: prepend an empty left-hand side.
            expected = (False, []) + expected
        eval_env = EvalEnvironment.capture(0)
        model_desc = ModelDesc.from_formula(code, eval_env)
        print(repr(code))
        print(expected)
        print(model_desc)
        lhs_int, lhs_terms, rhs_int, rhs_terms = expected
        _assert_terms_match(model_desc.lhs_termlist, lhs_int, lhs_terms,
                            eval_env)
        _assert_terms_match(model_desc.rhs_termlist, rhs_int, rhs_terms,
                            eval_env)
def test_evalfactor_reraise():
    """EvalFactor attaches a useful Origin to errors raised inside factors.

    From issue #11.
    """
    env = EvalEnvironment.capture()
    data = {"X": [0, 1, 2, 3], "Y": [1, 2, 3, 4]}
    formula = "C(X) + Y"
    new_data = {"X": [0, 0, 1, 2, 3, 3, 4], "Y": [1, 2, 3, 4, 5, 6, 7]}
    info = dmatrix(formula, data)
    # This will produce a PatsyError, which is originally raised within the
    # call to C() (which has no way to know where it is being called
    # from). But EvalFactor should notice this, and add a useful origin:
    try:
        build_design_matrices([info.design_info.builder], new_data)
    except PatsyError as e:
        # FIX: `except PatsyError, e:` is Python 2-only syntax and is a
        # SyntaxError on Python 3.
        assert e.origin == Origin(formula, 0, 4)
def __init__(self, *args, **kwargs):
    """Collect aesthetic mappings, normalising alternate spellings."""
    self.data = dict(zip(self.DEFAULT_ARGS, args)) if args else {}
    self.data.update(kwargs)
    # Map ggplot vocabulary onto matplotlib's:
    # colour -> color, linetype -> linestyle
    for alias, canonical in (("colour", "color"),
                             ("linetype", "linestyle")):
        if alias in self.data:
            self.data[canonical] = self.data.pop(alias)
    # Remember the caller's namespace for later expression evaluation.
    self.__eval_env__ = EvalEnvironment.capture(1)
def incr_dbuilders(formula_like, data_iter_maker, eval_env=0):
    """Construct two design matrix builders incrementally from a large data
    set.

    :func:`incr_dbuilders` is to :func:`incr_dbuilder` as :func:`dmatrices`
    is to :func:`dmatrix`. See :func:`incr_dbuilder` for details.
    """
    # Re-capture relative to our caller so formula variables resolve there.
    eval_env = EvalEnvironment.capture(eval_env, reference=1)
    builders = _try_incr_builders(formula_like, data_iter_maker, eval_env)
    if builders is None:
        raise PatsyError("bad formula-like object")
    # builders[0] is the outcome (LHS) builder; it must have columns.
    if len(builders[0].design_info.column_names) == 0:
        raise PatsyError("model is missing required outcome variables")
    return builders
def dmatrices(formula_like, data={}, eval_env=0, NA_action="drop",
              return_type="matrix"):
    """Construct two design matrices given a formula_like and data.

    This function is identical to :func:`dmatrix`, except that it
    requires (and returns) two matrices instead of one. By convention,
    the first matrix is the "outcome" or "y" data, and the second is the
    "predictor" or "x" data.

    See :func:`dmatrix` for details.
    """
    # Resolve formula variables relative to our caller's namespace.
    eval_env = EvalEnvironment.capture(eval_env, reference=1)
    lhs, rhs = _do_highlevel_design(formula_like, data, eval_env,
                                    NA_action, return_type)
    if not lhs.shape[1]:
        raise PatsyError("model is missing required outcome variables")
    return (lhs, rhs)
def _apply_transforms(data, aes):
    """Adds columns from the aes included transformations

    Possible transformations are "factor(<col>)" and expressions which can
    be used with eval.

    Parameters
    ----------
    data : DataFrame
        the original dataframe
    aes : aesthetics
        the aesthetic

    Returns
    -------
    data : DataFrame
        Transformed DataFrame
    """
    data = data.copy()
    for ae, name in aes.items():
        if (isinstance(name, six.string_types) and (name not in data)):
            # here we assume that it is a transformation.
            # if the mapping is to a single value (color="red"), this will
            # be handled by pandas and assigned to the whole index. See
            # also the last case in mapping building in get_layer!
            from patsy.eval import EvalEnvironment

            def factor(s, levels=None, labels=None):
                # TODO: This factor implementation needs improvements...
                # probably only gonna happen after
                # https://github.com/pydata/pandas/issues/5313 is
                # implemented in pandas ...
                if levels or labels:
                    print("factor levels or labels are not yet implemented.")
                return s.apply(str)

            # use either the captured eval_env from aes or use the env one
            # step up
            env = EvalEnvironment.capture(eval_env=(aes.__eval_env__ or 1))
            # add factor as a special case
            env.add_outer_namespace({"factor": factor})
            try:
                new_val = env.eval(name, inner_namespace=data)
            except Exception as e:
                msg = "Could not evaluate the '%s' mapping: '%s' (original error: %s)"
                raise Exception(msg % (ae, name, str(e)))
            try:
                data[name] = new_val
            except Exception as e:
                msg = """The '%s' mapping: '%s' produced a value of type '%s', but only single items and lists/arrays can be used. (original error: %s)"""
                raise Exception(msg % (ae, name, str(type(new_val)), str(e)))
    return data
def _evaluate_aes_expressions(self):
    """
    Evaluates patsy expressions within the aesthetics.

    For example, 'x + 1', 'factor(x)', or 'pd.cut(price, bins=10)'.
    Evaluated columns are added to ``self.data`` in place; expressions
    that fail to evaluate are skipped (best effort).
    """
    for key, item in self._aes.items():
        if item in self.data:
            continue

        def factor(s, levels=None, labels=None):
            # Minimal stand-in for R's factor(): stringify the series.
            return s.apply(str)

        env = EvalEnvironment.capture(
            eval_env=(self._aes.__eval_env__ or 1)).with_outer_namespace({
                "factor": factor,
                "pd": pd,
                "np": np
            })
        try:
            new_val = env.eval(item, inner_namespace=self.data)
            self.data[item] = new_val
        except Exception:
            # FIX: keep the deliberate best-effort skip, but a bare
            # `except:` also swallowed KeyboardInterrupt/SystemExit.
            pass
def incr_dbuilder(formula_like, data_iter_maker, eval_env=0,
                  NA_action="drop"):
    """Construct a design matrix builder incrementally from a large data
    set.

    :arg formula_like: Similar to :func:`dmatrix`, except that explicit
      matrices are not allowed. Must be a formula string, a
      :class:`ModelDesc`, a :class:`DesignInfo`, or an object with a
      ``__patsy_get_model_desc__`` method.
    :arg data_iter_maker: A zero-argument callable which returns an iterator
      over dict-like data objects. This must be a callable rather than a
      simple iterator because sufficiently complex formulas may require
      multiple passes over the data (e.g. if there are nested stateful
      transforms).
    :arg eval_env: Either a :class:`EvalEnvironment` which will be used to
      look up any variables referenced in `formula_like` that cannot be
      found in `data`, or else a depth represented as an integer which will
      be passed to :meth:`EvalEnvironment.capture`. ``eval_env=0`` means to
      use the context of the function calling :func:`incr_dbuilder` for
      lookups. If calling this function from a library, you probably want
      ``eval_env=1``, which means that variables should be resolved in
      *your* caller's namespace.
    :arg NA_action: An :class:`NAAction` object or string, used to determine
      what values count as 'missing' for purposes of determining the levels
      of categorical factors.
    :returns: A :class:`DesignInfo`

    Tip: for `data_iter_maker`, write a generator like::

        def iter_maker():
            for data_chunk in my_data_store:
                yield data_chunk

    and pass `iter_maker` (*not* `iter_maker()`).

    .. versionadded:: 0.2.0
       The ``NA_action`` argument.
    """
    # Re-capture relative to our caller so formula variables resolve there.
    eval_env = EvalEnvironment.capture(eval_env, reference=1)
    design_infos = _try_incr_builders(formula_like, data_iter_maker,
                                      eval_env, NA_action)
    if design_infos is None:
        raise PatsyError("bad formula-like object")
    # A one-sided formula must not produce outcome (LHS) columns.
    if len(design_infos[0].column_names) > 0:
        raise PatsyError("encountered outcome variables for a model "
                         "that does not expect them")
    return design_infos[1]
def incr_dbuilders(formula_like, data_iter_maker, eval_env=0,
                   NA_action="drop"):
    """Construct two design matrix builders incrementally from a large
    data set.

    :func:`incr_dbuilders` is to :func:`incr_dbuilder` as
    :func:`dmatrices` is to :func:`dmatrix`. See :func:`incr_dbuilder`
    for details.
    """
    # Resolve formula variables relative to our caller's namespace.
    eval_env = EvalEnvironment.capture(eval_env, reference=1)
    builders = _try_incr_builders(formula_like, data_iter_maker,
                                  eval_env, NA_action)
    if builders is None:
        raise PatsyError("bad formula-like object")
    outcome_columns = builders[0].design_info.column_names
    if not outcome_columns:
        raise PatsyError("model is missing required outcome variables")
    return builders
def Q(name):
    """A way to 'quote' variable names, especially ones that do not
    otherwise meet Python's variable name rules.

    If ``x`` is a variable, ``Q("x")`` returns the value of ``x``. (Note
    that ``Q`` takes the *string* ``"x"``, not the value of ``x`` itself.)
    This works even if instead of ``x``, we have a variable name that would
    not otherwise be legal in Python.

    For example, if you have a column of data named `weight.in.kg`, then
    you can't write::

      y ~ weight.in.kg

    because Python will try to find a variable named ``weight``, that has
    an attribute named ``in``, that has an attribute named ``kg``. (And
    worse yet, ``in`` is a reserved word, which makes this example doubly
    broken.) Instead, write::

      y ~ Q("weight.in.kg")

    and all will be well. Note, though, that this requires embedding a
    Python string inside your formula, which may require some care with
    your quote marks. Some standard options include::

      my_fit_function("y ~ Q('weight.in.kg')", ...)
      my_fit_function('y ~ Q("weight.in.kg")', ...)
      my_fit_function("y ~ Q(\\"weight.in.kg\\")", ...)

    Note also that ``Q`` is an ordinary Python function, which means that
    you can use it in more complex expressions. For example, this is a
    legal formula::

      y ~ np.sqrt(Q("weight.in.kg"))
    """
    from patsy.eval import EvalEnvironment
    # Depth 1: look the name up in Q's *caller's* namespace.
    env = EvalEnvironment.capture(1)
    try:
        return env.namespace[name]
    except KeyError:
        # FIX: `raise NameError, msg` is Python 2-only syntax and is a
        # SyntaxError on Python 3.
        raise NameError("no data named %r found" % (name,))
def Q(name):
    """A way to 'quote' variable names, especially ones that do not
    otherwise meet Python's variable name rules.

    If ``x`` is a variable, ``Q("x")`` returns the value of ``x``. (Note
    that ``Q`` takes the *string* ``"x"``, not the value of ``x`` itself.)
    This works even if instead of ``x``, we have a variable name that would
    not otherwise be legal in Python.

    For example, if you have a column of data named `weight.in.kg`, then
    you can't write::

      y ~ weight.in.kg

    because Python will try to find a variable named ``weight``, that has
    an attribute named ``in``, that has an attribute named ``kg``. (And
    worse yet, ``in`` is a reserved word, which makes this example doubly
    broken.) Instead, write::

      y ~ Q("weight.in.kg")

    and all will be well. Note, though, that this requires embedding a
    Python string inside your formula, which may require some care with
    your quote marks. Some standard options include::

      my_fit_function("y ~ Q('weight.in.kg')", ...)
      my_fit_function('y ~ Q("weight.in.kg")', ...)
      my_fit_function("y ~ Q(\\"weight.in.kg\\")", ...)

    Note also that ``Q`` is an ordinary Python function, which means that
    you can use it in more complex expressions. For example, this is a
    legal formula::

      y ~ np.sqrt(Q("weight.in.kg"))
    """
    from patsy.eval import EvalEnvironment
    # Depth 1: look the name up in Q's *caller's* namespace.
    env = EvalEnvironment.capture(1)
    try:
        return env.namespace[name]
    except KeyError:
        # FIX: `raise NameError, msg` is Python 2-only syntax and is a
        # SyntaxError on Python 3.
        raise NameError("no data named %r found" % (name,))
def _evaluate_expressions(self, data):
    """
    Evaluates patsy expressions within the aesthetics.

    For example, 'x + 1', 'factor(x)', or 'pd.cut(price, bins=10)'.

    Parameters
    ----------
    data : DataFrame
        Data in which to look up mapped columns; a new column is added
        in place for every evaluated expression.

    Returns
    -------
    data : DataFrame
    """
    for key, item in self.data.items():
        if item in data:
            continue

        def factor(s, levels=None, labels=None):
            # Minimal stand-in for R's factor(): stringify the series.
            return s.apply(str)

        env = EvalEnvironment.capture(
            eval_env=(self.__eval_env__ or 1)).with_outer_namespace({
                "factor": factor,
                "pd": pd,
                "np": np
            })
        try:
            new_val = env.eval(item, inner_namespace=data)
            data[item] = new_val
        except Exception as e:
            # FIX: a bare `except:` also swallowed KeyboardInterrupt and
            # discarded the original error; catch Exception and chain it.
            msg = "Invalid column: '%s'" % str(item)
            matches = difflib.get_close_matches(item, data.columns)
            msg += "\ndid you mean one of the following:\n"
            for match in matches:
                msg += " - %s\n" % match
            raise Exception(msg) from e
    return data
def __init__(self, mapping=None, data=None, environment=None):
    """Initialise the plot, accepting (mapping, data) in either order."""
    # Allow some sloppiness: if the first argument is not an aes,
    # assume the caller passed the dataframe first.
    if not isinstance(mapping, aes):
        mapping, data = data, mapping
    mapping = aes() if mapping is None else mapping

    data_is_valid = data is None or isinstance(data, pd.DataFrame)
    if not data_is_valid:
        raise PlotnineError('data must be a dataframe or None if each '
                            'layer will have separate data.')

    self.data = data
    self.mapping = mapping
    self.facet = facet_null()
    self.labels = make_labels(mapping)
    self.layers = Layers()
    self.guides = guides()
    self.scales = Scales()
    self.theme = None
    self.coordinates = coord_cartesian()
    # environment=None -> capture the caller's frame for expression lookup
    self.environment = environment or EvalEnvironment.capture(1)
    self.layout = None
    self.figure = None
def test_ModelDesc_from_formula():
    """A formula string and a parse tree yield the same ModelDesc."""
    for formula in ("y ~ x", parse_formula("y ~ x")):
        eval_env = EvalEnvironment.capture(0)
        md = ModelDesc.from_formula(formula, eval_env)
        assert md.lhs_termlist == [Term([EvalFactor("y")])]
        assert md.rhs_termlist == [INTERCEPT, Term([EvalFactor("x")])]
def subset(self, which_terms):
    """Create a new :class:`DesignMatrixBuilder` that includes only a
    subset of the terms that this object does.

    For example, if `builder` has terms `x`, `y`, and `z`, then::

      builder2 = builder.subset(["x", "z"])

    will return a new builder that will return design matrices with only
    the columns corresponding to the terms `x` and `z`. After we do this,
    then in general these two expressions will return the same thing (here
    we assume that `x`, `y`, and `z` each generate a single column of the
    output)::

      build_design_matrix([builder], data)[0][:, [0, 2]]
      build_design_matrix([builder2], data)[0]

    However, a critical difference is that in the second case, `data` need
    not contain any values for `y`. This is very useful when doing
    prediction using a subset of a model, in which situation R usually
    forces you to specify dummy values for `y`.

    If using a formula to specify the terms to include, remember that like
    any formula, the intercept term will be included by default, so use
    `0` or `-1` in your formula if you want to avoid this.

    :arg which_terms: The terms which should be kept in the new
      :class:`DesignMatrixBuilder`. If this is a string, then it is parsed
      as a formula, and then the names of the resulting terms are taken as
      the terms to keep. If it is a list, then it can contain a mixture of
      term names (as strings) and :class:`Term` objects.

    .. versionadded: 0.2.0
    """
    # Map each factor back to the evaluator that knows how to compute it.
    factor_to_evaluators = {}
    for evaluator in self._evaluators:
        factor_to_evaluators[evaluator.factor] = evaluator
    design_info = self.design_info
    term_name_to_term = dict(zip(design_info.term_names,
                                 design_info.terms))
    if isinstance(which_terms, basestring):
        # We don't use this EvalEnvironment -- all we want to do is to
        # find matching terms, and we can't do that using == on Term
        # objects, because that calls == on factor objects, which in turn
        # compares EvalEnvironments. So all we do with the parsed formula
        # is pull out the term *names*, which the EvalEnvironment doesn't
        # affect. The empty environment here is just a placeholder to
        # allow the ModelDesc to be created:
        env = EvalEnvironment({})
        desc = ModelDesc.from_formula(which_terms, env)
        if desc.lhs_termlist:
            raise PatsyError("right-hand-side-only formula required")
        which_terms = [term.name() for term in desc.rhs_termlist]
    terms = []
    evaluators = set()
    term_to_column_builders = {}
    for term_or_name in which_terms:
        if isinstance(term_or_name, basestring):
            # Resolve a term name to the actual Term object.
            if term_or_name not in term_name_to_term:
                raise PatsyError("requested term %r not found in "
                                 "this DesignMatrixBuilder"
                                 % (term_or_name, ))
            term = term_name_to_term[term_or_name]
        else:
            term = term_or_name
        if term not in self._termlist:
            raise PatsyError("requested term '%s' not found in this "
                             "DesignMatrixBuilder" % (term, ))
        # Keep every evaluator any factor of this term depends on.
        for factor in term.factors:
            evaluators.add(factor_to_evaluators[factor])
        terms.append(term)
        column_builder = self._term_to_column_builders[term]
        term_to_column_builders[term] = column_builder
    return DesignMatrixBuilder(terms, evaluators,
                               term_to_column_builders)
def compute_aesthetics(self, plot):
    """
    Return a dataframe where the columns match the
    aesthetic mappings.

    Transformations like 'factor(cyl)' and other
    expression evaluation are made in here

    Parameters
    ----------
    plot : ggplot
        Supplies the default mapping, the evaluation environment
        and the scales.
    """
    data = self.data
    aesthetics = self.layer_mapping(plot.mapping)

    # Override grouping if set in layer.
    with suppress(KeyError):
        aesthetics['group'] = self.geom.aes_params['group']

    env = EvalEnvironment.capture(eval_env=plot.environment)
    env = env.with_outer_namespace({'factor': pd.Categorical})

    # Using `type` preserves the subclass of pd.DataFrame
    evaled = type(data)(index=data.index)

    # If a column name is not in the data, it is evaluated/transformed
    # in the environment of the call to ggplot
    for ae, col in aesthetics.items():
        if isinstance(col, six.string_types):
            if col in data:
                evaled[ae] = data[col]
            else:
                try:
                    new_val = env.eval(col, inner_namespace=data)
                except Exception as e:
                    raise PlotnineError(
                        _TPL_EVAL_FAIL.format(ae, col, str(e)))
                try:
                    evaled[ae] = new_val
                except Exception as e:
                    raise PlotnineError(
                        _TPL_BAD_EVAL_TYPE.format(
                            ae, col, str(type(new_val)), str(e)))
        elif pdtypes.is_list_like(col):
            n = len(col)
            # A mapped vector must be scalar-like or align with the data
            if len(data) and n != len(data) and n != 1:
                raise PlotnineError(
                    "Aesthetics must either be length one, " +
                    "or the same length as the data")
            # An empty dataframe does not admit a scalar value
            elif len(evaled) and n == 1:
                col = col[0]
            evaled[ae] = col
        elif is_known_scalar(col):
            # Wrap a scalar in a list only when the frame has no rows,
            # so assignment still creates a column
            if not len(evaled):
                col = [col]
            evaled[ae] = col
        else:
            msg = "Do not know how to deal with aesthetic '{}'"
            raise PlotnineError(msg.format(ae))

    # Record which aesthetics were produced so scales can add defaults
    evaled_aes = aes(**dict((col, col) for col in evaled))
    plot.scales.add_defaults(evaled, evaled_aes)

    if len(data) == 0 and len(evaled) > 0:
        # No data, and vectors suppled to aesthetics
        evaled['PANEL'] = 1
    else:
        evaled['PANEL'] = data['PANEL']

    self.data = add_group(evaled)
def test_eval_formula_error_reporting():
    """Parsing errors in formulas are reported with useful locations."""
    from patsy.parse_formula import _parsing_error_test

    def parse_fn(formula):
        return ModelDesc.from_formula(formula, EvalEnvironment.capture(0))

    _parsing_error_test(parse_fn, _eval_error_tests)
def test_formula_likes():
    """Exercise every kind of formula_like input via the t()/t_invalid
    helpers: arrays, DesignMatrix, pandas objects, foreign model sources,
    formula strings, ModelDesc objects, builders and eval depths."""
    # Plain array-like, rhs only
    t([[1, 2, 3], [4, 5, 6]], {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    t((None, [[1, 2, 3], [4, 5, 6]]), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    t(np.asarray([[1, 2, 3], [4, 5, 6]]), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    t((None, np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    dm = DesignMatrix([[1, 2, 3], [4, 5, 6]], default_column_prefix="foo")
    t(dm, {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"])
    t((None, dm), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"])

    # Plain array-likes, lhs and rhs
    t(([1, 2], [[1, 2, 3], [4, 5, 6]]), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"],
      [[1], [2]], ["y0"])
    t(([[1], [2]], [[1, 2, 3], [4, 5, 6]]), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"],
      [[1], [2]], ["y0"])
    t((np.asarray([1, 2]), np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"],
      [[1], [2]], ["y0"])
    t((np.asarray([[1], [2]]), np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"],
      [[1], [2]], ["y0"])
    x_dm = DesignMatrix([[1, 2, 3], [4, 5, 6]], default_column_prefix="foo")
    y_dm = DesignMatrix([1, 2], default_column_prefix="bar")
    t((y_dm, x_dm), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"],
      [[1], [2]], ["bar0"])
    # number of rows must match
    t_invalid(([1, 2, 3], [[1, 2, 3], [4, 5, 6]]), {}, 0)

    # tuples must have the right size
    t_invalid(([[1, 2, 3]], ), {}, 0)
    t_invalid(([[1, 2, 3]], [[1, 2, 3]], [[1, 2, 3]]), {}, 0)

    # plain Series and DataFrames
    if have_pandas:
        # Names are extracted
        t(pandas.DataFrame({"x": [1, 2, 3]}), {}, 0,
          False,
          [[1], [2], [3]], ["x"])
        t(pandas.Series([1, 2, 3], name="asdf"), {}, 0,
          False,
          [[1], [2], [3]], ["asdf"])
        t((pandas.DataFrame({"y": [4, 5, 6]}),
           pandas.DataFrame({"x": [1, 2, 3]})), {}, 0,
          False,
          [[1], [2], [3]], ["x"],
          [[4], [5], [6]], ["y"])
        t((pandas.Series([4, 5, 6], name="y"),
           pandas.Series([1, 2, 3], name="x")), {}, 0,
          False,
          [[1], [2], [3]], ["x"],
          [[4], [5], [6]], ["y"])
        # Or invented
        t((pandas.DataFrame([[4, 5, 6]]),
           pandas.DataFrame([[1, 2, 3]], columns=[7, 8, 9])), {}, 0,
          False,
          [[1, 2, 3]], ["x7", "x8", "x9"],
          [[4, 5, 6]], ["y0", "y1", "y2"])
        t(pandas.Series([1, 2, 3]), {}, 0,
          False,
          [[1], [2], [3]], ["x0"])
        # indices must match
        t_invalid((pandas.DataFrame([[1]], index=[1]),
                   pandas.DataFrame([[1]], index=[2])), {}, 0)

    # Foreign ModelDesc factories
    class ForeignModelSource(object):
        def __patsy_get_model_desc__(self, data):
            return ModelDesc([Term([LookupFactor("Y")])],
                             [Term([LookupFactor("X")])])
    foreign_model = ForeignModelSource()
    t(foreign_model,
      {"Y": [1, 2], "X": [[1, 2], [3, 4]]},
      0,
      True,
      [[1, 2], [3, 4]], ["X[0]", "X[1]"],
      [[1], [2]], ["Y"])

    class BadForeignModelSource(object):
        def __patsy_get_model_desc__(self, data):
            return data
    t_invalid(BadForeignModelSource(), {}, 0)

    # string formulas
    t("y ~ x", {"y": [1, 2], "x": [3, 4]}, 0,
      True,
      [[1, 3], [1, 4]], ["Intercept", "x"],
      [[1], [2]], ["y"])
    t("~ x", {"y": [1, 2], "x": [3, 4]}, 0,
      True,
      [[1, 3], [1, 4]], ["Intercept", "x"])
    t("x + y", {"y": [1, 2], "x": [3, 4]}, 0,
      True,
      [[1, 3, 1], [1, 4, 2]], ["Intercept", "x", "y"])

    # ModelDesc
    desc = ModelDesc([], [Term([LookupFactor("x")])])
    t(desc, {"x": [1.5, 2.5, 3.5]}, 0,
      True,
      [[1.5], [2.5], [3.5]], ["x"])
    desc = ModelDesc([], [Term([]), Term([LookupFactor("x")])])
    t(desc, {"x": [1.5, 2.5, 3.5]}, 0,
      True,
      [[1, 1.5], [1, 2.5], [1, 3.5]], ["Intercept", "x"])
    desc = ModelDesc([Term([LookupFactor("y")])],
                     [Term([]), Term([LookupFactor("x")])])
    t(desc, {"x": [1.5, 2.5, 3.5], "y": [10, 20, 30]}, 0,
      True,
      [[1, 1.5], [1, 2.5], [1, 3.5]], ["Intercept", "x"],
      [[10], [20], [30]], ["y"])

    # builders
    termlists = ([],
                 [Term([LookupFactor("x")])],
                 [Term([]), Term([LookupFactor("x")])],
                 )
    builders = design_matrix_builders(termlists,
                                      lambda: iter([{"x": [1, 2, 3]}]))
    # twople but with no LHS
    t((builders[0], builders[2]), {"x": [10, 20, 30]}, 0,
      True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"])
    # single DesignMatrixBuilder
    t(builders[2], {"x": [10, 20, 30]}, 0,
      True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"])
    # twople with LHS
    t((builders[1], builders[2]), {"x": [10, 20, 30]}, 0,
      True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"],
      [[10], [20], [30]], ["x"])

    # check depth arguments
    x_in_env = [1, 2, 3]
    t("~ x_in_env", {}, 0,
      True,
      [[1, 1], [1, 2], [1, 3]], ["Intercept", "x_in_env"])
    t("~ x_in_env", {"x_in_env": [10, 20, 30]}, 0,
      True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x_in_env"])
    # Trying to pull x_in_env out of our *caller* shouldn't work.
    t_invalid("~ x_in_env", {}, 1, exc=(NameError, PatsyError))
    # But then again it should, if called from one down on the stack:
    def check_nested_call():
        x_in_env = "asdf"
        t("~ x_in_env", {}, 1,
          True,
          [[1, 1], [1, 2], [1, 3]], ["Intercept", "x_in_env"])
    check_nested_call()
    # passing in an explicit EvalEnvironment also works:
    e = EvalEnvironment.capture(1)
    t_invalid("~ x_in_env", {}, e, exc=(NameError, PatsyError))
    e = EvalEnvironment.capture(0)
    def check_nested_call_2():
        x_in_env = "asdf"
        t("~ x_in_env", {}, e,
          True,
          [[1, 1], [1, 2], [1, 3]], ["Intercept", "x_in_env"])
    check_nested_call_2()
def design_matrix_builders(termlists, data_iter_maker, eval_env, NA_action="drop"): """Construct several :class:`DesignMatrixBuilders` from termlists. This is one of Patsy's fundamental functions. This function and :func:`build_design_matrices` together form the API to the core formula interpretation machinery. :arg termlists: A list of termlists, where each termlist is a list of :class:`Term` objects which together specify a design matrix. :arg data_iter_maker: A zero-argument callable which returns an iterator over dict-like data objects. This must be a callable rather than a simple iterator because sufficiently complex formulas may require multiple passes over the data (e.g. if there are nested stateful transforms). :arg eval_env: Either a :class:`EvalEnvironment` which will be used to look up any variables referenced in `termlists` that cannot be found in `data_iter_maker`, or else a depth represented as an integer which will be passed to :meth:`EvalEnvironment.capture`. ``eval_env=0`` means to use the context of the function calling :func:`design_matrix_builders` for lookups. If calling this function from a library, you probably want ``eval_env=1``, which means that variables should be resolved in *your* caller's namespace. :arg NA_action: An :class:`NAAction` object or string, used to determine what values count as 'missing' for purposes of determining the levels of categorical factors. :returns: A list of :class:`DesignMatrixBuilder` objects, one for each termlist passed in. This function performs zero or more iterations over the data in order to sniff out any necessary information about factor types, set up stateful transforms, pick column names, etc. See :ref:`formulas` for details. .. versionadded:: 0.2.0 The ``NA_action`` argument. .. versionadded:: 0.4.0 The ``eval_env`` argument. """ # People upgrading from versions prior to 0.4.0 could potentially have # passed NA_action as the 3rd positional argument. 
Fortunately # EvalEnvironment.capture only accepts int and EvalEnvironment objects, # and we improved its error messages to make this clear. eval_env = EvalEnvironment.capture(eval_env, reference=1) if isinstance(NA_action, str): NA_action = NAAction(NA_action) all_factors = set() for termlist in termlists: for term in termlist: all_factors.update(term.factors) factor_states = _factors_memorize(all_factors, data_iter_maker, eval_env) # Now all the factors have working eval methods, so we can evaluate them # on some data to find out what type of data they return. (num_column_counts, cat_levels_contrasts) = _examine_factor_types(all_factors, factor_states, data_iter_maker, NA_action) # Now we need the factor evaluators, which encapsulate the knowledge of # how to turn any given factor into a chunk of data: factor_evaluators = {} for factor in all_factors: if factor in num_column_counts: evaluator = _NumFactorEvaluator(factor, factor_states[factor], num_column_counts[factor]) else: assert factor in cat_levels_contrasts levels = cat_levels_contrasts[factor][0] evaluator = _CatFactorEvaluator(factor, factor_states[factor], levels) factor_evaluators[factor] = evaluator # And now we can construct the DesignMatrixBuilder for each termlist: builders = [] for termlist in termlists: result = _make_term_column_builders(termlist, num_column_counts, cat_levels_contrasts) new_term_order, term_to_column_builders = result assert frozenset(new_term_order) == frozenset(termlist) term_evaluators = set() for term in termlist: for factor in term.factors: term_evaluators.add(factor_evaluators[factor]) builders.append(DesignMatrixBuilder(new_term_order, term_evaluators, term_to_column_builders)) return builders
def qplot(x=None, y=None, data=None, facets=None, margins=False,
          geom='auto', xlim=None, ylim=None, log='', main=None,
          xlab=None, ylab=None, asp=None, **kwargs):
    """
    Quick plot

    Parameters
    ----------
    x : str | array_like
        x aesthetic
    y : str | array_like
        y aesthetic
    data : dataframe
        Data frame to use (optional). If not specified,
        will create one, extracting arrays from the current
        environment.
    facets : str
        Facet specification; tried first as a grid facet
        specification, then as a wrap specification.
    margins : bool
        Passed on to :class:`facet_grid` when grid faceting
        is used.
    geom : str | list
        *geom(s)* to do the drawing. If ``auto``, defaults
        to 'point' if ``x`` and ``y`` are specified or
        'histogram' if only ``x`` is specified.
    xlim : tuple
        x-axis limits
    ylim : tuple
        y-axis limits
    log : str in ``{'x', 'y', 'xy'}``
        Which variables to log transform.
    main : str
        Plot title
    xlab : str
        x-axis label
    ylab : str
        y-axis label
    asp : str | float
        The y/x aspect ratio.
    **kwargs : dict
        Arguments passed on to the geom.

    Returns
    -------
    p: ggplot
        ggplot object
    """
    # Extract all recognizable aesthetic mappings from the parameters
    # String values e.g "I('red')", "I(4)" are not treated as mappings
    environment = EvalEnvironment.capture(1)
    aesthetics = {} if x is None else {'x': x}
    if y is not None:
        aesthetics['y'] = y

    def is_mapping(value):
        """
        Return True if value is not enclosed in I() function
        """
        # Non-strings have no startswith/endswith; suppress treats
        # them as mappings.
        with suppress(AttributeError):
            return not (value.startswith('I(') and value.endswith(')'))
        return True

    def I(value):
        return value

    # Environment whose only job is to strip the I() wrapper from
    # non-mapping values like "I('red')".
    I_env = EvalEnvironment([{'I': I}])

    for ae in six.viewkeys(kwargs) & all_aesthetics:
        value = kwargs[ae]
        if is_mapping(value):
            aesthetics[ae] = value
        else:
            kwargs[ae] = I_env.eval(value)

    # List of geoms
    if is_string(geom):
        geom = [geom]
    elif isinstance(geom, tuple):
        geom = list(geom)

    if data is None:
        data = pd.DataFrame()

    # Work out plot data, and modify aesthetics, if necessary
    def replace_auto(lst, str2):
        """
        Replace all occurences of 'auto' in with str2
        """
        # Mutates lst in place and also returns it.
        for i, value in enumerate(lst):
            if value == 'auto':
                lst[i] = str2
        return lst

    if 'auto' in geom:
        if 'sample' in aesthetics:
            replace_auto(geom, 'qq')
        elif y is None:
            # If x is discrete we choose geom_bar &
            # geom_histogram otherwise. But we need to
            # evaluate the mapping to find out the dtype
            env = environment.with_outer_namespace(
                {'factor': pd.Categorical})

            if isinstance(aesthetics['x'], six.string_types):
                try:
                    x = env.eval(aesthetics['x'], inner_namespace=data)
                except Exception:
                    msg = "Could not evaluate aesthetic 'x={}'"
                    raise PlotnineError(msg.format(aesthetics['x']))
            elif not hasattr(aesthetics['x'], 'dtype'):
                x = np.asarray(aesthetics['x'])

            if array_kind.discrete(x):
                replace_auto(geom, 'bar')
            else:
                replace_auto(geom, 'histogram')
        else:
            if x is None:
                if pdtypes.is_list_like(aesthetics['y']):
                    # Invent an x that indexes y.
                    aesthetics['x'] = range(len(aesthetics['y']))
                    xlab = 'range(len(y))'
                    ylab = 'y'
                else:
                    # We could solve the issue in layer.compute_asthetics
                    # but it is not worth the extra complexity
                    raise PlotnineError(
                        "Cannot infer how long x should be.")
            replace_auto(geom, 'point')

    p = ggplot(aes(**aesthetics), data=data, environment=environment)

    def get_facet_type(facets):
        # Try grid faceting first, then wrap; fall back to no faceting.
        with suppress(PlotnineError):
            parse_grid_facets(facets)
            return 'grid'

        with suppress(PlotnineError):
            parse_wrap_facets(facets)
            return 'wrap'

        warn("Could not determine the type of faceting, "
             "therefore no faceting.")
        return 'null'

    if facets:
        facet_type = get_facet_type(facets)
        if facet_type == 'grid':
            p += facet_grid(facets, margins=margins)
        elif facet_type == 'wrap':
            p += facet_wrap(facets)
        else:
            p += facet_null()

    # Add geoms
    for g in geom:
        geom_name = 'geom_{}'.format(g)
        geom_klass = Registry[geom_name]
        stat_name = 'stat_{}'.format(geom_klass.DEFAULT_PARAMS['stat'])
        stat_klass = Registry[stat_name]
        # find params -- kwargs recognized by the geom or its default
        # stat, excluding any already used as aesthetic mappings
        recognized = (six.viewkeys(kwargs) &
                      (six.viewkeys(geom_klass.DEFAULT_PARAMS) |
                       geom_klass.aesthetics() |
                       six.viewkeys(stat_klass.DEFAULT_PARAMS) |
                       stat_klass.aesthetics()))
        recognized = recognized - six.viewkeys(aesthetics)
        params = {ae: kwargs[ae] for ae in recognized}
        p += geom_klass(**params)

    # pd.Series objects have name attributes. In a dataframe, the
    # series have the name of the column.
    labels = {}
    for ae in scaled_aesthetics & six.viewkeys(kwargs):
        with suppress(AttributeError):
            labels[ae] = kwargs[ae].name

    # Explicit xlab/ylab win; otherwise fall back to the series name
    # (suppress handles values without a .name attribute).
    with suppress(AttributeError):
        labels['x'] = xlab if xlab is not None else x.name

    with suppress(AttributeError):
        labels['y'] = ylab if ylab is not None else y.name

    if main is not None:
        labels['title'] = main

    if 'x' in log:
        p += scale_x_log10()

    if 'y' in log:
        p += scale_y_log10()

    if labels:
        p += labs(**labels)

    if asp:
        p += theme(aspect_ratio=asp)

    return p
def _get_env(eval_env): if isinstance(eval_env, int): # Here eval_env=0 refers to our caller's caller. return EvalEnvironment.capture(eval_env + 2) return eval_env
def qplot(x=None, y=None, data=None, facets=None, margins=False,
          geom='auto', xlim=None, ylim=None, log='', main=None,
          xlab=None, ylab=None, asp=None, **kwargs):
    """
    Quick plot

    Parameters
    ----------
    x : str | array_like
        x aesthetic
    y : str | array_like
        y aesthetic
    data : dataframe
        Data frame to use (optional). If not specified,
        will create one, extracting arrays from the current
        environment.
    facets : str
        Facet specification; tried first as a grid facet
        specification, then as a wrap specification.
    margins : bool
        Passed on to :class:`facet_grid` when grid faceting
        is used.
    geom : str | list
        *geom(s)* to do the drawing. If ``auto``, defaults
        to 'point' if ``x`` and ``y`` are specified or
        'histogram' if only ``x`` is specified.
    xlim : tuple
        x-axis limits
    ylim : tuple
        y-axis limits
    log : str in ``{'x', 'y', 'xy'}``
        Which variables to log transform.
    main : str
        Plot title
    xlab : str
        x-axis label
    ylab : str
        y-axis label
    asp : str | float
        The y/x aspect ratio.
    **kwargs : dict
        Arguments passed on to the geom.

    Returns
    -------
    p: ggplot
        ggplot object
    """
    # Extract all recognizable aesthetic mappings from the parameters
    # String values e.g "I('red')", "I(4)" are not treated as mappings
    environment = EvalEnvironment.capture(1)
    aesthetics = {} if x is None else {'x': x}
    if y is not None:
        aesthetics['y'] = y

    def is_mapping(value):
        """
        Return True if value is not enclosed in I() function
        """
        # Non-strings have no startswith/endswith; suppress treats
        # them as mappings.
        with suppress(AttributeError):
            return not (value.startswith('I(') and value.endswith(')'))
        return True

    def I(value):
        return value

    # Environment whose only job is to strip the I() wrapper from
    # non-mapping values like "I('red')".
    I_env = EvalEnvironment([{'I': I}])

    for ae in kwargs.keys() & all_aesthetics:
        value = kwargs[ae]
        if is_mapping(value):
            aesthetics[ae] = value
        else:
            kwargs[ae] = I_env.eval(value)

    # List of geoms
    if is_string(geom):
        geom = [geom]
    elif isinstance(geom, tuple):
        geom = list(geom)

    if data is None:
        data = pd.DataFrame()

    # Work out plot data, and modify aesthetics, if necessary
    def replace_auto(lst, str2):
        """
        Replace all occurences of 'auto' in with str2
        """
        # Mutates lst in place and also returns it.
        for i, value in enumerate(lst):
            if value == 'auto':
                lst[i] = str2
        return lst

    if 'auto' in geom:
        if 'sample' in aesthetics:
            replace_auto(geom, 'qq')
        elif y is None:
            # If x is discrete we choose geom_bar &
            # geom_histogram otherwise. But we need to
            # evaluate the mapping to find out the dtype
            env = environment.with_outer_namespace({'factor': pd.Categorical})

            if isinstance(aesthetics['x'], str):
                try:
                    x = env.eval(aesthetics['x'], inner_namespace=data)
                except Exception:
                    msg = "Could not evaluate aesthetic 'x={}'"
                    raise PlotnineError(msg.format(aesthetics['x']))
            elif not hasattr(aesthetics['x'], 'dtype'):
                x = np.asarray(aesthetics['x'])

            if array_kind.discrete(x):
                replace_auto(geom, 'bar')
            else:
                replace_auto(geom, 'histogram')
        else:
            if x is None:
                if pdtypes.is_list_like(aesthetics['y']):
                    # Invent an x that indexes y.
                    aesthetics['x'] = range(len(aesthetics['y']))
                    xlab = 'range(len(y))'
                    ylab = 'y'
                else:
                    # We could solve the issue in layer.compute_asthetics
                    # but it is not worth the extra complexity
                    raise PlotnineError("Cannot infer how long x should be.")
            replace_auto(geom, 'point')

    p = ggplot(aes(**aesthetics), data=data, environment=environment)

    def get_facet_type(facets):
        # Try grid faceting first, then wrap; fall back to no faceting.
        with suppress(PlotnineError):
            parse_grid_facets(facets)
            return 'grid'

        with suppress(PlotnineError):
            parse_wrap_facets(facets)
            return 'wrap'

        warn(
            "Could not determine the type of faceting, "
            "therefore no faceting.", PlotnineWarning)
        return 'null'

    if facets:
        facet_type = get_facet_type(facets)
        if facet_type == 'grid':
            p += facet_grid(facets, margins=margins)
        elif facet_type == 'wrap':
            p += facet_wrap(facets)
        else:
            p += facet_null()

    # Add geoms
    for g in geom:
        geom_name = 'geom_{}'.format(g)
        geom_klass = Registry[geom_name]
        stat_name = 'stat_{}'.format(geom_klass.DEFAULT_PARAMS['stat'])
        stat_klass = Registry[stat_name]
        # find params -- kwargs recognized by the geom or its default
        # stat, excluding any already used as aesthetic mappings
        recognized = (
            kwargs.keys() &
            (geom_klass.DEFAULT_PARAMS.keys() |
             geom_klass.aesthetics() |
             stat_klass.DEFAULT_PARAMS.keys() |
             stat_klass.aesthetics()))
        recognized = recognized - aesthetics.keys()
        params = {ae: kwargs[ae] for ae in recognized}
        p += geom_klass(**params)

    # pd.Series objects have name attributes. In a dataframe, the
    # series have the name of the column.
    labels = {}
    for ae in scaled_aesthetics & kwargs.keys():
        with suppress(AttributeError):
            labels[ae] = kwargs[ae].name

    # Explicit xlab/ylab win; otherwise fall back to the series name
    # (suppress handles values without a .name attribute).
    with suppress(AttributeError):
        labels['x'] = xlab if xlab is not None else x.name

    with suppress(AttributeError):
        labels['y'] = ylab if ylab is not None else y.name

    if main is not None:
        labels['title'] = main

    if 'x' in log:
        p += scale_x_log10()

    if 'y' in log:
        p += scale_y_log10()

    if labels:
        p += labs(**labels)

    if asp:
        p += theme(aspect_ratio=asp)

    return p
def compute_aesthetics(self, plot):
    """
    Return a dataframe where the columns match the
    aesthetic mappings.

    Transformations like 'factor(cyl)' and other
    expression evaluation are made in here

    Parameters
    ----------
    plot : ggplot
        Plot object; its ``mapping``, ``environment`` and
        ``scales`` attributes are read here.

    Side effects: replaces ``self.data`` with the evaluated
    dataframe (grouped via ``add_group``).
    """
    data = self.data
    aesthetics = self.layer_mapping(plot.mapping)

    # Override grouping if set in layer.
    with suppress(KeyError):
        aesthetics['group'] = self.geom.aes_params['group']

    # Evaluate expressions in the environment of the ggplot() call,
    # with `factor` available as an alias for pd.Categorical.
    env = EvalEnvironment.capture(eval_env=plot.environment)
    env = env.with_outer_namespace({'factor': pd.Categorical})

    # Using `type` preserves the subclass of pd.DataFrame
    evaled = type(data)(index=data.index)

    # If a column name is not in the data, it is evaluated/transformed
    # in the environment of the call to ggplot
    for ae, col in aesthetics.items():
        if isinstance(col, str):
            if col in data:
                evaled[ae] = data[col]
            else:
                try:
                    new_val = env.eval(col, inner_namespace=data)
                except Exception as e:
                    raise PlotnineError(
                        _TPL_EVAL_FAIL.format(ae, col, str(e)))

                try:
                    evaled[ae] = new_val
                except Exception as e:
                    # The expression evaluated, but its result cannot
                    # be stored as a dataframe column.
                    raise PlotnineError(
                        _TPL_BAD_EVAL_TYPE.format(
                            ae, col, str(type(new_val)), str(e)))
        elif pdtypes.is_list_like(col):
            n = len(col)
            if len(data) and n != len(data) and n != 1:
                raise PlotnineError(
                    "Aesthetics must either be length one, " +
                    "or the same length as the data")
            # An empty dataframe does not admit a scalar value
            elif len(evaled) and n == 1:
                col = col[0]
            evaled[ae] = col
        elif is_known_scalar(col):
            # Scalars must be wrapped in a list before assignment to
            # an empty dataframe.
            if not len(evaled):
                col = [col]
            evaled[ae] = col
        else:
            msg = "Do not know how to deal with aesthetic '{}'"
            raise PlotnineError(msg.format(ae))

    # Identity mapping (column name -> column name) for the evaluated
    # columns, used to register default scales.
    evaled_aes = aes(**dict((col, col) for col in evaled))
    plot.scales.add_defaults(evaled, evaled_aes)

    if len(data) == 0 and len(evaled) > 0:
        # No data, and vectors suppled to aesthetics
        evaled['PANEL'] = 1
    else:
        evaled['PANEL'] = data['PANEL']

    self.data = add_group(evaled)
def design_matrix_builders(termlists, data_iter_maker, eval_env,
                           NA_action="drop"):
    """Construct several :class:`DesignInfo` objects from termlists.

    This is one of Patsy's fundamental functions. This function and
    :func:`build_design_matrices` together form the API to the core formula
    interpretation machinery.

    :arg termlists: A list of termlists, where each termlist is a list of
      :class:`Term` objects which together specify a design matrix.
    :arg data_iter_maker: A zero-argument callable which returns an iterator
      over dict-like data objects. This must be a callable rather than a
      simple iterator because sufficiently complex formulas may require
      multiple passes over the data (e.g. if there are nested stateful
      transforms).
    :arg eval_env: Either a :class:`EvalEnvironment` which will be used to
      look up any variables referenced in `termlists` that cannot be found
      in `data_iter_maker`, or else a depth represented as an integer which
      will be passed to :meth:`EvalEnvironment.capture`. ``eval_env=0``
      means to use the context of the function calling
      :func:`design_matrix_builders` for lookups. If calling this function
      from a library, you probably want ``eval_env=1``, which means that
      variables should be resolved in *your* caller's namespace.
    :arg NA_action: An :class:`NAAction` object or string, used to determine
      what values count as 'missing' for purposes of determining the levels
      of categorical factors.
    :returns: A list of :class:`DesignInfo` objects, one for each termlist
      passed in.

    This function performs zero or more iterations over the data in order
    to sniff out any necessary information about factor types, set up
    stateful transforms, pick column names, etc.

    See :ref:`formulas` for details.

    .. versionadded:: 0.2.0
       The ``NA_action`` argument.
    .. versionadded:: 0.4.0
       The ``eval_env`` argument.
    """
    # People upgrading from versions prior to 0.4.0 could potentially have
    # passed NA_action as the 3rd positional argument. Fortunately
    # EvalEnvironment.capture only accepts int and EvalEnvironment objects,
    # and we improved its error messages to make this clear.
    eval_env = EvalEnvironment.capture(eval_env, reference=1)
    # Normalize string NA_action specs (e.g. "drop") into NAAction objects.
    if isinstance(NA_action, str):
        NA_action = NAAction(NA_action)
    # Collect the union of all factors across every termlist; factor state
    # is computed once and shared between termlists.
    all_factors = set()
    for termlist in termlists:
        for term in termlist:
            all_factors.update(term.factors)
    factor_states = _factors_memorize(all_factors, data_iter_maker, eval_env)
    # Now all the factors have working eval methods, so we can evaluate them
    # on some data to find out what type of data they return.
    (num_column_counts,
     cat_levels_contrasts) = _examine_factor_types(all_factors,
                                                   factor_states,
                                                   data_iter_maker,
                                                   NA_action)
    # Now we need the factor infos, which encapsulate the knowledge of
    # how to turn any given factor into a chunk of data:
    factor_infos = {}
    for factor in all_factors:
        if factor in num_column_counts:
            fi = FactorInfo(factor,
                            "numerical",
                            factor_states[factor],
                            num_columns=num_column_counts[factor],
                            categories=None)
        else:
            # Every factor is classified as either numerical or
            # categorical by _examine_factor_types.
            assert factor in cat_levels_contrasts
            categories = cat_levels_contrasts[factor][0]
            fi = FactorInfo(factor,
                            "categorical",
                            factor_states[factor],
                            num_columns=None,
                            categories=categories)
        factor_infos[factor] = fi
    # And now we can construct the DesignInfo for each termlist:
    design_infos = []
    for termlist in termlists:
        term_to_subterm_infos = _make_subterm_infos(termlist,
                                                    num_column_counts,
                                                    cat_levels_contrasts)
        assert isinstance(term_to_subterm_infos, OrderedDict)
        # Term order may change, but the set of terms must be preserved.
        assert frozenset(term_to_subterm_infos) == frozenset(termlist)
        this_design_factor_infos = {}
        for term in termlist:
            for factor in term.factors:
                this_design_factor_infos[factor] = factor_infos[factor]
        column_names = []
        for subterms in six.itervalues(term_to_subterm_infos):
            for subterm in subterms:
                for column_name in _subterm_column_names_iter(
                        factor_infos, subterm):
                    column_names.append(column_name)
        design_infos.append(DesignInfo(
            column_names,
            factor_infos=this_design_factor_infos,
            term_codings=term_to_subterm_infos))
    return design_infos
def test_formula_likes():
    """Exercise every kind of formula_like accepted by the high-level API.

    `t` builds design matrices from its first argument and checks the
    resulting values/column names; `t_invalid` asserts that construction
    fails. NOTE(review): the depth-argument tests at the bottom depend on
    the exact stack nesting of these calls -- do not restructure.
    """
    # Plain array-like, rhs only
    t([[1, 2, 3], [4, 5, 6]], {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    t((None, [[1, 2, 3], [4, 5, 6]]), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    t(np.asarray([[1, 2, 3], [4, 5, 6]]), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    t((None, np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    dm = DesignMatrix([[1, 2, 3], [4, 5, 6]], default_column_prefix="foo")
    t(dm, {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"])
    t((None, dm), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"])

    # Plain array-likes, lhs and rhs
    t(([1, 2], [[1, 2, 3], [4, 5, 6]]), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"],
      [[1], [2]], ["y0"])
    t(([[1], [2]], [[1, 2, 3], [4, 5, 6]]), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"],
      [[1], [2]], ["y0"])
    t((np.asarray([1, 2]), np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"],
      [[1], [2]], ["y0"])
    t((np.asarray([[1], [2]]), np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"],
      [[1], [2]], ["y0"])
    x_dm = DesignMatrix([[1, 2, 3], [4, 5, 6]], default_column_prefix="foo")
    y_dm = DesignMatrix([1, 2], default_column_prefix="bar")
    t((y_dm, x_dm), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"],
      [[1], [2]], ["bar0"])
    # number of rows must match
    t_invalid(([1, 2, 3], [[1, 2, 3], [4, 5, 6]]), {}, 0)

    # tuples must have the right size
    t_invalid(([[1, 2, 3]],), {}, 0)
    t_invalid(([[1, 2, 3]], [[1, 2, 3]], [[1, 2, 3]]), {}, 0)

    # plain Series and DataFrames
    if have_pandas:
        # Names are extracted
        t(pandas.DataFrame({"x": [1, 2, 3]}), {}, 0,
          False,
          [[1], [2], [3]], ["x"])
        t(pandas.Series([1, 2, 3], name="asdf"), {}, 0,
          False,
          [[1], [2], [3]], ["asdf"])
        t((pandas.DataFrame({"y": [4, 5, 6]}),
           pandas.DataFrame({"x": [1, 2, 3]})), {}, 0,
          False,
          [[1], [2], [3]], ["x"],
          [[4], [5], [6]], ["y"])
        t((pandas.Series([4, 5, 6], name="y"),
           pandas.Series([1, 2, 3], name="x")), {}, 0,
          False,
          [[1], [2], [3]], ["x"],
          [[4], [5], [6]], ["y"])
        # Or invented
        t((pandas.DataFrame([[4, 5, 6]]),
           pandas.DataFrame([[1, 2, 3]], columns=[7, 8, 9])), {}, 0,
          False,
          [[1, 2, 3]], ["x7", "x8", "x9"],
          [[4, 5, 6]], ["y0", "y1", "y2"])
        t(pandas.Series([1, 2, 3]), {}, 0,
          False,
          [[1], [2], [3]], ["x0"])
        # indices must match
        t_invalid((pandas.DataFrame([[1]], index=[1]),
                   pandas.DataFrame([[1]], index=[2])), {}, 0)

    # Foreign ModelDesc factories
    class ForeignModelSource(object):
        def __patsy_get_model_desc__(self, data):
            return ModelDesc([Term([LookupFactor("Y")])],
                             [Term([LookupFactor("X")])])
    foreign_model = ForeignModelSource()
    t(foreign_model,
      {"Y": [1, 2],
       "X": [[1, 2], [3, 4]]},
      0,
      True,
      [[1, 2], [3, 4]], ["X[0]", "X[1]"],
      [[1], [2]], ["Y"])

    class BadForeignModelSource(object):
        def __patsy_get_model_desc__(self, data):
            return data
    t_invalid(BadForeignModelSource(), {}, 0)

    # string formulas
    t("y ~ x", {"y": [1, 2], "x": [3, 4]}, 0,
      True,
      [[1, 3], [1, 4]], ["Intercept", "x"],
      [[1], [2]], ["y"])
    t("~ x", {"y": [1, 2], "x": [3, 4]}, 0,
      True,
      [[1, 3], [1, 4]], ["Intercept", "x"])
    t("x + y", {"y": [1, 2], "x": [3, 4]}, 0,
      True,
      [[1, 3, 1], [1, 4, 2]], ["Intercept", "x", "y"])

    # ModelDesc
    desc = ModelDesc([], [Term([LookupFactor("x")])])
    t(desc, {"x": [1.5, 2.5, 3.5]}, 0,
      True,
      [[1.5], [2.5], [3.5]], ["x"])
    desc = ModelDesc([], [Term([]), Term([LookupFactor("x")])])
    t(desc, {"x": [1.5, 2.5, 3.5]}, 0,
      True,
      [[1, 1.5], [1, 2.5], [1, 3.5]], ["Intercept", "x"])
    desc = ModelDesc([Term([LookupFactor("y")])],
                     [Term([]), Term([LookupFactor("x")])])
    t(desc, {"x": [1.5, 2.5, 3.5], "y": [10, 20, 30]}, 0,
      True,
      [[1, 1.5], [1, 2.5], [1, 3.5]], ["Intercept", "x"],
      [[10], [20], [30]], ["y"])

    # builders
    termlists = ([],
                 [Term([LookupFactor("x")])],
                 [Term([]), Term([LookupFactor("x")])],
                 )
    builders = design_matrix_builders(termlists,
                                      lambda: iter([{"x": [1, 2, 3]}]))
    # twople but with no LHS
    t((builders[0], builders[2]), {"x": [10, 20, 30]}, 0,
      True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"])
    # single DesignMatrixBuilder
    t(builders[2], {"x": [10, 20, 30]}, 0,
      True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"])
    # twople with LHS
    t((builders[1], builders[2]), {"x": [10, 20, 30]}, 0,
      True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"],
      [[10], [20], [30]], ["x"])

    # check depth arguments
    x_in_env = [1, 2, 3]
    t("~ x_in_env", {}, 0,
      True,
      [[1, 1], [1, 2], [1, 3]], ["Intercept", "x_in_env"])
    t("~ x_in_env", {"x_in_env": [10, 20, 30]}, 0,
      True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x_in_env"])
    # Trying to pull x_in_env out of our *caller* shouldn't work.
    t_invalid("~ x_in_env", {}, 1, exc=(NameError, PatsyError))

    # But then again it should, if called from one down on the stack:
    def check_nested_call():
        x_in_env = "asdf"
        t("~ x_in_env", {}, 1,
          True,
          [[1, 1], [1, 2], [1, 3]], ["Intercept", "x_in_env"])
    check_nested_call()

    # passing in an explicit EvalEnvironment also works:
    e = EvalEnvironment.capture(1)
    t_invalid("~ x_in_env", {}, e, exc=(NameError, PatsyError))
    e = EvalEnvironment.capture(0)

    def check_nested_call_2():
        x_in_env = "asdf"
        t("~ x_in_env", {}, e,
          True,
          [[1, 1], [1, 2], [1, 3]], ["Intercept", "x_in_env"])
    check_nested_call_2()
def dmatrix(formula_like, data={}, eval_env=0, NA_action="drop",
            return_type="matrix"):
    """Construct a single design matrix given a formula_like and data.

    :arg formula_like: An object that can be used to construct a design
      matrix: a formula string like ``"x1 + x2"`` (see :ref:`formulas`), a
      :class:`ModelDesc`, a :class:`DesignMatrixBuilder`, any object with a
      :meth:`__patsy_get_model_desc__` method (see
      :ref:`expert-model-specification`), or a numpy array_like -- in which
      case it is normalized and returned directly and `data` / `eval_env`
      are ignored. :class:`DesignMatrix` inputs keep their
      :class:`DesignInfo`; pandas inputs have their (row) indexes checked,
      and preserved when ``return_type="dataframe"``.
    :arg data: A dict-like object used to look up variables referenced in
      `formula_like`.
    :arg eval_env: Either a :class:`EvalEnvironment` used to look up any
      variables referenced in `formula_like` that cannot be found in
      `data`, or else a depth represented as an integer which will be
      passed to :meth:`EvalEnvironment.capture`. ``eval_env=0`` means to
      use the context of the function calling :func:`dmatrix` for lookups;
      if calling this function from a library you probably want
      ``eval_env=1`` so that variables resolve in *your* caller's
      namespace.
    :arg NA_action: What to do with rows containing missing values:
      ``"drop"`` them, ``"raise"`` an error, or pass an :class:`NAAction`
      object for customization.
    :arg return_type: ``"matrix"`` to get a :class:`DesignMatrix` (the
      default), or ``"dataframe"`` to get a :class:`pandas.DataFrame`.
      Either way the result carries a :class:`DesignInfo` in its
      ``.design_info`` attribute.
    :raises PatsyError: if `formula_like` specifies outcome (left-hand
      side) variables, which a single design matrix cannot represent.
    """
    # Resolve integer depths relative to *our* caller, so eval_env=0 means
    # the user's own namespace.
    eval_env = EvalEnvironment.capture(eval_env, reference=1)
    design = _do_highlevel_design(formula_like, data, eval_env,
                                  NA_action, return_type)
    lhs, rhs = design
    # dmatrix() handles one-sided specifications only; a non-empty LHS
    # means the caller wanted dmatrices() instead.
    if lhs.shape[1] != 0:
        raise PatsyError("encountered outcome variables for a model "
                         "that does not expect them")
    return rhs