Example #1
0
    def __init__(self, mapping=None, data=None, environment=None):
        """
        Create a plot object populated with default components.

        Parameters
        ----------
        mapping : aes, optional
            Default aesthetic mapping. An empty ``aes()`` is
            substituted when none is given.
        data : pandas.DataFrame, optional
            Default data for the plot. May be ``None`` if each layer
            will have separate data.
        environment : optional
            Evaluation environment stored on the plot. When falsy,
            ``EvalEnvironment.capture(1)`` is used instead.

        Raises
        ------
        PlotnineError
            If `data` is neither ``None`` nor a dataframe.
        """
        # Tolerate a dataframe passed in the mapping position, either
        # alone or followed by the aes
        if isinstance(mapping, pd.DataFrame):
            if data is None or isinstance(data, aes):
                mapping, data = data, mapping

        if mapping is None:
            mapping = aes()

        if not (data is None or isinstance(data, pd.DataFrame)):
            raise PlotnineError(
                'data must be a dataframe or None if each '
                'layer will have separate data.')

        # Data that exposes group_indices() (plydata groups) supplies
        # the group aesthetic unless one was mapped explicitly. The
        # mapping is copied before it is modified.
        if hasattr(data, 'group_indices'):
            if 'group' not in mapping:
                mapping = mapping.copy()
                mapping['group'] = data.group_indices()

        # Inputs
        self.data = data
        self.mapping = mapping

        # Default plot components
        self.facet = facet_null()
        self.labels = make_labels(mapping)
        self.layers = Layers()
        self.guides = guides()
        self.scales = Scales()
        self.theme = None
        self.coordinates = coord_cartesian()

        if not environment:
            environment = EvalEnvironment.capture(1)
        self.environment = environment

        # Not known until later; start out unset
        self.layout = None
        self.figure = None
        self.watermarks = []
        self.axs = None
Example #2
0
    def _evaluate_expressions(self, data):
        """
        Evaluates patsy expressions within the aesthetics. For example, 'x + 1'
        , 'factor(x)', or 'pd.cut(price, bins=10)')

        Every aesthetic value that is not already present in `data` is
        evaluated with `data` as the inner namespace (plus ``factor``,
        ``pd`` and ``np``), and the result is stored in `data` under the
        expression text.

        Parameters
        ----------
        data : DataFrame
            Data the expressions are evaluated against. It is modified
            in place.

        Returns
        -------
        data : DataFrame
            The same object that was passed in.

        Raises
        ------
        Exception
            If an expression cannot be evaluated or stored. The message
            names the expression and lists close column-name matches.
        """
        # Loop-invariant helper made available to the expressions;
        # `levels` and `labels` are accepted but ignored.
        def factor(s, levels=None, labels=None):
            return s.apply(str)

        for _key, item in self.data.items():
            if item in data:
                continue

            # The capture must happen in this frame: a numeric eval_env
            # is a stack depth relative to the caller of capture().
            env = EvalEnvironment.capture(
                eval_env=(self.__eval_env__ or 1)).with_outer_namespace({
                    "factor": factor,
                    "pd": pd,
                    "np": np,
                })
            try:
                data[item] = env.eval(item, inner_namespace=data)
            except Exception:
                # Deliberately not a bare ``except``: KeyboardInterrupt
                # and SystemExit must not be reported as a bad column.
                msg = "Invalid column: '%s'" % str(item)
                matches = difflib.get_close_matches(item, data.columns)
                msg += "\ndid you mean one of the following:\n"
                for match in matches:
                    msg += "    - %s\n" % match
                raise Exception(msg)
        return data
Example #3
0
def test_formula_factor_origin():
    """Check the origin recorded on factors built from a formula string."""
    from patsy.origin import Origin
    formula = "a + b"
    desc = ModelDesc.from_formula(formula, EvalEnvironment.capture(0))
    # (index into rhs_termlist, origin start, origin end); index 0 is
    # not checked here -- presumably the implicit intercept.
    expected_spans = ((1, 0, 1), (2, 4, 5))
    for index, start, end in expected_spans:
        factor = desc.rhs_termlist[index].factors[0]
        assert factor.origin == Origin(formula, start, end)
Example #4
0
def dmatrix(formula_like, data={}, eval_env=0, return_type="matrix"):
    """Build one design matrix from a formula_like and data.

    :arg formula_like: Something a design matrix can be constructed from;
      the accepted forms are listed below.
    :arg data: A dict-like object in which variables referenced by
      `formula_like` are looked up.
    :arg eval_env: Either an :class:`EvalEnvironment` that is consulted for
      variables referenced in `formula_like` but missing from `data`, or
      an integer depth that is handed to :meth:`EvalEnvironment.capture`.
      ``eval_env=0`` uses the context of the function that called
      :func:`dmatrix`. Library code calling this function most likely
      wants ``eval_env=1``, so that variables are resolved in the
      namespace of *its* caller.
    :arg return_type: ``"matrix"`` or ``"dataframe"``; see below.

    `formula_like` may be any of the following:

    * A formula string such as "x1 + x2" (for :func:`dmatrix`) or
      "y ~ x1 + x2" (for :func:`dmatrices`). See :ref:`formulas`.
    * A :class:`ModelDesc`, the Python object representation of a formula.
      See :ref:`formulas` and :ref:`expert-model-specification`.
    * A :class:`DesignMatrixBuilder`.
    * An object with a method named :meth:`__patsy_get_model_desc__`. See
      :ref:`expert-model-specification`.
    * A numpy array_like (for :func:`dmatrix`) or a tuple
      (array_like, array_like) (for :func:`dmatrices`). These get metadata
      added and their representation normalized, and are then returned
      directly; `data` and `eval_env` are ignored in this case. Two kinds
      of input receive special handling:

      * :class:`DesignMatrix` objects keep their :class:`DesignInfo`, so
        custom column names and term information can be set up without
        using the rest of the patsy machinery.
      * :class:`pandas.DataFrame` or :class:`pandas.Series` objects have
        their (row) indexes checked. When two are passed, their indexes
        must be aligned. With ``return_type="dataframe"`` the indexes are
        preserved on the output.

    Whatever the input, the value returned is always one of:

    * A :class:`DesignMatrix`, if ``return_type="matrix"`` (the default)
    * A :class:`pandas.DataFrame`, if ``return_type="dataframe"``.

    The contents of the design matrix are identical in both cases, and in
    both cases the return value carries a :class:`DesignInfo` in its
    ``.design_info`` attribute. With ``return_type="dataframe"``, however,
    any pandas indexes on the input (in `data` or passed directly through
    `formula_like`) are preserved, which can be useful for e.g.
    time-series models.

    :raises PatsyError: If the input produces outcome (left-hand side)
      columns, which a single design matrix does not expect.
    """
    # The capture has to happen in this frame: the depth is interpreted
    # relative to it (reference=1).
    eval_env = EvalEnvironment.capture(eval_env, reference=1)
    design = _do_highlevel_design(formula_like, data, eval_env, return_type)
    (outcome, predictors) = design
    if outcome.shape[1] == 0:
        return predictors
    raise PatsyError("encountered outcome variables for a model "
                     "that does not expect them")
Example #5
0
def dmatrices(formula_like,
              data={},
              eval_env=0,
              NA_action="drop",
              return_type="matrix"):
    """Build a pair of design matrices from a formula_like and data.

    This function works like :func:`dmatrix`, except that it requires the
    formula to specify both a left-hand side outcome matrix and a
    right-hand side predictors matrix, which are returned as a tuple. By
    convention, the first matrix is the "outcome" or "y" data, and the
    second is the "predictor" or "x" data.

    See :func:`dmatrix` for details.

    :raises PatsyError: If the left-hand side has no columns.
    """
    # Resolve a numeric depth relative to this frame (reference=1).
    eval_env = EvalEnvironment.capture(eval_env, reference=1)
    design = _do_highlevel_design(formula_like, data, eval_env, NA_action,
                                  return_type)
    (outcome, predictors) = design
    if outcome.shape[1] != 0:
        return (outcome, predictors)
    raise PatsyError("model is missing required outcome variables")
Example #6
0
File: desc.py Project: noelhx/patsy
def test_formula_factor_origin():
    """Factors parsed from "a + b" must record where they came from."""
    from patsy.origin import Origin
    source = "a + b"
    env = EvalEnvironment.capture(0)
    desc = ModelDesc.from_formula(source, env)

    first_factor = desc.rhs_termlist[1].factors[0]
    assert first_factor.origin == Origin(source, 0, 1)

    second_factor = desc.rhs_termlist[2].factors[0]
    assert second_factor.origin == Origin(source, 4, 5)
Example #7
0
    def __init__(self, mapping=None, data=None, environment=None):
        """
        Create a plot object populated with default components.

        Parameters
        ----------
        mapping : aes, optional
            Default aesthetic mapping. An empty ``aes()`` is
            substituted when none is given.
        data : optional
            Default data for the plot.
        environment : optional
            Evaluation environment stored on the plot. When falsy,
            ``EvalEnvironment.capture(1)`` is used instead.
        """
        # The two leading arguments may come in either order;
        # order_as_mapping_data sorts out which is which
        ordered = order_as_mapping_data(mapping, data)
        mapping, data = ordered
        if mapping is None:
            mapping = aes()

        # Data that exposes group_indices() (plydata groups) supplies
        # the group aesthetic unless one was mapped explicitly. The
        # mapping is copied before it is modified.
        if hasattr(data, 'group_indices'):
            if 'group' not in mapping:
                mapping = mapping.copy()
                mapping['group'] = data.group_indices()

        # Inputs
        self.data = data
        self.mapping = mapping

        # Default plot components
        self.facet = facet_null()
        self.labels = make_labels(mapping)
        self.layers = Layers()
        self.guides = guides()
        self.scales = Scales()
        self.theme = theme_get()
        self.coordinates = coord_cartesian()

        if not environment:
            environment = EvalEnvironment.capture(1)
        self.environment = environment

        # Not known until later; start out unset
        self.layout = None
        self.figure = None
        self.watermarks = []
        self.axs = None
Example #8
0
    def _evaluate_expressions(self, data):
        """
        Evaluates patsy expressions within the aesthetics. For example, 'x + 1'
        , 'factor(x)', or 'pd.cut(price, bins=10)')

        Every aesthetic value that is not already present in `data` is
        evaluated with `data` as the inner namespace (plus ``factor``,
        ``pd`` and ``np``), and the result is stored in `data` under the
        expression text. Expressions that fail to evaluate are skipped.

        Parameters
        ----------
        data : DataFrame
            Data the expressions are evaluated against. It is modified
            in place.

        Returns
        -------
        data : DataFrame
            The same object that was passed in.
        """
        # Loop-invariant helper made available to the expressions;
        # `levels` and `labels` are accepted but ignored.
        def factor(s, levels=None, labels=None):
            return s.apply(str)

        for _key, item in self.data.items():
            if item in data:
                continue

            # The capture must happen in this frame: a numeric eval_env
            # is a stack depth relative to the caller of capture().
            env = EvalEnvironment.capture(
                eval_env=(self.__eval_env__ or 1)).with_outer_namespace({
                    "factor": factor,
                    "pd": pd,
                    "np": np,
                })
            try:
                data[item] = env.eval(item, inner_namespace=data)
            except Exception:
                # Best effort: a value that is not a usable expression
                # is simply left out. ``Exception`` rather than a bare
                # ``except`` so that KeyboardInterrupt and SystemExit
                # still propagate.
                pass
        return data
Example #9
0
    def __init__(self, mapping=None, data=None, environment=None):
        """
        Initialise the plot with its default components.

        Parameters
        ----------
        mapping : aes, optional
            Default aesthetic mapping; ``aes()`` when omitted.
        data : pandas.DataFrame, optional
            Default data; ``None`` is allowed if each layer will have
            separate data.
        environment : optional
            Evaluation environment to keep on the plot. A falsy value
            is replaced by ``EvalEnvironment.capture(1)``.

        Raises
        ------
        PlotnineError
            If `data` is neither ``None`` nor a dataframe.
        """
        # Allow some sloppiness: a dataframe given first, optionally
        # followed by the aes, is put back in (mapping, data) order
        given_reversed = (
            isinstance(mapping, pd.DataFrame)
            and (data is None or isinstance(data, aes)))
        if given_reversed:
            mapping, data = data, mapping

        if mapping is None:
            mapping = aes()

        if data is not None:
            if not isinstance(data, pd.DataFrame):
                raise PlotnineError(
                    'data must be a dataframe or None if each '
                    'layer will have separate data.')

        # Recognize plydata groups; the mapping is copied before the
        # group aesthetic is added to it
        if hasattr(data, 'group_indices') and 'group' not in mapping:
            grouped_mapping = mapping.copy()
            grouped_mapping['group'] = data.group_indices()
            mapping = grouped_mapping

        self.data = data
        self.mapping = mapping

        self.facet = facet_null()
        self.labels = make_labels(mapping)
        self.layers = Layers()
        self.guides = guides()
        self.scales = Scales()
        self.theme = None
        self.coordinates = coord_cartesian()

        if environment:
            self.environment = environment
        else:
            self.environment = EvalEnvironment.capture(1)

        self.layout = None
        self.figure = None
        self.watermarks = []
        self.axs = None
Example #10
0
def test_ModelDesc_from_formula():
    """from_formula must accept both a formula string and a parsed formula."""
    formula_inputs = ("y ~ x", parse_formula("y ~ x"))
    for formula in formula_inputs:
        eval_env = EvalEnvironment.capture(0)
        md = ModelDesc.from_formula(formula, eval_env)

        expected_lhs = [Term([EvalFactor("y", eval_env)])]
        expected_rhs = [INTERCEPT, Term([EvalFactor("x", eval_env)])]

        assert md.lhs_termlist == expected_lhs
        assert md.rhs_termlist == expected_rhs
Example #11
0
 def __init__(self, *args, **kwargs):
     """
     Collect the aesthetic mappings.

     Positional arguments are paired, in order, with the names in
     ``self.DEFAULT_ARGS``; keyword arguments are merged in afterwards.
     A ``colour`` entry is stored under ``color`` instead. The
     environment returned by ``EvalEnvironment.capture(1)`` is kept in
     ``__eval_env__``.
     """
     mappings = dict(zip(self.DEFAULT_ARGS, args)) if args else {}
     if kwargs:
         mappings.update(kwargs)
     # Normalise the British spelling
     if 'colour' in mappings:
         mappings['color'] = mappings.pop('colour')
     self.data = mappings
     self.__eval_env__ = EvalEnvironment.capture(1)
Example #12
0
def _apply_transforms(data, aes):
    """Adds columns from the aes included transformations

    Possible transformations are "factor(<col>)" and
    expressions which can be used with eval. Every string aesthetic
    value that is not already in `data` is evaluated and stored as a new
    column named after the expression text.

    Parameters
    ----------
    data : DataFrame
        the original dataframe (it is copied, not modified)
    aes : aesthetics
        the aesthetic

    Returns
    -------
    data : DataFrame
        Transformed DataFrame

    Raises
    ------
    Exception
        If an expression cannot be evaluated, or if the value it
        produces cannot be assigned as a column.
    """
    data = data.copy()
    for ae, name in aes.items():
        if (isinstance(name, six.string_types) and (name not in data)):
            # here we assume that it is a transformation
            # if the mapping is to a single value (color="red"), this will be handled by pandas and
            # assigned to the whole index. See also the last case in mapping building in get_layer!
            from patsy.eval import EvalEnvironment

            def factor(s, levels=None, labels=None):
                # TODO: This factor implementation needs improvements...
                # probably only gonna happen after https://github.com/pydata/pandas/issues/5313 is
                # implemented in pandas ...
                if levels or labels:
                    print("factor levels or labels are not yet implemented.")
                return s.apply(str)

            # use either the captured eval_env from aes or use the env one step up
            env = EvalEnvironment.capture(eval_env=(aes.__eval_env__ or 1))
            # add factor as a special case
            env.add_outer_namespace({"factor": factor})
            try:
                new_val = env.eval(name, inner_namespace=data)
            except Exception as e:
                msg = "Could not evaluate the '%s' mapping: '%s' (original error: %s)"
                raise Exception(msg % (ae, name, str(e)))
            try:
                data[name] = new_val
            except Exception as e:
                msg = """The '%s' mapping: '%s' produced a value of type '%s', but only single items
                and lists/arrays can be used. (original error: %s)"""
                # Report the type of the evaluated value; the previous
                # reference to an undefined `_new_val` raised NameError
                # here and hid this message.
                raise Exception(msg % (ae, name, str(type(new_val)), str(e)))
    return data
Example #13
0
File: desc.py Project: noelhx/patsy
def _do_eval_formula_tests(tests): # pragma: no cover
    """Run a table of formula-evaluation test cases.

    `tests` maps formula code to the expected result. A result is either
    ``(rhs_intercept, rhs_termlist)`` or the full 4-tuple
    ``(lhs_intercept, lhs_termlist, rhs_intercept, rhs_termlist)``; the
    short form is padded with ``(False, [])`` for the left-hand side.
    """
    for code, expected in six.iteritems(tests):
        if len(expected) == 2:
            expected = (False, []) + expected
        eval_env = EvalEnvironment.capture(0)
        model_desc = ModelDesc.from_formula(code, eval_env)
        # Printed so the case under test can be identified in the output
        print(repr(code))
        print(expected)
        print(model_desc)
        (lhs_intercept, lhs_termlist,
         rhs_intercept, rhs_termlist) = expected
        sides = (
            (model_desc.lhs_termlist, lhs_intercept, lhs_termlist),
            (model_desc.rhs_termlist, rhs_intercept, rhs_termlist),
        )
        for actual_terms, want_intercept, want_terms in sides:
            _assert_terms_match(actual_terms, want_intercept, want_terms)
Example #14
0
def _do_eval_formula_tests(tests):  # pragma: no cover
    """Check every formula in `tests` against its expected term lists.

    `tests` maps formula code to the expected result. A result is either
    ``(rhs_intercept, rhs_termlist)`` or the full 4-tuple
    ``(lhs_intercept, lhs_termlist, rhs_intercept, rhs_termlist)``; the
    short form is padded with ``(False, [])`` for the left-hand side.
    """
    for code, outcome in six.iteritems(tests):
        # Two-element results describe the right-hand side only
        if len(outcome) == 2:
            outcome = (False, []) + outcome
        eval_env = EvalEnvironment.capture(0)
        model_desc = ModelDesc.from_formula(code, eval_env)
        print(repr(code))
        print(outcome)
        print(model_desc)
        lhs_intercept, lhs_termlist, rhs_intercept, rhs_termlist = outcome

        actual_lhs = model_desc.lhs_termlist
        _assert_terms_match(actual_lhs, lhs_intercept, lhs_termlist,
                            eval_env)

        actual_rhs = model_desc.rhs_termlist
        _assert_terms_match(actual_rhs, rhs_intercept, rhs_termlist,
                            eval_env)
Example #15
0
def test_evalfactor_reraise():
    """A PatsyError raised inside a factor must gain the factor's origin."""
    # From issue #11:
    data = {"X": [0, 1, 2, 3], "Y": [1, 2, 3, 4]}
    formula = "C(X) + Y"
    new_data = {"X": [0, 0, 1, 2, 3, 3, 4], "Y": [1, 2, 3, 4, 5, 6, 7]}
    info = dmatrix(formula, data)
    # This will produce a PatsyError, which is originally raised within the
    # call to C() (which has no way to know where it is being called
    # from). But EvalFactor should notice this, and add a useful origin:
    try:
        build_design_matrices([info.design_info.builder], new_data)
    except PatsyError as e:
        # ``except X as e`` is valid on Python 2.6+ and Python 3
        assert e.origin == Origin(formula, 0, 4)
    else:
        # Without this the test would pass silently if nothing was raised
        raise AssertionError("build_design_matrices did not raise PatsyError")
Example #16
0
File: aes.py Project: eco32i/ggplot
 def __init__(self, *args, **kwargs):
     """
     Collect the aesthetic mappings.

     Positional arguments are paired, in order, with the names in
     ``self.DEFAULT_ARGS``; keyword arguments are merged in afterwards.
     The aliases ``colour`` and ``linetype`` are stored under ``color``
     and ``linestyle`` respectively. The environment returned by
     ``EvalEnvironment.capture(1)`` is kept in ``__eval_env__``.
     """
     mappings = {}
     if args:
         mappings = dict(zip(self.DEFAULT_ARGS, args))
     if kwargs:
         mappings.update(kwargs)

     # (alias, name the value is stored under)
     aliases = (("colour", "color"), ("linetype", "linestyle"))
     for alias, canonical in aliases:
         if alias in mappings:
             mappings[canonical] = mappings.pop(alias)

     self.data = mappings
     self.__eval_env__ = EvalEnvironment.capture(1)
Example #17
0
def incr_dbuilders(formula_like, data_iter_maker, eval_env=0):
    """Incrementally build two design matrix builders from a large data
    set.

    :func:`incr_dbuilders` relates to :func:`incr_dbuilder` the way
    :func:`dmatrices` relates to :func:`dmatrix`. See :func:`incr_dbuilder`
    for details.

    :raises PatsyError: If `formula_like` is not usable, or if the
      outcome (first) builder has no columns.
    """
    # Resolve a numeric depth relative to this frame (reference=1).
    eval_env = EvalEnvironment.capture(eval_env, reference=1)
    result = _try_incr_builders(formula_like, data_iter_maker, eval_env)
    if result is None:
        raise PatsyError("bad formula-like object")
    outcome_columns = result[0].design_info.column_names
    if not len(outcome_columns):
        raise PatsyError("model is missing required outcome variables")
    return result
Example #18
0
def dmatrices(formula_like, data={}, eval_env=0, NA_action="drop", return_type="matrix"):
    """Build a pair of design matrices from a formula_like and data.

    Works exactly like :func:`dmatrix`, but requires (and returns) two
    matrices rather than one. By convention the first matrix holds the
    "outcome" or "y" data and the second the "predictor" or "x" data.

    See :func:`dmatrix` for details.

    :raises PatsyError: If the outcome matrix has no columns.
    """
    # Resolve a numeric depth relative to this frame (reference=1).
    env = EvalEnvironment.capture(eval_env, reference=1)
    matrices = _do_highlevel_design(formula_like, data, env, NA_action, return_type)
    (y_matrix, x_matrix) = matrices
    has_outcome = y_matrix.shape[1] != 0
    if not has_outcome:
        raise PatsyError("model is missing required outcome variables")
    return (y_matrix, x_matrix)
Example #19
0
def _apply_transforms(data, aes):
    """Return a copy of `data` extended with the aes transformations.

    A transformation is either "factor(<col>)" or any expression that
    can be used with eval. Each string aesthetic value that is not
    already in the data is evaluated and stored as a new column named
    after the expression text.

    Parameters
    ----------
    data : DataFrame
        the original dataframe (left unmodified)
    aes : aesthetics
        the aesthetic

    Returns
    -------
    data : DataFrame
        Transformed DataFrame

    Raises
    ------
    Exception
        If an expression cannot be evaluated, or if the value it
        produces cannot be assigned as a column.
    """
    data = data.copy()
    for ae, name in aes.items():
        # Only strings that do not name an existing column are treated
        # as transformations
        if not isinstance(name, six.string_types):
            continue
        if name in data:
            continue

        # A mapping to a single value (color="red") is handled by pandas
        # and assigned to the whole index. See also the last case in
        # mapping building in get_layer!
        from patsy.eval import EvalEnvironment

        def factor(s, levels=None, labels=None):
            # TODO: This factor implementation needs improvements...
            # probably only gonna happen after
            # https://github.com/pydata/pandas/issues/5313 is implemented
            # in pandas ...
            if levels or labels:
                print("factor levels or labels are not yet implemented.")
            return s.apply(str)

        # Prefer the environment captured by the aes; otherwise use the
        # one a step up from here
        depth_or_env = aes.__eval_env__ or 1
        env = EvalEnvironment.capture(eval_env=depth_or_env)
        # factor is made available to the expressions as a special case
        env.add_outer_namespace({"factor": factor})

        try:
            new_val = env.eval(name, inner_namespace=data)
        except Exception as err:
            template = ("Could not evaluate the '%s' mapping: '%s' "
                        "(original error: %s)")
            raise Exception(template % (ae, name, str(err)))

        try:
            data[name] = new_val
        except Exception as err:
            # The embedded newline and indentation are part of the
            # message text
            template = ("The '%s' mapping: '%s' produced a value of type "
                        "'%s', but only single items\n"
                        "                and lists/arrays can be used. "
                        "(original error: %s)")
            details = (ae, name, str(type(new_val)), str(err))
            raise Exception(template % details)
    return data
Example #20
0
    def _evaluate_aes_expressions(self):
        """
        Evaluates patsy expressions within the aesthetics. For example, 'x + 1'
        , 'factor(x)', or 'pd.cut(price, bins=10)')

        Every value in ``self._aes`` that is not already present in
        ``self.data`` is evaluated with ``self.data`` as the inner
        namespace (plus ``factor``, ``pd`` and ``np``), and the result
        is stored in ``self.data`` under the expression text.
        Expressions that fail to evaluate are skipped. Nothing is
        returned.
        """
        # Loop-invariant helper made available to the expressions;
        # `levels` and `labels` are accepted but ignored.
        def factor(s, levels=None, labels=None):
            return s.apply(str)

        for _key, item in self._aes.items():
            if item in self.data:
                continue

            # The capture must happen in this frame: a numeric eval_env
            # is a stack depth relative to the caller of capture().
            env = EvalEnvironment.capture(
                eval_env=(self._aes.__eval_env__ or 1)
            ).with_outer_namespace({"factor": factor, "pd": pd, "np": np})
            try:
                self.data[item] = env.eval(item, inner_namespace=self.data)
            except Exception:
                # Best effort: a value that is not a usable expression
                # is simply left out. ``Exception`` rather than a bare
                # ``except`` so that KeyboardInterrupt and SystemExit
                # still propagate.
                pass
Example #21
0
def incr_dbuilder(formula_like, data_iter_maker, eval_env=0, NA_action="drop"):
    """Incrementally build a design matrix builder from a large data set.

    :arg formula_like: As for :func:`dmatrix`, with the exception that
      explicit matrices are not accepted. It must be a formula string, a
      :class:`ModelDesc`, a :class:`DesignInfo`, or an object that has a
      ``__patsy_get_model_desc__`` method.
    :arg data_iter_maker: A callable taking no arguments and returning an
      iterator over dict-like data objects. A callable is required, rather
      than a plain iterator, because sufficiently complex formulas (e.g.
      ones with nested stateful transforms) may need several passes over
      the data.
    :arg eval_env: Either an :class:`EvalEnvironment` that is consulted for
      variables referenced in `formula_like` but missing from `data`, or
      an integer depth that is handed to :meth:`EvalEnvironment.capture`.
      ``eval_env=0`` uses the context of the function that called
      :func:`incr_dbuilder`. Library code calling this function most
      likely wants ``eval_env=1``, so that variables are resolved in the
      namespace of *its* caller.
    :arg NA_action: An :class:`NAAction` object or string that determines
      which values count as 'missing' when working out the levels of
      categorical factors.
    :returns: A :class:`DesignInfo`
    :raises PatsyError: If `formula_like` is not usable, or if it
      produces outcome (left-hand side) columns.

    Tip: for `data_iter_maker`, write a generator like::

      def iter_maker():
          for data_chunk in my_data_store:
              yield data_chunk

    and pass `iter_maker` (*not* `iter_maker()`).

    .. versionadded:: 0.2.0
       The ``NA_action`` argument.
    """
    # Resolve a numeric depth relative to this frame (reference=1).
    eval_env = EvalEnvironment.capture(eval_env, reference=1)
    infos = _try_incr_builders(formula_like, data_iter_maker, eval_env,
                               NA_action)
    if infos is None:
        raise PatsyError("bad formula-like object")
    # Element 0 describes the outcome side, which must be empty here
    outcome_columns = infos[0].column_names
    if len(outcome_columns) != 0:
        raise PatsyError("encountered outcome variables for a model "
                         "that does not expect them")
    return infos[1]
Example #22
0
def incr_dbuilder(formula_like, data_iter_maker, eval_env=0, NA_action="drop"):
    """Build a single design matrix builder in incremental passes over a
    large data set.

    :arg formula_like: Like the argument of :func:`dmatrix`, but explicit
      matrices are not allowed. Accepted are a formula string, a
      :class:`ModelDesc`, a :class:`DesignInfo`, or an object with a
      ``__patsy_get_model_desc__`` method.
    :arg data_iter_maker: A zero-argument callable that returns an iterator
      over dict-like data objects. It has to be a callable and not a simple
      iterator, since sufficiently complex formulas may require multiple
      passes over the data (e.g. if there are nested stateful transforms).
    :arg eval_env: Either an :class:`EvalEnvironment` used to look up any
      variables referenced in `formula_like` that cannot be found in
      `data`, or a depth, given as an integer, that is passed to
      :meth:`EvalEnvironment.capture`. ``eval_env=0`` means the context of
      the function calling :func:`incr_dbuilder` is used for lookups. When
      calling this function from a library, ``eval_env=1`` is probably
      what you want: variables are then resolved in *your* caller's
      namespace.
    :arg NA_action: An :class:`NAAction` object or string, used to decide
      which values count as 'missing' for the purpose of determining the
      levels of categorical factors.
    :returns: A :class:`DesignInfo`
    :raises PatsyError: If `formula_like` is not usable, or if it
      produces outcome (left-hand side) columns.

    Tip: for `data_iter_maker`, write a generator like::

      def iter_maker():
          for data_chunk in my_data_store:
              yield data_chunk

    and pass `iter_maker` (*not* `iter_maker()`).

    .. versionadded:: 0.2.0
       The ``NA_action`` argument.
    """
    # The depth is interpreted relative to this frame (reference=1), so
    # the capture cannot be moved into a helper.
    env = EvalEnvironment.capture(eval_env, reference=1)
    pair = _try_incr_builders(formula_like, data_iter_maker, env, NA_action)
    if pair is None:
        raise PatsyError("bad formula-like object")
    has_outcome = len(pair[0].column_names) > 0
    if has_outcome:
        raise PatsyError("encountered outcome variables for a model "
                         "that does not expect them")
    return pair[1]
Example #23
0
def incr_dbuilders(formula_like,
                   data_iter_maker,
                   eval_env=0,
                   NA_action="drop"):
    """Incrementally build two design matrix builders from a large data
    set.

    :func:`incr_dbuilders` relates to :func:`incr_dbuilder` the way
    :func:`dmatrices` relates to :func:`dmatrix`. See :func:`incr_dbuilder`
    for details.

    :raises PatsyError: If `formula_like` is not usable, or if the
      outcome (first) builder has no columns.
    """
    # Resolve a numeric depth relative to this frame (reference=1).
    env = EvalEnvironment.capture(eval_env, reference=1)
    pair = _try_incr_builders(formula_like, data_iter_maker, env, NA_action)
    if pair is None:
        raise PatsyError("bad formula-like object")
    outcome_info = pair[0].design_info
    if len(outcome_info.column_names) < 1:
        raise PatsyError("model is missing required outcome variables")
    return pair
Example #24
0
def Q(name):
    """A way to 'quote' variable names, especially ones that do not otherwise
    meet Python's variable name rules.

    If ``x`` is a variable, ``Q("x")`` returns the value of ``x``. (Note that
    ``Q`` takes the *string* ``"x"``, not the value of ``x`` itself.) This
    works even if instead of ``x``, we have a variable name that would not
    otherwise be legal in Python.

    For example, if you have a column of data named `weight.in.kg`, then you
    can't write::

      y ~ weight.in.kg

    because Python will try to find a variable named ``weight``, that has an
    attribute named ``in``, that has an attribute named ``kg``. (And worse
    yet, ``in`` is a reserved word, which makes this example doubly broken.)
    Instead, write::

      y ~ Q("weight.in.kg")

    and all will be well. Note, though, that this requires embedding a Python
    string inside your formula, which may require some care with your quote
    marks. Some standard options include::

      my_fit_function("y ~ Q('weight.in.kg')", ...)
      my_fit_function('y ~ Q("weight.in.kg")', ...)
      my_fit_function("y ~ Q(\\"weight.in.kg\\")", ...)

    Note also that ``Q`` is an ordinary Python function, which means that you
    can use it in more complex expressions. For example, this is a legal
    formula::

      y ~ np.sqrt(Q("weight.in.kg"))

    :arg name: The variable name to look up, as a string.
    :returns: The value bound to `name` in the captured namespace.
    :raises NameError: If no variable called `name` is found.
    """
    from patsy.eval import EvalEnvironment

    # Depth 1: look the name up in the environment Q was called from,
    # not in Q's own frame.
    env = EvalEnvironment.capture(1)
    try:
        return env.namespace[name]
    except KeyError:
        # Call form of ``raise``: the ``raise E, msg`` statement form is
        # a SyntaxError on Python 3.
        raise NameError("no data named %r found" % (name,))
Example #25
0
def Q(name):
    """A way to 'quote' variable names, especially ones that do not otherwise
    meet Python's variable name rules.

    If ``x`` is a variable, ``Q("x")`` returns the value of ``x``. (Note that
    ``Q`` takes the *string* ``"x"``, not the value of ``x`` itself.) This
    works even if instead of ``x``, we have a variable name that would not
    otherwise be legal in Python.

    For example, if you have a column of data named `weight.in.kg`, then you
    can't write::

      y ~ weight.in.kg

    because Python will try to find a variable named ``weight``, that has an
    attribute named ``in``, that has an attribute named ``kg``. (And worse
    yet, ``in`` is a reserved word, which makes this example doubly broken.)
    Instead, write::

      y ~ Q("weight.in.kg")

    and all will be well. Note, though, that this requires embedding a Python
    string inside your formula, which may require some care with your quote
    marks. Some standard options include::

      my_fit_function("y ~ Q('weight.in.kg')", ...)
      my_fit_function('y ~ Q("weight.in.kg")', ...)
      my_fit_function("y ~ Q(\\"weight.in.kg\\")", ...)

    Note also that ``Q`` is an ordinary Python function, which means that you
    can use it in more complex expressions. For example, this is a legal
    formula::

      y ~ np.sqrt(Q("weight.in.kg"))

    :arg name: The variable name to look up, as a string.
    :returns: The value bound to `name` in the captured namespace.
    :raises NameError: If no variable called `name` is found.
    """
    from patsy.eval import EvalEnvironment
    # Depth 1: look the name up in the environment Q was called from,
    # not in Q's own frame.
    env = EvalEnvironment.capture(1)
    try:
        return env.namespace[name]
    except KeyError:
        # Call form of ``raise``: the ``raise E, msg`` statement form is
        # a SyntaxError on Python 3.
        raise NameError("no data named %r found" % (name, ))
Example #26
0
    def _evaluate_expressions(self, data):
        """
        Evaluates patsy expressions within the aesthetics. For example, 'x + 1'
        , 'factor(x)', or 'pd.cut(price, bins=10)')

        Every aesthetic value that is not already present in `data` is
        evaluated with `data` as the inner namespace (plus ``factor``,
        ``pd`` and ``np``), and the result is stored in `data` under the
        expression text.

        Parameters
        ----------
        data : DataFrame
            Data the expressions are evaluated against. It is modified
            in place.

        Returns
        -------
        data : DataFrame
            The same object that was passed in.

        Raises
        ------
        Exception
            If an expression cannot be evaluated or stored. The message
            names the expression and lists close column-name matches.
        """
        # Loop-invariant helper made available to the expressions;
        # `levels` and `labels` are accepted but ignored.
        def factor(s, levels=None, labels=None):
            return s.apply(str)

        extra_names = {"factor": factor, "pd": pd, "np": np}
        for _key, item in self.data.items():
            if item in data:
                continue

            # The capture must happen in this frame: a numeric eval_env
            # is a stack depth relative to the caller of capture().
            env = EvalEnvironment.capture(
                eval_env=(self.__eval_env__ or 1)
            ).with_outer_namespace(extra_names)
            try:
                data[item] = env.eval(item, inner_namespace=data)
            except Exception:
                # Deliberately not a bare ``except``: KeyboardInterrupt
                # and SystemExit must not be reported as a bad column.
                msg = "Invalid column: '%s'" % str(item)
                matches = difflib.get_close_matches(item, data.columns)
                msg += "\ndid you mean one of the following:\n"
                msg += "".join("    - %s\n" % match for match in matches)
                raise Exception(msg)
        return data
Example #27
0
    def __init__(self, mapping=None, data=None, environment=None):
        """
        Create a plot object populated with default components.

        Parameters
        ----------
        mapping : aes, optional
            Default aesthetic mapping. An empty ``aes()`` is
            substituted when none is given.
        data : pandas.DataFrame, optional
            Default data for the plot. May be ``None`` if each layer
            will have separate data.
        environment : optional
            Evaluation environment stored on the plot. When falsy,
            ``EvalEnvironment.capture(1)`` is used instead.

        Raises
        ------
        PlotnineError
            If, after the arguments have been put in order, `data` is
            neither ``None`` nor a dataframe.
        """
        # Allow some sloppiness: when the first argument is not an aes
        # the two arguments are assumed to be reversed. The exception is
        # ggplot(data=df) / ggplot(None, df), which is already in order;
        # swapping there would store the dataframe as the mapping.
        already_ordered = mapping is None and isinstance(data, pd.DataFrame)
        if not already_ordered and not isinstance(mapping, aes):
            mapping, data = data, mapping
        if mapping is None:
            mapping = aes()

        if (data is not None and not isinstance(data, pd.DataFrame)):
            raise PlotnineError('data must be a dataframe or None if each '
                                'layer will have separate data.')

        self.data = data
        self.mapping = mapping
        self.facet = facet_null()
        self.labels = make_labels(mapping)
        self.layers = Layers()
        self.guides = guides()
        self.scales = Scales()
        self.theme = None
        self.coordinates = coord_cartesian()
        # The capture must stay in this frame: depth 1 is counted from
        # here.
        self.environment = environment or EvalEnvironment.capture(1)
        self.layout = None
        self.figure = None
Example #28
0
File: desc.py Project: noelhx/patsy
def test_ModelDesc_from_formula():
    """Check ModelDesc.from_formula for a formula string and a parse tree.

    Both spellings of "y ~ x" must give ``y`` on the left-hand side and
    the intercept followed by ``x`` on the right-hand side.
    """
    formula_likes = ("y ~ x", parse_formula("y ~ x"))
    for formula_like in formula_likes:
        env = EvalEnvironment.capture(0)
        desc = ModelDesc.from_formula(formula_like, env)
        lhs_expected = [Term([EvalFactor("y")])]
        rhs_expected = [INTERCEPT, Term([EvalFactor("x")])]
        assert desc.lhs_termlist == lhs_expected
        assert desc.rhs_termlist == rhs_expected
Example #29
0
    def subset(self, which_terms):
        """Build a new :class:`DesignMatrixBuilder` restricted to some of
        this builder's terms.

        Suppose `builder` has the terms `x`, `y`, and `z`. Then::

          builder2 = builder.subset(["x", "z"])

        gives a builder whose design matrices only hold the columns that
        belong to `x` and `z`. Assuming each of `x`, `y`, and `z` produces
        exactly one output column, the following two expressions will in
        general yield the same result::

          build_design_matrix([builder], data)[0][:, [0, 2]]
          build_design_matrix([builder2], data)[0]

        The important difference is that the second form does not need
        `data` to contain any values for `y`. That is handy when
        predicting from part of a model, a situation in which R usually
        makes you supply dummy values for `y`.

        When the terms are given as a formula, keep in mind that, as with
        any formula, the intercept term is included by default; put `0`
        or `-1` in the formula to leave it out.

        :arg which_terms: The terms to keep in the new
          :class:`DesignMatrixBuilder`. A string is parsed as a formula
          and the names of the resulting terms are used. A list may mix
          term names (as strings) and :class:`Term` objects.

        .. versionadded:: 0.2.0
        """
        evaluator_by_factor = dict((evaluator.factor, evaluator)
                                   for evaluator in self._evaluators)
        info = self.design_info
        term_by_name = dict(zip(info.term_names, info.terms))

        if isinstance(which_terms, basestring):
            # The EvalEnvironment below is only a placeholder that lets
            # the ModelDesc be created. Matching terms with == on Term
            # objects is not an option, because that calls == on the
            # factor objects, which in turn compares EvalEnvironments. So
            # only the term *names* are taken from the parsed formula,
            # and those do not depend on the EvalEnvironment.
            placeholder_env = EvalEnvironment({})
            parsed = ModelDesc.from_formula(which_terms, placeholder_env)
            if parsed.lhs_termlist:
                raise PatsyError("right-hand-side-only formula required")
            which_terms = [parsed_term.name()
                           for parsed_term in parsed.rhs_termlist]

        kept_terms = []
        kept_evaluators = set()
        kept_column_builders = {}
        for requested in which_terms:
            if not isinstance(requested, basestring):
                term = requested
            elif requested in term_by_name:
                term = term_by_name[requested]
            else:
                raise PatsyError("requested term %r not found in "
                                 "this DesignMatrixBuilder" %
                                 (requested, ))
            if term not in self._termlist:
                raise PatsyError("requested term '%s' not found in this "
                                 "DesignMatrixBuilder" % (term, ))
            for factor in term.factors:
                kept_evaluators.add(evaluator_by_factor[factor])
            kept_terms.append(term)
            kept_column_builders[term] = self._term_to_column_builders[term]
        return DesignMatrixBuilder(kept_terms, kept_evaluators,
                                   kept_column_builders)
Example #30
0
    def compute_aesthetics(self, plot):
        """
        Build the dataframe whose columns are the mapped aesthetics
        and store it (with groups added) in ``self.data``.

        Expressions such as 'factor(cyl)' are evaluated here.
        """
        data = self.data
        aesthetics = self.layer_mapping(plot.mapping)

        # A group set on the layer's geom takes precedence.
        with suppress(KeyError):
            aesthetics['group'] = self.geom.aes_params['group']

        env = EvalEnvironment.capture(
            eval_env=plot.environment
        ).with_outer_namespace({'factor': pd.Categorical})

        # Going through `type` keeps any pd.DataFrame subclass
        evaled = type(data)(index=data.index)

        for ae, col in aesthetics.items():
            if isinstance(col, six.string_types):
                if col in data:
                    # Plain column name
                    evaled[ae] = data[col]
                    continue

                # Not a column: evaluate/transform it in the
                # environment of the call to ggplot
                try:
                    result = env.eval(col, inner_namespace=data)
                except Exception as err:
                    raise PlotnineError(
                        _TPL_EVAL_FAIL.format(ae, col, str(err)))

                try:
                    evaled[ae] = result
                except Exception as err:
                    raise PlotnineError(
                        _TPL_BAD_EVAL_TYPE.format(
                            ae, col, str(type(result)), str(err)))
                continue

            if pdtypes.is_list_like(col):
                size = len(col)
                if len(data) and size != len(data) and size != 1:
                    raise PlotnineError(
                        "Aesthetics must either be length one, "
                        "or the same length as the data")
                # An empty dataframe does not admit a scalar value
                if len(evaled) and size == 1:
                    col = col[0]
                evaled[ae] = col
                continue

            if not is_known_scalar(col):
                template = "Do not know how to deal with aesthetic '{}'"
                raise PlotnineError(template.format(ae))

            # Wrap the scalar when the dataframe is still empty
            evaled[ae] = col if len(evaled) else [col]

        evaled_aes = aes(**{name: name for name in evaled})
        plot.scales.add_defaults(evaled, evaled_aes)

        # With no data but vectors supplied to the aesthetics,
        # everything goes into panel 1
        vectors_only = len(data) == 0 and len(evaled) > 0
        evaled['PANEL'] = 1 if vectors_only else data['PANEL']

        self.data = add_group(evaled)
Example #31
0
def test_eval_formula_error_reporting():
    """Run the shared parsing-error checks against ModelDesc.from_formula."""
    from patsy.parse_formula import _parsing_error_test

    def parse_fn(formula):
        # Depth 0 is this helper's own frame, as it was for the lambda.
        return ModelDesc.from_formula(formula, EvalEnvironment.capture(0))

    _parsing_error_test(parse_fn, _eval_error_tests)
Example #32
0
def test_formula_likes():
    """Exercise the helpers ``t`` and ``t_invalid`` with many formula-likes.

    The inputs covered are plain array-likes, pandas objects, foreign
    model sources, formula strings, ModelDesc objects, builders, and
    finally the depth / EvalEnvironment argument.

    NOTE(review): ``t`` and ``t_invalid`` are defined outside this block.
    The calls below look like ``t(formula_like, data, depth,
    expect_builders, rhs_values, rhs_names[, lhs_values, lhs_names])``
    -- confirm against their definitions.
    """
    # Plain array-like, rhs only
    t([[1, 2, 3], [4, 5, 6]], {}, 0, False, [[1, 2, 3], [4, 5, 6]],
      ["x0", "x1", "x2"])
    t((None, [[1, 2, 3], [4, 5, 6]]), {}, 0, False, [[1, 2, 3], [4, 5, 6]],
      ["x0", "x1", "x2"])
    t(np.asarray([[1, 2, 3], [4, 5, 6]]), {}, 0, False, [[1, 2, 3], [4, 5, 6]],
      ["x0", "x1", "x2"])
    t((None, np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0, False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    dm = DesignMatrix([[1, 2, 3], [4, 5, 6]], default_column_prefix="foo")
    t(dm, {}, 0, False, [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"])
    t((None, dm), {}, 0, False, [[1, 2, 3], [4, 5, 6]],
      ["foo0", "foo1", "foo2"])

    # Plain array-likes, lhs and rhs
    t(([1, 2], [[1, 2, 3], [4, 5, 6]]), {}, 0, False, [[1, 2, 3], [4, 5, 6]],
      ["x0", "x1", "x2"], [[1], [2]], ["y0"])
    t(([[1], [2]], [[1, 2, 3], [4, 5, 6]]), {}, 0, False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"], [[1], [2]], ["y0"])
    t((np.asarray([1, 2]), np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0, False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"], [[1], [2]], ["y0"])
    t((np.asarray([[1], [2]]), np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0,
      False, [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"], [[1], [2]], ["y0"])
    x_dm = DesignMatrix([[1, 2, 3], [4, 5, 6]], default_column_prefix="foo")
    y_dm = DesignMatrix([1, 2], default_column_prefix="bar")
    t((y_dm, x_dm), {}, 0, False, [[1, 2, 3], [4, 5, 6]],
      ["foo0", "foo1", "foo2"], [[1], [2]], ["bar0"])
    # number of rows must match
    t_invalid(([1, 2, 3], [[1, 2, 3], [4, 5, 6]]), {}, 0)

    # tuples must have the right size
    t_invalid(([[1, 2, 3]], ), {}, 0)
    t_invalid(([[1, 2, 3]], [[1, 2, 3]], [[1, 2, 3]]), {}, 0)

    # plain Series and DataFrames
    if have_pandas:
        # Names are extracted
        t(pandas.DataFrame({"x": [1, 2, 3]}), {}, 0, False, [[1], [2], [3]],
          ["x"])
        t(pandas.Series([1, 2, 3], name="asdf"), {}, 0, False, [[1], [2], [3]],
          ["asdf"])
        t((pandas.DataFrame({"y": [4, 5, 6]
                             }), pandas.DataFrame({"x": [1, 2, 3]})), {}, 0,
          False, [[1], [2], [3]], ["x"], [[4], [5], [6]], ["y"])
        t((pandas.Series([4, 5, 6],
                         name="y"), pandas.Series([1, 2, 3], name="x")), {}, 0,
          False, [[1], [2], [3]], ["x"], [[4], [5], [6]], ["y"])
        # Or invented
        t((pandas.DataFrame([[4, 5, 6]]),
           pandas.DataFrame([[1, 2, 3]], columns=[7, 8, 9])), {}, 0, False,
          [[1, 2, 3]], ["x7", "x8", "x9"], [[4, 5, 6]], ["y0", "y1", "y2"])
        t(pandas.Series([1, 2, 3]), {}, 0, False, [[1], [2], [3]], ["x0"])
        # indices must match
        t_invalid((pandas.DataFrame(
            [[1]], index=[1]), pandas.DataFrame([[1]], index=[2])), {}, 0)

    # Foreign ModelDesc factories
    class ForeignModelSource(object):
        # Supplies its own ModelDesc through __patsy_get_model_desc__.
        def __patsy_get_model_desc__(self, data):
            return ModelDesc([Term([LookupFactor("Y")])],
                             [Term([LookupFactor("X")])])

    foreign_model = ForeignModelSource()
    t(foreign_model, {
        "Y": [1, 2],
        "X": [[1, 2], [3, 4]]
    }, 0, True, [[1, 2], [3, 4]], ["X[0]", "X[1]"], [[1], [2]], ["Y"])

    class BadForeignModelSource(object):
        # Returns the data argument instead of a ModelDesc; the call
        # below expects this to be rejected.
        def __patsy_get_model_desc__(self, data):
            return data

    t_invalid(BadForeignModelSource(), {}, 0)

    # string formulas
    t("y ~ x", {
        "y": [1, 2],
        "x": [3, 4]
    }, 0, True, [[1, 3], [1, 4]], ["Intercept", "x"], [[1], [2]], ["y"])
    t("~ x", {
        "y": [1, 2],
        "x": [3, 4]
    }, 0, True, [[1, 3], [1, 4]], ["Intercept", "x"])
    t("x + y", {
        "y": [1, 2],
        "x": [3, 4]
    }, 0, True, [[1, 3, 1], [1, 4, 2]], ["Intercept", "x", "y"])

    # ModelDesc
    desc = ModelDesc([], [Term([LookupFactor("x")])])
    t(desc, {"x": [1.5, 2.5, 3.5]}, 0, True, [[1.5], [2.5], [3.5]], ["x"])
    desc = ModelDesc([], [Term([]), Term([LookupFactor("x")])])
    t(desc, {"x": [1.5, 2.5, 3.5]}, 0, True, [[1, 1.5], [1, 2.5], [1, 3.5]],
      ["Intercept", "x"])
    desc = ModelDesc([Term([LookupFactor("y")])],
                     [Term([]), Term([LookupFactor("x")])])
    t(desc, {
        "x": [1.5, 2.5, 3.5],
        "y": [10, 20, 30]
    }, 0, True, [[1, 1.5], [1, 2.5], [1, 3.5]], ["Intercept", "x"],
      [[10], [20], [30]], ["y"])

    # builders
    termlists = (
        [],
        [Term([LookupFactor("x")])],
        [Term([]), Term([LookupFactor("x")])],
    )
    builders = design_matrix_builders(termlists, lambda: iter([{
        "x": [1, 2, 3]
    }]))
    # twople but with no LHS
    t((builders[0], builders[2]), {"x": [10, 20, 30]}, 0, True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"])
    # single DesignMatrixBuilder
    t(builders[2], {"x": [10, 20, 30]}, 0, True, [[1, 10], [1, 20], [1, 30]],
      ["Intercept", "x"])
    # twople with LHS
    t((builders[1], builders[2]), {"x": [10, 20, 30]}, 0, True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"], [[10], [20], [30]],
      ["x"])

    # check depth arguments
    # x_in_env is never read by name in this function; the formulas below
    # refer to it, so keep it as a local of this frame.
    x_in_env = [1, 2, 3]
    t("~ x_in_env", {}, 0, True, [[1, 1], [1, 2], [1, 3]],
      ["Intercept", "x_in_env"])
    t("~ x_in_env", {"x_in_env": [10, 20, 30]}, 0, True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x_in_env"])
    # Trying to pull x_in_env out of our *caller* shouldn't work.
    t_invalid("~ x_in_env", {}, 1, exc=(NameError, PatsyError))

    # But then again it should, if called from one down on the stack:
    def check_nested_call():
        # This local "asdf" is a decoy: the expected values below are
        # still [1, 2, 3], i.e. the enclosing function's x_in_env.
        x_in_env = "asdf"
        t("~ x_in_env", {}, 1, True, [[1, 1], [1, 2], [1, 3]],
          ["Intercept", "x_in_env"])

    check_nested_call()
    # passing in an explicit EvalEnvironment also works:
    e = EvalEnvironment.capture(1)
    t_invalid("~ x_in_env", {}, e, exc=(NameError, PatsyError))
    e = EvalEnvironment.capture(0)

    def check_nested_call_2():
        # Decoy again: `e` was captured in the enclosing function, and the
        # expected values are that function's [1, 2, 3].
        x_in_env = "asdf"
        t("~ x_in_env", {}, e, True, [[1, 1], [1, 2], [1, 3]],
          ["Intercept", "x_in_env"])

    check_nested_call_2()
Example #33
0
File: desc.py Project: noelhx/patsy
def test_eval_formula_error_reporting():
    """Feed the formula error cases through ModelDesc.from_formula."""
    from patsy.parse_formula import _parsing_error_test

    def parse_fn(formula):
        # Capture at depth 0, i.e. this helper's own frame, exactly as
        # the lambda form did.
        env = EvalEnvironment.capture(0)
        return ModelDesc.from_formula(formula, env)

    _parsing_error_test(parse_fn, _eval_error_tests)
Example #34
0
def design_matrix_builders(termlists, data_iter_maker, eval_env,
                           NA_action="drop"):
    """Build one :class:`DesignMatrixBuilder` per termlist.

    One of Patsy's fundamental functions. Together with
    :func:`build_design_matrices` it makes up the API to the core formula
    interpretation machinery.

    :arg termlists: A list of termlists. Each termlist is a list of
      :class:`Term` objects that jointly specify one design matrix.
    :arg data_iter_maker: A callable taking no arguments and returning an
      iterator over dict-like data objects. A callable is required, not a
      plain iterator, because sufficiently complex formulas (e.g. nested
      stateful transforms) may need several passes over the data.
    :arg eval_env: Either an :class:`EvalEnvironment` used to look up
      variables referenced in `termlists` that are not found in
      `data_iter_maker`, or an integer depth that is handed to
      :meth:`EvalEnvironment.capture`. ``eval_env=0`` uses the context of
      the function calling :func:`design_matrix_builders`. Library code
      calling this function probably wants ``eval_env=1``, so that
      variables are resolved in *its* caller's namespace.
    :arg NA_action: An :class:`NAAction` object or a string. It decides
      which values count as 'missing' when the levels of categorical
      factors are determined.
    :returns: A list of :class:`DesignMatrixBuilder` objects, one for each
      termlist passed in.

    Zero or more iterations over the data are performed to find out about
    factor types, set up stateful transforms, pick column names, etc.

    See :ref:`formulas` for details.

    .. versionadded:: 0.2.0
       The ``NA_action`` argument.
    .. versionadded:: 0.4.0
       The ``eval_env`` argument.
    """
    # Code written for versions before 0.4.0 may have passed NA_action as
    # the 3rd positional argument. EvalEnvironment.capture only accepts
    # int and EvalEnvironment objects, and its error messages were
    # improved to make that clear.
    eval_env = EvalEnvironment.capture(eval_env, reference=1)
    if isinstance(NA_action, str):
        NA_action = NAAction(NA_action)

    all_factors = {factor
                   for termlist in termlists
                   for term in termlist
                   for factor in term.factors}
    factor_states = _factors_memorize(all_factors, data_iter_maker, eval_env)

    # Every factor now has a working eval method, so the factors can be
    # evaluated on some data to learn what type of data they return.
    factor_types = _examine_factor_types(all_factors,
                                         factor_states,
                                         data_iter_maker,
                                         NA_action)
    num_column_counts, cat_levels_contrasts = factor_types

    # Factor evaluators encapsulate the knowledge of how to turn a given
    # factor into a chunk of data.
    factor_evaluators = {}
    for factor in all_factors:
        if factor not in num_column_counts:
            assert factor in cat_levels_contrasts
            levels = cat_levels_contrasts[factor][0]
            factor_evaluators[factor] = _CatFactorEvaluator(
                factor, factor_states[factor], levels)
        else:
            factor_evaluators[factor] = _NumFactorEvaluator(
                factor, factor_states[factor], num_column_counts[factor])

    # Finally, one DesignMatrixBuilder per termlist.
    builders = []
    for termlist in termlists:
        new_term_order, term_to_column_builders = _make_term_column_builders(
            termlist, num_column_counts, cat_levels_contrasts)
        assert frozenset(new_term_order) == frozenset(termlist)
        term_evaluators = {factor_evaluators[factor]
                           for term in termlist
                           for factor in term.factors}
        builder = DesignMatrixBuilder(new_term_order,
                                      term_evaluators,
                                      term_to_column_builders)
        builders.append(builder)
    return builders
Example #35
0
def qplot(x=None, y=None, data=None, facets=None, margins=False,
          geom='auto', xlim=None, ylim=None, log='', main=None,
          xlab=None, ylab=None, asp=None, **kwargs):
    """
    Quick plot

    Parameters
    ----------
    x : str | array_like
        x aesthetic
    y : str | array_like
        y aesthetic
    data : dataframe
        Data frame to use (optional). If not specified,
        will create one, extracting arrays from the
        current environment.
    facets : optional
        Faceting specification. It is tried as a grid
        specification first and then as a wrap specification.
        If neither parses, a warning is issued and the plot
        is not faceted.
    margins : bool
        Passed to ``facet_grid`` when ``facets`` is a grid
        specification.
    geom : str | list
        *geom(s)* to do the drawing. If ``auto``, defaults
        to 'point' if ``x`` and ``y`` are specified or
        'histogram' if only ``x`` is specified. The object
        passed in is not modified.
    xlim : tuple
        x-axis limits.
        NOTE(review): accepted but not referenced anywhere
        in this function body.
    ylim : tuple
        y-axis limits.
        NOTE(review): accepted but not referenced anywhere
        in this function body.
    log : str in ``{'x', 'y', 'xy'}``
        Which variables to log transform.
    main : str
        Plot title
    xlab : str
        x-axis label
    ylab : str
        y-axis label
    asp : str | float
        The y/x aspect ratio.
    **kwargs : dict
        Arguments passed on to the geom.

    Returns
    -------
    p: ggplot
        ggplot object
    """
    # Extract all recognizable aesthetic mappings from the parameters
    # String values e.g  "I('red')", "I(4)" are not treated as mappings

    # Captured directly in this frame so that depth 1 is qplot's caller
    environment = EvalEnvironment.capture(1)
    aesthetics = {} if x is None else {'x': x}
    if y is not None:
        aesthetics['y'] = y

    def is_mapping(value):
        """
        Return True if value is not enclosed in I() function
        """
        # Non-strings have no startswith and count as mappings
        with suppress(AttributeError):
            return not (value.startswith('I(') and value.endswith(')'))
        return True

    def I(value):
        return value

    I_env = EvalEnvironment([{'I': I}])

    for ae in six.viewkeys(kwargs) & all_aesthetics:
        value = kwargs[ae]
        if is_mapping(value):
            aesthetics[ae] = value
        else:
            kwargs[ae] = I_env.eval(value)

    # List of geoms. Always work on a private list: replace_auto
    # below edits it in place and must not alter the caller's object.
    if is_string(geom):
        geom = [geom]
    else:
        geom = list(geom)

    if data is None:
        data = pd.DataFrame()

    # Work out plot data, and modify aesthetics, if necessary
    def replace_auto(lst, str2):
        """
        Replace all occurrences of 'auto' in lst with str2
        """
        for i, value in enumerate(lst):
            if value == 'auto':
                lst[i] = str2
        return lst

    if 'auto' in geom:
        if 'sample' in aesthetics:
            replace_auto(geom, 'qq')
        elif y is None:
            # If x is discrete we choose geom_bar &
            # geom_histogram otherwise. But we need to
            # evaluate the mapping to find out the dtype
            env = environment.with_outer_namespace(
                {'factor': pd.Categorical})

            if isinstance(aesthetics['x'], six.string_types):
                try:
                    x = env.eval(aesthetics['x'], inner_namespace=data)
                except Exception:
                    msg = "Could not evaluate aesthetic 'x={}'"
                    raise PlotnineError(msg.format(aesthetics['x']))
            elif not hasattr(aesthetics['x'], 'dtype'):
                x = np.asarray(aesthetics['x'])

            if array_kind.discrete(x):
                replace_auto(geom, 'bar')
            else:
                replace_auto(geom, 'histogram')

        else:
            if x is None:
                if pdtypes.is_list_like(aesthetics['y']):
                    aesthetics['x'] = range(len(aesthetics['y']))
                    xlab = 'range(len(y))'
                    ylab = 'y'
                else:
                    # We could solve the issue in layer.compute_aesthetics
                    # but it is not worth the extra complexity
                    raise PlotnineError(
                        "Cannot infer how long x should be.")
            replace_auto(geom, 'point')

    p = ggplot(aes(**aesthetics), data=data, environment=environment)

    def get_facet_type(facets):
        with suppress(PlotnineError):
            parse_grid_facets(facets)
            return 'grid'

        with suppress(PlotnineError):
            parse_wrap_facets(facets)
            return 'wrap'

        warn("Could not determine the type of faceting, "
             "therefore no faceting.")
        return 'null'

    if facets:
        facet_type = get_facet_type(facets)
        if facet_type == 'grid':
            p += facet_grid(facets, margins=margins)
        elif facet_type == 'wrap':
            p += facet_wrap(facets)
        else:
            p += facet_null()

    # Add geoms
    for g in geom:
        geom_name = 'geom_{}'.format(g)
        geom_klass = Registry[geom_name]
        stat_name = 'stat_{}'.format(geom_klass.DEFAULT_PARAMS['stat'])
        stat_klass = Registry[stat_name]
        # find params
        recognized = (six.viewkeys(kwargs) &
                      (six.viewkeys(geom_klass.DEFAULT_PARAMS) |
                       geom_klass.aesthetics() |
                       six.viewkeys(stat_klass.DEFAULT_PARAMS) |
                       stat_klass.aesthetics()))
        # Aesthetics that became mappings are not passed as parameters
        recognized = recognized - six.viewkeys(aesthetics)
        params = {ae: kwargs[ae] for ae in recognized}
        p += geom_klass(**params)

    # pd.Series objects have name attributes. In a dataframe, the
    # series have the name of the column.
    labels = {}
    for ae in scaled_aesthetics & six.viewkeys(kwargs):
        with suppress(AttributeError):
            labels[ae] = kwargs[ae].name

    with suppress(AttributeError):
        labels['x'] = xlab if xlab is not None else x.name

    with suppress(AttributeError):
        labels['y'] = ylab if ylab is not None else y.name

    if main is not None:
        labels['title'] = main

    if 'x' in log:
        p += scale_x_log10()

    if 'y' in log:
        p += scale_y_log10()

    if labels:
        p += labs(**labels)

    if asp:
        p += theme(aspect_ratio=asp)

    return p
Example #36
0
def _get_env(eval_env):
    if isinstance(eval_env, int):
        # Here eval_env=0 refers to our caller's caller.
        return EvalEnvironment.capture(eval_env + 2)
    return eval_env
Example #37
0
def qplot(x=None,
          y=None,
          data=None,
          facets=None,
          margins=False,
          geom='auto',
          xlim=None,
          ylim=None,
          log='',
          main=None,
          xlab=None,
          ylab=None,
          asp=None,
          **kwargs):
    """
    Quick plot

    Parameters
    ----------
    x : str | array_like
        x aesthetic
    y : str | array_like
        y aesthetic
    data : dataframe
        Data frame to use (optional). If not specified,
        will create one, extracting arrays from the
        current environment.
    facets : optional
        Faceting specification. It is tried as a grid
        specification first and then as a wrap specification.
        If neither parses, a ``PlotnineWarning`` is issued and
        the plot is not faceted.
    margins : bool
        Passed to ``facet_grid`` when ``facets`` is a grid
        specification.
    geom : str | list
        *geom(s)* to do the drawing. If ``auto``, defaults
        to 'point' if ``x`` and ``y`` are specified or
        'histogram' if only ``x`` is specified. The object
        passed in is not modified.
    xlim : tuple
        x-axis limits.
        NOTE(review): accepted but not referenced anywhere
        in this function body.
    ylim : tuple
        y-axis limits.
        NOTE(review): accepted but not referenced anywhere
        in this function body.
    log : str in ``{'x', 'y', 'xy'}``
        Which variables to log transform.
    main : str
        Plot title
    xlab : str
        x-axis label
    ylab : str
        y-axis label
    asp : str | float
        The y/x aspect ratio.
    **kwargs : dict
        Arguments passed on to the geom.

    Returns
    -------
    p: ggplot
        ggplot object
    """
    # Extract all recognizable aesthetic mappings from the parameters
    # String values e.g  "I('red')", "I(4)" are not treated as mappings

    # Captured directly in this frame so that depth 1 is qplot's caller
    environment = EvalEnvironment.capture(1)
    aesthetics = {} if x is None else {'x': x}
    if y is not None:
        aesthetics['y'] = y

    def is_mapping(value):
        """
        Return True if value is not enclosed in I() function
        """
        # Non-strings have no startswith and count as mappings
        with suppress(AttributeError):
            return not (value.startswith('I(') and value.endswith(')'))
        return True

    def I(value):
        return value

    I_env = EvalEnvironment([{'I': I}])

    for ae in kwargs.keys() & all_aesthetics:
        value = kwargs[ae]
        if is_mapping(value):
            aesthetics[ae] = value
        else:
            kwargs[ae] = I_env.eval(value)

    # List of geoms. Always work on a private list: replace_auto
    # below edits it in place and must not alter the caller's object.
    if is_string(geom):
        geom = [geom]
    else:
        geom = list(geom)

    if data is None:
        data = pd.DataFrame()

    # Work out plot data, and modify aesthetics, if necessary
    def replace_auto(lst, str2):
        """
        Replace all occurrences of 'auto' in lst with str2
        """
        for i, value in enumerate(lst):
            if value == 'auto':
                lst[i] = str2
        return lst

    if 'auto' in geom:
        if 'sample' in aesthetics:
            replace_auto(geom, 'qq')
        elif y is None:
            # If x is discrete we choose geom_bar &
            # geom_histogram otherwise. But we need to
            # evaluate the mapping to find out the dtype
            env = environment.with_outer_namespace({'factor': pd.Categorical})

            if isinstance(aesthetics['x'], str):
                try:
                    x = env.eval(aesthetics['x'], inner_namespace=data)
                except Exception:
                    msg = "Could not evaluate aesthetic 'x={}'"
                    raise PlotnineError(msg.format(aesthetics['x']))
            elif not hasattr(aesthetics['x'], 'dtype'):
                x = np.asarray(aesthetics['x'])

            if array_kind.discrete(x):
                replace_auto(geom, 'bar')
            else:
                replace_auto(geom, 'histogram')

        else:
            if x is None:
                if pdtypes.is_list_like(aesthetics['y']):
                    aesthetics['x'] = range(len(aesthetics['y']))
                    xlab = 'range(len(y))'
                    ylab = 'y'
                else:
                    # We could solve the issue in layer.compute_aesthetics
                    # but it is not worth the extra complexity
                    raise PlotnineError("Cannot infer how long x should be.")
            replace_auto(geom, 'point')

    p = ggplot(aes(**aesthetics), data=data, environment=environment)

    def get_facet_type(facets):
        with suppress(PlotnineError):
            parse_grid_facets(facets)
            return 'grid'

        with suppress(PlotnineError):
            parse_wrap_facets(facets)
            return 'wrap'

        warn(
            "Could not determine the type of faceting, "
            "therefore no faceting.", PlotnineWarning)
        return 'null'

    if facets:
        facet_type = get_facet_type(facets)
        if facet_type == 'grid':
            p += facet_grid(facets, margins=margins)
        elif facet_type == 'wrap':
            p += facet_wrap(facets)
        else:
            p += facet_null()

    # Add geoms
    for g in geom:
        geom_name = 'geom_{}'.format(g)
        geom_klass = Registry[geom_name]
        stat_name = 'stat_{}'.format(geom_klass.DEFAULT_PARAMS['stat'])
        stat_klass = Registry[stat_name]
        # find params
        recognized = (
            kwargs.keys() &
            (geom_klass.DEFAULT_PARAMS.keys() | geom_klass.aesthetics()
             | stat_klass.DEFAULT_PARAMS.keys() | stat_klass.aesthetics()))
        # Aesthetics that became mappings are not passed as parameters
        recognized = recognized - aesthetics.keys()
        params = {ae: kwargs[ae] for ae in recognized}
        p += geom_klass(**params)

    # pd.Series objects have name attributes. In a dataframe, the
    # series have the name of the column.
    labels = {}
    for ae in scaled_aesthetics & kwargs.keys():
        with suppress(AttributeError):
            labels[ae] = kwargs[ae].name

    with suppress(AttributeError):
        labels['x'] = xlab if xlab is not None else x.name

    with suppress(AttributeError):
        labels['y'] = ylab if ylab is not None else y.name

    if main is not None:
        labels['title'] = main

    if 'x' in log:
        p += scale_x_log10()

    if 'y' in log:
        p += scale_y_log10()

    if labels:
        p += labs(**labels)

    if asp:
        p += theme(aspect_ratio=asp)

    return p
Example #38
0
    def compute_aesthetics(self, plot):
        """
        Evaluate the aesthetic mappings into a dataframe with one
        column per aesthetic, then store it (with groups added) in
        ``self.data``.

        Expressions such as 'factor(cyl)' are evaluated here.
        """
        data = self.data
        aesthetics = self.layer_mapping(plot.mapping)

        # A group set on the layer's geom takes precedence.
        with suppress(KeyError):
            aesthetics['group'] = self.geom.aes_params['group']

        captured = EvalEnvironment.capture(eval_env=plot.environment)
        env = captured.with_outer_namespace({'factor': pd.Categorical})

        # Going through `type` keeps any pd.DataFrame subclass
        frame_type = type(data)
        evaled = frame_type(index=data.index)

        for ae, col in aesthetics.items():
            if isinstance(col, str):
                if col in data:
                    # Plain column name
                    evaled[ae] = data[col]
                    continue

                # Not a column: evaluate/transform it in the
                # environment of the call to ggplot
                try:
                    outcome = env.eval(col, inner_namespace=data)
                except Exception as exc:
                    raise PlotnineError(
                        _TPL_EVAL_FAIL.format(ae, col, str(exc)))

                try:
                    evaled[ae] = outcome
                except Exception as exc:
                    raise PlotnineError(
                        _TPL_BAD_EVAL_TYPE.format(
                            ae, col, str(type(outcome)), str(exc)))
                continue

            if pdtypes.is_list_like(col):
                length = len(col)
                if len(data) and length != len(data) and length != 1:
                    raise PlotnineError(
                        "Aesthetics must either be length one, "
                        "or the same length as the data")
                # An empty dataframe does not admit a scalar value
                if len(evaled) and length == 1:
                    col = col[0]
                evaled[ae] = col
                continue

            if not is_known_scalar(col):
                raise PlotnineError(
                    "Do not know how to deal with aesthetic '{}'".format(ae))

            # Wrap the scalar when the dataframe is still empty
            if not len(evaled):
                col = [col]
            evaled[ae] = col

        identity_mapping = {name: name for name in evaled}
        plot.scales.add_defaults(evaled, aes(**identity_mapping))

        # With no data but vectors supplied to the aesthetics,
        # everything goes into panel 1
        if len(data) == 0 and len(evaled) > 0:
            panel = 1
        else:
            panel = data['PANEL']
        evaled['PANEL'] = panel

        self.data = add_group(evaled)
Example #39
0
def design_matrix_builders(termlists, data_iter_maker, eval_env,
                           NA_action="drop"):
    """Build one :class:`DesignInfo` per termlist.

    Together with :func:`build_design_matrices`, this function is the entry
    point to Patsy's core formula interpretation machinery.

    :arg termlists: A list of termlists. Each termlist is a list of
      :class:`Term` objects that jointly describe one design matrix.
    :arg data_iter_maker: A callable taking no arguments and returning an
      iterator over dict-like data objects. A callable (rather than a bare
      iterator) is required because sufficiently complex formulas may need
      several passes over the data (e.g. nested stateful transforms).
    :arg eval_env: Either an :class:`EvalEnvironment` used to look up
      variables referenced in `termlists` that are not found in
      `data_iter_maker`, or an integer depth that is handed to
      :meth:`EvalEnvironment.capture`. ``eval_env=0`` means "use the context
      of the function calling :func:`design_matrix_builders`". Library code
      calling this function most likely wants ``eval_env=1`` so that
      variables are resolved in *its* caller's namespace.
    :arg NA_action: An :class:`NAAction` object or a string; it decides which
      values count as 'missing' when the levels of categorical factors are
      determined.
    :returns: A list of :class:`DesignInfo` objects, one per termlist, in the
      same order as `termlists`.

    Zero or more passes are made over the data to discover factor types, set
    up stateful transforms, choose column names, and so on.

    See :ref:`formulas` for details.

    .. versionadded:: 0.2.0
       The ``NA_action`` argument.
    .. versionadded:: 0.4.0
       The ``eval_env`` argument.
    """
    # Callers upgrading from a pre-0.4.0 release may still pass NA_action as
    # the third positional argument. EvalEnvironment.capture only accepts
    # ints and EvalEnvironment objects and reports anything else clearly, so
    # that mistake surfaces here.
    env = EvalEnvironment.capture(eval_env, reference=1)
    na_action = NAAction(NA_action) if isinstance(NA_action, str) \
        else NA_action

    # Gather every factor mentioned by any term of any termlist.
    factors = set()
    for terms in termlists:
        for term in terms:
            factors.update(term.factors)

    states = _factors_memorize(factors, data_iter_maker, env)
    # Every factor can now be evaluated, so run them over some data to learn
    # what kind of values each one produces.
    numeric_widths, categorical_info = _examine_factor_types(
        factors, states, data_iter_maker, na_action)

    # A FactorInfo records how a single factor becomes a chunk of data.
    factor_infos = {}
    for factor in factors:
        if factor in numeric_widths:
            kind = "numerical"
            width = numeric_widths[factor]
            levels = None
        else:
            assert factor in cat_levels_contrasts_guard(categorical_info)
            kind = "categorical"
            width = None
            levels = categorical_info[factor][0]
        factor_infos[factor] = FactorInfo(factor,
                                          kind,
                                          states[factor],
                                          num_columns=width,
                                          categories=levels)

    # Finally assemble a DesignInfo for each termlist.
    results = []
    for terms in termlists:
        codings = _make_subterm_infos(terms, numeric_widths, categorical_info)
        assert isinstance(codings, OrderedDict)
        assert frozenset(codings) == frozenset(terms)
        # Only the factors this particular design actually uses.
        used_factor_infos = {
            factor: factor_infos[factor]
            for term in terms
            for factor in term.factors
        }
        names = [
            name
            for subterm_group in six.itervalues(codings)
            for subterm in subterm_group
            for name in _subterm_column_names_iter(factor_infos, subterm)
        ]
        results.append(DesignInfo(names,
                                  factor_infos=used_factor_infos,
                                  term_codings=codings))
    return results


def cat_levels_contrasts_guard(categorical_info):
    """Return `categorical_info` unchanged (membership-check passthrough)."""
    return categorical_info
Example #40
0
def test_formula_likes():
    """Run the ``t`` / ``t_invalid`` helpers over each kind of formula-like.

    Covered, in order: plain array-likes (rhs only, then lhs and rhs),
    pandas Series/DataFrames (when ``have_pandas`` is true), objects with a
    ``__patsy_get_model_desc__`` method, formula strings, :class:`ModelDesc`
    objects, the objects returned by :func:`design_matrix_builders`, and
    finally the depth / :class:`EvalEnvironment` argument.

    NOTE(review): ``t`` and ``t_invalid`` are defined elsewhere; from the call
    sites their arguments appear to be (formula_like, data, eval depth or
    environment, a boolean flag, expected rhs values, expected rhs names
    [, expected lhs values, expected lhs names]) -- confirm against the
    helpers.
    """
    # Plain array-like, rhs only
    t([[1, 2, 3], [4, 5, 6]], {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    t((None, [[1, 2, 3], [4, 5, 6]]), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    t(np.asarray([[1, 2, 3], [4, 5, 6]]), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    t((None, np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    dm = DesignMatrix([[1, 2, 3], [4, 5, 6]], default_column_prefix="foo")
    t(dm, {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"])
    t((None, dm), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"])

    # Plain array-likes, lhs and rhs
    t(([1, 2], [[1, 2, 3], [4, 5, 6]]), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"],
      [[1], [2]], ["y0"])
    t(([[1], [2]], [[1, 2, 3], [4, 5, 6]]), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"],
      [[1], [2]], ["y0"])
    t((np.asarray([1, 2]), np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"],
      [[1], [2]], ["y0"])
    t((np.asarray([[1], [2]]), np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"],
      [[1], [2]], ["y0"])
    x_dm = DesignMatrix([[1, 2, 3], [4, 5, 6]], default_column_prefix="foo")
    y_dm = DesignMatrix([1, 2], default_column_prefix="bar")
    t((y_dm, x_dm), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"],
      [[1], [2]], ["bar0"])
    # number of rows must match
    t_invalid(([1, 2, 3], [[1, 2, 3], [4, 5, 6]]), {}, 0)

    # tuples must have the right size
    t_invalid(([[1, 2, 3]],), {}, 0)
    t_invalid(([[1, 2, 3]], [[1, 2, 3]], [[1, 2, 3]]), {}, 0)

    # plain Series and DataFrames
    if have_pandas:
        # Names are extracted
        t(pandas.DataFrame({"x": [1, 2, 3]}), {}, 0,
          False,
          [[1], [2], [3]], ["x"])
        t(pandas.Series([1, 2, 3], name="asdf"), {}, 0,
          False,
          [[1], [2], [3]], ["asdf"])
        t((pandas.DataFrame({"y": [4, 5, 6]}),
           pandas.DataFrame({"x": [1, 2, 3]})), {}, 0,
          False,
          [[1], [2], [3]], ["x"],
          [[4], [5], [6]], ["y"])
        t((pandas.Series([4, 5, 6], name="y"),
           pandas.Series([1, 2, 3], name="x")), {}, 0,
          False,
          [[1], [2], [3]], ["x"],
          [[4], [5], [6]], ["y"])
        # Or invented
        t((pandas.DataFrame([[4, 5, 6]]),
           pandas.DataFrame([[1, 2, 3]], columns=[7, 8, 9])), {}, 0,
          False,
          [[1, 2, 3]], ["x7", "x8", "x9"],
          [[4, 5, 6]], ["y0", "y1", "y2"])
        t(pandas.Series([1, 2, 3]), {}, 0,
          False,
          [[1], [2], [3]], ["x0"])
        # indices must match
        t_invalid((pandas.DataFrame([[1]], index=[1]),
                   pandas.DataFrame([[1]], index=[2])),
                  {}, 0)

    # Foreign ModelDesc factories
    class ForeignModelSource(object):
        def __patsy_get_model_desc__(self, data):
            return ModelDesc([Term([LookupFactor("Y")])],
                             [Term([LookupFactor("X")])])
    foreign_model = ForeignModelSource()
    t(foreign_model,
      {"Y": [1, 2],
       "X": [[1, 2], [3, 4]]},
      0,
      True,
      [[1, 2], [3, 4]], ["X[0]", "X[1]"],
      [[1], [2]], ["Y"])
    class BadForeignModelSource(object):
        def __patsy_get_model_desc__(self, data):
            # Returns something that is not a ModelDesc, which must be
            # rejected.
            return data
    t_invalid(BadForeignModelSource(), {}, 0)

    # string formulas
    t("y ~ x", {"y": [1, 2], "x": [3, 4]}, 0,
      True,
      [[1, 3], [1, 4]], ["Intercept", "x"],
      [[1], [2]], ["y"])
    t("~ x", {"y": [1, 2], "x": [3, 4]}, 0,
      True,
      [[1, 3], [1, 4]], ["Intercept", "x"])
    t("x + y", {"y": [1, 2], "x": [3, 4]}, 0,
      True,
      [[1, 3, 1], [1, 4, 2]], ["Intercept", "x", "y"])

    # ModelDesc
    desc = ModelDesc([], [Term([LookupFactor("x")])])
    t(desc, {"x": [1.5, 2.5, 3.5]}, 0,
      True,
      [[1.5], [2.5], [3.5]], ["x"])
    desc = ModelDesc([], [Term([]), Term([LookupFactor("x")])])
    t(desc, {"x": [1.5, 2.5, 3.5]}, 0,
      True,
      [[1, 1.5], [1, 2.5], [1, 3.5]], ["Intercept", "x"])
    desc = ModelDesc([Term([LookupFactor("y")])],
                     [Term([]), Term([LookupFactor("x")])])
    t(desc, {"x": [1.5, 2.5, 3.5], "y": [10, 20, 30]}, 0,
      True,
      [[1, 1.5], [1, 2.5], [1, 3.5]], ["Intercept", "x"],
      [[10], [20], [30]], ["y"])

    # builders
    termlists = ([],
                 [Term([LookupFactor("x")])],
                 [Term([]), Term([LookupFactor("x")])],
                 )
    # design_matrix_builders has no default for eval_env, so it must be
    # passed explicitly; 0 means lookups use this function's own context.
    builders = design_matrix_builders(termlists,
                                      lambda: iter([{"x": [1, 2, 3]}]),
                                      eval_env=0)
    # twople but with no LHS
    t((builders[0], builders[2]), {"x": [10, 20, 30]}, 0,
      True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"])
    # single DesignMatrixBuilder
    t(builders[2], {"x": [10, 20, 30]}, 0,
      True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"])
    # twople with LHS
    t((builders[1], builders[2]), {"x": [10, 20, 30]}, 0,
      True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"],
      [[10], [20], [30]], ["x"])

    # check depth arguments
    x_in_env = [1, 2, 3]
    t("~ x_in_env", {}, 0,
      True,
      [[1, 1], [1, 2], [1, 3]], ["Intercept", "x_in_env"])
    t("~ x_in_env", {"x_in_env": [10, 20, 30]}, 0,
      True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x_in_env"])
    # Trying to pull x_in_env out of our *caller* shouldn't work.
    t_invalid("~ x_in_env", {}, 1, exc=(NameError, PatsyError))
    # But then again it should, if called from one down on the stack:
    def check_nested_call():
        x_in_env = "asdf"
        t("~ x_in_env", {}, 1,
          True,
          [[1, 1], [1, 2], [1, 3]], ["Intercept", "x_in_env"])
    check_nested_call()
    # passing in an explicit EvalEnvironment also works:
    e = EvalEnvironment.capture(1)
    t_invalid("~ x_in_env", {}, e, exc=(NameError, PatsyError))
    e = EvalEnvironment.capture(0)
    def check_nested_call_2():
        x_in_env = "asdf"
        t("~ x_in_env", {}, e,
          True,
          [[1, 1], [1, 2], [1, 3]], ["Intercept", "x_in_env"])
    check_nested_call_2()
Example #41
0
def dmatrix(formula_like,
            data={},
            eval_env=0,
            NA_action="drop",
            return_type="matrix"):
    """Build a single design matrix from a formula_like and some data.

    :arg formula_like: An object from which a design matrix can be
      constructed. The accepted forms are listed below.
    :arg data: A dict-like object used to look up the variables that
      `formula_like` refers to.
    :arg eval_env: Either an :class:`EvalEnvironment` used to look up
      variables referenced in `formula_like` that are not found in `data`,
      or an integer depth that is handed to
      :meth:`EvalEnvironment.capture`. ``eval_env=0`` means "use the context
      of the function calling :func:`dmatrix`". Library code calling this
      function most likely wants ``eval_env=1`` so that variables are
      resolved in *its* caller's namespace.
    :arg NA_action: How rows containing missing values are handled: either
      ``"drop"`` them, ``"raise"`` an error, or pass an :class:`NAAction`
      object for customization. See :class:`NAAction` for what counts as
      'missing' (and how to change that).
    :arg return_type: Either ``"matrix"`` or ``"dataframe"``. See below.

    `formula_like` may be any of the following:

    * A formula string such as "x1 + x2" (for :func:`dmatrix`) or
      "y ~ x1 + x2" (for :func:`dmatrices`). See :ref:`formulas`.
    * A :class:`ModelDesc`, the Python object representation of a formula.
      See :ref:`formulas` and :ref:`expert-model-specification`.
    * A :class:`DesignMatrixBuilder`.
    * An object with a method named :meth:`__patsy_get_model_desc__`.
      See :ref:`expert-model-specification`.
    * A numpy array_like (for :func:`dmatrix`) or a tuple
      (array_like, array_like) (for :func:`dmatrices`). These get metadata
      added and their representation normalized, and are then returned
      directly; `data` and `eval_env` are ignored in this case. Two kinds
      of input receive special handling:

      * :class:`DesignMatrix` objects keep their :class:`DesignInfo`, so
        custom column names and term information can be set up without
        using the rest of the patsy machinery.
      * :class:`pandas.DataFrame` or :class:`pandas.Series` objects have
        their (row) indexes checked. If two are passed in, their indexes
        must be aligned. With ``return_type="dataframe"`` those indexes are
        preserved on the output.

    Whatever the input, the return value is always one of:

    * A :class:`DesignMatrix`, if ``return_type="matrix"`` (the default)
    * A :class:`pandas.DataFrame`, if ``return_type="dataframe"``.

    The contents of the design matrix are the same in both cases, and in
    both cases a :class:`DesignInfo` is available as the ``.design_info``
    attribute of the return value. With ``return_type="dataframe"``,
    however, any pandas indexes on the input (whether in `data` or passed
    directly through `formula_like`) are preserved, which can be useful for
    e.g. time-series models.

    :raises PatsyError: if the left-hand side matrix that comes back from
      the design step has any columns.
    """
    # NOTE(review): `data={}` is a shared mutable default. It is only
    # forwarded here; confirm nothing downstream mutates it.
    env = EvalEnvironment.capture(eval_env, reference=1)
    outcome, predictors = _do_highlevel_design(
        formula_like, data, env, NA_action, return_type)
    # A zero-column left-hand side means no outcome variables were given,
    # which is the only acceptable situation for a single design matrix.
    if outcome.shape[1] == 0:
        return predictors
    raise PatsyError("encountered outcome variables for a model "
                     "that does not expect them")