Esempio n. 1
0
def python_tokenize(code):
    """Yield ``(pytype, string, origin)`` triples for a Python expression.

    Newlines in `code` are replaced by spaces before tokenizing. Comment
    tokens and tokenizer error tokens are reported as :class:`PatsyError`.
    A ``tokenize.TokenError`` silently ends the stream.
    """
    # Formulas hold only Python expressions, and those cannot meaningfully
    # span several lines, so flatten the text first and sidestep the issue:
    flattened = code.replace("\n", " ").strip()
    token_stream = tokenize.generate_tokens(StringIO(flattened).readline)
    try:
        for pytype, string, (_, start), (_, end), line in token_stream:
            if pytype == tokenize.ENDMARKER:
                # Normal end of input.
                return
            origin = Origin(line, start, end)
            assert pytype != tokenize.NL
            if pytype == tokenize.NEWLINE:
                assert string == ""
            elif pytype == tokenize.ERRORTOKEN:
                raise PatsyError("error tokenizing input "
                                 "(maybe an unclosed string?)",
                                 origin)
            elif pytype == tokenize.COMMENT:
                raise PatsyError("comments are not allowed", origin)
            else:
                yield (pytype, string, origin)
        # Only reachable if the tokenizer stops without an ENDMARKER.
        raise ValueError("stream ended without ENDMARKER?!?")  # pragma: no cover
    except tokenize.TokenError as e:
        # The tokenizer raises TokenError when it believes a multi-line
        # construct is still open (e.g. an unclosed parenthesis, which lets
        # a virtual line run past the end of the physical one) and it then
        # runs out of source text. Such cases have their own error handling
        # elsewhere, so simply treat this as end-of-stream.
        #
        # Guard against any other error case being added later:
        assert e.args[0].startswith("EOF in multi-line")
        return
Esempio n. 2
0
def _eval_factor(factor_info, data, NA_action):
    """Evaluate a single factor on `data` and validate what comes back.

    :arg factor_info: Object describing the factor; its ``factor``,
      ``state``, ``type``, ``num_columns`` and ``categories`` attributes are
      read here.
    :arg data: Passed straight through to ``factor.eval``.
    :arg NA_action: Object used to detect missing values.
    :returns: A ``(values, is_NA)`` pair. For numerical factors `values` is
      a 2d ndarray or a DataFrame; for categorical factors it is the 1d
      result of :func:`categorical_to_int`, and ``is_NA`` marks the entries
      equal to -1.
    :raises PatsyError: If a numerical factor produces an unexpected number
      of columns or non-numeric data.
    """
    factor = factor_info.factor
    result = factor.eval(factor_info.state, data)
    # Returns either a 2d ndarray, or a DataFrame, plus is_NA mask
    if factor_info.type == "numerical":
        result = atleast_2d_column_default(result, preserve_pandas=True)
        _max_allowed_dim(2, result, factor)
        if result.shape[1] != factor_info.num_columns:
            # Report the observed count first and the expected count second,
            # matching the wording of the message.
            raise PatsyError("when evaluating factor %s, I got %s columns "
                             "instead of the %s I was expecting"
                             % (factor.name(),
                                result.shape[1],
                                factor_info.num_columns),
                             factor)
        # Go through np.asarray for the dtype: a DataFrame has no single
        # ``.dtype`` attribute, so reading it directly would fail while
        # building the error message.
        result_dtype = np.asarray(result).dtype
        if not safe_issubdtype(result_dtype, np.number):
            raise PatsyError("when evaluating numeric factor %s, "
                             "I got non-numeric data of type '%s'"
                             % (factor.name(), result_dtype),
                             factor)
        return result, NA_action.is_numerical_NA(result)
    # returns either a 1d ndarray or a pandas.Series, plus is_NA mask
    else:
        assert factor_info.type == "categorical"
        result = categorical_to_int(result, factor_info.categories, NA_action,
                                    origin=factor_info.factor)
        assert result.ndim == 1
        return result, np.asarray(result == -1)
Esempio n. 3
0
def _read_op_context(token, c):
    """Handle `token` at a point where the parser expects an operator.

    Returns False after processing a close-paren and True after pushing a
    binary operator; :func:`infix_parse` uses the result as its next
    ``want_noun`` flag. Raises :class:`PatsyError` for an unbalanced
    close-paren or for a token that is neither a close-paren nor a known
    binary operator.
    """
    if token.type == Token.RPAREN:
        if c.trace:
            print("Found close-paren")
        # Run every operator stacked above the nearest open-paren.
        while c.op_stack and c.op_stack[-1].op.token_type != Token.LPAREN:
            _run_op(c)
        if not c.op_stack:
            raise PatsyError("missing '(' or extra ')'", token)
        open_paren = c.op_stack[-1]
        assert open_paren.op.token_type == Token.LPAREN
        # Widen the origin of the noun on top of the stack so that it also
        # covers the surrounding parentheses.
        enclosed = c.noun_stack[-1]
        enclosed.origin = Origin.combine(
            [open_paren.token, enclosed.token, token])
        # Discard the open-paren itself.
        c.op_stack.pop()
        return False

    if token.type not in c.binary_ops:
        raise PatsyError(
            "expected an operator, not '%s'" %
            (token.origin.relevant_code(), ), token)

    if c.trace:
        print("Found binary operator %r" % (token.type))
    pending = _StackOperator(c.binary_ops[token.type], token)
    # Run stacked operators whose precedence is at least that of the
    # incoming one before pushing it.
    while (c.op_stack
           and pending.op.precedence <= c.op_stack[-1].op.precedence):
        _run_op(c)
    if c.trace:
        print("Pushing binary operator %r" % (token.type))
    c.op_stack.append(pending)
    return True
def _get_level(levels, level_ref):
    if level_ref in levels:
        return levels.index(level_ref)
    if isinstance(level_ref, six.integer_types):
        if level_ref < 0:
            level_ref += len(levels)
        if not (0 <= level_ref < len(levels)):
            raise PatsyError("specified level %r is out of range"
                                % (level_ref,))
        return level_ref
    raise PatsyError("specified level %r not found" % (level_ref,))
Esempio n. 5
0
def dmatrices(formula_like,
              data={},
              eval_env=0,
              NA_action="drop",
              return_type="matrix"):
    """Construct two design matrices given a formula_like and data.

    This function works like :func:`dmatrix`, except that `formula_like`
    must describe both a left-hand side and a right-hand side. By
    convention the left-hand side is the "outcome" or "y" data and the
    right-hand side is the "predictor" or "x" data. The two matrices are
    returned as the tuple ``(lhs, rhs)``.

    A :class:`PatsyError` is raised when the left-hand side matrix has no
    columns.

    See :func:`dmatrix` for details.
    """
    eval_env = EvalEnvironment.capture(eval_env, reference=1)
    outcome, predictors = _do_highlevel_design(
        formula_like, data, eval_env, NA_action, return_type)
    if outcome.shape[1] == 0:
        raise PatsyError("model is missing required outcome variables")
    return (outcome, predictors)
Esempio n. 6
0
 def _code_either(self, intercept, levels):
     """Build the polynomial contrast matrix for `levels`.

     :arg intercept: If true, the constant column is included in the
       returned matrix; otherwise it is dropped.
     :arg levels: The factor levels; only their count is used here.
     :returns: A :class:`ContrastMatrix`.
     :raises PatsyError: If ``self.scores`` is given but its length differs
       from the number of levels.
     """
     n = len(levels)
     scores = self.scores
     if scores is None:
         scores = np.arange(n)
     scores = np.asarray(scores, dtype=float)
     if len(scores) != n:
         raise PatsyError("number of levels (%s) does not match"
                          " number of scores (%s)"
                          % (n, len(scores)))
     # Strategy: just make a matrix whose columns are naive linear,
     # quadratic, etc., functions of the raw scores, and then use 'qr' to
     # orthogonalize each column against those to its left.
     #
     # Centre into a *new* array: np.asarray hands back its argument
     # unchanged when that is already a float ndarray, so an in-place
     # subtraction would silently modify self.scores (and the caller's
     # array behind it).
     scores = scores - scores.mean()
     raw_poly = scores.reshape((-1, 1)) ** np.arange(n).reshape((1, -1))
     q, r = np.linalg.qr(raw_poly)
     q *= np.sign(np.diag(r))
     q /= np.sqrt(np.sum(q ** 2, axis=1))
     # The constant term is always all 1's -- we don't normalize it.
     q[:, 0] = 1
     names = [".Constant", ".Linear", ".Quadratic", ".Cubic"]
     names += ["^%s" % (i,) for i in range(4, n)]
     names = names[:n]
     if intercept:
         return ContrastMatrix(q, names)
     else:
         # We always include the constant/intercept column as something to
         # orthogonalize against, but we don't always return it:
         return ContrastMatrix(q[:, 1:], names[1:])
Esempio n. 7
0
def _tokenize_constraint(string, variable_names):
    """Split a constraint `string` into a list of tokens.

    Recognises parentheses, the operators listed in ``_ops``, the given
    `variable_names`, numbers, and whitespace (which is discarded). Raises
    :class:`PatsyError` pointing at the first character that could not be
    tokenized.
    """
    operator_pattern = "|".join([re.escape(op.token_type) for op in _ops])
    # Prefer long matches: list the longest variable names first.
    longest_first = sorted(variable_names, key=len, reverse=True)
    variable_pattern = "|".join([re.escape(name) for name in longest_first])

    # The order of the entries is significant and matches the original.
    lexicon = [
        (r"\(", _token_maker(Token.LPAREN, string)),
        (r"\)", _token_maker(Token.RPAREN, string)),
        (operator_pattern, _token_maker("__OP__", string)),
        (variable_pattern, _token_maker("VARIABLE", string)),
        (r"[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?",
         _token_maker("NUMBER", string)),
        (r"\s+", None),
    ]

    tokens, leftover = Scanner(lexicon).scan(string)
    if not leftover:
        return tokens

    # Whatever the scanner could not consume starts at this offset.
    offset = len(string) - len(leftover)
    raise PatsyError("unrecognized token in constraint",
                     Origin(string, offset, offset + 1))
Esempio n. 8
0
 def _eval_binary_div(self, tree):
     left = self.eval(tree.args[0])
     right = self.eval(tree.args[1])
     if not self.is_constant(right):
         raise PatsyError("Can't divide by a variable in a linear "
                             "constraint", tree.args[1])
     return left / right[-1]
Esempio n. 9
0
 def eval(self, data, NA_action):
     """Evaluate this numeric factor on `data` and validate the result.

     :arg data: Passed straight through to ``self.factor.eval``.
     :arg NA_action: Object used to detect missing numerical values.
     :returns: A ``(values, is_NA)`` pair, where ``is_NA`` comes from
       ``NA_action.is_numerical_NA``.
     :raises PatsyError: If the number of columns differs from
       ``self._expected_columns`` or the data is not numeric.
     """
     result = self.factor.eval(self._state, data)
     result = atleast_2d_column_default(result, preserve_pandas=True)
     _max_allowed_dim(2, result, self.factor)
     if result.shape[1] != self._expected_columns:
         # Observed count first, expected count second, as the message
         # wording requires.
         raise PatsyError(
             "when evaluating factor %s, I got %s columns "
             "instead of the %s I was expecting" %
             (self.factor.name(), result.shape[1], self._expected_columns),
             self.factor)
     # Go through np.asarray for the dtype: a DataFrame has no single
     # ``.dtype`` attribute, so reading it directly would fail while
     # building the error message.
     result_dtype = np.asarray(result).dtype
     if not np.issubdtype(result_dtype, np.number):
         raise PatsyError(
             "when evaluating numeric factor %s, "
             "I got non-numeric data of type '%s'" %
             (self.factor.name(), result_dtype), self.factor)
     return result, NA_action.is_numerical_NA(result)
Esempio n. 10
0
 def sniff(self, data):
     """Record the levels present in `data`.

     Returns a bool: are we confident that every level has been found?
     """
     if hasattr(data, "contrast"):
         self._contrast = data.contrast
     if have_pandas_categorical and isinstance(data, pandas.Categorical):
         # pandas.Categorical does its own NA detection; trust it rather
         # than second-guessing.
         self._levels = tuple(data.levels)
         return True
     if isinstance(data, _CategoricalBox):
         if data.levels is not None:
             self._levels = tuple(data.levels)
             return True
         # No explicit levels: unbox and scan the raw values below.
         data = data.data
     seen = self._level_set
     for value in data:
         if self._NA_action.is_categorical_NA(value):
             continue
         if value is True or value is False:
             # Seeing one boolean implies both levels.
             seen.update([True, False])
             continue
         try:
             seen.add(value)
         except TypeError:
             raise PatsyError(
                 "Error interpreting categorical data: "
                 "all items must be hashable", self._origin)
     # If everything seen so far is boolean, assume the rest will be too.
     # Otherwise we have to keep looking.
     return seen == set([True, False])
Esempio n. 11
0
def _eval_unary_minus(evaluator, tree):
    """Evaluate unary minus, which is only accepted on ZERO and ONE nodes.

    Raises :class:`PatsyError` for any other operand type.
    """
    operand_type = tree.args[0].type
    if operand_type == "ZERO":
        return IntermediateExpr(True, tree.origin, False, [])
    if operand_type == "ONE":
        return IntermediateExpr(False, None, True, [])
    raise PatsyError("Unary minus can only be applied to 1 or 0", tree)
Esempio n. 12
0
def incr_dbuilder(formula_like, data_iter_maker, eval_env=0, NA_action="drop"):
    """Construct a design matrix builder incrementally from a large data set.

    :arg formula_like: Similar to :func:`dmatrix`, except that explicit
      matrices are not allowed. Must be a formula string, a
      :class:`ModelDesc`, a :class:`DesignInfo`, or an object with a
      ``__patsy_get_model_desc__`` method.
    :arg data_iter_maker: A zero-argument callable returning an iterator
      over dict-like data objects. A callable is required, rather than a
      plain iterator, because sufficiently complex formulas may need several
      passes over the data (e.g. when stateful transforms are nested).
    :arg eval_env: Either a :class:`EvalEnvironment` used to look up
      variables referenced in `formula_like` that cannot be found in `data`,
      or an integer depth that is passed to
      :meth:`EvalEnvironment.capture`. ``eval_env=0`` means lookups use the
      context of the function calling :func:`incr_dbuilder`. When calling
      this function from a library you probably want ``eval_env=1``, so that
      variables are resolved in *your* caller's namespace.
    :arg NA_action: An :class:`NAAction` object or string, used to decide
      which values count as 'missing' when determining the levels of
      categorical factors.
    :returns: A :class:`DesignInfo`
    :raises PatsyError: If `formula_like` is not understood, or if it
      specifies outcome (left-hand side) variables.

    Tip: for `data_iter_maker`, write a generator like::

      def iter_maker():
          for data_chunk in my_data_store:
              yield data_chunk

    and pass `iter_maker` (*not* `iter_maker()`).

    .. versionadded:: 0.2.0
       The ``NA_action`` argument.
    """
    eval_env = EvalEnvironment.capture(eval_env, reference=1)
    design_infos = _try_incr_builders(
        formula_like, data_iter_maker, eval_env, NA_action)
    if design_infos is None:
        raise PatsyError("bad formula-like object")
    outcome_info = design_infos[0]
    if len(outcome_info.column_names) > 0:
        raise PatsyError("encountered outcome variables for a model "
                         "that does not expect them")
    return design_infos[1]
Esempio n. 13
0
def incr_dbuilders(formula_like,
                   data_iter_maker,
                   eval_env=0,
                   NA_action="drop"):
    """Construct two design matrix builders incrementally from a large data
    set.

    This is the two-sided counterpart of :func:`incr_dbuilder`, in the same
    way that :func:`dmatrices` is the counterpart of :func:`dmatrix`. A
    :class:`PatsyError` is raised when `formula_like` is not understood or
    when the first builder has no columns. See :func:`incr_dbuilder` for
    details.
    """
    eval_env = EvalEnvironment.capture(eval_env, reference=1)
    builders = _try_incr_builders(
        formula_like, data_iter_maker, eval_env, NA_action)
    if builders is None:
        raise PatsyError("bad formula-like object")
    outcome_columns = builders[0].design_info.column_names
    if len(outcome_columns) == 0:
        raise PatsyError("model is missing required outcome variables")
    return builders
Esempio n. 14
0
 def _handle_raise(self, values, is_NAs, origins):
     for is_NA, origin in zip(is_NAs, origins):
         if np.any(is_NA):
             msg = "Missing values detected. If you want rows with missing "\
                   "values to be automatically deleted in a list-wise " \
                   "manner (not recommended), please set dropna=True in " \
                   "the bambi Model initialization."
             raise PatsyError(msg, origin)
     return values
Esempio n. 15
0
def categorical_to_int(data, levels, NA_action, origin=None):
    """Convert categorical `data` into integer codes indexing `levels`.

    :arg data: A ``pd.Categorical``, an object with a boolean ``dtype``, a
      ``_CategoricalBox``, or a 1-dimensional iterable container of values.
    :arg levels: Tuple of the expected levels.
    :arg NA_action: Object whose ``is_categorical_NA`` decides which values
      count as missing.
    :arg origin: Passed to every :class:`PatsyError` raised here.
    :returns: For a ``pd.Categorical`` its own integer codes; for boolean
      data ``data.astype('int')``; otherwise an integer ndarray (wrapped in
      a ``pd.Series`` sharing the input's index when `data` is a Series).
    :raises PatsyError: On mismatching levels, data that is not a
      1-dimensional container, unhashable values, or a value that is not one
      of `levels`.
    """
    assert isinstance(levels, tuple)
    # In this function, missing values are always mapped to -1
    if isinstance(data, pd.Categorical):
        # Current pandas exposes ``categories``/``codes``; the older
        # ``levels``/``labels`` spellings are kept as a fallback.
        if hasattr(data, "categories"):
            data_levels_tuple = tuple(data.categories)
        else:
            data_levels_tuple = tuple(data.levels)
        if not data_levels_tuple == levels:
            raise PatsyError("mismatching levels: expected %r, got %r"
                             % (levels, data_levels_tuple), origin)
        # pd.Categorical also uses -1 to indicate NA, and we don't try to
        # second-guess its NA detection, so we can just pass it back.
        if hasattr(data, "codes"):
            return data.codes
        return data.labels
    elif hasattr(data, 'dtype') and hasattr(data, 'astype') and \
            np.issubdtype(data.dtype, np.bool_):
        return data.astype('int')
    if isinstance(data, _CategoricalBox):
        if data.levels is not None and tuple(data.levels) != levels:
            raise PatsyError("mismatching levels: expected %r, got %r"
                             % (levels, tuple(data.levels)), origin)
        data = data.data
    if hasattr(data, "shape") and len(data.shape) > 1:
        raise PatsyError("categorical data must be 1-dimensional",
                         origin)
    if not iterable(data) or isinstance(data, basestring):
        # Pass the origin along, like every other error raised here.
        raise PatsyError("categorical data must be an iterable container",
                         origin)
    try:
        level_to_int = dict((level, i) for i, level in enumerate(levels))
    except TypeError:
        raise PatsyError("Error interpreting categorical data: "
                         "all items must be hashable", origin)
    out = np.empty(len(data), dtype=int)
    for i, value in enumerate(data):
        if NA_action.is_categorical_NA(value):
            out[i] = -1
        else:
            try:
                out[i] = level_to_int[value]
            except KeyError:
                # Show at most SHOW_LEVELS levels in the message, eliding
                # the middle of longer lists.
                SHOW_LEVELS = 4
                level_strs = []
                if len(levels) <= SHOW_LEVELS:
                    level_strs += [repr(level) for level in levels]
                else:
                    level_strs += [repr(level)
                                   for level in levels[:SHOW_LEVELS//2]]
                    level_strs.append("...")
                    level_strs += [repr(level)
                                   for level in levels[-SHOW_LEVELS//2:]]
                level_str = "[%s]" % (", ".join(level_strs))
                raise PatsyError("Error converting data to categorical: "
                                 "observation with value %r does not match "
                                 "any of the expected levels (expected: %s)"
                                 % (value, level_str), origin)
            except TypeError:
                raise PatsyError("Error converting data to categorical: "
                                 "encountered unhashable value %r"
                                 % (value,), origin)
    if isinstance(data, pd.Series):
        out = pd.Series(out, index=data.index)
    return out
Esempio n. 16
0
 def _eval_binary_multiply(self, tree):
     left = self.eval(tree.args[0])
     right = self.eval(tree.args[1])
     if self.is_constant(left):
         return left[-1] * right
     elif self.is_constant(right):
         return left * right[-1]
     else:
         raise PatsyError("Can't multiply one variable by another "
                             "in a linear constraint", tree)
Esempio n. 17
0
 def eval(self, tree, require_evalexpr=True):
     """Evaluate a parse `tree` with the evaluator registered for its node.

     The evaluator is looked up by ``(tree.type, len(tree.args))``. Unless
     `require_evalexpr` is false, the result must be an
     :class:`IntermediateExpr`; anything else raises :class:`PatsyError`.
     """
     assert isinstance(tree, ParseNode)
     key = (tree.type, len(tree.args))
     if key not in self._evaluators:
         raise PatsyError("I don't know how to evaluate this "
                          "'%s' operator" % (tree.type,),
                          tree.token)
     outcome = self._evaluators[key](self, tree)
     if not require_evalexpr or isinstance(outcome, IntermediateExpr):
         return outcome
     if isinstance(outcome, ModelDesc):
         raise PatsyError("~ can only be used once, and "
                          "only at the top level",
                          tree)
     raise PatsyError("custom operator returned an "
                      "object that I don't know how to "
                      "handle", tree)
Esempio n. 18
0
def infix_parse(tokens, operators, atomic_types, trace=False):
    """Parse a stream of `tokens` into a single tree of nouns and operators.

    :arg tokens: Iterable of tokens to parse.
    :arg operators: Operator descriptions; each must have ``arity`` 1 or 2
      and a precedence above that of the open-paren.
    :arg atomic_types: Passed through to the parse context.
    :arg trace: If true, progress messages are printed while parsing.
    :returns: The single item left on the noun stack once all operators
      have been run.
    :raises ValueError: If an operator is neither unary nor binary.
    :raises PatsyError: If the expression ends where a noun was expected, or
      if an open-paren is never closed.
    """
    token_source = iter(tokens)

    unary_ops = {}
    binary_ops = {}
    for op in operators:
        assert op.precedence > _open_paren.precedence
        if op.arity == 1:
            unary_ops[op.token_type] = op
        elif op.arity == 2:
            binary_ops[op.token_type] = op
        else:
            raise ValueError("operators must be unary or binary")

    c = _ParseContext(unary_ops, binary_ops, atomic_types, trace)

    # This is an implementation of Dijkstra's shunting yard algorithm:
    #   http://en.wikipedia.org/wiki/Shunting_yard_algorithm
    #   http://www.engr.mun.ca/~theo/Misc/exp_parsing.htm

    want_noun = True
    for token in token_source:
        if c.trace:
            print("Reading next token (want_noun=%r)" % (want_noun, ))
        if want_noun:
            want_noun = _read_noun_context(token, c)
        else:
            want_noun = _read_op_context(token, c)
    if c.trace:
        print("End of token stream")

    if want_noun:
        # The operator stack can be empty here (e.g. when no tokens were
        # supplied at all); in that case there is no token to blame, so
        # report the error without an origin instead of failing with an
        # IndexError.
        origin = c.op_stack[-1].token.origin if c.op_stack else None
        raise PatsyError("expected a noun, but instead the expression ended",
                         origin)

    while c.op_stack:
        if c.op_stack[-1].op.token_type == Token.LPAREN:
            raise PatsyError("Unmatched '('", c.op_stack[-1].token)
        _run_op(c)

    assert len(c.noun_stack) == 1
    return c.noun_stack.pop()
Esempio n. 19
0
def demo_data(*names, **kwargs):
    """demo_data(*names, nlevels=2, min_rows=5)

    Create simple categorical/numerical demo data.

    Pass in a set of variable names, and this function returns a simple data
    set using those names.

    A name whose first letter lies in the range "a" through "n" becomes a
    categorical variable with `nlevels` levels. A name whose first letter
    lies in the range "p" through "z" becomes a numerical variable. Any
    other first character is rejected with a :class:`PatsyError`.

    The categorical variables form a balanced design, repeated as often as
    needed to reach at least `min_rows` rows.

    Numerical data is drawn from a normal distribution using a fixed random
    seed, so identical calls to demo_data() give identical results.

    Example::

       In [1]: patsy.demo_data("a", "b", "x", "y")
       Out[1]:
       {'a': ['a1', 'a1', 'a2', 'a2', 'a1', 'a1', 'a2', 'a2'],
        'b': ['b1', 'b2', 'b1', 'b2', 'b1', 'b2', 'b1', 'b2'],
        'x': array([ 1.76405235,  0.40015721,  0.97873798,  2.2408932 ,
                     1.86755799, -0.97727788,  0.95008842, -0.15135721]),
        'y': array([-0.10321885,  0.4105985 ,  0.14404357,  1.45427351,
                     0.76103773,  0.12167502,  0.44386323,  0.33367433])}
    """
    nlevels = kwargs.pop("nlevels", 2)
    min_rows = kwargs.pop("min_rows", 5)
    if kwargs:
        raise TypeError("unexpected keyword arguments %r" % (kwargs, ))

    # Sort the requested names into the two kinds by their first character.
    categorical = {}
    numerical = set()
    for name in names:
        initial = name[0]
        if initial in "abcdefghijklmn":
            categorical[name] = nlevels
        elif initial in "pqrstuvwxyz":
            numerical.add(name)
        else:
            raise PatsyError("bad name %r" % (name, ))

    # Repeat the balanced design often enough to reach min_rows rows.
    balanced_design_size = np.prod(list(categorical.values()), dtype=int)
    repeat = int(np.ceil(min_rows * 1.0 / balanced_design_size))
    data = balanced(repeat=repeat, **categorical)
    num_rows = repeat * balanced_design_size

    # Fixed seed plus sorted names keeps the output reproducible.
    rng = np.random.RandomState(0)
    for name in sorted(numerical):
        data[name] = rng.normal(size=num_rows)
    return data
Esempio n. 20
0
 def eval(self, tree, constraint=False):
     """Evaluate `tree` through the dispatch table.

     With ``constraint=True`` the result is always a
     :class:`LinearConstraint`: a non-constraint value is converted, and
     :class:`PatsyError` is raised if its first ``self._N`` entries are all
     zero. With ``constraint=False`` a :class:`LinearConstraint` result is
     rejected with :class:`PatsyError`.
     """
     key = (tree.type, len(tree.args))
     assert key in self._dispatch
     val = self._dispatch[key](tree)
     is_constraint = isinstance(val, LinearConstraint)

     if not constraint:
         # Force it to *not* be a constraint
         if is_constraint:
             raise PatsyError("unexpected constraint object", tree)
         return val

     # Force it to be a constraint
     if is_constraint:
         return val
     assert val.size == self._N + 1
     coefs = val[:self._N]
     if np.all(coefs == 0):
         raise PatsyError("term is constant, with no variables",
                          tree)
     return LinearConstraint(self._variable_names, coefs, -val[-1])
Esempio n. 21
0
def _categorical_shape_fix(data):
    """Coerce scalar categorical `data` into a one-element list.

    `data` is expected to be the raw values (not a _CategoricalBox or
    pandas Categorical), possibly with the wrong shape. Data with more than
    one dimension raises :class:`PatsyError`.
    """
    if hasattr(data, "ndim") and data.ndim > 1:
        raise PatsyError("categorical data cannot be >1-dimensional")
    # Scalars become 1d, which is consistent with how numeric factors are
    # treated (see statsmodels/statsmodels#1881). Text and byte strings are
    # iterable but count as scalars here.
    string_types = (six.text_type, six.binary_type)
    if not iterable(data):
        return [data]
    if isinstance(data, string_types):
        return [data]
    return data
Esempio n. 22
0
 def check(self, seen_value, desc, origin):
     """Record the first value seen and require later ones to match it.

     While ``self.value`` is None, `seen_value` is stored together with
     `desc` and `origin`. Afterwards every call compares `seen_value`
     against the stored value using ``self._eq_fn`` and raises
     :class:`PatsyError` on a mismatch.
     """
     if self.value is None:
         self.value = seen_value
         self._value_desc = desc
         self._value_origin = origin
         return
     if self._eq_fn(self.value, seen_value):
         return
     msg = ("%s mismatch between %s and %s"
            % (self._name, self._value_desc, desc))
     if isinstance(self.value, int):
         # The actual numbers are only added for integer values.
         msg += " (%r versus %r)" % (self.value, seen_value)
     # XX FIXME: this is a case where having discontiguous Origins
     # would be useful...
     raise PatsyError(msg, origin)
Esempio n. 23
0
def _read_python_expr(it, end_tokens):
    """Read one complete Python expression from the token iterator `it`.

    Tokens are consumed until an entry of `end_tokens` is met outside any
    brackets (that token is pushed back onto `it`) or until `it` runs out.
    Returns a :class:`Token` whose type is "ZERO", "ONE", "NUMBER" or
    "PYTHON_EXPR" and whose ``extra`` is the expression text. Raises
    :class:`PatsyError` on unbalanced brackets.
    """
    pytypes, token_strings, origins = [], [], []
    depth = 0
    for pytype, token_string, origin in it:
        assert depth >= 0
        if depth == 0 and token_string in end_tokens:
            # An unnested end token belongs to the caller.
            it.push_back((pytype, token_string, origin))
            break
        if token_string in ("(", "[", "{"):
            depth += 1
        elif token_string in (")", "]", "}"):
            depth -= 1
            if depth < 0:
                raise PatsyError("unmatched close bracket", origin)
        pytypes.append(pytype)
        token_strings.append(token_string)
        origins.append(origin)

    # Either we found an end_token, or we hit the end of the string
    if depth != 0:
        raise PatsyError("unclosed bracket in embedded Python "
                         "expression",
                         Origin.combine(origins))

    expr_text = pretty_untokenize(zip(pytypes, token_strings))
    token_type = {"0": "ZERO", "1": "ONE"}.get(expr_text)
    if token_type is None:
        if _is_a(int, expr_text) or _is_a(float, expr_text):
            token_type = "NUMBER"
        else:
            token_type = "PYTHON_EXPR"
    return Token(token_type, Origin.combine(origins), extra=expr_text)
Esempio n. 24
0
def _try_incr_builders(formula_like, data_iter_maker, eval_env, NA_action):
    """Turn `formula_like` into a pair of design objects, if possible.

    Handled forms, in order:

    * a :class:`DesignInfo`: returned as the second element, paired with the
      result of building an empty term list;
    * a 2-tuple of :class:`DesignInfo` objects: returned unchanged;
    * an object with a ``__patsy_get_model_desc__`` method: the method is
      called with `eval_env` and must return a :class:`ModelDesc`;
    * a formula string: parsed with :meth:`ModelDesc.from_formula`;
    * a :class:`ModelDesc`: passed to :func:`design_matrix_builders`.

    :returns: The pair described above, or None when `formula_like` is none
      of the supported forms.
    :raises PatsyError: If ``__patsy_get_model_desc__`` returns something
      other than a :class:`ModelDesc`, or (Python 2 only) if a unicode
      formula contains non-ascii characters.
    """
    if isinstance(formula_like, DesignInfo):
        return (design_matrix_builders([[]], data_iter_maker, eval_env,
                                       NA_action)[0], formula_like)
    if (isinstance(formula_like, tuple) and len(formula_like) == 2
            and isinstance(formula_like[0], DesignInfo)
            and isinstance(formula_like[1], DesignInfo)):
        return formula_like
    if hasattr(formula_like, "__patsy_get_model_desc__"):
        # Keep a handle on the original object so that the error message
        # names the object whose hook misbehaved, not the bad value itself.
        provider = formula_like
        formula_like = provider.__patsy_get_model_desc__(eval_env)
        if not isinstance(formula_like, ModelDesc):
            raise PatsyError("bad value from %r.__patsy_get_model_desc__" %
                             (provider, ))
        # fallthrough
    if not six.PY3 and isinstance(formula_like, unicode):
        # Included for the convenience of people who are using py2 with
        # __future__.unicode_literals.
        try:
            formula_like = formula_like.encode("ascii")
        except UnicodeEncodeError:
            raise PatsyError(
                "On Python 2, formula strings must be either 'str' objects, "
                "or else 'unicode' objects containing only ascii "
                "characters. You passed a unicode string with non-ascii "
                "characters. I'm afraid you'll have to either switch to "
                "ascii-only, or else upgrade to Python 3.")
    if isinstance(formula_like, str):
        formula_like = ModelDesc.from_formula(formula_like)
        # fallthrough
    if isinstance(formula_like, ModelDesc):
        assert isinstance(eval_env, EvalEnvironment)
        return design_matrix_builders(
            [formula_like.lhs_termlist, formula_like.rhs_termlist],
            data_iter_maker, eval_env, NA_action)
    else:
        return None
Esempio n. 25
0
def call_and_wrap_exc(msg, origin, f, *args, **kwargs):
    """Call ``f(*args, **kwargs)`` and attach `origin` to any failure.

    :arg msg: Prefix for the wrapped error message (Python 3 only).
    :arg origin: Origin attached to the resulting :class:`PatsyError`.
    :arg f: The callable to invoke.
    :returns: Whatever `f` returns.
    :raises PatsyError: On Python 3, for any ``Exception`` raised by `f`;
      the original exception is chained as its cause. On Python 2 the
      original exception propagates instead (with its origin set if it is
      itself a :class:`PatsyError`).
    """
    try:
        return f(*args, **kwargs)
    # ``except ... as`` parses on both Python 2.6+ and Python 3, unlike the
    # comma form, which is a syntax error on Python 3.
    except Exception as e:
        if sys.version_info[0] >= 3:
            new_exc = PatsyError("%s: %s: %s" % (msg, e.__class__.__name__, e),
                                 origin)
            # Use 'exec' to hide this syntax from the Python 2 parser:
            exec("raise new_exc from e")
        else:
            # In python 2, we just let the original exception escape -- better
            # than destroying the traceback. But if it's a PatsyError, we can
            # at least set the origin properly.
            if isinstance(e, PatsyError):
                e.set_origin(origin)
            raise
Esempio n. 26
0
 def build(self, factor_values, out):
     """Fill `out` in place with the columns of this term.

     Every column of `out` starts at 1 and is then multiplied by one column
     contributed by each factor. Raises :class:`PatsyError` if a factor with
     a contrast matrix has negative (missing) values.
     """
     assert self.total_columns == out.shape[1]
     out[:] = 1
     combinations = _column_combinations(self._columns_per_factor)
     for i, column_idxs in enumerate(combinations):
         for factor, column_idx in zip(self._factors, column_idxs):
             values = factor_values[factor]
             if factor not in self._cat_contrasts:
                 # No contrast matrix: use the factor's own column.
                 assert values.shape[1] == self._num_columns[factor]
                 out[:, i] *= values[:, column_idx]
                 continue
             contrast = self._cat_contrasts[factor]
             if np.any(values < 0):
                 raise PatsyError("can't build a design matrix "
                                  "containing missing values", factor)
             # The values index the rows of the contrast matrix.
             out[:, i] *= contrast.matrix[values, column_idx]
Esempio n. 27
0
    def slice(self, columns_specifier):
        """Locate a subset of design matrix columns, specified symbolically.

        A patsy design matrix has two levels of structure: the individual
        columns (which are named), and the :ref:`terms <formulas>` in
        the formula that generated those columns. This is a one-to-many
        relationship: a single term may span several columns. This method
        provides a user-friendly API for locating those columns.

        (While we talk about columns here, this is probably most useful for
        indexing into other arrays that are derived from the design matrix,
        such as regression coefficients or covariance matrices.)

        The `columns_specifier` argument can take a number of forms:

        * A term name
        * A column name
        * A :class:`Term` object
        * An integer giving a raw index
        * A raw slice object

        In all cases, a Python :func:`slice` object is returned, which can be
        used directly for indexing. A :class:`PatsyError` is raised when the
        specifier matches none of the forms above.

        Example::

          y, X = dmatrices("y ~ a", demo_data("y", "a", nlevels=3))
          betas = np.linalg.lstsq(X, y)[0]
          a_betas = betas[X.design_info.slice("a")]

        (If you want to look up a single individual column by name, use
        ``design_info.column_name_indexes[name]``.)
        """
        if isinstance(columns_specifier, slice):
            return columns_specifier
        # np.issubsctype no longer exists in NumPy 2.0; np.issubdtype
        # performs the same integer check on the specifier's type.
        if np.issubdtype(type(columns_specifier), np.integer):
            return slice(columns_specifier, columns_specifier + 1)
        if (self.term_slices is not None
                and columns_specifier in self.term_slices):
            return self.term_slices[columns_specifier]
        if columns_specifier in self.term_name_slices:
            return self.term_name_slices[columns_specifier]
        if columns_specifier in self.column_name_indexes:
            idx = self.column_name_indexes[columns_specifier]
            return slice(idx, idx + 1)
        raise PatsyError("unknown column specified '%s'" %
                         (columns_specifier, ))
Esempio n. 28
0
def ast_names(code):
    """Iterator that yields all the (ast) names in a Python expression.

    :arg code: A string containing a Python expression.

    A :class:`PatsyError` is raised, during iteration, as soon as a lambda,
    a comprehension or a generator expression is encountered.
    """
    # Constructs that can introduce new name bindings are hard to analyse
    # correctly, so they are rejected outright.
    forbidden = [ast.Lambda, ast.ListComp, ast.GeneratorExp]
    if sys.version_info >= (2, 7):
        forbidden.extend([ast.DictComp, ast.SetComp])
    forbidden = tuple(forbidden)

    for node in ast.walk(ast.parse(code)):
        if isinstance(node, forbidden):
            raise PatsyError("Lambda, list/dict/set comprehension, generator "
                             "expression in patsy formula not currently supported.")
        if isinstance(node, ast.Name):
            yield node.id
Esempio n. 29
0
def _build_subterm(subterm, factor_infos, factor_values, out):
    """Fill `out` in place with the columns belonging to `subterm`.

    Every column of `out` starts at 1 and is then multiplied by one column
    contributed by each factor of the subterm. Raises :class:`PatsyError`
    if a categorical factor has negative (missing) values.
    """
    assert subterm.num_columns == out.shape[1]
    out[...] = 1
    combinations = _subterm_column_combinations(factor_infos, subterm)
    for i, column_idxs in enumerate(combinations):
        for factor, column_idx in zip(subterm.factors, column_idxs):
            info = factor_infos[factor]
            if info.type == "categorical":
                contrast = subterm.contrast_matrices[factor]
                codes = factor_values[factor]
                if np.any(codes < 0):
                    raise PatsyError("can't build a design matrix "
                                     "containing missing values", factor)
                # The codes index the rows of the contrast matrix.
                out[:, i] *= contrast.matrix[codes, column_idx]
            else:
                assert info.type == "numerical"
                values = factor_values[factor]
                assert values.shape[1] == info.num_columns
                out[:, i] *= values[:, column_idx]
Esempio n. 30
0
 def _eval_binary_eq(self, tree):
     """Evaluate an "=" node into a combined :class:`LinearConstraint`.

     Chained equalities such as "a1 = a2 = a3" are parsed as
     "(a1 = a2) = a3"; each nested "=" contributes its own constraint.
     Raises :class:`PatsyError` when the two sides have identical
     coefficients.
     """
     operands = list(tree.args)
     collected = []
     for position, operand in enumerate(operands):
         if operand.type != "=":
             continue
         collected.append(self.eval(operand, constraint=True))
         # Stand in for the nested "=": as our left argument use its right
         # argument, or vice-versa.
         operands[position] = operand.args[1 - position]
     lhs = self.eval(operands[0])
     rhs = self.eval(operands[1])
     coefs = lhs[:self._N] - rhs[:self._N]
     if np.all(coefs == 0):
         raise PatsyError("no variables appear in constraint", tree)
     constant = -lhs[-1] + rhs[-1]
     collected.append(
         LinearConstraint(self._variable_names, coefs, constant))
     return LinearConstraint.combine(collected)