def test_formula_factor_origin():
    from patsy.origin import Origin

    desc = ModelDesc.from_formula("a + b", EvalEnvironment.capture(0))
    assert (desc.rhs_termlist[1].factors[0].origin
            == Origin("a + b", 0, 1))
    assert (desc.rhs_termlist[2].factors[0].origin
            == Origin("a + b", 4, 5))
def test_evalfactor_reraise():
    # This will produce a PatsyError, but buried inside the factor
    # evaluation, so the original code has no way to give it an appropriate
    # origin= attribute. EvalFactor should notice this, and add a useful
    # origin:
    def raise_patsy_error(x):
        raise PatsyError("WHEEEEEE")
    formula = "raise_patsy_error(X) + Y"
    try:
        dmatrix(formula, {"X": [1, 2, 3], "Y": [4, 5, 6]})
    except PatsyError as e:
        assert e.origin == Origin(formula, 0, formula.index(" "))
    else:
        assert False

    # This will produce a KeyError, which on Python 3 we can wrap without
    # destroying the traceback, so we do so. On Python 2 we let the original
    # exception escape.
    try:
        dmatrix("1 + x[1]", {"x": {}})
    except Exception as e:
        if sys.version_info[0] >= 3:
            assert isinstance(e, PatsyError)
            assert e.origin == Origin("1 + x[1]", 4, 8)
        else:
            assert isinstance(e, KeyError)
    else:
        assert False
def _read_python_expr(it, end_tokens):
    # Read out a full python expression, stopping when we hit an
    # unnested end token.
    pytypes = []
    token_strings = []
    origins = []
    bracket_level = 0
    for pytype, token_string, origin in it:
        assert bracket_level >= 0
        if bracket_level == 0 and token_string in end_tokens:
            it.push_back((pytype, token_string, origin))
            break
        if token_string in ("(", "[", "{"):
            bracket_level += 1
        if token_string in (")", "]", "}"):
            bracket_level -= 1
        if bracket_level < 0:
            raise PatsyError("unmatched close bracket", origin)
        pytypes.append(pytype)
        token_strings.append(token_string)
        origins.append(origin)
    # Either we found an end_token, or we hit the end of the string
    if bracket_level == 0:
        expr_text = pretty_untokenize(zip(pytypes, token_strings))
        if expr_text == "0":
            token_type = "ZERO"
        elif expr_text == "1":
            token_type = "ONE"
        elif _is_a(int, expr_text) or _is_a(float, expr_text):
            token_type = "NUMBER"
        else:
            token_type = "PYTHON_EXPR"
        return Token(token_type, Origin.combine(origins), extra=expr_text)
    else:
        raise PatsyError("unclosed bracket in embedded Python "
                         "expression", Origin.combine(origins))
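# The bracket-level bookkeeping above is what lets a formula like
# "a + foo(b, c + 2)" treat the inner "+" as part of a Python call rather
# than as a formula operator. A minimal standalone sketch of the same idea
# (illustrative only, not patsy code; split_unnested is a made-up name):
def split_unnested(s, sep="+"):
    # Split `s` on `sep`, but only at bracket depth zero.
    pieces, current, depth = [], [], 0
    for ch in s:
        if ch in "([{":
            depth += 1
        elif ch in ")]}":
            depth -= 1
        if ch == sep and depth == 0:
            pieces.append("".join(current).strip())
            current = []
        else:
            current.append(ch)
    pieces.append("".join(current).strip())
    return pieces

# The "+" inside foo(...) is nested, so it does not split the expression:
assert split_unnested("a + foo(b, c + 2)") == ["a", "foo(b, c + 2)"]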
def test__tokenize_constraint():
    code = "2 * (a + b) = q"
    tokens = _tokenize_constraint(code, ["a", "b", "q"])
    expecteds = [("NUMBER", 0, 1, "2"),
                 ("*", 2, 3, "*"),
                 (Token.LPAREN, 4, 5, "("),
                 ("VARIABLE", 5, 6, "a"),
                 ("+", 7, 8, "+"),
                 ("VARIABLE", 9, 10, "b"),
                 (Token.RPAREN, 10, 11, ")"),
                 ("=", 12, 13, "="),
                 ("VARIABLE", 14, 15, "q")]
    for got, expected in zip(tokens, expecteds):
        assert isinstance(got, Token)
        assert got.type == expected[0]
        assert got.origin == Origin(code, expected[1], expected[2])
        assert got.extra == expected[3]

    from nose.tools import assert_raises
    assert_raises(PatsyError, _tokenize_constraint, "1 + @b", ["b"])
    # Shouldn't raise an error:
    _tokenize_constraint("1 + @b", ["@b"])

    # Check we aren't confused by names which are proper prefixes of other
    # names:
    for names in (["a", "aa"], ["aa", "a"]):
        tokens = _tokenize_constraint("a aa a", names)
        assert len(tokens) == 3
        assert [t.extra for t in tokens] == ["a", "aa", "a"]

    # Check that embedding ops and numbers inside a variable name works
    tokens = _tokenize_constraint("2 * a[1,1],", ["a[1,1]"])
    assert len(tokens) == 4
    assert [t.type for t in tokens] == ["NUMBER", "*", "VARIABLE", ","]
    assert [t.extra for t in tokens] == ["2", "*", "a[1,1]", ","]
def _read_op_context(token, c):
    if token.type == Token.RPAREN:
        if c.trace:
            print("Found close-paren")
        while c.op_stack and c.op_stack[-1].op.token_type != Token.LPAREN:
            _run_op(c)
        if not c.op_stack:
            raise PatsyError("missing '(' or extra ')'", token)
        assert c.op_stack[-1].op.token_type == Token.LPAREN
        # Expand the origin of the item on top of the noun stack to include
        # the open and close parens:
        combined = Origin.combine([c.op_stack[-1].token,
                                   c.noun_stack[-1].token,
                                   token])
        c.noun_stack[-1].origin = combined
        # Pop the open-paren
        c.op_stack.pop()
        return False
    elif token.type in c.binary_ops:
        if c.trace:
            print("Found binary operator %r" % (token.type,))
        stackop = _StackOperator(c.binary_ops[token.type], token)
        while (c.op_stack
               and stackop.op.precedence <= c.op_stack[-1].op.precedence):
            _run_op(c)
        if c.trace:
            print("Pushing binary operator %r" % (token.type,))
        c.op_stack.append(stackop)
        return True
    else:
        raise PatsyError("expected an operator, not '%s'"
                         % (token.origin.relevant_code(),),
                         token)
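# The origin-widening step above means that once a parenthesized group has
# been reduced, any error reported against it will underline the parentheses
# as well as their contents. Assuming Origin.combine behaves as it is used
# throughout this file (span from the earliest start to the latest end over
# the same code string), a quick illustration:
from patsy.origin import Origin

_code = "(a + b)"
_lparen = Origin(_code, 0, 1)   # "("
_inner = Origin(_code, 1, 6)    # "a + b"
_rparen = Origin(_code, 6, 7)   # ")"
# The combined origin covers the whole parenthesized group:
assert Origin.combine([_lparen, _inner, _rparen]) == Origin(_code, 0, 7)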
def _tokenize_constraint(string, variable_names):
    lparen_re = r"\("
    rparen_re = r"\)"
    op_re = "|".join([re.escape(op.token_type) for op in _ops])
    num_re = r"[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?"
    whitespace_re = r"\s+"

    # Prefer long matches:
    variable_names = sorted(variable_names, key=len, reverse=True)
    variable_re = "|".join([re.escape(n) for n in variable_names])

    lexicon = [
        (lparen_re, _token_maker(Token.LPAREN, string)),
        (rparen_re, _token_maker(Token.RPAREN, string)),
        (op_re, _token_maker("__OP__", string)),
        (variable_re, _token_maker("VARIABLE", string)),
        (num_re, _token_maker("NUMBER", string)),
        (whitespace_re, None),
        ]

    scanner = Scanner(lexicon)
    tokens, leftover = scanner.scan(string)
    if leftover:
        offset = len(string) - len(leftover)
        raise PatsyError("unrecognized token in constraint",
                         Origin(string, offset, offset + 1))

    return tokens
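# The "prefer long matches" sort above matters because Python's regex
# alternation is ordered rather than longest-match: "a|aa" happily matches
# just "a" even when "aa" is present, which is exactly the confusion that
# the prefix check in test__tokenize_constraint guards against. For example:
import re

# With the short name first, "aa" would be tokenized as two "a" matches:
assert re.match("a|aa", "aa").group() == "a"
# Sorting the names longest-first fixes this:
assert re.match("aa|a", "aa").group() == "aa"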
def make_token(scanner, token_string):
    # `type` and `string` are free variables here: this is the inner
    # function of the `_token_maker` factory used in _tokenize_constraint's
    # lexicon above (see the sketch that follows).
    if type == "__OP__":
        actual_type = token_string
    else:
        actual_type = type
    return Token(actual_type,
                 Origin(string, *scanner.match.span()),
                 token_string)
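# For reference, a plausible shape for the enclosing factory (this
# reconstruction is an assumption based on how _token_maker is called in
# _tokenize_constraint above, not a verbatim copy of patsy's source):
def _token_maker(type, string):
    # Return a Scanner callback that turns each regex match into a Token,
    # using the match's span within `string` as its Origin.
    def make_token(scanner, token_string):
        if type == "__OP__":
            # Operator tokens use the matched text itself as their type.
            actual_type = token_string
        else:
            actual_type = type
        return Token(actual_type,
                     Origin(string, *scanner.match.span()),
                     token_string)
    return make_token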
def python_tokenize(code):
    # Since formulas can only contain Python expressions, and Python
    # expressions cannot meaningfully contain newlines, we'll just remove all
    # the newlines up front to avoid any complications:
    code = code.replace("\n", " ").strip()
    it = tokenize.generate_tokens(StringIO(code).readline)
    try:
        # The tokenizer's "line" field is rebound to `code` here; since the
        # input is a single physical line, it is the same string, so the
        # Origins below still refer to the full code.
        for (pytype, string, (_, start), (_, end), code) in it:
            if pytype == tokenize.ENDMARKER:
                break
            origin = Origin(code, start, end)
            assert pytype != tokenize.NL
            if pytype == tokenize.NEWLINE:
                assert string == ""
                continue
            if pytype == tokenize.ERRORTOKEN:
                raise PatsyError("error tokenizing input "
                                 "(maybe an unclosed string?)",
                                 origin)
            if pytype == tokenize.COMMENT:
                raise PatsyError("comments are not allowed", origin)
            yield (pytype, string, origin)
        else:  # pragma: no cover
            raise ValueError("stream ended without ENDMARKER?!?")
    except tokenize.TokenError as e:
        # TokenError is raised iff the tokenizer thinks that there is
        # some sort of multi-line construct in progress (e.g., an
        # unclosed parenthesis, which in Python lets a virtual line
        # continue past the end of the physical line), and it hits the
        # end of the source text. We have our own error handling for
        # such cases, so just treat this as an end-of-stream.
        #
        # Just in case someone adds some other error case:
        assert e.args[0].startswith("EOF in multi-line")
        return
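# The 5-tuples unpacked above come straight from the stdlib tokenizer: each
# is (type, string, (start_row, start_col), (end_row, end_col), line).
# Because the newlines were stripped first, everything lives on row 1 and
# only the column offsets are needed to build Origin objects. A quick look
# at the raw output (stdlib only, safe to run standalone):
import tokenize
from io import StringIO

for tok in tokenize.generate_tokens(StringIO("a + b").readline):
    # e.g. NAME 'a' starts at (1, 0) and ends at (1, 1)
    print(tokenize.tok_name[tok.type], repr(tok.string), tok.start, tok.end)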
def test_parse_origin():
    tree = parse_formula("a ~ b + c")
    assert tree.origin == Origin("a ~ b + c", 0, 9)
    assert tree.token.origin == Origin("a ~ b + c", 2, 3)
    assert tree.args[0].origin == Origin("a ~ b + c", 0, 1)
    assert tree.args[1].origin == Origin("a ~ b + c", 4, 9)
    assert tree.args[1].token.origin == Origin("a ~ b + c", 6, 7)
    assert tree.args[1].args[0].origin == Origin("a ~ b + c", 4, 5)
    assert tree.args[1].args[1].origin == Origin("a ~ b + c", 8, 9)
def _run_op(c):
    assert c.op_stack
    stackop = c.op_stack.pop()
    args = []
    for i in range(stackop.op.arity):
        args.append(c.noun_stack.pop())
    args.reverse()
    if c.trace:
        print("Reducing %r (%r)" % (stackop.op.token_type, args))
    node = ParseNode(stackop.op.token_type, stackop.token, args,
                     Origin.combine([stackop.token] + args))
    c.noun_stack.append(node)
def test_NAAction_raise():
    action = NAAction(on_NA="raise")

    # no-NA just passes through:
    in_arrs = [np.asarray([1.1, 1.2]), np.asarray([1, 2])]
    is_NAs = [np.asarray([False, False])] * 2
    got_arrs = action.handle_NA(in_arrs, is_NAs, [None, None])
    assert np.array_equal(got_arrs[0], in_arrs[0])
    assert np.array_equal(got_arrs[1], in_arrs[1])

    from patsy.origin import Origin
    o1 = Origin("asdf", 0, 1)
    o2 = Origin("asdf", 2, 3)

    # NA raises an error with a correct origin
    in_idx = np.arange(2)
    in_arrs = [np.asarray([1.1, 1.2]), np.asarray([1.0, np.nan])]
    is_NAs = [np.asarray([False, False]), np.asarray([False, True])]
    try:
        action.handle_NA(in_arrs, is_NAs, [o1, o2])
        assert False
    except PatsyError as e:
        assert e.origin is o2
def test_infix_parse():
    ops = [Operator("+", 2, 10),
           Operator("*", 2, 20),
           Operator("-", 1, 30)]
    atomic = ["ATOM1", "ATOM2"]
    # a + -b * (c + d)
    mock_origin = Origin("asdf", 2, 3)
    tokens = [Token("ATOM1", mock_origin, "a"),
              Token("+", mock_origin, "+"),
              Token("-", mock_origin, "-"),
              Token("ATOM2", mock_origin, "b"),
              Token("*", mock_origin, "*"),
              Token(Token.LPAREN, mock_origin, "("),
              Token("ATOM1", mock_origin, "c"),
              Token("+", mock_origin, "+"),
              Token("ATOM2", mock_origin, "d"),
              Token(Token.RPAREN, mock_origin, ")")]
    tree = infix_parse(tokens, ops, atomic)

    def te(tree, type, extra):
        assert tree.type == type
        assert tree.token.extra == extra
    te(tree, "+", "+")
    te(tree.args[0], "ATOM1", "a")
    assert tree.args[0].args == []
    te(tree.args[1], "*", "*")
    te(tree.args[1].args[0], "-", "-")
    assert len(tree.args[1].args[0].args) == 1
    te(tree.args[1].args[0].args[0], "ATOM2", "b")
    te(tree.args[1].args[1], "+", "+")
    te(tree.args[1].args[1].args[0], "ATOM1", "c")
    te(tree.args[1].args[1].args[1], "ATOM2", "d")

    import pytest
    # No ternary ops
    pytest.raises(ValueError,
                  infix_parse, [], [Operator("+", 3, 10)], ["ATOMIC"])

    # smoke test just to make sure there are no egregious bugs in 'trace'
    infix_parse(tokens, ops, atomic, trace=True)
def test_dmatrix_NA_action():
    data = {"x": [1, 2, 3, np.nan], "y": [np.nan, 20, 30, 40]}

    mat = dmatrix("x + y", data=data)
    assert np.array_equal(mat, [[1, 2, 20], [1, 3, 30]])

    assert_raises(PatsyError, dmatrix, "x + y", data=data,
                  NA_action="raise")

    lmat, rmat = dmatrices("y ~ x", data=data)
    assert np.array_equal(lmat, [[20], [30]])
def __init__(self):
    # You should check this using 'is', not '==':
    from patsy.origin import Origin
    self.origin = Origin("MOCK", 1, 2)
def test_python_tokenize():
    code = "a + (foo * -1)"
    tokens = list(python_tokenize(code))
    expected = [(tokenize.NAME, "a", Origin(code, 0, 1)),
                (tokenize.OP, "+", Origin(code, 2, 3)),
                (tokenize.OP, "(", Origin(code, 4, 5)),
                (tokenize.NAME, "foo", Origin(code, 5, 8)),
                (tokenize.OP, "*", Origin(code, 9, 10)),
                (tokenize.OP, "-", Origin(code, 11, 12)),
                (tokenize.NUMBER, "1", Origin(code, 12, 13)),
                (tokenize.OP, ")", Origin(code, 13, 14))]
    assert tokens == expected

    code2 = "a + (b"
    tokens2 = list(python_tokenize(code2))
    expected2 = [(tokenize.NAME, "a", Origin(code2, 0, 1)),
                 (tokenize.OP, "+", Origin(code2, 2, 3)),
                 (tokenize.OP, "(", Origin(code2, 4, 5)),
                 (tokenize.NAME, "b", Origin(code2, 5, 6))]
    assert tokens2 == expected2

    import pytest
    pytest.raises(PatsyError, list, python_tokenize("a b # c"))
    pytest.raises(PatsyError, list, python_tokenize("a b \"c"))
def test__tokenize_formula():
    # Note the extra whitespace inside foo(...): the expected origin offsets
    # below depend on the exact character positions.
    code = "y ~ a + (foo(b,c +   2)) + -1 + 0 + 10"
    tokens = list(_tokenize_formula(code, ["+", "-", "~"]))
    expecteds = [("PYTHON_EXPR", Origin(code, 0, 1), "y"),
                 ("~", Origin(code, 2, 3), None),
                 ("PYTHON_EXPR", Origin(code, 4, 5), "a"),
                 ("+", Origin(code, 6, 7), None),
                 (Token.LPAREN, Origin(code, 8, 9), None),
                 ("PYTHON_EXPR", Origin(code, 9, 23), "foo(b, c + 2)"),
                 (Token.RPAREN, Origin(code, 23, 24), None),
                 ("+", Origin(code, 25, 26), None),
                 ("-", Origin(code, 27, 28), None),
                 ("ONE", Origin(code, 28, 29), "1"),
                 ("+", Origin(code, 30, 31), None),
                 ("ZERO", Origin(code, 32, 33), "0"),
                 ("+", Origin(code, 34, 35), None),
                 ("NUMBER", Origin(code, 36, 38), "10"),
                 ]
    for got, expected in zip(tokens, expecteds):
        assert isinstance(got, Token)
        assert got.type == expected[0]
        assert got.origin == expected[1]
        assert got.extra == expected[2]