def _read_quoted_string(string, i):
    """Lex a quoted string starting at index ``i`` of ``string``.

    ``string[i]`` must be one of ``"``, ``'``, or a backtick.  Backtick
    quoting produces an ATTR token; the other two produce a LITERAL token.
    Only the three quote characters and backslash may be backslash-escaped.

    Returns a ``(token, next_index)`` pair.  Raises EventsError for an
    unrecognized escape sequence or an unterminated string.
    """
    start = i
    quote = string[i]
    assert quote in "\"'`"
    pieces = []
    i += 1
    while i < len(string):
        c = string[i]
        if c == quote:
            break
        if c == "\\":
            # Step past the backslash to the character it escapes.
            i += 1
            if i >= len(string):
                # Trailing backslash: fall through to the unclosed-string
                # error below.
                break
            escaped = string[i]
            if escaped not in "\"'`\\":
                raise EventsError(
                    "unrecognized escape sequence \\%s" % (escaped,),
                    Origin(string, i - 1, i))
            pieces.append(escaped)
        else:
            pieces.append(c)
        i += 1
    if i >= len(string):
        raise EventsError("unclosed string", Origin(string, start, i))
    assert string[i] == quote
    i += 1  # consume the closing quote
    token_type = "ATTR" if quote == "`" else "LITERAL"
    return Token(token_type, Origin(string, start, i), "".join(pieces)), i
def test_issue_11():
    """Regression test: a level mismatch between the original data and new
    data must raise a PatsyError whose origin points at the offending
    factor ("C(X)") in the formula.

    (At some points we've failed to put an origin= on these errors.)
    """
    env = EvalEnvironment.capture()
    data = {"X": [0, 1, 2, 3], "Y": [1, 2, 3, 4]}
    formula = "C(X) + Y"
    new_data = {"X": [0, 0, 1, 2, 3, 3, 4], "Y": [1, 2, 3, 4, 5, 6, 7]}
    info = dmatrix(formula, data)
    try:
        build_design_matrices([info.design_info.builder], new_data)
    # Fix: Python 2 "except PatsyError, e:" is a SyntaxError on Python 3.
    except PatsyError as e:
        # "C(X)" spans characters 0-4 of the formula.
        assert e.origin == Origin(formula, 0, 4)
    else:
        # Fix: previously the test silently passed when no error was
        # raised at all, which defeats the regression check.
        assert False, "expected PatsyError for level mismatch"
def _tokenize(string):
    """Generate Token objects lexed from ``string``.

    Recognizes, in order of priority at each position: parentheses, quoted
    strings/backtick attributes, numeric literals, identifiers (text
    operators, True/False/None, magic query fields, _RECSPAN_INFO, or plain
    ATTRs), whitespace (skipped), and punctuation operators (longest match
    first).  Raises EventsError at the first unrecognizable character.
    """
    # Longest punctuation tokens first, so e.g. "==" wins over "=".
    punct_tokens = sorted((op.token_type for op in _punct_ops),
                          key=len, reverse=True)
    text_tokens = [op.token_type for op in _text_ops]
    num_re = re.compile(r"[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?")
    # \w matches underscore, letters, and digits; anything starting with a
    # digit is claimed by num_re first, so this effectively behaves like
    # [_a-z][_a-z0-9]* while staying unicode-enabled.
    ident_re = re.compile(r"\w+", re.IGNORECASE | re.UNICODE)
    whitespace_re = re.compile(r"\s+")
    i = 0
    while i < len(string):
        c = string[i]
        if c == "(":
            yield Token(Token.LPAREN, Origin(string, i, i + 1))
            i += 1
            continue
        if c == ")":
            yield Token(Token.RPAREN, Origin(string, i, i + 1))
            i += 1
            continue
        if c in "\"'`":
            token, i = _read_quoted_string(string, i)
            yield token
            continue
        match = num_re.match(string, i)
        if match is not None:
            text = match.group()
            # Prefer an int literal; fall back to float when int() rejects
            # the text (decimal point or exponent present).
            try:
                value = int(text)
            except ValueError:
                value = float(text)
            yield Token("LITERAL", Origin(string, *match.span()), value)
            i = match.end()
            continue
        match = ident_re.match(string, i)
        if match is not None:
            word = match.group()
            origin = Origin(string, *match.span())
            lowered = word.lower()
            if word in text_tokens:
                yield Token(word, origin)
            elif lowered == "true":
                yield Token("LITERAL", origin, True)
            elif lowered == "false":
                yield Token("LITERAL", origin, False)
            elif lowered == "none":
                yield Token("LITERAL", origin, None)
            elif word in _magic_query_strings:
                yield Token("MAGIC_FIELD", origin, word)
            elif word == "_RECSPAN_INFO":
                yield Token("_RECSPAN_INFO", origin, word)
            else:
                yield Token("ATTR", origin, word)
            i = match.end()
            continue
        match = whitespace_re.match(string, i)
        if match is not None:
            i = match.end()
            continue
        for punct in punct_tokens:
            if string.startswith(punct, i):
                yield Token(punct, Origin(string, i, i + len(punct)))
                i += len(punct)
                break
        else:
            raise EventsError("unrecognized token",
                              Origin(string, i, i + 1))