def make_token(scanner, token_string):
    # NB: `type` and `string` are free variables here; as written, this
    # callback only works when both are bound in an enclosing scope (see
    # the sketch below).
    if type == "__OP__":
        actual_type = token_string
    else:
        actual_type = type
    return Token(actual_type,
                 Origin(string, *scanner.match.span()),
                 token_string)
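
# Hedged sketch: make_token presumably gets its `type` and `string` bindings
# from an enclosing factory wired up as an re.Scanner callback. The factory
# name below is hypothetical and for illustration only; it is not part of
# this module's API.
def _make_token_maker(type, string):
    def make_token(scanner, token_string):
        if type == "__OP__":
            actual_type = token_string
        else:
            actual_type = type
        return Token(actual_type,
                     Origin(string, *scanner.match.span()),
                     token_string)
    return make_token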

def _read_quoted_string(string, i):
    start = i
    quote_type = string[i]
    assert quote_type in "\"'`"
    chars = []
    i += 1
    while i < len(string):
        char = string[i]
        if char == quote_type:
            break
        elif char == "\\":
            # Consume the backslash
            i += 1
            if i >= len(string):
                break
            escaped_char = string[i]
            if escaped_char in "\"'`\\":
                chars.append(escaped_char)
            else:
                raise EventsError("unrecognized escape sequence \\%s"
                                  % (escaped_char,),
                                  Origin(string, i - 1, i))
        else:
            chars.append(string[i])
        i += 1
    if i >= len(string):
        raise EventsError("unclosed string", Origin(string, start, i))
    assert string[i] == quote_type
    i += 1
    if quote_type == "`":
        token_type = "ATTR"
    else:
        token_type = "LITERAL"
    return Token(token_type, Origin(string, start, i), "".join(chars)), i
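
# A minimal usage sketch for _read_quoted_string; illustration only, not part
# of this module's API. It assumes Token exposes the .type/.extra attributes
# that the rest of this file relies on.
def _demo_read_quoted_string():
    token, end = _read_quoted_string("'don\\'t' < 3", 0)
    assert token.type == "LITERAL"   # backtick quotes would give "ATTR"
    assert token.extra == "don't"    # escape sequences are resolved
    assert end == 8                  # index just past the closing quote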

def _tokenize_formula(code, operator_strings):
    assert "(" not in operator_strings
    assert ")" not in operator_strings
    magic_token_types = {"(": Token.LPAREN,
                         ")": Token.RPAREN,
                         }
    for operator_string in operator_strings:
        magic_token_types[operator_string] = operator_string
    # Once we enter a Python expression, a ( does not end it, but any other
    # "magic" token does:
    end_tokens = set(magic_token_types)
    end_tokens.remove("(")
    it = PushbackAdapter(python_tokenize(code))
    for pytype, token_string, origin in it:
        if token_string in magic_token_types:
            yield Token(magic_token_types[token_string], origin)
        else:
            it.push_back((pytype, token_string, origin))
            yield _read_python_expr(it, end_tokens)
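
# Hedged usage sketch for _tokenize_formula. The operator strings below
# ("~", "+") are made-up examples, not a claim about the grammar registered
# elsewhere; illustration only.
def _demo_tokenize_formula():
    for token in _tokenize_formula("y ~ a + np.log(x)", ["~", "+"]):
        # Expected stream: PYTHON_EXPR "y", "~", PYTHON_EXPR "a", "+",
        # PYTHON_EXPR "np.log(x)" -- the parenthesis inside the call does
        # not terminate the embedded expression, because "(" was removed
        # from end_tokens.
        print(token)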

def _read_python_expr(it, end_tokens):
    # Read out a full python expression, stopping when we hit an
    # unnested end token.
    pytypes = []
    token_strings = []
    origins = []
    bracket_level = 0
    for pytype, token_string, origin in it:
        assert bracket_level >= 0
        if bracket_level == 0 and token_string in end_tokens:
            it.push_back((pytype, token_string, origin))
            break
        if token_string in ("(", "[", "{"):
            bracket_level += 1
        if token_string in (")", "]", "}"):
            bracket_level -= 1
        if bracket_level < 0:
            raise PatsyError("unmatched close bracket", origin)
        pytypes.append(pytype)
        token_strings.append(token_string)
        origins.append(origin)
    # Either we found an end_token, or we hit the end of the string
    if bracket_level == 0:
        expr_text = pretty_untokenize(zip(pytypes, token_strings))
        if expr_text == "0":
            token_type = "ZERO"
        elif expr_text == "1":
            token_type = "ONE"
        elif _is_a(int, expr_text) or _is_a(float, expr_text):
            token_type = "NUMBER"
        else:
            token_type = "PYTHON_EXPR"
        return Token(token_type, Origin.combine(origins), extra=expr_text)
    else:
        raise PatsyError("unclosed bracket in embedded Python "
                         "expression", Origin.combine(origins))
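
# Hedged sketch of how _read_python_expr classifies what it reads: "0" and
# "1" get dedicated token types, other numeric literals become NUMBER, and
# everything else is a generic PYTHON_EXPR. Illustration only; it assumes the
# module's python_tokenize/PushbackAdapter behave as their callers above
# expect.
def _demo_read_python_expr():
    it = PushbackAdapter(python_tokenize("np.log(x) + 1"))
    token = _read_python_expr(it, end_tokens={"+"})
    assert token.type == "PYTHON_EXPR" and token.extra == "np.log(x)"
    for _pytype, token_string, _origin in it:
        assert token_string == "+"   # consume the end token we stopped at
        break
    token = _read_python_expr(it, end_tokens={"+"})
    assert token.type == "ONE" and token.extra == "1"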

def _tokenize(string):
    punct_op_tokens = [op.token_type for op in _punct_ops]
    # Sort longest-first so e.g. "==" is tried before "=":
    punct_op_tokens.sort(key=len, reverse=True)
    text_op_tokens = [op.token_type for op in _text_ops]

    num_re = re.compile(r"[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?")
    # This works because \w matches underscore, letters, and digits.
    # But if a token starts with a digit, then it'll be caught by num_re above
    # first, so in fact this works like "[_a-z][_a-z0-9]*" except for being
    # unicode-enabled.
    ident_re = re.compile(r"\w+", re.IGNORECASE | re.UNICODE)
    whitespace_re = re.compile(r"\s+")

    i = 0
    while i < len(string):
        if "(" == string[i]:
            yield Token(Token.LPAREN, Origin(string, i, i + 1))
            i += 1
            continue
        if ")" == string[i]:
            yield Token(Token.RPAREN, Origin(string, i, i + 1))
            i += 1
            continue
        if string[i] in "\"'`":
            token, i = _read_quoted_string(string, i)
            yield token
            continue
        match = num_re.match(string, i)
        if match is not None:
            try:
                value = int(match.group())
            except ValueError:
                value = float(match.group())
            yield Token("LITERAL", Origin(string, *match.span()), value)
            i = match.end()
            continue
        match = ident_re.match(string, i)
        if match is not None:
            token = match.group()
            origin = Origin(string, *match.span())
            if token in text_op_tokens:
                yield Token(token, origin)
            elif token.lower() == "true":
                yield Token("LITERAL", origin, True)
            elif token.lower() == "false":
                yield Token("LITERAL", origin, False)
            elif token.lower() == "none":
                yield Token("LITERAL", origin, None)
            elif token in _magic_query_strings:
                yield Token("MAGIC_FIELD", origin, token)
            elif token == "_RECSPAN_INFO":
                yield Token("_RECSPAN_INFO", origin, token)
            else:
                yield Token("ATTR", origin, token)
            i = match.end()
            continue
        match = whitespace_re.match(string, i)
        if match is not None:
            i = match.end()
            continue
        for punct_token in punct_op_tokens:
            if string[i:i + len(punct_token)] == punct_token:
                yield Token(punct_token,
                            Origin(string, i, i + len(punct_token)))
                i += len(punct_token)
                break
        else:
            raise EventsError("unrecognized token", Origin(string, i, i + 1))
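
# Hedged end-to-end sketch of _tokenize. It assumes "and" is registered in
# _text_ops and "<" / "==" in _punct_ops elsewhere in this module; the query
# string itself is made up for illustration.
def _demo_tokenize():
    for token in _tokenize("rt < 300 and condition == 'target'"):
        # Expected stream: ATTR "rt", "<", LITERAL 300, "and",
        # ATTR "condition", "==", LITERAL "target"
        print(token)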