Example #1
def _make_token_maker(type, string):
    # Reconstructed factory (an assumption): `type` and `string` were free
    # variables in the original, presumably bound by an enclosing scope.
    def make_token(scanner, token_string):
        # "__OP__" means "the matched text is its own token type".
        if type == "__OP__":
            actual_type = token_string
        else:
            actual_type = type
        return Token(actual_type, Origin(string, *scanner.match.span()),
                     token_string)
    return make_token
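The scanner.match attribute suggests these callbacks plug into the stdlib's
re.Scanner, which sets scanner.match before invoking each lexicon action. A
minimal usage sketch under that assumption; the lexicon, the "NAME" token
type, and the sample input are made up, and the Token/Origin stubs are
hypothetical stand-ins for the real classes:

import re
from collections import namedtuple

# Hypothetical minimal stand-ins for the real Token and Origin types used
# throughout these examples.
Origin = namedtuple("Origin", ["code", "start", "end"])
Token = namedtuple("Token", ["type", "origin", "extra"], defaults=[None])
Token.LPAREN, Token.RPAREN = "LPAREN", "RPAREN"  # assumed class constants

string = "a + b"
scanner = re.Scanner([
    (r"\w+", _make_token_maker("NAME", string)),
    (r"[-+*/]", _make_token_maker("__OP__", string)),
    (r"\s+", None),  # None drops whitespace entirely
])
tokens, remainder = scanner.scan(string)
assert [t.type for t in tokens] == ["NAME", "+", "NAME"]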
Example #2
def _read_quoted_string(string, i):
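    # Read a quoted string starting at string[i]; return (token, index just
    # past the closing quote). Backticks quote attribute names; single and
    # double quotes mark string literals.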
    start = i
    quote_type = string[i]
    assert quote_type in "\"'`"
    chars = []
    i += 1
    while i < len(string):
        char = string[i]
        if char == quote_type:
            break
        elif char == "\\":
            # Consume the backslash
            i += 1
            if i >= len(string):
                break
            escaped_char = string[i]
            if escaped_char in "\"'`\\":
                chars.append(escaped_char)
            else:
                raise EventsError(
                    "unrecognized escape sequence \\%s" % (escaped_char, ),
                    Origin(string, i - 1, i + 1))
        else:
            chars.append(string[i])
        i += 1
    if i >= len(string):
        raise EventsError("unclosed string", Origin(string, start, i))
    assert string[i] == quote_type
    i += 1
    if quote_type == "`":
        token_type = "ATTR"
    else:
        token_type = "LITERAL"
    return Token(token_type, Origin(string, start, i), "".join(chars)), i
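Using the stub types from the sketch above, a sample call; EventsError is
again a hypothetical stand-in for the real exception class:

class EventsError(Exception):
    # Hypothetical stand-in; the real class presumably records the origin.
    def __init__(self, message, origin):
        super().__init__(message)
        self.origin = origin

# Backticks mark attribute names; the second return value is the index
# just past the closing quote.
token, i = _read_quoted_string("`trial_type` == 'standard'", 0)
assert token.type == "ATTR"
assert token.extra == "trial_type"
assert i == 12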
Example #3
def _tokenize_formula(code, operator_strings):
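    # Split a formula string into structural "magic" tokens (parentheses
    # and operators) and embedded Python expressions.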
    assert "(" not in operator_strings
    assert ")" not in operator_strings
    magic_token_types = {"(": Token.LPAREN,
                         ")": Token.RPAREN,
                         }
    for operator_string in operator_strings:
        magic_token_types[operator_string] = operator_string
    # Once we enter a Python expression, a ( does not end it, but any other
    # "magic" token does:
    end_tokens = set(magic_token_types)
    end_tokens.remove("(")
    
    it = PushbackAdapter(python_tokenize(code))
    for pytype, token_string, origin in it:
        if token_string in magic_token_types:
            yield Token(magic_token_types[token_string], origin)
        else:
            it.push_back((pytype, token_string, origin))
            yield _read_python_expr(it, end_tokens)
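Both _tokenize_formula and _read_python_expr (next example) lean on a
pushback buffer wrapped around python_tokenize. A minimal sketch of that
adapter, assuming only the push_back behavior used here; the real
PushbackAdapter may offer more:

class PushbackAdapter:
    # Wrap an iterator so callers can un-read items: next() drains the
    # pushed-back stack before advancing the underlying iterator.
    def __init__(self, it):
        self._it = iter(it)
        self._pushed = []

    def __iter__(self):
        return self

    def __next__(self):
        if self._pushed:
            return self._pushed.pop()
        return next(self._it)

    def push_back(self, obj):
        self._pushed.append(obj)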
Example #4
def _read_python_expr(it, end_tokens):
    # Read out a full python expression, stopping when we hit an
    # unnested end token.
    pytypes = []
    token_strings = []
    origins = []
    bracket_level = 0
    for pytype, token_string, origin in it:
        assert bracket_level >= 0
        if bracket_level == 0 and token_string in end_tokens:
            it.push_back((pytype, token_string, origin))
            break
        if token_string in ("(", "[", "{"):
            bracket_level += 1
        if token_string in (")", "]", "}"):
            bracket_level -= 1
        if bracket_level < 0:
            raise PatsyError("unmatched close bracket", origin)
        pytypes.append(pytype)
        token_strings.append(token_string)
        origins.append(origin)
    # Either we found an end_token, or we hit the end of the string
    if bracket_level == 0:
        expr_text = pretty_untokenize(zip(pytypes, token_strings))
        if expr_text == "0":
            token_type = "ZERO"
        elif expr_text == "1":
            token_type = "ONE"
        elif _is_a(int, expr_text) or _is_a(float, expr_text):
            token_type = "NUMBER"
        else:
            token_type = "PYTHON_EXPR"
        return Token(token_type, Origin.combine(origins), extra=expr_text)
    else:
        raise PatsyError("unclosed bracket in embedded Python "
                            "expression",
                            Origin.combine(origins))
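The _is_a helper is not shown; a plausible definition consistent with its
call sites, which ask whether a string parses as a given numeric type:

def _is_a(f, v):
    # True if the converter f (e.g. int or float) accepts the string v.
    try:
        f(v)
    except ValueError:
        return False
    else:
        return True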
Example #5
import re

def _tokenize(string):
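    # Hand-rolled scanner for the events query mini-language: parentheses,
    # quoted strings, numbers, identifiers/keywords, and punctuation
    # operators.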
    punct_op_tokens = [op.token_type for op in _punct_ops]
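    # Sort longest first so multi-character operators (e.g. "==") match
    # before their single-character prefixes.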
    punct_op_tokens.sort(key=len, reverse=True)
    text_op_tokens = [op.token_type for op in _text_ops]

    num_re = re.compile(r"[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?")
    # This works because \w matches underscore, letters, and digits.
    # But if a token starts with a digit, then it'll be caught by num_re above
    # first, so in fact this works like "[_a-z][_a-z0-9]*" except for being
    # unicode-enabled.
    ident_re = re.compile(r"\w+", re.IGNORECASE | re.UNICODE)
    whitespace_re = re.compile(r"\s+")

    i = 0
    while i < len(string):
        if "(" == string[i]:
            yield Token(Token.LPAREN, Origin(string, i, i + 1))
            i += 1
            continue
        if ")" == string[i]:
            yield Token(Token.RPAREN, Origin(string, i, i + 1))
            i += 1
            continue
        if string[i] in "\"'`":
            token, i = _read_quoted_string(string, i)
            yield token
            continue
        match = num_re.match(string, i)
        if match is not None:
            try:
                value = int(match.group())
            except ValueError:
                value = float(match.group())
            yield Token("LITERAL", Origin(string, *match.span()), value)
            i = match.end()
            continue
        match = ident_re.match(string, i)
        if match is not None:
            token = match.group()
            origin = Origin(string, *match.span())
            if token in text_op_tokens:
                yield Token(token, origin)
            elif token.lower() == "true":
                yield Token("LITERAL", origin, True)
            elif token.lower() == "false":
                yield Token("LITERAL", origin, False)
            elif token.lower() == "none":
                yield Token("LITERAL", origin, None)
            elif token in _magic_query_strings:
                yield Token("MAGIC_FIELD", origin, token)
            elif token == "_RECSPAN_INFO":
                yield Token("_RECSPAN_INFO", origin, token)
            else:
                yield Token("ATTR", origin, token)
            i = match.end()
            continue
        match = whitespace_re.match(string, i)
        if match is not None:
            i = match.end()
            continue
        for punct_token in punct_op_tokens:
            if string[i:i + len(punct_token)] == punct_token:
                yield Token(punct_token, Origin(string, i,
                                                i + len(punct_token)))
                i += len(punct_token)
                break
        else:
            raise EventsError("unrecognized token", Origin(string, i, i + 1))