#!/usr/bin/python
"""
parse_cpython.py
"""
from __future__ import print_function

import errno
import os
import re
import sys

from core.util import log
from frontend.lexer import C, R


# Token table for scanning C source that declares PyMethodDef arrays.
#
# Each entry pairs a pattern with the token name it produces.  The C(...)
# entries look like literal strings (e.g. 'PyDoc_STR(' would not be a valid
# regex) and the R(...) entries like regular expressions -- confirm against
# frontend.lexer.  Entry order is kept exactly as it was, in case the matcher
# is order-sensitive.
C_DEF = [
    # '#' through the end of the line.
    R(r'#.*', 'Comment'),
    # Runs of spaces, tabs and newlines.
    R(r'[ \t\n]+', 'Whitespace'),

    # Start of a method table; group 1 captures the array's name.
    # This could be more space-insensitive.
    R(r'static.*PyMethodDef (.*)\[\] = ', 'BeginDef'),

    # Punctuation inside the table.
    C(r'{', 'LBrace'),
    C(r'}', 'RBrace'),
    C(r',', 'Comma'),
    C(r';', 'Semi'),

    # Double-quoted string without embedded quotes; group 1 is the contents.
    R(r'"([^"]*)"', 'Str'),
    C(r'FILE', 'FILE'),

    # Opening and closing of a PyDoc_STR(...) wrapper.
    C(r'PyDoc_STR(', 'LDocStr'),
    C(r')', 'RDocStr'),

    # Anything else, up to the next comma, closing brace or newline.
    R(r'[^,}\n]+', 'Opaque'),
]
# Lexer-state design notes, followed by shared rule fragments: _SIGNIFICANT_SPACE (runs of space, tab or CR), _BACKSLASH (backslash + any char other than newline/NUL is an escaped char; backslash-newline is a line continuation) and VAR_NAME_RE (identifier pattern reused by the $name rule in _VARS).
# EXPRESSION (takes place of ARITH, VSub_ArgUnquoted, VSub_ArgDQ) # SQ RAW_SQ DQ RAW_DQ # VS -- a single state here? Or switches into expression state, because } # is an operator # Problem: DICT_KEY might be a different state, to accept either a bare word # foo, or an expression (X=a+2), which is allowed in shell. Python doesn't # allow unquoted words, but we want to. # TODO: There are 4 shared groups here. I think you should test if that # structure should be preserved through re2c. Do a benchmark. # # If a group has no matches, then return Id.Unknown_Tok? And then you can # chain the groups in order. It might make sense to experiment with the order # too. _SIGNIFICANT_SPACE = R(r'[ \t\r]+', Id.WS_Space) _BACKSLASH = [ R(r'\\[^\n\0]', Id.Lit_EscapedChar), C('\\\n', Id.Ignored_LineCont), ] VAR_NAME_RE = r'[a-zA-Z_][a-zA-Z0-9_]*' # All Kind.VSub _VARS = [ # Unbraced variables R(r'\$' + VAR_NAME_RE, Id.VSub_DollarName), R(r'\$[0-9]', Id.VSub_Number), C(r'$!', Id.VSub_Bang), C(r'$@', Id.VSub_At),
# Lexer-state design notes, followed by shared rule fragments: _BACKSLASH (backslash + any char other than newline/NUL is an escaped char; backslash-newline is a line continuation) and VAR_NAME_RE (identifier pattern reused by the $name rule in _VARS).
# SQ RAW_SQ DQ RAW_DQ # VS -- a single state here? Or switches into expression state, because } # is an operator # Problem: DICT_KEY might be a different state, to accept either a bare word # foo, or an expression (X=a+2), which is allowed in shell. Python doesn't # allow unquoted words, but we want to. # TODO: There are 4 shared groups here. I think you should test if that # structure should be preserved through re2c. Do a benchmark. # # If a group has no matches, then return Id.Unknown_Tok? And then you can # chain the groups in order. It might make sense to experiment with the order # too. _BACKSLASH = [ R(r'\\[^\n\0]', Id.Lit_EscapedChar), C('\\\n', Id.Ignored_LineCont), ] VAR_NAME_RE = r'[a-zA-Z_][a-zA-Z0-9_]*' # All Kind.VSub _VARS = [ # Unbraced variables R(r'\$' + VAR_NAME_RE, Id.VSub_DollarName), R(r'\$[0-9]', Id.VSub_Number), C(r'$!', Id.VSub_Bang), C(r'$@', Id.VSub_At), C(r'$#', Id.VSub_Pound), C(r'$$', Id.VSub_Dollar), C(r'$*', Id.VSub_Star),