def __init__(self):
    """
    Class initializer.
    """
    # Call the superclass initializer
    Parser.__init__(self)

    # Set the containers we're going to recognize
    self._containers = [('(', ')'), ('{', '}'), ('[', ']')]

    # Define our token pattern matches
    tmat = self._token_matches

    # Comments come first, since they override everything else. Note
    # that the (?s) flag, which lets '.' span newlines inside
    # /* ... */ comments, must appear at the start of the pattern.
    tmat.append(TokenMatch(re.compile(r"(?s)(/\*).*?(\*/)"),
                           [CommentToken]))
    tmat.append(TokenMatch(re.compile(r"//.*"),
                           [CommentToken]))

    # Preprocessor directives: match a '#' and everything after it,
    # stopping before the start of any comment.
    tmat.append(TokenMatch(re.compile(r"#(?:(?!/\*|//).)*"),
                           [PreprocToken]))

    # Strings, then plain identifiers
    tmat.append(TokenMatch(re.compile(r'''(["/']).*?(?<!\\)(\\\\)*\1'''),
                           [StringToken]))
    tmat.append(TokenMatch(re.compile(r"[a-zA-Z_][\w]*"),
                           [IdentifierToken]))
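
# A minimal, hedged sketch (not the real Parser/TokenMatch machinery, whose
# definitions live outside this section) of how an ordered pattern list like
# the one built above can drive tokenization: at each position, the first
# pattern that matches wins, which is why the comment patterns are registered
# before the string pattern. All names below are illustrative stand-ins.
def _demo_first_match_wins():
    import re
    from collections import namedtuple

    Match = namedtuple("Match", ["pattern", "token_type"])  # hypothetical stand-in
    matches = [
        Match(re.compile(r"(?s)/\*.*?\*/"), "Comment"),
        Match(re.compile(r"//.*"), "Comment"),
        Match(re.compile(r"[a-zA-Z_]\w*"), "Identifier"),
    ]

    def tokenize(text):
        pos = 0
        while pos < len(text):
            for tm in matches:
                m = tm.pattern.match(text, pos)
                if m:
                    yield tm.token_type, m.group(0)
                    pos = m.end()
                    break
            else:
                # Nothing matched; emit the character unchanged and move on.
                yield "Text", text[pos]
                pos += 1

    return list(tokenize("x /* block */ // trailing"))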
def __init__(self):
    """
    Class initializer.
    """
    # Call the superclass initializer
    Parser.__init__(self)

    # Set the containers we're going to recognize
    self._containers = [('(', ')'), ('{', '}'), ('[', ']')]

    # Define our token pattern matches
    tmat = self._token_matches

    # Search order matters here. Patterns will be matched in the
    # order in which they appear in this list. Look for comments
    # first, as they override everything, then look for multi-line
    # strings, then strings, then backticks. Backticks are
    # deprecated and were removed in Python 3, but are retained
    # here for backwards compatibility.
    tmat.append(TokenMatch(re.compile(r"#.*"),
                           [CommentToken]))
    tmat.append(TokenMatch(re.compile(r"r?([\"\']{3})[^\1]*?" +
                                      r"(?<!\\)(\\\\)*\1"),
                           [MLStringToken]))
    tmat.append(TokenMatch(re.compile(r'''r?(["']).*?(?<!\\)(\\\\)*\1'''),
                           [StringToken]))
    tmat.append(TokenMatch(re.compile(r"r?([`]).*?(?<!\\)(\\\\)*\1"),
                           [BacktickToken]))

    # Decorators and definitions go next. Note that we include
    # periods within the match for a decorator, unlike for regular
    # identifiers.
    tmat.append(TokenMatch(re.compile(r"@[a-zA-Z_][\w\.]*"),
                           [DecoratorToken]))
    tmat.append(TokenMatch(re.compile(r"(def)(\s+)([a-zA-Z_][\w]*)"),
                           [KeywordToken, WhitespaceToken, DefinitionToken]))
    tmat.append(TokenMatch(re.compile(r"(class)(\s+)([a-zA-Z_][\w]*)"),
                           [KeywordToken, WhitespaceToken, DefinitionToken]))

    # Match regular identifiers next. First check whether an
    # identifier is preceded by a period, since if it is, we should
    # not treat it as a builtin or a keyword. If it is not, label it
    # a FirstIdentifierToken, and _get_token() will replace it with
    # a KeywordToken or a BuiltinToken if necessary.
    tmat.append(TokenMatch(re.compile(r"(\.)([a-zA-Z_][\w]*)"),
                           [SeparatorToken, IdentifierToken]))
    tmat.append(TokenMatch(re.compile(r"[a-zA-Z_][\w]*"),
                           [FirstIdentifierToken]))

    # Match numbers. Start with floats which begin with a digit
    # rather than a period, then floats which begin with a period.
    # Note that we cannot make the digits both before and after the
    # period optional in the same regular expression, or a plain
    # period would match as a float.
    #
    # Then match integers. Include an optional j on the end to
    # catch complex numbers. Note that, currently, the real and
    # imaginary parts of a complex number are captured as two
    # separate numbers with an operator between them, which is
    # probably not ideal and may change in the future.
    tmat.append(TokenMatch(re.compile(r"[0-9]+[\.][0-9]*((e|E)[\+\-]?" +
                                      r"[0-9]+)?(J|j)?"),
                           [FloatToken]))
    tmat.append(TokenMatch(re.compile(r"[\.][0-9]+((e|E)[\+\-]?" +
                                      r"[0-9]+)?(j|J)?"),
                           [FloatToken]))
    tmat.append(TokenMatch(re.compile(r"(0x)?[0-9]+(L|l)?(J|j)?"),
                           [IntegerToken]))

    # Look for augmented-assignment delimiter tokens, except the
    # regular '=' operator
    tmat.append(TokenMatch(re.compile(r"(\+=|\-=|\*=|/=|%=|//=|\*\*=)"),
                           [DelimiterToken]))

    # Look for multi-character operators
    tmat.append(TokenMatch(re.compile(r"(\*\*|<<|>>|<=|>=|<>|==|!=|//)"),
                           [OperatorToken]))

    # Look for the '=' operator only after matching any
    # multi-character operators; in particular, we would never
    # match the '==' operator if we looked for '=' first
    tmat.append(TokenMatch(re.compile(r"="),
                           [DelimiterToken]))

    # Look for single-character operators
    tmat.append(TokenMatch(re.compile(r"[\+\*\-\/%~&\^\|<>]"),
                           [OperatorToken]))

    # Finally, look for single-character separators
    tmat.append(TokenMatch(re.compile(r"[,:\.]"),
                           [SeparatorToken]))
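
# A standalone, hedged sanity check (independent of the class above, using
# copies of the patterns it registers) that the ordering noted in the comments
# behaves as intended: '==' must be tried before '=', and a lone period must
# fall through to the separator pattern rather than matching as a float.
def _demo_pattern_ordering():
    import re

    multi_op = re.compile(r"(\*\*|<<|>>|<=|>=|<>|==|!=|//)")
    equals = re.compile(r"=")
    float_lead = re.compile(r"[0-9]+[\.][0-9]*((e|E)[\+\-]?[0-9]+)?(J|j)?")
    float_trail = re.compile(r"[\.][0-9]+((e|E)[\+\-]?[0-9]+)?(j|J)?")
    separator = re.compile(r"[,:\.]")

    assert multi_op.match("==").group(0) == "=="   # found before '=' is tried
    assert equals.match("=").group(0) == "="
    assert float_lead.match("1.5e-3j").group(0) == "1.5e-3j"
    assert float_trail.match(".25").group(0) == ".25"
    assert float_lead.match(".") is None           # a bare '.' is not a float
    assert float_trail.match(".") is None
    assert separator.match(".").group(0) == "."    # ...it is a separator
    return True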