def regex_tokenizer(self, regex, ignore_white_space):
    d = {term.Name: term for term in self.G.terminals}
    tokens = []
    symbol_term = d['symbol']
    fixed_tokens = {
        tok.Name: Token(tok.Name, tok)
        for tok in [d['|'], d['*'], d['+'], d['?'], d['('], d[')'],
                    d['['], d[']'], d['-'], d['ε']]
    }
    for i, c in enumerate(regex):
        if c == '@' or (ignore_white_space and c.isspace()):
            continue
        try:
            token = fixed_tokens[c]
            # '@' escapes the next character: force it to be a plain symbol
            if i > 0 and regex[i - 1] == '@':
                raise KeyError
        except KeyError:
            token = Token(c, symbol_term)
        tokens.append(token)
    tokens.append(Token('$', self.G.EOF))
    return tokens
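# The '@' escape above is easy to misread, so here is a self-contained
# sketch of the idea: '@' turns the following metacharacter into a plain
# symbol. `META` and the (kind, char) tuples are illustrative stand-ins
# for the grammar terminals the method actually uses.
META = set("|*+?()[]-")

def sketch_regex_tokenize(regex):
    out, escaped = [], False
    for c in regex:
        if c == '@' and not escaped:
            escaped = True  # escape the next character
            continue
        kind = 'op' if (c in META and not escaped) else 'symbol'
        out.append((kind, c))
        escaped = False
    return out

assert sketch_regex_tokenize("a@*b*") == [
    ('symbol', 'a'), ('symbol', '*'), ('symbol', 'b'), ('op', '*')]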
def preprocessor(self):
    """Preprocessor directives start with a '#' and run to the end of
    the line, unless the newline is escaped or a comment starts.
    """
    pos = self.line_pos()
    tkn_value = ""
    while self.peek_char():
        if self.peek_sub_string(2) == "\\\n":
            # escaped newline: the directive continues on the next line
            tkn_value += self.peek_sub_string(2)
            self.pop_char()
            self.pop_char()
            self.__line += 1
            self.__line_pos = 1
            continue
        tkn_value += self.peek_char()
        self.pop_char()
        if self.peek_sub_string(2) in ["//", "/*"] \
                or self.peek_char() == '\n':
            break
    if len(tkn_value) <= 1:
        raise TokenError(self.line_pos())
    tkn_key = tkn_value[1:].split()[0]
    if tkn_key not in preproc_keywords:
        # '#include<foo.h>' needs no space after the keyword, so fold any
        # 'include...' key into the generic 'include' entry
        if tkn_key.startswith('include'):
            tkn_key = 'include'
        else:
            raise TokenError(self.line_pos())
    self.tokens.append(Token(
        preproc_keywords.get(tkn_key), pos, tkn_value))
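# A quick, standalone illustration of the directive-key extraction above:
# the token text starts at '#', and the key is the first whitespace-
# delimited word after it. The last case shows why the 'include' fold is
# needed: with no space before the header name, the raw key is not a
# known keyword on its own.
for line, expected in [("#include <stdio.h>", "include"),
                       ("#define X 1", "define"),
                       ("#include<stdio.h>", "include<stdio.h>")]:
    assert line[1:].split()[0] == expected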
def _add_dedent(self, token):
    """Split a DEDENT token whose value is a count into that many
    individual DEDENT tokens.
    """
    for _ in range(token.value):
        self.token_stream.append(
            Token(tokens.DEDENT, None, token.line, token.column))
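# A minimal runnable sketch of the expansion above; `Tok` stands in for
# the project's Token class (an assumption, not the real API).
from collections import namedtuple

Tok = namedtuple('Tok', 'type value line column')

def expand_dedent(stream, token):
    # one counted DEDENT becomes token.value individual DEDENT tokens
    for _ in range(token.value):
        stream.append(Tok('DEDENT', None, token.line, token.column))

stream = []
expand_dedent(stream, Tok('DEDENT', 3, 10, 0))
assert len(stream) == 3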
def string(self):
    """String constants can contain any character except unescaped
    newlines. An unclosed string or an unescaped newline is a fatal
    error, and parsing will stop here.
    """
    pos = self.line_pos()
    tkn_value = ""
    if self.peek_char() == 'L':  # wide-string literal prefix
        tkn_value += self.peek_char()
        self.pop_char()
    tkn_value += self.peek_char()  # opening quote
    self.pop_char()
    while self.peek_char() is not None:
        if self.peek_char() == '\\':
            # escape sequence: consume the backslash and the escaped
            # character so an escaped quote cannot close the string
            tkn_value += self.peek_char()
            self.pop_char()
            if self.peek_char() is None:
                raise TokenError(pos)
            if self.peek_char() == '\n':
                self.__line += 1
                self.__line_pos = 1
            tkn_value += self.peek_char()
            self.pop_char()
            continue
        if self.peek_char() == '\n':
            raise TokenError(pos)
        tkn_value += self.peek_char()
        if self.peek_char() == '"':
            self.pop_char()
            break
        self.pop_char()
    else:
        raise TokenError(pos)
    self.tokens.append(Token("STRING", pos, tkn_value))
def char_constant(self):
    """Char constants follow much the same rules as string constants."""
    pos = self.line_pos()
    tkn_value = '\''
    self.pop_char()
    while self.peek_char():
        if self.peek_char() == '\n':
            self.pop_char()
            self.tokens.append(Token("TKN_ERROR", pos))
            return
        if self.peek_char() == '\\':
            # escape sequence: also consume the escaped character so an
            # escaped quote cannot close the constant
            tkn_value += self.peek_char()
            self.pop_char()
            if self.peek_char() is None:
                break
            tkn_value += self.peek_char()
            self.pop_char()
            continue
        tkn_value += self.peek_char()
        if self.peek_char() == '\'':
            self.pop_char()
            self.tokens.append(Token("CHAR_CONST", pos, tkn_value))
            return
        self.pop_char()
    raise TokenError(pos)
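# Both quote handlers above share one subtlety: a backslash consumes the
# next character, so an escaped quote can never terminate the literal.
# A standalone illustration (not the lexer's actual scanning loop):
def find_closing_quote(src, quote):
    i = 1  # src[0] is the opening quote
    while i < len(src):
        if src[i] == "\\":
            i += 2  # skip the whole escape sequence
            continue
        if src[i] == quote:
            return i
        i += 1
    return -1  # unclosed literal

assert find_closing_quote('"a\\"b"', '"') == 5
assert find_closing_quote("'\\''", "'") == 3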
def identifier(self):
    """Identifiers can start with any letter [a-z][A-Z] or an underscore,
    and contain letters [a-z][A-Z], digits [0-9], or underscores.
    """
    pos = self.line_pos()
    tkn_value = ""
    while self.peek_char() and \
            (self.peek_char() in string.ascii_letters + string.digits + "_"
                or self.peek_sub_string(2) == "\\\n"):
        if self.peek_sub_string(2) == "\\\n":
            # escaped newline inside an identifier: skip both characters
            self.pop_char()
            self.pop_char()
            self.__line += 1
            self.__line_pos = 1
            continue
        tkn_value += self.peek_char()
        self.pop_char()
    if tkn_value in keywords:
        self.tokens.append(Token(keywords[tkn_value], pos))
    else:
        self.tokens.append(Token("IDENTIFIER", pos, tkn_value))
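# The identifier rule above, restated as a tiny standalone predicate for
# reference (the real method also tolerates escaped newlines):
import string

def is_identifier(word):
    if not word or word[0] not in string.ascii_letters + "_":
        return False
    return all(c in string.ascii_letters + string.digits + "_"
               for c in word)

assert is_identifier("_foo42")
assert not is_identifier("42foo")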
def comment(self):
    """Comments are anything after '//', up to a newline or the end of
    the file.
    """
    pos = self.line_pos()
    tkn_value = "//"
    self.pop_char()
    self.pop_char()
    while self.peek_char() is not None:
        if self.peek_char() == '\n':
            break
        tkn_value += self.peek_char()
        self.pop_char()
    self.tokens.append(Token("COMMENT", pos, tkn_value))
def operator(self):
    """Operators can be made of one or more signs, so the longest
    operators must be looked up first in order to avoid false positives,
    e.g. '>>' being read as two 'MORE_THAN' operators instead of one
    'RIGHT_SHIFT' operator.
    """
    pos = self.line_pos()
    if self.peek_char() in ".+-*/%<>^&|!=":
        if self.peek_sub_string(3) in [">>=", "<<=", "..."]:
            self.tokens.append(Token(
                operators[self.peek_sub_string(3)], pos))
            # pop_char (rather than bumping self.__pos directly) keeps
            # line/column tracking in sync
            for _ in range(3):
                self.pop_char()
        elif self.peek_sub_string(2) in [">>", "<<", "->"]:
            self.tokens.append(Token(
                operators[self.peek_sub_string(2)], pos))
            self.pop_char()
            self.pop_char()
        elif self.peek_sub_string(2) == self.peek_char() + "=":
            self.tokens.append(Token(
                operators[self.peek_sub_string(2)], pos))
            self.pop_char()
            self.pop_char()
        elif self.peek_char() in "+-<>=&|" \
                and self.peek_sub_string(2) == self.peek_char() * 2:
            self.tokens.append(Token(
                operators[self.peek_sub_string(2)], pos))
            self.pop_char()
            self.pop_char()
        else:
            self.tokens.append(Token(operators[self.peek_char()], pos))
            self.pop_char()
    else:
        self.tokens.append(Token(operators[self.peek_char()], pos))
        self.pop_char()
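# A standalone demonstration of the longest-match rule the docstring
# explains; the three sample operators stand in for the project's
# `operators` table.
def munch(src, table=(">>=", ">>", ">")):
    # try the longest operators first so '>>=' never splits into '>>', '='
    out, i = [], 0
    while i < len(src):
        for op in table:  # table is ordered longest-first
            if src.startswith(op, i):
                out.append(op)
                i += len(op)
                break
        else:
            i += 1  # not one of the sample operators: skip it
    return out

assert munch(">>=>>") == [">>=", ">>"]
assert munch(">>>") == [">>", ">"]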
def mult_comment(self):
    """Multiline comments run from '/*' to the first matching '*/';
    a comment left unclosed at the end of the file is an error.
    """
    pos = self.line_pos()
    self.pop_char()
    self.pop_char()
    tkn_value = "/*"
    closed = False
    while self.peek_char():
        if self.peek_sub_string(2) == "*/":
            tkn_value += "*/"
            self.pop_char()
            self.pop_char()
            closed = True
            break
        if self.peek_char() == '\n':
            self.__line += 1
            self.__line_pos = 1
        tkn_value += self.peek_char()
        self.pop_char()
    if closed:
        self.tokens.append(Token("MULT_COMMENT", pos, tkn_value))
    else:
        raise TokenError(pos)
def get_next_token(self):
    """Peeks one character and tries to match it to a token type; if it
    matches none of the token types, an error is raised and the current
    file's parsing stops.
    """
    while self.peek_char() is not None:
        if self.is_string():
            self.string()
        elif (self.peek_char().isalpha() and self.peek_char().isascii()) \
                or self.peek_char() == '_':
            self.identifier()
        elif self.is_constant():
            self.constant()
        elif self.is_char_constant():
            self.char_constant()
        elif self.peek_char() == '#':
            self.preprocessor()
        elif self.peek_sub_string(2) == "/*":
            self.mult_comment()
        elif self.peek_sub_string(2) == "//":
            self.comment()
        elif self.peek_char() in "+-*/,<>^&|!=%;:.~?":
            self.operator()
        elif self.peek_char() == ' ':
            self.tokens.append(Token("SPACE", self.line_pos()))
            self.pop_char()
        elif self.peek_char() == '\t':
            self.tokens.append(Token("TAB", self.line_pos()))
            self.pop_char()
        elif self.peek_char() == '\n':
            self.tokens.append(Token("NEWLINE", self.line_pos()))
            self.pop_char()
            self.__line_pos = 1
            self.__line += 1
        elif self.peek_sub_string(2) == "\\\n":
            # peek_char() only yields one character, so the two-character
            # escaped newline must be matched with peek_sub_string(2)
            self.tokens.append(Token("ESCAPED_NEWLINE", self.line_pos()))
            self.pop_char()
            self.pop_char()
            self.__line_pos = 1
            self.__line += 1
        elif self.peek_char() in brackets:
            self.tokens.append(Token(
                brackets[self.peek_char()], self.line_pos()))
            self.pop_char()
        else:
            raise TokenError(self.line_pos())
        return self.peek_token()
    return None
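# Hedged usage sketch for the dispatch loop above, assuming a Lexer class
# wiring these methods to a source buffer (the constructor and token names
# below are assumptions, not the project's actual API):
#
#     lexer = Lexer("x = 42;\n")
#     while lexer.get_next_token() is not None:
#         pass
#     # lexer.tokens -> IDENTIFIER, SPACE, ASSIGN, SPACE, CONSTANT,
#     #                 SEMI_COLON, NEWLINE  (names illustrative)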
def constant(self):
    """Numeric constants can take many forms:
    - integer constants only allow digits [0-9]
    - real number constants only allow digits [0-9], ONE optional dot '.'
      and ONE optional 'e/E' character
    - binary constants only allow digits [0-1] prefixed by '0b' or '0B'
    - hex constants only allow digits [0-9] and letters [a-f/A-F],
      prefixed by '0x' or '0X'
    - octal constants allow digits [0-9] prefixed by a zero '0' character

    Size ('l/L' for long) and sign ('u/U' for unsigned) specifiers can be
    appended to any of those tokens. Plus/minus operators ('+'/'-') can
    prefix any of those tokens, and a numeric constant can start with a
    '.' (dot character).
    """
    pos = self.line_pos()
    tkn_value = ""
    bucket = ".0123456789aAbBcCdDeEfFlLuUxX-+"
    while self.peek_char() and (self.peek_char() in bucket
                                or self.peek_sub_string(2) == "\\\n"):
        if self.peek_sub_string(2) == "\\\n":
            # escaped newline inside a constant: skip both characters
            self.pop_char()
            self.pop_char()
            self.__line += 1
            self.__line_pos = 1
            continue
        if self.peek_char() in "xX":
            # 'x/X' is only legal as the second character of a '0x' prefix
            if not tkn_value.startswith("0") or len(tkn_value) > 1:
                raise TokenError(pos)
            for c in "xX":
                if c in tkn_value:
                    raise TokenError(pos)
        elif self.peek_char() in "bB":
            # 'b/B' is a binary prefix after '0', or a plain hex digit
            if tkn_value != "0" and not tkn_value.startswith("0x") \
                    and not tkn_value.startswith("0X"):
                raise TokenError(pos)
        elif self.peek_char() in "+-":
            # a sign only continues the constant inside an exponent
            if (not tkn_value.endswith("e")
                    and not tkn_value.endswith("E")) \
                    or self.peek_sub_string(2) in ["++", "--"]:
                break
        elif self.peek_char() in "eE" \
                and "0x" not in tkn_value and "0X" not in tkn_value:
            if "e" in tkn_value or "E" in tkn_value \
                    or "f" in tkn_value or "F" in tkn_value \
                    or "u" in tkn_value or "U" in tkn_value \
                    or "l" in tkn_value or "L" in tkn_value:
                raise TokenError(pos)
        elif self.peek_char() in "lL":
            lcount = tkn_value.count("l") + tkn_value.count("L")
            if lcount > 1 \
                    or (lcount == 1 and tkn_value[-1] not in "lL") \
                    or (("f" in tkn_value or "F" in tkn_value)
                        and "0x" not in tkn_value
                        and "0X" not in tkn_value):
                raise TokenError(pos)
            elif (self.peek_char() == 'l' and 'L' in tkn_value) \
                    or (self.peek_char() == 'L' and 'l' in tkn_value):
                # mixed-case long suffixes like 'lL' are rejected
                raise TokenError(pos)
        elif self.peek_char() in "uU":
            if "u" in tkn_value or "U" in tkn_value \
                    or (("e" in tkn_value or "E" in tkn_value
                         or "f" in tkn_value or "F" in tkn_value)
                        and "0x" not in tkn_value
                        and "0X" not in tkn_value):
                raise TokenError(pos)
        elif self.peek_char() in "fF":
            if (not tkn_value.startswith("0x")
                    and not tkn_value.startswith("0X")
                    and ("." not in tkn_value
                         or "f" in tkn_value or "F" in tkn_value)) \
                    or "u" in tkn_value or "U" in tkn_value \
                    or "l" in tkn_value or "L" in tkn_value:
                raise TokenError(pos)
        elif (self.peek_char() in "aAbBcCdDeE"
                and not tkn_value.startswith("0x")
                and not tkn_value.startswith("0X")) \
                or "u" in tkn_value or "U" in tkn_value \
                or "l" in tkn_value or "L" in tkn_value:
            raise TokenError(pos)
        elif (self.peek_char() in "0123456789"
                and "u" in tkn_value) \
                or "U" in tkn_value \
                or "l" in tkn_value or "L" in tkn_value:
            raise TokenError(pos)
        elif self.peek_char() == '.' and '.' in tkn_value:
            raise TokenError(pos)
        tkn_value += self.peek_char()
        self.pop_char()
    if (tkn_value[-1] in "eE" and not tkn_value.startswith("0x")
            and not tkn_value.startswith("0X")) \
            or tkn_value[-1] in "xX":
        # a constant cannot end on a bare exponent or hex prefix
        raise TokenError(pos)
    self.tokens.append(Token("CONSTANT", pos, tkn_value))
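# A compact cross-check of the rules above: one (verbose) regular
# expression accepting roughly the same numeric constants. The hand-rolled
# checks remain authoritative; corner cases may differ, so treat this as a
# sketch, not a specification.
import re

C_CONST = re.compile(r"""
    (?: 0[xX][0-9a-fA-F]+          # hex
      | 0[bB][01]+                 # binary
      | (?:\d+\.?\d*|\.\d+)        # integer, octal, or real
        (?:[eE][+-]?\d+)?[fF]?     # optional exponent / float suffix
    )
    (?:[uU]?[lL]{0,2}|[lL]{0,2}[uU]?)$  # optional sign/size suffixes
""", re.VERBOSE)

for ok in ("42", "042", "0x1F", "0b101", "1.5e-3f", "10UL", ".5"):
    assert C_CONST.match(ok), ok
for bad in ("0x", "1e", "..5"):
    assert not C_CONST.match(bad), bad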
def make_token(self, type, value=None):
    # TODO: offset the column to account for '\t' characters
    return Token(type=type, value=value,
                 line=self.line_no, column=self.position)