def make_tokens(self):
    """Tokenize the input; return (tokens, error) where error is None on success."""
    tokens = []
    while self.char_now is not None:
        char = self.char_now
        if char in " \t":
            # Whitespace separates tokens but produces none.
            self.advance()
        elif char in DIGITS:
            tokens.append(self.make_number())
        elif char in get_chars_of("op"):
            # Single-character operators, e.g. + - * / ^
            tokens.append(Token(get_chars_of("op")[char], pos_start=self.pos))
            self.advance()
        elif char in get_chars_of("brac"):
            # Brackets, e.g. ( )
            tokens.append(Token(get_chars_of("brac")[char], pos_start=self.pos))
            self.advance()
        else:
            # Unknown character: abandon tokenizing and report the error.
            pos_start = self.pos.copy()
            self.advance()
            return [], IllegalCharError(pos_start, self.pos,
                                        f"'{char}' is not implemented.")
    tokens.append(get_eof_token(self.pos.copy()))
    return tokens, None  # no error
def read_directive(self):
    """Consume a directive up to end-of-line or the start of a comment."""
    start = self.i
    while (self.ch
           and self.ch != "\n"
           and self.next_two_char() not in {"//", "/*"}):
        self.advance()
    # Trailing spaces are not part of the directive text.
    return Token(Token.DIRECTIVE, self.text[start:self.i].rstrip(" "))
def add_token(self, type: TokenType, literal: object = None):
    """Append a token for the current lexeme (source[start:current])."""
    lexeme = self.source[self.start:self.current]
    token = Token(type=type, lexeme=lexeme, literal=literal, line=self.line)
    self.tokens.append(token)
def __symbol(self):
    """Lex a one- or two-character symbol at the cursor.

    Returns None when the current character is not a known symbol;
    otherwise advances past the symbol and returns its Token.
    """
    if self.current_char not in SYMBOL_LOOKUP:
        return None
    if self.next_char not in SYMBOL_LOOKUP:
        # Only a single-character symbol is possible here.
        self.__advance()
        return Token(SYMBOL_LOOKUP[self.last_char])
    pair = self.current_char + self.next_char
    if pair in SYMBOL_LOOKUP:
        # Two-character symbol: consume both characters.
        self.__advance()
        self.__advance()
        return Token(SYMBOL_LOOKUP[pair])
    # Both characters are symbols individually but not combined.
    self.__advance()
    return Token(SYMBOL_LOOKUP[self.last_char])
def make_tokens(self):
    """Scan the source into tokens; return (tokens, error), error None on success."""
    # Table of single-character tokens keeps the dispatch flat.
    single_char = {
        '+': TT_PLUS,
        '-': TT_MINUS,
        '*': TT_MUL,
        '/': TT_DIV,
        '(': TT_LPAREN,
        ')': TT_RPAREN,
    }
    tokens = []
    while self.current_char is not None:
        ch = self.current_char
        if ch in ' \t':
            self.advance()  # skip whitespace
        elif ch in DIGITS:
            tokens.append(self.make_number())
        elif ch in single_char:
            tokens.append(Token(single_char[ch], pos_start=self.pos))
            self.advance()
        else:
            # Unrecognized character: stop scanning and report it.
            pos_start = self.pos.copy()
            self.advance()
            return [], IllegalCharError(pos_start, self.pos, "'" + ch + "'")
    tokens.append(Token(TT_EOF, pos_start=self.pos))
    return tokens, None
def make_number(self):
    """Lex an integer or float literal from the current position."""
    pos_start = self.pos.copy()
    digits = ''
    seen_dot = False
    while self.current_char is not None and self.current_char in DIGITS + '.':
        if self.current_char == '.':
            if seen_dot:
                break  # a second '.' is not part of this number
            seen_dot = True
        digits += self.current_char
        self.advance()
    if seen_dot:
        return Token(TT_FLOAT, float(digits), pos_start, self.pos)
    return Token(TT_INT, int(digits), pos_start, self.pos)
def scan(self):
    """Scan the entire source, then append the terminating EOF token.

    Fix: removed a leftover debug statement — `print(self.tokens)` — that
    dumped the whole token list to stdout on every loop iteration.
    """
    while not self.is_at_end():
        # Each token starts where the previous one ended.
        self.start = self.current
        self.scan_token()
    eof = Token(type=TokenType.EOF, lexeme="", literal=None, line=self.line)
    self.tokens.append(eof)
def read_blockcomment(self):
    """Consume a block comment through its closing '*/' and return a token.

    Raises AsirSyntaxError when the input ends before '*/' is found.
    """
    start = self.i
    while self.ch and self.next_two_char() != "*/":
        self.advance()
    if self.next_two_char() != "*/":
        raise AsirSyntaxError(
            "Expect: '*/', got: '{}' at line {}".format(
                self.next_two_char(), self.detect_line_number()
            )
        )
    # Step over the closing '*/' so it is included in the token text.
    self.advance()
    self.advance()
    return Token(Token.BLOCKCOMMENT, self.text[start:self.i])
def make_number(self):
    """Lex a numeric literal, producing an INT or FLOAT token."""
    pos_start = self.pos.copy()
    text = ""
    dotted = False
    allowed = DIGITS + "."
    while self.char_now is not None and self.char_now in allowed:
        ch = self.char_now
        if ch == ".":
            if dotted:
                break  # a second '.' terminates the literal
            dotted = True
        text += ch
        self.advance()
    if dotted:
        return Token(get_chars_of("factor")["FLOAT"], float(text),
                     pos_start, self.pos)
    return Token(get_chars_of("factor")["INT"], int(text),
                 pos_start, self.pos)
def read_string(self):
    """Consume a double-quoted string literal; the token text keeps both quotes.

    Raises AsirSyntaxError when the input ends before the closing quote.
    """
    start = self.i
    self.advance()  # step past the opening '"'
    while self.ch and self.ch != '"':
        if self.ch == "\\":
            # Skip the escaped character so an escaped quote cannot
            # terminate the string.
            self.advance()
        self.advance()
    if self.ch != '"':
        raise AsirSyntaxError(
            "Expect: '\"', got: '{}' at line {}".format(
                self.ch, self.detect_line_number()
            )
        )
    self.advance()  # step past the closing '"'
    return Token(Token.STRING, self.text[start:self.i])
def __word(self):
    """Lex a keyword or identifier starting at an alphabetic character.

    Returns None when the cursor is not at a letter; reserved words
    (matched case-insensitively) yield their own token types, anything
    else becomes an identifier ValueToken.
    """
    if not self.current_char.isalpha():
        return None
    chars = [self.current_char]
    self.__advance()
    while self.current_char is not None and (
        self.current_char.isalnum() or self.current_char == '_'
    ):
        chars.append(self.current_char)
        self.__advance()
    word = ''.join(chars)
    lowered = word.lower()
    if lowered in RESERVED_LOOKUP:
        return Token(RESERVED_LOOKUP[lowered])
    return ValueToken(TOKEN_IDENTIFIER, word)
def __end_of_file(self):
    """Return an EOF token when the input is exhausted, else None."""
    if self.current_char is not None:
        return None
    return Token(TOKEN_EOF)
def __end_of_line(self):
    """Consume a newline and return an EOL token, else None."""
    if self.current_char != '\n':
        return None
    self.__advance()
    return Token(TOKEN_EOL)
def beautify(self):
    """Re-emit the token stream as formatted source text.

    Reads tokens until the lexer is exhausted, appending each to the
    current output line with token-type-specific spacing rules, and
    returns the joined, stripped output. Semicolons inside a `for(...)`
    header are kept on one line; elsewhere they end the line.
    """
    prev = Token("", "")
    # Track `for(a; b; c)` headers: the first two semicolons stay inline.
    semicolon_cnt, inside_for = 0, False
    while not self.le.is_end():
        t = self.le.read_token()
        if t.token_type == Token.LINECOMMENT:
            self.append_linecomment(t.content, prev.token_type)
        elif t.token_type == Token.BLOCKCOMMENT:
            self.append_blockcomment(t.content)
        elif t.token_type == Token.OPERATOR:
            if t.content == "!":
                self.append_content("!")  # prefix operator: no trailing space
            elif t.content in {"++", "--"}:
                if prev.token_type == Token.OPERATOR:
                    self.append_content(t.content, " ")  # e.g. ... * ++
                else:
                    self.append_after_rstrip(t.content, " ")  # postfix, e.g. A++
            elif t.content == "-":
                # Unary minus: no space after '(' , ',' , ';' or at line start.
                if prev.token_type in {
                    "",
                    Token.COMMA,
                    Token.SEMICOLON,
                    Token.LPAR,
                }:
                    self.append_content("-")  # e.g. ... (-
                elif prev.content in {"=", "==", "<", "<=", ">", ">="}:
                    self.append_content("-")  # e.g. ... == -
                else:
                    self.append_content("-", " ")  # binary minus
            else:
                self.append_content(t.content, " ")
        elif t.token_type == Token.LPAR:
            if prev.content in {"for", "if"}:
                self.append_content("(")  # e.g. ... for (
            elif prev.token_type == Token.WORD:  # function call
                self.append_after_rstrip("(")  # e.g. ... func(
            else:
                self.append_content("(")  # e.g. ... + (
        elif t.token_type == Token.RPAR:
            self.append_after_rstrip(")", " ")
        elif t.token_type == Token.LBRACE:
            # Open brace ends the line and increases the indent depth.
            self.append_content("{")
            self.append_current_line()
            self.depth += 1
        elif t.token_type == Token.RBRACE:
            # Close brace goes on its own line at the outer depth.
            self.append_current_line()
            self.depth -= 1
            self.append_content("}")
            self.append_current_line()
        elif t.token_type == Token.LBRACKET:
            if prev.token_type == Token.WORD:  # subscript access
                self.append_after_rstrip("[")  # e.g. ... arr[
            else:
                self.append_content("[")  # e.g. ... = [
        elif t.token_type == Token.RBRACKET:
            self.append_after_rstrip("]", " ")
        elif t.token_type == Token.COMMA:
            self.append_after_rstrip(",", " ")
        elif t.token_type == Token.SEMICOLON:
            if inside_for:
                semicolon_cnt += 1
                if semicolon_cnt == 2:
                    # Second semicolon ends the for-header bookkeeping.
                    inside_for = False
                self.append_after_rstrip(";", " ")  # for(a; b;
            else:
                self.append_after_rstrip(";")
                self.append_current_line()
        elif t.token_type == Token.END:
            self.append_after_rstrip("$")
            self.append_current_line()
        elif t.token_type == Token.STRING:
            self.append_content(t.content)
        elif t.token_type == Token.WORD:
            if t.content == "else":
                if self.output_lines[-1].lstrip(" ") == "}":
                    # Merge a lone '}' line with the else:
                    #   if (cond) {
                    #       ...
                    #   } else
                    self.output_lines.pop()
                    self.append_content("}" + " " + "else", " ")
                else:
                    # Braceless if:  if (cond) return 1;
                    #                else
                    self.append_content("else", " ")
            else:
                if prev.content in {"++", "--"}:
                    self.append_after_rstrip(t.content, " ")  # e.g. ... ++a
                else:
                    self.append_content(t.content, " ")
                if t.content == "for":
                    # Start counting the two semicolons of the for-header.
                    inside_for = True
                    semicolon_cnt = 0
        elif t.token_type == Token.DIRECTIVE:
            if len(self.current_line) >= 1:
                self.append_current_line()
            self.output_lines.append(t.content)  # directives get no indent
        else:
            raise AsirSyntaxError(  # ?
                "Unknown token. type: {}, content: '{}'".format(
                    t.token_type, t.content))
        prev = t
    # Flush whatever remains on the last line.
    if len(self.current_line) >= 1:
        self.append_current_line()
    return "\n".join(self.output_lines).strip()
def read_token(self):
    """Read and return the next token from the input stream.

    Skips leading whitespace, dispatches on the current character, and
    advances past the consumed token. Comment/string/directive/word
    readers consume their own input and return directly.
    """
    self.skip_whitespace()
    t = Token("", "")
    # Comments are checked before the operator table ('/' is also an operator).
    if self.next_two_char() == "//":
        return self.read_linecomment()
    if self.next_two_char() == "/*":
        return self.read_blockcomment()
    if self.ch in self.ops:
        if self.ch in {"?", ":"}:
            # Ternary operator characters never combine with '='.
            t = Token(Token.OPERATOR, self.ch)
        elif self.ch == "&":
            # NOTE(review): a lone '&' is rejected here while a lone '|'
            # below is accepted — presumably '&' is only valid as '&&' in
            # this language; confirm against the grammar.
            if self.next_char() != "&":
                raise AsirSyntaxError(
                    "Expect: '&', got: '{}' at line {}".format(
                        self.next_char(), self.detect_line_number()
                    )
                )
            t = Token(Token.OPERATOR, "&&")
            self.advance()
        elif self.ch == "|":
            if self.next_char() == "|":
                t = Token(Token.OPERATOR, "||")
                self.advance()
            else:
                t = Token(Token.OPERATOR, "|")
        elif self.ch in {"+", "-"}:
            # '+'/'-' may double ('++', '--') or combine with '=' ('+=', '-=').
            if self.next_char() in {self.ch, "="}:
                t = Token(Token.OPERATOR, self.ch + self.next_char())
                self.advance()
            else:
                t = Token(Token.OPERATOR, self.ch)
        else:
            # Every other operator may combine with '=' (e.g. '<=', '==').
            if self.next_char() == "=":
                t = Token(Token.OPERATOR, self.ch + "=")
                self.advance()
            else:
                t = Token(Token.OPERATOR, self.ch)
    elif self.ch == "(":
        t = Token(Token.LPAR, "(")
    elif self.ch == ")":
        t = Token(Token.RPAR, ")")
    elif self.ch == "{":
        self.depth += 1  # track brace nesting depth
        t = Token(Token.LBRACE, "{")
    elif self.ch == "}":
        self.depth -= 1
        self.ensure_positive_depth()  # reject unbalanced '}'
        t = Token(Token.RBRACE, "}")
    elif self.ch == "[":
        t = Token(Token.LBRACKET, "[")
    elif self.ch == "]":
        t = Token(Token.RBRACKET, "]")
    elif self.ch == ",":
        t = Token(Token.COMMA, ",")
    elif self.ch == ";":
        t = Token(Token.SEMICOLON, ";")
    elif self.ch == "$":
        t = Token(Token.END, "$")
    elif self.ch == '"':
        # String/directive/word readers advance past their own content.
        return self.read_string()
    elif self.ch == "#":
        return self.read_directive()
    else:
        return self.read_word()
    self.advance()
    return t
def read_word(self):
    """Consume a run of characters up to any delimiter, quote, or whitespace."""
    start = self.i
    stop_chars = self.delims | {'"', " ", "\n", "\t"}
    while self.ch and self.ch not in stop_chars:
        self.advance()
    # A word must consume at least one character unless input is exhausted.
    assert self.ch is None or start < self.i
    return Token(Token.WORD, self.text[start:self.i])
def read_linecomment(self):
    """Consume a '//' comment up to (but not including) the newline."""
    start = self.i
    while self.ch and self.ch != "\n":
        self.advance()
    return Token(Token.LINECOMMENT, self.text[start:self.i])