def read_token(self):
    """Read the next token from the character stream.

    Leading whitespace is skipped first.  If any of it contained a line
    break, a NEWLINE token is returned immediately.  Otherwise characters
    are appended to a lexeme one at a time and ``self.create_token`` is
    asked after each one; scanning stops at the first character that no
    longer yields a token once one has been found (longest match).

    Returns the token, stamped with the row/column where scanning
    started.  An EOF token is produced when no characters are left.  If
    no token could be built, the problem is reported through
    ``mshl.error``.
    """
    # Skip whitespace, remembering whether a line break was crossed.
    saw_newline = False
    ch = self.peek_char()
    while ch and ch.isspace():
        saw_newline = saw_newline or ch == '\n'
        self.read_char()
        ch = self.peek_char()

    start_row, start_column = self.row, self.column
    if saw_newline:
        return Token(NEWLINE, '\n', row=start_row, column=start_column)

    text = ''
    best = None
    failures = 0
    while True:
        ch = self.peek_char()
        if not ch:
            # End of the source.
            break
        text += ch

        candidate = self.create_token(text)
        if not candidate:
            if best:
                # The previous, shorter lexeme was the longest match.
                break
            # Nothing has matched yet: tolerate a few failed prefixes
            # before giving up.
            failures += 1
            if failures > 10:
                break
        best = candidate

        # Consume the character that was just added to the lexeme.
        self.read_char()

    if not text:
        # Nothing could be read at all: end of the source.
        best = Token(EOF)

    if best:
        best.row = start_row
        best.column = start_column
        return best

    mshl.error('sequence not understood: {}'.format(text))
def parse_str(parser):
    """Parse a string literal into a STRING node.

    Consumes one ``lexemes.STR`` token from ``parser``.  The lexeme must
    be wrapped in a matching pair of double or single quotes; otherwise
    "unterminated string" is reported through ``mshl.error``.  The
    surrounding quotes are stripped and the two supported escapes
    (escaped backslash and escaped single quote) are resolved.

    Returns ``Node(STRING, value, tok)``.
    """
    tok = parser.expect(lexemes.STR)
    value = str(tok.lexeme)

    # A lone quote character both starts and ends with itself, so the
    # length has to be checked too or it would pass as an empty string.
    terminated = len(value) >= 2 and any(
        value.startswith(quote) and value.endswith(quote)
        for quote in ('"', '\''))
    if not terminated:
        mshl.error("unterminated string", tok)

    value = value[1:-1]

    # Resolve escapes in a single pass: split on escaped backslashes
    # first, unescape quotes inside each piece, then put one backslash
    # back per split point.  Chaining two replace() calls would re-scan
    # the backslashes produced by the first one and wrongly treat them
    # as the start of an escaped quote.
    value = '\\'.join(
        piece.replace('\\\'', '\'') for piece in value.split('\\\\'))

    return Node(STRING, value, tok)
def expect(self, *args):
    """Read one token for every expected category in ``args``.

    Each token is read with ``self.read_token`` and its ``category`` is
    compared with the corresponding argument; a mismatch is reported
    through ``mshl.error`` and the token is still kept.

    Returns the single token when exactly one category was given,
    otherwise the list of tokens that were read (empty when no
    categories were given).
    """
    matched = []
    for wanted in args:
        current = self.read_token()
        if current.category != wanted:
            mshl.error('expected {}'.format(wanted), current)
        matched.append(current)

    # A single expectation is unwrapped for the caller's convenience.
    return matched[0] if len(matched) == 1 else matched
def parse_int(parser):
    """Parse an integer literal into an INTEGER node.

    Consumes one ``lexemes.INT`` token from ``parser`` and converts its
    lexeme:

    * ``0x`` prefix  -> hexadecimal; a bare ``0x`` is reported through
      ``mshl.error`` as "invalid hex value" and evaluates to 0.
    * ``b`` suffix   -> binary.
    * anything else  -> passed to ``int()`` unchanged.

    Returns ``Node(INTEGER, number, tok)``.
    """
    tok = parser.expect(lexemes.INT)
    text = tok.lexeme
    # NOTE: no range check is performed on the literal; a disabled
    # "integer too large" warning used to sit here.

    if text.startswith("0x"):
        hex_digits = text[2:]
        if hex_digits:
            number = int(hex_digits, 16)
        else:
            mshl.error("invalid hex value", tok)
            number = 0
    elif text.endswith("b"):
        number = int(text[:-1], 2)
    else:
        number = int(text)

    return Node(INTEGER, number, tok)
def parse_expr4(parser):
    """Parse one primary expression and return its node.

    Leading whitespace is skipped, then the category of the next token
    selects what is parsed: a parenthesised expression, an array
    literal, an identifier, an (optionally negated) integer, a string,
    one of the ``for`` / ``func`` / ``if`` / ``while`` constructs,
    ``return``, ``true`` / ``false``, logical not, ``break`` or
    ``continue``.

    Any other token except EOF is consumed and reported through
    ``mshl.error`` as unexpected.

    Returns the parsed node with its ``token`` attribute set to the
    token the expression started with, or ``None`` when nothing was
    parsed (e.g. at EOF or after an unexpected token).
    """
    parser.eat_whitespace()
    expr = None
    tok = parser.peek_token()

    # (<expr>)
    if tok.category == lexemes.L_PAREN:
        parser.expect(lexemes.L_PAREN)
        expr = parse_expr(parser)
        parser.expect(lexemes.R_PAREN)

    # [<expr>[, <expr> ...]]
    elif tok.category == lexemes.L_BRACK:
        parser.expect(lexemes.L_BRACK)
        items = []
        while True:
            # Lookahead is kept in its own name so that `tok` keeps
            # referring to the opening bracket; the ARRAY node is then
            # tagged with the token the expression starts at, like
            # every other branch.
            lookahead = parser.peek_token()
            if lookahead.category == lexemes.R_BRACK:
                break
            parser.eat_whitespace()
            items.append(parse_expr(parser))
            parser.eat_whitespace()
            lookahead = parser.peek_token()
            if lookahead.category == lexemes.R_BRACK:
                break
            parser.expect(lexemes.COMMA)
            parser.eat_whitespace()
        parser.expect(lexemes.R_BRACK)
        expr = Node(ARRAY, token=tok, children=items)

    # <identifier>
    elif tok.category == lexemes.IDENT:
        expr = parse_ident(parser)

    # -<integer>
    elif tok.category == lexemes.MINUS_SIGN:
        parser.read_token()
        expr = parse_int(parser)
        expr.data = -int(expr.data)

    # <integer>
    elif tok.category == lexemes.INT:
        expr = parse_int(parser)

    # <string>
    elif tok.category == lexemes.STR:
        expr = parse_str(parser)

    # for ([<expr>]; [<expr>]; [<expr>]) ...
    elif tok.category == lexemes.FOR:
        expr = parse_for(parser)

    # func <identifier>(...)
    elif tok.category == lexemes.FUNC:
        expr = parse_func(parser)

    # if (<expr>) ...
    elif tok.category == lexemes.IF:
        expr = parse_if(parser)

    # while (<expr>) ...
    elif tok.category == lexemes.WHILE:
        expr = parse_while(parser)

    # return <expr>
    elif tok.category == lexemes.RETURN:
        parser.read_token()
        # Remember whether the keyword is directly followed by a line
        # break before the whitespace is skipped.
        newline_after_return = False
        if parser.peek_token().category == lexemes.NEWLINE:
            newline_after_return = True
        parser.eat_whitespace()
        if parser.peek_token().category in (lexemes.R_BRACE,
                                            lexemes.SEMICOLON):
            # Bare return: an integer 0 node is used as the value.
            expr = Node(RETURN, children=[Node(INTEGER, 0, tok)])
        else:
            if newline_after_return:
                mshl.warning('prefer semicolon or expression on same row as return keyword', tok)
            expr = Node(RETURN, children=[parse_expr(parser)])

    # true
    elif tok.category == lexemes.TRUE:
        parser.read_token()
        expr = Node(INTEGER, 1)

    # false
    elif tok.category == lexemes.FALSE:
        parser.read_token()
        expr = Node(INTEGER, 0)

    # NOTE: `undefined` is deliberately not accepted here; it should
    # only be used in equality tests (see the == operator).

    # not <expr>, built as a ternary node with the constants 0 and 1
    elif tok.category == lexemes.NOT:
        parser.read_token()
        parser.eat_whitespace()
        expr = parse_expr(parser)
        expr = Node(IF_TERNARY, children=[expr,
                                          Node(INTEGER, 0, tok),
                                          Node(INTEGER, 1, tok)])

    # break
    elif tok.category == lexemes.BREAK:
        parser.read_token()
        expr = Node(BREAK)

    # continue
    elif tok.category == lexemes.CONTINUE:
        parser.read_token()
        expr = Node(CONTINUE)

    # Anything else that is not <eof> is consumed and reported.
    elif tok.category != lexemes.EOF:
        parser.read_token()
        mshl.error("unexpected token: {}".format(tok.category), tok)

    if expr:
        expr.token = tok
    return expr