class Lexer:
    """Split source text into tokens for arithmetic and comparison expressions.

    The lexer reads ``source`` one character at a time. ``self.pos`` tracks the
    current location and is handed to every token and error that is created.
    """

    def __init__(self, fname, source):
        """Store the input and load the first character.

        Args:
            fname: Name of the origin of ``source``; forwarded to ``Position``.
            source: The text to tokenize.
        """
        self.source = source
        self.fname = fname
        self.pos = Position(-1, 0, -1, fname, source)
        self.current_symbol = None
        # Load the first character (or None when the source is empty).
        self.advance()

    def advance(self):
        """Step to the next character; ``current_symbol`` is None past the end."""
        self.pos.advance(self.current_symbol)
        if self.pos.index < len(self.source):
            self.current_symbol = self.source[self.pos.index]
        else:
            self.current_symbol = None

    def make_tokens(self):
        """Tokenize the whole source.

        Returns:
            A ``(tokens, error)`` pair. On success ``tokens`` ends with a
            ``TT_EOF`` token and ``error`` is None. On failure ``tokens`` is
            ``[]`` and ``error`` describes the offending input.
        """
        # Operators that are always exactly one character long.
        single_symbol_types = {
            "+": TT_ADD,
            "-": TT_SUB,
            "*": TT_MUL,
            "/": TT_DIV,
            "^": TT_POW,
            "(": TT_LPAREN,
            ")": TT_RPAREN,
        }
        tokens = []

        while self.current_symbol is not None:
            symbol = self.current_symbol
            if symbol in " \t":
                self.advance()
            elif symbol in DIGITS:
                tokens.append(self.make_number())
            elif symbol in LETTERS + "_":
                tokens.append(self.make_identifier())
            elif symbol in single_symbol_types:
                tokens.append(Token(single_symbol_types[symbol], pos_start=self.pos))
                self.advance()
            elif symbol == "!":
                token, error = self.make_not_equals()
                if error:
                    return [], error
                tokens.append(token)
            elif symbol == "=":
                tokens.append(self.make_equals())
            elif symbol == "<":
                tokens.append(self.make_less_than())
            elif symbol == ">":
                tokens.append(self.make_greater_than())
            else:
                pos_start = self.pos.copy()
                illegal_symbol = symbol
                self.advance()
                return [], IllegalTokenError(pos_start, self.pos, f"'{illegal_symbol}'")

        tokens.append(Token(TT_EOF, pos_start=self.pos))
        return tokens, None

    def make_number(self):
        """Read an integer or decimal literal starting at the current character.

        Reading stops at the first character that is neither a digit nor a
        dot, or at a second dot.

        Returns:
            A ``TT_INT`` token when no dot was read, otherwise a ``TT_DEC`` token.
        """
        num_s = ""
        dot_c = 0
        pos_start = self.pos.copy()

        while self.current_symbol is not None and self.current_symbol in DIGITS + ".":
            if self.current_symbol == ".":
                if dot_c == 1:
                    # A second dot cannot belong to this number.
                    break
                dot_c += 1
                num_s += "."
            else:
                num_s += self.current_symbol
            self.advance()

        if dot_c == 0:
            return Token(TT_INT, int(num_s), pos_start, self.pos)
        return Token(TT_DEC, float(num_s), pos_start, self.pos)

    def make_identifier(self):
        """Read a run of letters, digits and underscores.

        Returns:
            A ``TT_KEY`` token when the text is in ``KEYWORDS``, otherwise a
            ``TT_ID`` token.
        """
        id_str = ""
        pos_start = self.pos.copy()

        while self.current_symbol is not None and self.current_symbol in LETTERS_DIGITS + "_":
            id_str += self.current_symbol
            self.advance()

        token_type = TT_KEY if id_str in KEYWORDS else TT_ID
        return Token(token_type, id_str, pos_start, self.pos)

    def make_not_equals(self):
        """Read ``!=`` starting at the ``!`` character.

        Returns:
            ``(token, None)`` for ``!=``; ``(None, error)`` when ``!`` is not
            followed by ``=``.
        """
        pos_start = self.pos.copy()
        self.advance()

        if self.current_symbol == "=":
            self.advance()
            return Token(TT_NE, pos_start=pos_start, pos_end=self.pos), None

        invalid_symbol = self.current_symbol
        # Step past the offending character so the error span covers it.
        self.advance()
        return None, ExpectedSymbolError(
            pos_start,
            self.pos,
            f"Unexpected symbol '{invalid_symbol}', expected '='",
        )

    def make_equals(self):
        """Read ``=`` (``TT_EQ``) or ``==`` (``TT_EE``)."""
        token_type = TT_EQ
        pos_start = self.pos.copy()
        self.advance()

        if self.current_symbol == "=":
            self.advance()
            token_type = TT_EE

        return Token(token_type, pos_start=pos_start, pos_end=self.pos)

    def make_less_than(self):
        """Read ``<`` (``TT_LT``) or ``<=`` (``TT_LTE``)."""
        token_type = TT_LT
        pos_start = self.pos.copy()
        self.advance()

        if self.current_symbol == "=":
            self.advance()
            token_type = TT_LTE

        # The token ends at the lexer's current position.
        return Token(token_type, pos_start=pos_start, pos_end=self.pos)

    def make_greater_than(self):
        """Read ``>`` (``TT_GT``) or ``>=`` (``TT_GTE``)."""
        token_type = TT_GT
        pos_start = self.pos.copy()
        self.advance()

        if self.current_symbol == "=":
            self.advance()
            token_type = TT_GTE

        # The token ends at the lexer's current position.
        return Token(token_type, pos_start=pos_start, pos_end=self.pos)
class Lexer(object):
    """Lexer: performs lexical analysis, turning source text into tokens."""

    def __init__(self, fn, text):
        """Store the input and load the first character.

        Args:
            fn: Where ``text`` came from (e.g. a file name), kept so errors can
                point at their origin.
            text: The source text to tokenize.
        """
        self.fn = fn
        self.text = text
        self.pos = Position(-1, 0, -1, fn, text)  # current location
        self.current_char = None  # current character
        # self.pos starts at -1, so advance immediately to load the first char.
        self.advance()

    def advance(self):
        """Step to the next character; ``current_char`` is None past the end."""
        self.pos.advance(self.current_char)
        if self.pos.idx < len(self.text):
            self.current_char = self.text[self.pos.idx]
        else:
            self.current_char = None

    def make_tokens(self):
        """Tokenize the whole text.

        Returns:
            A ``(tokens, error)`` pair. On success ``tokens`` ends with a
            ``TT_EOF`` token and ``error`` is None. On failure ``tokens`` is
            ``[]`` and ``error`` describes the offending input.
        """
        # Tokens that are always exactly one character long.
        single_char_types = {
            '^': TT_POW,  # power operator: x^y
            '+': TT_PLUS,
            '*': TT_MUL,
            '/': TT_DIV,
            '(': TT_LPAREN,
            ')': TT_RPAREN,
            '[': TT_LSQUARE,
            ']': TT_RSQUARE,
            ',': TT_COMMA,
            ';': TT_NEWLINE,  # ';' separates statements like a line break
            '\n': TT_NEWLINE,
        }
        tokens = []

        while self.current_char is not None:
            char = self.current_char
            if char in (' ', '\t'):
                # Spaces and tabs carry no meaning; skip them.
                self.advance()
            elif char == '#':
                self.skip_comment()
            elif char in DIGITS:
                tokens.append(self.make_number())
            elif char in LETTERS:
                tokens.append(self.make_identifier())
            elif char == '!':
                token, error = self.make_not_equals()
                if error:
                    return [], error
                tokens.append(token)
            elif char == '=':
                tokens.append(self.make_equals())
            elif char == '<':
                tokens.append(self.make_less_than())
            elif char == '>':
                tokens.append(self.make_greater_than())
            elif char == '-':
                tokens.append(self.make_minus_or_arrow())
            elif char == '"':
                tokens.append(self.make_string())
            elif char in single_char_types:
                tokens.append(Token(single_char_types[char], pos_start=self.pos))
                self.advance()
            else:
                # Nothing matched: report the illegal character.
                pos_start = self.pos.copy()
                self.advance()
                return [], IllegalCharError(pos_start, self.pos, f"'{char}'")

        tokens.append(Token(TT_EOF, pos_start=self.pos))
        return tokens, None

    def skip_comment(self):
        """Skip a ``#`` comment up to and including the newline that ends it.

        A comment on the last line of the text may have no trailing newline, so
        the scan also stops at the end of the text instead of looping forever.
        """
        self.advance()  # step over '#'
        while self.current_char is not None and self.current_char != '\n':
            self.advance()
        if self.current_char == '\n':
            self.advance()

    def make_number(self):
        """Read an integer or float literal starting at the current character.

        Reading stops at the first character that is neither a digit nor a
        dot, or at a second dot.

        Returns:
            A ``TT_INT`` token when no dot was read, otherwise a ``TT_FLOAT`` token.
        """
        num_str = ''
        dot_count = 0  # number of decimal points read so far
        pos_start = self.pos.copy()  # copy so later advances do not move it

        while self.current_char is not None and self.current_char in DIGITS + '.':
            if self.current_char == '.':
                if dot_count == 1:
                    break  # only one decimal point is allowed
                dot_count += 1
                num_str += '.'
            else:
                num_str += self.current_char
            self.advance()

        if dot_count == 0:
            return Token(TT_INT, int(num_str), pos_start, self.pos)
        return Token(TT_FLOAT, float(num_str), pos_start, self.pos)

    def make_string(self):
        """Read a double-quoted string literal starting at the opening quote.

        A backslash escapes the next character: ``\\n`` and ``\\t`` become a
        newline and a tab; any other escaped character stands for itself, so
        ``\\"`` does not end the string.

        Returns:
            A ``TT_STRING`` token holding the unescaped text.
        """
        # NOTE(review): reaching the end of the text without a closing quote
        # still yields a TT_STRING token; no error is reported.
        string = ''
        pos_start = self.pos.copy()
        escape_character = False  # True right after an unescaped backslash
        escape_characters = {'n': '\n', 't': '\t'}
        self.advance()  # step over the opening quote

        # Keep reading until an unescaped closing quote or the end of the text.
        while self.current_char is not None and (self.current_char != '"' or escape_character):
            if escape_character:
                # Translate known escapes; unknown ones keep the raw character.
                string += escape_characters.get(self.current_char, self.current_char)
                escape_character = False
            elif self.current_char == '\\':
                escape_character = True
            else:
                string += self.current_char
            self.advance()

        self.advance()  # step over the closing quote
        return Token(TT_STRING, string, pos_start, self.pos)

    def make_identifier(self):
        """Read a run of letters, digits and underscores.

        Returns:
            A ``TT_KEYWORD`` token when the text is in ``KEYWORDS``, otherwise a
            ``TT_IDENTIFIER`` token.
        """
        variable_str = ''
        pos_start = self.pos.copy()

        # Underscores are allowed inside names.
        while self.current_char is not None and self.current_char in LETTERS_DIGITS + '_':
            variable_str += self.current_char
            self.advance()

        tok_type = TT_KEYWORD if variable_str in KEYWORDS else TT_IDENTIFIER
        return Token(tok_type, variable_str, pos_start, self.pos)

    def make_not_equals(self):
        """Read ``!=`` starting at the ``!`` character.

        Returns:
            ``(token, None)`` for ``!=``; ``(None, error)`` when ``!`` is not
            followed by ``=``.
        """
        pos_start = self.pos.copy()
        self.advance()

        if self.current_char == '=':
            self.advance()
            return Token(TT_NE, pos_start=pos_start, pos_end=self.pos), None

        self.advance()
        return None, ExpectedCharError(pos_start, self.pos, "'=' (after '!')")

    def make_equals(self):
        """Read ``=`` (``TT_EQ``) or ``==`` (``TT_EE``)."""
        tok_type = TT_EQ
        pos_start = self.pos.copy()
        self.advance()

        if self.current_char == '=':
            self.advance()
            tok_type = TT_EE

        return Token(tok_type, pos_start=pos_start, pos_end=self.pos)

    def make_less_than(self):
        """Read ``<`` (``TT_LT``) or ``<=`` (``TT_LTE``)."""
        tok_type = TT_LT
        pos_start = self.pos.copy()
        self.advance()

        if self.current_char == '=':
            self.advance()
            tok_type = TT_LTE

        return Token(tok_type, pos_start=pos_start, pos_end=self.pos)

    def make_greater_than(self):
        """Read ``>`` (``TT_GT``) or ``>=`` (``TT_GTE``)."""
        tok_type = TT_GT
        pos_start = self.pos.copy()
        self.advance()

        if self.current_char == '=':
            self.advance()
            tok_type = TT_GTE

        return Token(tok_type, pos_start=pos_start, pos_end=self.pos)

    def make_minus_or_arrow(self):
        """Read ``-`` (``TT_MINUS``) or ``->`` (``TT_ARROW``)."""
        tok_type = TT_MINUS
        pos_start = self.pos.copy()
        self.advance()

        if self.current_char == '>':
            self.advance()
            tok_type = TT_ARROW

        return Token(tok_type, pos_start=pos_start, pos_end=self.pos)
class Lexer:
    """
    Lexer class that handles lexical analysis of the source code.

    Attributes:
        error: Error or None, the most recent error; callers check it after
            every call because most methods report failure through it
        filename: string, Name of the file currently being lexed
        source_code: string, The source code
        position: Position, current location in the source code
        current_character: string or None, the character at ``position``
            (None once the end of the source code is reached)
        tokens: list, A list of all lexed tokens
    """

    def __init__(self, source_code="", filename="CLI"):
        """
        Inits class with source code and filename, inits a Position object
        and gets the first character.

        When either argument is not a str, ``self.error`` is set and the
        remaining attributes are left unset.
        """
        self.error = None
        if not isinstance(source_code, str):
            self.error = Error("Error: expected 'str' as source_code")
            return
        if not isinstance(filename, str):
            self.error = Error("Error: expected 'str' as filename")
            return
        self.filename = filename
        self.source_code = source_code
        self.position = Position(-1, 0, -1, 0, filename)
        self.current_character = None
        self.tokens = []
        self.advance()

    def advance(self, n=1):
        """
        Advances current_character ``n`` characters in the source code and
        updates the Position object accordingly.

        RETURNS:
            bool: False as soon as the end of the source code is reached
            (current_character is then None), otherwise True
        """
        for _ in range(n):
            self.position.advance(self.current_character)
            if self.position.index < len(self.source_code):
                self.current_character = self.source_code[self.position.index]
            else:
                self.current_character = None
                return False
        return True

    def look_ahead(self, count=1):
        """
        Looks ahead on coming characters in the source code without
        consuming them.

        PARAMS:
            count: int - Specifies how many characters to fetch
        RETURNS:
            str: the following ``count`` characters, or None when ``count``
            is invalid (``self.error`` is set) or fewer than ``count``
            characters remain
        """
        if not isinstance(count, int):
            self.error = Error("Error: count is expected to be an int")
            return None
        if count < 1:
            self.error = Error(
                "Error: count is expected be a positive integer")
            return None
        sc_length = len(self.source_code)
        next_char = self.position.index + 1
        if sc_length < next_char + count:
            return None
        return self.source_code[next_char:next_char + count]

    def allowed_character(self, allowed_characters):
        """
        Tells whether the current character is one of ``allowed_characters``,
        ignoring case.

        Sets ``self.error`` and returns False when there is no current
        character or when ``allowed_characters`` is None or empty.
        """
        if self.current_character is None:
            self.error = Error("Error: Unexpected end of source code.")
            return False
        if allowed_characters is None or allowed_characters == "":
            self.error = Error("Error: No characters to allow entered.")
            return False
        return self.current_character.lower() in allowed_characters.lower()

    def make_tokens(self):
        """
        Performs the lexical analysis on the source code and breaks it down
        to terminal tokens, which are collected in ``self.tokens``.

        RETURNS:
            list, containing token-objects (``self.tokens``), or None when
            lexing stopped on an error; ``self.error`` then holds the error
        """
        while self.current_character:
            if self.allowed_character("0123456789"):
                self.tokens.append(self.make_number())
            elif self.current_character == "\n":
                start = self.position.copy()
                self.advance()
                self.tokens.append(
                    Token(tt._NEWLINE, '\n', start, self.position.copy()))
                indent = self.check_indent()
                if self.error:
                    return None
                if indent != self.position.indent:
                    self.change_indent(indent)
            elif self.allowed_character("'\""):
                self.tokens.append(self.make_string())
            elif self.is_operator():
                self.tokens.append(self.make_operator())
            else:
                is_letter, error = isLetter(self.current_character)
                if error:
                    self.error = error
                    return None
                if is_letter:
                    self.tokens.append(self.make_symbol())
                elif self.allowed_character(" \t"):
                    # Whitespace inside a line separates tokens only.
                    self.advance()
                else:
                    start = self.position.copy()
                    char = self.current_character
                    self.advance()
                    end = self.position.copy()
                    self.tokens.append(Token(tt._INVALID, char, start, end))
                    self.error = Error("ValueError: Unexpected character")
            if self.error:
                return None
        return self.tokens

    def make_number(self):
        """
        Reads characters from source code and returns a number token.
        A leading ``0b``, ``0o`` or ``0x`` (any case) selects the binary,
        octal or hexadecimal reader; everything else is read as a decimal
        integer or float.
        """
        if self.current_character == '0':
            next_character = self.look_ahead()
            if next_character:
                next_character = next_character.lower()
                if next_character == 'b':
                    return self.make_binary()
                elif next_character == 'o':
                    return self.make_octodecimal()
                elif next_character == 'x':
                    return self.make_hexadecimal()
        return self.make_decimal()

    def _make_prefixed_number(self, prefix, allowed_chars, base, token_type):
        """
        Reads a number written as a two-character prefix followed by digits
        of the given base. Shared by the binary, octal and hexadecimal
        readers.

        RETURNS:
            Token of ``token_type`` holding the integer value, or an INVALID
            token (with ``self.error`` set) when the prefix does not match or
            no digit follows it
        """
        number_string = ""
        start_position = self.position.copy()
        # Consume the two prefix characters, e.g. "0x".
        number_string += self.current_character
        self.advance()
        number_string += self.current_character
        self.advance()
        if number_string.lower() != prefix:
            end_position = self.position.copy()
            self.error = Error("ValueError: Can not convert to a number")
            return Token(tt._INVALID, number_string, start_position,
                         end_position)
        while self.current_character and self.allowed_character(allowed_chars):
            number_string += self.current_character
            self.advance()
        end_position = self.position.copy()
        if len(number_string) < 3:
            # Only the prefix was present, there are no digits to convert.
            self.error = Error("ValueError: Can not convert to a number")
            return Token(tt._INVALID, number_string, start_position,
                         end_position)
        # int() accepts the prefix itself when it matches the base.
        return Token(token_type, int(number_string, base=base),
                     start_position, end_position)

    def make_binary(self):
        """
        Reads binary characters until a not allowed character appears.
        Returns a binary token
        """
        return self._make_prefixed_number('0b', "01", 2, tt._BIN)

    def make_octodecimal(self):
        """
        Reads octal characters until a not allowed character appears.
        Returns an octal token
        """
        return self._make_prefixed_number('0o', "01234567", 8, tt._OCT)

    def make_hexadecimal(self):
        """
        Reads hexadecimal characters until a not allowed character appears.
        Returns a hexadecimal token
        """
        return self._make_prefixed_number(
            '0x', "0123456789abcdef", 16, tt._HEX)

    def make_decimal(self):
        """
        Reads decimal characters until a not allowed character appears.
        Returns an integer token, or a float token when a dot was read
        """
        number_string = ""
        dot_counter = 0
        allowed_chars = "1234567890."
        start_position = self.position.copy()
        if self.current_character not in allowed_chars:
            self.error = Error("ValueError: Expected a digit or dot '.'")
            char = self.current_character
            self.advance()
            end_position = self.position.copy()
            return Token(tt._INVALID, char, start_position, end_position)
        # Dots are counted when they become the *upcoming* character, so the
        # loop ends before a second dot is appended.
        while self.allowed_character(allowed_chars) and dot_counter < 2:
            number_string += self.current_character
            self.advance()
            if self.current_character == ".":
                dot_counter += 1
            elif self.current_character is None:
                # Stop here: allowed_character() would flag the end of the
                # source code as an error.
                break
        end_position = self.position.copy()
        if dot_counter:
            return Token(tt._FLOAT, float(number_string), start_position,
                         end_position)
        return Token(tt._INT, int(number_string), start_position,
                     end_position)

    def make_symbol(self):
        """
        Creates identifiers and keywords. Reads characters from the source
        code until getting to a non allowed character, decides if it's a
        keyword or identifier and returns a token.

        RETURNS:
            Token
        """
        allowed_chars = "1234567890_abcdefghijklmnopqrstuvwxyz"
        symbol = ""
        start = self.position.copy()
        # The first character may not be a digit: skip the ten digits.
        if not self.allowed_character(allowed_chars[10:]):
            symbol = self.current_character
            self.advance()
            end = self.position.copy()
            self.error = Error(
                "ValueError: Unexpected illegal character {}".format(symbol))
            return Token(tt._INVALID, symbol, start, end)
        while self.allowed_character(allowed_chars):
            symbol += self.current_character
            if not self.advance():
                break
        end = self.position.copy()
        symbol_type, error = isKeyword(symbol)
        if error:
            self.error = error
            return Token(tt._INVALID, symbol, start, end)
        return Token(symbol_type, symbol, start, end)

    def make_string(self):
        """
        Reads a string literal that starts at the current quote character
        (single or double) and ends at the next unescaped matching quote.

        Escapes: a backslash before a line break joins the lines; a backslash
        before a quote character, ``n``, ``t`` or a backslash yields a quote,
        a newline, a tab or a backslash. Any other backslash is kept as is.

        RETURNS:
            Token: a STRING token, or an INVALID token (with ``self.error``
            set) when the string is cut off by a line break or by the end of
            the source code
        """
        start = self.position.copy()
        quote = self.current_character
        terminators = quote + "\n"
        string = ""
        while self.advance() and not self.allowed_character(terminators):
            if self.current_character == '\\':
                upcoming = self.look_ahead()
                if upcoming == '\n':
                    # Line continuation: drop both backslash and line break.
                    self.advance()
                    continue
                if upcoming in ('"', "'"):
                    # Escaped quote: keep the quote, do not end the string.
                    self.advance()
                    string += self.current_character
                    continue
                if upcoming == 'n':
                    self.advance()
                    string += '\n'
                    continue
                if upcoming == 't':
                    self.advance()
                    string += '\t'
                    continue
                if upcoming == '\\':
                    # Skip the first backslash; the second one is appended
                    # below.
                    self.advance()
            string += self.current_character
        if self.current_character == quote:
            self.advance()
            end = self.position.copy()
            return Token(tt._STRING, string, start, end)
        if self.current_character == "\n":
            self.advance()
            end = self.position.copy()
            self.error = Error("StringError: Incorrect line break in string")
            return Token(tt._INVALID, string, start, end)
        # The source code ended before the closing quote was found.
        end = self.position.copy()
        self.error = Error("StringError: Unexpected end of source code in string")
        return Token(tt._INVALID, string, start, end)

    def check_indent(self):
        """
        Checks indentation level, counted in groups of four spaces.

        RETURNS:
            int - Level of indentation, or None (with ``self.error`` set and
            an INVALID token appended) when the indentation is not valid
        """
        # NOTE(review): any character other than a line break or a space at
        # this point is reported as an IndentationError, which includes a
        # line that starts at column 0 and the end of the source code --
        # confirm this is intended.
        if self.current_character == "\n":
            self.advance()
        elif self.current_character != " ":
            start = self.position.copy()
            char = self.current_character
            self.advance()
            end = self.position.copy()
            self.error = Error("IndentationError: Unexpected character")
            self.tokens.append(Token(tt._INVALID, char, start, end))
            return None
        count = 0
        start, end = None, None
        while self.current_character == " ":
            if count % 4 == 0:
                # Remember where the current (possibly partial) level starts.
                start = self.position.copy()
            count += 1
            self.advance()
        if count % 4 == 0:
            return count // 4
        self.error = Error("IndentationError: Invalid indentation")
        end = self.position.copy()
        self.tokens.append(
            Token(tt._INVALID, " " * (count % 4), start, end))
        return None

    def change_indent(self, indent):
        """
        Generates indent and dedent tokens until ``self.position.indent``
        equals ``indent``.

        Sets ``self.error`` when ``indent`` is not a non-negative integer.
        """
        if not isinstance(indent, int) or indent < 0:
            self.error = Error("ValueError: Positive integer expected")
            return
        while self.position.indent < indent:
            self.position.indent += 1
            self.tokens.append(
                Token(tt._INDENT, "    ", self.position, self.position))
        while self.position.indent > indent:
            self.position.indent -= 1
            self.tokens.append(
                Token(tt._DEDENT, "    ", self.position, self.position))

    def is_operator(self):
        """
        Tells whether the current character can start an operator or
        delimiter token.
        """
        first_char_in_op = (
            '=', '+', '-', '*', '/', '%', '&', '|', '^', '<', '>',
            '(', ')', '[', ']', '{', '}', '.', ',', ':',
        )
        return self.current_character in first_char_in_op

    def make_operator(self):
        """
        Reads an operator or delimiter, preferring the longest match: three
        characters, then two, then one.

        RETURNS:
            Token: the operator token, or an INVALID token (with
            ``self.error`` set) when nothing matches
        """
        following = self.look_ahead(2)
        if not following:
            # Fewer than two characters remain; try a single one.
            following = self.look_ahead()
        possible_op = self.current_character
        if possible_op is None:
            self.error = Error("LexicalError: No characters in buffer")
            return Token(tt._INVALID, None, None, None)
        if following:
            possible_op += following
        start = self.position.copy()
        values = {
            '=': tt._ASSIGN,
            '==': tt._BITWISE_EQ,
            '+': tt._PLUS,
            '++': tt._INCR,
            '+=': tt._PLUS_ASSIGN,
            '-': tt._MINUS,
            '--': tt._DECR,
            '-=': tt._MINUS_ASSIGN,
            '*': tt._MULT,
            '*=': tt._MULT_ASSIGN,
            '**': tt._EXP,
            '**=': tt._POWER_ASSIGN,
            '/': tt._DIV,
            '/=': tt._DIV_ASSIGN,
            '//': tt._FLOOR,
            '//=': tt._FLOOR_ASSIGN,
            '%': tt._MOD,
            '%=': tt._MOD_ASSIGN,
            '&=': tt._AND_ASSIGN,
            '&': tt._BITWISE_AND,
            '|=': tt._OR_ASSIGN,
            '|': tt._BITWISE_OR,
            '^': tt._BITWISE_XOR,
            '^=': tt._XOR_ASSIGN,
            '<': tt._BITWISE_LT,
            '<=': tt._BITWISE_LTE,
            '<<': tt._BITWISE_LSHIFT,
            '<<=': tt._LSHIFT_ASSIGN,
            '>': tt._BITWISE_GT,
            '>=': tt._BITWISE_GTE,
            '>>': tt._BITWISE_RSHIFT,
            '>>=': tt._RSHIFT_ASSIGN,
            '(': tt._LPARAN,
            ')': tt._RPARAN,
            '[': tt._LSQBRACK,
            ']': tt._RSQBRACK,
            '{': tt._LCURLBRACK,
            '}': tt._RCURLBRACK,
            '.': tt._DOT,
            ',': tt._COMMA,
            ':': tt._COLON,
        }
        for size in (3, 2, 1):
            candidate = possible_op[:size]
            if len(candidate) == size and values.get(candidate):
                self.advance(size)
                end = self.position.copy()
                return Token(values.get(candidate), candidate, start, end)
        self.error = Error("ValueError: Token not a operator")
        char = self.current_character
        self.advance()
        end = self.position.copy()
        return Token(tt._INVALID, char, start, end)
class Lexer:
    """Turn arithmetic source text into a list of tokens."""

    def __init__(self, file_name, text):
        """Remember the input and load its first character.

        Args:
            file_name: Name of the origin of ``text``; forwarded to ``Position``.
            text: The source text to tokenize.
        """
        self.file_name = file_name
        self.text = text
        self.position = Position(-1, 0, -1, file_name, text)
        self.current_char = None
        self.advance()

    def advance(self):
        """Step to the next character; ``current_char`` is None past the end."""
        self.position.advance(self.current_char)
        if self.position.index < len(self.text):
            self.current_char = self.text[self.position.index]
        else:
            self.current_char = None

    def make_tokens(self):
        """Tokenize the whole text.

        Returns:
            ``(tokens, None)`` with a trailing ``TOKENTYPE_EOF`` token on
            success, or ``([], error)`` at the first illegal character.
        """
        # Every operator of this lexer is a single character.
        operator_types = {
            '+': TOKENTYPE_PLUS,
            '-': TOKENTYPE_MINUS,
            '*': TOKENTYPE_MUL,
            '/': TOKENTYPE_DIV,
            '(': TOKENTYPE_LEFTPARENTESIS,
            ')': TOKENTYPE_RIGHTPARENTESIS,
        }
        result = []

        while self.current_char is not None:
            char = self.current_char

            if char in ' \t':
                self.advance()
                continue

            if char in DIGITS:
                result.append(self.make_number())
                continue

            if char in operator_types:
                result.append(Token(operator_types[char], position_start=self.position))
                self.advance()
                continue

            # Anything else is illegal; report it with its span.
            position_start = self.position.copy()
            self.advance()
            return [], IllegalCharError(position_start, self.position, f">>{char}<<")

        result.append(Token(TOKENTYPE_EOF, position_start=self.position))
        return result, None

    def make_number(self):
        """Read an integer or float literal starting at the current character.

        Reading stops at the first character that is neither a digit nor a
        dot, or at a second dot.
        """
        pieces = []
        seen_dot = False
        position_start = self.position.copy()

        while self.current_char is not None and self.current_char in DIGITS + '.':
            if self.current_char == '.':
                if seen_dot:
                    break
                seen_dot = True
            pieces.append(self.current_char)
            self.advance()

        literal = ''.join(pieces)
        if seen_dot:
            return Token(TOKENTYPE_FLOAT, float(literal), position_start, self.position)
        return Token(TOKENTYPE_INT, int(literal), position_start, self.position)
class Lexer:
    """Break arithmetic source text (including ``^``) into tokens."""

    def __init__(self, fn, text):
        """Keep the input and load its first character.

        Args:
            fn: Name of the origin of ``text``; forwarded to ``Position``.
            text: The source text to tokenize.
        """
        self.fn = fn
        self.text = text
        self.pos = Position(-1, 0, -1, fn, text)
        self.current_char = None
        self.advance()

    def advance(self):
        """Step to the next character; ``current_char`` is None past the end."""
        self.pos.advance(self.current_char)
        if self.pos.idx < len(self.text):
            self.current_char = self.text[self.pos.idx]
        else:
            self.current_char = None

    def make_tokens(self):
        """Tokenize the whole text.

        Returns:
            ``(tokens, None)`` with a trailing ``TT_EOF`` token on success, or
            ``([], error)`` at the first illegal character.
        """
        # Characters that map directly onto a token type.
        symbol_table = {
            '+': TT_PLUS,
            '-': TT_MINUS,
            '*': TT_MUL,
            '/': TT_DIV,
            '^': TT_POW,
            '(': TT_LPAREN,
            ')': TT_RPAREN,
        }
        collected = []

        while self.current_char is not None:
            here = self.current_char
            if here in ' \t':
                self.advance()
            elif here in DIGITS:
                collected.append(self.make_number())
            elif here in symbol_table:
                collected.append(Token(symbol_table[here], pos_start=self.pos))
                self.advance()
            else:
                bad_start = self.pos.copy()
                self.advance()
                return [], IllegalCharError(bad_start, self.pos, f"'{here}'")

        collected.append(Token(TT_EOF, pos_start=self.pos))
        return collected, None

    def make_number(self):
        """Read an integer or float literal starting at the current character.

        Reading stops at the first character that is neither a digit nor a
        dot, or at a second dot.
        """
        chars = []
        has_dot = False
        pos_start = self.pos.copy()

        while self.current_char is not None and self.current_char in DIGITS + '.':
            is_dot = self.current_char == '.'
            if is_dot and has_dot:
                break
            has_dot = has_dot or is_dot
            chars.append(self.current_char)
            self.advance()

        text = ''.join(chars)
        if not has_dot:
            return Token(TT_INT, int(text), pos_start, self.pos)
        return Token(TT_FLOAT, float(text), pos_start, self.pos)
class Lexer:
    """Tokenize source lines, producing one token list per non-empty line.

    ``lines`` is indexed as ``lines[line][column]`` using the line and column
    stored in ``self.pos``.
    """

    def __init__(self, file_name, lines):
        """Store the input and load the first symbol.

        Args:
            file_name: Name of the origin of ``lines``; forwarded to ``Position``.
            lines: Sequence of source lines.
        """
        self.lines = lines
        self.file_name = file_name
        self.pos = Position(-1, 0, -1, file_name, lines)
        self.cur_symbol = None
        self.advance()

    def advance(self):
        """Step to the next symbol; ``cur_symbol`` is None when none is there."""
        self.pos.advance(self.cur_symbol)
        next_symbol = None
        if self.pos.line < len(self.lines):
            if self.pos.column < len(self.lines[self.pos.line]):
                next_symbol = self.lines[self.pos.line][self.pos.column]
        self.cur_symbol = next_symbol

    def _peek(self):
        """Return the symbol after the current one on this line, or None."""
        if self.pos.line < len(self.lines):
            line = self.lines[self.pos.line]
            following = self.pos.column + 1
            if following < len(line):
                return line[following]
        return None

    def _starts_number(self):
        """Tell whether a numeric literal starts at the current symbol.

        A dot only starts a number when a digit follows it, so a stray ``.``
        is reported as an illegal symbol instead of failing in ``float()``.
        """
        if self.cur_symbol in DIGITS:
            return True
        if self.cur_symbol == '.':
            following = self._peek()
            return following is not None and following in DIGITS
        return False

    def generate_tokens(self):
        """Tokenize the whole input.

        Returns:
            ``(all_tokens, None)`` on success, where ``all_tokens`` holds one
            list of tokens per non-empty line followed by ``[EOF token]``; or
            ``([], error)`` at the first illegal symbol.
        """
        # Operators and delimiters that are always one symbol long.
        single_symbol_types = {
            '+': TT_PLUS,
            '-': TT_MINUS,
            '*': TT_MULT,
            '^': TT_EXP,
            '%': TT_MOD,
            '(': TT_L_PAREN,
            ')': TT_R_PAREN,
            '{': TT_L_C_BRACK,
            '}': TT_R_C_BRACK,
            ',': TT_COMMA,
        }
        all_tokens = []  # Tokens for entire file
        tokens = []  # Tokens for a line

        while self.cur_symbol is not None:
            symbol = self.cur_symbol
            if symbol in IRRELEVENT_SYMBOLS:
                self.advance()
            elif self._starts_number():
                tokens.append(self.generate_number())
            elif symbol in LETTERS:
                # Guarantees the first symbol is a letter; the rest can be any.
                # Either keyword, variable, boolean or word operator.
                tokens.append(self.generate_word())
            elif symbol in single_symbol_types:
                tokens.append(Token(single_symbol_types[symbol], start_pos=self.pos))
                self.advance()
            elif symbol == '/':
                # "/=" is the not-equals operator, a lone "/" is division.
                tokens.append(self.generate_compare(TT_NE, TT_DIV))
            elif symbol == '=':
                tokens.append(self.generate_compare(TT_EQT, TT_EQ))
            elif symbol == '<':
                tokens.append(self.generate_compare(TT_LTE, TT_LT))
            elif symbol == '>':
                tokens.append(self.generate_compare(TT_GTE, TT_GT))
            elif symbol in ('\n', '#'):
                # Store tokens for the previous line when a new line appears.
                # If there are no tokens for the line, do not append an empty
                # list. The '#' is for comments on the code.
                # NOTE(review): only the '#' symbol itself is consumed here;
                # the text after it is still tokenized -- confirm comments are
                # stripped before the lexer runs.
                if tokens:
                    all_tokens.append(tokens)
                    tokens = []
                self.advance()
            else:
                pos_begin = self.pos.copy()
                illegal_symbol = symbol
                self.advance()
                return ([], IllegalSymbolError(pos_begin, self.pos,
                                               "'" + illegal_symbol + "'"))

        # Edge case for when there isn't a new line at the end of the program.
        if tokens:
            all_tokens.append(tokens)
        all_tokens.append([Token(TT_EOF, start_pos=self.pos)])
        return (all_tokens, None)

    def generate_number(self):
        """Read an integer or float literal starting at the current symbol.

        Reading stops at the first symbol that is neither a digit nor a dot,
        or at a second dot. The decimal point is kept in the literal, so
        ``1.5`` yields ``1.5``.

        Returns:
            A ``TT_INT`` token when no dot was read, otherwise a ``TT_FLOAT`` token.
        """
        number_str = ""
        decimal_cnt = 0
        start_pos = self.pos.copy()

        while self.cur_symbol is not None and self.cur_symbol in DIGITS + '.':
            if self.cur_symbol == '.':
                decimal_cnt += 1
                if decimal_cnt > 1:
                    # A second dot cannot belong to this number.
                    break
            number_str += self.cur_symbol
            self.advance()

        if decimal_cnt == 0:
            return Token(TT_INT, int(number_str), start_pos, self.pos)
        return Token(TT_FLOAT, float(number_str), start_pos, self.pos)

    def generate_word(self):
        """Read a run of letters, digits and underscores and classify it.

        Returns:
            A keyword, boolean (value is a Python bool), ``and``/``or``/not
            operator, or identifier token.
        """
        id_str = ""
        pos_start = self.pos.copy()

        while self.cur_symbol is not None and self.cur_symbol in LETTERS_DIGITS + '_':
            id_str += self.cur_symbol
            self.advance()

        value = id_str
        if id_str in KEYWORDS:
            tok_type = TT_KEYWORD
        elif id_str in BOOLEANS:
            tok_type = TT_BOOL
            value = id_str == "True"
        elif id_str in WORD_OPERATOR:
            if id_str == "and":
                tok_type = TT_AND
            elif id_str == "or":
                tok_type = TT_OR
            else:
                tok_type = TT_NOT
        else:
            tok_type = TT_ID

        return Token(tok_type, value, pos_start, self.pos)

    def generate_compare(self, cmp_tok_a, cmp_tok_b):
        """Read a one-symbol operator that may be followed by ``=``.

        Args:
            cmp_tok_a: Token type used when ``=`` follows the current symbol.
            cmp_tok_b: Token type used for the current symbol on its own.
        """
        start_pos = self.pos.copy()
        self.advance()

        if self.cur_symbol == '=':
            self.advance()
            return Token(cmp_tok_a, start_pos=start_pos, end_pos=self.pos)

        return Token(cmp_tok_b, start_pos=start_pos, end_pos=self.pos)
class Lexer:
    """Tokenize source text, skipping whitespace and ``#`` comments."""

    def __init__(self, text: str, file_name: str):
        """Store the input; the first character is loaded by ``make_tokens``.

        Args:
            text: The source text to tokenize.
            file_name: Name of the origin of ``text``; forwarded to ``Position``.
        """
        self.text = text
        self.pos = Position(-1, 0, -1, file_name, text)
        self.current_char: Optional[str] = None

    def advance(self) -> None:
        """Step to the next character; ``current_char`` is None past the end."""
        self.pos.advance(self.current_char)
        if self.pos.index < len(self.text):
            self.current_char = self.text[self.pos.index]
        else:
            self.current_char = None

    def make_tokens(self) -> List[Token]:
        """Tokenize the whole text.

        Returns:
            The tokens followed by a ``TT_EOF`` token, or ``[]`` when ``!`` is
            not followed by ``=``.

        Raises:
            IllegalCharacterError: On a character that starts no token.
        """
        # Tokens that are always exactly one character long.
        single_char_types = {
            "+": TT_PLUS,
            "*": TT_MUL,
            "^": TT_POW,
            "/": TT_DIV,
            "(": TT_LPAREN,
            ")": TT_RPAREN,
            "[": TT_LBRACKET,
            "]": TT_RBRACKET,
            "{": TT_LCURLY,
            "}": TT_RCURLY,
            ";": TT_SEMICOLON,
            ",": TT_COMA,
            ":": TT_COLON,
        }
        comment = False
        tokens: List[Token] = []
        self.advance()

        while self.current_char is not None:
            char = self.current_char

            if comment:
                if char in "\n\r":
                    # The line break ends the comment and is then handled as
                    # ordinary whitespace below.
                    comment = False
                else:
                    self.advance()
                    continue

            # Branches that end in ``continue`` call helpers that have already
            # moved past their token; the others consume exactly one character
            # through the advance() at the bottom of the loop.
            if char in single_char_types:
                tokens.append(EmptyToken(single_char_types[char], self.pos, self.pos))
            elif char == "-":
                tokens.append(self.make_minus_or_arrow())
                continue
            elif char == "#":
                comment = True
            elif char == "|":
                tokens.append(
                    StringToken(TT_KEYWORD, self.pos, self.pos, KEYWORDS["MATCH_OR"]))
            elif char == "=":
                tokens.append(self.make_equals())
                continue
            elif char == "<":
                tokens.append(self.make_less_than())
                continue
            elif char == ">":
                tokens.append(self.make_greater_than())
                continue
            elif char == "!":
                token, error = self.make_not_equals()
                if error or token is None:
                    # NOTE(review): the error object is discarded here while
                    # illegal characters raise -- confirm callers expect [].
                    return []
                tokens.append(token)
                # make_not_equals() already consumed "!=", so skip the
                # advance() below; otherwise the next character is lost.
                continue
            elif char.isdigit():
                tokens.append(self.make_number())
                continue
            elif char == "_" or (char.isalnum() and not char.isdigit()):
                tokens.append(self.make_identifier())
                continue
            elif not char.isspace():
                post_start = self.pos.copy()
                self.advance()
                raise IllegalCharacterError(post_start, self.pos, char)

            self.advance()

        tokens.append(EmptyToken(TT_EOF, self.pos, self.pos))
        return tokens

    def make_number(self) -> Token:
        """Read an integer or float literal starting at the current character.

        Reading stops at the first character that is neither a digit nor a
        dot, or at a second dot.
        """
        num = ""
        dot_count = 0
        pos_start = self.pos.copy()

        while self.current_char is not None:
            if self.current_char == ".":
                if dot_count:
                    # A second dot cannot belong to this number.
                    break
                dot_count = 1
            elif not self.current_char.isdigit():
                break
            num += self.current_char
            self.advance()

        if dot_count == 0:
            return NumberToken(TT_INT, pos_start, self.pos, int(num))
        return NumberToken(TT_FLOAT, pos_start, self.pos, float(num))

    def make_identifier(self) -> Token:
        """Read a run of alphanumeric characters and underscores.

        Returns:
            A ``TT_KEYWORD`` token when the text is one of the ``KEYWORDS``
            values, otherwise a ``TT_IDENTIFIER`` token.
        """
        id_str = ""
        pos_start = self.pos.copy()

        while self.current_char and (self.current_char.isalnum()
                                     or self.current_char == "_"):
            id_str += self.current_char
            self.advance()

        tok_type = TT_KEYWORD if id_str in KEYWORDS.values() else TT_IDENTIFIER
        return StringToken(tok_type, pos_start, self.pos, id_str)

    def make_not_equals(self) -> Tuple[Optional[Token], Optional[Error]]:
        """Read ``!=`` starting at the ``!`` character.

        Returns:
            ``(token, None)`` for ``!=``; ``(None, error)`` when ``!`` is not
            followed by ``=``.
        """
        pos_start = self.pos.copy()
        self.advance()

        if self.current_char == "=":
            self.advance()
            return EmptyToken(TT_NE, pos_start, self.pos), None

        return (
            None,
            UnexpectedCharError(pos_start, self.pos, 'after "!" should be "="'),
        )

    def make_equals(self) -> Token:
        """Read ``=`` (``TT_EQUALS``) or ``==`` (``TT_EE``)."""
        token_type = TT_EQUALS
        pos_start = self.pos.copy()
        self.advance()

        if self.current_char == "=":
            self.advance()
            token_type = TT_EE

        return EmptyToken(token_type, pos_start, self.pos)

    def make_greater_than(self) -> Token:
        """Read ``>`` (``TT_GT``) or ``>=`` (``TT_GTE``)."""
        token_type = TT_GT
        pos_start = self.pos.copy()
        self.advance()

        if self.current_char == "=":
            self.advance()
            token_type = TT_GTE

        return EmptyToken(token_type, pos_start, self.pos)

    def make_less_than(self) -> Token:
        """Read ``<`` (``TT_LT``) or ``<=`` (``TT_LTE``)."""
        token_type = TT_LT
        pos_start = self.pos.copy()
        self.advance()

        if self.current_char == "=":
            self.advance()
            token_type = TT_LTE

        return EmptyToken(token_type, pos_start, self.pos)

    def make_minus_or_arrow(self) -> Token:
        """Read ``-`` (``TT_MINUS``) or ``->`` (``TT_ARROW``)."""
        token_type = TT_MINUS
        pos_start = self.pos.copy()
        self.advance()

        if self.current_char == ">":
            self.advance()
            token_type = TT_ARROW

        return EmptyToken(token_type, pos_start, self.pos)
class Lexer:
    """Turn source text into a flat list of tokens.

    The lexer walks ``text`` one character at a time, tracking its location
    in ``self.pos`` and the character under the cursor in
    ``self.current_char`` (``None`` once the end of the text is reached).
    """

    def __init__(self, fn, text):
        """Create a lexer for ``text``.

        Args:
            fn: Name of the source (passed through to ``Position``).
            text: The source text to tokenize.
        """
        self.fn = fn
        self.text = text
        # The position starts before the text; the advance() below moves it
        # onto the first character (presumably index 0 -- depends on
        # Position.advance, which is defined elsewhere).
        self.pos = Position(-1, 0, -1, fn, text)
        self.current_char = None
        self.advance()

    def advance(self):
        """Move to the next character, or set ``current_char`` to ``None`` at the end."""
        self.pos.advance(self.current_char)
        idx = self.pos.idx
        self.current_char = self.text[idx] if idx < len(self.text) else None

    def make_tokens(self):
        """Tokenize the whole text.

        Returns:
            A ``(tokens, error)`` pair. On success ``tokens`` ends with a
            ``TT_EOF`` token and ``error`` is ``None``. On failure ``tokens``
            is an empty list and ``error`` is the error instance describing
            the problem.
        """
        # Operators that are always exactly one character long.
        single_char_tokens = {
            '+': TT_PLUS,
            '*': TT_MUL,
            '/': TT_DIV,
            '^': TT_POWER,
            '(': TT_LPAREN,
            ')': TT_RPAREN,
            ',': TT_COMMA,
        }

        tokens = []
        while self.current_char is not None:
            char = self.current_char

            if char in ' \t':
                self.advance()
            elif char in DIGITS:
                tokens.append(self.make_number())
            elif char in LETTERS:
                tokens.append(self.make_identifier())
            elif char in single_char_tokens:
                tokens.append(Token(single_char_tokens[char], pos_start=self.pos))
                self.advance()
            elif char == '-':
                # make_minus_or_arrow() consumes '-' (and a following '>')
                # itself; advancing again here would drop the next character.
                tokens.append(self.make_minus_or_arrow())
            elif char == '!':
                tok, error = self.make_not_equals()
                if error:
                    # Return the error instance, not the Error class.
                    return [], error
                tokens.append(tok)
            elif char == '=':
                tokens.append(self.make_equals())
            elif char == '<':
                tokens.append(self.make_less_than())
            elif char == '>':
                tokens.append(self.make_greater_than())
            else:
                pos_start = self.pos.copy()
                self.advance()
                return [], IllegalCharError(pos_start, self.pos, "'" + char + "'")

        tokens.append(Token(TT_EOF, pos_start=self.pos))
        return tokens, None

    def make_identifier(self):
        """Read an identifier or keyword starting at the current character.

        Consumes characters while they are in ``LETTERS_DIGITS`` or are an
        underscore. Returns a ``TT_KEYWORD`` token if the text is in
        ``KEYWORDS``, otherwise a ``TT_IDENTIFIER`` token.
        """
        id_str = ''
        pos_start = self.pos.copy()
        allowed = LETTERS_DIGITS + '_'

        while self.current_char is not None and self.current_char in allowed:
            id_str += self.current_char
            self.advance()

        tok_type = TT_KEYWORD if id_str in KEYWORDS else TT_IDENTIFIER
        return Token(tok_type, id_str, pos_start, self.pos)

    # Backward-compatible alias for the original (misspelled) method name.
    make_indentifier = make_identifier

    def make_minus_or_arrow(self):
        """Read ``-`` (``TT_MINUS``) or ``->`` (``TT_ARROW``) and consume it."""
        tok_type = TT_MINUS
        pos_start = self.pos.copy()
        self.advance()

        if self.current_char == '>':
            self.advance()
            tok_type = TT_ARROW

        return Token(tok_type, pos_start=pos_start, pos_end=self.pos)

    def make_not_equals(self):
        """Read ``!=``.

        Returns:
            ``(token, None)`` with a ``TT_NE`` token when ``!`` is followed
            by ``=``; otherwise ``(None, ExpectedCharError)``.
        """
        pos_start = self.pos.copy()
        self.advance()

        if self.current_char == '=':
            self.advance()
            return Token(TT_NE, pos_start=pos_start, pos_end=self.pos), None

        return None, ExpectedCharError(pos_start, self.pos, "'=' expected after '!'")

    def make_equals(self):
        """Read ``=`` (``TT_EQ``) or ``==`` (``TT_EE``) and consume it."""
        tok_type = TT_EQ
        pos_start = self.pos.copy()
        self.advance()

        if self.current_char == '=':
            self.advance()
            tok_type = TT_EE

        return Token(tok_type, pos_start=pos_start, pos_end=self.pos)

    def make_less_than(self):
        """Read ``<`` (``TT_LT``) or ``<=`` (``TT_LTE``) and consume it."""
        tok_type = TT_LT
        pos_start = self.pos.copy()
        self.advance()

        if self.current_char == '=':
            self.advance()
            tok_type = TT_LTE

        return Token(tok_type, pos_start=pos_start, pos_end=self.pos)

    def make_greater_than(self):
        """Read ``>`` (``TT_GT``) or ``>=`` (``TT_GTE``) and consume it."""
        tok_type = TT_GT
        pos_start = self.pos.copy()
        self.advance()

        if self.current_char == '=':
            self.advance()
            tok_type = TT_GTE

        return Token(tok_type, pos_start=pos_start, pos_end=self.pos)

    def make_number(self):
        """Read an integer or float literal starting at the current character.

        Consumes digits and at most one ``.``; a second ``.`` stops the scan
        and is left for the caller. Returns a ``TT_INT`` token holding an
        ``int`` when no dot was seen, otherwise a ``TT_FLOAT`` token holding
        a ``float``.
        """
        num_str = ''
        dot_count = 0
        pos_start = self.pos.copy()
        allowed = DIGITS + '.'

        while self.current_char is not None and self.current_char in allowed:
            if self.current_char == '.':
                if dot_count == 1:
                    break
                dot_count += 1
            num_str += self.current_char
            self.advance()

        if dot_count == 0:
            return Token(TT_INT, int(num_str), pos_start, self.pos)
        return Token(TT_FLOAT, float(num_str), pos_start, self.pos)