def buildMultichar(self) -> Token:
    """Consume a multi-character operator token (at most 4 characters).

    Starting from the current character, the operator is extended one
    character at a time while the next character is an operator character
    (``T.T_MULTIOP``) and the extended text is still a registered operator
    (``T.MULTIOPERS``).

    Returns:
        Token: token whose kind and value are both the operator text; its
        span runs from the operator's first character to the current
        location after the last consumed character.
    """
    op = self.ch
    begin = self.loc.copy()
    self.advance()
    # The original unrolled this extend-and-check step three times verbatim;
    # a loop bounded by the maximum operator length (4 chars) is equivalent.
    while (len(op) < 4 and self.ch in T.T_MULTIOP
           and op + self.ch in T.MULTIOPERS):
        op += self.ch
        self.advance()
    return Token(op, op, begin, self.loc.copy())
def buildIncluder(self) -> Token:
    """Read an include target of the form ``<path>`` and return it as a
    string token. Assumes the caller has positioned us on the opening '<'.
    """
    self.advance()
    begin = self.loc.copy()
    # Degenerate "<>" includer: produce an empty string token.
    if self.ch == ">":
        self.advance()
        return Token(T.T_STRING, "", begin, self.loc.copy())
    close_at = self.raw.find(">", self.chidx)
    if close_at == -1 or close_at >= len(self.raw):
        # No closing '>' before the end of the source text.
        raise TokenMismatch(Token("<", ">", begin, begin))
    target = self.raw[self.chidx:close_at]
    # Jump the cursor straight to the '>'; advance() then steps past it.
    self.chidx = close_at
    self.advance()
    return Token(T.T_STRING, target, begin, self.loc.copy())
def __init__(self, ly, out):
    """Bind the collaborating lexer and output sink, and reset all
    per-run token state to empty.
    """
    self.ly = ly
    self.out = out
    # Accumulated script lines / tokens produced so far.
    self.script = []
    self.listToken = []
    # The token currently under analysis; starts as an empty placeholder.
    self.currentToken = Token('', '')
def buildAmbiguous(self) -> Token:
    """Scan an identifier-or-keyword ("ambiguous") word from the source.

    Uses ``ambiguous_regex`` to find the end of the word, then classifies
    the text as ``T.T_KEYWORD`` if it appears in ``T.KEYWORDS``, otherwise
    as ``T.T_ID``.

    Returns:
        Token: keyword or identifier token spanning the scanned word.
    """
    begin = self.loc.copy()
    self.advance()
    # Local aliases avoid repeated attribute lookups below.
    raw = self.raw
    chidx = self.chidx
    # ambiguous_regex presumably matches a non-word boundary,
    # r"\W", flags=re.ASCII
    # NOTE(review): search() returning None (no non-word char before EOF)
    # would raise AttributeError here — confirm the raw text is always
    # terminated by a sentinel that matches the pattern.
    end = ambiguous_regex.search(raw, chidx).end() - 1
    # The word started one character before the current cursor (advance()
    # above moved past its first character).
    value = (raw[chidx - 1:end])
    # Cursor/column adjustment; the -2 compensates for the leading char
    # already consumed and the trailing advance() below — TODO confirm
    # against advance()'s bookkeeping.
    lv = end - (chidx - 1) - 2
    self.chidx += lv
    self.loc.ch += lv
    self.advance()
    if (value in T.KEYWORDS):
        return Token(T.T_KEYWORD, value, begin, self.loc.copy())
    return Token(T.T_ID, value, begin, self.loc.copy())
def buildChar(self) -> Token:
    """Build a character token from ``'x'`` or an escaped ``'\\x'``.

    The token's value is the character's code point (an ``int``).

    Raises:
        UnexepectedEOFError: fewer than 3 characters remain in the buffer,
            so the literal cannot possibly be closed.
        TokenMismatch: the literal is not closed by a single quote.
    """
    self.advance()
    begin = self.loc.copy()
    v = ord(self.ch)
    # Need at least the character, the closing quote, and one lookahead.
    if self.size - self.chidx < 3:
        raise UnexepectedEOFError(
            Token(T.T_CHAR, self.ch, begin, self.loc))
    # Handle backslash escapes.
    if self.ch == "\\":
        self.advance()
        # Translate the escape with an explicit table instead of eval()-ing
        # source-derived text: the previous eval() approach executed
        # attacker-controllable input and raised TypeError on any escape
        # Python doesn't recognize (e.g. '\q'). Unknown escapes now fall
        # back to the literal character itself.
        escapes = {
            "n": "\n", "t": "\t", "r": "\r", "v": "\v", "f": "\f",
            "a": "\a", "b": "\b", "0": "\0",
            "\\": "\\", "'": "'", '"': '"',
        }
        v = ord(escapes.get(self.ch, self.ch))
    self.advance()
    # The literal must be closed by a single quote.
    if self.ch != "'":
        raise TokenMismatch(Token(T.T_CHAR, v, begin, self.loc))
    self.advance()
    return Token(T.T_CHAR, v, begin, self.loc.copy())
def buildString(
        self) -> Token:  # build a string value with escape characters
    """Scan a double-quoted string literal and return it as a string token.

    The content is taken verbatim up to the next '"' in the raw text —
    NOTE(review): a backslash-escaped quote would therefore terminate the
    string early; confirm whether escaped quotes are meant to be supported.
    """
    self.advance()
    begin = self.loc.copy()
    # Empty string literal: "" yields an empty-valued token.
    if (self.ch == "\""):
        self.advance()
        return Token(T.T_STRING, "", begin, self.loc.copy())
    end = self.raw.find("\"", self.chidx)
    # Unterminated string: no closing quote before end of source.
    if (end == -1 or end >= len(self.raw)):
        raise (TokenMismatch(Token("\"", "\"", begin, begin)))
    content = self.raw[self.chidx:end]
    # Jump the cursor to the closing quote.
    self.chidx = end
    # account for multi-line strings
    # NOTE(review): this adds newlines-1 rather than newlines, and the
    # column below is bumped by the full content length even across line
    # breaks — both look off for multi-line literals; confirm against
    # the Loc bookkeeping used elsewhere.
    newlines = content.count("\n")
    self.loc.line += newlines - 1 if newlines else 0
    self.loc.ch += len(content)
    self.advance()
    return Token(T.T_STRING, content, begin, self.loc.copy())
def buildNumber(self) -> Token:
    """Scan a numeric literal and return an int/float/double token.

    Supports: decimal ints and floats (with '.' and 'e' exponents),
    hex literals ('0x…'), binary ('…b') and octal ('…q') base suffixes,
    and 'd' (double) / 'f' (float) type suffixes.

    Returns:
        Token: kind is T.T_INT, T.T_FLOAT or T.T_DOUBLE; value is the
        parsed Python int or float.
    """
    num = self.ch
    begin = self.loc.copy()
    self.advance()
    # Characters that may continue the literal.
    pchars = T.T_DIGITS + T.T_DOT + "e"
    base = 10
    # Hex prefix: first char was '0', next is x/X.
    if self.ch in "xX":
        num += self.ch
        self.advance()
        pchars += "abcdefABCDEF"
        base = 16
    while self.ch in pchars:
        num += self.ch
        self.advance()
        # Accept a negative exponent sign, but only directly after a
        # base-10 'e' ('e' is an ordinary digit in hex literals).
        # BUG FIX: previously "pchars += '-'" ran after this loop had
        # already exited, so a '-' exponent sign could never be consumed.
        if base == 10 and self.ch == "-" and num.endswith("e"):
            num += self.ch
            self.advance()
    # Base suffixes: 'b' = binary, 'q' = octal.
    if self.ch == "b":
        base = 2
        self.advance()
    elif self.ch == "q":
        base = 8
        self.advance()
    t = T.T_INT
    if T.T_DOT in num:
        # Fractional literal: always floating point; default kind is double.
        val = float(num)
        if self.ch == 'd':
            t = T.T_DOUBLE
            self.advance()
        elif self.ch == 'f':
            t = T.T_FLOAT
            self.advance()
        else:
            t = T.T_DOUBLE
    else:
        # Integral digits; a 'd'/'f' suffix converts the value to float.
        if self.ch == 'd':
            t = T.T_DOUBLE
            self.advance()
            val = float(int(num, base))
        elif self.ch == 'f':
            t = T.T_FLOAT
            self.advance()
            val = float(int(num, base))
        else:
            val = int(num, base)
    return Token(t, val, begin, self.loc.copy())
def getToken(self, strerr=None, plist=None):
    """Tokenize the current source line (read via linecache) into
    self.listToken, advancing self.lineNumber.

    A per-character automaton dispatcher: `auto` names the active
    sub-automaton ('begin', 'comment', 'number', 'symbol', 'names',
    'string'), `state` is the sub-automaton's internal state, and `tmp`
    accumulates the current lexeme. Inside a string, characters are still
    tokenized normally but `isString` suppresses error reporting; when the
    closing quote arrives, the tokens emitted since the opening quote are
    popped and replaced by a single string token.

    :param strerr: when truthy, resume in string-error mode
                   (NOTE(review): auto is set to 'string_error' below but
                   no automaton handles that state — confirm intent).
    :param plist: when truthy, reuse a provided token list
                  (NOTE(review): `self.listToken = list` assigns the
                  builtin `list` class, almost certainly meant `plist`).
    :return: the list of tokens for this line (ends with an EOF token at
             end of file).
    """
    # Retrieves the next token in current line and stores in token attribute
    global keywords, symbols
    # RegExes:
    # ID: [a-zA-Z_]+[0-9a-zA-Z_]*
    # Comments: {.*}
    # Integers: [0-9]+
    # Float: [0-9]+\.[0-9]+
    # Strings: "[^"]+"
    # Format of symboltable [ KEY, (NAME, CLASS, TYPE, SCOPE, VALUE, LINE DECLARED, LINE REFERENCED) ]
    # Clear previous lines from memory
    linecache.clearcache()
    # Retrieve current line from file
    self.line = linecache.getline(self.file, self.lineNumber + 1)
    # EOF: linecache returns '' past the last line.
    if not self.line:
        self.listToken.append(Token('EOF', 'EOF'))
        return self.listToken
    # Strip tab keys since they're not a problem in this language
    # self.line = self.line.replace('\t', ' ')
    if plist:
        self.listToken = list
    else:
        self.listToken = []
    if strerr:
        auto = 'string_error'
        self.lineNumber -= 1
    else:
        auto = 'begin'
    string = ''          # raw text accumulated while inside a string literal
    isString = False     # True between an opening and a closing '"'
    tmpSizeListToken = 0  # token-list length snapshot at string open
    state = 0
    tmp = ''
    for c in self.line:
        string += c
        # Comment automaton: skip everything until '}' or end of line.
        if auto == 'comment':
            if c != '}':
                if c == '\n':
                    # Unterminated comment on this line.
                    if not isString:
                        self.errorRef.lexerError(
                            'Linha ' + str(self.lineNumber + 2) + ': ' +
                            'comentario nao fechado')
                    auto = 'begin'
            elif c == '}':
                auto = 'begin'
            continue
        # Number automaton:
        # state 1 = integer digits, 2/3 = just saw '.', 4 = fraction digits.
        if auto == 'number':
            if state == 1:
                if c.isdigit():
                    tmp += c
                    continue
                else:
                    if c == '.':
                        state = 2
                    else:
                        # Non-numeric char ends the integer; re-dispatch c.
                        self.listToken.append(Token(tmp, 'numero_inteiro'))
                        auto = 'begin'
            if state == 2:
                if c == '.':
                    state = 3
                    continue
            if state == 3:
                if c.isdigit():
                    # Digit after '.': commit the dot, fraction follows.
                    tmp += '.'
                    state = 4
                elif c == '.':
                    # '..' after an integer is a range operator.
                    self.listToken.append(Token(tmp, 'numero_inteiro'))
                    self.listToken.append(Token('..', '..'))
                    auto = 'begin'
                    continue
                else:
                    # Lone '.' after the integer.
                    self.listToken.append(Token(tmp, 'numero_inteiro'))
                    self.listToken.append(Token('.', '.'))
                    auto = 'begin'
            if state == 4:
                if c.isdigit():
                    tmp += c
                else:
                    self.listToken.append(Token(tmp, 'numero_real'))
                    auto = 'begin'
        # Symbols automaton: greedily extend while still a known symbol.
        if auto == 'symbol':
            if tmp + c in self.symtable.table:
                tmp += c
            elif tmp in self.symtable.table:
                self.listToken.append(Token(tmp, tmp))
                auto = 'begin'
            else:
                # Unknown symbol; suppressed while inside a string literal.
                if not isString:
                    self.errorRef.lexerError(
                        'Linha ' + str(self.lineNumber + 1) + ': ' + tmp +
                        ' - simbolo nao identificado')
                auto = 'begin'
        # Names automaton: identifiers and keywords.
        if auto == 'names':
            if state == 1 and (c.isalnum() or c == '_'):
                tmp += c
                state = 2
            elif state == 1 and (not c.isalnum() or c != '_'):
                # NOTE(review): this condition is always true for state 1,
                # so state unconditionally moves to 2 without consuming c.
                state = 2
            elif state == 2 and (c.isalnum() or c == '_'):
                tmp += c
            if state == 2 and not (c.isalnum() or c == '_'):
                # Word ended; classify via the symbol table.
                if tmp in self.symtable.table:
                    if self.symtable.table[tmp][
                            'token'] == 'identificador':
                        self.listToken.append(Token(tmp, 'identificador'))
                    else:
                        self.listToken.append(Token(tmp, tmp))
                else:
                    #self.symtable.insertSymbol(tmp, (tmp, 'variavel', 'identificador', self.scope, 'null', self.lineNumber, self.lineNumber))
                    self.listToken.append(Token(tmp, 'identificador'))
                auto = 'begin'
        # Checks which automaton to enter (also re-dispatches the char
        # that terminated a previous automaton in this same iteration).
        if auto == 'begin':
            tmp = ''
            state = 1
            if c == '\t' or c == '\n' or c == ' ' or c == '\r':
                continue
            elif c == '"':
                tmp += c
                auto = 'string'
            elif c == '{':
                auto = 'comment'
            elif c.isdigit():
                tmp += c
                auto = 'number'
            elif not c.isalnum() and c != "_":
                tmp += c
                auto = 'symbol'
            elif c.isalpha() or c == '_':
                tmp += c
                auto = 'names'
        # String open/close handling. Opening quote: snapshot the token
        # list and tentatively record an error (removed if the string
        # closes). Closing quote: pop everything emitted inside the
        # literal and replace with one string token.
        if auto == 'string':
            if not isString:
                isString = True
                tmpSizeListToken = len(self.listToken)
                string = c
                self.listToken.append(
                    Token('Linha ' + str(self.lineNumber + 1) + ': ' + tmp,
                          'simbolo nao identificado', True))
                self.listMessage.append(
                    'Linha ' + str(self.lineNumber + 1) + ': ' + tmp +
                    ' - simbolo nao identificado')
            else:
                tmpSizeListToken = len(self.listToken) - tmpSizeListToken
                for i in range(tmpSizeListToken):
                    self.listToken.pop()
                self.listMessage.pop()
                self.listToken.append(Token(string, 'cadeia_literal'))
                string = ''
                isString = False
            auto = 'begin'
    #End of analysis, increment lineNumber
    self.lineNumber += 1
    # Check whether the error message list is empty or there is a message there indicating
    # that some string wasn't properly closed thus indicating that there's a error that must
    # be treated by the Error class.
    if self.listMessage:
        msg = str(self.listMessage.pop())
        self.errorRef.lexerError(msg)
    return self.listToken
def getToken(self, line):
    # Retrieves the next token in current line and stores in token attribute
    """Tokenize one source line into self.listToken.

    A per-character automaton dispatcher: `auto` selects the active
    sub-automaton ('begin', 'string', 'comment', 'number', 'symbol',
    'names'), `state` is its internal state and `tmp` accumulates the
    current lexeme. Error tokens carry Portuguese messages and a third
    truthy flag.

    :param line: the raw text of the line to tokenize (including '\\n').
    :return: list of Token objects recognized on this line.
    """
    global keywords, symbols
    # TODO - Automaton implementation for token recognition
    # RegExes:
    # ID: [a-zA-Z_]+[0-9a-zA-Z_]*
    # Comments: {.*}
    # Integers: [0-9]+
    # Float: [0-9]+\.[0-9]+
    # Strings: "[^"]+"
    self.lineNumber += 1
    self.listToken = []
    auto = "begin"
    state = 0
    tmp = ""
    for c in line:
        # String automaton: state 1 = inside literal, 2 = saw closing '"'.
        if auto == "string":
            if state == 1 and c != '"':
                if c == '\n':
                    # Line ended before the closing quote.
                    self.listToken.append(
                        Token('Linha ' + str(self.lineNumber) + ': ',
                              'cadeia nao fechada', True))
                    auto = 'begin'
                    continue
                tmp += c
            elif state == 1 and c == '"':
                state = 2
                tmp += c
            if state == 2:
                self.listToken.append(Token(tmp, 'cadeia_literal'))
                auto = "begin"
                continue
        # Comment automaton: skip to '}' or report an unclosed comment.
        if auto == "comment":
            if state == 1 and c != '}':
                if c == '\n':
                    self.listToken.append(
                        Token('Linha ' + str(self.lineNumber) + ': ',
                              'comentario nao fechado', True))
                    auto = 'begin'
                continue
            if state == 1 and c == '}':
                auto = 'begin'
                continue
        # Number automaton: state 1 = integer part, 2 = fractional part.
        # NOTE(review): tokens are only emitted when the terminator is
        # '\n' or ' ' — a number followed by any other char (e.g. ';')
        # appears to be dropped; confirm intent.
        if auto == 'number':
            if state == 1 and c.isdigit():
                tmp += c
            elif state == 1 and not c.isdigit():
                if c == '.':
                    state = 2
                    tmp += c
                elif c == '\n' or c == ' ':
                    self.listToken.append(Token(tmp, 'numero_inteiro'))
                    auto = 'begin'
            elif state == 2 and c.isdigit():
                tmp += c
            elif state == 2 and not c.isdigit():
                if c == '\n':
                    self.listToken.append(Token(tmp, 'numero_real'))
                    auto = 'begin'
        # Symbols: greedily extend while still a known symbol-table entry.
        if auto == 'symbol':
            if tmp + c in self.symtable.table:
                tmp += c
            elif tmp in self.symtable.table:
                self.listToken.append(Token(tmp, tmp))
                auto = 'begin'
            else:
                self.listToken.append(
                    Token('Linha ' + str(self.lineNumber) + ': ' + tmp,
                          'simbolo nao identificado'))
                auto = 'begin'
        # Names: identifiers / keywords, classified via the symbol table.
        if auto == 'names':
            if state == 1 and (c.isalpha() or c == '_'):
                tmp += c
                state = 2
            elif state == 2 and (c.isalnum() or c == '_'):
                tmp += c
            if state == 2 and not (c.isalnum() or c == '_'):
                if tmp in self.symtable.table:
                    self.listToken.append(Token(tmp, tmp))
                else:
                    self.listToken.append(Token(tmp, 'identificador'))
                auto = 'begin'
        # Checks which automaton to enter (re-dispatches the terminating
        # char of a previous automaton within this same iteration).
        if auto == "begin":
            tmp = ""
            state = 1
            if c == '"':
                tmp += c
                auto = "string"
            elif c == '{':
                auto = "comment"
            elif c.isdigit():
                tmp += c
                auto = 'number'
            elif not c.isalnum() and c != '\n' and c != ' ':
                # NOTE(review): '\t' and '\r' fall into this symbol branch
                # rather than being skipped — confirm intent.
                tmp += c
                auto = 'symbol'
            elif (c.isalpha() or c == '_') and c != '\n' and c != ' ':
                tmp += c
                auto = 'names'
    return self.listToken
def lex(self, getDirectives=False) -> list:
    """Main lexing loop: walk the raw text char by char and dispatch to
    the build* helpers, producing the full token stream.

    :param getDirectives: when True, return the collected directives list
        instead of the tokens. NOTE(review): nothing is ever appended to
        `directives` below, so this path always returns [] — confirm
        whether directive tokens were meant to be mirrored into it.
    :return: list of Tokens terminated by a T_EOF token (or the
        directives list, see above).
    """
    tokens = []
    directives = []
    # Hoist the bound method for the hot loop.
    advance = self.advance
    # a [1] character marks the end of the raw text
    while self.ch != chr(1):
        # newlines, spaces, and indents have no response
        if (self.ch == "\n" or self.ch in " \t\r"):
            advance()
        # backslash characters
        elif (self.ch == "\\"):
            tokens.append(
                Token(T.T_BSLASH, T.T_BSLASH, self.loc.copy(),
                      self.loc.copy()))
            advance()
        # pre-compiler directives: '#name', optionally followed by <...>
        elif (self.ch == "#"):
            advance()
            while (self.ch == " "):
                advance()
            t = self.buildAmbiguous()
            t.tok = T.T_DIRECTIVE
            tokens.append(t)
            if (self.chidx < self.size - 1):
                if (self.raw[self.chidx + 1] == "<"):
                    t2 = self.buildIncluder()
                    tokens.append(t2)
        # typecasts: '$name' with trailing '*' turned into '.' suffixes
        elif (self.ch == "$"):
            advance()
            t = self.buildAmbiguous()
            t.tok = T.T_TYPECAST
            while self.ch == "*":
                advance()
                t.value += "."
            tokens.append(t)
        # semicolon
        elif (self.ch == ";"):
            tokens.append(
                Token(T.T_ENDL, T.T_ENDL, self.loc.copy(),
                      self.loc.copy()))
            advance()
        elif (self.ch == "+"):
            tokens.append(self.buildMultichar())
        elif (self.ch == "/"):
            advance()
            # managing comments:
            olchdx = self.chidx
            # single line comments:
            if (self.ch == "/"):
                # find and jump to next newline
                self.chidx = self.raw.find("\n", self.chidx) - 1
                # find() == -1 makes chidx negative -> EOF inside comment
                if self.chidx <= 0:
                    raise (UnexepectedEOFError(
                        Token('', '', self.loc, self.loc)))
                self.loc.ch += self.chidx - olchdx
                advance()
            # multiline comments:
            elif (self.ch == "*"):
                # find and jump to next instance of '*/' in raw text
                self.chidx = self.raw.find("*/", self.chidx) + 1
                # find() == -1 makes chidx 0 -> EOF inside comment
                if self.chidx <= 0:
                    raise (UnexepectedEOFError(
                        Token('', '', self.loc, self.loc)))
                self.loc.ch += self.chidx - olchdx
                # keep line counter in sync with newlines skipped over
                self.loc.line += self.raw.count("\n", olchdx, self.chidx)
                advance()
            # not a comment: rewind and lex '/' as an operator
            else:
                self.chidx -= 2
                self.loc.ch -= 2
                advance()
                tokens.append(self.buildMultichar())
        # single-character punctuation
        elif (self.ch in "()}{[],@~?"):
            tokens.append(Token(self.ch, self.ch, self.loc.copy(), None))
            advance()
            tokens[-1].end = self.loc.copy()
        # '-' is a negative-number sign only when the previous token
        # cannot end an expression and a digit follows; otherwise it is
        # an operator.
        elif (self.ch == "-"):
            advance()
            prev = tokens[-1]
            if prev.tok not in [
                    T.T_INT, T.T_CHAR, T.T_DOUBLE, T.T_ID, T.T_CLSP
            ] and self.ch.isdigit():
                t = self.buildNumber()
                t.value = -t.value
            else:
                # rewind and lex '-' as a (multi-char) operator
                self.chidx -= 2
                self.loc.ch -= 2
                advance()
                t = self.buildMultichar()
            tokens.append(t)
        elif (self.ch in T.T_MULTIOP):
            token = self.buildMultichar()
            tokens.append(token)
        # '.' starts a number only when a digit follows (e.g. '.5')
        elif (self.ch == "."):
            if self.raw[self.chidx + 1].isdigit():
                token = self.buildNumber()
            else:
                token = self.buildMultichar()
            tokens.append(token)
        elif self.ch.isdigit():
            token = self.buildNumber()
            tokens.append(token)
        elif (self.ch == "\""):
            token = self.buildString()
            tokens.append(token)
        elif (self.ch == "'"):
            token = self.buildChar()
            tokens.append(token)
        elif (T.isidchar(ord(self.ch))):
            token = self.buildAmbiguous()
            tokens.append(token)
        else:
            # NOTE(review): debug print left in — consider routing through
            # the error object instead.
            print(ord(self.ch))
            raise (UnkownCharSequence(
                Token(self.ch, self.ch, self.loc.copy(),
                      self.loc.copy())))
    tokens.append(Token(T.T_EOF, T.T_EOF, self.loc.copy(),
                        self.loc.copy()))
    if (getDirectives):
        return directives
    return tokens