Ejemplo n.º 1
0
 def buildMultichar(self) -> Token:
     """Greedily build the longest multi-character operator token.

     Starting from the current character, keep appending the next
     character while the extended string is still a known operator
     (membership in T.MULTIOPERS and the next char in T.T_MULTIOP).
     Operators are at most four characters long.

     :return: an operator Token spanning the consumed characters.
     """
     op = self.ch
     begin = self.loc.copy()
     self.advance()
     # The original code repeated this check/extend step three times
     # verbatim; a bounded loop performs the identical sequence.
     for _ in range(3):
         if self.ch not in T.T_MULTIOP or op + self.ch not in T.MULTIOPERS:
             break
         op += self.ch
         self.advance()
     return Token(op, op, begin, self.loc.copy())
Ejemplo n.º 2
0
    def buildIncluder(self) -> Token:
        """Build a T_STRING token for an include path of the form <...>.

        Called with the cursor on '<'; returns the text between the
        angle brackets as the token value.

        :raises TokenMismatch: when no closing '>' exists in the raw text.
        """
        self.advance()
        begin = self.loc.copy()
        # Empty include: '<>' yields an empty-string token.
        if (self.ch == ">"):
            self.advance()
            return Token(T.T_STRING, "", begin, self.loc.copy())

        # Jump straight to the closing '>' rather than scanning char by char.
        end = self.raw.find(">", self.chidx)
        if (end == -1 or end >= len(self.raw)):
            raise (TokenMismatch(Token("<", ">", begin, begin)))
        content = self.raw[self.chidx:end]
        self.chidx = end
        # Keep the column counter in sync with the manual chidx jump —
        # matches the bookkeeping done by buildString()/buildAmbiguous().
        self.loc.ch += len(content)

        self.advance()
        return Token(T.T_STRING, content, begin, self.loc.copy())
Ejemplo n.º 3
0
    def __init__(self, ly, out):
        """Set up the analyser with a lexer reference and an output sink."""
        self.ly = ly
        self.out = out

        # Tokens recognised so far.
        self.listToken = []
        # Generated script lines accumulate here.
        self.script = []

        # The token currently under analysis; starts out empty.
        self.currentToken = Token('', '')
Ejemplo n.º 4
0
    def buildAmbiguous(self) -> Token:
        """Scan a word (identifier-like run) and classify it.

        Returns a T_KEYWORD token when the word appears in T.KEYWORDS,
        otherwise a T_ID token.
        """
        begin = self.loc.copy()
        self.advance()
        raw = self.raw
        chidx = self.chidx

        # Locate the first non-word character after the current position.
        # r"\W", flags=re.ASCII
        end = ambiguous_regex.search(raw, chidx).end() - 1
        # The word began one character back (already consumed by the caller).
        value = (raw[chidx - 1:end])
        # NOTE(review): the offset arithmetic here (chidx - 1, end - 1, the
        # trailing -2) depends on advance()'s exact contract — verify before
        # touching; it jumps the cursor past the scanned word.
        lv = end - (chidx - 1) - 2
        self.chidx += lv
        self.loc.ch += lv
        self.advance()

        if (value in T.KEYWORDS):

            return Token(T.T_KEYWORD, value, begin, self.loc.copy())
        return Token(T.T_ID, value, begin, self.loc.copy())
Ejemplo n.º 5
0
    def buildChar(self) -> Token:  # build a char token with one char
        """Build a T_CHAR token from a single-quoted character literal.

        The token value is the ordinal of the (possibly escaped) character.

        :raises UnexepectedEOFError: when the raw text ends mid-literal.
        :raises TokenMismatch: when the literal is not closed by a "'".
        """
        self.advance()
        begin = self.loc.copy()
        v = ord(self.ch)
        # check for EOF: need at least the char, the closing quote, and one more
        if self.size - self.chidx < 3:
            raise (UnexepectedEOFError(
                Token(T.T_CHAR, self.ch, begin, self.loc)))
        # handle escapes with an explicit table — the previous eval()-based
        # decoding executed source text (unsafe) and crashed in ord() on
        # unrecognised escapes; unknown escapes now fall back to the literal
        # character itself.
        if self.ch == "\\":
            self.advance()
            escapes = {
                "n": "\n", "t": "\t", "r": "\r", "0": "\0", "a": "\a",
                "b": "\b", "f": "\f", "v": "\v", "\\": "\\", "'": "'",
                '"': '"',
            }
            v = ord(escapes.get(self.ch, self.ch))
        self.advance()
        # check for proper closing
        if self.ch != "'":
            raise (TokenMismatch(Token(T.T_CHAR, v, begin, self.loc)))

        self.advance()
        return Token(T.T_CHAR, v, begin, self.loc.copy())
Ejemplo n.º 6
0
    def buildString(
            self) -> Token:  # build a string value with escape characters
        """Build a T_STRING token from a double-quoted literal.

        Jumps directly to the closing quote rather than scanning char by
        char; the token value is the raw text between the quotes.

        :raises TokenMismatch: when no closing quote exists in the raw text.
        """
        self.advance()
        begin = self.loc.copy()
        # Empty literal: "" yields an empty-value token.
        if (self.ch == "\""):
            self.advance()
            return Token(T.T_STRING, "", begin, self.loc.copy())

        end = self.raw.find("\"", self.chidx)

        if (end == -1 or end >= len(self.raw)):
            raise (TokenMismatch(Token("\"", "\"", begin, begin)))
        content = self.raw[self.chidx:end]
        self.chidx = end
        # account for multi-line strings
        newlines = content.count("\n")
        # NOTE(review): this parses as `(newlines - 1) if newlines else 0`,
        # so a string containing exactly one newline advances the line
        # counter by 0 — looks like an off-by-one; confirm intended behavior.
        self.loc.line += newlines - 1 if newlines else 0
        self.loc.ch += len(content)
        self.advance()
        return Token(T.T_STRING, content, begin, self.loc.copy())
Ejemplo n.º 7
0
    def buildNumber(self) -> Token:
        """Build a numeric token (T_INT, T_FLOAT, or T_DOUBLE).

        Supported forms: decimal integers/floats, '0x'/'0X' hex prefixes,
        'b' (binary) and 'q' (octal) base suffixes, 'e' scientific
        notation, and a trailing 'd'/'f' selecting double/float.
        """
        num = self.ch
        begin = self.loc.copy()
        self.advance()
        # characters allowed to continue the number
        pchars = T.T_DIGITS + T.T_DOT + "e"
        base = 10
        # '0x' / '0X' hexadecimal prefix (leading '0' already consumed)
        if (self.ch == "x" or self.ch == "X"):
            num += self.ch
            self.advance()
            pchars += "abcdefABCDEF"
            base = 16

        while self.ch in pchars:
            num += self.ch
            self.advance()
            # once an exponent marker appears, allow a negative exponent sign
            # NOTE(review): 'e' is also a hex digit, so a hex literal
            # containing 'e' enables '-' consumption too — confirm intended.
            if ("e" in num):
                pchars += "-"

        # base suffixes: 'b' binary, 'q' octal
        if (self.ch == "b"):
            base = 2
            self.advance()
        elif (self.ch == "q"):
            base = 8
            self.advance()

        t = T.T_INT
        if (T.T_DOT in num):
            # fractional literal: double unless explicitly suffixed 'f'
            val = float(num)
            if self.ch == 'd':
                t = T.T_DOUBLE
                self.advance()
            elif self.ch == 'f':
                t = T.T_FLOAT
                self.advance()
            else:
                t = T.T_DOUBLE
        else:
            # integral literal, optionally promoted by a 'd'/'f' suffix
            if self.ch == 'd':
                t = T.T_DOUBLE
                self.advance()
                val = float(int(num, base))
            elif self.ch == 'f':
                t = T.T_FLOAT
                self.advance()
                val = float(int(num, base))
            else:
                val = int(num, base)

        return Token(t, val, begin, self.loc.copy())
Ejemplo n.º 8
0
    def getToken(self, strerr=None, plist=None):
        """Scan the current file line and return the accumulated token list.

        Reads the line via linecache, runs the character automata over it,
        then increments lineNumber.

        :param strerr: when truthy, enter string-error mode and re-scan the
            previous line (lineNumber is decremented first).
        :param plist: when truthy, keep appending to the caller-supplied
            token list instead of starting a fresh one.
        :return: the list of Token objects recognised so far.
        """
        global keywords, symbols
        # RegExes:
        # ID: [a-zA-Z_]+[0-9a-zA-Z_]*
        # Comments: {.*}
        # Integers: [0-9]+
        # Float: [0-9]+\.[0-9]+
        # Strings: "[^"]+"

        # Format of symboltable [ KEY, (NAME, CLASS, TYPE, SCOPE, VALUE, LINE DECLARED, LINE REFERENCED) ]

        # Clear previous lines from memory
        linecache.clearcache()

        # Retrieve current line from file
        self.line = linecache.getline(self.file, self.lineNumber + 1)

        # EOF
        if not self.line:
            self.listToken.append(Token('EOF', 'EOF'))
            return self.listToken

        # Strip tab keys since they're not a problem in this language
        # self.line = self.line.replace('\t', ' ')

        if plist:
            # BUGFIX: previously this assigned the builtin `list` type itself
            # (self.listToken = list), which broke every later .append();
            # reuse the caller's list as intended.
            self.listToken = plist
        else:
            self.listToken = []

        if strerr:
            # NOTE(review): no automaton branch below handles 'string_error',
            # so the scan is effectively a no-op for this line — confirm
            # that is the intended recovery behavior.
            auto = 'string_error'
            self.lineNumber -= 1
        else:
            auto = 'begin'

        string = ''
        isString = False
        tmpSizeListToken = 0

        state = 0
        tmp = ''

        for c in self.line:

            string += c

            # Comment automaton
            if auto == 'comment':
                if c != '}':
                    if c == '\n':
                        if not isString:
                            self.errorRef.lexerError('Linha ' +
                                                     str(self.lineNumber + 2) +
                                                     ': ' +
                                                     'comentario nao fechado')
                        auto = 'begin'
                elif c == '}':
                    auto = 'begin'
                    continue

            # Number automaton
            if auto == 'number':
                if state == 1:
                    if c.isdigit():
                        tmp += c
                        continue
                    else:
                        if c == '.':
                            state = 2
                        else:
                            self.listToken.append(Token(tmp, 'numero_inteiro'))
                            auto = 'begin'

                if state == 2:
                    if c == '.':
                        state = 3
                        continue

                if state == 3:
                    if c.isdigit():
                        tmp += '.'
                        state = 4
                    elif c == '.':
                        # integer followed by the '..' range operator
                        self.listToken.append(Token(tmp, 'numero_inteiro'))
                        self.listToken.append(Token('..', '..'))
                        auto = 'begin'
                        continue
                    else:
                        self.listToken.append(Token(tmp, 'numero_inteiro'))
                        self.listToken.append(Token('.', '.'))
                        auto = 'begin'

                if state == 4:
                    if c.isdigit():
                        tmp += c
                    else:
                        self.listToken.append(Token(tmp, 'numero_real'))
                        auto = 'begin'

            # Symbols automaton
            if auto == 'symbol':
                if tmp + c in self.symtable.table:
                    # keep extending while still a known (multi-char) symbol
                    tmp += c
                elif tmp in self.symtable.table:
                    self.listToken.append(Token(tmp, tmp))
                    auto = 'begin'
                else:
                    if not isString:
                        self.errorRef.lexerError('Linha ' +
                                                 str(self.lineNumber + 1) +
                                                 ': ' + tmp +
                                                 ' - simbolo nao identificado')
                    auto = 'begin'

            # Names automaton
            if auto == 'names':
                if state == 1 and (c.isalnum() or c == '_'):
                    tmp += c
                    state = 2
                elif state == 1 and (not c.isalnum() or c != '_'):
                    state = 2
                elif state == 2 and (c.isalnum() or c == '_'):
                    tmp += c

                if state == 2 and not (c.isalnum() or c == '_'):
                    if tmp in self.symtable.table:
                        if self.symtable.table[tmp][
                                'token'] == 'identificador':
                            self.listToken.append(Token(tmp, 'identificador'))
                        else:
                            self.listToken.append(Token(tmp, tmp))
                    else:
                        #self.symtable.insertSymbol(tmp, (tmp, 'variavel', 'identificador', self.scope, 'null', self.lineNumber, self.lineNumber))
                        self.listToken.append(Token(tmp, 'identificador'))
                    auto = 'begin'

            # Checks which automaton to enter
            if auto == 'begin':
                tmp = ''
                state = 1
                if c == '\t' or c == '\n' or c == ' ' or c == '\r':
                    continue
                elif c == '"':
                    tmp += c
                    auto = 'string'
                elif c == '{':
                    auto = 'comment'
                elif c.isdigit():
                    tmp += c
                    auto = 'number'
                elif not c.isalnum() and c != "_":
                    tmp += c
                    auto = 'symbol'
                elif c.isalpha() or c == '_':
                    tmp += c
                    auto = 'names'

            # String
            if auto == 'string':
                if not isString:
                    # opening quote: remember where we are so the tokens
                    # emitted inside the string can be rolled back on close
                    isString = True
                    tmpSizeListToken = len(self.listToken)
                    string = c
                    self.listToken.append(
                        Token('Linha ' + str(self.lineNumber + 1) + ': ' + tmp,
                              'simbolo nao identificado', True))
                    self.listMessage.append('Linha ' +
                                            str(self.lineNumber + 1) + ': ' +
                                            tmp +
                                            ' - simbolo nao identificado')
                else:
                    # closing quote: discard the tokens produced while inside
                    # the string and emit a single literal token instead
                    tmpSizeListToken = len(self.listToken) - tmpSizeListToken
                    for i in range(tmpSizeListToken):
                        self.listToken.pop()

                    self.listMessage.pop()

                    self.listToken.append(Token(string, 'cadeia_literal'))
                    string = ''
                    isString = False
                auto = 'begin'

        #End of analysis, increment lineNumber
        self.lineNumber += 1

        # Check whether the error message list is empty or there is a message there indicating
        # that some string wasn't properly closed thus indicating that there's a error that must
        # be treated by the Error class.
        if self.listMessage:
            msg = str(self.listMessage.pop())
            self.errorRef.lexerError(msg)

        return self.listToken
Ejemplo n.º 9
0
    def getToken(self, line):
        """Tokenize *line* and return the list of recognised tokens.

        Runs character-by-character automata for strings, comments,
        numbers, symbols, and names; error conditions are recorded as
        special Tokens rather than raised.

        :param line: the source line to scan (expected to end in '\\n').
        :return: list of Token objects found in the line.
        """
        global keywords, symbols
        # TODO - Automaton implementation for token recognition
        # RegExes:
        # ID: [a-zA-Z_]+[0-9a-zA-Z_]*
        # Comments: {.*}
        # Integers: [0-9]+
        # Float: [0-9]+\.[0-9]+
        # Strings: "[^"]+"

        self.lineNumber += 1
        self.listToken = []
        auto = "begin"
        state = 0
        tmp = ""

        for c in line:
            # String automaton
            if auto == "string":
                if state == 1 and c != '"':
                    # newline before the closing quote: unterminated string
                    if c == '\n':
                        self.listToken.append(
                            Token('Linha ' + str(self.lineNumber) + ': ',
                                  'cadeia nao fechada', True))
                        auto = 'begin'
                        continue
                    tmp += c
                elif state == 1 and c == '"':
                    state = 2
                    tmp += c

                if state == 2:
                    self.listToken.append(Token(tmp, 'cadeia_literal'))
                    auto = "begin"
                    continue

            # Comment automaton
            if auto == "comment":
                if state == 1 and c != '}':
                    # newline before '}': unterminated comment
                    if c == '\n':
                        self.listToken.append(
                            Token('Linha ' + str(self.lineNumber) + ': ',
                                  'comentario nao fechado', True))
                        auto = 'begin'
                        continue
                if state == 1 and c == '}':
                    auto = 'begin'
                    continue

            # Number automaton
            if auto == 'number':
                if state == 1 and c.isdigit():
                    tmp += c
                elif state == 1 and not c.isdigit():
                    if c == '.':
                        state = 2
                        tmp += c
                    elif c == '\n' or c == ' ':
                        self.listToken.append(Token(tmp, 'numero_inteiro'))
                        auto = 'begin'
                elif state == 2 and c.isdigit():
                    tmp += c
                elif state == 2 and not c.isdigit():
                    # NOTE(review): a real number is only emitted when
                    # terminated by '\n' — other terminators drop it silently.
                    if c == '\n':
                        self.listToken.append(Token(tmp, 'numero_real'))
                        auto = 'begin'

            # Symbols
            if auto == 'symbol':
                if tmp + c in self.symtable.table:
                    # keep extending while still a known (multi-char) symbol
                    tmp += c
                elif tmp in self.symtable.table:
                    self.listToken.append(Token(tmp, tmp))
                    auto = 'begin'
                else:
                    self.listToken.append(
                        Token('Linha ' + str(self.lineNumber) + ': ' + tmp,
                              'simbolo nao identificado'))
                    auto = 'begin'

            # Names
            if auto == 'names':
                if state == 1 and (c.isalpha() or c == '_'):
                    tmp += c
                    state = 2
                elif state == 2 and (c.isalnum() or c == '_'):
                    tmp += c
                if state == 2 and not (c.isalnum() or c == '_'):
                    # known table entries keep their own token class;
                    # everything else is an identifier
                    if tmp in self.symtable.table:
                        self.listToken.append(Token(tmp, tmp))
                    else:
                        self.listToken.append(Token(tmp, 'identificador'))
                    auto = 'begin'

            # Checks which automaton to enter
            if auto == "begin":
                tmp = ""
                state = 1
                if c == '"':
                    tmp += c
                    auto = "string"
                elif c == '{':
                    auto = "comment"
                elif c.isdigit():
                    tmp += c
                    auto = 'number'
                elif not c.isalnum() and c != '\n' and c != ' ':
                    tmp += c
                    auto = 'symbol'
                elif (c.isalpha() or c == '_') and c != '\n' and c != ' ':
                    tmp += c
                    auto = 'names'

        return self.listToken
Ejemplo n.º 10
0
    def lex(self, getDirectives=False) -> list:
        """Tokenize the raw source text.

        :param getDirectives: when True, return only the pre-compiler
            directive tokens (the ``#name`` token and, if present, its
            ``<...>`` include path) instead of the full token stream.
        :return: list of Tokens; in full mode terminated by a T_EOF token.
        :raises UnexepectedEOFError: on an unterminated comment.
        :raises UnkownCharSequence: on a character no rule accepts.
        """
        tokens = []
        directives = []
        advance = self.advance
        # a [1] character marks the end of the raw text
        while self.ch != chr(1):
            # newlines, spaces, and indents have no response
            if (self.ch == "\n" or self.ch in " \t\r"):
                advance()

            # backslash characters
            elif (self.ch == "\\"):
                tokens.append(
                    Token(T.T_BSLASH, T.T_BSLASH, self.loc.copy(),
                          self.loc.copy()))
                advance()

            # pre-compiler directives
            elif (self.ch == "#"):

                advance()
                while (self.ch == " "):
                    advance()

                t = self.buildAmbiguous()
                t.tok = T.T_DIRECTIVE
                tokens.append(t)
                # BUGFIX: directives were never collected, so
                # lex(getDirectives=True) always returned [].
                directives.append(t)
                if (self.chidx < self.size - 1):
                    if (self.raw[self.chidx + 1] == "<"):
                        t2 = self.buildIncluder()
                        tokens.append(t2)
                        # the <...> include path belongs to the directive too
                        directives.append(t2)

            # typecasts
            elif (self.ch == "$"):
                advance()
                t = self.buildAmbiguous()
                t.tok = T.T_TYPECAST

                # trailing '*'s mark pointer depth; encode each as a '.'
                while self.ch == "*":
                    advance()
                    t.value += "."

                tokens.append(t)

            # semicolon
            elif (self.ch == ";"):
                tokens.append(
                    Token(T.T_ENDL, T.T_ENDL, self.loc.copy(),
                          self.loc.copy()))
                advance()

            elif (self.ch == "+"):

                tokens.append(self.buildMultichar())

            elif (self.ch == "/"):
                advance()

                # managing comments:
                olchdx = self.chidx

                # single line comments:
                if (self.ch == "/"):
                    # find and jump to next newline
                    self.chidx = self.raw.find("\n", self.chidx) - 1
                    if self.chidx <= 0:
                        raise (UnexepectedEOFError(
                            Token('', '', self.loc, self.loc)))
                    self.loc.ch += self.chidx - olchdx
                    advance()

                # multiline comments:
                elif (self.ch == "*"):
                    # find and jump to next instance of '*/' in raw text
                    self.chidx = self.raw.find("*/", self.chidx) + 1
                    if self.chidx <= 0:
                        raise (UnexepectedEOFError(
                            Token('', '', self.loc, self.loc)))
                    self.loc.ch += self.chidx - olchdx
                    self.loc.line += self.raw.count("\n", olchdx, self.chidx)
                    advance()

                # not a comment: rewind and lex '/' as an operator
                else:
                    self.chidx -= 2
                    self.loc.ch -= 2
                    advance()
                    tokens.append(self.buildMultichar())

            elif (self.ch in "()}{[],@~?"):
                tokens.append(Token(self.ch, self.ch, self.loc.copy(), None))
                advance()
                tokens[-1].end = self.loc.copy()

            elif (self.ch == "-"):
                advance()
                prev = tokens[-1]
                # '-' starts a negative number only when the previous token
                # cannot end an expression and a digit follows
                if prev.tok not in [
                        T.T_INT, T.T_CHAR, T.T_DOUBLE, T.T_ID, T.T_CLSP
                ] and self.ch.isdigit():
                    t = self.buildNumber()
                    t.value = -t.value
                else:
                    # rewind and lex '-' as an operator
                    self.chidx -= 2
                    self.loc.ch -= 2
                    advance()
                    t = self.buildMultichar()

                tokens.append(t)
            elif (self.ch in T.T_MULTIOP):
                token = self.buildMultichar()
                tokens.append(token)

            elif (self.ch == "."):
                # '.5'-style float literal vs the '.' operator
                if self.raw[self.chidx + 1].isdigit():
                    token = self.buildNumber()
                else:
                    token = self.buildMultichar()
                tokens.append(token)

            elif self.ch.isdigit():
                token = self.buildNumber()
                tokens.append(token)

            elif (self.ch == "\""):
                token = self.buildString()
                tokens.append(token)

            elif (self.ch == "'"):
                token = self.buildChar()
                tokens.append(token)

            elif (T.isidchar(ord(self.ch))):
                token = self.buildAmbiguous()
                tokens.append(token)

            else:
                # (debug print removed)
                raise (UnkownCharSequence(
                    Token(self.ch, self.ch, self.loc.copy(), self.loc.copy())))

        tokens.append(Token(T.T_EOF, T.T_EOF, self.loc.copy(),
                            self.loc.copy()))

        if (getDirectives):
            return directives

        return tokens