Beispiel #1
0
 def preprocessor(self):
     """Tokenize a preprocessor directive ('#...') up to the end of the line.

     An escaped newline (backslash immediately followed by a newline)
     continues the directive onto the next source line; a '//' or '/*'
     comment opener or a bare newline terminates it.

     Raises:
         TokenError: on an empty or whitespace-only directive, or when the
             directive name is neither a known preprocessor keyword nor an
             "include"-prefixed directive.
     """
     pos = self.line_pos()
     tkn_value = ""
     while self.peek_char():
         tkn_value += self.peek_char()
         self.pop_char()
         if self.peek_sub_string(2) == "\\\n":
             # Directive continues on the next physical line
             self.__line_pos = 1
             self.__line += 1
         if (self.peek_sub_string(2) in ["//", "/*"]
                 or self.peek_char() == "\n"):
             break
     if len(tkn_value) <= 1:
         # A lone '#' is not a valid directive
         raise TokenError(self.line_pos())
     fields = tkn_value[1:].split()
     if not fields:
         # '#' followed only by whitespace previously crashed with an
         # IndexError on split()[0]; report it as a token error instead
         raise TokenError(self.line_pos())
     tkn_key = fields[0]
     if tkn_key not in preproc_keywords:
         if tkn_key.startswith("include"):
             # Normalize "include"-prefixed names to the plain directive
             tkn_key = "include"
         else:
             raise TokenError(self.line_pos())
     self.tokens.append(
         Token(preproc_keywords.get(tkn_key), pos, tkn_value))
Beispiel #2
0
 def string(self):
     """Tokenize a string literal, including an optional wide-string prefix.

     String constants can contain any character except unescaped newlines.
     An unclosed string (end of input before the closing quote) is a fatal
     error and thus parsing will stop here.

     Raises:
         TokenError: when the input ends before a closing double quote.
     """
     pos = self.line_pos()
     tkn_value = ""
     if self.peek_char() == "L":
         # Wide-string literal prefix, e.g. L"..."
         tkn_value += self.peek_char()
         self.pop_char()
     # Consume the opening double quote
     tkn_value += self.peek_char()
     self.pop_char()
     while self.peek_char() is not None:
         tkn_value += self.peek_char()
         if self.peek_sub_string(2) == "\\\n":
             # Escaped newline: the literal continues on the next line
             self.__line += 1
             self.__line_pos = 1
         # NOTE(review): an escaped quote is not special-cased here;
         # confirm peek_char() handles escape sequences upstream.
         if self.peek_char() == '"':
             break
         self.pop_char()
     else:
         # Loop exhausted the input without hitting the closing quote
         raise TokenError(pos)
     self.tokens.append(Token("STRING", pos, tkn_value))
     self.pop_char()
Beispiel #3
0
 def char_constant(self):
     """Char constants follow pretty much the same rule as string constants.

     A newline inside the constant yields a TKN_ERROR token; a closing
     quote yields a CHAR_CONST token; end of input raises a TokenError.
     """
     pos = self.line_pos()
     value = "'"
     self.pop_char()
     current = self.peek_char()
     while current:
         value += current
         if current == "\n":
             # Newline inside a char constant: emit an error token and stop
             self.pop_char()
             self.tokens.append(Token("TKN_ERROR", pos))
             return
         if current == "'":
             # Closing quote reached: the constant is complete
             self.pop_char()
             self.tokens.append(Token("CHAR_CONST", pos, value))
             return
         self.pop_char()
         current = self.peek_char()
     # End of input before the closing quote
     raise TokenError(pos)
Beispiel #4
0
    def identifier(self):
        """Tokenize an identifier or keyword.

        Identifiers start with a letter [a-zA-Z] or an underscore and may
        contain letters, digits [0-9] and underscores. Escaped newlines
        inside an identifier are skipped without contributing characters.
        """
        pos = self.line_pos()
        allowed = string.ascii_letters + "0123456789_"
        tkn_value = ""
        while True:
            current = self.peek_char()
            if not current:
                break
            if current == "\\\n":
                # Escaped newline splits the identifier across lines;
                # drop it and keep accumulating
                self.pop_char()
                continue
            if current not in allowed:
                break
            tkn_value += current
            self.pop_char()
        if tkn_value in keywords:
            self.tokens.append(Token(keywords[tkn_value], pos))
        else:
            self.tokens.append(Token("IDENTIFIER", pos, tkn_value))
Beispiel #5
0
 def comment(self):
     """Comments are anything after '//' characters, up until a newline or
     end of file.

     Raises:
         TokenError: when the input ends before a terminating newline.
     """
     pos = self.line_pos()
     tkn_value = "//"
     # Consume the two slashes
     self.pop_char()
     self.pop_char()
     current = self.peek_char()
     while current is not None:
         if current == "\n":
             self.tokens.append(Token("COMMENT", pos, tkn_value))
             return
         tkn_value += current
         self.pop_char()
         current = self.peek_char()
     # Reached end of input without a newline
     raise TokenError(pos)
Beispiel #6
0
 def mult_comment(self):
     """Tokenize a '/* ... */' block comment.

     Newlines inside the comment keep the lexer's line tracking in sync.
     The closing '*/' must not overlap the opening '/*': the bare input
     '/*/' is unterminated and raises (the previous implementation only
     checked tkn_value.endswith('*/'), which wrongly accepted '/*/' at
     end of input because the '/' was shared with the opener).

     Raises:
         TokenError: if end of input is reached before a closing '*/'.
     """
     pos = self.line_pos()
     self.pop_char()
     self.pop_char()
     tkn_value = "/*"
     closed = False
     while self.peek_char():
         if self.src[self.__pos:].startswith("*/"):
             tkn_value += "*/"
             self.pop_char()
             self.pop_char()
             closed = True
             break
         tkn_value += self.peek_char()
         if self.peek_char() == "\n":
             # Track line numbers across multi-line comments
             self.__line += 1
             self.__line_pos = 1
         self.pop_char()
     if closed:
         self.tokens.append(Token("MULT_COMMENT", pos, tkn_value))
     else:
         raise TokenError(pos)
Beispiel #7
0
    def operator(self):
        """Operators can be made of one or more sign, so the longest operators
        need to be looked up for first in order to avoid false positives
        eg: '>>' being understood as two 'MORE_THAN' operators instead of
            one 'RIGHT_SHIFT' operator
        """
        pos = self.line_pos()
        if self.peek_char() in ".+-*/%<>^&|!=":

            # Longest match first: three-character operators
            if self.peek_sub_string(3) in [">>=", "<<=", "..."]:
                self.tokens.append(
                    Token(operators[self.peek_sub_string(3)], pos))
                self.pop_char(), self.pop_char(), self.pop_char()

            # Two-character operators with no compound-assignment form
            elif self.peek_sub_string(2) in [">>", "<<", "->"]:
                self.tokens.append(
                    Token(operators[self.peek_sub_string(2)], pos))
                self.pop_char(), self.pop_char()

            # Current sign followed by '=' (e.g. '+=', '<=', '==', '!=')
            elif self.peek_sub_string(2) == self.peek_char() + "=":
                self.tokens.append(
                    Token(operators[self.peek_sub_string(2)], pos))
                self.pop_char(), self.pop_char()

            # Signs that can double up ('++', '--', '&&', '||', '<<', ...)
            elif self.peek_char() in "+-<>=&|":
                if self.peek_sub_string(2) == self.peek_char() * 2:
                    self.tokens.append(
                        Token(operators[self.peek_sub_string(2)], pos))
                    self.pop_char()
                    self.pop_char()

                else:
                    # Single-character form of a doubling sign
                    self.tokens.append(Token(operators[self.peek_char()], pos))
                    self.pop_char()

            else:
                # Remaining single-character operators ('.', '*', '/', '%',
                # '^', '!')
                self.tokens.append(Token(operators[self.peek_char()], pos))
                self.pop_char()

        else:
            # Characters outside the multi-sign set (',', ';', ':', '~',
            # '?'): looked up directly from the raw source character
            self.tokens.append(Token(operators[self.src[self.__pos]], pos))
            self.pop_char()
Beispiel #8
0
    def get_next_token(self):
        """Peeks one character and tries to match it to a token type,
        if it doesn't match any of the token types, an error will be raised
        and current file's parsing will stop

        Returns the token just appended (via peek_token), or None when the
        end of the input has been reached.
        """
        # Branch order matters: "/*" and "//" must be tested before the
        # operator branch so a leading '/' is not tokenized as an operator.
        # Every branch either raises or reaches the return at the bottom,
        # so this 'while' never runs a second iteration.
        while self.peek_char() is not None:
            if self.is_string():
                self.string()

            elif (self.peek_char().isalpha()
                  and self.peek_char().isascii()) or self.peek_char() == "_":
                self.identifier()

            elif self.is_constant():
                self.constant()

            elif self.is_char_constant():
                self.char_constant()

            elif self.peek_char() == "#":
                self.preprocessor()

            elif self.src[self.__pos:].startswith("/*"):
                self.mult_comment()

            elif self.src[self.__pos:].startswith("//"):
                self.comment()

            elif self.peek_char() in "+-*/,<>^&|!=%;:.~?":
                self.operator()

            elif self.peek_char() == " ":
                self.tokens.append(Token("SPACE", self.line_pos()))
                self.pop_char()

            elif self.peek_char() == "\t":
                self.tokens.append(Token("TAB", self.line_pos()))
                self.pop_char()

            elif self.peek_char() == "\n":  # or ord(self.peek_char()) == 8203:
                self.tokens.append(Token("NEWLINE", self.line_pos()))
                self.pop_char()
                self.__line_pos = 1
                self.__line += 1

            # NOTE(review): assumes peek_char() can return the two-character
            # sequence backslash+newline — confirm against peek_char()
            elif self.peek_char() == "\\\n":
                self.tokens.append(Token("ESCAPED_NEWLINE", self.line_pos()))
                self.pop_char()
                self.__line_pos = 1
                self.__line += 1

            elif self.peek_char() in brackets:
                self.tokens.append(
                    Token(brackets[self.peek_char()], self.line_pos()))
                self.pop_char()

            else:
                # Unrecognized character: fatal lexing error
                raise TokenError(self.line_pos())

            return self.peek_token()

        return None
Beispiel #9
0
    def constant(self):
        """Numeric constants can take many forms:
        - integer constants only allow digits [0-9]
        - real number constant only allow digits [0-9],
            ONE optionnal dot '.' and ONE optionnal 'e/E' character
        - binary constant only allow digits [0-1] prefixed by '0b' or '0B'
        - hex constant only allow digits [0-9], letters [a-f/A-F] prefixed
            by '0x' or '0X'
        - octal constants allow digits [0-9] prefixed by a zero '0'
            character

        Size ('l/L' for long) and sign ('u/U' for unsigned) specifiers can
        be appended to any of those. tokens

        Plus/minus operators ('+'/'-') can prefix any of those tokens

        a numeric constant could start with a '.' (dot character)
        """
        pos = self.line_pos()
        tkn_value = ""
        # Every character the lexer may accept while scanning a constant;
        # each candidate is then validated against what has already been
        # accumulated in tkn_value.
        bucket = ".0123456789aAbBcCdDeEfFlLuUxX-+"
        # NOTE(review): an escaped newline ("\\\n") passes the loop guard
        # but matches none of the branches below, so it is appended to
        # tkn_value verbatim and line counters are not updated — confirm
        # this is intended.
        while self.peek_char() and (self.peek_char() in bucket
                                    or self.peek_char() == "\\\n"):
            # 'x'/'X' only valid as the second char of a '0x' prefix,
            # and only once
            if self.peek_char() in "xX":
                if tkn_value.startswith("0") is False or len(tkn_value) > 1:
                    raise TokenError(pos)
                for c in "xX":
                    if c in tkn_value:
                        raise TokenError(pos)

            # 'b'/'B' valid as binary prefix after '0', or as a hex digit
            elif self.peek_char() in "bB":
                if tkn_value != "0" and tkn_value.startswith(
                        "0x") is False and tkn_value.startswith("0X") is False:
                    raise TokenError(pos)

            # '+'/'-' only continue the constant as an exponent sign
            # (e.g. '1e+5'); otherwise they end the token
            elif self.peek_char() in "+-":
                if (tkn_value.endswith("e") is False
                        and tkn_value.endswith("E") is False
                        or self.peek_sub_string(2) in ["++", "--"]):
                    break

            # 'e'/'E' as an exponent marker (not a hex digit): only one,
            # and not after a size/sign suffix
            elif self.peek_char(
            ) in "eE" and "0x" not in tkn_value and "0X" not in tkn_value:
                if ("e" in tkn_value or "E" in tkn_value or "f" in tkn_value
                        or "F" in tkn_value or "u" in tkn_value
                        or "U" in tkn_value or "l" in tkn_value
                        or "L" in tkn_value):
                    raise TokenError(pos)

            # 'l'/'L' size suffix: at most two, adjacent, same case
            elif self.peek_char() in "lL":
                lcount = tkn_value.count("l") + tkn_value.count("L")
                if (lcount > 1 or (lcount == 1 and tkn_value[-1] not in "lL")
                        or ("f" in tkn_value or "F" in tkn_value)
                        and "0x" not in tkn_value and "0X" not in tkn_value):
                    raise TokenError(pos)
                elif self.peek_char(
                ) == "l" and "L" in tkn_value or self.peek_char(
                ) == "L" and "l" in tkn_value:
                    raise TokenError(pos)

            # 'u'/'U' sign suffix: at most one, not on float constants
            elif self.peek_char() in "uU":
                if ("u" in tkn_value or "U" in tkn_value or
                    (("e" in tkn_value or "E" in tkn_value or "f" in tkn_value
                      or "F" in tkn_value) and
                     ("0x" not in tkn_value and "0X" not in tkn_value))):
                    raise TokenError(pos)

            # 'f'/'F': hex digit inside 0x constants, else float suffix
            # (requires a dot, no prior f/F, no u/U/l/L suffix)
            elif self.peek_char() in "Ff":
                if (tkn_value.startswith("0x") is False
                        and tkn_value.startswith("0X") is False and
                    ("." not in tkn_value or "f" in tkn_value
                     or "F" in tkn_value) or "u" in tkn_value
                        or "U" in tkn_value or "l" in tkn_value
                        or "L" in tkn_value):
                    raise TokenError(pos)

            # Remaining hex letters are only valid inside 0x constants.
            # NOTE(review): 'and' binds tighter than 'or' here, so the
            # u/U/l/L checks apply regardless of peek_char() — confirm
            # the parenthesization is intended.
            elif (self.peek_char() in "aAbBcCdDeE"
                  and tkn_value.startswith("0x") is False
                  and tkn_value.startswith("0X") is False or "u" in tkn_value
                  or "U" in tkn_value or "l" in tkn_value or "L" in tkn_value):
                raise TokenError(pos)

            # No digits after a suffix has started.
            # NOTE(review): same 'and'/'or' precedence caveat as above.
            elif (self.peek_char() in "0123456789" and "u" in tkn_value
                  or "U" in tkn_value or "l" in tkn_value or "L" in tkn_value):
                raise TokenError(pos)

            # At most one decimal point
            elif self.peek_char() == "." and "." in tkn_value:
                raise TokenError(pos)

            tkn_value += self.peek_char()
            self.pop_char()
        # tkn_value is non-empty here: this method is presumably only
        # dispatched when a constant start was detected (is_constant()).
        # A trailing exponent marker or hex prefix is incomplete.
        if tkn_value[-1] in "eE" and tkn_value.startswith(
                "0x") is False or tkn_value[-1] in "xX":
            raise TokenError(pos)
        else:
            self.tokens.append(Token("CONSTANT", pos, tkn_value))