Example #1
        def regex_tokenizer(regex, ignore_white_space):
            d = {term.Name: term for term in self.G.terminals}
            tokens = []
            symbol_term = next(
                term for term in self.G.terminals if term.Name == 'symbol'
            )
            fixed_tokens = {
                tok.Name: Token(tok.Name, tok)
                for tok in [
                    d['|'], d['*'], d['+'], d['?'], d['('], d[')'], d['['],
                    d[']'], d['-'], d['ε']
                ]
            }

            for i, c in enumerate(regex):
                if c == '@' or (ignore_white_space and c.isspace()):
                    continue
                try:
                    token = fixed_tokens[c]
                    # an operator preceded by '@' is escaped: emit it as a plain symbol
                    if i > 0 and regex[i - 1] == '@':
                        raise KeyError
                except KeyError:
                    token = Token(c, symbol_term)
                tokens.append(token)
            tokens.append(Token('$', self.G.EOF))
            return tokens
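A minimal standalone sketch of the same '@'-escape idea, with a stand-in namedtuple in place of the original Token/Grammar classes (which are assumed, not shown):

    from collections import namedtuple

    Token = namedtuple('Token', ['lex', 'kind'])  # stand-in for the real Token class
    OPERATORS = set('|*+?()[]-') | {'ε'}

    def tokenize(regex, ignore_white_space=False):
        tokens = []
        escaped = False
        for c in regex:
            if not escaped and c == '@':
                escaped = True          # '@' escapes the next character
                continue
            if not escaped and ignore_white_space and c.isspace():
                continue
            kind = 'operator' if (c in OPERATORS and not escaped) else 'symbol'
            tokens.append(Token(c, kind))
            escaped = False
        tokens.append(Token('$', 'eof'))
        return tokens

    print(tokenize('a@*b*'))  # the first '*' is escaped and lexed as a plain symbol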
Example #2
 def preprocessor(self):
     pos = self.line_pos()
     tkn_value = ""
     while self.peek_char():
         tkn_value += self.peek_char()
         self.pop_char()
         if self.peek_sub_string(2) == "\\\n":
             # an escaped newline continues the directive on the next line
             self.__line_pos = 1
             self.__line += 1
         if self.peek_sub_string(2) in ["//", "/*"] \
                 or self.peek_char() == '\n':
             break
     if len(tkn_value) <= 1:
         raise TokenError(self.line_pos())
     parts = tkn_value[1:].split()
     if not parts:
         raise TokenError(self.line_pos())
     tkn_key = parts[0]
     if tkn_key not in preproc_keywords:
         # '#include<...>' has no whitespace after the keyword, so fold any
         # 'include...' key back to 'include'; anything else is an error
         if tkn_key.startswith('include'):
             tkn_key = 'include'
         else:
             raise TokenError(self.line_pos())
     self.tokens.append(Token(
                 preproc_keywords.get(tkn_key),
                 pos,
                 tkn_value))
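The include normalization is the subtle part: '#include<stdio.h>' has no whitespace after the keyword, so the raw key is 'include<stdio.h>'. A sketch of just that step, with an illustrative (assumed) keyword table:

    preproc_keywords = {'include': 'PREPROC_INCLUDE', 'define': 'PREPROC_DEFINE'}  # illustrative subset

    def directive_type(tkn_value):
        tkn_key = tkn_value[1:].split()[0]
        if tkn_key not in preproc_keywords:
            if tkn_key.startswith('include'):
                tkn_key = 'include'    # fold 'include<stdio.h>' back to 'include'
            else:
                raise ValueError('unknown directive')
        return preproc_keywords[tkn_key]

    print(directive_type('#include<stdio.h>'))  # PREPROC_INCLUDE
    print(directive_type('#define X 1'))        # PREPROC_DEFINE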
Example #3
 def _add_dedent(self, token):
     """Split a DEDENT whose value is a count into that many single DEDENTs."""
     for _ in range(token.value):
         self.token_stream.append(
             Token(tokens.DEDENT, None, token.line, token.column))
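A usage sketch with a minimal stand-in Token (the original's (type, value, line, column) shape is assumed): a DEDENT carrying value=3 is expanded into three value-less DEDENT tokens at the same position:

    from collections import namedtuple

    Token = namedtuple('Token', ['type', 'value', 'line', 'column'])  # assumed shape

    token_stream = []
    dedent = Token('DEDENT', 3, 10, 0)
    for _ in range(dedent.value):
        token_stream.append(Token('DEDENT', None, dedent.line, dedent.column))

    print(len(token_stream))  # 3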
Example #4
 def string(self):
     """String constants can contain any characer except unescaped newlines.
         An unclosed string or unescaped newline is a fatal error and thus
         parsing will stop here.
     """
     pos = self.line_pos()
     tkn_value = ""
     if self.peek_char() == 'L':
         tkn_value += self.peek_char()
         self.pop_char()
     tkn_value += self.peek_char()
     self.pop_char()
     while self.peek_char() is not None:
         tkn_value += self.peek_char()
         if self.peek_sub_string(2) == "\\\n":
             self.__line += 1
             self.__line_pos = 1
         if self.peek_char() == '\"':
             break
         self.pop_char()
     else:
         # the loop ran off the end of the source: unclosed string
         raise TokenError(pos)
     self.tokens.append(Token("STRING", pos, tkn_value))
     self.pop_char()
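string() here, and char_constant(), comment() and mult_comment() below, all follow the same scan-until-delimiter pattern. A standalone sketch of that shared shape, using indices instead of the class's peek/pop helpers (which are assumed):

    def scan_delimited(src, i, close):
        value = src[i]                    # opening delimiter
        i += 1
        lines = 0
        while i < len(src):
            if src[i:i + 2] == '\\\n':    # escaped newline: keep it, count the line
                value += '\\\n'
                i += 2
                lines += 1
                continue
            value += src[i]
            i += 1
            if value.endswith(close):
                return value, i, lines    # token text, next index, lines consumed
        raise ValueError('unterminated literal')

    print(scan_delimited('"ab\\\ncd";', 0, '"'))  # ('"ab\\\ncd"', 8, 1)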
Example #5
 def char_constant(self):
     """Char constants follow pretty much the same rule as string constants
     """
     pos = self.line_pos()
     tkn_value = '\''
     self.pop_char()
     while self.peek_char():
         tkn_value += self.peek_char()
         if self.peek_char() == '\n':
             self.pop_char()
             self.tokens.append(Token("TKN_ERROR", pos))
             return
         if self.peek_char() == '\'':
             self.pop_char()
             self.tokens.append(Token("CHAR_CONST", pos, tkn_value))
             return
         self.pop_char()
     raise TokenError(pos)
Example #6
    def identifier(self):
        """Identifiers can start with any letter [a-z][A-Z] or an underscore
            and contain any letters [a-z][A-Z] digits [0-9] or underscores
        """
        pos = self.line_pos()
        tkn_value = ""
        while self.peek_char() and \
                (
                    self.peek_char() in string.ascii_letters + string.digits + "_"
                    or self.peek_char() == "\\\n"):
            if self.peek_char() == "\\\n":
                self.pop_char()
                continue
            tkn_value += self.peek_char()
            self.pop_char()
        if tkn_value in keywords:
            self.tokens.append(Token(keywords[tkn_value], pos))
        else:
            self.tokens.append(Token("IDENTIFIER", pos, tkn_value))
Example #7
 def comment(self):
     """Comments are anything after '//' characters, up until a newline or
         end of file
     """
     pos = self.line_pos()
     tkn_value = "//"
     self.pop_char()
     self.pop_char()
     while self.peek_char() is not None:
         if self.peek_char() == '\n':
             self.tokens.append(Token("COMMENT", pos, tkn_value))
             return
         tkn_value += self.peek_char()
         self.pop_char()
     # end of file also terminates a single-line comment
     self.tokens.append(Token("COMMENT", pos, tkn_value))
Example #8
    def operator(self):
        """Operators can be made of one or more sign, so the longest operators
            need to be looked up for first in order to avoid false positives
            eg: '>>' being understood as two 'MORE_THAN' operators instead of
                one 'RIGHT_SHIFT' operator
        """
        pos = self.line_pos()
        if self.peek_char() in ".+-*/%<>^&|!=":

            if self.peek_sub_string(3) in [">>=", "<<=", "..."]:
                self.tokens.append(Token(
                            operators[self.peek_sub_string(3)],
                            pos))
                # pop instead of jumping __pos so position bookkeeping stays consistent
                for _ in range(3):
                    self.pop_char()

            elif self.peek_sub_string(2) in [">>", "<<", "->"]:
                self.tokens.append(Token(
                            operators[self.peek_sub_string(2)],
                            pos))
                self.pop_char()
                self.pop_char()

            elif self.peek_sub_string(2) == self.peek_char() + "=":
                self.tokens.append(Token(
                            operators[self.peek_sub_string(2)],
                            pos))
                self.pop_char()
                self.pop_char()

            elif self.peek_char() in "+-<>=&|":
                if self.peek_sub_string(2) == self.peek_char() * 2:
                    self.tokens.append(Token(
                                operators[self.peek_sub_string(2)],
                                pos))
                    self.pop_char()
                    self.pop_char()

                else:
                    self.tokens.append(Token(
                                operators[self.peek_char()], pos))
                    self.pop_char()

            else:
                self.tokens.append(Token(
                        operators[self.peek_char()],
                        pos))
                self.pop_char()

        else:
            self.tokens.append(Token(
                    operators[self.peek_char()],
                    pos))
            self.pop_char()
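The longest-match-first rule in isolation; the operator table here is an illustrative subset, not the original operators dict:

    operators = {'>>=': 'RIGHT_ASSIGN', '>>': 'RIGHT_SHIFT',
                 '>=': 'GREATER_OR_EQUAL', '>': 'MORE_THAN'}

    def match_operator(src, i=0):
        # try 3-char operators, then 2-char, then 1-char, so '>>='
        # never lexes as '>' '>' '='
        for length in (3, 2, 1):
            if src[i:i + length] in operators:
                return operators[src[i:i + length]], i + length
        raise ValueError('not an operator')

    print(match_operator('>>= 2'))  # ('RIGHT_ASSIGN', 3)
    print(match_operator('>> 2'))   # ('RIGHT_SHIFT', 2)
    print(match_operator('> 2'))    # ('MORE_THAN', 1)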
Example #9
 def mult_comment(self):
     pos = self.line_pos()
     self.pop_char()
     self.pop_char()
     tkn_value = "/*"
     while self.peek_char():
         if self.peek_sub_string(2) == "*/":
             tkn_value += "*/"
             self.pop_char()
             self.pop_char()
             break
         tkn_value += self.peek_char()
         if self.peek_char() == '\n':
             self.__line += 1
             self.__line_pos = 1
         self.pop_char()
     if tkn_value.endswith("*/"):
         self.tokens.append(Token("MULT_COMMENT", pos, tkn_value))
     else:
         raise TokenError(pos)
Example #10
    def get_next_token(self):
        """Peeks one character and tries to match it to a token type,
            if it doesn't match any of the token types, an error will be raised
            and current file's parsing will stop
        """
        while self.peek_char() is not None:
            if self.is_string():
                self.string()

            elif (self.peek_char().isalpha() and self.peek_char().isascii()) \
                    or self.peek_char() == '_':
                self.identifier()

            elif self.is_constant():
                self.constant()

            elif self.is_char_constant():
                self.char_constant()

            elif self.peek_char() == '#':
                self.preprocessor()

            elif self.src[self.__pos:].startswith("/*"):
                self.mult_comment()

            elif self.src[self.__pos:].startswith("//"):
                self.comment()

            elif self.peek_char() in "+-*/,<>^&|!=%;:.~?":
                self.operator()

            elif self.peek_char() == ' ':
                self.tokens.append(Token("SPACE", self.line_pos()))
                self.pop_char()

            elif self.peek_char() == '\t':
                self.tokens.append(Token("TAB", self.line_pos()))
                self.pop_char()

            elif self.peek_char() == '\n':
                self.tokens.append(Token("NEWLINE", self.line_pos()))
                self.pop_char()
                self.__line_pos = 1
                self.__line += 1

            elif self.peek_char() == '\\\n':
                self.tokens.append(Token("ESCAPED_NEWLINE", self.line_pos()))
                self.pop_char()
                self.__line_pos = 1
                self.__line += 1

            elif self.peek_char() in brackets:
                self.tokens.append(Token(
                                    brackets[self.peek_char()],
                                    self.line_pos()))
                self.pop_char()

            else:
                raise TokenError(self.line_pos())

            return self.peek_token()

        return None
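get_next_token() returns one token per call and None at end of input, so a driver loop looks like the sketch below (StubLexer is a stand-in for the real class):

    class StubLexer:
        def __init__(self, canned):
            self._canned = list(canned)

        def get_next_token(self):
            # same contract as above: one token per call, None at EOF
            return self._canned.pop(0) if self._canned else None

    def tokenize_all(lexer):
        tokens = []
        token = lexer.get_next_token()
        while token is not None:
            tokens.append(token)
            token = lexer.get_next_token()
        return tokens

    print(tokenize_all(StubLexer(['INT', 'IDENTIFIER', 'SEMICOLON'])))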
Example #11
    def constant(self):
        """Numeric constants can take many forms:
            - integer constants only allow digits [0-9]
            - real number constant only allow digits [0-9],
                ONE optionnal dot '.' and ONE optionnal 'e/E' character
            - binary constant only allow digits [0-1] prefixed by '0b' or '0B'
            - hex constant only allow digits [0-9], letters [a-f/A-F] prefixed
                by '0x' or '0X'
            - octal constants allow digits [0-9] prefixed by a zero '0'
                character

            Size ('l/L' for long) and sign ('u/U' for unsigned) specifiers can
            be appended to any of those. tokens

            Plus/minus operators ('+'/'-') can prefix any of those tokens

            a numeric constant could start with a '.' (dot character)
        """
        pos = self.line_pos()
        tkn_value = ""
        bucket = ".0123456789aAbBcCdDeEfFlLuUxX-+"
        while self.peek_char() \
                and (self.peek_char() in bucket
                     or self.peek_char() == "\\\n"):
            if self.peek_char() in "xX":
                if tkn_value.startswith("0") is False or len(tkn_value) > 1:
                    raise TokenError(pos)
                for c in "xX":
                    if c in tkn_value:
                        raise TokenError(pos)

            elif self.peek_char() in "bB":
                if tkn_value != "0" \
                        and tkn_value.startswith("0x") is False \
                        and tkn_value.startswith("0X") is False:
                    raise TokenError(pos)

            elif self.peek_char() in "+-":
                if tkn_value.endswith("e") is False \
                        and tkn_value.endswith("E") is False \
                        or self.peek_sub_string(2) in ["++", "--"]:
                    break

            elif self.peek_char() in "eE" \
                    and "0x" not in tkn_value and "0X" not in tkn_value:
                if "e" in tkn_value or "E" in tkn_value \
                        or "f" in tkn_value or "F" in tkn_value \
                        or "u" in tkn_value or "U" in tkn_value \
                        or "l" in tkn_value or "L" in tkn_value:
                    raise TokenError(pos)

            elif self.peek_char() in "lL":
                lcount = tkn_value.count("l") + tkn_value.count("L")
                if lcount > 1 or (lcount == 1 and tkn_value[-1] not in "lL") \
                        or ("f" in tkn_value or "F" in tkn_value) \
                        and "0x" not in tkn_value and "0X" not in tkn_value:
                    raise TokenError(pos)
                elif self.peek_char() == 'l' and 'L' in tkn_value \
                        or self.peek_char() == 'L' and 'l' in tkn_value:
                    raise TokenError(pos)

            elif self.peek_char() in "uU":
                if "u" in tkn_value or "U" in tkn_value \
                        or (("e" in tkn_value or "E" in tkn_value
                            or "f" in tkn_value or "F" in tkn_value)
                            and (
                                "0x" not in tkn_value
                                and "0X" not in tkn_value)):
                    raise TokenError(pos)

            elif self.peek_char() in "Ff":
                if tkn_value.startswith("0x") is False \
                        and tkn_value.startswith("0X") is False \
                        and (
                            "." not in tkn_value
                            or "f" in tkn_value
                            or "F" in tkn_value) \
                        or "u" in tkn_value or "U" in tkn_value \
                        or "l" in tkn_value or "L" in tkn_value:
                    raise TokenError(pos)

            elif self.peek_char() in "aAbBcCdDeE" \
                    and tkn_value.startswith("0x") is False \
                    and tkn_value.startswith("0X") is False \
                    or "u" in tkn_value or "U" in tkn_value \
                    or "l" in tkn_value or "L" in tkn_value:
                raise TokenError(pos)

            elif self.peek_char() in "0123456789" \
                    and "u" in tkn_value or "U" in tkn_value \
                    or "l" in tkn_value or "L" in tkn_value:
                raise TokenError(pos)

            elif self.peek_char() == '.' and '.' in tkn_value:
                raise TokenError(pos)

            tkn_value += self.peek_char()
            self.pop_char()
        if tkn_value[-1] in "eE" and tkn_value.startswith("0x") is False \
                or tkn_value[-1] in "xX":
            raise TokenError(pos)
        else:
            self.tokens.append(Token("CONSTANT", pos, tkn_value))
Example #12
 def make_token(self, type, value=None):
     return Token(type=type,
                  value=value,
                  line=self.line_no,
                  column=self.position)  # TODO: account for '\t' width in the column
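Usage sketch; the namedtuple and the line_no/position fields stand in for whatever the real lexer maintains:

    from collections import namedtuple

    Token = namedtuple('Token', ['type', 'value', 'line', 'column'])

    class LexerSketch:
        def __init__(self):
            self.line_no = 1   # updated by the scanning loop (assumed)
            self.position = 0

        def make_token(self, type, value=None):
            # single factory so every token gets position info the same way
            return Token(type=type, value=value,
                         line=self.line_no, column=self.position)

    print(LexerSketch().make_token('NUMBER', 42))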