Example #1
class EdgeSchemaLexer(lexer.Lexer):

    start_state = STATE_WS_SENSITIVE

    NL = 'NEWLINE'
    MULTILINE_TOKENS = frozenset(('STRING', 'RAWSTRING', 'LINECONT'))
    RE_FLAGS = re.X | re.M | re.I

    # special placeholder error rule that is not meant to be
    # inserted explicitly into the lexical productions
    error_rule = Rule(
        token='ERROR',
        next_state=STATE_KEEP,
        regexp=None,
    )

    # a few reused rules
    string_rule = Rule(token='STRING',
                       next_state=STATE_KEEP,
                       regexp=r'''
           (?P<Q>
               # capture the opening quote in group Q
               (
                   ' | " |
                   {dollar_quote}
               )
           )
           (?:
               (\\['"] | \n | .)*?
           )
           (?P=Q)  # match closing quote type with whatever is in Q
        '''.format(dollar_quote=re_dquote))

    ident_rule = Rule(token='IDENT',
                      next_state=STATE_KEEP,
                      regexp=r'[^\W\d]\w*')

    qident_rule = Rule(token='QIDENT', next_state=STATE_KEEP, regexp=r'`.+?`')

    comment_rule = Rule(token='COMMENT',
                        next_state=STATE_KEEP,
                        regexp=r'\#[^\n]*$')

    line_cont_rule = Rule(token='LINECONT',
                          next_state=STATE_KEEP,
                          regexp=r'\\\n')

    bad_line_cont_rule = Rule(token='BADLINECONT',
                              next_state=STATE_KEEP,
                              regexp=r'\\.+?$')

    # Basic keywords
    keyword_rules = [
        Rule(token=tok[0], next_state=STATE_KEEP, regexp=lexer.group(val))
        for val, tok in edge_schema_keywords.items()
        if tok[0] not in {'LINK', 'TO', 'EXTENDING'}
    ]

    common_rules = keyword_rules + [
        Rule(token='LINK', next_state=STATE_KEEP, regexp=r'\bLINK\b'),

        # need to handle 'EXTENDING' differently based on whether it's
        # followed by '('
        Rule(token='EXTENDING',
             next_state=STATE_RAW_TYPE,
             regexp=r'\bEXTENDING\b(?!$|\s*\()'),
        Rule(token='EXTENDING', next_state=STATE_KEEP,
             regexp=r'\bEXTENDING\b'),
        comment_rule,
        Rule(token='WS', next_state=STATE_KEEP, regexp=r'[^\S\n]+'),
        line_cont_rule,
        bad_line_cont_rule,
        Rule(token='NEWLINE', next_state=STATE_KEEP, regexp=r'\n'),
        Rule(token='LPAREN',
             next_state=push_state(STATE_RAW_PAREN),
             regexp=r'\('),
        Rule(token='RPAREN', next_state=pop_state, regexp=r'\)'),
        Rule(token='COMMA', next_state=STATE_KEEP, regexp=r'\,'),
        Rule(token='DOUBLECOLON', next_state=STATE_KEEP, regexp=r'::'),
        Rule(token='ASSIGN', next_state=STATE_RAW_STRING, regexp=r':='),
        Rule(token='COLON', next_state=STATE_KEEP, regexp=r':'),

        # need to handle '->' differently based on whether it's
        # followed by '('
        Rule(token='ARROW', next_state=STATE_RAW_TYPE,
             regexp=r'->(?!$|\s*\()'),
        Rule(token='ARROW', next_state=STATE_KEEP, regexp=r'->'),
        Rule(token='DOT', next_state=STATE_KEEP, regexp=r'\.'),
        Rule(token='BADIDENT',
             next_state=STATE_KEEP,
             regexp=r'''
                    __[^\W\d]\w*__
                    |
                    `__.*?__`
                '''),
        string_rule,
        ident_rule,
        qident_rule,
    ]

    states = {
        STATE_WS_SENSITIVE: list(common_rules),
        STATE_WS_INSENSITIVE: list(common_rules),
        STATE_RAW_PAREN: [
            Rule(token='LPAREN',
                 next_state=push_state(STATE_RAW_PAREN),
                 regexp=r'\('),
            Rule(token='RPAREN', next_state=pop_state, regexp=r'\)'),
            string_rule,
            qident_rule,
            line_cont_rule,
            bad_line_cont_rule,
            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'''[^()`'"\\][^()`'"$\\]*'''),
        ],
        STATE_RAW_ANGLE: [
            line_cont_rule,
            bad_line_cont_rule,
            Rule(token='RAWSTRING',
                 next_state=push_state(STATE_RAW_ANGLE),
                 regexp=r'\<'),
            Rule(token='RAWSTRING', next_state=pop_state, regexp=r'\>'),
            string_rule,
            qident_rule,
            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'''[^<>`'"\\][^<>`'"$\\]*'''),
        ],
        STATE_RAW_TYPE: [
            line_cont_rule,
            bad_line_cont_rule,
            Rule(token='RAWSTRING',
                 next_state=STATE_WS_SENSITIVE,
                 regexp=r'(?=\n|:(?!:))'),
            Rule(token='RAWSTRING', next_state=STATE_KEEP, regexp=r'::'),
            Rule(token='RAWSTRING',
                 next_state=push_state(STATE_RAW_ANGLE),
                 regexp=r'\<'),
            string_rule,
            qident_rule,
            comment_rule,
            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'''[^:<`'"\n\\#][^:<`'"$\n\\#]*'''),
        ],
        STATE_RAW_STRING: [
            Rule(token='NEWLINE',
                 next_state=STATE_KEEP,
                 regexp=r'(?<=:[=>])\s*\n'),
            Rule(token='RAWSTRING',
                 next_state=STATE_WS_SENSITIVE,
                 regexp=r'(?<=:[=>])[^\n]+?$'),
            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'^[^\S\n]*\n'),
            Rule(token='RAWLEADWS', next_state=STATE_KEEP,
                 regexp=r'^[^\S\n]+'),

            # 0 indentation is the end of a raw string block
            Rule(token='RAWLEADWS', next_state=STATE_KEEP, regexp=r'^(?=\S)'),
            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'.*?(?:\n|.$)'),
        ],
    }

    def token_from_text(self, rule_token, txt):
        tok = super().token_from_text(rule_token, txt)

        if rule_token == 'QIDENT':
            tok = tok._replace(type='IDENT', value=txt[1:-1])

        return tok

    def get_start_tokens(self):
        '''Yield a number of start tokens.'''
        return ()

    def get_eof_tokens(self):
        '''Yield a number of EOF tokens.'''
        if self.logical_line_started:
            yield self.token_from_text('NL', '')

        # decrease indentation level at the end of input
        while len(self.indent) > 1:
            self.indent.pop()
            yield self.token_from_text('DEDENT', '')

    def insert_token(self, toktype, token, pos='start'):
        return lexer.Token('',
                           type=toktype,
                           text='',
                           start=getattr(token, pos),
                           end=getattr(token, pos),
                           filename=self.filename)

    def process_indent(self,
                       last_indent,
                       cur_indent,
                       token,
                       *,
                       skip_indent=False,
                       allow_indent=True,
                       allow_dedent=True,
                       pos='start'):
        # first and foremost, the indentation cannot have tabs after a space
        if ' \t' in cur_indent:
            raise EdgeIndentationError('Tabs used after spaces on line {line}',
                                       line=token.start.line,
                                       col=token.start.column,
                                       filename=self.filename)

        if cur_indent == last_indent:
            # indentation matches
            pass

        elif allow_indent and cur_indent.startswith(last_indent):
            if not skip_indent:
                # increase indentation level
                self.indent.append(cur_indent)
                yield self.insert_token('INDENT', token, pos)

        elif allow_dedent and last_indent.startswith(cur_indent):
            # decrease indentation level
            self.indent.pop()

            while self.indent:
                yield self.insert_token('DEDENT', token, pos)
                prev_indent = self.indent[-1]

                if cur_indent == prev_indent:
                    # indentation matches
                    return

                elif prev_indent.startswith(cur_indent):
                    # keep popping
                    self.indent.pop()

                else:
                    # it's not a match and current indent is no longer
                    # a proper prefix
                    raise EdgeIndentationError(
                        'Unexpected indentation level decrease on line {line}',
                        line=token.start.line,
                        col=token.start.column,
                        filename=self.filename)

            raise EdgeIndentationError(
                'Unexpected indentation level decrease on line {line}',
                line=token.start.line,
                col=token.start.column,
                filename=self.filename)

        else:
            # neither indentation is a prefix of the other,
            # which is an error
            raise EdgeIndentationError(
                'Inconsistent indentation on line {line}',
                line=token.start.line,
                col=token.start.column,
                filename=self.filename)

    def token_generator(self, token):
        """Given the current lexer token, yield one or more tokens."""

        tok_type = token.type

        # update current possible indent
        if tok_type == 'RAWLEADWS' or (not self.logical_line_started
                                       and self._state == STATE_WS_SENSITIVE
                                       and tok_type == 'WS'):
            self.cur_indent = token.value

        # initialize the indentation if it's still empty
        if not self.indent:
            if tok_type not in {'NEWLINE', 'WS', 'COMMENT', 'LINECONT'}:
                self.indent.append(self.cur_indent or '')

        last_indent = self.indent[-1] if self.indent else None
        cur_indent = self.cur_indent

        # handle indentation
        if (self._state == STATE_WS_SENSITIVE and not self.logical_line_started
                and tok_type not in {'NEWLINE', 'WS', 'COMMENT', 'LINECONT'}):

            self.cur_indent = None

            # we have potential indentation change
            for t in self.process_indent(last_indent, cur_indent, token):
                yield t

        # indentation of raw strings
        elif self._state == STATE_RAW_STRING:
            if not self.logical_line_started and tok_type != 'NEWLINE':
                # we MUST indent here

                # we have potential indentation change
                if tok_type == 'RAWLEADWS':
                    for t in self.process_indent(last_indent,
                                                 cur_indent,
                                                 token,
                                                 allow_dedent=False,
                                                 pos='end'):
                        yield t

            elif tok_type == 'RAWLEADWS':
                dedented = False
                for t in self.process_indent(last_indent,
                                             cur_indent,
                                             token,
                                             skip_indent=True,
                                             pos='end'):

                    if not dedented:
                        yield self.insert_token('NL', token)
                        dedented = True

                    yield t

                if dedented:
                    self._next_state = STATE_WS_SENSITIVE
                    # alter the token type & adjust logical newline
                    token = token._replace(type='WS')
                    tok_type = 'WS'
                    self.logical_line_started = False

        # handle logical newline
        if tok_type not in {
                'NEWLINE', 'WS', 'COMMENT', 'LINECONT', 'ERROR', 'BADIDENT'
        }:
            self.logical_line_started = True

        elif (self._state in {STATE_WS_SENSITIVE, STATE_RAW_STRING}
              and tok_type == 'NEWLINE'):

            # after any newline reset the indent
            self.cur_indent = ''
            # if there was a logical line, emit a special token
            if self.logical_line_started:
                yield self.insert_token('NL', token)
                self.logical_line_started = False

        if tok_type == 'LINECONT':
            # it is always an error to use line continuation mixed
            # into indentation
            if self.cur_indent is not None:
                # indentation level mismatch
                raise EdgeIndentationError(
                    'Illegal line continuation on line {line}',
                    line=token.start.line,
                    col=token.start.column,
                    filename=self.filename)

            if self._state in {STATE_WS_SENSITIVE, STATE_RAW_STRING}:
                token = token._replace(type='WS')
            else:
                token = token._replace(value='\n', type='RAWSTRING')

        yield token

    def lex(self):
        """Wrapper for the lexer."""

        self.indent = []
        self.state_stack = []
        self.logical_line_started = True
        self.prev_nw_tok = None  # previous NON-WHITESPACE token
        self.cur_indent = None
        self._next_state = None

        for tok in self._lex():
            # if we have any error token here, handle the error now
            if tok.type == 'BADIDENT':
                raise lexer.UnknownTokenError(
                    f"Illegal identifier '{tok.text}'",
                    line=tok.start.line,
                    col=tok.start.column,
                    filename=tok.filename)

            elif tok.type == 'ERROR':
                if (self.prev_nw_tok is not None
                        and self.prev_nw_tok.type == 'NL'):
                    # if this happened right after a newline, attempt to
                    # generate dedents so that the parser can report a
                    # better error if the previous line cannot be parsed
                    for eoftok in self.get_eof_tokens():
                        yield eoftok

                raise lexer.UnknownTokenError(f"Unexpected '{tok.text}'",
                                              line=tok.start.line,
                                              col=tok.start.column,
                                              filename=tok.filename)

            if tok.type not in {'NEWLINE', 'WS', 'LINECONT'}:
                self.prev_nw_tok = tok
            yield tok

    def _lex(self):
        """Lexes the src.

        Generator. Yields tokens (as defined by the rules).

        May yield special start and EOF tokens.
        May raise UnknownTokenError exception."""

        src = self.inputstr

        for tok in self.get_start_tokens():
            yield tok

        while self.start < self.end:
            for match in self.re_states[self._state].finditer(src, self.start):
                rule_id = match.lastgroup

                txt = match.group(rule_id)

                if rule_id == 'err':
                    rule = self.error_rule
                else:
                    rule = Rule._map[rule_id]

                rule_token = rule.token

                token = self.token_from_text(rule_token, txt)

                # the next state must be evaluated before yielding the
                # next token so that we have access to prevtok
                if rule.next_state:
                    # the next state can be callable
                    if callable(rule.next_state):
                        next_state = rule.next_state(self)
                    else:
                        next_state = rule.next_state
                else:
                    next_state = STATE_KEEP

                for tok in self.token_generator(token):
                    yield tok

                if next_state and next_state != self._state:
                    # Rule dictates that the lexer state should be
                    # switched
                    self._state = next_state
                    break
                elif self._next_state is not None:
                    self._state = self._next_state
                    self._next_state = None
                    break

        # End of file
        for tok in self.get_eof_tokens():
            yield tok
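
# --- Illustration (not part of the lexer above) ---------------------------
# A minimal sketch of how the shared STRING rule works: (?P<Q>...) captures
# the opening quote and the (?P=Q) backreference requires the identical
# closing quote, so ', " and dollar-quoted strings share a single rule.
# The dollar-quote pattern below is an assumed, simplified stand-in for
# re_dquote, which is defined outside this excerpt.
import re

re_dquote_demo = r'\$(?:[A-Za-z_][A-Za-z_0-9]*)?\$'  # assumption, not the real re_dquote

string_re = re.compile(r'''
    (?P<Q>
        ' | " |
        {dollar_quote}
    )
    (?:
        (\\['"] | \n | .)*?
    )
    (?P=Q)
'''.format(dollar_quote=re_dquote_demo), re.X | re.M)

for src in ("'hello'", '"say \\"hi\\" loudly"', "$fn$ mix of ' and \" quotes $fn$"):
    m = string_re.match(src)
    print(m.group('Q'), '->', m.group(0))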
Example #2
class EdgeSchemaLexer(lexer.Lexer):

    start_state = STATE_WS_SENSITIVE

    NL = 'NEWLINE'
    MULTILINE_TOKENS = frozenset(('STRING', 'RAWSTRING', 'LINECONT'))
    RE_FLAGS = re.X | re.M | re.I

    # a few reused rules
    string_rule = Rule(
        token='STRING',
        next_state=STATE_KEEP,
        regexp=r'''
           (?P<Q>
               # capture the opening quote in group Q
               (
                   ' | " |
                   {dollar_quote}
               )
           )
           (?:
               (\\['"] | \n | .)*?
           )
           (?P=Q)  # match closing quote type with whatever is in Q
        '''.format(dollar_quote=re_dquote))

    ident_rule = Rule(
        token='IDENT',
        next_state=STATE_KEEP,
        regexp=r'[^\W\d]\w*')

    qident_rule = Rule(
        token='QIDENT',
        next_state=STATE_KEEP,
        regexp=r'`.+?`')

    comment_rule = Rule(
        token='COMMENT',
        next_state=STATE_KEEP,
        regexp=r'\#[^\n]*$')

    line_cont_rule = Rule(
        token='LINECONT',
        next_state=STATE_KEEP,
        regexp=r'\\\n')

    bad_line_cont_rule = Rule(
        token='BADLINECONT',
        next_state=STATE_KEEP,
        regexp=r'\\.+?$')

    # Basic keywords
    keyword_rules = [Rule(token=tok[0],
                          next_state=STATE_KEEP,
                          regexp=lexer.group(val))
                     for val, tok in edge_schema_keywords.items()
                     if tok[0] not in {'LINK', 'TO', 'EXTENDING', 'ATTRIBUTE'}]

    common_rules = keyword_rules + [
        Rule(token='LINK',
             next_state=STATE_KEEP,
             regexp=r'\bLINK\b'),

        # need to handle 'EXTENDING' differently based on whether it's
        # followed by '('
        Rule(token='EXTENDING',
             next_state=STATE_RAW_TYPE,
             regexp=r'\bEXTENDING\b(?!$|\s*\()'),

        Rule(token='EXTENDING',
             next_state=STATE_KEEP,
             regexp=r'\bEXTENDING\b'),

        Rule(token='ATTRIBUTE',
             next_state=STATE_ATTRIBUTE_RAW_TYPE,
             regexp=r'\bATTRIBUTE\b'),

        comment_rule,

        Rule(token='WS',
             next_state=STATE_KEEP,
             regexp=r'[^\S\n]+'),

        line_cont_rule,
        bad_line_cont_rule,

        Rule(token='NEWLINE',
             next_state=STATE_KEEP,
             regexp=r'\n'),

        Rule(token='LPAREN',
             next_state=push_state(STATE_RAW_PAREN),
             regexp=r'\('),

        Rule(token='RPAREN',
             next_state=pop_state,
             regexp=r'\)'),

        Rule(token='COMMA',
             next_state=STATE_KEEP,
             regexp=r'\,'),

        Rule(token='DOUBLECOLON',
             next_state=STATE_KEEP,
             regexp=r'::'),

        Rule(token='TURNSTILE',
             next_state=STATE_RAW_STRING,
             regexp=r':='),

        Rule(token='COLONGT',
             next_state=STATE_RAW_STRING,
             regexp=r':>'),

        Rule(token='COLON',
             next_state=STATE_KEEP,
             regexp=r':'),

        # need to handle '->' differently based on whether it's
        # followed by '('
        Rule(token='ARROW',
             next_state=STATE_RAW_TYPE,
             regexp=r'->(?!$|\s*\()'),

        Rule(token='ARROW',
             next_state=STATE_KEEP,
             regexp=r'->'),

        Rule(token='DOT',
             next_state=STATE_KEEP,
             regexp=r'\.'),

        Rule(token='BADIDENT',
             next_state=STATE_KEEP,
             regexp=r'''
                    __[^\W\d]\w*__
                    |
                    `__.*?__`
                '''),

        string_rule,
        ident_rule,
        qident_rule,
    ]

    states = {
        STATE_WS_SENSITIVE: list(common_rules),
        STATE_WS_INSENSITIVE: list(common_rules),
        STATE_RAW_PAREN: [
            Rule(token='LPAREN',
                 next_state=push_state(STATE_RAW_PAREN),
                 regexp=r'\('),

            Rule(token='RPAREN',
                 next_state=pop_state,
                 regexp=r'\)'),

            string_rule,
            qident_rule,
            line_cont_rule,
            bad_line_cont_rule,

            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'''[^()`'"\\][^()`'"$\\]*'''),
        ],
        STATE_RAW_ANGLE: [
            line_cont_rule,
            bad_line_cont_rule,

            Rule(token='RAWSTRING',
                 next_state=push_state(STATE_RAW_ANGLE),
                 regexp=r'\<'),

            Rule(token='RAWSTRING',
                 next_state=pop_state,
                 regexp=r'\>'),

            string_rule,
            qident_rule,

            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'''[^<>`'"\\][^<>`'"$\\]*'''),
        ],
        STATE_RAW_TYPE: [
            line_cont_rule,
            bad_line_cont_rule,

            Rule(token='RAWSTRING',
                 next_state=STATE_WS_SENSITIVE,
                 regexp=r'(?=\n|:(?!:))'),

            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'::'),

            Rule(token='RAWSTRING',
                 next_state=push_state(STATE_RAW_ANGLE),
                 regexp=r'\<'),

            string_rule,
            qident_rule,
            comment_rule,

            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'''[^:<`'"\n\\#][^:<`'"$\n\\#]*'''),
        ],
        STATE_RAW_STRING: [
            Rule(token='NEWLINE',
                 next_state=STATE_KEEP,
                 regexp=r'(?<=:[=>])\s*\n'),

            Rule(token='RAWSTRING',
                 next_state=STATE_WS_SENSITIVE,
                 regexp=r'(?<=:[=>])[^\n]+?$'),

            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'^[^\S\n]*\n'),

            Rule(token='RAWLEADWS',
                 next_state=STATE_KEEP,
                 regexp=r'^[^\S\n]+'),

            # 0 indentation is the end of a raw string block
            Rule(token='RAWLEADWS',
                 next_state=STATE_KEEP,
                 regexp=r'^(?=\S)'),

            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'.*?(?:\n|.$)'),
        ],
        STATE_ATTRIBUTE_RAW_TYPE: [
            line_cont_rule,
            bad_line_cont_rule,

            Rule(token='RAWSTRING',
                 next_state=STATE_WS_SENSITIVE,
                 regexp=r'(?=\n|:(?!:))'),

            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'::'),

            Rule(token='RAWSTRING',
                 next_state=push_state(STATE_RAW_ANGLE),
                 regexp=r'\<'),

            Rule(token='LPAREN',
                 next_state=push_state(STATE_RAW_PAREN),
                 regexp=r'\('),

            string_rule,
            qident_rule,

            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'''[^:<(`'"\n\\][^:<(`'"$\n\\]*'''),
        ],
    }

    def token_from_text(self, rule_token, txt):
        if rule_token == 'BADIDENT':
            self.handle_error(txt)

        tok = super().token_from_text(rule_token, txt)

        if rule_token == 'QIDENT':
            tok = tok._replace(type='IDENT', value=txt[1:-1])

        return tok

    def get_start_tokens(self):
        '''Yield a number of start tokens.'''
        return ()

    def get_eof_tokens(self):
        '''Yield a number of EOF tokens.'''
        if self.logical_line_started:
            yield self.token_from_text('NL', '')

        # decrease indentation level at the end of input
        while len(self.indent) > 1:
            self.indent.pop()
            yield self.token_from_text('DEDENT', '')

    def insert_token(self, toktype, token, pos='start'):
        return lexer.Token('', type=toktype, text='',
                           start=getattr(token, pos),
                           end=getattr(token, pos),
                           filename=self.filename)

    def token_generator(self, token):
        """Given the current lexer token, yield one or more tokens."""

        tok_type = token.type

        # initialize the indentation if it's still empty
        if not self.indent:
            if tok_type == 'WS':
                self.indent.append(token.end.column - 1)
            else:
                self.indent.append(0)

        # handle indentation
        if (self._state == STATE_WS_SENSITIVE and
                not self.logical_line_started and
                tok_type not in {'NEWLINE', 'WS', 'COMMENT', 'LINECONT'}):

            # we have potential indentation change
            last_indent = self.indent[-1]
            cur_indent = token.start.column - 1

            if cur_indent > last_indent:
                # increase indentation level
                self.indent.append(cur_indent)
                yield self.insert_token('INDENT', token)

            elif cur_indent < last_indent:
                # decrease indentation level
                while self.indent[-1] > cur_indent:
                    self.indent.pop()
                    if self.indent[-1] < cur_indent:
                        # indentation level mismatch
                        raise EdgeIndentationError(
                            'Incorrect unindent at {position}',
                            line=token.start.line,
                            col=token.start.column,
                            filename=self.filename)

                    yield self.insert_token('DEDENT', token)

        # indentation of raw strings
        elif self._state == STATE_RAW_STRING:
            last_indent = self.indent[-1]
            # only valid for RAWLEADWS
            cur_indent = len(token.value)

            if not self.logical_line_started and tok_type != 'NEWLINE':
                # we MUST indent here
                if (tok_type == 'RAWLEADWS' and
                        cur_indent > last_indent):
                    # increase indentation level
                    self.indent.append(cur_indent)
                    yield self.insert_token('INDENT', token, 'end')

                elif token.value.strip():
                    # indentation level mismatch
                    raise EdgeIndentationError(
                        'Incorrect indentation at {position}',
                        line=token.end.line,
                        col=token.end.column,
                        filename=self.filename)

            elif (tok_type == 'RAWLEADWS' and
                    cur_indent < last_indent):
                # check the indentation level of each RAWLEADWS, exiting the
                # current state and issuing NL and DEDENT tokens if the
                # indentation falls below the starting value
                yield self.insert_token('NL', token)

                while self.indent[-1] > cur_indent:
                    self.indent.pop()
                    if self.indent[-1] < cur_indent:
                        # indentation level mismatch
                        raise EdgeIndentationError(
                            'Incorrect unindent at {position}',
                            line=token.end.line,
                            col=token.end.column,
                            filename=self.filename)

                    yield self.insert_token('DEDENT', token, 'end')

                self._next_state = STATE_WS_SENSITIVE
                # alter the token type & adjust logical newline
                token = token._replace(type='WS')
                tok_type = 'WS'
                self.logical_line_started = False

        # handle logical newline
        if (self.logical_line_started and
                self._state in {STATE_WS_SENSITIVE, STATE_RAW_STRING} and
                tok_type == 'NEWLINE'):
            yield self.insert_token('NL', token)
            self.logical_line_started = False

        elif tok_type not in {'NEWLINE', 'WS', 'COMMENT', 'LINECONT'}:
            self.logical_line_started = True

        if tok_type == 'LINECONT':
            if self._state in {STATE_WS_SENSITIVE, STATE_RAW_STRING}:
                token = token._replace(type='WS')
            else:
                token = token._replace(value='\n', type='RAWSTRING')

        yield token

    def lex(self):
        """Wrapper for the lexer."""

        self.indent = []
        self.state_stack = []
        self.logical_line_started = True
        self.prev_nw_tok = None  # previous NON-WHITESPACE token
        self._next_state = None

        for tok in self._lex():
            if tok.type not in {'NEWLINE', 'WS', 'LINECONT'}:
                self.prev_nw_tok = tok
            yield tok

    def _lex(self):
        """Lexes the src.

        Generator. Yields tokens (as defined by the rules).

        May yield special start and EOF tokens.
        May raise UnknownTokenError exception."""

        src = self.inputstr

        for tok in self.get_start_tokens():
            yield tok

        while self.start < self.end:
            for match in self.re_states[self._state].finditer(src, self.start):
                rule_id = match.lastgroup

                txt = match.group(rule_id)

                if rule_id == 'err':
                    # Error group -- no rule has been matched
                    self.handle_error(txt)

                rule = Rule._map[rule_id]
                rule_token = rule.token

                token = self.token_from_text(rule_token, txt)

                # the next state must be evaluated before yielding the
                # next token so that we have access to prevtok
                if rule.next_state:
                    # the next state can be callable
                    if callable(rule.next_state):
                        next_state = rule.next_state(self)
                    else:
                        next_state = rule.next_state
                else:
                    next_state = STATE_KEEP

                for tok in self.token_generator(token):
                    yield tok

                if next_state and next_state != self._state:
                    # Rule dictates that the lexer state should be
                    # switched
                    self._state = next_state
                    break
                elif self._next_state is not None:
                    self._state = self._next_state
                    self._next_state = None
                    break

        # End of file
        for tok in self.get_eof_tokens():
            yield tok
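
# --- Illustration (not part of the lexer above) ---------------------------
# A minimal standalone sketch of the integer indent stack used by
# token_generator: the column of the first token on each line is compared
# with the top of the stack, emitting INDENT when it grows and popping
# DEDENTs when it shrinks, much like Python's own tokenizer.  The input
# lines below are only illustrative.
def indent_tokens(lines):
    indent = [0]
    for line in lines:
        if not line.strip():
            continue  # blank lines do not affect indentation
        cur = len(line) - len(line.lstrip(' '))
        if cur > indent[-1]:
            indent.append(cur)
            yield ('INDENT', cur)
        else:
            while indent[-1] > cur:
                indent.pop()
                if indent[-1] < cur:
                    raise ValueError(f'incorrect unindent to column {cur}')
                yield ('DEDENT', cur)
        yield ('LINE', line.strip())

demo = [
    'concept User:',
    '    required link name to str:',
    '        title := "Name"',
    '    link age to int',
]
for t in indent_tokens(demo):
    print(t)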
Example #3
class EdgeQLLexer(lexer.Lexer):

    start_state = STATE_BASE

    MERGE_TOKENS = {('NAMED', 'ONLY'), ('SET', 'ATTRIBUTE')}

    NL = 'NL'
    MULTILINE_TOKENS = frozenset(('SCONST', 'BCONST', 'RSCONST'))
    RE_FLAGS = re.X | re.M | re.I

    # Basic keywords
    keyword_rules = [Rule(token=tok[0],
                          next_state=STATE_KEEP,
                          regexp=lexer.group(val))
                     for val, tok in edgeql_keywords.items()]

    common_rules = keyword_rules + [
        Rule(token='WS',
             next_state=STATE_KEEP,
             regexp=r'[^\S\n]+'),

        Rule(token='NL',
             next_state=STATE_KEEP,
             regexp=r'\n'),

        Rule(token='COMMENT',
             next_state=STATE_KEEP,
             regexp=r'''\#.*?$'''),

        Rule(token='ASSIGN',
             next_state=STATE_KEEP,
             regexp=r':='),

        Rule(token='ARROW',
             next_state=STATE_KEEP,
             regexp=r'->'),

        Rule(token='??',
             next_state=STATE_KEEP,
             regexp=r'\?\?'),

        Rule(token='::',
             next_state=STATE_KEEP,
             regexp=r'::'),

        # special path operators
        Rule(token='.<',
             next_state=STATE_KEEP,
             regexp=r'\.<'),

        Rule(token='.>',
             next_state=STATE_KEEP,
             regexp=r'\.>'),

        Rule(token='//',
             next_state=STATE_KEEP,
             regexp=r'//'),

        Rule(token='++',
             next_state=STATE_KEEP,
             regexp=r'\+\+'),

        Rule(token='OP',
             next_state=STATE_KEEP,
             regexp=r'''
                (?: >= | <= | != | \?= | \?!=)
             '''),

        Rule(token='self',
             next_state=STATE_KEEP,
             regexp=r'[,()\[\].@;:+\-*/%^<>=&|]'),

        Rule(token='FCONST',
             next_state=STATE_KEEP,
             regexp=r"""
                    (?: \d+ (?:\.\d+)?
                        (?:[eE](?:[+\-])?[0-9]+)
                    )
                    |
                    (?: \d+\.\d+)
                """),

        Rule(token='ICONST',
             next_state=STATE_KEEP,
             regexp=r'([1-9]\d* | 0)(?![0-9])'),

        Rule(token='BCONST',
             next_state=STATE_KEEP,
             regexp=rf'''
                (?:
                    b
                )
                (?P<BQ>
                    ' | "
                )
                (?:
                    (
                        \\\\ | \\['"] | \n | .
                        # we'll validate escape codes in the parser
                    )*?
                )
                (?P=BQ)
             '''),

        Rule(token='RSCONST',
             next_state=STATE_KEEP,
             regexp=rf'''
                (?:
                    r
                )?
                (?P<RQ>
                    (?:
                        (?<=r) (?: ' | ")
                    ) | (?:
                        (?<!r) (?: {re_dquote})
                    )
                )
                (?:
                    (
                        \n | .
                        # we'll validate escape codes in the parser
                    )*?
                )
                (?P=RQ)
             '''),

        Rule(token='SCONST',
             next_state=STATE_KEEP,
             regexp=rf'''
                (?P<Q>
                    ' | "
                )
                (?:
                    (
                        \\\\ | \\['"] | \n | .
                        # we'll validate escape codes in the parser
                    )*?
                )
                (?P=Q)
             '''),

        # this rule will capture malformed strings and allow us to
        # provide better error messages
        Rule(token='BADSCONST',
             next_state=STATE_KEEP,
             regexp=rf'''
                [rb]?
                (['"] | (?: {re_dquote}))
                [^\n]*
             '''),

        Rule(token='BADIDENT',
             next_state=STATE_KEEP,
             regexp=r'''
                    __[^\W\d]\w*__
                    |
                    `__.*?__`
                '''),

        Rule(token='IDENT',
             next_state=STATE_KEEP,
             regexp=r'[^\W\d]\w*'),

        Rule(token='QIDENT',
             next_state=STATE_KEEP,
             regexp=r'`[^@].*?`'),

        Rule(token='self',
             next_state=STATE_KEEP,
             regexp=r'[\{\}$]'),
    ]

    states = {
        STATE_BASE:
            common_rules,
    }

    def __init__(self):
        super().__init__()
        # add support for a few tokens composed of two elements
        self._possible_long_token = {x[0] for x in self.MERGE_TOKENS}
        self._long_token_match = {x[1]: x[0] for x in self.MERGE_TOKENS}

    def token_from_text(self, rule_token, txt):
        if rule_token == 'BADSCONST':
            raise lexer.UnknownTokenError(
                f"Unterminated string {txt}",
                line=self.lineno, col=self.column, filename=self.filename)
        elif rule_token == 'BADIDENT':
            self.handle_error(txt)

        tok = super().token_from_text(rule_token, txt)

        if rule_token == 'self':
            tok = tok._replace(type=txt)

        elif rule_token == 'QIDENT':
            tok = tok._replace(type='IDENT', value=txt[1:-1])

        return tok

    def lex(self):
        buffer = []

        for tok in super().lex():
            tok_type = tok.type

            if tok_type in {'WS', 'NL', 'COMMENT'}:
                # Strip out whitespace and comments
                continue

            elif tok_type in self._possible_long_token:
                # Buffer in case this is a merged token
                if not buffer:
                    buffer.append(tok)
                else:
                    yield from iter(buffer)
                    buffer[:] = [tok]

            elif tok_type in self._long_token_match:
                prev_token = buffer[-1] if buffer else None
                if (prev_token and
                        prev_token.type == self._long_token_match[tok_type]):
                    tok = prev_token._replace(
                        value=prev_token.value + ' ' + tok.value,
                        type=prev_token.type + tok_type)
                    buffer.pop()
                yield tok

            else:
                if buffer:
                    yield from iter(buffer)
                    buffer[:] = []
                yield tok

    def lex_highlight(self):
        return super().lex()
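
# --- Illustration (not part of the lexer above) ---------------------------
# A simplified standalone sketch of the MERGE_TOKENS buffering in lex(),
# using plain (type, value) tuples instead of lexer.Token objects: a NAMED
# or SET token is held back for one step and fused with a following ONLY or
# ATTRIBUTE token into a single NAMEDONLY / SETATTRIBUTE token.
MERGE_TOKENS = {('NAMED', 'ONLY'), ('SET', 'ATTRIBUTE')}
FIRST_PARTS = {first for first, _ in MERGE_TOKENS}
SECOND_PARTS = {second: first for first, second in MERGE_TOKENS}

def merge_tokens(tokens):
    buffer = []
    for ttype, value in tokens:
        if ttype in FIRST_PARTS:
            # hold this token back; it may start a merged pair
            yield from buffer
            buffer = [(ttype, value)]
        elif (ttype in SECOND_PARTS and buffer
                and buffer[-1][0] == SECOND_PARTS[ttype]):
            prev_type, prev_value = buffer.pop()
            yield (prev_type + ttype, prev_value + ' ' + value)
        else:
            yield from buffer
            buffer = []
            yield (ttype, value)
    yield from buffer

demo = [('SET', 'set'), ('ATTRIBUTE', 'attribute'), ('IDENT', 'title'),
        ('NAMED', 'named'), ('IDENT', 'x')]
print(list(merge_tokens(demo)))
# [('SETATTRIBUTE', 'set attribute'), ('IDENT', 'title'),
#  ('NAMED', 'named'), ('IDENT', 'x')]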
Example #4
class EdgeQLLexer(lexer.Lexer):

    start_state = STATE_BASE

    NL = 'NL'
    MULTILINE_TOKENS = frozenset(('SCONST', ))
    RE_FLAGS = re.X | re.M | re.I

    # Basic keywords
    keyword_rules = [
        Rule(token=tok[0], next_state=STATE_KEEP, regexp=lexer.group(val))
        for val, tok in edgeql_keywords.items()
    ]

    common_rules = keyword_rules + [
        Rule(token='WS', next_state=STATE_KEEP, regexp=r'[^\S\n]+'),
        Rule(token='NL', next_state=STATE_KEEP, regexp=r'\n'),
        Rule(token='COMMENT', next_state=STATE_KEEP, regexp=r'''\#.*?$'''),
        Rule(token='TURNSTILE', next_state=STATE_KEEP, regexp=r':='),
        Rule(token='ARROW', next_state=STATE_KEEP, regexp=r'->'),
        Rule(token='??', next_state=STATE_KEEP, regexp=r'\?\?'),
        Rule(token='::', next_state=STATE_KEEP, regexp=r'::'),

        # special path operators
        Rule(token='.<', next_state=STATE_KEEP, regexp=r'\.<'),
        Rule(token='.>', next_state=STATE_KEEP, regexp=r'\.>'),
        Rule(token='OP',
             next_state=STATE_KEEP,
             regexp=r'''
                (?: >= | <= | != | \?= | \?!=)
             '''),

        # SQL ops
        Rule(token='self',
             next_state=STATE_KEEP,
             regexp=r'[,()\[\].@;:+\-*/%^<>=&|]'),
        Rule(token='FCONST',
             next_state=STATE_KEEP,
             regexp=r"""
                    (?: \d+ (?:\.\d+)?
                        (?:[eE](?:[+\-])?[0-9]+)
                    )
                    |
                    (?: \d+\.\d+)
                """),
        Rule(token='ICONST',
             next_state=STATE_KEEP,
             regexp=r'([1-9]\d* | 0)(?![0-9])'),
        Rule(token='SCONST',
             next_state=STATE_KEEP,
             regexp=rf'''
                (?P<Q>
                    # capture the opening quote in group Q
                    (
                        ' | " |
                        {re_dquote}
                    )
                )
                (?:
                    (\\['"] | \n | .)*?
                )
                (?P=Q)      # match closing quote type with whatever is in Q
             '''),
        Rule(token='BADIDENT',
             next_state=STATE_KEEP,
             regexp=r'''
                    __[^\W\d]\w*__
                    |
                    `__.*?__`
                '''),
        Rule(token='IDENT', next_state=STATE_KEEP, regexp=r'[^\W\d]\w*'),
        Rule(token='QIDENT', next_state=STATE_KEEP, regexp=r'`[^@].*?`'),
        Rule(token='self', next_state=STATE_KEEP, regexp=r'[\{\}$]'),
    ]

    states = {
        STATE_BASE: common_rules,
    }

    def token_from_text(self, rule_token, txt):
        if rule_token == 'BADIDENT':
            self.handle_error(txt)

        tok = super().token_from_text(rule_token, txt)

        if rule_token == 'self':
            tok = tok._replace(type=txt)

        elif rule_token == 'QIDENT':
            tok = tok._replace(type='IDENT', value=txt[1:-1])

        return tok

    def lex(self):
        buffer = []

        for tok in super().lex():
            tok_type = tok.type

            if tok_type in {'WS', 'NL', 'COMMENT'}:
                # Strip out whitespace and comments
                continue
            else:
                if buffer:
                    yield from iter(buffer)
                    buffer[:] = []
                yield tok

    def lex_highlight(self):
        return super().lex()
Example #5
class PgSQLLexer(lexer.Lexer):

    start_state = STATE_BASE

    NL = 'NL'
    MULTILINE_TOKENS = frozenset(('COMMENT', 'SCONST'))
    RE_FLAGS = re.X | re.M | re.I

    # Basic keywords
    keyword_rules = [
        Rule(token='KEYWORD',
             next_state=STATE_KEEP,
             regexp=lexer.group(*pg_keywords.keys()))
    ]

    common_rules = keyword_rules + [
        Rule(token='WS', next_state=STATE_KEEP, regexp=r'[^\S\n]+'),
        Rule(token='NL', next_state=STATE_KEEP, regexp=r'\n'),
        Rule(token='COMMENT',
             next_state=STATE_KEEP,
             regexp=r'''
                    (?:/\*(?:.|\n)*?\*/)
                    | (?:--.*?$)
                '''),
        Rule(token='TYPECAST', next_state=STATE_KEEP, regexp=r'::'),

        # multichar ops (so 2+ chars)
        Rule(token='Op',
             next_state=STATE_KEEP,
             regexp=r'''
                # EdgeQL-specific multi-char ops
                {opchar_pg} (?:{opchar}(?!/\*|--))+
                |
                (?:{opchar}(?!/\*|--))+ {opchar_pg} (?:{opchar}(?!/\*|--))*
                |
                # SQL-only multi-char ops cannot end in + or -
                (?:{opchar_sql}(?!/\*|--))+[*/^%<>=]
             '''.format(opchar_pg=re_opchars_pgsql,
                        opchar=re_opchars,
                        opchar_sql=re_opchars_sql)),

        # PgSQL single char ops
        Rule(token='Op', next_state=STATE_KEEP, regexp=re_opchars_pgsql),

        # SQL ops
        Rule(token='self', next_state=STATE_KEEP, regexp=re_self),
        Rule(token='FCONST',
             next_state=STATE_KEEP,
             regexp=r"""
                    (?: \d+ (?:\.\d*)?
                        |
                        \. \d+
                    ) {exppart}
                """.format(exppart=re_exppart)),
        Rule(token='FCONST',
             next_state=STATE_KEEP,
             regexp=r'''
                (?: \d+\.(?!\.)\d*
                    |
                    \.\d+)
             '''),
        Rule(token='ICONST', next_state=STATE_KEEP, regexp=r'\d+'),
        Rule(token='BCONST',
             next_state=STATE_KEEP,
             regexp=r'''
                B'(?:
                    [01]
                    |
                    ''
                    |
                    ' (?:\s*\n\s*) '
                )*'
             '''),
        Rule(token='XCONST',
             next_state=STATE_KEEP,
             regexp=r'''
                X'(?:
                    [\da-fA-F]
                    |
                    ''
                    |
                    ' (?:\s*\n\s*) '
                )*'
             '''),

        # no extra checks for correct escaping inside the string
        Rule(token='SCONST',
             next_state=STATE_KEEP,
             regexp=r'''
                [nNeE]?
                '(?:
                    [^']
                    |
                    ''
                    |
                    ' (?:\s*\n\s*) '
                )*'
             '''),

        # dollar quoted strings
        Rule(token='DQCONST',
             next_state=STATE_KEEP,
             regexp=r'''
                \$(?P<dq> (?:{ident_start}{ident_cont}*)? )\$
                    .*?
                \$(?P=dq)\$
                '''.format(ident_start=re_ident_start,
                           ident_cont=re_ident_cont)),

        # specifying custom escape character
        Rule(token='UESCAPE',
             next_state=STATE_KEEP,
             regexp=r"""UESCAPE\s+'[^a-fA-F\d\s+'"]'"""),

        # quoted identifier
        Rule(token='QIDENT',
             next_state=STATE_KEEP,
             regexp=r'''
                    (?:U&)?
                    "(?:
                        [^"]
                        |
                        ""
                    )+"
                '''),
        Rule(token='PARAM', next_state=STATE_KEEP, regexp=r'\$\d+'),
        Rule(token='IDENT',
             next_state=STATE_KEEP,
             regexp=r'''
                    {ident_start}{ident_cont}*
                '''.format(ident_start=re_ident_start,
                           ident_cont=re_ident_cont)),
    ]

    states = {
        STATE_BASE: common_rules,
    }

    def token_from_text(self, rule_token, txt):
        tok = super().token_from_text(rule_token, txt)

        if rule_token == 'self':
            tok = tok._replace(type=txt)

        elif rule_token == 'IDENT':
            tok = tok._replace(value=txt.lower())

        elif rule_token == 'KEYWORD':
            # process keywords here since having separate rules for them
            # creates > 100 re groups.
            txt_low = txt.lower()
            tok = tok._replace(value=txt_low, type=pg_keywords[txt_low][0])

        elif rule_token in ('SCONST', 'BCONST', 'XCONST'):
            txt = txt[:-1].split("'", 1)[1]
            txt = clean_string.sub('', txt.replace("''", "'"))
            tok = tok._replace(value=txt)

        elif rule_token == 'PARAM':
            tok = tok._replace(value=txt[1:])

        elif rule_token == 'QIDENT':
            tok = tok._replace(type='IDENT', value=txt[:-1].split('"', 1)[1])

        elif rule_token == 'DQCONST':
            txt = txt.rsplit("$", 2)[0]
            txt = txt.split("$", 2)[2]
            tok = tok._replace(type='SCONST', value=txt)

        return tok
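
# --- Illustration (not part of the lexer above) ---------------------------
# A minimal standalone sketch of the literal clean-up done in
# token_from_text above: regular quoted constants are unwrapped and doubled
# quotes collapsed, dollar-quoted constants have their $tag$ wrappers
# stripped.  The newline-joining step done by clean_string (defined outside
# this excerpt) is omitted here.
def unquote_sconst(txt):
    # "E'it''s'" -> "it's": drop the closing quote, cut off the prefix and
    # opening quote, then collapse doubled single quotes
    body = txt[:-1].split("'", 1)[1]
    return body.replace("''", "'")

def unquote_dqconst(txt):
    # "$fn$ body $fn$" -> " body ": strip the trailing, then the leading,
    # $tag$ wrapper
    body = txt.rsplit("$", 2)[0]
    return body.split("$", 2)[2]

print(unquote_sconst("E'it''s'"))          # it's
print(unquote_dqconst("$fn$ body $fn$"))   # " body "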
Example #6
class GraphQLLexer(lexer.Lexer):

    start_state = STATE_BASE

    NL = 'NL'
    RE_FLAGS = re.X | re.M

    # Basic keywords
    keyword_rules = [
        Rule(token=tok[0], next_state=STATE_KEEP, regexp=lexer.group(val))
        for val, tok in graphql_keywords.items()
    ]

    common_rules = keyword_rules + [
        Rule(token='NL', next_state=STATE_KEEP, regexp=r'\r\n|\n|\r'),
        Rule(token='WS', next_state=STATE_KEEP, regexp=r'[ \t]+'),
        Rule(token='COMMA', next_state=STATE_KEEP, regexp=r','),
        Rule(token='COMMENT', next_state=STATE_KEEP, regexp=r'\#[^\n]*$'),
        Rule(token='LPAREN', next_state=STATE_KEEP, regexp=r'\('),
        Rule(token='RPAREN', next_state=STATE_KEEP, regexp=r'\)'),
        Rule(token='LSBRACKET', next_state=STATE_KEEP, regexp=r'\['),
        Rule(token='RSBRACKET', next_state=STATE_KEEP, regexp=r'\]'),
        Rule(token='LCBRACKET', next_state=STATE_KEEP, regexp=r'\{'),
        Rule(token='RCBRACKET', next_state=STATE_KEEP, regexp=r'\}'),
        Rule(token='BANG', next_state=STATE_KEEP, regexp=r'\!'),
        Rule(token='ELLIPSIS', next_state=STATE_KEEP, regexp=r'\.\.\.'),
        Rule(token='COLON', next_state=STATE_KEEP, regexp=r':'),
        Rule(token='EQUAL', next_state=STATE_KEEP, regexp=r'='),
        Rule(token='AT', next_state=STATE_KEEP, regexp=r'@'),
        Rule(token='INTEGER',
             next_state=STATE_KEEP,
             regexp=r'-?(?:0|[1-9][0-9]*)(?![eE.0-9])'),
        Rule(token='FLOAT',
             next_state=STATE_KEEP,
             regexp=r'''
                -?(0|[1-9][0-9]*)
                    (\.[0-9]+)?
                        ([eE][+-]?[0-9]+)?
                        (?![eE.0-9])  # must not be followed by a number
             '''),
        Rule(token='STRING',
             next_state=STATE_KEEP,
             regexp=r'''
                    (?:r)?" [^\n]*?
                    (?<!\\)"
             '''),
        Rule(token='IDENT',
             next_state=STATE_KEEP,
             regexp=r'[_A-Za-z][_0-9A-Za-z]*'),
        Rule(token='VAR', next_state=STATE_KEEP, regexp=r'\$[_0-9A-Za-z]+'),
        Rule(token='DOLLAR', next_state=STATE_KEEP, regexp=r'\$'),
    ]

    states = {
        STATE_BASE: list(common_rules),
    }

    def handle_error(self, txt):
        # check if this is an unterminated string instead of a generic error
        if txt == '"':

            pos = re.compile(r'$', self.RE_FLAGS).search(
                self.inputstr, self.start).start()
            pos += self.column - self.start
            raise UnterminatedStringError(
                'unterminated string token {position}',
                line=self.lineno,
                col=pos,
                filename=self.filename)

        super().handle_error(txt)
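
# --- Illustration (not part of the lexer above) ---------------------------
# A minimal standalone check of the STRING rule: the lazy body combined with
# the (?<!\\)" lookbehind makes the match end at the first quote that is not
# preceded by a backslash, so escaped quotes stay inside the token, while a
# string with no closing quote on the line does not match at all and falls
# through to handle_error above.
import re

gql_string = re.compile(r'''
        (?:r)?" [^\n]*?
        (?<!\\)"
''', re.X | re.M)

print(gql_string.match(r'"plain"').group(0))            # "plain"
print(gql_string.match(r'"say \"hi\"" rest').group(0))  # "say \"hi\""
print(gql_string.match('"unterminated\n'))              # None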