Example #1
class EdgeSchemaLexer(lexer.Lexer):

    start_state = STATE_WS_SENSITIVE

    NL = 'NEWLINE'
    MULTILINE_TOKENS = frozenset(('STRING', 'RAWSTRING', 'LINECONT'))
    RE_FLAGS = re.X | re.M | re.I

    # special placeholder error rule that is not meant to be
    # inserted explicitly into the lexical productions
    error_rule = Rule(
        token='ERROR',
        next_state=STATE_KEEP,
        regexp=None,
    )

    # a few reused rules
    string_rule = Rule(token='STRING',
                       next_state=STATE_KEEP,
                       regexp=r'''
           (?P<Q>
               # capture the opening quote in group Q
               (
                   ' | " |
                   {dollar_quote}
               )
           )
           (?:
               (\\['"] | \n | .)*?
           )
           (?P=Q)  # match closing quote type with whatever is in Q
        '''.format(dollar_quote=re_dquote))

    ident_rule = Rule(token='IDENT',
                      next_state=STATE_KEEP,
                      regexp=r'[^\W\d]\w*')

    qident_rule = Rule(token='QIDENT', next_state=STATE_KEEP, regexp=r'`.+?`')

    comment_rule = Rule(token='COMMENT',
                        next_state=STATE_KEEP,
                        regexp=r'\#[^\n]*$')

    line_cont_rule = Rule(token='LINECONT',
                          next_state=STATE_KEEP,
                          regexp=r'\\\n')

    bad_line_cont_rule = Rule(token='BADLINECONT',
                              next_state=STATE_KEEP,
                              regexp=r'\\.+?$')

    # Basic keywords
    keyword_rules = [
        Rule(token=tok[0], next_state=STATE_KEEP, regexp=lexer.group(val))
        for val, tok in edge_schema_keywords.items()
        if tok[0] not in {'LINK', 'TO', 'EXTENDING'}
    ]

    common_rules = keyword_rules + [
        Rule(token='LINK', next_state=STATE_KEEP, regexp=r'\bLINK\b'),

        # need to handle 'EXTENDING' differently based on whether it's
        # followed by '('
        Rule(token='EXTENDING',
             next_state=STATE_RAW_TYPE,
             regexp=r'\bEXTENDING\b(?!$|\s*\()'),
        Rule(token='EXTENDING', next_state=STATE_KEEP,
             regexp=r'\bEXTENDING\b'),
        comment_rule,
        Rule(token='WS', next_state=STATE_KEEP, regexp=r'[^\S\n]+'),
        line_cont_rule,
        bad_line_cont_rule,
        Rule(token='NEWLINE', next_state=STATE_KEEP, regexp=r'\n'),
        Rule(token='LPAREN',
             next_state=push_state(STATE_RAW_PAREN),
             regexp=r'\('),
        Rule(token='RPAREN', next_state=pop_state, regexp=r'\)'),
        Rule(token='COMMA', next_state=STATE_KEEP, regexp=r'\,'),
        Rule(token='DOUBLECOLON', next_state=STATE_KEEP, regexp=r'::'),
        Rule(token='ASSIGN', next_state=STATE_RAW_STRING, regexp=r':='),
        Rule(token='COLON', next_state=STATE_KEEP, regexp=r':'),

        # need to handle '->' differently based on whether it's
        # followed by '('
        Rule(token='ARROW', next_state=STATE_RAW_TYPE,
             regexp=r'->(?!$|\s*\()'),
        Rule(token='ARROW', next_state=STATE_KEEP, regexp=r'->'),
        Rule(token='DOT', next_state=STATE_KEEP, regexp=r'\.'),
        Rule(token='BADIDENT',
             next_state=STATE_KEEP,
             regexp=r'''
                    __[^\W\d]\w*__
                    |
                    `__.*?__`
                '''),
        string_rule,
        ident_rule,
        qident_rule,
    ]

    states = {
        STATE_WS_SENSITIVE: list(common_rules),
        STATE_WS_INSENSITIVE: list(common_rules),
        STATE_RAW_PAREN: [
            Rule(token='LPAREN',
                 next_state=push_state(STATE_RAW_PAREN),
                 regexp=r'\('),
            Rule(token='RPAREN', next_state=pop_state, regexp=r'\)'),
            string_rule,
            qident_rule,
            line_cont_rule,
            bad_line_cont_rule,
            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'''[^()`'"\\][^()`'"$\\]*'''),
        ],
        STATE_RAW_ANGLE: [
            line_cont_rule,
            bad_line_cont_rule,
            Rule(token='RAWSTRING',
                 next_state=push_state(STATE_RAW_ANGLE),
                 regexp=r'\<'),
            Rule(token='RAWSTRING', next_state=pop_state, regexp=r'\>'),
            string_rule,
            qident_rule,
            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'''[^<>`'"\\][^<>`'"$\\]*'''),
        ],
        STATE_RAW_TYPE: [
            line_cont_rule,
            bad_line_cont_rule,
            Rule(token='RAWSTRING',
                 next_state=STATE_WS_SENSITIVE,
                 regexp=r'(?=\n|:(?!:))'),
            Rule(token='RAWSTRING', next_state=STATE_KEEP, regexp=r'::'),
            Rule(token='RAWSTRING',
                 next_state=push_state(STATE_RAW_ANGLE),
                 regexp=r'\<'),
            string_rule,
            qident_rule,
            comment_rule,
            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'''[^:<`'"\n\\#][^:<`'"$\n\\#]*'''),
        ],
        STATE_RAW_STRING: [
            Rule(token='NEWLINE',
                 next_state=STATE_KEEP,
                 regexp=r'(?<=:[=>])\s*\n'),
            Rule(token='RAWSTRING',
                 next_state=STATE_WS_SENSITIVE,
                 regexp=r'(?<=:[=>])[^\n]+?$'),
            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'^[^\S\n]*\n'),
            Rule(token='RAWLEADWS', next_state=STATE_KEEP,
                 regexp=r'^[^\S\n]+'),

            # 0 indentation is the end of a raw string block
            Rule(token='RAWLEADWS', next_state=STATE_KEEP, regexp=r'^(?=\S)'),
            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'.*?(?:\n|.$)'),
        ],
    }

    def token_from_text(self, rule_token, txt):
        tok = super().token_from_text(rule_token, txt)

        if rule_token == 'QIDENT':
            tok = tok._replace(type='IDENT', value=txt[1:-1])

        return tok

    def get_start_tokens(self):
        '''Yield a number of start tokens.'''
        return ()

    def get_eof_tokens(self):
        '''Yield a number of EOF tokens.'''
        if self.logical_line_started:
            yield self.token_from_text('NL', '')

        # decrease indentation level at the end of input
        while len(self.indent) > 1:
            self.indent.pop()
            yield self.token_from_text('DEDENT', '')

    def insert_token(self, toktype, token, pos='start'):
        return lexer.Token('',
                           type=toktype,
                           text='',
                           start=getattr(token, pos),
                           end=getattr(token, pos),
                           filename=self.filename)

    def process_indent(self,
                       last_indent,
                       cur_indent,
                       token,
                       *,
                       skip_indent=False,
                       allow_indent=True,
                       allow_dedent=True,
                       pos='start'):
        # first and foremost, the indentation cannot have tabs after a space
        if ' \t' in cur_indent:
            raise EdgeIndentationError('Tabs used after spaces on line {line}',
                                       line=token.start.line,
                                       col=token.start.column,
                                       filename=self.filename)

        if cur_indent == last_indent:
            # indentation matches
            pass

        elif allow_indent and cur_indent.startswith(last_indent):
            if not skip_indent:
                # increase indentation level
                self.indent.append(cur_indent)
                yield self.insert_token('INDENT', token, pos)

        elif allow_dedent and last_indent.startswith(cur_indent):
            # decrease indentation level
            self.indent.pop()

            while self.indent:
                yield self.insert_token('DEDENT', token, pos)
                prev_indent = self.indent[-1]

                if cur_indent == prev_indent:
                    # indentation matches
                    return

                elif prev_indent.startswith(cur_indent):
                    # keep popping
                    self.indent.pop()

                else:
                    # it's not a match and current indent is no longer
                    # a proper prefix
                    raise EdgeIndentationError(
                        'Unexpected indentation level decrease on line {line}',
                        line=token.start.line,
                        col=token.start.column,
                        filename=self.filename)

            raise EdgeIndentationError(
                'Unexpected indentation level decrease on line {line}',
                line=token.start.line,
                col=token.start.column,
                filename=self.filename)

        else:
            # neither indentation is a prefix of the other,
            # which is an error
            raise EdgeIndentationError(
                'Inconsistent indentation on line {line}',
                line=token.start.line,
                col=token.start.column,
                filename=self.filename)

    def token_generator(self, token):
        """Given the current lexer token, yield one or more tokens."""

        tok_type = token.type

        # update current possible indent
        if tok_type == 'RAWLEADWS' or (not self.logical_line_started
                                       and self._state == STATE_WS_SENSITIVE
                                       and tok_type == 'WS'):
            self.cur_indent = token.value

        # initialize the indentation if it's still empty
        if not self.indent:
            if tok_type not in {'NEWLINE', 'WS', 'COMMENT', 'LINECONT'}:
                self.indent.append(self.cur_indent or '')

        last_indent = self.indent[-1] if self.indent else None
        cur_indent = self.cur_indent

        # handle indentation
        if (self._state == STATE_WS_SENSITIVE and not self.logical_line_started
                and tok_type not in {'NEWLINE', 'WS', 'COMMENT', 'LINECONT'}):

            self.cur_indent = None

            # we have potential indentation change
            for t in self.process_indent(last_indent, cur_indent, token):
                yield t

        # indentation of raw strings
        elif self._state == STATE_RAW_STRING:
            if not self.logical_line_started and tok_type != 'NEWLINE':
                # we MUST indent here

                # we have potential indentation change
                if tok_type == 'RAWLEADWS':
                    for t in self.process_indent(last_indent,
                                                 cur_indent,
                                                 token,
                                                 allow_dedent=False,
                                                 pos='end'):
                        yield t

            elif tok_type == 'RAWLEADWS':
                dedented = False
                for t in self.process_indent(last_indent,
                                             cur_indent,
                                             token,
                                             skip_indent=True,
                                             pos='end'):

                    if not dedented:
                        yield self.insert_token('NL', token)
                        dedented = True

                    yield t

                if dedented:
                    self._next_state = STATE_WS_SENSITIVE
                    # alter the token type & adjust logical newline
                    token = token._replace(type='WS')
                    tok_type = 'WS'
                    self.logical_line_started = False

        # handle logical newline
        if tok_type not in {
                'NEWLINE', 'WS', 'COMMENT', 'LINECONT', 'ERROR', 'BADIDENT'
        }:
            self.logical_line_started = True

        elif (self._state in {STATE_WS_SENSITIVE, STATE_RAW_STRING}
              and tok_type == 'NEWLINE'):

            # after any newline reset the indent
            self.cur_indent = ''
            # if there was a logical line, emit a special token
            if self.logical_line_started:
                yield self.insert_token('NL', token)
                self.logical_line_started = False

        if tok_type == 'LINECONT':
            # it is always an error to use line continuation mixed
            # into indentation
            if self.cur_indent is not None:
                # indentation level mismatch
                raise EdgeIndentationError(
                    'Illegal line continuation on line {line}',
                    line=token.start.line,
                    col=token.start.column,
                    filename=self.filename)

            if self._state in {STATE_WS_SENSITIVE, STATE_RAW_STRING}:
                token = token._replace(type='WS')
            else:
                token = token._replace(value='\n', type='RAWSTRING')

        yield token

    def lex(self):
        """Wrapper for the lexer."""

        self.indent = []
        self.state_stack = []
        self.logical_line_started = True
        self.prev_nw_tok = None  # previous NON-WHITESPACE token
        self.cur_indent = None
        self._next_state = None

        for tok in self._lex():
            # if we have any error token here, handle the error now
            if tok.type == 'BADIDENT':
                raise lexer.UnknownTokenError(
                    f"Illegal identifier '{tok.text}'",
                    line=tok.start.line,
                    col=tok.start.column,
                    filename=tok.filename)

            elif tok.type == 'ERROR':
                if (self.prev_nw_tok is not None
                        and self.prev_nw_tok.type == 'NL'):
                    # if this happened right after a newline, attempt to
                    # generate dedents so that the parser can report a
                    # better error if the previous line cannot be parsed
                    for eoftok in self.get_eof_tokens():
                        yield eoftok

                raise lexer.UnknownTokenError(f"Unexpected '{tok.text}'",
                                              line=tok.start.line,
                                              col=tok.start.column,
                                              filename=tok.filename)

            if tok.type not in {'NEWLINE', 'WS', 'LINECONT'}:
                self.prev_nw_tok = tok
            yield tok

    def _lex(self):
        """Lexes the src.

        Generator. Yields tokens (as defined by the rules).

        May yield special start and EOF tokens.
        May raise UnknownTokenError exception."""

        src = self.inputstr

        for tok in self.get_start_tokens():
            yield tok

        while self.start < self.end:
            for match in self.re_states[self._state].finditer(src, self.start):
                rule_id = match.lastgroup

                txt = match.group(rule_id)

                if rule_id == 'err':
                    rule = self.error_rule
                else:
                    rule = Rule._map[rule_id]

                rule_token = rule.token

                token = self.token_from_text(rule_token, txt)

                # the next state must be evaluated before yielding the
                # next token so that we have access to prevtok
                if rule.next_state:
                    # the next state can be callable
                    if callable(rule.next_state):
                        next_state = rule.next_state(self)
                    else:
                        next_state = rule.next_state
                else:
                    next_state = STATE_KEEP

                for tok in self.token_generator(token):
                    yield tok

                if next_state and next_state != self._state:
                    # Rule dictates that the lexer state should be
                    # switched
                    self._state = next_state
                    break
                elif self._next_state is not None:
                    self._state = self._next_state
                    self._next_state = None
                    break

        # End of file
        for tok in self.get_eof_tokens():
            yield tok
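
# --- Illustration (not part of the lexer above) ---------------------------
# A minimal sketch of how the shared STRING rule works: (?P<Q>...) captures
# the opening quote and the (?P=Q) backreference requires the identical
# closing quote, so ', " and dollar-quoted strings share a single rule.
# The dollar-quote pattern below is an assumed, simplified stand-in for
# re_dquote, which is defined outside this excerpt.
import re

re_dquote_demo = r'\$(?:[A-Za-z_][A-Za-z_0-9]*)?\$'  # assumption, not the real re_dquote

string_re = re.compile(r'''
    (?P<Q>
        ' | " |
        {dollar_quote}
    )
    (?:
        (\\['"] | \n | .)*?
    )
    (?P=Q)
'''.format(dollar_quote=re_dquote_demo), re.X | re.M)

for src in ("'hello'", '"say \\"hi\\" loudly"', "$fn$ mix of ' and \" quotes $fn$"):
    m = string_re.match(src)
    print(m.group('Q'), '->', m.group(0))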
Example #2
class EdgeSchemaLexer(lexer.Lexer):

    start_state = STATE_WS_SENSITIVE

    NL = 'NEWLINE'
    MULTILINE_TOKENS = frozenset(('STRING', 'RAWSTRING', 'LINECONT'))
    RE_FLAGS = re.X | re.M | re.I

    # a few reused rules
    string_rule = Rule(
        token='STRING',
        next_state=STATE_KEEP,
        regexp=r'''
           (?P<Q>
               # capture the opening quote in group Q
               (
                   ' | " |
                   {dollar_quote}
               )
           )
           (?:
               (\\['"] | \n | .)*?
           )
           (?P=Q)  # match closing quote type with whatever is in Q
        '''.format(dollar_quote=re_dquote))

    ident_rule = Rule(
        token='IDENT',
        next_state=STATE_KEEP,
        regexp=r'[^\W\d]\w*')

    qident_rule = Rule(
        token='QIDENT',
        next_state=STATE_KEEP,
        regexp=r'`.+?`')

    comment_rule = Rule(
        token='COMMENT',
        next_state=STATE_KEEP,
        regexp=r'\#[^\n]*$')

    line_cont_rule = Rule(
        token='LINECONT',
        next_state=STATE_KEEP,
        regexp=r'\\\n')

    bad_line_cont_rule = Rule(
        token='BADLINECONT',
        next_state=STATE_KEEP,
        regexp=r'\\.+?$')

    # Basic keywords
    keyword_rules = [Rule(token=tok[0],
                          next_state=STATE_KEEP,
                          regexp=lexer.group(val))
                     for val, tok in edge_schema_keywords.items()
                     if tok[0] not in {'LINK', 'TO', 'EXTENDING', 'ATTRIBUTE'}]

    common_rules = keyword_rules + [
        Rule(token='LINK',
             next_state=STATE_KEEP,
             regexp=r'\bLINK\b'),

        # need to handle 'EXTENDING' differently based on whether it's
        # followed by '('
        Rule(token='EXTENDING',
             next_state=STATE_RAW_TYPE,
             regexp=r'\bEXTENDING\b(?!$|\s*\()'),

        Rule(token='EXTENDING',
             next_state=STATE_KEEP,
             regexp=r'\bEXTENDING\b'),

        Rule(token='ATTRIBUTE',
             next_state=STATE_ATTRIBUTE_RAW_TYPE,
             regexp=r'\bATTRIBUTE\b'),

        comment_rule,

        Rule(token='WS',
             next_state=STATE_KEEP,
             regexp=r'[^\S\n]+'),

        line_cont_rule,
        bad_line_cont_rule,

        Rule(token='NEWLINE',
             next_state=STATE_KEEP,
             regexp=r'\n'),

        Rule(token='LPAREN',
             next_state=push_state(STATE_RAW_PAREN),
             regexp=r'\('),

        Rule(token='RPAREN',
             next_state=pop_state,
             regexp=r'\)'),

        Rule(token='COMMA',
             next_state=STATE_KEEP,
             regexp=r'\,'),

        Rule(token='DOUBLECOLON',
             next_state=STATE_KEEP,
             regexp=r'::'),

        Rule(token='TURNSTILE',
             next_state=STATE_RAW_STRING,
             regexp=r':='),

        Rule(token='COLONGT',
             next_state=STATE_RAW_STRING,
             regexp=r':>'),

        Rule(token='COLON',
             next_state=STATE_KEEP,
             regexp=r':'),

        # need to handle '->' differently based on whether it's
        # followed by '('
        Rule(token='ARROW',
             next_state=STATE_RAW_TYPE,
             regexp=r'->(?!$|\s*\()'),

        Rule(token='ARROW',
             next_state=STATE_KEEP,
             regexp=r'->'),

        Rule(token='DOT',
             next_state=STATE_KEEP,
             regexp=r'\.'),

        Rule(token='BADIDENT',
             next_state=STATE_KEEP,
             regexp=r'''
                    __[^\W\d]\w*__
                    |
                    `__.*?__`
                '''),

        string_rule,
        ident_rule,
        qident_rule,
    ]

    states = {
        STATE_WS_SENSITIVE: list(common_rules),
        STATE_WS_INSENSITIVE: list(common_rules),
        STATE_RAW_PAREN: [
            Rule(token='LPAREN',
                 next_state=push_state(STATE_RAW_PAREN),
                 regexp=r'\('),

            Rule(token='RPAREN',
                 next_state=pop_state,
                 regexp=r'\)'),

            string_rule,
            qident_rule,
            line_cont_rule,
            bad_line_cont_rule,

            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'''[^()`'"\\][^()`'"$\\]*'''),
        ],
        STATE_RAW_ANGLE: [
            line_cont_rule,
            bad_line_cont_rule,

            Rule(token='RAWSTRING',
                 next_state=push_state(STATE_RAW_ANGLE),
                 regexp=r'\<'),

            Rule(token='RAWSTRING',
                 next_state=pop_state,
                 regexp=r'\>'),

            string_rule,
            qident_rule,

            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'''[^<>`'"\\][^<>`'"$\\]*'''),
        ],
        STATE_RAW_TYPE: [
            line_cont_rule,
            bad_line_cont_rule,

            Rule(token='RAWSTRING',
                 next_state=STATE_WS_SENSITIVE,
                 regexp=r'(?=\n|:(?!:))'),

            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'::'),

            Rule(token='RAWSTRING',
                 next_state=push_state(STATE_RAW_ANGLE),
                 regexp=r'\<'),

            string_rule,
            qident_rule,
            comment_rule,

            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'''[^:<`'"\n\\#][^:<`'"$\n\\#]*'''),
        ],
        STATE_RAW_STRING: [
            Rule(token='NEWLINE',
                 next_state=STATE_KEEP,
                 regexp=r'(?<=:[=>])\s*\n'),

            Rule(token='RAWSTRING',
                 next_state=STATE_WS_SENSITIVE,
                 regexp=r'(?<=:[=>])[^\n]+?$'),

            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'^[^\S\n]*\n'),

            Rule(token='RAWLEADWS',
                 next_state=STATE_KEEP,
                 regexp=r'^[^\S\n]+'),

            # 0 indentation is the end of a raw string block
            Rule(token='RAWLEADWS',
                 next_state=STATE_KEEP,
                 regexp=r'^(?=\S)'),

            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'.*?(?:\n|.$)'),
        ],
        STATE_ATTRIBUTE_RAW_TYPE: [
            line_cont_rule,
            bad_line_cont_rule,

            Rule(token='RAWSTRING',
                 next_state=STATE_WS_SENSITIVE,
                 regexp=r'(?=\n|:(?!:))'),

            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'::'),

            Rule(token='RAWSTRING',
                 next_state=push_state(STATE_RAW_ANGLE),
                 regexp=r'\<'),

            Rule(token='LPAREN',
                 next_state=push_state(STATE_RAW_PAREN),
                 regexp=r'\('),

            string_rule,
            qident_rule,

            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'''[^:<(`'"\n\\][^:<(`'"$\n\\]*'''),
        ],
    }

    def token_from_text(self, rule_token, txt):
        if rule_token == 'BADIDENT':
            self.handle_error(txt)

        tok = super().token_from_text(rule_token, txt)

        if rule_token == 'QIDENT':
            tok = tok._replace(type='IDENT', value=txt[1:-1])

        return tok

    def get_start_tokens(self):
        '''Yield a number of start tokens.'''
        return ()

    def get_eof_tokens(self):
        '''Yield a number of EOF tokens.'''
        if self.logical_line_started:
            yield self.token_from_text('NL', '')

        # decrease indentation level at the end of input
        while len(self.indent) > 1:
            self.indent.pop()
            yield self.token_from_text('DEDENT', '')

    def insert_token(self, toktype, token, pos='start'):
        return lexer.Token('', type=toktype, text='',
                           start=getattr(token, pos),
                           end=getattr(token, pos),
                           filename=self.filename)

    def token_generator(self, token):
        """Given the current lexer token, yield one or more tokens."""

        tok_type = token.type

        # initialize the indentation if it's still empty
        if not self.indent:
            if tok_type == 'WS':
                self.indent.append(token.end.column - 1)
            else:
                self.indent.append(0)

        # handle indentation
        if (self._state == STATE_WS_SENSITIVE and
                not self.logical_line_started and
                tok_type not in {'NEWLINE', 'WS', 'COMMENT', 'LINECONT'}):

            # we have potential indentation change
            last_indent = self.indent[-1]
            cur_indent = token.start.column - 1

            if cur_indent > last_indent:
                # increase indentation level
                self.indent.append(cur_indent)
                yield self.insert_token('INDENT', token)

            elif cur_indent < last_indent:
                # decrease indentation level
                while self.indent[-1] > cur_indent:
                    self.indent.pop()
                    if self.indent[-1] < cur_indent:
                        # indentation level mismatch
                        raise EdgeIndentationError(
                            'Incorrect unindent at {position}',
                            line=token.start.line,
                            col=token.start.column,
                            filename=self.filename)

                    yield self.insert_token('DEDENT', token)

        # indentation of raw strings
        elif self._state == STATE_RAW_STRING:
            last_indent = self.indent[-1]
            # only valid for RAWLEADWS
            cur_indent = len(token.value)

            if not self.logical_line_started and tok_type != 'NEWLINE':
                # we MUST indent here
                if (tok_type == 'RAWLEADWS' and
                        cur_indent > last_indent):
                    # increase indentation level
                    self.indent.append(cur_indent)
                    yield self.insert_token('INDENT', token, 'end')

                elif token.value.strip():
                    # indentation level mismatch
                    raise EdgeIndentationError(
                        'Incorrect indentation at {position}',
                        line=token.end.line,
                        col=token.end.column,
                        filename=self.filename)

            elif (tok_type == 'RAWLEADWS' and
                    cur_indent < last_indent):
                # check the indentation level of each RAWLEADWS, exiting the
                # current state and issuing NL and DEDENT tokens if the
                # indentation falls below the starting value
                yield self.insert_token('NL', token)

                while self.indent[-1] > cur_indent:
                    self.indent.pop()
                    if self.indent[-1] < cur_indent:
                        # indentation level mismatch
                        raise EdgeIndentationError(
                            'Incorrect unindent at {position}',
                            line=token.end.line,
                            col=token.end.column,
                            filename=self.filename)

                    yield self.insert_token('DEDENT', token, 'end')

                self._next_state = STATE_WS_SENSITIVE
                # alter the token type & adjust logical newline
                token = token._replace(type='WS')
                tok_type = 'WS'
                self.logical_line_started = False

        # handle logical newline
        if (self.logical_line_started and
                self._state in {STATE_WS_SENSITIVE, STATE_RAW_STRING} and
                tok_type == 'NEWLINE'):
            yield self.insert_token('NL', token)
            self.logical_line_started = False

        elif tok_type not in {'NEWLINE', 'WS', 'COMMENT', 'LINECONT'}:
            self.logical_line_started = True

        if tok_type == 'LINECONT':
            if self._state in {STATE_WS_SENSITIVE, STATE_RAW_STRING}:
                token = token._replace(type='WS')
            else:
                token = token._replace(value='\n', type='RAWSTRING')

        yield token

    def lex(self):
        """Wrapper for the lexer."""

        self.indent = []
        self.state_stack = []
        self.logical_line_started = True
        self.prev_nw_tok = None  # previous NON-WHITESPACE token
        self._next_state = None

        for tok in self._lex():
            if tok.type not in {'NEWLINE', 'WS', 'LINECONT'}:
                self.prev_nw_tok = tok
            yield tok

    def _lex(self):
        """Lexes the src.

        Generator. Yields tokens (as defined by the rules).

        May yield special start and EOF tokens.
        May raise UnknownTokenError exception."""

        src = self.inputstr

        for tok in self.get_start_tokens():
            yield tok

        while self.start < self.end:
            for match in self.re_states[self._state].finditer(src, self.start):
                rule_id = match.lastgroup

                txt = match.group(rule_id)

                if rule_id == 'err':
                    # Error group -- no rule has been matched
                    self.handle_error(txt)

                rule = Rule._map[rule_id]
                rule_token = rule.token

                token = self.token_from_text(rule_token, txt)

                # the next state must be evaluated before yielding the
                # next token so that we have access to prevtok
                if rule.next_state:
                    # the next state can be callable
                    if callable(rule.next_state):
                        next_state = rule.next_state(self)
                    else:
                        next_state = rule.next_state
                else:
                    next_state = STATE_KEEP

                for tok in self.token_generator(token):
                    yield tok

                if next_state and next_state != self._state:
                    # Rule dictates that the lexer state should be
                    # switched
                    self._state = next_state
                    break
                elif self._next_state is not None:
                    self._state = self._next_state
                    self._next_state = None
                    break

        # End of file
        for tok in self.get_eof_tokens():
            yield tok
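
# --- Illustration (not part of the lexer above) ---------------------------
# A minimal standalone sketch of the integer indent stack used by
# token_generator: the column of the first token on each line is compared
# with the top of the stack, emitting INDENT when it grows and popping
# DEDENTs when it shrinks, much like Python's own tokenizer.  The input
# lines below are only illustrative.
def indent_tokens(lines):
    indent = [0]
    for line in lines:
        if not line.strip():
            continue  # blank lines do not affect indentation
        cur = len(line) - len(line.lstrip(' '))
        if cur > indent[-1]:
            indent.append(cur)
            yield ('INDENT', cur)
        else:
            while indent[-1] > cur:
                indent.pop()
                if indent[-1] < cur:
                    raise ValueError(f'incorrect unindent to column {cur}')
                yield ('DEDENT', cur)
        yield ('LINE', line.strip())

demo = [
    'concept User:',
    '    required link name to str:',
    '        title := "Name"',
    '    link age to int',
]
for t in indent_tokens(demo):
    print(t)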
Example #3
class EdgeQLLexer(lexer.Lexer):

    start_state = STATE_BASE

    MERGE_TOKENS = {('NAMED', 'ONLY'), ('SET', 'ATTRIBUTE')}

    NL = 'NL'
    MULTILINE_TOKENS = frozenset(('SCONST', 'BCONST', 'RSCONST'))
    RE_FLAGS = re.X | re.M | re.I

    # Basic keywords
    keyword_rules = [Rule(token=tok[0],
                          next_state=STATE_KEEP,
                          regexp=lexer.group(val))
                     for val, tok in edgeql_keywords.items()]

    common_rules = keyword_rules + [
        Rule(token='WS',
             next_state=STATE_KEEP,
             regexp=r'[^\S\n]+'),

        Rule(token='NL',
             next_state=STATE_KEEP,
             regexp=r'\n'),

        Rule(token='COMMENT',
             next_state=STATE_KEEP,
             regexp=r'''\#.*?$'''),

        Rule(token='ASSIGN',
             next_state=STATE_KEEP,
             regexp=r':='),

        Rule(token='ARROW',
             next_state=STATE_KEEP,
             regexp=r'->'),

        Rule(token='??',
             next_state=STATE_KEEP,
             regexp=r'\?\?'),

        Rule(token='::',
             next_state=STATE_KEEP,
             regexp=r'::'),

        # special path operators
        Rule(token='.<',
             next_state=STATE_KEEP,
             regexp=r'\.<'),

        Rule(token='.>',
             next_state=STATE_KEEP,
             regexp=r'\.>'),

        Rule(token='//',
             next_state=STATE_KEEP,
             regexp=r'//'),

        Rule(token='++',
             next_state=STATE_KEEP,
             regexp=r'\+\+'),

        Rule(token='OP',
             next_state=STATE_KEEP,
             regexp=r'''
                (?: >= | <= | != | \?= | \?!=)
             '''),

        Rule(token='self',
             next_state=STATE_KEEP,
             regexp=r'[,()\[\].@;:+\-*/%^<>=&|]'),

        Rule(token='FCONST',
             next_state=STATE_KEEP,
             regexp=r"""
                    (?: \d+ (?:\.\d+)?
                        (?:[eE](?:[+\-])?[0-9]+)
                    )
                    |
                    (?: \d+\.\d+)
                """),

        Rule(token='ICONST',
             next_state=STATE_KEEP,
             regexp=r'([1-9]\d* | 0)(?![0-9])'),

        Rule(token='BCONST',
             next_state=STATE_KEEP,
             regexp=rf'''
                (?:
                    b
                )
                (?P<BQ>
                    ' | "
                )
                (?:
                    (
                        \\\\ | \\['"] | \n | .
                        # we'll validate escape codes in the parser
                    )*?
                )
                (?P=BQ)
             '''),

        Rule(token='RSCONST',
             next_state=STATE_KEEP,
             regexp=rf'''
                (?:
                    r
                )?
                (?P<RQ>
                    (?:
                        (?<=r) (?: ' | ")
                    ) | (?:
                        (?<!r) (?: {re_dquote})
                    )
                )
                (?:
                    (
                        \n | .
                        # we'll validate escape codes in the parser
                    )*?
                )
                (?P=RQ)
             '''),

        Rule(token='SCONST',
             next_state=STATE_KEEP,
             regexp=rf'''
                (?P<Q>
                    ' | "
                )
                (?:
                    (
                        \\\\ | \\['"] | \n | .
                        # we'll validate escape codes in the parser
                    )*?
                )
                (?P=Q)
             '''),

        # this rule will capture malformed strings and allow us to
        # provide better error messages
        Rule(token='BADSCONST',
             next_state=STATE_KEEP,
             regexp=rf'''
                [rb]?
                (['"] | (?: {re_dquote}))
                [^\n]*
             '''),

        Rule(token='BADIDENT',
             next_state=STATE_KEEP,
             regexp=r'''
                    __[^\W\d]\w*__
                    |
                    `__.*?__`
                '''),

        Rule(token='IDENT',
             next_state=STATE_KEEP,
             regexp=r'[^\W\d]\w*'),

        Rule(token='QIDENT',
             next_state=STATE_KEEP,
             regexp=r'`[^@].*?`'),

        Rule(token='self',
             next_state=STATE_KEEP,
             regexp=r'[\{\}$]'),
    ]

    states = {
        STATE_BASE:
            common_rules,
    }

    def __init__(self):
        super().__init__()
        # add support for a few tokens composed of two elements
        self._possible_long_token = {x[0] for x in self.MERGE_TOKENS}
        self._long_token_match = {x[1]: x[0] for x in self.MERGE_TOKENS}

    def token_from_text(self, rule_token, txt):
        if rule_token == 'BADSCONST':
            raise lexer.UnknownTokenError(
                f"Unterminated string {txt}",
                line=self.lineno, col=self.column, filename=self.filename)
        elif rule_token == 'BADIDENT':
            self.handle_error(txt)

        tok = super().token_from_text(rule_token, txt)

        if rule_token == 'self':
            tok = tok._replace(type=txt)

        elif rule_token == 'QIDENT':
            tok = tok._replace(type='IDENT', value=txt[1:-1])

        return tok

    def lex(self):
        buffer = []

        for tok in super().lex():
            tok_type = tok.type

            if tok_type in {'WS', 'NL', 'COMMENT'}:
                # Strip out whitespace and comments
                continue

            elif tok_type in self._possible_long_token:
                # Buffer in case this is a merged token
                if not buffer:
                    buffer.append(tok)
                else:
                    yield from iter(buffer)
                    buffer[:] = [tok]

            elif tok_type in self._long_token_match:
                prev_token = buffer[-1] if buffer else None
                if (prev_token and
                        prev_token.type == self._long_token_match[tok_type]):
                    tok = prev_token._replace(
                        value=prev_token.value + ' ' + tok.value,
                        type=prev_token.type + tok_type)
                    buffer.pop()
                yield tok

            else:
                if buffer:
                    yield from iter(buffer)
                    buffer[:] = []
                yield tok

    def lex_highlight(self):
        return super().lex()
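
# --- Illustration (not part of the lexer above) ---------------------------
# A simplified standalone sketch of the MERGE_TOKENS buffering in lex(),
# using plain (type, value) tuples instead of lexer.Token objects: a NAMED
# or SET token is held back for one step and fused with a following ONLY or
# ATTRIBUTE token into a single NAMEDONLY / SETATTRIBUTE token.
MERGE_TOKENS = {('NAMED', 'ONLY'), ('SET', 'ATTRIBUTE')}
FIRST_PARTS = {first for first, _ in MERGE_TOKENS}
SECOND_PARTS = {second: first for first, second in MERGE_TOKENS}

def merge_tokens(tokens):
    buffer = []
    for ttype, value in tokens:
        if ttype in FIRST_PARTS:
            # hold this token back; it may start a merged pair
            yield from buffer
            buffer = [(ttype, value)]
        elif (ttype in SECOND_PARTS and buffer
                and buffer[-1][0] == SECOND_PARTS[ttype]):
            prev_type, prev_value = buffer.pop()
            yield (prev_type + ttype, prev_value + ' ' + value)
        else:
            yield from buffer
            buffer = []
            yield (ttype, value)
    yield from buffer

demo = [('SET', 'set'), ('ATTRIBUTE', 'attribute'), ('IDENT', 'title'),
        ('NAMED', 'named'), ('IDENT', 'x')]
print(list(merge_tokens(demo)))
# [('SETATTRIBUTE', 'set attribute'), ('IDENT', 'title'),
#  ('NAMED', 'named'), ('IDENT', 'x')]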
Example #4
class EdgeQLLexer(lexer.Lexer):

    start_state = STATE_BASE

    NL = 'NL'
    MULTILINE_TOKENS = frozenset(('SCONST', ))
    RE_FLAGS = re.X | re.M | re.I

    # Basic keywords
    keyword_rules = [
        Rule(token=tok[0], next_state=STATE_KEEP, regexp=lexer.group(val))
        for val, tok in edgeql_keywords.items()
    ]

    common_rules = keyword_rules + [
        Rule(token='WS', next_state=STATE_KEEP, regexp=r'[^\S\n]+'),
        Rule(token='NL', next_state=STATE_KEEP, regexp=r'\n'),
        Rule(token='COMMENT', next_state=STATE_KEEP, regexp=r'''\#.*?$'''),
        Rule(token='TURNSTILE', next_state=STATE_KEEP, regexp=r':='),
        Rule(token='ARROW', next_state=STATE_KEEP, regexp=r'->'),
        Rule(token='??', next_state=STATE_KEEP, regexp=r'\?\?'),
        Rule(token='::', next_state=STATE_KEEP, regexp=r'::'),

        # special path operators
        Rule(token='.<', next_state=STATE_KEEP, regexp=r'\.<'),
        Rule(token='.>', next_state=STATE_KEEP, regexp=r'\.>'),
        Rule(token='OP',
             next_state=STATE_KEEP,
             regexp=r'''
                (?: >= | <= | != | \?= | \?!=)
             '''),

        # SQL ops
        Rule(token='self',
             next_state=STATE_KEEP,
             regexp=r'[,()\[\].@;:+\-*/%^<>=&|]'),
        Rule(token='FCONST',
             next_state=STATE_KEEP,
             regexp=r"""
                    (?: \d+ (?:\.\d+)?
                        (?:[eE](?:[+\-])?[0-9]+)
                    )
                    |
                    (?: \d+\.\d+)
                """),
        Rule(token='ICONST',
             next_state=STATE_KEEP,
             regexp=r'([1-9]\d* | 0)(?![0-9])'),
        Rule(token='SCONST',
             next_state=STATE_KEEP,
             regexp=rf'''
                (?P<Q>
                    # capture the opening quote in group Q
                    (
                        ' | " |
                        {re_dquote}
                    )
                )
                (?:
                    (\\['"] | \n | .)*?
                )
                (?P=Q)      # match closing quote type with whatever is in Q
             '''),
        Rule(token='BADIDENT',
             next_state=STATE_KEEP,
             regexp=r'''
                    __[^\W\d]\w*__
                    |
                    `__.*?__`
                '''),
        Rule(token='IDENT', next_state=STATE_KEEP, regexp=r'[^\W\d]\w*'),
        Rule(token='QIDENT', next_state=STATE_KEEP, regexp=r'`[^@].*?`'),
        Rule(token='self', next_state=STATE_KEEP, regexp=r'[\{\}$]'),
    ]

    states = {
        STATE_BASE: common_rules,
    }

    def token_from_text(self, rule_token, txt):
        if rule_token == 'BADIDENT':
            self.handle_error(txt)

        tok = super().token_from_text(rule_token, txt)

        if rule_token == 'self':
            tok = tok._replace(type=txt)

        elif rule_token == 'QIDENT':
            tok = tok._replace(type='IDENT', value=txt[1:-1])

        return tok

    def lex(self):
        buffer = []

        for tok in super().lex():
            tok_type = tok.type

            if tok_type in {'WS', 'NL', 'COMMENT'}:
                # Strip out whitespace and comments
                continue
            else:
                if buffer:
                    yield from iter(buffer)
                    buffer[:] = []
                yield tok

    def lex_highlight(self):
        return super().lex()
Example #5
class PgSQLLexer(lexer.Lexer):

    start_state = STATE_BASE

    NL = 'NL'
    MULTILINE_TOKENS = frozenset(('COMMENT', 'SCONST'))
    RE_FLAGS = re.X | re.M | re.I

    # Basic keywords
    keyword_rules = [
        Rule(token='KEYWORD',
             next_state=STATE_KEEP,
             regexp=lexer.group(*pg_keywords.keys()))
    ]

    common_rules = keyword_rules + [
        Rule(token='WS', next_state=STATE_KEEP, regexp=r'[^\S\n]+'),
        Rule(token='NL', next_state=STATE_KEEP, regexp=r'\n'),
        Rule(token='COMMENT',
             next_state=STATE_KEEP,
             regexp=r'''
                    (?:/\*(?:.|\n)*?\*/)
                    | (?:--.*?$)
                '''),
        Rule(token='TYPECAST', next_state=STATE_KEEP, regexp=r'::'),

        # multichar ops (so 2+ chars)
        Rule(token='Op',
             next_state=STATE_KEEP,
             regexp=r'''
                # EdgeQL-specific multi-char ops
                {opchar_pg} (?:{opchar}(?!/\*|--))+
                |
                (?:{opchar}(?!/\*|--))+ {opchar_pg} (?:{opchar}(?!/\*|--))*
                |
                # SQL-only multi-char ops cannot end in + or -
                (?:{opchar_sql}(?!/\*|--))+[*/^%<>=]
             '''.format(opchar_pg=re_opchars_pgsql,
                        opchar=re_opchars,
                        opchar_sql=re_opchars_sql)),

        # PgSQL single char ops
        Rule(token='Op', next_state=STATE_KEEP, regexp=re_opchars_pgsql),

        # SQL ops
        Rule(token='self', next_state=STATE_KEEP, regexp=re_self),
        Rule(token='FCONST',
             next_state=STATE_KEEP,
             regexp=r"""
                    (?: \d+ (?:\.\d*)?
                        |
                        \. \d+
                    ) {exppart}
                """.format(exppart=re_exppart)),
        Rule(token='FCONST',
             next_state=STATE_KEEP,
             regexp=r'''
                (?: \d+\.(?!\.)\d*
                    |
                    \.\d+)
             '''),
        Rule(token='ICONST', next_state=STATE_KEEP, regexp=r'\d+'),
        Rule(token='BCONST',
             next_state=STATE_KEEP,
             regexp=r'''
                B'(?:
                    [01]
                    |
                    ''
                    |
                    ' (?:\s*\n\s*) '
                )*'
             '''),
        Rule(token='XCONST',
             next_state=STATE_KEEP,
             regexp=r'''
                X'(?:
                    [\da-fA-F]
                    |
                    ''
                    |
                    ' (?:\s*\n\s*) '
                )*'
             '''),

        # no extra checks for correct escaping inside the string
        Rule(token='SCONST',
             next_state=STATE_KEEP,
             regexp=r'''
                [nNeE]?
                '(?:
                    [^']
                    |
                    ''
                    |
                    ' (?:\s*\n\s*) '
                )*'
             '''),

        # dollar quoted strings
        Rule(token='DQCONST',
             next_state=STATE_KEEP,
             regexp=r'''
                \$(?P<dq> (?:{ident_start}{ident_cont}*)? )\$
                    .*?
                \$(?P=dq)\$
                '''.format(ident_start=re_ident_start,
                           ident_cont=re_ident_cont)),

        # specifying custom escape character
        Rule(token='UESCAPE',
             next_state=STATE_KEEP,
             regexp=r"""UESCAPE\s+'[^a-fA-F\d\s+'"]'"""),

        # quoted identifier
        Rule(token='QIDENT',
             next_state=STATE_KEEP,
             regexp=r'''
                    (?:U&)?
                    "(?:
                        [^"]
                        |
                        ""
                    )+"
                '''),
        Rule(token='PARAM', next_state=STATE_KEEP, regexp=r'\$\d+'),
        Rule(token='IDENT',
             next_state=STATE_KEEP,
             regexp=r'''
                    {ident_start}{ident_cont}*
                '''.format(ident_start=re_ident_start,
                           ident_cont=re_ident_cont)),
    ]

    states = {
        STATE_BASE: common_rules,
    }

    def token_from_text(self, rule_token, txt):
        tok = super().token_from_text(rule_token, txt)

        if rule_token == 'self':
            tok = tok._replace(type=txt)

        elif rule_token == 'IDENT':
            tok = tok._replace(value=txt.lower())

        elif rule_token == 'KEYWORD':
            # process keywords here since having separate rules for them
            # creates > 100 re groups.
            txt_low = txt.lower()
            tok = tok._replace(value=txt_low, type=pg_keywords[txt_low][0])

        elif rule_token in ('SCONST', 'BCONST', 'XCONST'):
            txt = txt[:-1].split("'", 1)[1]
            txt = clean_string.sub('', txt.replace("''", "'"))
            tok = tok._replace(value=txt)

        elif rule_token == 'PARAM':
            tok = tok._replace(value=txt[1:])

        elif rule_token == 'QIDENT':
            tok = tok._replace(type='IDENT', value=txt[:-1].split('"', 1)[1])

        elif rule_token == 'DQCONST':
            txt = txt.rsplit("$", 2)[0]
            txt = txt.split("$", 2)[2]
            tok = tok._replace(type='SCONST', value=txt)

        return tok
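
# --- Illustration (not part of the lexer above) ---------------------------
# A minimal standalone sketch of the literal clean-up done in
# token_from_text above: regular quoted constants are unwrapped and doubled
# quotes collapsed, dollar-quoted constants have their $tag$ wrappers
# stripped.  The newline-joining step done by clean_string (defined outside
# this excerpt) is omitted here.
def unquote_sconst(txt):
    # "E'it''s'" -> "it's": drop the closing quote, cut off the prefix and
    # opening quote, then collapse doubled single quotes
    body = txt[:-1].split("'", 1)[1]
    return body.replace("''", "'")

def unquote_dqconst(txt):
    # "$fn$ body $fn$" -> " body ": strip the trailing, then the leading,
    # $tag$ wrapper
    body = txt.rsplit("$", 2)[0]
    return body.split("$", 2)[2]

print(unquote_sconst("E'it''s'"))          # it's
print(unquote_dqconst("$fn$ body $fn$"))   # " body "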
Example #6
class GraphQLLexer(lexer.Lexer):

    start_state = STATE_BASE

    NL = 'NL'
    RE_FLAGS = re.X | re.M

    # Basic keywords
    keyword_rules = [
        Rule(token=tok[0], next_state=STATE_KEEP, regexp=lexer.group(val))
        for val, tok in graphql_keywords.items()
    ]

    common_rules = keyword_rules + [
        Rule(token='NL', next_state=STATE_KEEP, regexp=r'\r\n|\n|\r'),
        Rule(token='WS', next_state=STATE_KEEP, regexp=r'[ \t]+'),
        Rule(token='COMMA', next_state=STATE_KEEP, regexp=r','),
        Rule(token='COMMENT', next_state=STATE_KEEP, regexp=r'\#[^\n]*$'),
        Rule(token='LPAREN', next_state=STATE_KEEP, regexp=r'\('),
        Rule(token='RPAREN', next_state=STATE_KEEP, regexp=r'\)'),
        Rule(token='LSBRACKET', next_state=STATE_KEEP, regexp=r'\['),
        Rule(token='RSBRACKET', next_state=STATE_KEEP, regexp=r'\]'),
        Rule(token='LCBRACKET', next_state=STATE_KEEP, regexp=r'\{'),
        Rule(token='RCBRACKET', next_state=STATE_KEEP, regexp=r'\}'),
        Rule(token='BANG', next_state=STATE_KEEP, regexp=r'\!'),
        Rule(token='ELLIPSIS', next_state=STATE_KEEP, regexp=r'\.\.\.'),
        Rule(token='COLON', next_state=STATE_KEEP, regexp=r':'),
        Rule(token='EQUAL', next_state=STATE_KEEP, regexp=r'='),
        Rule(token='AT', next_state=STATE_KEEP, regexp=r'@'),
        Rule(token='INTEGER',
             next_state=STATE_KEEP,
             regexp=r'-?(?:0|[1-9][0-9]*)(?![eE.0-9])'),
        Rule(token='FLOAT',
             next_state=STATE_KEEP,
             regexp=r'''
                -?(0|[1-9][0-9]*)
                    (\.[0-9]+)?
                        ([eE][+-]?[0-9]+)?
                        (?![eE.0-9])  # must not be followed by a number
             '''),
        Rule(token='STRING',
             next_state=STATE_KEEP,
             regexp=r'''
                    (?:r)?" [^\n]*?
                    (?<!\\)"
             '''),
        Rule(token='IDENT',
             next_state=STATE_KEEP,
             regexp=r'[_A-Za-z][_0-9A-Za-z]*'),
        Rule(token='VAR', next_state=STATE_KEEP, regexp=r'\$[_0-9A-Za-z]+'),
        Rule(token='DOLLAR', next_state=STATE_KEEP, regexp=r'\$'),
    ]

    states = {
        STATE_BASE: list(common_rules),
    }

    def handle_error(self, txt):
        # check if this is an unterminated string instead of a generic error
        if txt == '"':

            pos = re.compile(r'$', self.RE_FLAGS).search(
                self.inputstr, self.start).start()
            pos += self.column - self.start
            raise UnterminatedStringError(
                'unterminated string token {position}',
                line=self.lineno,
                col=pos,
                filename=self.filename)

        super().handle_error(txt)
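
# --- Illustration (not part of the lexer above) ---------------------------
# A minimal standalone check of the STRING rule: the lazy body combined with
# the (?<!\\)" lookbehind makes the match end at the first quote that is not
# preceded by a backslash, so escaped quotes stay inside the token, while a
# string with no closing quote on the line does not match at all and falls
# through to handle_error above.
import re

gql_string = re.compile(r'''
        (?:r)?" [^\n]*?
        (?<!\\)"
''', re.X | re.M)

print(gql_string.match(r'"plain"').group(0))            # "plain"
print(gql_string.match(r'"say \"hi\"" rest').group(0))  # "say \"hi\""
print(gql_string.match('"unterminated\n'))              # None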