class EdgeSchemaLexer(lexer.Lexer):
    start_state = STATE_WS_SENSITIVE

    NL = 'NEWLINE'
    MULTILINE_TOKENS = frozenset(('STRING', 'RAWSTRING', 'LINECONT'))
    RE_FLAGS = re.X | re.M | re.I

    # special placeholder error rule that's not meant to be
    # explicitly inserted into the lexical productions
    error_rule = Rule(token='ERROR',
                      next_state=STATE_KEEP,
                      regexp=None)

    # a few reused rules
    string_rule = Rule(
        token='STRING',
        next_state=STATE_KEEP,
        regexp=r'''
            (?P<Q>  # capture the opening quote in group Q
                ( ' | " | {dollar_quote} )
            )
            (?:
                (\\['"] | \n | .)*?
            )
            (?P=Q)  # match closing quote type with whatever is in Q
        '''.format(dollar_quote=re_dquote))

    ident_rule = Rule(token='IDENT',
                      next_state=STATE_KEEP,
                      regexp=r'[^\W\d]\w*')

    qident_rule = Rule(token='QIDENT',
                       next_state=STATE_KEEP,
                       regexp=r'`.+?`')

    comment_rule = Rule(token='COMMENT',
                        next_state=STATE_KEEP,
                        regexp=r'\#[^\n]*$')

    line_cont_rule = Rule(token='LINECONT',
                          next_state=STATE_KEEP,
                          regexp=r'\\\n')

    bad_line_cont_rule = Rule(token='BADLINECONT',
                              next_state=STATE_KEEP,
                              regexp=r'\\.+?$')

    # Basic keywords
    keyword_rules = [Rule(token=tok[0],
                          next_state=STATE_KEEP,
                          regexp=lexer.group(val))
                     for val, tok in edge_schema_keywords.items()
                     if tok[0] not in {'LINK', 'TO', 'EXTENDING'}]

    common_rules = keyword_rules + [
        Rule(token='LINK',
             next_state=STATE_KEEP,
             regexp=r'\bLINK\b'),

        # need to handle 'EXTENDING' differently based on whether it's
        # followed by '('
        Rule(token='EXTENDING',
             next_state=STATE_RAW_TYPE,
             regexp=r'\bEXTENDING\b(?!$|\s*\()'),

        Rule(token='EXTENDING',
             next_state=STATE_KEEP,
             regexp=r'\bEXTENDING\b'),

        comment_rule,

        Rule(token='WS',
             next_state=STATE_KEEP,
             regexp=r'[^\S\n]+'),

        line_cont_rule,
        bad_line_cont_rule,

        Rule(token='NEWLINE',
             next_state=STATE_KEEP,
             regexp=r'\n'),

        Rule(token='LPAREN',
             next_state=push_state(STATE_RAW_PAREN),
             regexp=r'\('),

        Rule(token='RPAREN',
             next_state=pop_state,
             regexp=r'\)'),

        Rule(token='COMMA',
             next_state=STATE_KEEP,
             regexp=r'\,'),

        Rule(token='DOUBLECOLON',
             next_state=STATE_KEEP,
             regexp=r'::'),

        Rule(token='ASSIGN',
             next_state=STATE_RAW_STRING,
             regexp=r':='),

        Rule(token='COLON',
             next_state=STATE_KEEP,
             regexp=r':'),

        # need to handle '->' differently based on whether it's
        # followed by '('
        Rule(token='ARROW',
             next_state=STATE_RAW_TYPE,
             regexp=r'->(?!$|\s*\()'),

        Rule(token='ARROW',
             next_state=STATE_KEEP,
             regexp=r'->'),

        Rule(token='DOT',
             next_state=STATE_KEEP,
             regexp=r'\.'),

        Rule(token='BADIDENT',
             next_state=STATE_KEEP,
             regexp=r'''
                    __[^\W\d]\w*__
                    |
                    `__.*?__`
             '''),

        string_rule,
        ident_rule,
        qident_rule,
    ]

    states = {
        STATE_WS_SENSITIVE: list(common_rules),
        STATE_WS_INSENSITIVE: list(common_rules),
        STATE_RAW_PAREN: [
            Rule(token='LPAREN',
                 next_state=push_state(STATE_RAW_PAREN),
                 regexp=r'\('),
            Rule(token='RPAREN',
                 next_state=pop_state,
                 regexp=r'\)'),
            string_rule,
            qident_rule,
            line_cont_rule,
            bad_line_cont_rule,
            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'''[^()`'"\\][^()`'"$\\]*'''),
        ],
        STATE_RAW_ANGLE: [
            line_cont_rule,
            bad_line_cont_rule,
            Rule(token='RAWSTRING',
                 next_state=push_state(STATE_RAW_ANGLE),
                 regexp=r'\<'),
            Rule(token='RAWSTRING',
                 next_state=pop_state,
                 regexp=r'\>'),
            string_rule,
            qident_rule,
            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'''[^<>`'"\\][^<>`'"$\\]*'''),
        ],
        STATE_RAW_TYPE: [
            line_cont_rule,
            bad_line_cont_rule,
            Rule(token='RAWSTRING',
                 next_state=STATE_WS_SENSITIVE,
                 regexp=r'(?=\n|:(?!:))'),
            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'::'),
            Rule(token='RAWSTRING',
                 next_state=push_state(STATE_RAW_ANGLE),
                 regexp=r'\<'),
            string_rule,
            qident_rule,
            comment_rule,
            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'''[^:<`'"\n\\#][^:<`'"$\n\\#]*'''),
        ],
        STATE_RAW_STRING: [
            Rule(token='NEWLINE',
                 next_state=STATE_KEEP,
                 regexp=r'(?<=:[=>])\s*\n'),
            Rule(token='RAWSTRING',
                 next_state=STATE_WS_SENSITIVE,
                 regexp=r'(?<=:[=>])[^\n]+?$'),
            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'^[^\S\n]*\n'),
            Rule(token='RAWLEADWS',
                 next_state=STATE_KEEP,
                 regexp=r'^[^\S\n]+'),
            # 0 indentation is the end of a raw string block
            Rule(token='RAWLEADWS',
                 next_state=STATE_KEEP,
                 regexp=r'^(?=\S)'),
            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'.*?(?:\n|.$)'),
        ],
    }

    def token_from_text(self, rule_token, txt):
        tok = super().token_from_text(rule_token, txt)

        if rule_token == 'QIDENT':
            tok = tok._replace(type='IDENT', value=txt[1:-1])

        return tok

    def get_start_tokens(self):
        '''Yield a number of start tokens.'''
        return ()

    def get_eof_tokens(self):
        '''Yield a number of EOF tokens.'''
        if self.logical_line_started:
            yield self.token_from_text('NL', '')

        # decrease indentation level at the end of input
        while len(self.indent) > 1:
            self.indent.pop()
            yield self.token_from_text('DEDENT', '')

    def insert_token(self, toktype, token, pos='start'):
        return lexer.Token('', type=toktype, text='',
                           start=getattr(token, pos),
                           end=getattr(token, pos),
                           filename=self.filename)

    def process_indent(self, last_indent, cur_indent, token, *,
                       skip_indent=False, allow_indent=True,
                       allow_dedent=True, pos='start'):
        # first and foremost, the indentation cannot have tabs after spaces
        if ' \t' in cur_indent:
            raise EdgeIndentationError(
                'Tabs used after spaces on line {line}',
                line=token.start.line,
                col=token.start.column,
                filename=self.filename)

        if cur_indent == last_indent:
            # indentation matches
            pass

        elif allow_indent and cur_indent.startswith(last_indent):
            if not skip_indent:
                # increase indentation level
                self.indent.append(cur_indent)
                yield self.insert_token('INDENT', token, pos)

        elif allow_dedent and last_indent.startswith(cur_indent):
            # decrease indentation level
            self.indent.pop()
            while self.indent:
                yield self.insert_token('DEDENT', token, pos)
                prev_indent = self.indent[-1]

                if cur_indent == prev_indent:
                    # indentation matches
                    return
                elif prev_indent.startswith(cur_indent):
                    # keep popping
                    self.indent.pop()
                else:
                    # it's not a match and the current indent is no
                    # longer a proper prefix
                    raise EdgeIndentationError(
                        'Unexpected indentation level decrease on line {line}',
                        line=token.start.line,
                        col=token.start.column,
                        filename=self.filename)

            raise EdgeIndentationError(
                'Unexpected indentation level decrease on line {line}',
                line=token.start.line,
                col=token.start.column,
                filename=self.filename)

        else:
            # neither indentation is a prefix of the other,
            # which is an error
            raise EdgeIndentationError(
                'Inconsistent indentation on line {line}',
                line=token.start.line,
                col=token.start.column,
                filename=self.filename)

    def token_generator(self, token):
        """Given the current lexer token, yield one or more tokens."""
        tok_type = token.type

        # update current possible indent
        if tok_type == 'RAWLEADWS' or (not self.logical_line_started and
                                       self._state == STATE_WS_SENSITIVE and
                                       tok_type == 'WS'):
            self.cur_indent = token.value

        # initialize the indentation if it's still empty
        if not self.indent:
            if tok_type not in {'NEWLINE', 'WS', 'COMMENT', 'LINECONT'}:
                self.indent.append(self.cur_indent or '')

        last_indent = self.indent[-1] if self.indent else None
        cur_indent = self.cur_indent

        # handle indentation
        if (self._state == STATE_WS_SENSITIVE and
                not self.logical_line_started and
                tok_type not in {'NEWLINE', 'WS', 'COMMENT', 'LINECONT'}):
            self.cur_indent = None

            # we have a potential indentation change
            for t in self.process_indent(last_indent, cur_indent, token):
                yield t

        # indentation of raw strings
        elif self._state == STATE_RAW_STRING:
            if not self.logical_line_started and tok_type != 'NEWLINE':
                # we MUST indent here, so this is a potential
                # indentation change
                if tok_type == 'RAWLEADWS':
                    for t in self.process_indent(
                            last_indent, cur_indent, token,
                            allow_dedent=False, pos='end'):
                        yield t

            elif tok_type == 'RAWLEADWS':
                dedented = False
                for t in self.process_indent(
                        last_indent, cur_indent, token,
                        skip_indent=True, pos='end'):
                    if not dedented:
                        yield self.insert_token('NL', token)
                        dedented = True
                    yield t

                if dedented:
                    self._next_state = STATE_WS_SENSITIVE
                    # alter the token type & adjust logical newline
                    token = token._replace(type='WS')
                    tok_type = 'WS'
                    self.logical_line_started = False

        # handle logical newline
        if tok_type not in {'NEWLINE', 'WS', 'COMMENT', 'LINECONT',
                            'ERROR', 'BADIDENT'}:
            self.logical_line_started = True

        elif (self._state in {STATE_WS_SENSITIVE, STATE_RAW_STRING} and
                tok_type == 'NEWLINE'):
            # after any newline reset the indent
            self.cur_indent = ''
            # if there was a logical line, emit a special token
            if self.logical_line_started:
                yield self.insert_token('NL', token)
                self.logical_line_started = False

        if tok_type == 'LINECONT':
            # it is always an error to use a line continuation mixed
            # into indentation
            if self.cur_indent is not None:
                raise EdgeIndentationError(
                    'Illegal line continuation on line {line}',
                    line=token.start.line,
                    col=token.start.column,
                    filename=self.filename)

            if self._state in {STATE_WS_SENSITIVE, STATE_RAW_STRING}:
                token = token._replace(type='WS')
            else:
                token = token._replace(value='\n', type='RAWSTRING')

        yield token

    def lex(self):
        """Wrapper for the lexer."""
        self.indent = []
        self.state_stack = []
        self.logical_line_started = True
        self.prev_nw_tok = None  # previous NON-WHITESPACE token
        self.cur_indent = None
        self._next_state = None

        for tok in self._lex():
            # if we have any error token here, handle the error now
            if tok.type == 'BADIDENT':
                raise lexer.UnknownTokenError(
                    f"Illegal identifier '{tok.text}'",
                    line=tok.start.line,
                    col=tok.start.column,
                    filename=tok.filename)

            elif tok.type == 'ERROR':
                if self.prev_nw_tok.type == 'NL':
                    # if it looks like this happened right after a
                    # newline, attempt to generate dedents so that the
                    # parser may produce a better error if the previous
                    # line cannot be parsed
                    for eoftok in self.get_eof_tokens():
                        yield eoftok

                raise lexer.UnknownTokenError(
                    f"Unexpected '{tok.text}'",
                    line=tok.start.line,
                    col=tok.start.column,
                    filename=tok.filename)

            if tok.type not in {'NEWLINE', 'WS', 'LINECONT'}:
                self.prev_nw_tok = tok

            yield tok

    def _lex(self):
        """Lexes the src.

        Generator. Yields tokens (as defined by the rules).
        May yield special start and EOF tokens.
        May raise UnknownTokenError exception.
        """
        src = self.inputstr

        for tok in self.get_start_tokens():
            yield tok

        while self.start < self.end:
            for match in self.re_states[self._state].finditer(
                    src, self.start):
                rule_id = match.lastgroup
                txt = match.group(rule_id)

                if rule_id == 'err':
                    rule = self.error_rule
                else:
                    rule = Rule._map[rule_id]
                rule_token = rule.token

                token = self.token_from_text(rule_token, txt)

                # the next state must be evaluated before yielding the
                # next token so that we have access to prevtok
                if rule.next_state:
                    # the next state can be callable
                    if callable(rule.next_state):
                        next_state = rule.next_state(self)
                    else:
                        next_state = rule.next_state
                else:
                    next_state = STATE_KEEP

                for tok in self.token_generator(token):
                    yield tok

                if next_state and next_state != self._state:
                    # the rule dictates that the lexer state should be
                    # switched
                    self._state = next_state
                    break
                elif self._next_state is not None:
                    self._state = self._next_state
                    self._next_state = None
                    break

        # End of file
        for tok in self.get_eof_tokens():
            yield tok
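
# A minimal standalone sketch (not part of the lexer above) of the
# string-prefix indent stack that process_indent() in the class above
# maintains.  The name `indent_tokens` is hypothetical and the real
# lexer yields Token objects, not bare strings; this only illustrates
# the technique.

def indent_tokens(lines):
    """Yield 'INDENT'/'DEDENT' markers for an iterable of source lines.

    An indent is pushed when the new leading whitespace extends the
    previous one; dedents pop until an exact prefix match is found,
    which is the invariant process_indent() enforces.
    """
    stack = ['']
    for line in lines:
        if not line.strip():
            continue  # blank lines never change the indentation level
        ws = line[:len(line) - len(line.lstrip())]
        if ws == stack[-1]:
            continue  # same level: nothing to emit
        if ws.startswith(stack[-1]):
            stack.append(ws)  # strictly deeper: one INDENT
            yield 'INDENT'
        else:
            # shallower: pop until we land exactly on a previous level
            while stack[-1] != ws and stack[-1].startswith(ws):
                stack.pop()
                yield 'DEDENT'
            if stack[-1] != ws:
                raise ValueError('inconsistent indentation')

# >>> list(indent_tokens(['a:', '    b', '        c', 'd']))
# ['INDENT', 'INDENT', 'DEDENT', 'DEDENT']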

class EdgeSchemaLexer(lexer.Lexer):
    start_state = STATE_WS_SENSITIVE

    NL = 'NEWLINE'
    MULTILINE_TOKENS = frozenset(('STRING', 'RAWSTRING', 'LINECONT'))
    RE_FLAGS = re.X | re.M | re.I

    # a few reused rules
    string_rule = Rule(
        token='STRING',
        next_state=STATE_KEEP,
        regexp=r'''
            (?P<Q>  # capture the opening quote in group Q
                ( ' | " | {dollar_quote} )
            )
            (?:
                (\\['"] | \n | .)*?
            )
            (?P=Q)  # match closing quote type with whatever is in Q
        '''.format(dollar_quote=re_dquote))

    ident_rule = Rule(token='IDENT',
                      next_state=STATE_KEEP,
                      regexp=r'[^\W\d]\w*')

    qident_rule = Rule(token='QIDENT',
                       next_state=STATE_KEEP,
                       regexp=r'`.+?`')

    comment_rule = Rule(token='COMMENT',
                        next_state=STATE_KEEP,
                        regexp=r'\#[^\n]*$')

    line_cont_rule = Rule(token='LINECONT',
                          next_state=STATE_KEEP,
                          regexp=r'\\\n')

    bad_line_cont_rule = Rule(token='BADLINECONT',
                              next_state=STATE_KEEP,
                              regexp=r'\\.+?$')

    # Basic keywords
    keyword_rules = [Rule(token=tok[0],
                          next_state=STATE_KEEP,
                          regexp=lexer.group(val))
                     for val, tok in edge_schema_keywords.items()
                     if tok[0] not in {'LINK', 'TO', 'EXTENDING',
                                       'ATTRIBUTE'}]

    common_rules = keyword_rules + [
        Rule(token='LINK',
             next_state=STATE_KEEP,
             regexp=r'\bLINK\b'),

        # need to handle 'EXTENDING' differently based on whether it's
        # followed by '('
        Rule(token='EXTENDING',
             next_state=STATE_RAW_TYPE,
             regexp=r'\bEXTENDING\b(?!$|\s*\()'),

        Rule(token='EXTENDING',
             next_state=STATE_KEEP,
             regexp=r'\bEXTENDING\b'),

        Rule(token='ATTRIBUTE',
             next_state=STATE_ATTRIBUTE_RAW_TYPE,
             regexp=r'\bATTRIBUTE\b'),

        comment_rule,

        Rule(token='WS',
             next_state=STATE_KEEP,
             regexp=r'[^\S\n]+'),

        line_cont_rule,
        bad_line_cont_rule,

        Rule(token='NEWLINE',
             next_state=STATE_KEEP,
             regexp=r'\n'),

        Rule(token='LPAREN',
             next_state=push_state(STATE_RAW_PAREN),
             regexp=r'\('),

        Rule(token='RPAREN',
             next_state=pop_state,
             regexp=r'\)'),

        Rule(token='COMMA',
             next_state=STATE_KEEP,
             regexp=r'\,'),

        Rule(token='DOUBLECOLON',
             next_state=STATE_KEEP,
             regexp=r'::'),

        Rule(token='TURNSTILE',
             next_state=STATE_RAW_STRING,
             regexp=r':='),

        Rule(token='COLONGT',
             next_state=STATE_RAW_STRING,
             regexp=r':>'),

        Rule(token='COLON',
             next_state=STATE_KEEP,
             regexp=r':'),

        # need to handle '->' differently based on whether it's
        # followed by '('
        Rule(token='ARROW',
             next_state=STATE_RAW_TYPE,
             regexp=r'->(?!$|\s*\()'),

        Rule(token='ARROW',
             next_state=STATE_KEEP,
             regexp=r'->'),

        Rule(token='DOT',
             next_state=STATE_KEEP,
             regexp=r'\.'),

        Rule(token='BADIDENT',
             next_state=STATE_KEEP,
             regexp=r'''
                    __[^\W\d]\w*__
                    |
                    `__.*?__`
             '''),

        string_rule,
        ident_rule,
        qident_rule,
    ]

    states = {
        STATE_WS_SENSITIVE: list(common_rules),
        STATE_WS_INSENSITIVE: list(common_rules),
        STATE_RAW_PAREN: [
            Rule(token='LPAREN',
                 next_state=push_state(STATE_RAW_PAREN),
                 regexp=r'\('),
            Rule(token='RPAREN',
                 next_state=pop_state,
                 regexp=r'\)'),
            string_rule,
            qident_rule,
            line_cont_rule,
            bad_line_cont_rule,
            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'''[^()`'"\\][^()`'"$\\]*'''),
        ],
        STATE_RAW_ANGLE: [
            line_cont_rule,
            bad_line_cont_rule,
            Rule(token='RAWSTRING',
                 next_state=push_state(STATE_RAW_ANGLE),
                 regexp=r'\<'),
            Rule(token='RAWSTRING',
                 next_state=pop_state,
                 regexp=r'\>'),
            string_rule,
            qident_rule,
            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'''[^<>`'"\\][^<>`'"$\\]*'''),
        ],
        STATE_RAW_TYPE: [
            line_cont_rule,
            bad_line_cont_rule,
            Rule(token='RAWSTRING',
                 next_state=STATE_WS_SENSITIVE,
                 regexp=r'(?=\n|:(?!:))'),
            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'::'),
            Rule(token='RAWSTRING',
                 next_state=push_state(STATE_RAW_ANGLE),
                 regexp=r'\<'),
            string_rule,
            qident_rule,
            comment_rule,
            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'''[^:<`'"\n\\#][^:<`'"$\n\\#]*'''),
        ],
        STATE_RAW_STRING: [
            Rule(token='NEWLINE',
                 next_state=STATE_KEEP,
                 regexp=r'(?<=:[=>])\s*\n'),
            Rule(token='RAWSTRING',
                 next_state=STATE_WS_SENSITIVE,
                 regexp=r'(?<=:[=>])[^\n]+?$'),
            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'^[^\S\n]*\n'),
            Rule(token='RAWLEADWS',
                 next_state=STATE_KEEP,
                 regexp=r'^[^\S\n]+'),
            # 0 indentation is the end of a raw string block
            Rule(token='RAWLEADWS',
                 next_state=STATE_KEEP,
                 regexp=r'^(?=\S)'),
            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'.*?(?:\n|.$)'),
        ],
        STATE_ATTRIBUTE_RAW_TYPE: [
            line_cont_rule,
            bad_line_cont_rule,
            Rule(token='RAWSTRING',
                 next_state=STATE_WS_SENSITIVE,
                 regexp=r'(?=\n|:(?!:))'),
            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'::'),
            Rule(token='RAWSTRING',
                 next_state=push_state(STATE_RAW_ANGLE),
                 regexp=r'\<'),
            Rule(token='LPAREN',
                 next_state=push_state(STATE_RAW_PAREN),
                 regexp=r'\('),
            string_rule,
            qident_rule,
            Rule(token='RAWSTRING',
                 next_state=STATE_KEEP,
                 regexp=r'''[^:<(`'"\n\\][^:<(`'"$\n\\]*'''),
        ],
    }

    def token_from_text(self, rule_token, txt):
        if rule_token == 'BADIDENT':
            self.handle_error(txt)

        tok = super().token_from_text(rule_token, txt)

        if rule_token == 'QIDENT':
            tok = tok._replace(type='IDENT', value=txt[1:-1])

        return tok

    def get_start_tokens(self):
        '''Yield a number of start tokens.'''
        return ()

    def get_eof_tokens(self):
        '''Yield a number of EOF tokens.'''
        if self.logical_line_started:
            yield self.token_from_text('NL', '')

        # decrease indentation level at the end of input
        while len(self.indent) > 1:
            self.indent.pop()
            yield self.token_from_text('DEDENT', '')

    def insert_token(self, toktype, token, pos='start'):
        return lexer.Token('', type=toktype, text='',
                           start=getattr(token, pos),
                           end=getattr(token, pos),
                           filename=self.filename)

    def token_generator(self, token):
        """Given the current lexer token, yield one or more tokens."""
        tok_type = token.type

        # initialize the indentation if it's still empty
        if not self.indent:
            if tok_type == 'WS':
                self.indent.append(token.end.column - 1)
            else:
                self.indent.append(0)

        # handle indentation
        if (self._state == STATE_WS_SENSITIVE and
                not self.logical_line_started and
                tok_type not in {'NEWLINE', 'WS', 'COMMENT', 'LINECONT'}):
            # we have a potential indentation change
            last_indent = self.indent[-1]
            cur_indent = token.start.column - 1

            if cur_indent > last_indent:
                # increase indentation level
                self.indent.append(cur_indent)
                yield self.insert_token('INDENT', token)

            elif cur_indent < last_indent:
                # decrease indentation level
                while self.indent[-1] > cur_indent:
                    self.indent.pop()
                    if self.indent[-1] < cur_indent:
                        # indentation level mismatch
                        raise EdgeIndentationError(
                            'Incorrect unindent at {position}',
                            line=token.start.line,
                            col=token.start.column,
                            filename=self.filename)
                    yield self.insert_token('DEDENT', token)

        # indentation of raw strings
        elif self._state == STATE_RAW_STRING:
            last_indent = self.indent[-1]
            # only valid for RAWLEADWS
            cur_indent = len(token.value)

            if not self.logical_line_started and tok_type != 'NEWLINE':
                # we MUST indent here
                if tok_type == 'RAWLEADWS' and cur_indent > last_indent:
                    # increase indentation level
                    self.indent.append(cur_indent)
                    yield self.insert_token('INDENT', token, 'end')
                elif token.value.strip():
                    # indentation level mismatch
                    raise EdgeIndentationError(
                        'Incorrect indentation at {position}',
                        line=token.end.line,
                        col=token.end.column,
                        filename=self.filename)

            elif tok_type == 'RAWLEADWS' and cur_indent < last_indent:
                # check the indentation level of each RAWLEADWS,
                # exiting the current state and issuing NL and DEDENT
                # tokens if indentation falls below the starting value
                yield self.insert_token('NL', token)

                while self.indent[-1] > cur_indent:
                    self.indent.pop()
                    if self.indent[-1] < cur_indent:
                        # indentation level mismatch
                        raise EdgeIndentationError(
                            'Incorrect unindent at {position}',
                            line=token.end.line,
                            col=token.end.column,
                            filename=self.filename)
                    yield self.insert_token('DEDENT', token, 'end')

                self._next_state = STATE_WS_SENSITIVE
                # alter the token type & adjust logical newline
                token = token._replace(type='WS')
                tok_type = 'WS'
                self.logical_line_started = False

        # handle logical newline
        if (self.logical_line_started and
                self._state in {STATE_WS_SENSITIVE, STATE_RAW_STRING} and
                tok_type == 'NEWLINE'):
            yield self.insert_token('NL', token)
            self.logical_line_started = False
        elif tok_type not in {'NEWLINE', 'WS', 'COMMENT', 'LINECONT'}:
            self.logical_line_started = True

        if tok_type == 'LINECONT':
            if self._state in {STATE_WS_SENSITIVE, STATE_RAW_STRING}:
                token = token._replace(type='WS')
            else:
                token = token._replace(value='\n', type='RAWSTRING')

        yield token

    def lex(self):
        """Wrapper for the lexer."""
        self.indent = []
        self.state_stack = []
        self.logical_line_started = True
        self.prev_nw_tok = None  # previous NON-WHITESPACE token
        self._next_state = None

        for tok in self._lex():
            if tok.type not in {'NEWLINE', 'WS', 'LINECONT'}:
                self.prev_nw_tok = tok
            yield tok

    def _lex(self):
        """Lexes the src.

        Generator. Yields tokens (as defined by the rules).
        May yield special start and EOF tokens.
        May raise UnknownTokenError exception.
        """
        src = self.inputstr

        for tok in self.get_start_tokens():
            yield tok

        while self.start < self.end:
            for match in self.re_states[self._state].finditer(
                    src, self.start):
                rule_id = match.lastgroup
                txt = match.group(rule_id)

                if rule_id == 'err':
                    # Error group -- no rule has been matched
                    self.handle_error(txt)

                rule = Rule._map[rule_id]
                rule_token = rule.token

                token = self.token_from_text(rule_token, txt)

                # the next state must be evaluated before yielding the
                # next token so that we have access to prevtok
                if rule.next_state:
                    # the next state can be callable
                    if callable(rule.next_state):
                        next_state = rule.next_state(self)
                    else:
                        next_state = rule.next_state
                else:
                    next_state = STATE_KEEP

                for tok in self.token_generator(token):
                    yield tok

                if next_state and next_state != self._state:
                    # the rule dictates that the lexer state should be
                    # switched
                    self._state = next_state
                    break
                elif self._next_state is not None:
                    self._state = self._next_state
                    self._next_state = None
                    break

        # End of file
        for tok in self.get_eof_tokens():
            yield tok
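
# A minimal standalone sketch (assumed machinery, not the actual base
# class) of the dispatch that _lex() relies on: every Rule's regexp is
# compiled into a single alternation with one named group per rule plus
# a trailing `err` group, and match.lastgroup names the winning rule --
# the role self.re_states and Rule._map play above.

import re

def compile_rules(rules):
    """Compile (name, pattern) pairs into one verbose-mode regex."""
    parts = ['(?P<{}>{})'.format(name, pattern) for name, pattern in rules]
    # anything no rule matched falls into the 'err' group, mirroring
    # the rule_id == 'err' branch in _lex()
    parts.append('(?P<err>.)')
    return re.compile('|'.join(parts), re.X | re.M)

demo = compile_rules([
    ('NUMBER', r'\d+'),
    ('NAME', r'[^\W\d]\w*'),
    ('WS', r'[^\S\n]+'),
])

# >>> [(m.lastgroup, m.group()) for m in demo.finditer('foo 42 !')]
# [('NAME', 'foo'), ('WS', ' '), ('NUMBER', '42'), ('WS', ' '), ('err', '!')]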

class EdgeQLLexer(lexer.Lexer):
    start_state = STATE_BASE

    MERGE_TOKENS = {('NAMED', 'ONLY'), ('SET', 'ATTRIBUTE')}

    NL = 'NL'
    MULTILINE_TOKENS = frozenset(('SCONST', 'BCONST', 'RSCONST'))
    RE_FLAGS = re.X | re.M | re.I

    # Basic keywords
    keyword_rules = [Rule(token=tok[0],
                          next_state=STATE_KEEP,
                          regexp=lexer.group(val))
                     for val, tok in edgeql_keywords.items()]

    common_rules = keyword_rules + [
        Rule(token='WS',
             next_state=STATE_KEEP,
             regexp=r'[^\S\n]+'),

        Rule(token='NL',
             next_state=STATE_KEEP,
             regexp=r'\n'),

        Rule(token='COMMENT',
             next_state=STATE_KEEP,
             regexp=r'''\#.*?$'''),

        Rule(token='ASSIGN',
             next_state=STATE_KEEP,
             regexp=r':='),

        Rule(token='ARROW',
             next_state=STATE_KEEP,
             regexp=r'->'),

        Rule(token='??',
             next_state=STATE_KEEP,
             regexp=r'\?\?'),

        Rule(token='::',
             next_state=STATE_KEEP,
             regexp=r'::'),

        # special path operators
        Rule(token='.<',
             next_state=STATE_KEEP,
             regexp=r'\.<'),

        Rule(token='.>',
             next_state=STATE_KEEP,
             regexp=r'\.>'),

        Rule(token='//',
             next_state=STATE_KEEP,
             regexp=r'//'),

        Rule(token='++',
             next_state=STATE_KEEP,
             regexp=r'\+\+'),

        Rule(token='OP',
             next_state=STATE_KEEP,
             regexp=r'''
                (?: >= | <= | != | \?= | \?!=)
             '''),

        Rule(token='self',
             next_state=STATE_KEEP,
             regexp=r'[,()\[\].@;:+\-*/%^<>=&|]'),

        Rule(token='FCONST',
             next_state=STATE_KEEP,
             regexp=r"""
                    (?: \d+ (?:\.\d+)?
                        (?:[eE](?:[+\-])?[0-9]+)
                    )
                    |
                    (?: \d+\.\d+)
             """),

        Rule(token='ICONST',
             next_state=STATE_KEEP,
             regexp=r'([1-9]\d* | 0)(?![0-9])'),

        Rule(token='BCONST',
             next_state=STATE_KEEP,
             regexp=rf'''
                (?:
                    b
                )
                (?P<BQ>
                    ' | "
                )
                (?:
                    (
                        \\\\ | \\['"] | \n | .
                        # we'll validate escape codes in the parser
                    )*?
                )
                (?P=BQ)
             '''),

        Rule(token='RSCONST',
             next_state=STATE_KEEP,
             regexp=rf'''
                (?:
                    r
                )?
                (?P<RQ>
                    (?:
                        (?<=r) (?: ' | ")
                    ) | (?:
                        (?<!r) (?: {re_dquote})
                    )
                )
                (?:
                    (
                        \n | .
                        # we'll validate escape codes in the parser
                    )*?
                )
                (?P=RQ)
             '''),

        Rule(token='SCONST',
             next_state=STATE_KEEP,
             regexp=rf'''
                (?P<Q>
                    ' | "
                )
                (?:
                    (
                        \\\\ | \\['"] | \n | .
                        # we'll validate escape codes in the parser
                    )*?
                )
                (?P=Q)
             '''),

        # this rule will capture malformed strings and allow us to
        # provide better error messages
        Rule(token='BADSCONST',
             next_state=STATE_KEEP,
             regexp=rf'''
                [rb]?
                (['"] | (?: {re_dquote}))
                [^\n]*
             '''),

        Rule(token='BADIDENT',
             next_state=STATE_KEEP,
             regexp=r'''
                    __[^\W\d]\w*__
                    |
                    `__.*?__`
             '''),

        Rule(token='IDENT',
             next_state=STATE_KEEP,
             regexp=r'[^\W\d]\w*'),

        Rule(token='QIDENT',
             next_state=STATE_KEEP,
             regexp=r'`[^@].*?`'),

        Rule(token='self',
             next_state=STATE_KEEP,
             regexp=r'[\{\}$]'),
    ]

    states = {
        STATE_BASE: common_rules,
    }

    def __init__(self):
        super().__init__()
        # add the capacity to handle a few tokens composed of 2 elements
        self._possible_long_token = {x[0] for x in self.MERGE_TOKENS}
        self._long_token_match = {x[1]: x[0] for x in self.MERGE_TOKENS}

    def token_from_text(self, rule_token, txt):
        if rule_token == 'BADSCONST':
            raise lexer.UnknownTokenError(
                f"Unterminated string {txt}",
                line=self.lineno,
                col=self.column,
                filename=self.filename)
        elif rule_token == 'BADIDENT':
            self.handle_error(txt)

        tok = super().token_from_text(rule_token, txt)

        if rule_token == 'self':
            tok = tok._replace(type=txt)
        elif rule_token == 'QIDENT':
            tok = tok._replace(type='IDENT', value=txt[1:-1])

        return tok

    def lex(self):
        buffer = []

        for tok in super().lex():
            tok_type = tok.type

            if tok_type in {'WS', 'NL', 'COMMENT'}:
                # Strip out whitespace and comments
                continue
            elif tok_type in self._possible_long_token:
                # Buffer in case this is a merged token
                if not buffer:
                    buffer.append(tok)
                else:
                    yield from iter(buffer)
                    buffer[:] = [tok]
            elif tok_type in self._long_token_match:
                prev_token = buffer[-1] if buffer else None
                if (prev_token and
                        prev_token.type == self._long_token_match[tok_type]):
                    tok = prev_token._replace(
                        value=prev_token.value + ' ' + tok.value,
                        type=prev_token.type + tok_type)
                    buffer.pop()
                yield tok
            else:
                if buffer:
                    yield from iter(buffer)
                    buffer[:] = []
                yield tok

    def lex_highlight(self):
        return super().lex()

class EdgeQLLexer(lexer.Lexer):
    start_state = STATE_BASE

    NL = 'NL'
    MULTILINE_TOKENS = frozenset(('SCONST',))
    RE_FLAGS = re.X | re.M | re.I

    # Basic keywords
    keyword_rules = [Rule(token=tok[0],
                          next_state=STATE_KEEP,
                          regexp=lexer.group(val))
                     for val, tok in edgeql_keywords.items()]

    common_rules = keyword_rules + [
        Rule(token='WS',
             next_state=STATE_KEEP,
             regexp=r'[^\S\n]+'),

        Rule(token='NL',
             next_state=STATE_KEEP,
             regexp=r'\n'),

        Rule(token='COMMENT',
             next_state=STATE_KEEP,
             regexp=r'''\#.*?$'''),

        Rule(token='TURNSTILE',
             next_state=STATE_KEEP,
             regexp=r':='),

        Rule(token='ARROW',
             next_state=STATE_KEEP,
             regexp=r'->'),

        Rule(token='??',
             next_state=STATE_KEEP,
             regexp=r'\?\?'),

        Rule(token='::',
             next_state=STATE_KEEP,
             regexp=r'::'),

        # special path operators
        Rule(token='.<',
             next_state=STATE_KEEP,
             regexp=r'\.<'),

        Rule(token='.>',
             next_state=STATE_KEEP,
             regexp=r'\.>'),

        Rule(token='OP',
             next_state=STATE_KEEP,
             regexp=r'''
                (?: >= | <= | != | \?= | \?!=)
             '''),

        # SQL ops
        Rule(token='self',
             next_state=STATE_KEEP,
             regexp=r'[,()\[\].@;:+\-*/%^<>=&|]'),

        Rule(token='FCONST',
             next_state=STATE_KEEP,
             regexp=r"""
                    (?: \d+ (?:\.\d+)?
                        (?:[eE](?:[+\-])?[0-9]+)
                    )
                    |
                    (?: \d+\.\d+)
             """),

        Rule(token='ICONST',
             next_state=STATE_KEEP,
             regexp=r'([1-9]\d* | 0)(?![0-9])'),

        Rule(token='SCONST',
             next_state=STATE_KEEP,
             regexp=rf'''
                (?P<Q>  # capture the opening quote in group Q
                    ( ' | " | {re_dquote} )
                )
                (?:
                    (\\['"] | \n | .)*?
                )
                (?P=Q)  # match closing quote type with whatever is in Q
             '''),

        Rule(token='BADIDENT',
             next_state=STATE_KEEP,
             regexp=r'''
                    __[^\W\d]\w*__
                    |
                    `__.*?__`
             '''),

        Rule(token='IDENT',
             next_state=STATE_KEEP,
             regexp=r'[^\W\d]\w*'),

        Rule(token='QIDENT',
             next_state=STATE_KEEP,
             regexp=r'`[^@].*?`'),

        Rule(token='self',
             next_state=STATE_KEEP,
             regexp=r'[\{\}$]'),
    ]

    states = {
        STATE_BASE: common_rules,
    }

    def token_from_text(self, rule_token, txt):
        if rule_token == 'BADIDENT':
            self.handle_error(txt)

        tok = super().token_from_text(rule_token, txt)

        if rule_token == 'self':
            tok = tok._replace(type=txt)
        elif rule_token == 'QIDENT':
            tok = tok._replace(type='IDENT', value=txt[1:-1])

        return tok

    def lex(self):
        buffer = []

        for tok in super().lex():
            tok_type = tok.type

            if tok_type in {'WS', 'NL', 'COMMENT'}:
                # Strip out whitespace and comments
                continue
            else:
                if buffer:
                    yield from iter(buffer)
                    buffer[:] = []
                yield tok

    def lex_highlight(self):
        return super().lex()
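
# A minimal standalone sketch of the quote-matching technique used by
# the SCONST/STRING rules above: the opening quote is captured in a
# named group and required again via a backreference, so one quote kind
# cannot terminate a string opened with the other (dollar quotes work
# the same way through {re_dquote}):

import re

string_re = re.compile(r'''
    (?P<Q> ' | " )              # capture the opening quote
    (?: (\\['"] | \n | .)*? )   # lazily consume the body
    (?P=Q)                      # the closing quote must match group Q
''', re.X)

# >>> string_re.match('"it\'s fine"').group()
# '"it\'s fine"'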

class PgSQLLexer(lexer.Lexer):
    start_state = STATE_BASE

    NL = frozenset(('NL',))
    MULTILINE_TOKENS = frozenset(('COMMENT', 'SCONST'))
    RE_FLAGS = re.X | re.M | re.I

    # Basic keywords
    keyword_rules = [Rule(token='KEYWORD',
                          next_state=STATE_KEEP,
                          regexp=lexer.group(*pg_keywords.keys()))]

    common_rules = keyword_rules + [
        Rule(token='WS',
             next_state=STATE_KEEP,
             regexp=r'[^\S\n]+'),

        Rule(token='NL',
             next_state=STATE_KEEP,
             regexp=r'\n'),

        Rule(token='COMMENT',
             next_state=STATE_KEEP,
             regexp=r'''
                    (?:/\*(?:.|\n)*?\*/)
                    |
                    (?:--.*?$)
             '''),

        Rule(token='TYPECAST',
             next_state=STATE_KEEP,
             regexp=r'::'),

        # multichar ops (so 2+ chars)
        Rule(token='Op',
             next_state=STATE_KEEP,
             regexp=r'''
                # EdgeQL-specific multi-char ops
                {opchar_pg} (?:{opchar}(?!/\*|--))+
                |
                (?:{opchar}(?!/\*|--))+ {opchar_pg}
                    (?:{opchar}(?!/\*|--))*
                |
                # SQL-only multi-char ops cannot end in + or -
                (?:{opchar_sql}(?!/\*|--))+[*/^%<>=]
             '''.format(opchar_pg=re_opchars_pgsql,
                        opchar=re_opchars,
                        opchar_sql=re_opchars_sql)),

        # PgSQL single char ops
        Rule(token='Op',
             next_state=STATE_KEEP,
             regexp=re_opchars_pgsql),

        # SQL ops
        Rule(token='self',
             next_state=STATE_KEEP,
             regexp=re_self),

        Rule(token='FCONST',
             next_state=STATE_KEEP,
             regexp=r"""
                    (?: \d+ (?:\.\d*)?
                        |
                        \. \d+
                    ) {exppart}
             """.format(exppart=re_exppart)),

        Rule(token='FCONST',
             next_state=STATE_KEEP,
             regexp=r'''
                (?: \d+\.(?!\.)\d*
                    |
                    \.\d+)
             '''),

        Rule(token='ICONST',
             next_state=STATE_KEEP,
             regexp=r'\d+'),

        Rule(token='BCONST',
             next_state=STATE_KEEP,
             regexp=r'''
                B'(?: [01] | '' | ' (?:\s*\n\s*) ' )*'
             '''),

        Rule(token='XCONST',
             next_state=STATE_KEEP,
             regexp=r'''
                X'(?: [\da-fA-F] | '' | ' (?:\s*\n\s*) ' )*'
             '''),

        # don't have extra checks for correct escaping inside
        Rule(token='SCONST',
             next_state=STATE_KEEP,
             regexp=r'''
                [nNeE]? '(?: [^'] | '' | ' (?:\s*\n\s*) ' )*'
             '''),

        # dollar quoted strings
        Rule(token='DQCONST',
             next_state=STATE_KEEP,
             regexp=r'''
                \$(?P<dq> (?:{ident_start}{ident_cont}*)? )\$
                .*?
                \$(?P=dq)\$
             '''.format(ident_start=re_ident_start,
                        ident_cont=re_ident_cont)),

        # specifying a custom escape character
        Rule(token='UESCAPE',
             next_state=STATE_KEEP,
             regexp=r"""UESCAPE\s+'[^a-fA-F\d\s+'"]'"""),

        # quoted identifier
        Rule(token='QIDENT',
             next_state=STATE_KEEP,
             regexp=r'''
                    (?:U&)?
                    "(?: [^"] | "" )+"
             '''),

        Rule(token='PARAM',
             next_state=STATE_KEEP,
             regexp=r'\$\d+'),

        Rule(token='IDENT',
             next_state=STATE_KEEP,
             regexp=r'''
                    {ident_start}{ident_cont}*
             '''.format(ident_start=re_ident_start,
                        ident_cont=re_ident_cont)),
    ]

    states = {
        STATE_BASE: common_rules,
    }

    def token_from_text(self, rule_token, txt):
        tok = super().token_from_text(rule_token, txt)

        if rule_token == 'self':
            tok = tok._replace(type=txt)

        elif rule_token == 'IDENT':
            tok = tok._replace(value=txt.lower())

        elif rule_token == 'KEYWORD':
            # process keywords here since having separate rules for
            # them creates > 100 re groups
            txt_low = txt.lower()
            tok = tok._replace(value=txt_low, type=pg_keywords[txt_low][0])

        elif rule_token in ('SCONST', 'BCONST', 'XCONST'):
            txt = txt[:-1].split("'", 1)[1]
            txt = clean_string.sub('', txt.replace("''", "'"))
            tok = tok._replace(value=txt)

        elif rule_token == 'PARAM':
            tok = tok._replace(value=txt[1:])

        elif rule_token == 'QIDENT':
            tok = tok._replace(type='IDENT',
                               value=txt[:-1].split('"', 1)[1])

        elif rule_token == 'DQCONST':
            # strip the leading and trailing $tag$ delimiters
            txt = txt.rsplit("$", 2)[0]
            txt = txt.split("$", 2)[2]
            tok = tok._replace(type='SCONST', value=txt)

        return tok
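
# A minimal standalone sketch of the DQCONST handling above: a
# dollar-quoted string $tag$...$tag$ is matched with a backreferenced
# tag and the delimiters are then stripped, as token_from_text() does.
# The inlined identifier pattern stands in for re_ident_start and
# re_ident_cont, which are defined elsewhere:

import re

dollar_re = re.compile(
    r'\$(?P<dq>(?:[A-Za-z_][A-Za-z_0-9]*)?)\$'  # opening $tag$
    r'.*?'                                      # shortest possible body
    r'\$(?P=dq)\$',                             # closing $tag$
    re.S)

def dollar_body(text):
    """Return the body of a dollar-quoted string, sans $tag$ delimiters."""
    m = dollar_re.match(text)
    if m is None:
        raise ValueError('not a dollar-quoted string')
    quoted = m.group(0)
    quoted = quoted.rsplit('$', 2)[0]   # drop the trailing $tag$
    return quoted.split('$', 2)[2]      # drop the leading $tag$

# >>> dollar_body("$fn$ SELECT 'x' $fn$")
# " SELECT 'x' "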

class GraphQLLexer(lexer.Lexer):
    start_state = STATE_BASE

    NL = 'NL'
    RE_FLAGS = re.X | re.M

    # Basic keywords
    keyword_rules = [Rule(token=tok[0],
                          next_state=STATE_KEEP,
                          regexp=lexer.group(val))
                     for val, tok in graphql_keywords.items()]

    common_rules = keyword_rules + [
        Rule(token='NL',
             next_state=STATE_KEEP,
             regexp=r'\r\n|\n|\r'),

        Rule(token='WS',
             next_state=STATE_KEEP,
             regexp=r'[ \t]+'),

        Rule(token='COMMA',
             next_state=STATE_KEEP,
             regexp=r','),

        Rule(token='COMMENT',
             next_state=STATE_KEEP,
             regexp=r'\#[^\n]*$'),

        Rule(token='LPAREN',
             next_state=STATE_KEEP,
             regexp=r'\('),

        Rule(token='RPAREN',
             next_state=STATE_KEEP,
             regexp=r'\)'),

        Rule(token='LSBRACKET',
             next_state=STATE_KEEP,
             regexp=r'\['),

        Rule(token='RSBRACKET',
             next_state=STATE_KEEP,
             regexp=r'\]'),

        Rule(token='LCBRACKET',
             next_state=STATE_KEEP,
             regexp=r'\{'),

        Rule(token='RCBRACKET',
             next_state=STATE_KEEP,
             regexp=r'\}'),

        Rule(token='BANG',
             next_state=STATE_KEEP,
             regexp=r'\!'),

        Rule(token='ELLIPSIS',
             next_state=STATE_KEEP,
             regexp=r'\.\.\.'),

        Rule(token='COLON',
             next_state=STATE_KEEP,
             regexp=r':'),

        Rule(token='EQUAL',
             next_state=STATE_KEEP,
             regexp=r'='),

        Rule(token='AT',
             next_state=STATE_KEEP,
             regexp=r'@'),

        Rule(token='INTEGER',
             next_state=STATE_KEEP,
             regexp=r'-?(?:0|[1-9][0-9]*)(?![eE.0-9])'),

        Rule(token='FLOAT',
             next_state=STATE_KEEP,
             regexp=r'''
                -?(0|[1-9][0-9]*)
                    (\.[0-9]+)?
                    ([eE][+-]?[0-9]+)?
                (?![eE.0-9])  # must not be followed by a number
             '''),

        Rule(token='STRING',
             next_state=STATE_KEEP,
             regexp=r'''
                    (?:r)?" [^\n]*? (?<!\\)"
             '''),

        Rule(token='IDENT',
             next_state=STATE_KEEP,
             regexp=r'[_A-Za-z][_0-9A-Za-z]*'),

        Rule(token='VAR',
             next_state=STATE_KEEP,
             regexp=r'\$[_0-9A-Za-z]+'),

        Rule(token='DOLLAR',
             next_state=STATE_KEEP,
             regexp=r'\$'),
    ]

    states = {
        STATE_BASE: list(common_rules),
    }

    def handle_error(self, txt):
        # check if this is an unterminated string instead of a
        # generic error
        if txt == '"':
            pos = re.compile(r'$', self.RE_FLAGS).search(
                self.inputstr, self.start).start()
            pos += self.column - self.start
            raise UnterminatedStringError(
                'unterminated string token {position}',
                line=self.lineno,
                col=pos,
                filename=self.filename)

        super().handle_error(txt)
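
# A minimal standalone sketch of the lookahead trick used by the
# INTEGER and FLOAT rules above: the trailing (?![eE.0-9]) makes
# INTEGER refuse input that continues as a float, so '1.5' cannot be
# misread as INTEGER followed by junk:

import re

int_re = re.compile(r'-?(?:0|[1-9][0-9]*)(?![eE.0-9])')
float_re = re.compile(r'''
    -?(0|[1-9][0-9]*)
    (\.[0-9]+)?
    ([eE][+-]?[0-9]+)?
    (?![eE.0-9])  # must not be followed by more of a number
''', re.X)

# >>> bool(int_re.match('1.5')), bool(float_re.match('1.5'))
# (False, True)
# >>> int_re.match('15').group()
# '15'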