def compile_rules(self, context):
    """
    Pass to turn the lexer DSL into our internal regexp objects.
    """
    assert context.nfa_start is None

    regexps = RegexpCollection()

    # Import patterns into regexps
    for name, pattern, loc in self.patterns:
        with Context('In definition of lexer pattern {}'.format(name), loc):
            regexps.add_pattern(name, pattern)

    # Now turn each rule into a NFA
    nfas = []
    for i, a in enumerate(self.rules):
        assert isinstance(a, RuleAssoc)

        # Check that actions never emit Termination and LexingFailure
        # tokens. These tokens are supposed to be emitted by the lexing
        # engine only.
        def check(token):
            check_source_language(
                token not in (self.tokens.Termination,
                              self.tokens.LexingFailure),
                '{} is reserved for automatic actions only'.format(
                    token.dsl_name))

        if isinstance(a.action, Case.CaseAction):
            for alt in a.action.all_alts:
                check(alt.send)
        elif isinstance(a.action, Ignore):
            pass
        else:
            assert isinstance(a.action, TokenAction)
            check(a.action)

        with Context('In definition of lexer rules', a.location):
            nfa_start, nfa_end = regexps.nfa_for(a.matcher.regexp)
        nfas.append(nfa_start)

        # The first rule that was added must have precedence when multiple
        # rules compete for the longest match. To implement this behavior,
        # we associate increasing ids to each token action.
        nfa_end.label = (i, a.action)

    # Create a big OR for all possible accepted patterns
    context.nfa_start = NFAState()
    for nfa in nfas:
        context.nfa_start.add_transition(None, nfa)
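
# Illustrative sketch, not part of the Langkit sources: the labels attached
# above make tie-breaking between rules straightforward. Assuming several
# accepting NFA states are reached for the same longest match, the state
# whose label carries the lowest rule index (i.e. the rule that was declared
# first) wins:
#
#     >>> labels = [(0, 'KW_IF'), (2, 'IDENTIFIER')]
#     >>> min(labels, key=lambda label: label[0])
#     (0, 'KW_IF')
#
# The token names here are hypothetical; in the real labels the second
# element is the rule's TokenAction, not a string.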
    # Ranges
    r'[]',
    r'[a]',
    r'[a-c]',
    r'[^a-c]',
    r'[^]',
    r'[a^]',
    r'[a-]',
    r'[-b]',
    r'[a-c-]',

    # Escape sequences in ranges
    r'[\]]',
    r'[\u1234]',
    r'[\u1234-\u1243]',
]:
    print('== {} =='.format(regexp))
    lexer = RegexpCollection()
    try:
        parser = lexer._parse(regexp)
    except DiagnosticError:
        pass
    else:
        print(parser)
    print('')

print('Done')