class Token(LexerToken):
    Def = WithText()
    Var = WithText()
    Error = WithText()
    Example = WithText()
    Null = WithText()

    Comma = WithText()
    Dot = WithText()
    Semicolon = WithText()
    LPar = WithText()
    RPar = WithText()
    LBrace = WithText()
    RBrace = WithText()
    Equal = WithText()
    Plus = WithText()
    Minus = WithText()
    LessThan = WithText()

    Number = WithText()
    Identifier = WithSymbol()
    String = WithText()

    Comment = WithTrivia()
    Whitespace = WithTrivia()

    Alphanumericals = TokenFamily(Def, Error, Example, Null, Number,
                                  Identifier)
    Punctuation = TokenFamily(Comma, Dot, Semicolon, LPar, RPar, LBrace,
                              RBrace, Equal, Plus)
    Comments = TokenFamily(Comment)
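A token class like the one above only declares token kinds, token actions and families; the matching rules live on a Lexer instance. Below is a minimal sketch (not part of the original sources) of how this Token class could be wired up, using only the langkit.lexer entry points that create_lexer relies on further down; the regular expressions and the pattern name are illustrative assumptions.

from langkit.lexer import Lexer, Literal, Pattern

# Hypothetical wiring for the Token class above. The positional arguments
# mirror the Lexer(token_class, track_indent, pre_rules) call used in
# create_lexer below.
foo_lexer = Lexer(Token, False, [])

# Named pattern, referenced below with the {name} syntax
foo_lexer.add_patterns(('identifier', r'[a-zA-Z_][a-zA-Z_0-9]*'))

foo_lexer.add_rules(
    (Pattern(r'[ \t\r\n]+'), Token.Whitespace),
    (Literal('def'), Token.Def),
    (Literal('var'), Token.Var),
    (Literal('null'), Token.Null),
    (Pattern(r'[0-9]+'), Token.Number),
    (Pattern('{identifier}'), Token.Identifier),
)

# Force a space between two consecutive alphanumerical tokens when
# unparsing, so that e.g. "def" and an identifier do not glue together
foo_lexer.add_spacing((Token.Alphanumericals, Token.Alphanumericals))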
class Token(LexerToken):
    Identifier = WithSymbol()
    All = WithSymbol()

    # Ada keywords
    Abstract = WithSymbol()
    At = WithSymbol()
    Case = WithSymbol()
    End = WithSymbol()
    For = WithSymbol()
    Is = WithSymbol()
    Limited = WithSymbol()
    Null = WithSymbol()
    Others = WithSymbol()
    Package = WithSymbol()
    Renames = WithSymbol()
    Type = WithSymbol()
    Use = WithSymbol()
    When = WithSymbol()
    With = WithSymbol()

    # GPR keywords
    Project = WithSymbol()
    Extends = WithSymbol()

    # Punctuation
    ParOpen = WithSymbol()
    ParClose = WithSymbol()
    Semicolon = WithSymbol()
    Colon = WithSymbol()
    Comma = WithSymbol()
    Dot = WithSymbol()
    Amp = WithSymbol()
    Tick = WithSymbol()
    Pipe = WithSymbol()
    Assign = WithSymbol()
    Arrow = WithSymbol()

    # Literals
    String = WithText()
    Number = WithText()

    # Hidden framework dependencies???
    Label = WithText()
    Char = WithSymbol()

    Comment = WithTrivia()
    Whitespace = WithTrivia()

    Alphanumericals = TokenFamily(
        Identifier, All, Abstract, At, Case, End, For, Is, Limited, Null,
        Others, Package, Renames, Type, Use, When, With, Project, Extends,
        String, Number, Label, Char)
class Token(LexerToken):
    Identifier = WithSymbol()
    All = WithText()

    # Keywords
    Abort = WithText()
    Else = WithText()
    New = WithText()
    Return = WithText()
    Abs = WithText()
    Elsif = WithText()
    Not = WithText()
    Reverse = WithText()
    End = WithText()
    Null = WithSymbol()
    Accept = WithText()
    Entry = WithText()
    Select = WithText()
    Access = WithText()
    Exception = WithText()
    Of = WithText()
    Separate = WithText()
    Exit = WithText()
    Or = WithText()
    Others = WithText()
    Subtype = WithText()
    And = WithText()
    For = WithText()
    Out = WithText()
    Array = WithText()
    Function = WithText()
    At = WithText()
    Generic = WithText()
    Package = WithText()
    Task = WithText()
    Begin = WithText()
    Goto = WithText()
    Pragma = WithText()
    Terminate = WithText()
    Body = WithText()
    Private = WithText()
    Then = WithText()
    If = WithText()
    Procedure = WithText()
    Type = WithText()
    Case = WithText()
    In = WithText()
    Constant = WithText()
    Is = WithText()
    Raise = WithText()
    Use = WithText()
    Declare = WithText()
    Range = WithText()
    Delay = WithText()
    Limited = WithText()
    Record = WithText()
    When = WithText()
    Delta = WithText()
    Loop = WithText()
    Rem = WithText()
    While = WithText()
    Digits = WithText()
    Renames = WithText()
    Do = WithText()
    Mod = WithText()
    Xor = WithText()

    # Punctuation
    ParClose = WithText()
    ParOpen = WithText()
    Semicolon = WithText()
    Colon = WithText()
    Comma = WithText()
    Doubledot = WithText()
    Dot = WithText()
    Diamond = WithText()
    Lte = WithText()
    Gte = WithText()
    Arrow = WithText()
    Equal = WithText()
    Lt = WithText()
    Gt = WithText()
    Plus = WithText()
    Minus = WithText()
    Power = WithText()
    Mult = WithText()
    Amp = WithText()
    Notequal = WithText()
    Divide = WithText()
    Tick = WithText()
    Pipe = WithText()
    Assign = WithText()
    LabelStart = WithText()
    LabelEnd = WithText()
    Target = WithText()

    # String and char literals
    String = WithText()
    Char = WithSymbol()

    With = WithText()

    # Numeric literals
    Decimal = WithText()
    Integer = WithText()

    # Trivia
    Comment = WithTrivia()
    PrepLine = WithTrivia()
    Whitespace = WithTrivia()

    Alphanumericals = TokenFamily(
        Identifier, All, Abort, Else, New, Return, Abs, Elsif, Not, Reverse,
        End, Null, Accept, Entry, Select, Access, Exception, Of, Separate,
        Exit, Or, Others, Subtype, And, For, Out, Array, Function, At,
        Generic, Package, Task, Begin, Goto, Pragma, Terminate, Body,
        Private, Then, If, Procedure, Type, Case, In, Constant, Is, Raise,
        Use, Declare, Range, Delay, Limited, Record, When, Delta, Loop, Rem,
        While, Digits, Renames, Do, Mod, Xor, With, Decimal, Integer)
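The Tick and Char tokens above point at a classic Ada lexing ambiguity: after an identifier, a single quote introduces an attribute reference rather than a character literal. This is exactly what the Case/Alt conditional lexing rules, handled by lower_case_alt in create_lexer below, express. Here is a hedged sketch of such a rule; the matcher pattern and the condition list are illustrative assumptions, not the exact rule from the Ada lexer.

from langkit.lexer import Alt, Case, Pattern

# Sketch of a conditional lexing rule for the quote ambiguity
tick_or_char = Case(
    Pattern("'.'"),
    # When the previous token is an identifier, send Tick and consume
    # only the quote character (match_size=1)
    Alt(prev_token_cond=[Token.Identifier],
        send=Token.Tick,
        match_size=1),
    # Otherwise, send a full character literal: 3 characters, as in 'x'
    Alt(prev_token_cond=None, send=Token.Char, match_size=3),
)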
class PresentTwice(BaseToken):
    # Invalid: BaseToken.Number would belong to two token families
    Alphanumericals = TokenFamily(BaseToken.Number, BaseToken.Identifier)
    Numbers = TokenFamily(BaseToken.Number)
class InvalidToken(BaseToken):
    # Invalid: 'foobar' is not a token action
    Alphanumericals = TokenFamily(BaseToken.Number, BaseToken.Identifier,
                                  'foobar')
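PresentTwice and InvalidToken read like negative test cases for token family validation: the first makes BaseToken.Number a member of two families, the second puts a plain string where a token action is expected. A BaseToken definition that these tests could run against might look like the following; this is hypothetical, since the real base class is not shown in this excerpt.

from langkit.lexer import LexerToken, WithSymbol, WithText


# Hypothetical base token class for the two test cases above
class BaseToken(LexerToken):
    Number = WithText()
    Identifier = WithSymbol()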
def create_lexer(ctx, lkt_units):
    """
    Create and populate a lexer from a Lktlang unit.

    :param list[liblktlang.AnalysisUnit] lkt_units: Non-empty list of
        analysis units where to look for the lexer.
    :rtype: langkit.lexer.Lexer
    """
    import liblktlang

    # Look for the LexerDecl node in top-level lists
    full_lexer = find_toplevel_decl(ctx, lkt_units, liblktlang.LexerDecl,
                                    'lexer')
    with ctx.lkt_context(full_lexer):
        lexer_annot = parse_annotations(ctx, lexer_annotations, full_lexer)

    patterns = {}
    """
    Mapping from pattern names to the corresponding regular expression.

    :type: dict[names.Name, str]
    """

    token_family_sets = {}
    """
    Mapping from token family names to the corresponding sets of tokens that
    belong to this family.

    :type: dict[names.Name, set[Token]]
    """

    token_families = {}
    """
    Mapping from token family names to the corresponding token families. We
    build this late, once we know all tokens and all families.

    :type: dict[names.Name, TokenFamily]
    """

    tokens = {}
    """
    Mapping from token names to the corresponding tokens.

    :type: dict[names.Name, Token]
    """

    rules = []
    pre_rules = []
    """
    Lists of regular and pre lexing rules for this lexer.

    :type: list[(langkit.lexer.Matcher, langkit.lexer.Action)]
    """

    newline_after = []
    """
    List of tokens after which we must introduce a newline during unparsing.

    :type: list[Token]
    """

    def ignore_constructor(start_ignore_layout, end_ignore_layout):
        """
        Adapter to build an Ignore instance with the same API as WithText
        constructors.
        """
        del start_ignore_layout, end_ignore_layout
        return Ignore()

    def process_family(f):
        """
        Process a LexerFamilyDecl node. Register the token family and process
        the rules it contains.

        :type f: liblktlang.LexerFamilyDecl
        """
        with ctx.lkt_context(f):
            # Create the token family, if needed
            name = names.Name.from_lower(text_as_str(f.f_syn_name))
            token_set = token_family_sets.setdefault(name, set())

            for r in f.f_rules:
                check_source_language(
                    isinstance(r.f_decl, liblktlang.GrammarRuleDecl),
                    'Only lexer rules allowed in family blocks')
                process_token_rule(r, token_set)

    def process_token_rule(r, token_set=None):
        """
        Process the full declaration of a GrammarRuleDecl node: create the
        token it declares and lower the optional associated lexing rule.

        :param liblktlang.FullDecl r: Full declaration for the
            GrammarRuleDecl to process.
        :param None|set[TokenAction] token_set: If this declaration appears
            in the context of a token family, this adds the new token to
            this set. Must be left to None otherwise.
        """
        with ctx.lkt_context(r):
            rule_annot = parse_annotations(ctx, token_annotations, r)

            # Gather token action info from the annotations. If absent, fall
            # back to WithText.
            token_cons = None
            start_ignore_layout = False
            end_ignore_layout = False
            if 'ignore' in rule_annot:
                token_cons = ignore_constructor
            for name in ('text', 'trivia', 'symbol'):
                try:
                    start_ignore_layout, end_ignore_layout = rule_annot[name]
                except KeyError:
                    continue
                check_source_language(token_cons is None,
                                      'At most one token action allowed')
                token_cons = token_cls_map[name]
            is_pre = rule_annot.get('pre_rule', False)
            if token_cons is None:
                token_cons = WithText

            # Create the token and register it where needed: the global
            # token mapping, its token family (if any) and the
            # "newline_after" group if the corresponding annotation is
            # present.
            token_lower_name = text_as_str(r.f_decl.f_syn_name)
            token_name = names.Name.from_lower(token_lower_name)

            check_source_language(
                token_lower_name not in ('termination', 'lexing_failure'),
                '{} is a reserved token name'.format(token_lower_name))
            check_source_language(token_name not in tokens,
                                  'Duplicate token name')

            token = token_cons(start_ignore_layout, end_ignore_layout)
            tokens[token_name] = token
            if token_set is not None:
                token_set.add(token)
            if 'newline_after' in rule_annot:
                newline_after.append(token)

            # Lower the lexing rule, if present
            matcher_expr = r.f_decl.f_expr
            if matcher_expr is not None:
                rule = (lower_matcher(matcher_expr), token)
                if is_pre:
                    pre_rules.append(rule)
                else:
                    rules.append(rule)

    def process_pattern(full_decl):
        """
        Process a pattern declaration.

        :param liblktlang.FullDecl full_decl: Full declaration for the
            ValDecl to process.
        """
        parse_annotations(ctx, [], full_decl)
        decl = full_decl.f_decl
        lower_name = text_as_str(decl.f_syn_name)
        name = names.Name.from_lower(lower_name)

        with ctx.lkt_context(decl):
            check_source_language(name not in patterns,
                                  'Duplicate pattern name')
            check_source_language(
                decl.f_decl_type is None,
                'Patterns must have automatic types in lexers')
            check_source_language(
                isinstance(decl.f_val, liblktlang.StringLit)
                and decl.f_val.p_is_regexp_literal,
                'Pattern string literal expected')

            # TODO: use StringLit.p_denoted_value when properly implemented
            patterns[name] = pattern_as_str(decl.f_val)

    def lower_matcher(expr):
        """
        Lower a token matcher to our internals.

        :type expr: liblktlang.GrammarExpr
        :rtype: langkit.lexer.Matcher
        """
        with ctx.lkt_context(expr):
            if isinstance(expr, liblktlang.TokenLit):
                return Literal(json.loads(text_as_str(expr)))
            elif isinstance(expr, liblktlang.TokenNoCaseLit):
                return NoCaseLit(json.loads(text_as_str(expr)))
            elif isinstance(expr, liblktlang.TokenPatternLit):
                return Pattern(pattern_as_str(expr))
            else:
                check_source_language(False, 'Invalid lexing expression')

    def lower_token_ref(ref):
        """
        Return the Token that `ref` refers to.

        :type ref: liblktlang.RefId
        :rtype: Token
        """
        with ctx.lkt_context(ref):
            token_name = names.Name.from_lower(text_as_str(ref))
            check_source_language(
                token_name in tokens,
                'Unknown token: {}'.format(token_name.lower))
            return tokens[token_name]

    def lower_family_ref(ref):
        """
        Return the TokenFamily that `ref` refers to.

        :type ref: liblktlang.RefId
        :rtype: TokenFamily
        """
        with ctx.lkt_context(ref):
            name_lower = text_as_str(ref)
            name = names.Name.from_lower(name_lower)
            check_source_language(
                name in token_families,
                'Unknown token family: {}'.format(name_lower))
            return token_families[name]

    def lower_case_alt(alt):
        """
        Lower the alternative of a case lexing rule.

        :type alt: liblktlang.BaseLexerCaseRuleAlt
        :rtype: Alt
        """
        prev_token_cond = None
        if isinstance(alt, liblktlang.LexerCaseRuleCondAlt):
            prev_token_cond = [lower_token_ref(ref)
                               for ref in alt.f_cond_exprs]
        return Alt(prev_token_cond=prev_token_cond,
                   send=lower_token_ref(alt.f_send.f_sent),
                   match_size=int(alt.f_send.f_match_size.text))

    # Go through all rules to register tokens, their token families and
    # lexing rules.
    for full_decl in full_lexer.f_decl.f_rules:
        with ctx.lkt_context(full_decl):
            if isinstance(full_decl, liblktlang.LexerFamilyDecl):
                # This is a family block: go through all declarations
                # inside it.
                process_family(full_decl)

            elif isinstance(full_decl, liblktlang.FullDecl):
                # There can be various types of declarations in lexers...
                decl = full_decl.f_decl

                if isinstance(decl, liblktlang.GrammarRuleDecl):
                    # Here we have a token declaration, potentially
                    # associated with a lexing rule.
                    process_token_rule(full_decl)

                elif isinstance(decl, liblktlang.ValDecl):
                    # This is the declaration of a pattern
                    process_pattern(full_decl)

                else:
                    check_source_language(
                        False, 'Unexpected declaration in lexer')

            elif isinstance(full_decl, liblktlang.LexerCaseRule):
                syn_alts = list(full_decl.f_alts)

                # This is a rule for conditional lexing: lower its matcher
                # and its alternative rules.
                matcher = lower_matcher(full_decl.f_expr)
                check_source_language(
                    len(syn_alts) == 2
                    and isinstance(syn_alts[0],
                                   liblktlang.LexerCaseRuleCondAlt)
                    and isinstance(syn_alts[1],
                                   liblktlang.LexerCaseRuleDefaultAlt),
                    'Invalid case rule topology')
                rules.append(Case(matcher,
                                  lower_case_alt(syn_alts[0]),
                                  lower_case_alt(syn_alts[1])))

            else:
                # The grammar should make this branch unreachable
                assert False, 'Invalid lexer rule: {}'.format(full_decl)

    # Create the LexerToken subclass to define all tokens and token families
    items = {}
    for name, token in tokens.items():
        items[name.camel] = token
    for name, token_set in token_family_sets.items():
        tf = TokenFamily(*list(token_set))
        token_families[name] = tf
        items[name.camel] = tf
    token_class = type('Token', (LexerToken, ), items)

    # Create the Lexer instance and register all patterns and lexing rules
    result = Lexer(token_class,
                   'track_indent' in lexer_annot,
                   pre_rules)
    for name, regexp in patterns.items():
        result.add_patterns((name.lower, regexp))
    result.add_rules(*rules)

    # Register spacing/newline rules
    for tf1, tf2 in lexer_annot.get('spacing', []):
        result.add_spacing((lower_family_ref(tf1), lower_family_ref(tf2)))
    result.add_newline_after(*newline_after)

    return result
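Finally, a hedged sketch of how create_lexer might be driven. It assumes a compilation context ctx that provides the lkt_context, find_toplevel_decl and parse_annotations helpers used above; the liblktlang calls follow the standard API of Langkit-generated Python bindings, and the file name is illustrative.

import liblktlang

# Hypothetical driver: parse a Lkt source file, check it is error-free,
# then lower its lexer declaration
lkt_ctx = liblktlang.AnalysisContext()
unit = lkt_ctx.get_from_file('my_language.lkt')
assert not unit.diagnostics, 'Parsing errors: {}'.format(unit.diagnostics)

lexer = create_lexer(ctx, [unit])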