def load(self):
    """Build and return a ``(parser, lexer)`` pair for Ruby.

    The lexer is an ``IncrementalLexerCF`` primed with the module-level
    ``lexingrules`` (pairs of token name and regex); the parser is a fresh
    ``RubyParser`` with an initialised AST.
    """
    from inclexer.inclexer import IncrementalLexerCF
    # Split the (name, regex) pairs into two parallel lists.
    token_names = [name for name, _ in lexingrules]
    token_regexs = [regex for _, regex in lexingrules]
    lexer = IncrementalLexerCF()
    lexer.from_name_and_regex(token_names, token_regexs)
    parser = RubyParser()
    parser.init_ast()
    return parser, lexer
class BootstrapParser(object):
    """Bootstraps an incremental parser/lexer pair from an Eco grammar file.

    ``parse`` reads a grammar written in the "Eco Grammar" language using a
    pre-built parser for that language, walks the resulting AST to collect
    parser rules, lexer rules, options and precedences, and finally builds an
    ``IncParser`` (``self.incparser``) and an ``IncrementalLexerCF``
    (``self.inclexer``) for the described language.
    """

    def __init__(self, lr_type=1, whitespaces=False):
        self.lr_type = lr_type
        self.whitespaces = whitespaces
        # load (old) parser for grammar grammar
        self.rules = {}                  # Nonterminal -> Rule
        self.lrules = []                 # (tokenname, regex) lexer rules, in order
        self.start_symbol = None
        self.incparser = None
        self.inclexer = None
        self.terminals = set()           # terminals seen in parser rules
        self.extra_alternatives = {}     # rule name -> language-box extensions
        self.change_startrule = None
        self.options = {"nowhitespace": []}
        self.precedences = []            # (assoc name, [terminal, ...])
        self.current_rulename = ""
        self.all_terminals = set()       # parser terminals + lexer token names
        self.functions = []              # (safe name, terminals, parent rule)
        self.prod_ids = {}               # Production -> stable numeric id

    def implicit_ws(self):
        """Return True if the grammar requests implicit whitespace handling."""
        # 'has_key' was removed in Python 3; 'in' behaves identically.
        if "implicit_ws" in self.options:
            if self.options["implicit_ws"] == "true":
                return True
        return False

    def implicit_newlines(self):
        """Return False only if the grammar explicitly disables implicit newlines."""
        if "implicit_newlines" in self.options:
            if self.options["implicit_newlines"] == "false":
                return False
        return True

    def indentation_based(self):
        """Return True if the grammar declares indentation-based lexing."""
        if "indentation" in self.options:
            if self.options["indentation"] == "true":
                return True
        return False

    def parse(self, ecogrammar):
        """Parse *ecogrammar* (source text) and build parser and lexer from it.

        Raises Exception when the grammar itself has syntax errors.
        """
        # this is only called for grammars based on Eco Grammar (not Eco Grammar (Eco))
        from grammars.eco_grammar import eco_grammar as grammar
        self.lexer = IncrementalLexer(grammar.priorities)
        self.parser = IncParser(grammar.grammar, 1, True)
        self.parser.init_ast()
        self.ast = self.parser.previous_version.parent
        self.treemanager = TreeManager()
        self.treemanager.add_parser(self.parser, self.lexer, grammar.name)
        self.treemanager.import_file(ecogrammar)
        if self.parser.last_status == False:
            raise Exception("Invalid input grammar due to syntax errors")
        self.read_options()
        self.parse_both()
        self.create_parser()
        self.create_lexer()

    def parse_both(self):
        """Walk the grammar AST and collect both parser and lexer rules."""
        # parse rules
        startrule = self.ast.children[1]  # startrule
        grammar = startrule.children[1]
        parser = grammar.children[0]
        assert parser.symbol.name == "parser"
        self.parse_rules(parser)
        # parse lexer
        startrule = self.ast.children[1]  # startrule
        grammar = startrule.children[1]
        for element in grammar.children:
            if element.symbol.name == "lexer":
                break
        lexer = element
        assert lexer.symbol.name == "lexer"
        self.parse_lexer(lexer)
        for name, regex in self.lrules:
            # collect terminals for parser modifications
            self.all_terminals.add(name)

    def read_options(self):
        """Locate and parse the optional "options" section of the grammar."""
        startrule = self.ast.children[1]  # startrule
        assert startrule.symbol.name == "Startrule"
        grammar = startrule.children[1]
        assert grammar.symbol.name == "grammar"
        for element in grammar.children:
            if element.symbol.name == "options":
                break
        if element.symbol.name != "options":
            # grammar has no options
            print("warning: grammar has no options")
            # backwards compatibility
            if self.whitespaces:
                self.options["implicit_ws"] = "true"
            return
        options = element
        assert options.symbol.name == "options"
        self.parse_options(options)

    def parse_options(self, options):
        """Parse the settings and precedences subtrees of an options node."""
        if options.children == []:
            return
        if len(options.children) > 0:
            assert options.children[0].symbol.name == "settings"
            self.parse_settings(options.children[0])
        if len(options.children) > 1:
            assert options.children[1].symbol.name == "precedences"
            self.parse_precedences(options.children[1])

    def parse_settings(self, options):
        """Recursively read key/value settings into ``self.options``."""
        if options.children == []:
            return
        if len(options.children) == 2:
            more = options.children[0]
            self.parse_settings(more)
            option = options.children[1]
        else:
            option = options.children[0]
        name = option.children[2].symbol.name
        choice = option.children[6]
        assert choice.symbol.name == "choice"
        if choice.children[0].symbol.name == "choice_list":
            # value is a list of names, e.g. nowhitespace={a, b, c}
            self.options[name] = self.parse_choicelist(choice.children[0])
        else:
            self.options[name] = choice.children[0].symbol.name

    def parse_choicelist(self, symbol):
        """Flatten a comma-separated choice_list subtree into a list of names."""
        s = []
        for c in symbol.children:
            if c.symbol.name == ",":
                continue
            if c.symbol.name == "WS":
                continue
            if c.lookup == "nonterminal":
                s.append(c.symbol.name)
                continue
            if c.symbol.name == "choice_list":
                # NOTE(review): recurses on symbol.children[0] rather than c;
                # equivalent only if the nested list is always the first child.
                rec_s = self.parse_choicelist(symbol.children[0])
                s.extend(rec_s)
                continue
        return s

    def parse_precedences(self, precedences):
        """Collect (associativity, terminals) pairs into ``self.precedences``."""
        if precedences.children == []:
            return
        # recursively parse other precedences
        if len(precedences.children) == 2:
            more = precedences.children[0]
            self.parse_precedences(more)
            precedence = precedences.children[1]
        else:
            precedence = precedences.children[0]
        # parse single precedence
        name = precedence.children[0].symbol.name
        terminals = self.parse_precedence_symbols(precedence.children[2])
        self.precedences.append((name, terminals))

    def parse_precedence_symbols(self, symbol):
        """Return the (unquoted) terminals listed in a precedence declaration."""
        s = []
        for c in symbol.children:
            if c.symbol.name == "WS":
                continue
            if c.symbol.name == "terminals":
                # NOTE(review): recurses on symbol.children[0] rather than c;
                # equivalent only if the nested list is always the first child.
                rec_s = self.parse_precedence_symbols(symbol.children[0])
                s.extend(rec_s)
            if c.lookup == "terminal":
                s.append(c.symbol.name[1:-1])  # strip surrounding quotes
        return s

    def create_parser(self, pickle_id=None):
        """Synthesise helper rules and build ``self.incparser`` from ``self.rules``."""
        self.all_terminals.update(self.terminals)
        # Expand grammar functions like *match_until into concrete rules.
        for fname, terminals, parentrule in self.functions:
            if fname.startswith("*match_until"):
                if Nonterminal(fname) not in self.rules:
                    r = Rule(Nonterminal(fname))
                    # match everything except the stop terminals
                    for t in self.all_terminals:
                        if t not in terminals:
                            r.add_alternative(
                                [Nonterminal(fname), Terminal(t)], None, t)
                    r.add_alternative([])
                    self.rules[r.symbol] = r
                # remove whitespace before special rule from parent rule, e.g.
                # multistring ::= "MLS" WS *match_until "MLS" WS
                #                       ^ this WS causes shift/reduce conflicts
                prule = self.rules[Nonterminal(parentrule)]
                for a in prule.alternatives:
                    for i in range(len(a)):
                        sym = a[i]
                        if sym.name == "WS":
                            if len(a) > i + 1 and a[i + 1].name.startswith(
                                    "*match_until"):
                                a.pop(i)
                                break
        if self.implicit_ws():
            # WS ::= WS <ws> | WS comment | WS <return> | ... | (empty)
            ws_rule = Rule()
            ws_rule.symbol = Nonterminal("WS")
            ws_rule.add_alternative([Nonterminal("WS"), Terminal("<ws>")])
            # get comment rule
            if 'comment_rule' in self.options:
                cmt_rules = self.options['comment_rule']
                for cmt_rule in cmt_rules:
                    if Nonterminal(cmt_rule) in self.rules:
                        ws_rule.add_alternative(
                            [Nonterminal("WS"), Nonterminal("comment")])
            if self.implicit_newlines():
                ws_rule.add_alternative(
                    [Nonterminal("WS"), Terminal("<return>")])
                ws_rule.add_alternative([
                    Nonterminal("WS"),
                    Terminal("<backslash>"),
                    Terminal("<return>")
                ])
            ws_rule.add_alternative([])  # or empty
            self.rules[ws_rule.symbol] = ws_rule
            for a in ws_rule.alternatives:
                self.prod_ids[Production(ws_rule.symbol, a)] = len(self.prod_ids)
            # allow whitespace/comments at beginning of file
            start_rule = Rule()
            start_rule.symbol = Nonterminal("Startrule")
            start_rule.add_alternative([Nonterminal("WS"), self.start_symbol])
            self.rules[start_rule.symbol] = start_rule
            self.prod_ids[Production(start_rule.symbol,
                                     start_rule.alternatives[0])] = len(
                                         self.prod_ids)
            self.start_symbol = start_rule.symbol
        incparser = IncParser()
        incparser.from_dict(self.rules, self.start_symbol, self.lr_type,
                            self.implicit_ws(), pickle_id, self.precedences,
                            self.prod_ids)
        incparser.init_ast()
        self.incparser = incparser

    def parse_rules(self, node):
        """Recursively parse the left-recursive list of grammar rules."""
        if node.children[0].symbol.name == "parser":
            self.parse_rules(node.children[0])
            self.parse_rule(node.children[3])
        elif node.children[0].symbol.name == "rule":
            self.parse_rule(node.children[0])

    def parse_rule(self, node):
        """Parse one rule, register it and assign production ids."""
        name = node.children[0].symbol.name
        self.current_rulename = name
        alternatives = self.parse_alternatives(node.children[4])
        symbol = Nonterminal(name)
        # first rule becomes the start symbol unless overridden
        if self.start_symbol is None:
            self.start_symbol = symbol
        if self.change_startrule and symbol.name == self.change_startrule:
            self.start_symbol = symbol
        r = Rule(symbol)
        for a in alternatives:
            r.add_alternative(a[0], a[1], a[2])
            self.prod_ids[Production(symbol, a[0])] = len(self.prod_ids)
        # add additional alternatives to the grammar (grammar extension feature, e.g. languageboxes)
        if symbol.name in self.extra_alternatives:
            for n in self.extra_alternatives[symbol.name]:
                a = [MagicTerminal(n), Nonterminal("WS")]
                r.add_alternative(a)
                self.prod_ids[Production(symbol, a)] = len(self.prod_ids)
        self.rules[symbol] = r

    def parse_alternatives(self, node):
        """Return the list of parsed alternatives of a rule."""
        if node.children[0].symbol.name == "alternatives":
            alternatives = self.parse_alternatives(node.children[0])
            alternative = self.parse_alternative(node.children[3])
            alternatives.append(alternative)
            return alternatives
        elif node.children[0].symbol.name == "right":
            return [self.parse_alternative(node.children[0])]

    def parse_alternative(self, node):
        """Return (symbols, annotation, precedence) for one alternative."""
        if len(node.children) > 0:
            annotation = None
            prec = None
            for c in node.children:
                if c.symbol.name == "symbols":
                    symbols = self.parse_symbols(c)
                if c.symbol.name == "prec":
                    prec = self.parse_prec(c)
                if c.symbol.name == "annotations":
                    annotation = self.parse_annotation(c)
            return (symbols, annotation, prec)
        else:
            return ([], None, None)

    def parse_prec(self, node):
        """Return the unquoted precedence terminal, or None if absent."""
        if node.children:
            c = node.children[2]
            return c.symbol.name[1:-1]

    def parse_symbols(self, node):
        """Parse the symbol list of an alternative, inserting implicit WS."""
        if node.children[0].symbol.name == "symbols":
            symbols = self.parse_symbols(node.children[0])
            symbol = self.parse_symbol(node.children[1])
            symbols.append(symbol)
            # implicit whitespace follows terminals/languageboxes unless
            # the current rule opted out via the 'nowhitespace' option
            if (isinstance(symbol, Terminal)
                    or isinstance(symbol, MagicTerminal)) and self.implicit_ws(
                    ) and self.current_rulename not in self.options["nowhitespace"]:
                symbols.append(Nonterminal("WS"))
            return symbols
        elif node.children[0].symbol.name == "symbol":
            l = []
            symbol = self.parse_symbol(node.children[0])
            l.append(symbol)
            if isinstance(symbol, Terminal) and self.implicit_ws(
            ) and self.current_rulename not in self.options["nowhitespace"]:
                l.append(Nonterminal("WS"))
            return l

    def parse_symbol(self, node):
        """Convert one symbol node into a Nonterminal/Terminal/MagicTerminal."""
        node = node.children[0]
        if node.lookup == "nonterminal":
            return Nonterminal(node.symbol.name)
        elif node.lookup == "terminal":
            # <eos> is special and not registered as a lexable terminal
            if node.symbol.name != "\"<eos>\"":
                self.terminals.add(node.symbol.name[1:-1])
            return Terminal(node.symbol.name[1:-1])
        elif node.lookup == "languagebox":
            return MagicTerminal(node.symbol.name)
        elif node.symbol.name == "function":
            return self.parse_function(node)

    def parse_function(self, node):
        """Register a grammar function call and return its placeholder symbol."""
        fname = node.children[0].symbol.name
        terminals = self.parse_fargs(node.children[4])
        # hash of the argument set disambiguates calls with different args
        safe_name = "*%s%s" % (fname, hash(frozenset(terminals)))
        self.functions.append((safe_name, terminals, self.current_rulename))
        return Nonterminal(safe_name)

    def parse_fargs(self, symbol):
        """Return the (unquoted) terminal arguments of a grammar function."""
        s = []
        for c in symbol.children:
            if c.symbol.name == ",":
                continue
            if c.symbol.name == "WS":
                continue
            if c.lookup == "terminal":
                s.append(c.symbol.name[1:-1])
                continue
            if c.symbol.name == "f_args":
                # NOTE(review): recurses on symbol.children[0] rather than c;
                # equivalent only if the nested list is always the first child.
                rec_s = self.parse_fargs(symbol.children[0])
                s.extend(rec_s)
        return s

    def parse_annotation(self, node):
        """Dispatch an annotation subtree to the matching AST-expression parser."""
        a_options = node.children[2]
        assert a_options.symbol.name == "a_options"
        if a_options.children[0].symbol.name == "astnode":
            return self.parse_astnode(a_options.children[0])
        elif a_options.children[0].symbol.name == "expression":
            return self.parse_expression(a_options.children[0])
        elif a_options.children[0].symbol.name == "forloop":
            return self.parse_foreach(a_options.children[0])

    def parse_astnode(self, node):
        """Build an AstNode from a name and its child assignments."""
        name = node.children[0].symbol.name
        children = self.parse_astnode_children(node.children[4])
        d = {}
        for n, expr in children:
            d[n] = expr
        return AstNode(name, d)

    def parse_astnode_children(self, node):
        """Return the list of (name, expr) pairs of an astnode."""
        assert node.symbol.name == "astnode_children"
        if node.children[0].symbol.name == "astnode_child":
            return [self.parse_astnode_child(node.children[0])]
        elif node.children[0].symbol.name == "astnode_children":
            children = self.parse_astnode_children(node.children[0])
            child = self.parse_astnode_child(node.children[3])
            children.append(child)
            return children

    def parse_astnode_child(self, node):
        """Return one (name, expression-or-reference) astnode assignment."""
        assert node.symbol.name == "astnode_child"
        name = node.children[0].symbol.name
        if node.children[4].symbol.name == "expression":
            expr = self.parse_expression(node.children[4])
        elif node.children[4].symbol.name == "reference":
            expr = self.parse_reference(node.children[4])
        return (name, expr)

    def parse_expression(self, node):
        """Parse a node/list/node_ref expression or an addition of two of them."""
        if node.children[0].symbol.name == "node":
            return self.parse_node(node.children[0])
        elif node.children[0].symbol.name == "list":
            return self.parse_list(node.children[0])
        elif node.children[0].symbol.name == "node_ref":
            return self.parse_noderef(node.children[0])
        else:
            expr1 = self.parse_expression(node.children[0])
            if node.children[3].symbol.name == "node":
                expr2 = self.parse_node(node.children[3])
            else:
                expr2 = self.parse_list(node.children[3])
            return AddExpr(expr1, expr2)

    def parse_foreach(self, node):
        """Build a Foreach expression from a forloop subtree."""
        item = self.parse_node(node.children[4])
        expr = self.parse_astnode(node.children[7])
        return Foreach(node.symbol.name, item, expr)

    def parse_noderef(self, node):
        """Build a lookup expression with an attribute access attached."""
        lookup = self.parse_node(node.children[0])
        attr = node.children[3]
        lookup.attribute = attr.symbol.name
        return lookup

    def parse_node(self, node):
        """Build a LookupExpr from a numbered node reference."""
        return LookupExpr(int(node.children[2].symbol.name))

    def parse_list(self, node):
        """Build a ListExpr from a list literal subtree."""
        return ListExpr(self.parse_listloop(node.children[2]))

    def parse_reference(self, node):
        """Build a ReferenceExpr from base.ref syntax."""
        base = node.children[0].symbol.name
        ref = node.children[4].symbol.name
        return ReferenceExpr(base, ref)

    def parse_listloop(self, node):
        """Return the parsed elements of a (possibly empty) list subtree."""
        if len(node.children) == 0:
            return []
        if node.children[0].symbol.name == "list_loop":
            l = self.parse_listloop(node.children[0])
            element = self.parse_unknown(node.children[3])
            l.append(element)
            return l
        else:
            return [self.parse_unknown(node.children[0])]

    def parse_unknown(self, node):
        """Parse a list element that may be either a node or an astnode."""
        if node.symbol.name == "node":
            return self.parse_node(node)
        elif node.symbol.name == "astnode":
            return self.parse_astnode(node)

    def create_lexer(self, buildlexer=True):
        """Build ``self.inclexer`` from the collected lexer rules.

        With buildlexer=False only the (names, regexs) lists are stored.
        """
        names = []
        regexs = []
        for name, regex in self.lrules:
            names.append(name)
            self.all_terminals.add(name)
            regexs.append(regex)
        # add so far undefined terminals
        undefined_terminals = self.terminals.difference(set(names))
        import re
        for t in undefined_terminals:
            # prepend so literal terminals take priority over token regexes
            names.insert(0, t)
            regexs.insert(0, re.escape(t))
        if not buildlexer:
            self.inclexer = (names, regexs)
            return
        self.inclexer = IncrementalLexerCF()
        self.inclexer.from_name_and_regex(names, regexs)
        if self.indentation_based():
            self.inclexer.indentation_based = True

    def parse_lexer(self, lexer):
        """Recursively parse the left-recursive list of lexer rules."""
        if lexer.children[0].symbol.name == "lrule":
            self.parse_lrule(lexer.children[0])
        elif lexer.children[0].symbol.name == "lexer":
            self.parse_lexer(lexer.children[0])
            self.parse_lrule(lexer.children[1])

    def parse_lrule(self, lrule):
        """Append one (tokenname, unquoted regex) pair to ``self.lrules``."""
        assert lrule.children[0].symbol.name == "tokenname"
        name = lrule.children[0].children[0].symbol.name
        regex = lrule.children[3].symbol.name[1:-1]
        self.lrules.append((name, regex))
class BootstrapParser(object):
    """Legacy bootstrap of an incremental parser/lexer from an Eco grammar.

    NOTE(review): this is a second, older definition of ``BootstrapParser``
    in the same file; being defined later, it shadows the earlier one at
    import time. It lacks the newer features (grammar functions,
    implicit-newline control, 'nowhitespace' option, production ids).
    Confirm which definition callers actually rely on.
    """

    def __init__(self, lr_type=1, whitespaces=False):
        self.lr_type = lr_type
        self.whitespaces = whitespaces
        # load (old) parser for grammar grammar
        self.rules = {}              # Nonterminal -> Rule
        self.lrules = []             # (tokenname, regex) lexer rules, in order
        self.start_symbol = None
        self.incparser = None
        self.inclexer = None
        self.terminals = set()       # terminals seen in parser rules
        self.extra_alternatives = {} # rule name -> language-box extensions
        self.change_startrule = None
        self.options = {}
        self.precedences = []        # (assoc name, [terminal, ...])

    def implicit_ws(self):
        """Return True if the grammar requests implicit whitespace handling."""
        # 'has_key' was removed in Python 3; 'in' behaves identically.
        if "implicit_ws" in self.options:
            if self.options["implicit_ws"] == "true":
                return True
        return False

    def indentation_based(self):
        """Return True if the grammar declares indentation-based lexing."""
        if "indentation" in self.options:
            if self.options["indentation"] == "true":
                return True
        return False

    def parse(self, ecogrammar):
        """Parse *ecogrammar* (source text) and build parser and lexer from it."""
        # this is only called for grammars based on Eco Grammar (not Eco Grammar (Eco))
        from grammars.eco_grammar import eco_grammar as grammar
        self.lexer = IncrementalLexer(grammar.priorities)
        self.parser = IncParser(grammar.grammar, 1, True)
        self.parser.init_ast()
        self.ast = self.parser.previous_version.parent
        self.treemanager = TreeManager()
        self.treemanager.add_parser(self.parser, self.lexer, grammar.name)
        self.treemanager.import_file(ecogrammar)
        if self.parser.last_status == False:
            raise Exception("Invalid input grammar: at %s %s" % (self.parser.error_node.prev_term, self.parser.error_node))
        self.read_options()
        self.create_parser()
        self.create_lexer()

    def read_options(self):
        """Locate and parse the optional "options" section of the grammar."""
        startrule = self.ast.children[1]  # startrule
        assert startrule.symbol.name == "Startrule"
        grammar = startrule.children[1]
        assert grammar.symbol.name == "grammar"
        for element in grammar.children:
            if element.symbol.name == "options":
                break
        if element.symbol.name != "options":
            # grammar has no options
            print("warning: grammar has no options")
            # backwards compatibility
            if self.whitespaces:
                self.options["implicit_ws"] = "true"
            return
        options = element
        assert options.symbol.name == "options"
        self.parse_options(options)

    def parse_options(self, options):
        """Parse the settings and precedences subtrees of an options node."""
        if options.children == []:
            return
        if len(options.children) > 0:
            assert options.children[0].symbol.name == "settings"
            self.parse_settings(options.children[0])
        if len(options.children) > 1:
            assert options.children[1].symbol.name == "precedences"
            self.parse_precedences(options.children[1])

    def parse_settings(self, options):
        """Recursively read key/value settings into ``self.options``."""
        if options.children == []:
            return
        if len(options.children) == 2:
            more = options.children[0]
            self.parse_settings(more)
            option = options.children[1]
        else:
            option = options.children[0]
        name = option.children[2].symbol.name
        choice = option.children[6]
        assert choice.symbol.name == "choice"
        self.options[name] = choice.children[0].symbol.name

    def parse_precedences(self, precedences):
        """Collect (associativity, terminals) pairs into ``self.precedences``."""
        if precedences.children == []:
            return
        # recursively parse other precedences
        if len(precedences.children) == 2:
            more = precedences.children[0]
            self.parse_precedences(more)
            precedence = precedences.children[1]
        else:
            precedence = precedences.children[0]
        # parse single precedence
        name = precedence.children[0].symbol.name
        terminals = self.parse_precedence_symbols(precedence.children[2])
        self.precedences.append((name, terminals))

    def parse_precedence_symbols(self, symbol):
        """Return the (unquoted) terminals listed in a precedence declaration."""
        s = []
        for c in symbol.children:
            if c.symbol.name == "WS":
                continue
            if c.symbol.name == "terminals":
                # NOTE(review): recurses on symbol.children[0] rather than c;
                # equivalent only if the nested list is always the first child.
                rec_s = self.parse_precedence_symbols(symbol.children[0])
                s.extend(rec_s)
            if c.lookup == "terminal":
                s.append(c.symbol.name[1:-1])  # strip surrounding quotes
        return s

    def create_parser(self, pickle_id=None):
        """Collect parser rules from the AST and build ``self.incparser``."""
        startrule = self.ast.children[1]  # startrule
        grammar = startrule.children[1]
        parser = grammar.children[0]
        assert parser.symbol.name == "parser"
        self.parse_rules(parser)
        if self.implicit_ws():
            # WS ::= <ws> WS | <return> WS | <backslash> <return> WS | (empty)
            ws_rule = Rule()
            ws_rule.symbol = Nonterminal("WS")
            ws_rule.add_alternative([Terminal("<ws>"), Nonterminal("WS")])
            ws_rule.add_alternative([Terminal("<return>"), Nonterminal("WS")])
            ws_rule.add_alternative([Terminal("<backslash>"), Terminal("<return>"), Nonterminal("WS")])
            ws_rule.add_alternative([])  # or empty
            self.rules[ws_rule.symbol] = ws_rule
            # allow whitespace/comments at beginning of file
            start_rule = Rule()
            start_rule.symbol = Nonterminal("Startrule")
            start_rule.add_alternative([Nonterminal("WS"), self.start_symbol])
            self.rules[start_rule.symbol] = start_rule
            self.start_symbol = start_rule.symbol
        incparser = IncParser()
        incparser.from_dict(self.rules, self.start_symbol, self.lr_type,
                            self.implicit_ws(), pickle_id, self.precedences)
        incparser.init_ast()
        self.incparser = incparser

    def parse_rules(self, node):
        """Recursively parse the left-recursive list of grammar rules."""
        if node.children[0].symbol.name == "parser":
            self.parse_rules(node.children[0])
            self.parse_rule(node.children[3])
        elif node.children[0].symbol.name == "rule":
            self.parse_rule(node.children[0])

    def parse_rule(self, node):
        """Parse one rule and register it in ``self.rules``."""
        name = node.children[0].symbol.name
        alternatives = self.parse_alternatives(node.children[4])
        symbol = Nonterminal(name)
        # first rule becomes the start symbol unless overridden
        if self.start_symbol is None:
            self.start_symbol = symbol
        if self.change_startrule and symbol.name == self.change_startrule:
            self.start_symbol = symbol
        r = Rule(symbol)
        for a in alternatives:
            r.add_alternative(a[0], a[1], a[2])
        # add additional alternatives to the grammar (grammar extension feature, e.g. languageboxes)
        if symbol.name in self.extra_alternatives:
            for n in self.extra_alternatives[symbol.name]:
                r.add_alternative([MagicTerminal(n), Nonterminal("WS")], None)
        self.rules[symbol] = r

    def parse_alternatives(self, node):
        """Return the list of parsed alternatives of a rule."""
        if node.children[0].symbol.name == "alternatives":
            alternatives = self.parse_alternatives(node.children[0])
            alternative = self.parse_alternative(node.children[3])
            alternatives.append(alternative)
            return alternatives
        elif node.children[0].symbol.name == "right":
            return [self.parse_alternative(node.children[0])]

    def parse_alternative(self, node):
        """Return (symbols, annotation, precedence) for one alternative."""
        if len(node.children) > 0:
            annotation = None
            prec = None
            for c in node.children:
                if c.symbol.name == "symbols":
                    symbols = self.parse_symbols(c)
                if c.symbol.name == "prec":
                    prec = self.parse_prec(c)
                if c.symbol.name == "annotations":
                    annotation = self.parse_annotation(c)
            return (symbols, annotation, prec)
        else:
            return ([], None, None)

    def parse_prec(self, node):
        """Return the unquoted precedence terminal, or None if absent."""
        if node.children:
            c = node.children[2]
            return c.symbol.name[1:-1]

    def parse_symbols(self, node):
        """Parse the symbol list of an alternative, inserting implicit WS."""
        if node.children[0].symbol.name == "symbols":
            symbols = self.parse_symbols(node.children[0])
            symbol = self.parse_symbol(node.children[1])
            symbols.append(symbol)
            if (isinstance(symbol, Terminal) or isinstance(symbol, MagicTerminal)) and self.implicit_ws():
                symbols.append(Nonterminal("WS"))
            return symbols
        elif node.children[0].symbol.name == "symbol":
            l = []
            symbol = self.parse_symbol(node.children[0])
            l.append(symbol)
            if isinstance(symbol, Terminal) and self.implicit_ws():
                l.append(Nonterminal("WS"))
            return l

    def parse_symbol(self, node):
        """Convert one symbol node into a Nonterminal/Terminal/MagicTerminal."""
        node = node.children[0]
        if node.lookup == "nonterminal":
            return Nonterminal(node.symbol.name)
        elif node.lookup == "terminal":
            self.terminals.add(node.symbol.name[1:-1])
            return Terminal(node.symbol.name[1:-1])
        elif node.lookup == "languagebox":
            return MagicTerminal(node.symbol.name)

    def parse_annotation(self, node):
        """Dispatch an annotation subtree to the matching AST-expression parser."""
        a_options = node.children[2]
        assert a_options.symbol.name == "a_options"
        if a_options.children[0].symbol.name == "astnode":
            return self.parse_astnode(a_options.children[0])
        elif a_options.children[0].symbol.name == "expression":
            return self.parse_expression(a_options.children[0])
        elif a_options.children[0].symbol.name == "forloop":
            return self.parse_foreach(a_options.children[0])

    def parse_astnode(self, node):
        """Build an AstNode from a name and its child assignments."""
        name = node.children[0].symbol.name
        children = self.parse_astnode_children(node.children[4])
        d = {}
        for n, expr in children:
            d[n] = expr
        return AstNode(name, d)

    def parse_astnode_children(self, node):
        """Return the list of (name, expr) pairs of an astnode."""
        assert node.symbol.name == "astnode_children"
        if node.children[0].symbol.name == "astnode_child":
            return [self.parse_astnode_child(node.children[0])]
        elif node.children[0].symbol.name == "astnode_children":
            children = self.parse_astnode_children(node.children[0])
            child = self.parse_astnode_child(node.children[3])
            children.append(child)
            return children

    def parse_astnode_child(self, node):
        """Return one (name, expression-or-reference) astnode assignment."""
        assert node.symbol.name == "astnode_child"
        name = node.children[0].symbol.name
        if node.children[4].symbol.name == "expression":
            expr = self.parse_expression(node.children[4])
        elif node.children[4].symbol.name == "reference":
            expr = self.parse_reference(node.children[4])
        return (name, expr)

    def parse_expression(self, node):
        """Parse a node/list/node_ref expression or an addition of two of them."""
        if node.children[0].symbol.name == "node":
            return self.parse_node(node.children[0])
        elif node.children[0].symbol.name == "list":
            return self.parse_list(node.children[0])
        elif node.children[0].symbol.name == "node_ref":
            return self.parse_noderef(node.children[0])
        else:
            expr1 = self.parse_expression(node.children[0])
            if node.children[3].symbol.name == "node":
                expr2 = self.parse_node(node.children[3])
            else:
                expr2 = self.parse_list(node.children[3])
            return AddExpr(expr1, expr2)

    def parse_foreach(self, node):
        """Build a Foreach expression from a forloop subtree."""
        item = self.parse_node(node.children[4])
        expr = self.parse_astnode(node.children[7])
        return Foreach(node.symbol.name, item, expr)

    def parse_noderef(self, node):
        """Build a lookup expression with an attribute access attached."""
        lookup = self.parse_node(node.children[0])
        attr = node.children[3]
        lookup.attribute = attr.symbol.name
        return lookup

    def parse_node(self, node):
        """Build a LookupExpr from a numbered node reference."""
        return LookupExpr(int(node.children[2].symbol.name))

    def parse_list(self, node):
        """Build a ListExpr from a list literal subtree."""
        return ListExpr(self.parse_listloop(node.children[2]))

    def parse_reference(self, node):
        """Build a ReferenceExpr from base.ref syntax."""
        base = node.children[0].symbol.name
        ref = node.children[4].symbol.name
        return ReferenceExpr(base, ref)

    def parse_listloop(self, node):
        """Return the parsed elements of a (possibly empty) list subtree."""
        if len(node.children) == 0:
            return []
        if node.children[0].symbol.name == "list_loop":
            l = self.parse_listloop(node.children[0])
            element = self.parse_unknown(node.children[3])
            l.append(element)
            return l
        else:
            return [self.parse_unknown(node.children[0])]

    def parse_unknown(self, node):
        """Parse a list element that may be either a node or an astnode."""
        if node.symbol.name == "node":
            return self.parse_node(node)
        elif node.symbol.name == "astnode":
            return self.parse_astnode(node)

    def create_lexer(self):
        """Collect lexer rules from the AST and build ``self.inclexer``."""
        startrule = self.ast.children[1]  # startrule
        grammar = startrule.children[1]
        for element in grammar.children:
            if element.symbol.name == "lexer":
                break
        lexer = element
        assert lexer.symbol.name == "lexer"
        self.parse_lexer(lexer)
        names = []
        regexs = []
        for name, regex in self.lrules:
            names.append(name)
            regexs.append(regex)
        # add so far undefined terminals
        undefined_terminals = self.terminals.difference(set(names))
        import re
        for t in undefined_terminals:
            # prepend so literal terminals take priority over token regexes
            names.insert(0, t)
            regexs.insert(0, re.escape(t))
        self.inclexer = IncrementalLexerCF()
        self.inclexer.from_name_and_regex(names, regexs)
        if self.indentation_based():
            self.inclexer.indentation_based = True

    def parse_lexer(self, lexer):
        """Recursively parse the left-recursive list of lexer rules."""
        if lexer.children[0].symbol.name == "lrule":
            self.parse_lrule(lexer.children[0])
        elif lexer.children[0].symbol.name == "lexer":
            self.parse_lexer(lexer.children[0])
            self.parse_lrule(lexer.children[1])

    def parse_lrule(self, lrule):
        """Append one (tokenname, unquoted regex) pair to ``self.lrules``."""
        assert lrule.children[0].symbol.name == "tokenname"
        name = lrule.children[0].children[0].symbol.name
        regex = lrule.children[3].symbol.name[1:-1]
        self.lrules.append((name, regex))