def createDFA(self, rules):
    # lex lexing rules
    pl = PriorityLexer(rules)
    rules = sorted(pl.rules.items(), key=lambda node: node[1][0])  # sort by priority
    # create lexer automaton from rules
    regexs = []
    names = []
    for k, _ in rules:
        regex = k
        name = pl.rules[k][1]
        r = parse_regex(regex)
        regexs.append(r)
        names.append(name)
    self.lexer = Lexer(regexs, names)
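To make the priority-sorting step concrete in isolation, a minimal sketch using a hypothetical dict in the shape PriorityLexer produces (regex string mapped to a (priority, token name) tuple); the three rules below are made up for illustration:

# Hypothetical stand-in for PriorityLexer(rules).rules
rules = {
    "[a-zA-Z_][a-zA-Z0-9_]*": (1, "NAME"),
    "[0-9]+": (0, "INT"),
    " +": (2, "<ws>"),
}

# Sort by priority, then split into the parallel lists that Lexer() expects.
sorted_rules = sorted(rules.items(), key=lambda item: item[1][0])
regex_strings = [regex for regex, _ in sorted_rules]
names = [rules[regex][1] for regex, _ in sorted_rules]
assert names == ["INT", "NAME", "<ws>"]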
def setup_class(cls):
    from rpython.rlib.parsing.parsing import PackratParser
    regexs, rules, ToAST = parse_ebnf(grammar)
    cls.ToAST = ToAST()
    cls.parser = PackratParser(rules, rules[0].nonterminal)
    cls.regexs = regexs
    names, regexs = zip(*regexs)
    cls.lexer = Lexer(list(regexs), list(names))
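For context, a sketch of the kind of EBNF grammar string this fixture expects, in the style of the rpython.rlib.parsing examples; the parse helper and its tokenize/parse/transform calls are assumptions about the usual flow, not something taken from this fixture:

grammar = """
IGNORE: " ";
DECIMAL: "0|[1-9][0-9]*";
additive: multitive "+" additive | multitive;
multitive: primary "*" multitive | primary;
primary: "(" additive ")" | DECIMAL;
"""

def parse(self, source):
    # Assumed flow (not part of the fixture): tokenize with the generated
    # lexer, parse with the PackratParser, then collapse the parse tree into
    # an AST with the generated ToAST visitor.
    tokens = self.lexer.tokenize(source, eof=True)
    tree = self.parser.parse(tokens)
    return self.ToAST.transform(tree)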
def make_lexer():
    return Lexer([parse_regex(globals()[r]) for r in tokens], tokens[:])
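A sketch of the module-level convention make_lexer() relies on: tokens lists the token names, and each name is also a global variable holding that token's regex. The two tokens below are hypothetical:

# Hypothetical token table; make_lexer() looks each name up via globals().
INT = "[0-9]+"
NAME = "[a-zA-Z_][a-zA-Z0-9_]*"
tokens = ["INT", "NAME"]

lexer = make_lexer()  # Lexer([parse_regex(INT), parse_regex(NAME)], ["INT", "NAME"])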
def from_name_and_regex(self, names, regexs):
    parsed_regexs = []
    for regex in regexs:
        r = parse_regex(regex)
        parsed_regexs.append(r)
    self.lexer = Lexer(parsed_regexs, names)
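A usage sketch, assuming this method is bound to a class such as the IncrementalLexerCF below, and that the three illustrative regexes are accepted by the rpython regex parser:

incl = IncrementalLexerCF()
incl.from_name_and_regex(
    ["INT", "NAME", "<ws>"],
    ["[0-9]+", "[a-zA-Z_][a-zA-Z0-9_]*", " +"],
)
# lex() should then yield (source, name) pairs, e.g. for "foo 42" roughly
# [("foo", "NAME"), (" ", "<ws>"), ("42", "INT")]
print(incl.lex("foo 42"))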
class IncrementalLexerCF(object):

    def __init__(self, rules=None, language=""):
        self.indentation_based = False
        if rules:
            if rules.startswith("%"):
                config_line = rules.splitlines()[0]        # get first line
                self.parse_config(config_line[1:])         # remove %
                rules = "\n".join(rules.splitlines()[1:])  # remove config line
            self.createDFA(rules)

    def parse_config(self, config):
        settings = config.split(",")
        for s in settings:
            name, value = s.split("=")
            if name == "indentation" and value == "true":
                self.indentation_based = True

    def from_name_and_regex(self, names, regexs):
        parsed_regexs = []
        for regex in regexs:
            r = parse_regex(regex)
            parsed_regexs.append(r)
        self.lexer = Lexer(parsed_regexs, names)

    def createDFA(self, rules):
        # lex lexing rules
        pl = PriorityLexer(rules)
        rules = sorted(pl.rules.items(), key=lambda node: node[1][0])  # sort by priority
        # create lexer automaton from rules
        regexs = []
        names = []
        for k, _ in rules:
            regex = k
            name = pl.rules[k][1]
            r = parse_regex(regex)
            regexs.append(r)
            names.append(name)
        self.lexer = Lexer(regexs, names)

    def is_indentation_based(self):
        return self.indentation_based

    def lex(self, text):
        tokens = self.lexer.tokenize(text)
        return self.reformat_tokens(tokens)

    def reformat_tokens(self, tokens):
        l = []
        for t in tokens:
            l.append((t.source, t.name))
        return l

    def relex_import(self, startnode, version=0):
        success = self.lex(startnode.symbol.name)
        bos = startnode.prev_term  # bos
        startnode.parent.remove_child(startnode)
        parent = bos.parent
        eos = parent.children.pop()
        last_node = bos
        for match in success:
            node = TextNode(Terminal(match[0]))
            node.version = version
            node.lookup = match[1]
            parent.children.append(node)
            last_node.next_term = node
            last_node.right = node
            node.left = last_node
            node.prev_term = last_node
            node.parent = parent
            last_node = node
        parent.children.append(eos)
        last_node.right = eos  # link to eos
        last_node.next_term = eos
        eos.left = last_node
        eos.prev_term = last_node

    def split_endcomment(self, node):
        read_nodes = [node]
        generated_tokens = []
        l = node.symbol.name.split("*/", 1)
        t1 = self.lexer.tokenize(l[0])
        generated_tokens.extend(t1)
        t2 = self.lexer.tokenize("*/")
        generated_tokens.extend(t2)
        if l[1] != "":
            t3 = self.lexer.tokenize(l[1])
            generated_tokens.extend(t3)
        self.merge_back(read_nodes, generated_tokens)

    def relex(self, node):
        # find farthest node that has lookahead into node
        # start munching tokens and spit out nodes
        # if generated node already exists => stop
        # (only if we passed edited node)

        # find node to start relexing
        startnode = node
        nodes = self.find_preceeding_nodes(node)
        if nodes:
            node = nodes[0]
        if node is startnode:
            past_startnode = True
        else:
            past_startnode = False

        if isinstance(node, EOS):
            # nothing to do here
            return False

        # relex
        read_nodes = []
        generated_tokens = []
        pos = 0   # read tokens
        read = 0  # generated tokens
        current_node = node
        next_token = self.lexer.get_token_iter(StringWrapper(node))
        while True:
            token = next_token()
            if token.source == "":
                read_nodes.append(current_node)
                break
            read += len(token.source)
            # special case: when inserting a newline into a string, the lexer
            # creates a single token. We need to make sure that the newline
            # gets lexed into its own token
            if len(token.source) > 1 and token.source.find("\r") >= 0:
                l = token.source.split("\r")
                for e in l:
                    t = self.lexer.tokenize(e)
                    generated_tokens.extend(t)
                    if e is not l[-1]:
                        newline = self.lexer.tokenize("\r")
                        generated_tokens.extend(newline)
            else:
                generated_tokens.append(token)
            while read > pos + len(current_node.symbol.name):
                pos += len(current_node.symbol.name)
                read_nodes.append(current_node)
                current_node = current_node.next_term
                if current_node is startnode:
                    past_startnode = True
            if past_startnode and read == pos + len(current_node.symbol.name):
                read_nodes.append(current_node)
                break

        return self.merge_back(read_nodes, generated_tokens)

    def merge_back(self, read_nodes, generated_tokens):
        any_changes = False
        # insert new nodes into tree
        it = iter(read_nodes)
        for t in generated_tokens:
            try:
                node = it.next()
            except StopIteration:
                node = TextNode(Terminal(""))
                last_node.insert_after(node)
                any_changes = True
            last_node = node
            node.symbol.name = t.source
            node.indent = None
            if node.lookup != t.name:
                node.mark_changed()
                any_changes = True
            else:
                node.mark_version()
            # we need to invalidate the newline if we changed whitespace or
            # logical nodes that come after it
            if node.lookup == "<ws>" or node.lookup != t.name:
                prev = node.prev_term
                while isinstance(prev.symbol, IndentationTerminal):
                    prev = prev.prev_term
                if prev.lookup == "<return>":
                    prev.mark_changed()
                    any_changes = True
                elif isinstance(prev, BOS):
                    # if there is no return, re-indentation won't be triggered
                    # in the incremental parser so we have to mark the next
                    # terminal. possibly only use case: bos <ws> pass DEDENT eos
                    node.next_term.mark_changed()  # XXX this should become necessary with incparse optimisations turned on
            if node.lookup == "\\" and node.next_term.lookup == "<return>":
                node.next_term.mark_changed()
                any_changes = True
            node.lookup = t.name
            node.lookahead = t.lookahead
        # delete left over nodes
        while True:
            try:
                node = it.next()
                node.parent.remove_child(node)
                any_changes = True
            except StopIteration:
                break
        return any_changes

    def find_preceeding_nodes(self, node):
        chars = 0
        nodes = []
        if node.symbol.name == "\r":
            # if at line beginning there are no previous nodes to consider
            return nodes
        while True:
            node = node.prev_term
            if node.lookahead and node.lookahead > chars:
                nodes.insert(0, node)
                chars += len(node.symbol.name)
            else:
                break
        return nodes
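A small sketch of the config-header handling above: a rules string whose first line starts with "%" carries comma-separated name=value settings, and __init__ strips that line before building the DFA. parse_config() can also be exercised on its own:

incl = IncrementalLexerCF()              # no rules given, so no DFA is built
incl.parse_config("indentation=true")    # what __init__ does with a leading "%..." line
assert incl.is_indentation_based()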
class IncrementalLexerCF(object):
    """
    Incremental lexer that works in accordance with the lexer described in:

        Wagner, Tim A. Practical Algorithms for Incremental Software
        Development Environments. Diss. University of California,
        Berkeley, 1997.

    This incremental lexer works by relexing only the piece of code that may
    have been affected: all tokens whose lookahead reaches into the altered
    node (often just one), the altered token itself, and subsequent tokens
    until the newly generated tokens merge back into the existing stream.

    @TODO make more clear
    @TODO the merge should use zip
    """

    def __init__(self, rules=None, language=""):
        self.indentation_based = False
        if rules:
            if rules.startswith("%"):
                config_line = rules.splitlines()[0]        # get first line
                self.parse_config(config_line[1:])         # remove %
                rules = "\n".join(rules.splitlines()[1:])  # remove config line
            self.createDFA(rules)

    def parse_config(self, config):
        settings = config.split(",")
        for s in settings:
            name, value = s.split("=")
            if name == "indentation" and value == "true":
                self.indentation_based = True

    def from_name_and_regex(self, names, regexs):
        parsed_regexs = []
        for regex in regexs:
            r = parse_regex(regex)
            parsed_regexs.append(r)
        self.lexer = Lexer(parsed_regexs, names)

    def createDFA(self, rules):
        # lex lexing rules
        pl = PriorityLexer(rules)
        rules = sorted(pl.rules.items(), key=lambda node: node[1][0])  # sort by priority
        # create lexer automaton from rules
        regexs = []
        names = []
        for k, _ in rules:
            regex = k
            name = pl.rules[k][1]
            r = parse_regex(regex)
            regexs.append(r)
            names.append(name)
        self.lexer = Lexer(regexs, names)

    def is_indentation_based(self):
        return self.indentation_based

    def lex(self, text):
        tokens = self.lexer.tokenize(text)
        return self.reformat_tokens(tokens)

    def reformat_tokens(self, tokens):
        l = []
        for t in tokens:
            l.append((t.source, t.name))
        return l

    def relex_import(self, startnode, version=0):
        """
        Replace a node with the tokens of its name.

        :param startnode: node to expand
        :param version: version assigned to each created node
        :return:
        """
        success = self.lex(startnode.symbol.name)
        bos = startnode.prev_term  # bos
        startnode.parent.remove_child(startnode)
        parent = bos.parent
        eos = parent.children.pop()
        last_node = bos
        for match in success:
            node = TextNode(Terminal(match[0]))
            node.version = version
            node.lookup = match[1]
            parent.children.append(node)
            last_node.next_term = node
            last_node.right = node
            node.left = last_node
            node.prev_term = last_node
            node.parent = parent
            last_node = node
        parent.children.append(eos)
        last_node.right = eos  # link to eos
        last_node.next_term = eos
        eos.left = last_node
        eos.prev_term = last_node

    def split_endcomment(self, node):
        read_nodes = [node]
        generated_tokens = []
        l = node.symbol.name.split("*/", 1)
        t1 = self.lexer.tokenize(l[0])
        generated_tokens.extend(t1)
        t2 = self.lexer.tokenize("*/")
        generated_tokens.extend(t2)
        if l[1] != "":
            t3 = self.lexer.tokenize(l[1])
            generated_tokens.extend(t3)
        self.merge_back(read_nodes, generated_tokens)

    def relex(self, node):
        """
        Relex a node's environment.

        The environment starts at the farthest preceding node that has a
        lookahead into the given node. From there the lexer munches tokens
        and spits out nodes, stopping once a generated node already exists
        (but only after the edited node has been passed).

        :param node: a node that needs to be relexed
        :return:
        """
        # find node to start relexing
        startnode = node
        nodes = self.find_preceeding_nodes(node)
        if nodes:
            node = nodes[0]
        if node is startnode:
            past_startnode = True
        else:
            past_startnode = False

        if isinstance(node, EOS):
            # nothing to do here, the first node to reparse is the EOS
            return False

        # relex
        read_nodes = []
        generated_tokens = []
        pos = 0   # read tokens
        read = 0  # generated tokens
        current_node = node
        next_token = self.lexer.get_token_iter(StringWrapper(node))
        while True:
            token = next_token()
            if token.source == "":
                read_nodes.append(current_node)
                break
            read += len(token.source)
            # special case: when inserting a newline into a string, the lexer
            # creates a single token. We need to make sure that the newline
            # gets lexed into its own token
            if len(token.source) > 1 and token.source.find("\r") >= 0:
                l = token.source.split("\r")
                for e in l:
                    t = self.lexer.tokenize(e)
                    generated_tokens.extend(t)
                    if e is not l[-1]:
                        newline = self.lexer.tokenize("\r")
                        generated_tokens.extend(newline)
            else:
                generated_tokens.append(token)
            while read > pos + len(current_node.symbol.name):
                pos += len(current_node.symbol.name)
                read_nodes.append(current_node)
                current_node = current_node.next_term
                if current_node is startnode:
                    past_startnode = True
            if past_startnode and read == pos + len(current_node.symbol.name):
                read_nodes.append(current_node)
                break

        return self.merge_back(read_nodes, generated_tokens)

    def merge_back(self, read_nodes, generated_tokens):
        """
        Replace the symbols in the nodes with the newly generated tokens.

        We loop over read_nodes and generated_tokens at the same pace and
        replace each read_node.symbol.name with the corresponding
        generated_token.source. We also update the node's lookup (the type of
        the token); if it changed, the node is marked changed (a node whose
        lookup did not change looks identical to the parser).

        If the lists are of unequal length:
          - if there are more generated_tokens than read_nodes, new nodes are inserted
          - leftover read_nodes are removed

        :param read_nodes: nodes that have been read by the relexer
        :param generated_tokens: tokens that have been found during relexing
        :return:
        """
        any_changes = False
        # insert new nodes into tree
        it = iter(read_nodes)
        for t in generated_tokens:
            try:
                node = it.next()
            except StopIteration:
                node = TextNode(Terminal(""))
                last_node.insert_after(node)
                any_changes = True
            last_node = node
            node.symbol.name = t.source
            node.indent = None
            if node.lookup != t.name:
                node.mark_changed()
                any_changes = True
            else:
                node.mark_version()
            # we need to invalidate the newline if we changed whitespace or
            # logical nodes that come after it
            if node.lookup == "<ws>" or node.lookup != t.name:
                prev = node.prev_term
                while isinstance(prev.symbol, IndentationTerminal):
                    prev = prev.prev_term
                if prev.lookup == "<return>":
                    prev.mark_changed()
                    any_changes = True
                elif isinstance(prev, BOS):
                    # if there is no return, re-indentation won't be triggered
                    # in the incremental parser so we have to mark the next
                    # terminal. possibly only use case: bos <ws> pass DEDENT eos
                    node.next_term.mark_changed()  # XXX this should become necessary with incparse optimisations turned on
            if node.lookup == "\\" and node.next_term.lookup == "<return>":
                node.next_term.mark_changed()
                any_changes = True
            node.lookup = t.name
            node.lookahead = t.lookahead
        # delete left over nodes
        while True:
            try:
                node = it.next()
                node.parent.remove_child(node)
                any_changes = True
            except StopIteration:
                break
        return any_changes

    def find_preceeding_nodes(self, node):
        """
        Traverse backwards within the line to find the nodes that have a
        lookahead into the given node.

        :param node: the (adjusted) node to start from
        :return: a list of nodes that have a lookahead into the given node
        """
        chars = 0
        nodes = []
        if node.symbol.name == "\r":
            # if at line beginning there are no previous nodes to consider
            return nodes
        while True:
            node = node.prev_term
            if node.lookahead and node.lookahead > chars:
                nodes.insert(0, node)
                chars += len(node.symbol.name)
            else:
                break
        return nodes
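To illustrate the backwards lookahead walk of find_preceeding_nodes in isolation, a sketch with stub nodes; FakeSymbol and FakeNode are stand-ins carrying only the fields the walk touches, not the real tree classes:

class FakeSymbol(object):
    def __init__(self, name):
        self.name = name

class FakeNode(object):
    def __init__(self, name, lookahead, prev_term=None):
        self.symbol = FakeSymbol(name)
        self.lookahead = lookahead
        self.prev_term = prev_term

bos = FakeNode("", 0)
ident = FakeNode("foo", 1, prev_term=bos)   # lexed with a lookahead of 1 character
edited = FakeNode("r", 0, prev_term=ident)  # the node that was just edited

# ident looked 1 > 0 characters ahead, so it belongs to the relexed region;
# bos did not look ahead, so the walk stops there.
nodes = IncrementalLexerCF().find_preceeding_nodes(edited)
assert nodes == [ident]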