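# Names used below but defined or imported elsewhere in this module: re,
# PriorityLexer, Lexer, LexingError, TextNode, MultiTextNode, Terminal,
# MagicTerminal, IndentationTerminal, BOS, EOS, tokenlen and getlength.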
class IncrementalLexerCF(object):
    """
    Incrementally relexes nodes within the parse tree that have been changed.

    When a node changes, we need to relex that node and all nodes that depend
    on it, which includes nodes both before and after the altered node.
    Preceding nodes are found by inspecting their lookaheads: if a node's
    lookahead reaches into the changed node, it depends on it and must be
    relexed as well. Relexing starts at the earliest node whose lookahead
    reaches into the changed node, and continues until the changed node has
    been passed and relexing doesn't lead to any more changes. Afterwards the
    new nodes are merged back into the parse tree, replacing all previously
    relexed nodes.
    """

    def __init__(self, rules=None, language=""):
        self.indentation_based = False
        self.relexed = set()
        if rules:
            if rules.startswith("%"):
                config_line = rules.splitlines()[0]       # get first line
                self.parse_config(config_line[1:])        # remove %
                rules = "\n".join(rules.splitlines()[1:]) # remove config line
            self.createDFA(rules)

    def parse_config(self, config):
        settings = config.split(",")
        for s in settings:
            name, value = s.split("=")
            if name == "indentation" and value == "true":
                self.indentation_based = True

    def from_name_and_regex(self, names, regexs):
        self.lexer = Lexer(zip(names, regexs))

    def createDFA(self, rules):
        # lex the lexing rules
        pl = PriorityLexer(rules)
        rules = sorted(pl.rules.items(), key=lambda node: node[1][0]) # sort by priority
        # create the lexer automaton from the rules
        regexs = []
        names = []
        for regex, _ in rules:
            name = pl.rules[regex][1]
            regexs.append(regex)
            names.append(name)
        self.lexer = Lexer(zip(names, regexs))

    def is_indentation_based(self):
        return self.indentation_based

    def lex(self, text):
        tokens = self.lexer.lex(text)
        return self.reformat_tokens(tokens)

    def reformat_tokens(self, tokens):
        l = []
        for t in tokens:
            l.append((t[0], t[1]))
        return l

    def relex_import(self, startnode, version=0):
        """Optimised relex for freshly imported files."""
        success = self.lex(startnode.symbol.name)
        bos = startnode.prev_term # bos
        parent = bos.parent
        eos = parent.children.pop()
        last_node = bos
        for match in success:
            if match is success[0]:
                # reuse the old node for the first node to mimic the behaviour
                # of a normal relex
                node = startnode
                node.symbol.name = match[0]
            else:
                node = TextNode(Terminal(match[0]))
            node.lookup = match[1]
            parent.children.append(node)
            last_node.next_term = node
            last_node.right = node
            node.left = last_node
            node.prev_term = last_node
            node.parent = parent
            last_node = node
            node.mark_changed()
        parent.children.append(eos)
        last_node.right = eos # link to eos
        last_node.next_term = eos
        eos.left = last_node
        eos.prev_term = last_node
        bos.mark_changed()
        eos.mark_changed()
        parent.mark_changed()

    def relex(self, node):
        # Find the farthest node that has lookahead into `node`, then start
        # munching tokens and spitting out nodes. If a generated node already
        # exists => stop (but only once we have passed the edited node).
        self.relexed = set()

        if type(node.parent) is MultiTextNode:
            # When changing a node within a MultiTextNode we need to relex the
            # whole MultiTextNode
            node = node.parent

        # find the node at which to start relexing
        startnode = node
        node = self.find_preceeding_node(node)

        while isinstance(node.symbol, IndentationTerminal):
            node = node.next_term

        if node is startnode:
            past_startnode = True
        else:
            past_startnode = False

        if isinstance(node, EOS):
            # nothing to do here
            return False

        # relex
        read_nodes = []
        generated_tokens = []
        pos = 0  # read tokens
        read = 0 # generated tokens
        current_node = node
        next_token = self.lexer.get_token_iter(node).next

        combos = []
        last_read = None
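        # The loop below pulls tokens from the lexer until it either runs out
        # of input (StopIteration) or fails (LexingError). `tokenslength`
        # accumulates the length of the newly generated tokens, `readlength`
        # the length of the old nodes consumed to produce them; whenever the
        # two are equal, the new and old token streams are aligned and we can
        # check whether relexing has stopped changing anything.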
        tokenslength = 0
        readlength = 0
        toks = []
        read = []
        pairs = []
        lookaheads = []
        error = None
        i = 0
        while True:
            try:
                token = next_token()
                lookaheads.append(token[2])
                if not past_startnode:
                    for temp in token[3]:
                        if temp is startnode:
                            past_startnode = True
                            break
                toks.append([x for i, x in enumerate(token) if i != 3])
                tokenslength += tokenlen(token[0])
                for r in token[3]:
                    if not read or r is not read[-1]:
                        # skip nodes already read by previous tokens
                        read.append(r)
                        if not isinstance(r.symbol, IndentationTerminal):
                            readlength += getlength(r)
                if tokenslength == readlength:
                    # the newly generated tokens match the read tokens, i.e.
                    # the two streams are aligned again.
                    # Abort relexing if we relexed a node to itself AFTER we
                    # passed `startnode`. This way we avoid relexing nodes that
                    # don't need to be relexed.
                    if past_startnode and read[-1] is not startnode:
                        if len(token[3]) == 1:
                            assert r is token[3][0]
                            if r.symbol.name == token[0] and r.lookup == token[1]:
                                toks.pop()
                                read.pop()
                                break
            except StopIteration:
                break
            except LexingError as e:
                if read and type(read[-1]) is MultiTextNode:
                    pairs = []
                    startnode.changed = True
                    raise e
                # The lexer failed to repair everything. See if it managed to
                # lex parts of the changes (toks contains tokens) and if so,
                # integrate them into the parse tree. The partly lexed tokens
                # will have bigger lookaheads than usual as they depend on the
                # text parts that couldn't be relexed. This might involve
                # splitting up a node, resulting in leftover text that couldn't
                # be lexed at this point. Put that text into a new node and
                # also separate out any newlines contained within.
                error = e
                if toks:
                    leftover = readlength - tokenslength
                    if leftover > 0:
                        name = read[-1].symbol.name[-leftover:]
                        for part in re.split("(\r)", name):
                            if part == "":
                                # Splitting consecutive newlines yields
                                # additional empty strings in the result. Don't
                                # add them into the tree. See
                                # Test_Relexing::test_lexingerror_bug.
                                continue
                            toks.append((part, "<E>", 1))
                    pairs.append((toks, read))
                else:
                    # There are no partial matches, so re-mark the startnode as
                    # an error
                    startnode.changed = True
                    if not past_startnode:
                        # When a lexing error occurs before we have reached the
                        # newly inserted node (startnode), try to continue
                        # lexing from startnode onwards.
                        # See Test_Relexing::test_newline_after_error
                        next_token = self.lexer.get_token_iter(startnode).next
                        past_startnode = True
                        continue
                break

        if not toks:
            # If there is nothing to merge, either re-raise the LexingError if
            # there was one or return False (= no changes)
            if error:
                raise error
            else:
                return False

        changed = False
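        # A node's `lookback` records how many terminals we have to walk
        # backwards to reach the earliest node whose lookahead still reaches
        # into it; `update_lookback` below recomputes these counts from the
        # lookahead values recorded while relexing.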
        # We have to remember the location at which we started relexing. This
        # allows us to properly update all lookback values, even if nodes have
        # been inserted before the starting node or nodes were moved into a
        # multitext node. Otherwise we might only update some of the nodes.
        if read[0].ismultichild():
            node_before_changes = read[0].parent.prev_term
        else:
            node_before_changes = read[0].prev_term

        if self.merge_back(toks, read):
            changed = True

        # update lookback counts using lookaheads
        self.update_lookback(node_before_changes.next_term, startnode)

        if error:
            raise error

        return changed

    def update_lookback(self, node, startnode):
        n = node
        la_list = []
        past_node = False
        while True:
            if n is startnode:
                past_node = True

            # indentation tokens are skipped in StringWrapper, so skip them
            # here as well
            while isinstance(n.symbol, IndentationTerminal):
                n = n.next_term

            if isinstance(n, EOS):
                break

            # compute lookback (removes old lookbacks)
            la_list = [(name, la, cnt) for name, la, cnt in la_list if la > 0]
            newlookback = max(la_list, key=lambda item: item[2])[2] if la_list else 0
            if not self.was_relexed(n) and n.lookback == newlookback and past_node:
                break
            n.lookback = newlookback

            # advance
            offset = getlength(n)
            la_list = [(name, la - offset, cnt + 1) for name, la, cnt in la_list]
            # add
            la_list.append((n.symbol.name, n.lookahead, 1))

            n = n.next_term

    def was_relexed(self, node):
        return node in self.relexed

    def iter_gen(self, tokens):
        for t in tokens:
            if type(t[0]) is list:
                yield ("new mt", t[1], t[2])
                for x in t[0]:
                    yield (x, t[1], t[2])
                yield ("finish mt", None, None)
            else:
                yield t
        while True:
            yield None

    def iter_read(self, nodes):
        for n in nodes:
            if isinstance(n, MultiTextNode):
                # since we are removing elements from the original list during
                # iteration we need to create a copy to not skip anything
                for x in list(n.children):
                    yield x
            else:
                yield n
        while True:
            yield None

    def remove_check(self, node):
        if isinstance(node.parent, MultiTextNode):
            if len(node.parent.children) == 0:
                node.parent.remove()
            else:
                node.parent.update_children()

    def merge_back(self, tokens, read):
        if len(tokens) == 1 and tokens[0][0] == "\x81":
            return False

        lastread = read[0].prev_term

        it_gen = self.iter_gen(tokens)
        it_read = self.iter_read(read)

        gen = it_gen.next()
        read = it_read.next()

        totalr = 0
        totalg = 0
        reused = set()
        current_mt = None
        changed = False
        while True:
            while read is not None and isinstance(read.symbol, IndentationTerminal):
                read.remove()
                read = it_read.next()
            if gen is None and read is None:
                break
            if read and read.deleted:
                read = it_read.next()
                continue
            if gen is None:
                lengen = 0
            elif gen[0] == "new mt":
                if read and read.ismultichild() and read.parent not in reused:
                    current_mt = read.parent # reuse
                else:
                    current_mt = MultiTextNode() # create new
                    # insert the multinode under the same parent as the nodes
                    # it replaces
                    lastread.insert_after(current_mt)
                    changed = True
                if current_mt.lookup != gen[1]:
                    changed = True
                current_mt.lookup = gen[1]
                current_mt.lookahead = gen[2]
                self.relexed.add(current_mt)
                gen = it_gen.next()
                continue
            elif gen[0] == "finish mt":
                reused.add(current_mt)
                lastread = current_mt
                gen = it_gen.next()
                current_mt.update_children()
                current_mt = None
                continue
            else:
                lengen = len(gen[0])
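            # `totalr` and `totalg` track the total text length of the old
            # nodes read and of the new tokens generated so far. Comparing
            # them shows whether the two streams are still aligned and, if
            # not, in which direction they diverge.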
            if totalr >= totalg + lengen:
                changed = True
                # One node has been split into multiple nodes. Insert all
                # remaining nodes until the lengths add up again.
                new = TextNode(Terminal(gen[0]))
                self.relexed.add(new)
                new.lookup = gen[1]
                if new.lookup == "<E>":
                    # If this token comes from the leftovers of a LexingError,
                    # mark it appropriately
                    new.changed = True # XXX with error recovery, mark as error
                new.lookahead = gen[2]
                if current_mt and not lastread.ismultichild():
                    current_mt.insert_at_beginning(new)
                else:
                    lastread.insert_after(new)
                lastread = new
                totalg += lengen
                gen = it_gen.next()
            elif totalr + getlength(read) <= totalg:
                changed = True
                # Multiple nodes have been combined into fewer nodes. Delete
                # old nodes until the lengths add up again.
                read.remove()
                self.remove_check(read)
                totalr += getlength(read)
                read = it_read.next()
            else:
                # Overwrite old nodes with updated values. Move nodes into or
                # out of multinodes if needed.
                totalr += getlength(read)
                totalg += lengen
                if read.lookup != gen[1]:
                    read.mark_changed()
                    self.relexed.add(read)
                    changed = True
                else:
                    read.mark_changed()
                if not isinstance(read.symbol, MagicTerminal):
                    read.symbol.name = gen[0].replace("\x81", "")
                    read.lookup = gen[1]
                    read.lookahead = gen[2]
                    self.relexed.add(read)
                else:
                    read.lookup = gen[1]
                if not current_mt:
                    if read.ismultichild():
                        # The read node was previously part of a multinode but
                        # has been updated to a normal node. Remove it from the
                        # multinode.
                        read.remove(True)
                        read.deleted = False
                        self.remove_check(read)
                        lastread.insert_after(read)
                        changed = True
                else:
                    if not read.ismultichild() or current_mt is not read.parent:
                        # The read node has been moved from a normal node into
                        # a multinode or from one multinode into another
                        # multinode. Remove it from its old location and insert
                        # it into the new one.
                        read.remove(True)
                        read.deleted = False
                        self.remove_check(read)
                        if current_mt.isempty():
                            current_mt.set_children([read])
                        else:
                            lastread.insert_after(read)
                        changed = True
                lastread = read
                read = it_read.next()
                gen = it_gen.next()
        return changed

    def find_preceeding_node(self, node):
        original = node
        if node.lookback == -1:
            node = node.prev_term
            while isinstance(node.symbol, IndentationTerminal):
                node = node.prev_term
        if isinstance(node.symbol, MagicTerminal) and node.lookback <= 0:
            # The token was created next to a language box and the language
            # box is not part of an in-progress string/comment.
            return original
        for i in range(node.lookback):
            while isinstance(node.symbol, IndentationTerminal):
                node = node.prev_term
            node = node.prev_term
        if type(node) is BOS:
            node = node.next_term
        return node
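
if __name__ == "__main__":
    # Minimal smoke-test sketch (not part of the original module). The rules
    # syntax below, one '"regex":NAME' pair per line with highest priority
    # first, is an assumption about PriorityLexer's input format; adjust it
    # to the project's actual grammar files if it differs.
    demo_rules = '"[0-9]+":INT\n"[+*/-]":OP\n"[ \\t]+":WS'
    demo_lexer = IncrementalLexerCF(demo_rules)
    # lex() returns (lexeme, token-name) pairs via reformat_tokens(); full
    # incremental relexing is driven through relex(node) after an edit, or
    # relex_import(startnode) right after importing a whole file.
    print demo_lexer.lex("1+2*3")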