Code example #1
    def createDFA(self, rules):
        # lex lexing rules
        pl = PriorityLexer(rules)
        rules = sorted(pl.rules.items(), key=lambda node: node[1][0]) # sort by priority

        # create lexer automaton from rules
        regexs = []
        names = []
        for regex, _ in rules:
            name = pl.rules[regex][1]
            regexs.append(regex)
            names.append(name)
        self.lexer = Lexer(zip(names, regexs))
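
For reference, a minimal sketch of the data flow through createDFA, assuming
(as the code above implies) that PriorityLexer.rules maps each regex to a
(priority, name) tuple; the rule strings and token names below are purely
illustrative:

# Illustrative only: mimic the assumed shape of PriorityLexer.rules
rules = {
    r"[a-zA-Z_][a-zA-Z0-9_]*": (1, "NAME"),
    r"[0-9]+":                 (0, "INT"),
}
items = sorted(rules.items(), key=lambda item: item[1][0])  # sort by priority
regexs = [regex for regex, _ in items]
names = [name for _, (_, name) in items]
pairs = list(zip(names, regexs))  # what Lexer receives: [("INT", "[0-9]+"), ("NAME", ...)]
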
Code example #2
    def from_name_and_regex(self, names, regexs):
        self.lexer = Lexer(zip(names, regexs))
Code example #3
class IncrementalLexerCF(object):
    """
    Incrementally relexes nodes within the parse tree that have been changed.

    When a node changes we need to relex that node and all nodes that are
    dependent on it. This includes nodes before and after the altered node.
    Preceding nodes are found by inspecting their lookaheads: if a node's
    lookahead reaches into the changed node, it depends on it and needs to be
    relexed as well.

    Relexing starts at the earliest node with lookahead into the changed node,
    and continues until the changed node has been passed and relexing doesn't lead
    to any more changes.

    Afterwards the new nodes are merged back into the parse tree, replacing all
    previously relexed nodes.
    """
    def __init__(self, rules=None, language=""):
        self.indentation_based = False
        self.relexed = set()
        if rules:
            if rules.startswith("%"):
                config_line = rules.splitlines()[0]     # get first line
                self.parse_config(config_line[1:])      # remove %
                rules = "\n".join(rules.splitlines()[1:]) # remove config line
            self.createDFA(rules)

    def parse_config(self, config):
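        # The config string is a comma-separated list of name=value settings
        # taken from a rules file whose first line starts with "%", e.g.
        # "indentation=true"; only the "indentation" switch is interpreted here.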
        settings = config.split(",")
        for s in settings:
            name, value = s.split("=")
            if name == "indentation" and value == "true":
                self.indentation_based = True

    def from_name_and_regex(self, names, regexs):
        self.lexer = Lexer(zip(names, regexs))

    def createDFA(self, rules):
        # lex lexing rules
        pl = PriorityLexer(rules)
        rules = sorted(pl.rules.items(), key=lambda node: node[1][0]) # sort by priority

        # create lexer automaton from rules
        regexs = []
        names = []
        for regex, _ in rules:
            name = pl.rules[regex][1]
            regexs.append(regex)
            names.append(name)
        self.lexer = Lexer(zip(names, regexs))

    def is_indentation_based(self):
        return self.indentation_based

    def lex(self, text):
        tokens = self.lexer.lex(text)
        return self.reformat_tokens(tokens)

    def reformat_tokens(self, tokens):
        l = []
        for t in tokens:
            l.append((t[0], t[1]))
        return l

    def relex_import(self, startnode, version=0):
        """Optimised relex for freshly imported files."""
        success = self.lex(startnode.symbol.name)
        bos = startnode.prev_term # bos
        parent = bos.parent
        eos = parent.children.pop()
        last_node = bos
        for match in success:
            if match is success[0]:
                # reuse old node for the first node to mimic the behaviour of a
                # normal relex
                node = startnode
                node.symbol.name = match[0]
            else:
                node = TextNode(Terminal(match[0]))
            node.lookup = match[1]
            parent.children.append(node)
            last_node.next_term = node
            last_node.right = node
            node.left = last_node
            node.prev_term = last_node
            node.parent = parent
            last_node = node
            node.mark_changed()
        parent.children.append(eos)
        last_node.right = eos # link to eos
        last_node.next_term = eos
        eos.left = last_node
        eos.prev_term = last_node
        bos.mark_changed()
        eos.mark_changed()
        parent.mark_changed()

    def relex(self, node):
        # find farthest node that has lookahead into node
        # start munching tokens and spit out nodes
        #     if generated node already exists => stop
        #     (only if we passed edited node)

        self.relexed = set()

        if type(node.parent) is MultiTextNode:
            # When changing a node within a MultiTextNode we need to relex the
            # whole MultiTextNode
            node = node.parent

        # find node to start relexing
        startnode = node
        node = self.find_preceeding_node(node)

        while isinstance(node.symbol, IndentationTerminal):
            node = node.next_term

        if node is startnode:
            past_startnode = True
        else:
            past_startnode = False

        if isinstance(node, EOS):
            # nothing to do here
            return False

        # relex
        next_token = self.lexer.get_token_iter(node).next

        tokenslength = 0  # total text length of the tokens generated so far
        readlength = 0    # total text length of the tree nodes consumed so far
        toks = []         # newly generated tokens
        read = []         # tree nodes the lexer has read so far
        pairs = []        # (tokens, nodes) pairs collected on a partial lexing error
        lookaheads = []   # lookahead of each generated token
        error = None      # LexingError to re-raise after merging, if any

        while True:
            try:
                token = next_token()
                lookaheads.append(token[2])
                if not past_startnode:
                    for temp in token[3]:
                        if temp is startnode:
                            past_startnode = True
                            break
                toks.append([x for i,x in enumerate(token) if i != 3])
                tokenslength += tokenlen(token[0])
                for r in token[3]:
                    if not read or r is not read[-1]: # skip already read nodes from previous tokens
                        read.append(r)
                        if not isinstance(r.symbol, IndentationTerminal):
                            readlength += getlength(r)
                if tokenslength == readlength:
                    # Abort relexing if we relexed a node to itself AFTER we
                    # passed `startnode`. This way we avoid relexing nodes that
                    # don't need to be relexed.
                    if past_startnode and read[-1] is not startnode:
                        if len(token[3]) == 1:
                            assert r is token[3][0]
                            if r.symbol.name == token[0] and r.lookup == token[1]:
                                toks.pop()
                                read.pop()
                                break

                    # if new generated tokens match the read tokens, we have a pair
            except StopIteration:
                break
            except LexingError as e:
                if read and type(read[-1]) is MultiTextNode:
                    pairs = []
                    startnode.changed = True
                    raise e
                # Lexer failed to repair everything. See if it managed to lex
                # parts of the changes (toks contains tokens) and if so
                # integrate them into the parse tree. The partly lexed tokens
                # will have bigger lookaheads than usual as they depend on the
                # text parts that couldn't be relexed.
                # Might involve splitting up a node resulting in leftover text
                # that couldn't be lexed at this point. Put that text into a new
                # node and also separate any newlines contained within.
                error = e
                if toks:
                    leftover = readlength - tokenslength
                    if leftover > 0:
                        name = read[-1].symbol.name[-leftover:]
                        parts = re.split("(\r)", name)
                        for part in parts:
                            if part == "":
                                # Splitting consecutive newlines yields
                                # additional empty strings in the result. Don't
                                # add them into the tree. See
                                # Test_Relexing::test_lexingerror_bug.
                                continue
                            toks.append((part, "<E>", 1))
                    pairs.append((toks, read))
                else:
                    # There are no partial matches, so re-mark the startnode as an error
                    startnode.changed = True
                if not past_startnode:
                    # When a lexing error occurs before we reached the newly
                    # inserted node (startnode) try to continue lexing from
                    # startnode onwards.
                    # See Test_Relexing::test_newline_after_error
                    next_token = self.lexer.get_token_iter(startnode).next
                    past_startnode = True
                    continue
                break

        if not toks:
            # If there is nothing to merge either re-raise the LexingError if
            # there was one or return False (=no changes)
            if error:
                raise error
            else:
                return False

        changed = False
        # We have to remember the location at which we started relexing. This
        # allows us to properly update all lookback values, even if nodes have
        # been inserted before the starting node or nodes were moved into a
        # multitext node. Otherwise we might only update some of the nodes.
        if read[0].ismultichild():
            node_before_changes = read[0].parent.prev_term
        else:
            node_before_changes = read[0].prev_term
        if self.merge_back(toks, read):
            changed = True

        # update lookback counts using lookaheads
        self.update_lookback(node_before_changes.next_term, startnode)

        if error:
            raise error
        return changed

    def update_lookback(self, node, startnode):
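        # Walk forward from `node` and recompute each node's lookback, i.e. how
        # many terminals back a token starts whose lookahead still reaches into
        # it. la_list holds (token name, remaining lookahead, node count)
        # triples for tokens whose lookahead has not yet run out.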
        n = node
        la_list = []
        past_node = False
        while True:
            if n is startnode:
                past_node = True
            # indentation tokens are skipped in StringWrapper, so skip them here
            # as well
            while isinstance(n.symbol, IndentationTerminal):
                n = n.next_term
            if isinstance(n, EOS):
                break
            # compute lookback (removes old lookbacks)
            la_list = [(name, la, cnt) for name, la, cnt in la_list if la > 0]
            newlookback = max(la_list, key=lambda item:item[2])[2] if la_list else 0
            if not self.was_relexed(n) and n.lookback == newlookback and past_node:
                break
            n.lookback = newlookback

            # advance
            offset = getlength(n)
            la_list = [(name, la - offset, cnt+1) for name, la, cnt in la_list]

            # add
            la_list.append((n.symbol.name, n.lookahead, 1))

            n = n.next_term

    def was_relexed(self, node):
        return node in self.relexed

    def iter_gen(self, tokens):
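        # Tokens whose text is a list span several tree nodes (e.g. multiline
        # tokens). Emit "new mt"/"finish mt" sentinels around their parts so
        # that merge_back can wrap them in a MultiTextNode.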
        for t in tokens:
            if type(t[0]) is list:
                yield ("new mt", t[1], t[2])
                for x in t[0]:
                    yield (x, t[1], t[2])
                yield ("finish mt", None, None)
            else:
                yield t
        while True:
            yield None

    def iter_read(self, nodes):
        for n in nodes:
            if isinstance(n, MultiTextNode):
                # since we are removing elements from the original list during
                # iteration we need to create a copy to not skip anything
                for x in list(n.children):
                    yield x
            else:
                yield n
        while True:
            yield None

    def remove_check(self, node):
        if isinstance(node.parent, MultiTextNode):
            if len(node.parent.children) == 0:
                node.parent.remove()
            else:
                node.parent.update_children()

    def merge_back(self, tokens, read):
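        # "\x81" appears to be the placeholder character for language boxes
        # (MagicTerminals): a single, unchanged placeholder token means there
        # is nothing to merge back into the tree.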
        if len(tokens) == 1 and tokens[0][0] == "\x81":
            return False

        lastread = read[0].prev_term

        it_gen = self.iter_gen(tokens)
        it_read = self.iter_read(read)

        gen = it_gen.next()
        read = it_read.next()

        totalr = 0
        totalg = 0

        reused = set()
        current_mt = None
        changed = False

        while True:
            while read is not None and isinstance(read.symbol, IndentationTerminal):
                read.remove()
                read = it_read.next()
            if gen is None and read is None:
                break

            if read and read.deleted:
                read = it_read.next()
                continue

            if gen is None:
                lengen = 0
            elif gen[0] == "new mt":
                if read and read.ismultichild() and not read.parent in reused:
                    current_mt = read.parent # reuse
                else:
                    current_mt = MultiTextNode() # create new
                    lastread.insert_after(current_mt) # insert multiline under same parent as the nodes it replaces
                    changed = True
                if current_mt.lookup != gen[1]:
                    changed = True
                current_mt.lookup = gen[1]
                current_mt.lookahead = gen[2]
                self.relexed.add(current_mt)
                gen = it_gen.next()
                continue
            elif gen[0] == "finish mt":
                reused.add(current_mt)
                lastread = current_mt
                gen = it_gen.next()
                current_mt.update_children()
                current_mt = None
                continue
            else:
                lengen = len(gen[0])

            if totalr >= totalg + lengen:
                changed = True
                # One node has been split into multiple nodes. Insert all
                # remaining nodes until the lengths add up again.
                new = TextNode(Terminal(gen[0]))
                self.relexed.add(new)
                new.lookup = gen[1]
                if new.lookup == "<E>":
                    # If this token comes from the leftovers of a LexingError,
                    # mark it appropriately
                    new.changed = True  # XXX with error recovery, mark as error
                new.lookahead = gen[2]
                if current_mt and not lastread.ismultichild():
                    current_mt.insert_at_beginning(new)
                else:
                    lastread.insert_after(new)
                lastread = new
                totalg += lengen
                gen = it_gen.next()
            elif totalr + getlength(read) <= totalg:
                changed = True
                # Multiple nodes have been combined into less nodes. Delete old
                # nodes until the lengths add up again.
                read.remove()
                self.remove_check(read)
                totalr += getlength(read)
                read = it_read.next()
            else:
                # Overwrite old nodes with updated values. Move nodes in or out
                # of multinodes if needed.
                totalr += getlength(read)
                totalg += lengen
                if read.lookup != gen[1]:
                    read.mark_changed()
                    self.relexed.add(read)
                    changed = True
                else:
                    read.mark_changed()
                if not isinstance(read.symbol, MagicTerminal):
                    read.symbol.name = gen[0].replace("\x81", "")
                    read.lookup = gen[1]
                    read.lookahead = gen[2]
                    self.relexed.add(read)
                else:
                    read.lookup = gen[1]
                if not current_mt:
                    if read.ismultichild():
                        # Read node was previously part of a multinode but has
                        # been updated to a normal node. Remove it from the
                        # multinode.
                        read.remove(True)
                        read.deleted = False
                        self.remove_check(read)
                        lastread.insert_after(read)
                        changed = True
                else:
                    if not read.ismultichild() or current_mt is not read.parent:
                        # Read node has been moved from a normal node into a
                        # multinode or from one multinode into another
                        # multinode. Remove from old locations and insert into
                        # new location.
                        read.remove(True)
                        read.deleted = False
                        self.remove_check(read)
                        if current_mt.isempty():
                            current_mt.set_children([read])
                        else:
                            lastread.insert_after(read)
                        changed = True
                lastread = read
                read = it_read.next()
                gen = it_gen.next()

        return changed

    def find_preceeding_node(self, node):
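        # Walk back `lookback` terminals from `node` to find the earliest token
        # whose lookahead may reach into the changed node; relexing has to
        # start there (see the class docstring).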
        original = node
        if node.lookback == -1:
            node = node.prev_term
            while isinstance(node.symbol, IndentationTerminal):
                node = node.prev_term
        if isinstance(node.symbol, MagicTerminal) and node.lookback <= 0:
            # Token was created next to a language box and the language box is
            # not part of an in-progress string/comment.
            return original
        for i in range(node.lookback):
            while isinstance(node.symbol, IndentationTerminal):
                node = node.prev_term
            node = node.prev_term
        if type(node) is BOS:
            node = node.next_term
        return node
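
The bookkeeping in merge_back is easiest to see in isolation. Below is a
minimal, self-contained sketch (not part of the original code) of its
length-matching idea: old and new tokens are walked in parallel, and the
accumulated text lengths (totalr/totalg) decide whether an old node was split
(insert extra new tokens), several old nodes were combined (delete old ones),
or a node can simply be overwritten in place. Plain strings stand in for tree
nodes, and MultiTextNode handling is omitted.

def align_tokens(old, new):
    """Illustrative sketch of merge_back's length matching (simplified:
    plain strings instead of tree nodes, no MultiTextNode handling)."""
    assert "".join(old) == "".join(new)  # relexing covers exactly the read text
    ops = []
    io = ig = 0          # indices into the old/new token lists
    totalr = totalg = 0  # consumed lengths of read (old) and generated (new) text
    while io < len(old) or ig < len(new):
        lengen = len(new[ig]) if ig < len(new) else 0
        if ig < len(new) and totalr >= totalg + lengen:
            # an old token was split: emit additional new tokens
            ops.append(("insert", new[ig]))
            totalg += lengen
            ig += 1
        elif io < len(old) and totalr + len(old[io]) <= totalg:
            # several old tokens were combined: drop the old token
            ops.append(("delete", old[io]))
            totalr += len(old[io])
            io += 1
        else:
            # lengths overlap: overwrite the old token with the new value
            ops.append(("replace", old[io], new[ig]))
            totalr += len(old[io])
            totalg += lengen
            io += 1
            ig += 1
    return ops

print(align_tokens(["abcd"], ["ab", "cd"]))  # [('replace', 'abcd', 'ab'), ('insert', 'cd')]
print(align_tokens(["ab", "cd"], ["abcd"]))  # [('replace', 'ab', 'abcd'), ('delete', 'cd')]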