Example #1
    def createDFA(self, rules):
        # lex lexing rules
        pl = PriorityLexer(rules)
        rules = sorted(pl.rules.items(), key=lambda node: node[1][0]) # sort by priority

        # create lexer automaton from rules
        regexs = []
        names = []
        for k, _ in rules:
            regex = k
            name = pl.rules[k][1]
            r = parse_regex(regex)
            regexs.append(r)
            names.append(name)
        self.lexer = Lexer(regexs, names)
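
The sort key node[1][0] and the lookup pl.rules[k][1] above imply that pl.rules maps a regex string to a (priority, token_name) tuple; the PriorityLexer internals are not shown here, so that shape is an inference from this code. A small stand-alone sketch of the priority sort with hypothetical rules:

rules = {
    "if|while": (0, "KEYWORD"),   # hypothetical entries for illustration
    "[a-z]+":   (1, "NAME"),
    "[0-9]+":   (2, "NUMBER"),
}
ordered = sorted(rules.items(), key=lambda item: item[1][0])  # sort by priority
names = [name for _, (_, name) in ordered]
print names   # ['KEYWORD', 'NAME', 'NUMBER'] -- keywords take precedence over identifiers
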
Example #2
 def setup_class(cls):
     from rpython.rlib.parsing.parsing import PackratParser
     regexs, rules, ToAST = parse_ebnf(grammar)
     cls.ToAST = ToAST()
     cls.parser = PackratParser(rules, rules[0].nonterminal)
     cls.regexs = regexs
     names, regexs = zip(*regexs)
     cls.lexer = Lexer(list(regexs), list(names))
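
The only non-obvious step above is names, regexs = zip(*regexs): parse_ebnf returns regexs as (name, regex) pairs, and the unpacking rebinds regexs to just the regex half before both halves are handed to Lexer. A minimal stand-alone illustration with hypothetical pairs:

pairs = [("NUMBER", "[0-9]+"), ("NAME", "[a-z]+")]   # hypothetical (name, regex) pairs
names, regexs = zip(*pairs)
print names    # ('NUMBER', 'NAME')
print regexs   # ('[0-9]+', '[a-z]+')
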
Example #3
    def createDFA(self, rules):
        # lex lexing rules
        pl = PriorityLexer(rules)
        rules = sorted(pl.rules.items(), key=lambda node: node[1][0]) # sort by priority

        # create lexer automaton from rules
        regexs = []
        names = []
        for k, _ in rules:
            regex = k
            name = pl.rules[k][1]
            r = parse_regex(regex)
            regexs.append(r)
            names.append(name)
        self.lexer = Lexer(regexs, names)
Example #4
def make_lexer():
    return Lexer([parse_regex(globals()[r]) for r in tokens], tokens[:])
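
make_lexer relies on a module-level convention: every entry in tokens names a global variable holding that token's regex source. A hedged sketch of the assumed layout (the token names and regexes below are hypothetical, and parse_regex/Lexer are taken to be whatever the original module imports):

NUMBER = "[0-9]+"
NAME   = "[a-zA-Z_][a-zA-Z0-9_]*"
IGNORE = " +"
tokens = ["NUMBER", "NAME", "IGNORE"]

lexer = make_lexer()   # globals()[r] picks up the regex strings above; tokens[:] copies the name list
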
Example #5
 def from_name_and_regex(self, names, regexs):
     parsed_regexs = []
     for regex in regexs:
         r = parse_regex(regex)
         parsed_regexs.append(r)
     self.lexer = Lexer(parsed_regexs, names)
Example #6
class IncrementalLexerCF(object):
    def __init__(self, rules=None, language=""):
        self.indentation_based = False
        if rules:
            if rules.startswith("%"):
                config_line = rules.splitlines()[0]     # get first line
                self.parse_config(config_line[1:])      # remove %
                rules = "\n".join(rules.splitlines()[1:]) # remove config line
            self.createDFA(rules)

    def parse_config(self, config):
        settings = config.split(",")
        for s in settings:
            name, value = s.split("=")
            if name == "indentation" and value == "true":
                self.indentation_based = True

    def from_name_and_regex(self, names, regexs):
        parsed_regexs = []
        for regex in regexs:
            r = parse_regex(regex)
            parsed_regexs.append(r)
        self.lexer = Lexer(parsed_regexs, names)

    def createDFA(self, rules):
        # lex lexing rules
        pl = PriorityLexer(rules)
        rules = sorted(pl.rules.items(), key=lambda node: node[1][0]) # sort by priority

        # create lexer automaton from rules
        regexs = []
        names = []
        for k, _ in rules:
            regex = k
            name = pl.rules[k][1]
            r = parse_regex(regex)
            regexs.append(r)
            names.append(name)
        self.lexer = Lexer(regexs, names)

    def is_indentation_based(self):
        return self.indentation_based

    def lex(self, text):
        tokens = self.lexer.tokenize(text)
        return self.reformat_tokens(tokens)

    def reformat_tokens(self, tokens):
        l = []
        for t in tokens:
            l.append((t.source, t.name))
        return l

    def relex_import(self, startnode, version = 0):
        success = self.lex(startnode.symbol.name)
        bos = startnode.prev_term # bos
        startnode.parent.remove_child(startnode)
        parent = bos.parent
        eos = parent.children.pop()
        last_node = bos
        for match in success:
            node = TextNode(Terminal(match[0]))
            node.version = version
            node.lookup = match[1]
            parent.children.append(node)
            last_node.next_term = node
            last_node.right = node
            node.left = last_node
            node.prev_term = last_node
            node.parent = parent
            last_node = node
        parent.children.append(eos)
        last_node.right = eos # link to eos
        last_node.next_term = eos
        eos.left = last_node
        eos.prev_term = last_node


    def split_endcomment(self, node):
        read_nodes = [node]
        generated_tokens = []
        l = node.symbol.name.split("*/", 1)
        t1 = self.lexer.tokenize(l[0])
        generated_tokens.extend(t1)
        t2 = self.lexer.tokenize("*/")
        generated_tokens.extend(t2)
        if l[1] != "":
            t3 = self.lexer.tokenize(l[1])
            generated_tokens.extend(t3)

        self.merge_back(read_nodes, generated_tokens)

    def relex(self, node):
        # find farthest node that has lookahead into node
        # start munching tokens and spit out nodes
        #     if generated node already exists => stop
        #     (only if we passed edited node)

        # find node to start relexing
        startnode = node
        nodes = self.find_preceeding_nodes(node)
        if nodes:
            node = nodes[0]
        if node is startnode:
            past_startnode = True
        else:
            past_startnode = False

        if isinstance(node, EOS):
            # nothing to do here
            return False

        # relex
        read_nodes = []
        generated_tokens = []
        pos = 0  # characters covered by the nodes read so far
        read = 0 # characters covered by the tokens generated so far
        current_node = node
        next_token = self.lexer.get_token_iter(StringWrapper(node))
        while True:
            token = next_token()
            if token.source == "":
                read_nodes.append(current_node)
                break
            read += len(token.source)
            # special case: when a newline is inserted into a string, the lexer
            # creates a single token. We need to make sure that the newline
            # gets lexed into its own token
            if len(token.source) > 1 and token.source.find("\r") >= 0:
                l = token.source.split("\r")
                for e in l:
                    t = self.lexer.tokenize(e)
                    generated_tokens.extend(t)
                    if e is not l[-1]:
                        newline = self.lexer.tokenize("\r")
                        generated_tokens.extend(newline)
            else:
                generated_tokens.append(token)
            while read > pos + len(current_node.symbol.name):
                pos += len(current_node.symbol.name)
                read_nodes.append(current_node)
                current_node = current_node.next_term
                if current_node is startnode:
                    past_startnode = True
            if past_startnode and read == pos + len(current_node.symbol.name):
                read_nodes.append(current_node)
                break

        return self.merge_back(read_nodes, generated_tokens)

    def merge_back(self, read_nodes, generated_tokens):

        any_changes = False
        # insert new nodes into tree
        it = iter(read_nodes)
        for t in generated_tokens:
            try:
                node = it.next()
            except StopIteration:
                node = TextNode(Terminal(""))
                last_node.insert_after(node)
                any_changes = True
            last_node = node
            node.symbol.name = t.source
            node.indent = None
            if node.lookup != t.name:
                node.mark_changed()
                any_changes = True
            else:
                node.mark_version()
            # we need to invalidate the newline if we changed whitespace or
            # logical nodes that come after it
            if node.lookup == "<ws>" or node.lookup != t.name:
                prev = node.prev_term
                while isinstance(prev.symbol, IndentationTerminal):
                    prev = prev.prev_term
                if prev.lookup == "<return>":
                    prev.mark_changed()
                    any_changes = True
                elif isinstance(prev, BOS):
                    # if there is no return, re-indentation won't be triggered
                    # in the incremental parser so we have to mark the next
                    # terminal. possibly only use case: bos <ws> pass DEDENT eos
                    node.next_term.mark_changed()
            # XXX this should become necessary with incparse optimisations turned on
            if node.lookup == "\\" and node.next_term.lookup == "<return>":
                node.next_term.mark_changed()
                any_changes = True
            node.lookup = t.name
            node.lookahead = t.lookahead
        # delete left over nodes
        while True:
            try:
                node = it.next()
                node.parent.remove_child(node)
                any_changes = True
            except StopIteration:
                break
        return any_changes

    def find_preceeding_nodes(self, node):
        chars = 0
        nodes = []
        if node.symbol.name == "\r": # if at line beginning there are no previous nodes to consider
            return nodes
        while True:
            node = node.prev_term
            if node.lookahead and node.lookahead > chars:
                nodes.insert(0, node)
                chars += len(node.symbol.name)
            else:
                break
        return nodes
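
A hedged usage sketch of the class above, using only methods that appear in this example; the token names and regexes are hypothetical, and from_name_and_regex is used so that nothing about the PriorityLexer rule syntax has to be assumed:

lexer = IncrementalLexerCF()
lexer.parse_config("indentation=true")   # what a leading "%indentation=true" rules line triggers
assert lexer.is_indentation_based()

lexer.from_name_and_regex(
    ["NUMBER", "PLUS", "<ws>"],
    ["[0-9]+", "\\+", " +"],
)
print lexer.lex("1 + 2")   # -> list of (source, name) pairs such as ('1', 'NUMBER')
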
Example #7
 def from_name_and_regex(self, names, regexs):
     parsed_regexs = []
     for regex in regexs:
         r = parse_regex(regex)
         parsed_regexs.append(r)
     self.lexer = Lexer(parsed_regexs, names)
Example #8
class IncrementalLexerCF(object):
    """
    Incremental lexer based on the incremental lexing approach described in:

    Wagner, Tim A. Practical algorithms for incremental software development environments.
    Diss. University of California, Berkeley, 1997.

    This incremental lexer works by relexing only the part of the code that may have been affected: all tokens
    that have a lookahead into the altered node (often just one), the altered token itself, and the following
    tokens until the newly generated tokens line up with the existing token stream again.

    @TODO make more clear
    @TODO the merge should use zip

    """
    def __init__(self, rules=None, language=""):
        self.indentation_based = False
        if rules:
            if rules.startswith("%"):
                config_line = rules.splitlines()[0]     # get first line
                self.parse_config(config_line[1:])      # remove %
                rules = "\n".join(rules.splitlines()[1:]) # remove config line
            self.createDFA(rules)

    def parse_config(self, config):
        settings = config.split(",")
        for s in settings:
            name, value = s.split("=")
            if name == "indentation" and value == "true":
                self.indentation_based = True

    def from_name_and_regex(self, names, regexs):
        parsed_regexs = []
        for regex in regexs:
            r = parse_regex(regex)
            parsed_regexs.append(r)
        self.lexer = Lexer(parsed_regexs, names)

    def createDFA(self, rules):
        # lex lexing rules
        pl = PriorityLexer(rules)
        rules = sorted(pl.rules.items(), key=lambda node: node[1][0]) # sort by priority

        # create lexer automaton from rules
        regexs = []
        names = []
        for k, _ in rules:
            regex = k
            name = pl.rules[k][1]
            r = parse_regex(regex)
            regexs.append(r)
            names.append(name)
        self.lexer = Lexer(regexs, names)

    def is_indentation_based(self):
        return self.indentation_based

    def lex(self, text):
        tokens = self.lexer.tokenize(text)
        return self.reformat_tokens(tokens)

    def reformat_tokens(self, tokens):
        l = []
        for t in tokens:
            l.append((t.source, t.name))
        return l

    def relex_import(self, startnode, version = 0):
        """
        Replace a node with the tokens obtained by lexing its text (symbol.name)
        :param startnode: node to expand
        :param version: version assigned to each created node
        :return:
        """
        success = self.lex(startnode.symbol.name)
        bos = startnode.prev_term # bos
        startnode.parent.remove_child(startnode)
        parent = bos.parent
        eos = parent.children.pop()
        last_node = bos
        for match in success:
            node = TextNode(Terminal(match[0]))
            node.version = version
            node.lookup = match[1]
            parent.children.append(node)
            last_node.next_term = node
            last_node.right = node
            node.left = last_node
            node.prev_term = last_node
            node.parent = parent
            last_node = node
        parent.children.append(eos)
        last_node.right = eos # link to eos
        last_node.next_term = eos
        eos.left = last_node
        eos.prev_term = last_node


    def split_endcomment(self, node):
        read_nodes = [node]
        generated_tokens = []
        l = node.symbol.name.split("*/", 1)
        t1 = self.lexer.tokenize(l[0])
        generated_tokens.extend(t1)
        t2 = self.lexer.tokenize("*/")
        generated_tokens.extend(t2)
        if l[1] != "":
            t3 = self.lexer.tokenize(l[1])
            generated_tokens.extend(t3)

        self.merge_back(read_nodes, generated_tokens)

    def relex(self, node):
        """
        Relex a node's environment.
        The environment starts at the farthest preceding token that has a lookahead into the given node and
        ends once the newly generated tokens line up with the existing tokens again (only after the edited
        node has been passed):

         find the farthest node that has a lookahead into node
         start consuming text and emitting tokens
             stop as soon as an emitted token lines up with an existing node boundary
             (only once we have passed the edited node)

        :param node: a node that needs to be relexed
        :return:
        """
        # find node to start relexing
        startnode = node
        nodes = self.find_preceeding_nodes(node)
        if nodes:
            node = nodes[0]
        if node is startnode:
            past_startnode = True
        else:
            past_startnode = False

        if isinstance(node, EOS):
            # nothing to do here, the first node to relex is the EOS
            return False

        # relex
        read_nodes = []
        generated_tokens = []
        pos = 0  # characters covered by the nodes read so far
        read = 0 # characters covered by the tokens generated so far
        current_node = node
        next_token = self.lexer.get_token_iter(StringWrapper(node))
        while True:
            token = next_token()
            if token.source == "":
                read_nodes.append(current_node)
                break
            read += len(token.source)
            # special case: when a newline is inserted into a string, the lexer
            # creates a single token. We need to make sure that the newline
            # gets lexed into its own token
            if len(token.source) > 1 and token.source.find("\r") >= 0:
                l = token.source.split("\r")
                for e in l:
                    t = self.lexer.tokenize(e)
                    generated_tokens.extend(t)
                    if e is not l[-1]:
                        newline = self.lexer.tokenize("\r")
                        generated_tokens.extend(newline)
            else:
                generated_tokens.append(token)
            while read > pos + len(current_node.symbol.name):
                pos += len(current_node.symbol.name)
                read_nodes.append(current_node)
                current_node = current_node.next_term
                if current_node is startnode:
                    past_startnode = True
            if past_startnode and read == pos + len(current_node.symbol.name):
                read_nodes.append(current_node)
                break

        return self.merge_back(read_nodes, generated_tokens)

    def merge_back(self, read_nodes, generated_tokens):
        """
        Replace the symbols in the nodes with the newly generated tokens.

        We loop over read_nodes and generated_tokens at the same pace and replace each read_node.symbol.name
        with the corresponding generated_token.source. We also update the node's lookup (the token type). If
        the lookup changed, the node is marked as changed (a node whose lookup did not change appears
        unchanged to the parser).

        If the lists are of unequal length:
          - If there are more generated tokens than read nodes, extra nodes are inserted
          - Left-over read nodes are removed

        :param read_nodes: Nodes that have been read by the relexer
        :param generated_tokens: Tokens that have been found during relexing
        :return:
        """
        any_changes = False
        # insert new nodes into tree
        it = iter(read_nodes)
        for t in generated_tokens:
            try:
                node = it.next()
            except StopIteration:
                node = TextNode(Terminal(""))
                last_node.insert_after(node)
                any_changes = True
            last_node = node
            node.symbol.name = t.source
            node.indent = None
            if node.lookup != t.name:
                node.mark_changed()
                any_changes = True
            else:
                node.mark_version()
            # we need to invalidate the newline if we changed whitespace or
            # logical nodes that come after it
            if node.lookup == "<ws>" or node.lookup != t.name:
                prev = node.prev_term
                while isinstance(prev.symbol, IndentationTerminal):
                    prev = prev.prev_term
                if prev.lookup == "<return>":
                    prev.mark_changed()
                    any_changes = True
                elif isinstance(prev, BOS):
                    # if there is no return, re-indentation won't be triggered
                    # in the incremental parser so we have to mark the next
                    # terminal. possibly only use case: bos <ws> pass DEDENT eos
                    node.next_term.mark_changed()
            # XXX this should become necessary with incparse optimisations turned on
            if node.lookup == "\\" and node.next_term.lookup == "<return>":
                node.next_term.mark_changed()
                any_changes = True
            node.lookup = t.name
            node.lookahead = t.lookahead
        # delete left over nodes
        while True:
            try:
                node = it.next()
                node.parent.remove_child(node)
                any_changes = True
            except StopIteration:
                break
        return any_changes

    def find_preceeding_nodes(self, node):
        """
        Traverse backwards through the line to find the nodes that have a lookahead into the given node
        :param node: the (adjusted) node to start from
        :return: a list of nodes that have a lookahead into the given node
        """
        chars = 0
        nodes = []
        if node.symbol.name == "\r": # if at line beginning there are no previous nodes to consider
            return nodes
        while True:
            node = node.prev_term
            if node.lookahead and node.lookahead > chars:
                nodes.insert(0, node)
                chars += len(node.symbol.name)
            else:
                break
        return nodes
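
The second @TODO in the class docstring suggests that merge_back should pair read_nodes and generated_tokens with a zip-style loop. The sketch below is illustrative only: it uses plain (source, name) tuples instead of TextNode/Terminal objects to show the three cases the merge_back docstring describes (update in place, insert an extra node, remove a left-over node):

from itertools import izip_longest   # Python 2, matching the it.next() calls used above

read_nodes       = [("a", "NAME"), ("+", "PLUS")]                    # hypothetical existing nodes
generated_tokens = [("ab", "NAME"), ("+", "PLUS"), (" ", "<ws>")]    # hypothetical relexed tokens

for node, tok in izip_longest(read_nodes, generated_tokens):
    if node is None:
        print "insert a new node for", tok        # more tokens than nodes
    elif tok is None:
        print "remove left-over node", node       # more nodes than tokens
    else:
        changed = node[1] != tok[1]               # did the lookup (token type) change?
        print "update", node, "->", tok, "changed:", changed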