Example #1
0
    def parse_string(self, s, concepts=True):
        """
        Parse the string s and return a new hypergraph.

        The tokens produced by C{self.lexer.lex(s)} drive a small finite
        state machine that keeps an explicit stack of parent nodes, edge
        labels and child nodes. Each closing parenthesis turns the topmost
        group on the stack into hyperedges of the graph.

        @param s: the graph description to parse.
        @param concepts: accepted for interface compatibility; this
            implementation never reads it.
        @return: the populated L{Hgraph}; its external node ids are
            renumbered to consecutive integers starting at 0, keeping their
            original relative order.
        @raise ParserError: on a token that is not allowed in the current
            state (including any token after a complete graph), or when a
            node is given two different external ids.
        """

        # Constants to identify items on the stack
        PNODE = 1  # Parent node
        CNODE = 2  # Child node
        EDGE = 3  # Hyperedge

        hgraph = Hgraph()

        stack = []
        state = 0

        # Per-parse counters and flags. Only nt_id_count is read in this
        # method; the others are presumably consumed by self.parse_node --
        # TODO confirm.
        self.id_count = 0
        self.nt_id_count = 0
        self.ext_id_count = 0
        self.seen_nodes = set()
        self.explicit_ext_ids = False

        # States of the finite state parser
        # 0: top level, expecting "(" or a single node
        # 1: saw "(", expecting the head node
        # 2: saw the head node, expecting an edge label, a child node,
        #    a nested "(" or ")"
        # 3: saw a child node, expecting more child nodes, an edge label,
        #    a nested "(" or ")"
        # 4: saw an edge label or a finished nested expression, expecting a
        #    child node, an edge label, a nested "(" or ")"
        # 5: a complete graph was read; every further token is an error

        def unexpected(token, pos):
            # Build (not raise) the error so call sites read `raise ...`.
            return ParserError("Unexpected token %s at position %i." %
                               (token, pos))

        def get_reentrance(s):
            # Collect every "_<digits>" group that is preceded by a
            # character other than ':' and followed by '.'.
            # NOTE: self.reentrance_indexes is not created here; it must
            # already exist on the instance.
            re_pattern = re.compile(r'[^:](_[0-9]+)\.')
            self.reentrance_indexes.update(re_pattern.findall(s))

        def insert_node(node, root=False):
            # Insert a node into the graph
            ident, label, ext_id = node
            # Lookup only for its side effect; presumably Hgraph creates the
            # entry for an unknown node on first access -- TODO confirm.
            _ = hgraph[ident]
            hgraph.node_to_concepts[ident] = label
            if ext_id is not None:
                if (ident in hgraph.external_nodes
                        and hgraph.external_nodes[ident] != ext_id):
                    raise ParserError(
                        "Incompatible external node IDs for node %s." % ident)
                hgraph.external_nodes[ident] = ext_id
                hgraph.rev_external_nodes[ext_id] = ident
            if root:
                hgraph.roots.append(ident)

        def pop_and_transition():
            # Create all edges in a group from the stack, attach them to the
            # graph and RETURN the state the parser must move to.
            #
            # The state is returned rather than assigned: an assignment to
            # `state` in here would only bind a local of this closure and
            # leave the parser's state untouched, so state 5 would never be
            # reached and trailing input would go unnoticed.
            edges = []
            while stack[-1][0] != PNODE:  # Pop all edges
                children = []
                while stack[-1][0] == CNODE:  # Pop all nodes in hyperedge
                    itemtype, node = stack.pop()
                    insert_node(node)
                    children.append(node)
                assert stack[-1][0] == EDGE
                itemtype, edgelabel = stack.pop()
                edges.append((edgelabel, children))

            # Construct the hyperedges
            itemtype, parentnode = stack.pop()
            parent_ident = parentnode[0]
            for edgelabel, children in edges:
                # Children were popped last-to-first; reverse them to get
                # the tail in source order.
                hyperchild = tuple(
                    ident for ident, label, ext_id in reversed(children))

                if "$" in edgelabel:  # this is a nonterminal edge
                    new_edge = NonterminalLabel.from_string(edgelabel)
                    if not new_edge.index:
                        new_edge.index = "_%i" % self.nt_id_count
                        self.nt_id_count = self.nt_id_count + 1
                else:
                    new_edge = edgelabel
                hgraph._add_triple(parent_ident, new_edge, hyperchild)

            if stack:
                # Nested expression: it becomes a child of the enclosing one.
                insert_node(parentnode)
                stack.append((CNODE, parentnode))
                return 4
            insert_node(parentnode, root=True)
            return 5

        get_reentrance(s)

        # Parser transitions start here
        for typ, token, pos in self.lexer.lex(s):

            if state == 0:
                if typ == LexTypes.LPAR:
                    state = 1
                elif typ == LexTypes.NODE:
                    insert_node(self.parse_node(token), root=True)
                    state = 5
                else:
                    raise unexpected(token, pos)

            elif state == 1:
                if typ == LexTypes.NODE:
                    stack.append(
                        (PNODE, self.parse_node(token)))  # Push head node
                    state = 2
                else:
                    raise unexpected(token, pos)

            elif state == 2:
                if typ == LexTypes.EDGELABEL:
                    stack.append((EDGE, token[1:]))
                    state = 4
                elif typ == LexTypes.NODE:
                    stack.append(
                        (EDGE, ""))  # No edge specified, assume empty label
                    stack.append((CNODE, self.parse_node(token)))
                    state = 3
                elif typ == LexTypes.LPAR:
                    stack.append(
                        (EDGE, ""))  # No edge specified, assume empty label
                    state = 1
                elif typ == LexTypes.RPAR:
                    # "(node)": a parent without any edges.
                    itemtype, node = stack.pop()
                    assert itemtype == PNODE
                    if stack:
                        insert_node(node)
                        stack.append((CNODE, node))
                        state = 3
                    else:
                        insert_node(node, root=True)
                        state = 5
                else:
                    raise unexpected(token, pos)

            elif state == 3:
                if typ == LexTypes.RPAR:  # Pop from stack and add edges
                    state = pop_and_transition()
                elif typ == LexTypes.NODE:
                    stack.append((CNODE, self.parse_node(token)))
                    state = 3
                elif typ == LexTypes.EDGELABEL:
                    stack.append((EDGE, token[1:]))
                    state = 4
                elif typ == LexTypes.LPAR:
                    state = 1
                else:
                    raise unexpected(token, pos)

            elif state == 4:
                if typ == LexTypes.LPAR:
                    state = 1
                elif typ == LexTypes.NODE:
                    stack.append((CNODE, self.parse_node(token)))
                    state = 3
                elif typ == LexTypes.EDGELABEL:
                    stack.append((EDGE, token[1:]))
                elif typ == LexTypes.RPAR:  # Pop from stack and add edges
                    state = pop_and_transition()
                else:
                    raise unexpected(token, pos)

            elif state == 5:
                raise unexpected(token, pos)

        # Normalize external nodes: renumber them 0..n-1 in the order of
        # their original ids.
        new_ext_nodes = {}
        new_rev_ext_nodes = {}
        by_old_id = sorted(hgraph.external_nodes.items(),
                           key=lambda item: item[1])
        for new_id, (node, old_id) in enumerate(by_old_id):
            new_ext_nodes[node] = new_id
            new_rev_ext_nodes[new_id] = node

        hgraph.external_nodes = new_ext_nodes
        hgraph.rev_external_nodes = new_rev_ext_nodes
        return hgraph
Example #2
0
    def parse_string(self, s, concepts=True):
        """
        Parse the string s and return a new abstract meaning representation.

        The tokens produced by C{self.lexer.lex(s)} drive a finite state
        machine with an explicit stack. Node names starting with '@' mark
        external nodes: the marker is stripped and the bare name is appended
        to C{external_nodes} of the result. Edge labels starting with '#'
        are wrapped in L{NonterminalLabel}.

        @param s: the graph description to parse.
        @param concepts: if True, the returned L{Hgraph} also records the
            concept label of each node in C{node_to_concepts}.
        @return: the populated L{Hgraph}.
        @raise ParserError: on a token that is not allowed in the current
            state.
        """

        # Constants to identify items on the stack
        PNODE = 1  # Parent (source) node: (PNODE, label, concept)
        CNODE = 2  # Child (target) node:  (CNODE, label, concept)
        EDGE = 3  # Edge label:           (EDGE, label)

        amr = Hgraph()
        stack = []
        state = 0

        # States of the finite state parser
        # 0: top level, expecting "("
        # 1: saw "(", expecting the source node name
        # 2: saw the node name, expecting "/", an edge label or ")"
        # 3: saw "/", expecting the concept name
        # 4: saw the concept name, expecting an edge label or ")"
        # 5: saw an edge label, expecting "(", a quantity, a string literal,
        #    a literal, a node name, another edge label or ")"
        # 6: saw a target node, expecting ")", "," or an edge label
        # 7: saw ",", expecting a further target node name or "("

        def unexpected(token, pos):
            # Build (not raise) the error so call sites read `raise ...`.
            return ParserError("Unexpected token %s at position %i." %
                               (token, pos))

        def strip_external(label):
            # Remove a leading '@' and record the bare name as external.
            # None (the placeholder target of a unary edge) passes through.
            if label is not None and label[0] == '@':
                label = label[1:]
                amr.external_nodes.append(label)
            return label

        def set_concept(label, concept):
            # Record the concept unless that would overwrite a known
            # concept with None.
            if concepts and (label not in amr.node_to_concepts
                             or concept is not None):
                amr.node_to_concepts[label] = concept

        def attach(label, concept):
            # Hand a finished expression to its context and return the next
            # state: child of the enclosing expression, or a new root.
            if stack:
                stack.append((CNODE, label, concept))
                return 6
            amr.roots.append(label)
            return 0

        def close_leaf(with_concept):
            # Handle ")" directly after a node name or its concept, i.e. an
            # expression without any edges.
            itemtype, label, concept = stack.pop()
            assert itemtype == PNODE
            label = strip_external(label)
            # Lookup only for its side effect ("add only the node");
            # presumably Hgraph creates the entry on first access --
            # TODO confirm.
            _ = amr[label]
            if with_concept:
                set_concept(label, concept)
            return attach(label, concept)

        def close_expression():
            # Handle ")" after at least one edge: pop the edges and their
            # targets down to the parent node and add them to the graph.
            edges = []
            while stack[-1][0] != PNODE:  # Pop all edges
                children = []
                while stack[-1][0] == CNODE:  # Pop all targets of this edge
                    itemtype, child_label, child_concept = stack.pop()
                    children.append(
                        (strip_external(child_label), child_concept))
                assert stack[-1][0] == EDGE
                itemtype, edgelabel = stack.pop()
                edges.append((edgelabel, children))

            itemtype, parent_label, parent_concept = stack.pop()
            # Strip the '@' marker BEFORE storing the concept so that the
            # concept is keyed by the same name the graph uses.
            parent_label = strip_external(parent_label)
            set_concept(parent_label, parent_concept)

            for edgelabel, children in edges:
                # NOTE(review): targets were popped last-to-first and are
                # not reversed, so the tail is in reverse source order --
                # confirm that callers expect this.
                hypertarget = []  # build hyperedge destination
                for node, concept in children:
                    if node is not None:
                        set_concept(node, concept)
                        hypertarget.append(node)

                # startswith() also copes with an empty label.
                if edgelabel.startswith('#'):  # nonterminal edge
                    edgelabel = NonterminalLabel(edgelabel[1:])
                amr._add_triple(parent_label, edgelabel, tuple(hypertarget))

            return attach(parent_label, parent_concept)

        for typ, token, pos in self.lexer.lex(s):

            if state == 0:
                if typ == LexTypes.LPAR:
                    state = 1
                else:
                    raise unexpected(token, pos)

            elif state == 1:
                if typ == LexTypes.IDENTIFIER:
                    stack.append((PNODE, token, None))  # Push source node
                    state = 2
                else:
                    raise unexpected(token, pos)

            elif state == 2:
                if typ == LexTypes.SLASH:
                    state = 3
                elif typ == LexTypes.EDGELABEL:
                    stack.append((EDGE, token[1:]))
                    state = 5
                elif typ == LexTypes.RPAR:
                    state = close_leaf(with_concept=False)
                else:
                    raise unexpected(token, pos)

            elif state == 3:
                if typ == LexTypes.IDENTIFIER:
                    assert stack[-1][0] == PNODE
                    nodelabel = stack.pop()[1]
                    # Re-push the source node, now with its concept label
                    stack.append((PNODE, nodelabel, token))
                    state = 4
                else:
                    raise unexpected(token, pos)

            elif state == 4:
                if typ == LexTypes.EDGELABEL:
                    stack.append((EDGE, token[1:]))
                    state = 5
                elif typ == LexTypes.RPAR:
                    state = close_leaf(with_concept=True)
                else:
                    raise unexpected(token, pos)

            elif state == 5:
                if typ == LexTypes.LPAR:
                    state = 1
                elif typ == LexTypes.QUANTITY:
                    stack.append((CNODE, Quantity(token), None))
                    state = 6
                elif typ == LexTypes.STRLITERAL:
                    stack.append((CNODE, StrLiteral(token[1:-1]), None))
                    state = 6
                elif typ == LexTypes.LITERAL:
                    stack.append((CNODE, Literal(token[1:]), None))
                    state = 6
                elif typ == LexTypes.IDENTIFIER:
                    stack.append((CNODE, token, None))  # Push target node
                    state = 6
                elif typ == LexTypes.EDGELABEL:  # Unary edge
                    # The previous edge has no target: close it with a
                    # placeholder before pushing the new label.
                    stack.append((CNODE, None, None))
                    stack.append((EDGE, token[1:]))
                    state = 5
                elif typ == LexTypes.RPAR:  # Unary edge
                    stack.append((CNODE, None, None))
                    state = close_expression()
                else:
                    raise unexpected(token, pos)

            elif state == 6:
                if typ == LexTypes.RPAR:  # Pop from stack and add edges
                    state = close_expression()
                elif typ == LexTypes.COMMA:
                    state = 7
                elif typ == LexTypes.EDGELABEL:
                    stack.append((EDGE, token[1:]))
                    state = 5
                else:
                    raise unexpected(token, pos)

            elif state == 7:
                if typ == LexTypes.IDENTIFIER:
                    stack.append((CNODE, token, None))  # Push target node
                    state = 6
                elif typ == LexTypes.LPAR:
                    state = 1
                else:
                    raise unexpected(token, pos)

        return amr