def parse_string(self, s, concepts=True):
    """
    Parse the string s and return a new hypergraph.

    The tokens produced by self.lexer.lex(s) drive a small finite state
    parser. Parent nodes, child nodes and edge labels are kept on an
    explicit stack until the closing parenthesis of their group is seen,
    at which point the group's hyperedges are added to the graph.

    @param s: the graph description to parse.
    @param concepts: accepted for interface compatibility; this
        implementation does not read it.
    @return: the L{Hgraph} built from s. Its external node IDs are
        renumbered to consecutive integers starting at 0, preserving the
        order of the IDs found while parsing.
    @raise ParserError: on an unexpected token, on any token that follows
        the completed root expression, or when the same node is given two
        different external node IDs.
    """
    # Constants to identify items on the stack
    PNODE = 1  # Parent node
    CNODE = 2  # Child node
    EDGE = 3  # Hyperedge

    hgraph = Hgraph()
    stack = []
    state = 0

    # Per-parse bookkeeping kept on the parser object. Only nt_id_count is
    # read in this method; the others are presumably consumed by
    # self.parse_node -- TODO confirm.
    self.id_count = 0
    self.nt_id_count = 0
    self.ext_id_count = 0
    self.seen_nodes = set()
    self.explicit_ext_ids = False

    # States of the finite state parser
    # 0, top level
    # 1, expecting head node name
    # 2, expecting edge label, node, nested expression or right paren
    # 3, saw a child node; expecting further child nodes, edge label,
    #    nested expression or right paren
    # 4, saw an edge label or a closed nested expression; accepts the same
    #    tokens as state 3
    # 5, root expression complete; every further token is an error

    def unexpected(token, pos):
        # Single place that builds the syntax error for a bad token.
        raise ParserError("Unexpected token %s at position %i." %
                          (token, pos))

    def insert_node(node, root=False):
        # Insert a node into the graph
        ident, label, ext_id = node
        hgraph[ident]  # Looked up only to initialize the entry for this node
        hgraph.node_to_concepts[ident] = label
        if ext_id is not None:
            if (ident in hgraph.external_nodes and
                    hgraph.external_nodes[ident] != ext_id):
                raise ParserError(
                    "Incompatible external node IDs for node %s." % ident)
            hgraph.external_nodes[ident] = ext_id
            hgraph.rev_external_nodes[ext_id] = ident
        if root:
            hgraph.roots.append(ident)

    def pop_and_transition():
        # Create all edges in a group from the stack, attach them to the
        # graph and return the state the parser has to move to.
        #
        # The state is *returned* rather than assigned: an assignment to
        # `state` in here would only bind a local of this helper and leave
        # the parser's state untouched.
        edges = []
        while stack[-1][0] != PNODE:  # Pop all edges
            children = []
            while stack[-1][0] == CNODE:  # Pop all nodes in hyperedge
                itemtype, node = stack.pop()
                insert_node(node)
                children.append(node)
            assert stack[-1][0] == EDGE
            itemtype, edgelabel = stack.pop()
            edges.append((edgelabel, children))

        # Construct the hyperedges
        itemtype, parentnode = stack.pop()
        parent_ident, parent_label, parent_ext_id = parentnode
        for edgelabel, children in edges:
            # Children were popped last-to-first, so restore source order
            # for the hyperedge tail.
            hyperchild = tuple(
                [child_ident
                 for child_ident, child_label, child_ext_id
                 in reversed(children)])

            if "$" in edgelabel:  # this is a nonterminal edge
                new_edge = NonterminalLabel.from_string(edgelabel)
                if not new_edge.index:
                    # No explicit index: number the nonterminal ourselves.
                    new_edge.index = "_%i" % self.nt_id_count
                    self.nt_id_count = self.nt_id_count + 1
            else:
                new_edge = edgelabel
            hgraph._add_triple(parent_ident, new_edge, hyperchild)

        if stack:
            # Nested expression: the finished parent becomes a child of the
            # enclosing group.
            insert_node(parentnode)
            stack.append((CNODE, parentnode))
            return 4
        insert_node(parentnode, root=True)
        return 5

    # Collect every "_<digits>" that is preceded by a character other than
    # ':' and followed by '.'. self.reentrance_indexes must already exist
    # on the parser object.
    self.reentrance_indexes.update(re.findall(r'[^:](_[0-9]+)\.', s))

    # Parser transitions start here
    for typ, token, pos in self.lexer.lex(s):
        if state == 0:
            if typ == LexTypes.LPAR:
                state = 1
            elif typ == LexTypes.NODE:
                # A bare node is a complete single-node graph.
                insert_node(self.parse_node(token), root=True)
                state = 5
            else:
                unexpected(token, pos)

        elif state == 1:
            if typ == LexTypes.NODE:
                stack.append((PNODE, self.parse_node(token)))  # Push head node
                state = 2
            else:
                unexpected(token, pos)

        elif state == 2:
            if typ == LexTypes.EDGELABEL:
                stack.append((EDGE, token[1:]))  # Drop the leading marker char
                state = 4
            elif typ == LexTypes.NODE:
                stack.append((EDGE, ""))  # No edge specified, assume empty label
                stack.append((CNODE, self.parse_node(token)))
                state = 3
            elif typ == LexTypes.LPAR:
                stack.append((EDGE, ""))  # No edge specified, assume empty label
                state = 1
            elif typ == LexTypes.RPAR:
                # Parenthesized node without any edges.
                itemtype, node = stack.pop()
                assert itemtype == PNODE
                if stack:
                    insert_node(node)
                    stack.append((CNODE, node))
                    state = 3
                else:
                    insert_node(node, root=True)
                    state = 5
            else:
                unexpected(token, pos)

        elif state in (3, 4):
            # States 3 and 4 accept exactly the same tokens with the same
            # actions, so they share one branch.
            if typ == LexTypes.RPAR:  # Pop from stack and add edges
                state = pop_and_transition()
            elif typ == LexTypes.NODE:
                stack.append((CNODE, self.parse_node(token)))
                state = 3
            elif typ == LexTypes.EDGELABEL:
                stack.append((EDGE, token[1:]))
                state = 4
            elif typ == LexTypes.LPAR:
                state = 1
            else:
                unexpected(token, pos)

        elif state == 5:
            unexpected(token, pos)

    # Normalize external nodes: renumber them 0..k-1 in the order of the
    # external IDs collected while parsing.
    by_ext_id = sorted(hgraph.external_nodes.items(),
                       key=lambda item: item[1])
    new_ext_nodes = {}
    new_rev_ext_nodes = {}
    for new_id, (node, old_id) in enumerate(by_ext_id):
        new_ext_nodes[node] = new_id
        new_rev_ext_nodes[new_id] = node

    hgraph.external_nodes = new_ext_nodes
    hgraph.rev_external_nodes = new_rev_ext_nodes
    return hgraph
def parse_string(self, s, concepts=True):
    """
    Parse the string s and return a new abstract meaning representation.

    The tokens produced by self.lexer.lex(s) drive a finite state parser
    that keeps parent nodes, child nodes and edge labels on an explicit
    stack. A node label starting with '@' marks an external node: the '@'
    is stripped and the remaining label is appended to
    amr.external_nodes.

    @param s: the string to parse.
    @param concepts: if True, the returned L{Hgraph} object also contains
        the concept labels (in its node_to_concepts mapping).
    @return: the L{Hgraph} built from s.
    @raise ParserError: when a token is not allowed in the current state.
    """
    # Constants to identify items on the stack
    PNODE = 1  # Parent node: (PNODE, label, concept)
    CNODE = 2  # Child node: (CNODE, label, concept)
    EDGE = 3  # Edge label: (EDGE, label)

    amr = Hgraph()
    stack = []
    state = 0

    # States of the finite state parser
    # 0, top level
    # 1, expecting source node name
    # 2, expecting slash (concept follows), edge label or right paren
    # 3, expecting concept name
    # 4, expecting edge label or right paren
    # 5, expecting expression, node name or literal string, quantity or
    #    special symbol; an edge label or right paren here closes a unary
    #    edge
    # 6, expecting right paren, comma (more target nodes) or edge label
    # 7, after a comma: expecting a further target node or expression

    def unexpected(token, pos):
        # Single place that builds the syntax error for a bad token.
        raise ParserError("Unexpected token %s at position %i." %
                          (token, pos))

    def strip_external(label):
        # Remove the '@' external-node marker, registering the node as
        # external; labels without the marker are returned unchanged.
        if label[0] == '@':
            label = label[1:]
            amr.external_nodes.append(label)
        return label

    def reattach(label, concept):
        # Hand a finished node to the enclosing group as a child, or record
        # it as a root when nothing encloses it. Returns the next state.
        if stack:
            stack.append((CNODE, label, concept))
            return 6
        amr.roots.append(label)
        return 0

    def close_leaf(record_concept):
        # Close a parenthesized node that has no outgoing edges and return
        # the next state. The concept is only stored when the caller asks
        # for it (i.e. when a "/ concept" part was actually parsed).
        itemtype, label, concept = stack.pop()
        assert itemtype == PNODE
        label = strip_external(label)
        amr[label]  # Looked up only so that the node itself is added
        if (record_concept and concepts and
                (label not in amr.node_to_concepts or concept is not None)):
            amr.node_to_concepts[label] = concept
        return reattach(label, concept)

    def close_group():
        # Pop one complete "(parent :edge child ...)" group from the stack,
        # add its hyperedges to the graph and return the next state.
        edges = []
        while stack[-1][0] != PNODE:  # Pop all edges
            children = []
            while stack[-1][0] == CNODE:  # Pop all target nodes of the edge
                itemtype, child_label, child_concept = stack.pop()
                if child_label is not None:
                    child_label = strip_external(child_label)
                children.append((child_label, child_concept))
            assert stack[-1][0] == EDGE
            itemtype, edgelabel = stack.pop()
            edges.append((edgelabel, children))

        itemtype, parent_label, parent_concept = stack.pop()
        # Strip the external marker *before* recording the concept so that
        # the concept is keyed by the same label the graph uses.
        parent_label = strip_external(parent_label)
        if concepts and (parent_label not in amr.node_to_concepts or
                         parent_concept is not None):
            amr.node_to_concepts[parent_label] = parent_concept

        for edgelabel, children in edges:
            # Build the hyperedge destination. A None label is the
            # placeholder pushed for a unary edge and contributes nothing.
            # NOTE(review): children stay in stack-pop order here (last
            # child first) -- confirm this is the intended tail order.
            hypertarget = []
            for node, concept in children:
                if node is not None:
                    if concepts and (node not in amr.node_to_concepts or
                                     concept is not None):
                        amr.node_to_concepts[node] = concept
                    hypertarget.append(node)
            if edgelabel[0] == '#':  # this is a nonterminal edge
                edgelabel = NonterminalLabel(edgelabel[1:])
            amr._add_triple(parent_label, edgelabel, tuple(hypertarget))

        return reattach(parent_label, parent_concept)

    for typ, token, pos in self.lexer.lex(s):
        if state == 0:
            if typ == LexTypes.LPAR:
                state = 1
            else:
                unexpected(token, pos)

        elif state == 1:
            if typ == LexTypes.IDENTIFIER:
                stack.append((PNODE, token, None))  # Push source node
                state = 2
            else:
                unexpected(token, pos)

        elif state == 2:
            if typ == LexTypes.SLASH:
                state = 3
            elif typ == LexTypes.EDGELABEL:
                stack.append((EDGE, token[1:]))  # Drop the leading marker char
                state = 5
            elif typ == LexTypes.RPAR:
                # Node without concept and without edges.
                state = close_leaf(False)
            else:
                unexpected(token, pos)

        elif state == 3:
            if typ == LexTypes.IDENTIFIER:
                # Replace the source node on the stack by one that carries
                # the concept label.
                assert stack[-1][0] == PNODE
                nodelabel = stack.pop()[1]
                stack.append((PNODE, nodelabel, token))
                state = 4
            else:
                unexpected(token, pos)

        elif state == 4:
            if typ == LexTypes.EDGELABEL:
                stack.append((EDGE, token[1:]))
                state = 5
            elif typ == LexTypes.RPAR:
                # Node with concept but without edges.
                state = close_leaf(True)
            else:
                unexpected(token, pos)

        elif state == 5:
            if typ == LexTypes.LPAR:
                state = 1
            elif typ == LexTypes.QUANTITY:
                stack.append((CNODE, Quantity(token), None))
                state = 6
            elif typ == LexTypes.STRLITERAL:
                # Strip the first and last character (the delimiters).
                stack.append((CNODE, StrLiteral(token[1:-1]), None))
                state = 6
            elif typ == LexTypes.LITERAL:
                stack.append((CNODE, Literal(token[1:]), None))
                state = 6
            elif typ == LexTypes.IDENTIFIER:
                stack.append((CNODE, token, None))  # Push target node
                state = 6
            elif typ == LexTypes.EDGELABEL:
                # Unary edge: the previous edge gets a placeholder target
                # and the next edge starts right away.
                stack.append((CNODE, None, None))
                stack.append((EDGE, token[1:]))
                state = 5
            elif typ == LexTypes.RPAR:
                # Unary edge closing the group.
                stack.append((CNODE, None, None))
                state = close_group()
            else:
                unexpected(token, pos)

        elif state == 6:
            if typ == LexTypes.RPAR:  # Pop from stack and add edges
                state = close_group()
            elif typ == LexTypes.COMMA:
                state = 7
            elif typ == LexTypes.EDGELABEL:
                stack.append((EDGE, token[1:]))
                state = 5
            else:
                unexpected(token, pos)

        elif state == 7:
            if typ == LexTypes.IDENTIFIER:
                stack.append((CNODE, token, None))  # Push further target node
                state = 6
            elif typ == LexTypes.LPAR:
                state = 1
            else:
                unexpected(token, pos)

    return amr