def main():
    """Load the OntoNotes data plus its head trees and run the tree conversion.

    Steps, in order:
      1. ``Conversion.loadOntonotes()`` yields the OntoNotes object and the
         path of the head-tree file.
      2. ``head.loadHeadTrees(path)`` reads the head trees from that path.
      3. ``Conversion.convertTrees(...)`` is run on both.
    """
    converter = Conversion()
    head_finder = head()

    ontonotes, trees_path = converter.loadOntonotes()

    # NOTE: an earlier variant loaded head rules with
    # head_finder.loadHeadrules(".../ontonotes-db-tool-v0.999b/data/headrules.txt");
    # that call is disabled and head trees are read from trees_path instead.
    loaded_head_trees = head_finder.loadHeadTrees(trees_path)

    converter.convertTrees(ontonotes, loaded_head_trees)
def convert(self, head_trees, a_tree, amr):
    """Recursively convert a tree/subtree into AMR, growing ``amr`` in place.

    The head child of ``a_tree`` (as chosen by ``head().getHeadNodeTree``) is
    converted first; every other child is then attached to the head's node,
    either with the relation the child returned or by a special rule
    (conjunction, relative clause, condition, BA, DNP, PP).

    Parameters:
        head_trees: passed unchanged to ``head().getHeadNodeTree`` and to the
            recursive calls.
        a_tree: the tree node being converted.
        amr: graph object that is mutated: nodes are created by indexing,
            edges by ``amr._add_triple`` and concepts through
            ``amr.node_to_concepts``.

    Returns:
        A pair ``(rel, token_idx)``.  ``rel`` is the relation the caller
        should use to attach this subtree:
          * ``-1``   -- the subtree is skipped by the caller (leaf tagged
                        PU/CS/SP/DEC/AS, or a subtree headed by such a leaf);
          * ``None`` -- no label; callers attach it with ``"NA"``;
          * a list of ``(pred, arg)`` pairs, or a label string, taken from
            ``self.get_edge_labels`` ("PRED-ARG" first, else "FUNCT");
          * ``'VPCOORD'`` or ``'prep-<type>'`` when set by the rules below.
        ``token_idx`` is the index of the node representing the subtree.
        When the functional tag contains ``PN`` the result of
        ``self.convertPN`` is returned instead.

    Raises:
        TypeError: on a child relation or modifier type no branch handles.

    Side effects:
        Appends ``(head_token_idx, coreference_link)`` to
        ``self.coref_chains`` for nodes in an ``IDENT`` coreference chain.

    NOTE(review): the block structure was reconstructed from a source whose
    indentation had been lost.  Two choices are marked inline where another
    nesting would also parse.
    """

    def getDNPType(a_tr, h_id, sc_id):
        """Return the DNP child's daughter tags (function suffix stripped) joined by '+'."""
        a_DNP_tree = a_tr.children[sc_id]
        return '+'.join(x.tag.split('-')[0] for x in a_DNP_tree.children)

    def isPredicate(a_tr, h_idx):
        """check whether the current verb head is a predicate in propbank

        Not called anywhere in this method at present.  The pdb.set_trace()
        that used to sit in the 'VV' branch has been removed.
        """
        head_node = a_tr[h_idx - a_tr.get_token_index()]
        if head_node.tag == 'VV':
            return head_node.proposition is not None
        return True

    # TODO: fix_predicate(amr, a_tree, h_idx) was an empty stub meant to
    # eliminate heads that are not propbank predicates (propbank predicates
    # are not always the treebank head); it is still unimplemented.

    def convertCompVerb(h_child, amr, h_idx):
        """combine VSB's two VV as one"""
        del amr[h_idx]
        _ = amr[h_idx]  # indexing re-creates the node that was just deleted
        amr.node_to_concepts[h_idx] = ''.join(x.word for x in h_child.children)

    def isVPCoordination(a_tr):
        """VP(VP(...)PU(,)VP(...))"""
        child_tags = ''.join(x.tag for x in a_tr.children)
        return bool(a_tr.tag == 'VP' and re.match('VPPUVP', child_tags))

    def recover_subject(amr, head_token_idx):
        """Copy an ARG0 found on one conjunct to the conjuncts that lack one."""
        conjuncts = amr[head_token_idx].values()
        arg0 = None
        for op in conjuncts:
            subject = amr[op[0]].itemsfor('ARG0')
            if subject != []:
                arg0 = subject[0][1]
        if arg0 is None:
            # No conjunct has a subject: nothing to share.  Previously this
            # added an ARG0 edge pointing at None.
            return
        for vp in conjuncts:
            if amr[vp[0]].itemsfor('ARG0') == []:
                amr._add_triple(vp[0], 'ARG0', arg0)

    he = head()
    modifier_structs, head_child_id = he.getHeadNodeTree(head_trees, a_tree)

    # propbank args and functional tags for current tree node/leaf
    pred_args = self.get_edge_labels(a_tree)
    rel = None
    ftag = None
    if "PRED-ARG" in pred_args:
        rel = pred_args["PRED-ARG"]
    if "FUNCT" in pred_args:
        ftag = pred_args["FUNCT"]
    if not rel and ftag:
        rel = ftag

    # ---- leaf ---------------------------------------------------------------
    if a_tree.is_leaf():
        token_idx = a_tree.get_token_index()
        if a_tree.tag in ['PU', 'CS', 'SP', 'DEC', 'AS']:
            return (-1, token_idx)
        _ = amr[token_idx]  # add only the node with no children
        amr.node_to_concepts[token_idx] = a_tree.word  # update the node_id->concept mapping
        return (rel, token_idx)

    # ---- internal node: convert the head child first ------------------------
    head_child = a_tree.children[head_child_id]
    head_rel, head_token_idx = self.convert(head_trees, head_child, amr)
    opNum = 1

    # deal with compound verb
    if head_child.tag in ['VSB', 'VRD', 'VCD', 'VNV']:
        convertCompVerb(head_child, amr, head_token_idx)

    if head_rel == -1:
        if head_child.tag == 'PU':
            # cannot drop conjunction head: the comma
            _ = amr[head_token_idx]
        else:
            # NOTE(review): this ``else`` is read as pairing with the PU test,
            # so a subtree whose head was skipped is skipped as a whole.
            rel = head_rel

    # coreference
    if a_tree.coreference_link and a_tree.coreference_chain.type == 'IDENT':
        self.coref_chains[a_tree.coreference_chain.id].append(
            (head_token_idx, a_tree.coreference_link))

    if ftag and 'PN' in ftag.split('-'):
        return self.convertPN(a_tree, amr, rel, head_token_idx)

    if modifier_structs == {}:
        # ---- plain case: attach every non-head child to the head ------------
        for child_id, child in enumerate(a_tree.children):
            if child_id == head_child_id:
                continue
            child_rel, child_token_idx = self.convert(head_trees, child, amr)
            if child_rel is None:
                amr._add_triple(head_token_idx, "NA", child_token_idx)
            elif isinstance(child_rel, list):
                for pred, arg in child_rel:
                    amr._add_triple(pred, arg, child_token_idx)
            elif isinstance(child_rel, unicode):
                if child_rel == u'OBJ':
                    child_rel = 'ARG1'
                elif child_rel.split('-')[-1] == u'SBJ':
                    # covers plain 'SBJ' as well as 'X-SBJ'
                    child_rel = 'ARG0'
                amr._add_triple(head_token_idx, child_rel, child_token_idx)
            elif child_rel == -1:
                continue
            else:
                print(a_tree.pretty_print())
                raise TypeError('Wrong relation types: %s,%s' % (child_rel, child_rel.__class__.__name__))
        if head_rel == 'VPCOORD':
            recover_subject(amr, head_token_idx)

    elif 'CONJUNCTION' in modifier_structs:
        # ---- coordination: head node becomes 'and', the others op1..opN -----
        assert len(modifier_structs) == 1
        for child_id, child in enumerate(a_tree.children):
            if child_id == head_child_id:
                continue
            child_rel, child_token_idx = self.convert(head_trees, child, amr)
            # if child_rel == PN convertPN
            if child_rel == -1:  # punctuation
                continue
            if isinstance(child_rel, list):
                for pred, arg in child_rel:
                    amr._add_triple(pred, arg, child_token_idx)
            elif isinstance(child_rel, unicode):
                pass  # the conjunct's own label is not used here
            elif child_rel is None or child_rel == 'PN' or child_rel == 'Q':
                pass
            else:
                raise TypeError('Wrong relation types: %s,%s' % (child_rel, child_rel.__class__.__name__))
            amr._add_triple(head_token_idx, "op" + str(opNum), child_token_idx)
            amr.node_to_concepts[head_token_idx] = 'and'
            opNum += 1
        if isVPCoordination(a_tree):
            assert rel is None
            rel = 'VPCOORD'

    else:
        # ---- current tree contains various kind of adjunction subtree -------
        # Child ids handled here; the generic loop further down skips them.
        spec_id_list = []

        # first deal with the adjunct subtrees
        for modifier in modifier_structs:
            spec_child_id = modifier_structs[modifier]
            spec_id_list.append(spec_child_id)

            if modifier == 'RELATIVE-CLAUSE':
                spec_child = a_tree.children[spec_child_id]
                spec_child_rel, spec_child_token_idx = self.convert(head_trees, spec_child, amr)
                if amr.node_to_concepts[spec_child_token_idx] == 'and':
                    # conjunction: apply convertRC to every conjunct
                    for relation, op in amr[spec_child_token_idx].items():
                        if relation.startswith('op'):
                            self.convertRC(amr, a_tree, op[0], head_token_idx)
                else:
                    self.convertRC(amr, a_tree, spec_child_token_idx, head_token_idx)

            elif modifier == 'CND':
                spec_child = a_tree.children[spec_child_id]
                spec_child_rel, spec_child_token_idx = self.convert(head_trees, spec_child, amr)
                amr._add_triple(head_token_idx, 'condition', spec_child_token_idx)

            elif modifier == 'BA':
                # here we simply don't add the BA
                pass

            elif modifier == 'DNP-NMOD':
                assert spec_child_id != -1
                DNP_TYPE = getDNPType(a_tree, head_child_id, spec_child_id)
                # different categories of complements in DNP
                # NOTE(review): any other DNP_TYPE falls through; the child
                # stays in spec_id_list and is never converted.
                if DNP_TYPE == 'PP+DEG':
                    spec_child = a_tree.children[spec_child_id]
                    spec_child_rel, spec_child_token_idx = self.convert(head_trees, spec_child, amr)
                    assert spec_child_rel is None
                    assert spec_child.children[-1].tag == 'DEG'
                    rel_child_pairs = amr[spec_child_token_idx].items()
                    assert len(rel_child_pairs) == 1
                    pp_rel, pp_idx = rel_child_pairs[0][0], rel_child_pairs[0][1]
                    # drop the DNP node and hang its only child on the head
                    del amr[spec_child_token_idx]
                    amr._add_triple(head_token_idx, pp_rel, pp_idx)
                    # immediate neighbours of the DNP are attached as 'poss'
                    for poss_id in [spec_child_id - 1, spec_child_id + 1]:
                        if 0 <= poss_id < len(a_tree.children) and poss_id != head_child_id:
                            poss_child = a_tree.children[poss_id]
                            poss_child_rel, poss_child_token_idx = self.convert(head_trees, poss_child, amr)
                            assert poss_child_rel is None
                            amr._add_triple(head_token_idx, 'poss', poss_child_token_idx)
                            spec_id_list.append(poss_id)
                elif DNP_TYPE in ['NP+DEG', 'DP+DEG', 'ADJP+DEG']:
                    spec_child = a_tree.children[spec_child_id]
                    spec_child_rel, spec_child_token_idx = self.convert(head_trees, spec_child, amr)
                    assert spec_child_rel is None
                    assert spec_child.children[-1].tag == 'DEG'
                    rel_child_pairs = amr[spec_child_token_idx].items()
                    assert len(rel_child_pairs) == 1
                    xp_rel = rel_child_pairs[0][0]
                    # if xp_rel == 'PN': convertPN
                    assert xp_rel is None or xp_rel == -1 or xp_rel == 'PN' or xp_rel == 'NA'
                    xp_idx = rel_child_pairs[0][1]
                    del amr[spec_child_token_idx]
                    amr._add_triple(head_token_idx, 'mod', xp_idx)

            elif modifier.startswith('PP'):  # preposition
                PP_TYPE = modifier.split('-')[-1]
                if spec_child_id != head_child_id:
                    spec_child = a_tree.children[spec_child_id]
                    spec_child_rel, spec_child_token_idx = self.convert(head_trees, spec_child, amr)
                    # NEED CHECK here: assert spec_child_rel == None
                    # get complement of preposition
                    relative_token_idx = spec_child_token_idx - spec_child.get_token_index()
                    if spec_child[relative_token_idx].tag == 'P':
                        # only one node (complement) follows the preposition
                        complements = amr[spec_child_token_idx].values()
                        if len(complements) == 0:
                            continue
                        complement_idx = complements[-1]
                        # here we eliminate the preposition within PP and
                        # replace it with :prep-x
                        del amr[spec_child_token_idx]
                        amr._add_triple(head_token_idx, 'prep-' + PP_TYPE, complement_idx)
                    else:
                        # add the child as normal: un-mark it so the generic
                        # loop below handles it.
                        # NOTE(review): that loop converts the child a second time.
                        spec_id_list.pop()
                else:
                    # the PP itself is the head child
                    relative_token_idx = head_token_idx - head_child.get_token_index()
                    if head_child[relative_token_idx].tag == 'P':
                        # only one node (complement) follows the preposition
                        complements = amr[head_token_idx].values()
                        assert len(complements) == 1 or len(complements) == 0
                        if len(complements) == 0:
                            continue
                        complement_idx = complements[0]
                        # here we eliminate the preposition within PP and
                        # replace it with :prep-x; the complement becomes
                        # the node representing this subtree
                        del amr[head_token_idx]
                        head_token_idx = complement_idx[0]
                        PP_TYPE = head_child.tag.split('-')[-1]
                        rel = 'prep-' + PP_TYPE
                    else:
                        spec_id_list.pop()

            else:
                raise TypeError('Wrong modifier_type: %s' % (modifier))

        # add other relation subtrees
        for child_id, child in enumerate(a_tree.children):
            if child_id == head_child_id or child_id in spec_id_list:
                continue
            child_rel, child_token_idx = self.convert(head_trees, child, amr)
            if child_rel is None:
                amr._add_triple(head_token_idx, "NA", child_token_idx)
            elif isinstance(child_rel, list):
                for pred, arg in child_rel:
                    amr._add_triple(pred, arg, child_token_idx)
            elif isinstance(child_rel, unicode):
                if child_rel == u'OBJ':
                    child_rel = 'ARG1'
                elif child_rel == u'SBJ':
                    # NOTE(review): unlike the plain case above, 'X-SBJ'
                    # labels are not mapped to ARG0 here -- confirm intent.
                    child_rel = 'ARG0'
                amr._add_triple(head_token_idx, child_rel, child_token_idx)
            elif child_rel == -1:
                continue
            else:
                raise TypeError('Wrong relation types: %s,%s' % (child_rel, child_rel.__class__.__name__))

    # fix logistic subject: when the head has both a 'prep-LGS' edge and an
    # ARG0, the prep-LGS target replaces ARG0 (LGS is presumably the
    # logical-subject tag -- TODO confirm).
    # NOTE(review): placed at the level common to all three branches, since
    # the two edges can come from different recursion levels on one head.
    if 'prep-LGS' in amr[head_token_idx] and amr[head_token_idx].itemsfor('ARG0') != []:
        amr[head_token_idx].replace('ARG0', amr[head_token_idx]['prep-LGS'])
        del amr[head_token_idx]['prep-LGS']

    return rel, head_token_idx