def match_events_2(nodeT, nodeH):
    """Test whether two event relation arguments in the same argument
    position match.

    They match if the lemmas are identical, or if the Hevent argument is
    a synonym or hypernym of the Tevent argument.  Several further match
    conditions from the original specification are not yet implemented
    (see TODOs below); until then the function falls through to False.
    """
    lemmaT, _posT, _senseT = split(nodeT.predicate)
    lemmaH, _posH, _senseH = split(nodeH.predicate)

    # Same lemma, or synonym / hypernym relation between the two.
    if lemmaT == lemmaH or synonyms_or_hyperonyms(lemmaT, lemmaH):
        return True

    # TODO: the argument in Tevent represents a noun phrase and the
    # argument in Hevent is an underspecified pronoun like "somebody".

    # TODO: the argument in Tevent is either a scopal relation or a
    # conjunction relation, and one of its arguments matches that of Hevent.

    # TODO: the argument in Hevent is not expressed (i.e., it matches the
    # Tevent argument by default).

    return False
def match_events_1(nodeT, nodeH):
    """Return whether two event predicates match.

    They match if they represent the same lexeme with the same
    part-of-speech, or if both are verbs and Hevent is a synonym or
    hypernym of Tevent.
    """
    # Identical predicate strings trivially match.
    if nodeT.predicate == nodeH.predicate:
        return True
    lemmaT, posT, _senseT = split(nodeT.predicate)
    lemmaH, posH, _senseH = split(nodeH.predicate)
    # Only consult the (potentially expensive) synonym/hypernym lookup when
    # both predicates are verbs; the original computed it unconditionally
    # even though a non-verb pair can never match here.
    if posT == "v" and posH == "v":
        return synonyms_or_hyperonyms(lemmaT, lemmaH)
    return False
def _encode_pred(pred):
    """Serialize a predicate as an XML element.

    Surface predicates become ``<realpred>`` with ``lemma``/``pos``
    attributes (plus ``sense`` when it is truthy); all other predicates
    become a ``<gpred>`` element whose text is the raw predicate string.
    """
    if not predicate.is_surface(pred):
        gp = etree.Element('gpred')
        gp.text = pred
        return gp
    lemma, pos, sense = predicate.split(pred)
    attrs = {'lemma': lemma, 'pos': pos}
    if sense:
        attrs['sense'] = sense
    return etree.Element('realpred', attrib=attrs)
def _encode_pred(pred):
    """Serialize a predicate as an XML element.

    Surface predicates become ``<realpred>`` with ``lemma``/``pos``
    attributes (plus ``sense`` when it is not None); abstract predicates
    become ``<pred>``; anything else becomes ``<spred>``, both carrying
    the raw predicate string as text.
    """
    if predicate.is_surface(pred):
        lemma, pos, sense = predicate.split(pred)
        attrs = {'lemma': lemma, 'pos': pos}
        if sense is not None:
            attrs['sense'] = sense
        return etree.Element('realpred', attrib=attrs)
    # Non-surface predicates keep the raw string as element text.
    tag = 'pred' if predicate.is_abstract(pred) else 'spred'
    el = etree.Element(tag)
    el.text = pred
    return el
def match_surface_predicate_token(predicate, start_token_index, end_token_index,
                                  token_nodes, token_node_list):
    """Return the index (in the token list) of the token whose cleaned
    string best matches the predicate's lemma.

    Tokens in ``[start_token_index, end_token_index]`` are scored with
    ``difflib.SequenceMatcher`` against the lowercased lemma; the first
    best-scoring token wins.  A zero best score is reported via a
    diagnostic print.
    """
    lemma, pos, _sense = d_predicate.split(predicate)
    if pos == 'u':
        # Unknown-word predicates carry a trailing "/tag"; strip it.
        lemma = lemma[:lemma.rindex('/')]
    target = lemma.lower()

    scores = []
    cleaned = []
    for idx in range(start_token_index, end_token_index + 1):
        # id lookup bypasses the derivation node
        tok = token_nodes[token_node_list[idx]]
        tok_str = clean_token_lemma(tok.token_str, predicate.isdigit())
        cleaned.append(tok_str)
        scores.append(difflib.SequenceMatcher(a=tok_str, b=target).ratio())

    best = max(scores)
    result = start_token_index + scores.index(best)
    if best == 0:
        print(predicate, cleaned)
    return result
def process_semantic_tree(self, node_id, dmrs_rep, semantic_parent=-1):
    """Recursively distribute DMRS semantic nodes over the syntax tree.

    For the tree node ``node_id``: classifies DMRS links as internal or
    ancestor edges, moves non-token-level surface predicates down to the
    preterminal of their best-matching token (following internal-edge
    chains), forwards leftover nodes of overlapping spans up to the
    spanning parent, and — for token (preterminal) nodes — extracts
    lemmas/CARGs from the predicates.  Mutates ``self.nodes``,
    ``self.dmrs_node_map`` and the token objects in place; recurses over
    overlapping and ordinary children with this node (or, when it holds
    no semantics, the inherited parent) as the semantic anchor.

    NOTE(review): comments below that describe DMRS link/CARG semantics
    are inferred from usage here — confirm against the DMRS conversion
    code that builds ``dmrs_rep`` and ``self.dmrs_node_map``.
    """
    node = self.nodes[node_id]
    sem_node_ids = [snode.node_id for snode in node.semantic_nodes]
    remove_sem_nodes = []          # indices into node.semantic_nodes to delete
    internal_edge_from = []        # semantic node ids (edge source, same tree node)
    internal_edge_to = []          # semantic node ids (edge target, same tree node)
    internal_edge_label = []       # roles of internal edges (collected, unused below)
    if node.semantic_nodes:
        # This node carries semantics: it becomes the anchor its children
        # attach to, and it records its own semantic parent.
        semantic_anchor = node_id
        node.semantic_parent_node = semantic_parent
        for edge in dmrs_rep.links:
            start_node_id = self.dmrs_node_map[edge.start]
            end_node_id = self.dmrs_node_map[edge.end]
            if end_node_id == node_id:
                #start_id = sem_node_ids.index(edge.start)
                end_id = sem_node_ids.index(edge.end)
                sem_node = node.semantic_nodes[end_id]
                if start_node_id == node_id:
                    # record internal edge (both endpoints inside this tree node)
                    internal_edge_from.append(edge.start)
                    internal_edge_to.append(edge.end)
                    internal_edge_label.append(edge.role)
                    # previously recorded in the node, and test for non-chains
                elif start_node_id == semantic_parent:
                    # record ancestor edge: parent's semantic node points at ours
                    self.nodes[node_id].semantic_nodes[
                        end_id].has_ancestor = True
                    #assert self.nodes[node_id].semantic_parent_edge_label == ""
                    self.nodes[
                        node_id].semantic_parent_edge_label = edge.role
                    parent_sem_node_ids = [
                        snode.node_id
                        for snode in self.nodes[semantic_parent].semantic_nodes
                    ]
                    parent_start_id = parent_sem_node_ids.index(edge.start)
                    self.nodes[semantic_parent].semantic_nodes[
                        parent_start_id].is_semantic_head = True
        # identify non-token-level surface predicates to move
        # if the node has internal children, don't move
        for sid, sem_node in enumerate(node.semantic_nodes):
            if (not node.isToken
                ) and sem_node.node_id not in internal_edge_from:
                token_index = -1
                if d_predicate.is_surface(sem_node.original_predicate):
                    token_index = match_surface_predicate_token(
                        sem_node.original_predicate, node.start_token_index,
                        node.end_token_index, self.token_nodes,
                        self.token_node_list)
                elif sem_node.carg is not None:
                    # abstract predicate with a constant argument (e.g. a
                    # name/number string) — match the CARG against tokens
                    token_index = match_surface_predicate_token(
                        sem_node.carg, node.start_token_index,
                        node.end_token_index, self.token_nodes,
                        self.token_node_list)
                if token_index >= 0:
                    # move the semantic node down to the matched token's
                    # preterminal, remapping it in dmrs_node_map
                    token_id = self.token_node_list[token_index]
                    new_preterminal = self.token_preterminal_node_map[
                        token_id]
                    self.nodes[new_preterminal].semantic_nodes.append(
                        sem_node)
                    self.dmrs_node_map[sem_node.node_id] = new_preterminal
                    remove_sem_nodes.append(sid)
                    # follow the chain
                    # for some quantifiers, might be intended to span everything, but this seems good enough for now
                    snode_id = sem_node.node_id
                    while snode_id in internal_edge_to:
                        new_snode_id = -1
                        # NOTE: the loop below rebinds sid/sem_node on
                        # purpose — chained internal parents move to the
                        # same preterminal as the node that started the chain
                        for edge_i, parent_node_id in enumerate(
                                internal_edge_from):
                            if internal_edge_to[
                                    edge_i] == snode_id and internal_edge_from.count(
                                        parent_node_id) == 1:
                                sid = sem_node_ids.index(parent_node_id)
                                sem_node = node.semantic_nodes[sid]
                                self.nodes[
                                    new_preterminal].semantic_nodes.append(
                                        sem_node)
                                self.dmrs_node_map[
                                    sem_node.node_id] = new_preterminal
                                remove_sem_nodes.append(sid)
                                if parent_node_id in internal_edge_to:
                                    #if new_snode_id >= 0: # almost never have 2 internal parents
                                    new_snode_id = parent_node_id
                        # -1 terminates the while loop when no chained parent remains
                        snode_id = new_snode_id
    else:
        # no semantics here: children attach to the inherited anchor
        semantic_anchor = semantic_parent
    # delete moved nodes highest-index-first so earlier indices stay valid
    for i in sorted(remove_sem_nodes, reverse=True):
        del node.semantic_nodes[i]
    # if current node is an overlapping node and it has nodes left, send to the spanning parent
    # (if all the arguments of the node is covered by one of the children, should ideally send down, but not now)
    if node.node_id in self.overlapping_node_map and len(
            node.semantic_nodes) > 0:
        parent_node_id = self.overlapping_node_map[node.node_id]
        for i in range(len(node.semantic_nodes) - 1, -1, -1):
            self.nodes[parent_node_id].semantic_nodes.append(
                node.semantic_nodes[i])
            del node.semantic_nodes[i]
    for child_id in node.overlapping_node_ids:
        self.process_semantic_tree(child_id, dmrs_rep, semantic_anchor)
    # For token (preterminal) nodes, extract lemmas from predicates
    if node.isToken:
        if len(node.token_ids) == 1:
            # single-token preterminal: pick the surface predicate whose
            # lemma best matches the cleaned token string
            tok = self.token_nodes[node.token_ids[0]]
            best_lemma_match_prob = 0.0
            best_sid = -1
            best_pred = ""
            t_str = clean_token_lemma(tok.token_str)
            for sid, sem_node in enumerate(node.semantic_nodes):
                if d_predicate.is_surface(sem_node.original_predicate):
                    sem_node.is_surface = True
                    lemma, pos, sense = d_predicate.split(
                        sem_node.original_predicate)
                    # delexicalized predicate: "_pos" or "_pos_sense"
                    pred = "_" + ("_".join([pos, sense])
                                  if sense is not None else pos)
                    seq = difflib.SequenceMatcher(a=lemma, b=t_str)
                    lemma_match_prob = seq.ratio()
                    if tok.lemma == "" or lemma_match_prob > best_lemma_match_prob:
                        tok.lemma = lemma
                        best_sid = sid
                        best_pred = pred
                        best_lemma_match_prob = lemma_match_prob
                    if pred == "_u_unknown":
                        # unknown words carry "lemma/tag"; strip the tag and
                        # rewrite the predicate with the cleaned lemma
                        if "/" in lemma:
                            tok.lemma = lemma[:lemma.rindex("/")]
                            sem_node.original_predicate = "_" + tok.lemma + pred
                        tok.is_unknown = True
                if sem_node.carg is not None:
                    sem_node.is_surface = True
                    if tok.carg == "":
                        tok.carg = sem_node.carg
                        # For multiple CARGs, just take first one as heuristic
            if tok.carg != "":
                if tok.lemma == "":
                    tok.lemma = tok.carg
                else:
                    # prefer the CARG over the lemma when it matches the
                    # token string better
                    t_str = clean_token_lemma(tok.token_str, True)
                    seq = difflib.SequenceMatcher(a=tok.carg, b=t_str)
                    carg_match_prob = seq.ratio()
                    if carg_match_prob > best_lemma_match_prob:
                        tok.lemma = tok.carg
                        best_lemma_match_prob = carg_match_prob
            #if best_lemma_match_prob < 0.5 and tok.lemma != "" and tok.lemma != tok.carg:
            #    print(tok.lemma, tok.token_str)
            if best_sid >= 0 and tok.lemma != tok.carg:
                # delexicalize the winning semantic node
                node.semantic_nodes[best_sid].predicate = best_pred
                node.semantic_nodes[best_sid].lemma = tok.lemma
        elif len(node.token_ids) > 1:
            # multi-token preterminal: split a hyphen/plus-joined lemma and
            # assign one part per token when the counts line up
            matched_multi = False
            for sem_node in node.semantic_nodes:
                if d_predicate.is_surface(sem_node.original_predicate):
                    sem_node.is_surface = True
                    lemma, pos, sense = d_predicate.split(
                        sem_node.original_predicate)
                    if "-" in lemma:
                        lemma_split = lemma.split("-")
                        # keep the hyphen on the first part
                        lemma_split[0] += "-"
                    else:
                        #TODO "awhile"
                        lemma_split = lemma.split("+")
                    if len(lemma_split) == len(node.token_ids):
                        pred = "_" + ("_".join([pos, sense])
                                      if sense is not None else pos)
                        sem_node.predicate = pred
                        sem_node.lemma = lemma
                        for i, tok_id in enumerate(node.token_ids):
                            tok = self.token_nodes[tok_id]
                            tok.lemma = lemma_split[i]
                        matched_multi = True
                        break
            #TODO match the carg if there is one
            if matched_multi:
                # collected only for the (commented-out) debug print below
                tokstr = [
                    self.token_nodes[tok_id].token_str
                    for tok_id in node.token_ids
                ]
                semstr = [
                    sem_node.original_predicate
                    for sem_node in node.semantic_nodes
                ]
                #print("matched", node.token_form, tokstr, semstr)
    for child_id in node.child_node_ids:
        self.process_semantic_tree(child_id, dmrs_rep, semantic_anchor)