Example #1
def match_events_2(nodeT, nodeH):
    # Two event relation arguments in the same
    # argument position match if:
    lemmaT, posT, senseT = split(nodeT.predicate)
    lemmaH, posH, senseH = split(nodeH.predicate)

    # they are the same or synonymous, or the
    # Hevent argument is a hypernym of the Tevent
    # argument, or
    if lemmaT == lemmaH: return True
    if synonyms_or_hyperonyms(lemmaT, lemmaH):
        return True

    # the argument in Tevent represents a noun
    # phrase and the argument in Hevent is an
    # underspecified pronoun like somebody, or
    pass  # todo

    # the argument in Tevent is either a scopal
    # relation or a conjunction relation, and one
    # of its arguments matches that of Hevent, or
    pass  # todo

    # the argument in Hevent is not expressed
    # (i.e., it matches the Tevent argument by
    # default)
    pass  # todo

    return False
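
# A minimal usage sketch (not part of the original source): it assumes nodes
# carry DELPH-IN predicate strings, that split() is pydelphin's
# delphin.predicate.split, and it stubs out the lexical lookup.
from collections import namedtuple

from delphin import predicate

Node = namedtuple('Node', 'predicate')
split = predicate.split

def synonyms_or_hyperonyms(lemma_a, lemma_b):
    return False  # stub; a real system would consult a resource like WordNet

assert match_events_2(Node('_bark_v_1'), Node('_bark_v_1'))  # same lemma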
Example #2
def match_events_1(nodeT, nodeH):
    """
    Returns whether they represent the same lexeme
    with the same part-of-speech, or if both are verbs
    and Hevent is a synonym or hypernym of Tevent
    """

    if nodeT.predicate == nodeH.predicate: return True
    lpossT = split(nodeT.predicate)  # lemma, part-of-speech, sense
    lpossH = split(nodeH.predicate)  # lemma, part-of-speech, sense

    verbs = lpossT[1] == "v" and lpossH[1] == "v"
    s_or_h = synonyms_or_hyperonyms(lpossT[0], lpossH[0])

    return verbs and s_or_h
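
# For reference (assuming pydelphin), delphin.predicate.split breaks a
# surface predicate into its lemma, part-of-speech, and sense components:
from delphin import predicate

lemma, pos, sense = predicate.split('_bark_v_1')
print(lemma, pos, sense)  # bark v 1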
Example #3
def _encode_pred(pred):
    if predicate.is_surface(pred):  # surface predicate, e.g. '_dog_n_1'
        lemma, pos, sense = predicate.split(pred)
        attributes = {'lemma': lemma, 'pos': pos}
        if sense:
            attributes['sense'] = sense
        e = etree.Element('realpred', attrib=attributes)
    else:
        # abstract (grammar) predicate, e.g. 'pron'
        e = etree.Element('gpred')
        e.text = pred
    return e
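
# A quick check of the encoder above (assuming its imports are pydelphin's
# predicate module and xml.etree.ElementTree as etree):
from xml.etree import ElementTree as etree

from delphin import predicate

print(etree.tostring(_encode_pred('_dog_n_1')))
# b'<realpred lemma="dog" pos="n" sense="1" />'
print(etree.tostring(_encode_pred('pron')))
# b'<gpred>pron</gpred>'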
Example #4
def _encode_pred(pred):
    if predicate.is_surface(pred):
        lemma, pos, sense = predicate.split(pred)
        attributes = {'lemma': lemma, 'pos': pos}
        if sense is not None:
            attributes['sense'] = sense
        p = etree.Element('realpred', attrib=attributes)
    elif predicate.is_abstract(pred):
        # well-formed abstract (grammar) predicate
        p = etree.Element('pred')
        p.text = pred
    else:
        # anything else is kept as an opaque string predicate
        p = etree.Element('spred')
        p.text = pred
    return p
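
# Compared with Example #3, this variant reserves <pred> for well-formed
# abstract predicates and falls back to <spred> for anything else. A small
# check, again assuming pydelphin's predicate module:
from delphin import predicate

assert predicate.is_abstract('pron') and not predicate.is_surface('pron')
print(_encode_pred('pron').tag)  # pred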
Example #5
# Assumed context for this excerpt: the standard-library difflib module and
# pydelphin's predicate module imported as d_predicate.
import difflib

from delphin import predicate as d_predicate


def match_surface_predicate_token(predicate, start_token_index,
                                  end_token_index, token_nodes,
                                  token_node_list):
    match_prob = []
    tkns = []
    lemma, pos, sense = d_predicate.split(predicate)
    if pos == 'u':
        # unknown-word predicates embed the token's POS tag after '/'; strip it
        lemma = lemma[:lemma.rindex('/')]
    for tok_index in range(start_token_index, end_token_index + 1):
        # id lookup bypasses the derivation node
        token = token_nodes[token_node_list[tok_index]]
        t_str = clean_token_lemma(token.token_str, predicate.isdigit())
        tkns.append(t_str)
        seq = difflib.SequenceMatcher(a=t_str, b=lemma.lower())
        match_prob.append(seq.ratio())
    token_match = start_token_index + match_prob.index(max(match_prob))
    if max(match_prob) == 0:
        # no token matched the lemma at all; log for debugging
        print(predicate, tkns)

    return token_match
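
# The fuzzy matching above uses difflib's Gestalt ratio, a similarity score
# between 0 and 1; for example:
import difflib

print(difflib.SequenceMatcher(a='barked', b='bark').ratio())  # 0.8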
Example #6
    def process_semantic_tree(self, node_id, dmrs_rep, semantic_parent=-1):
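        """Recursively anchor DMRS semantic nodes to the syntactic tree.

        Records internal and parent-to-child semantic edges, moves surface
        predicates of non-token nodes down to the preterminal of their
        best-matching token, and extracts token lemmas from surface
        predicates and CARG values.
        """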
        node = self.nodes[node_id]
        sem_node_ids = [snode.node_id for snode in node.semantic_nodes]
        remove_sem_nodes = []
        internal_edge_from = []  # semantic node ids
        internal_edge_to = []
        internal_edge_label = []

        if node.semantic_nodes:
            semantic_anchor = node_id
            node.semantic_parent_node = semantic_parent

            for edge in dmrs_rep.links:
                start_node_id = self.dmrs_node_map[edge.start]
                end_node_id = self.dmrs_node_map[edge.end]
                if end_node_id == node_id:
                    #start_id = sem_node_ids.index(edge.start)
                    end_id = sem_node_ids.index(edge.end)
                    sem_node = node.semantic_nodes[end_id]
                    if start_node_id == node_id:
                        # record internal edge
                        internal_edge_from.append(edge.start)
                        internal_edge_to.append(edge.end)
                        internal_edge_label.append(edge.role)
                        # previously recorded in the node, and test for non-chains
                    elif start_node_id == semantic_parent:
                        # record ancestor edge
                        self.nodes[node_id].semantic_nodes[
                            end_id].has_ancestor = True
                        #assert self.nodes[node_id].semantic_parent_edge_label == ""
                        self.nodes[
                            node_id].semantic_parent_edge_label = edge.role
                        parent_sem_node_ids = [
                            snode.node_id for snode in
                            self.nodes[semantic_parent].semantic_nodes
                        ]
                        parent_start_id = parent_sem_node_ids.index(edge.start)
                        self.nodes[semantic_parent].semantic_nodes[
                            parent_start_id].is_semantic_head = True

            # identify non-token-level surface predicates to move
            #   if the node has internal children, don't move
            for sid, sem_node in enumerate(node.semantic_nodes):
                if (not node.isToken
                        and sem_node.node_id not in internal_edge_from):
                    token_index = -1
                    if d_predicate.is_surface(sem_node.original_predicate):
                        token_index = match_surface_predicate_token(
                            sem_node.original_predicate,
                            node.start_token_index, node.end_token_index,
                            self.token_nodes, self.token_node_list)
                    elif sem_node.carg is not None:
                        token_index = match_surface_predicate_token(
                            sem_node.carg, node.start_token_index,
                            node.end_token_index, self.token_nodes,
                            self.token_node_list)

                    if token_index >= 0:
                        token_id = self.token_node_list[token_index]
                        new_preterminal = self.token_preterminal_node_map[
                            token_id]
                        self.nodes[new_preterminal].semantic_nodes.append(
                            sem_node)
                        self.dmrs_node_map[sem_node.node_id] = new_preterminal
                        remove_sem_nodes.append(sid)
                        # follow the chain
                        # for some quantifiers this might be intended to span everything, but it seems good enough for now
                        snode_id = sem_node.node_id
                        while snode_id in internal_edge_to:
                            new_snode_id = -1
                            for edge_i, parent_node_id in enumerate(
                                    internal_edge_from):
                                if internal_edge_to[
                                        edge_i] == snode_id and internal_edge_from.count(
                                            parent_node_id) == 1:
                                    sid = sem_node_ids.index(parent_node_id)
                                    sem_node = node.semantic_nodes[sid]
                                    self.nodes[
                                        new_preterminal].semantic_nodes.append(
                                            sem_node)

                                    self.dmrs_node_map[
                                        sem_node.node_id] = new_preterminal
                                    remove_sem_nodes.append(sid)
                                    if parent_node_id in internal_edge_to:
                                        #if new_snode_id >= 0: # almost never have 2 internal parents
                                        new_snode_id = parent_node_id
                            snode_id = new_snode_id

        else:
            semantic_anchor = semantic_parent

        for i in sorted(remove_sem_nodes, reverse=True):
            del node.semantic_nodes[i]

        # if the current node is an overlapping node and still has semantic nodes,
        # send them to the spanning parent (if all of the node's arguments are
        # covered by one of the children, it should ideally be sent down, but not for now)
        if node.node_id in self.overlapping_node_map and node.semantic_nodes:
            parent_node_id = self.overlapping_node_map[node.node_id]
            for i in range(len(node.semantic_nodes) - 1, -1, -1):
                self.nodes[parent_node_id].semantic_nodes.append(
                    node.semantic_nodes[i])
                del node.semantic_nodes[i]

        for child_id in node.overlapping_node_ids:
            self.process_semantic_tree(child_id, dmrs_rep, semantic_anchor)

        # For token (preterminal) nodes, extract lemmas from predicates
        if node.isToken:
            if len(node.token_ids) == 1:
                tok = self.token_nodes[node.token_ids[0]]
                best_lemma_match_prob = 0.0
                best_sid = -1
                best_pred = ""
                t_str = clean_token_lemma(tok.token_str)
                for sid, sem_node in enumerate(node.semantic_nodes):
                    if d_predicate.is_surface(sem_node.original_predicate):
                        sem_node.is_surface = True
                        lemma, pos, sense = d_predicate.split(
                            sem_node.original_predicate)
                        pred = "_" + ("_".join([pos, sense])
                                      if sense is not None else pos)
                        seq = difflib.SequenceMatcher(a=lemma, b=t_str)
                        lemma_match_prob = seq.ratio()
                        if tok.lemma == "" or lemma_match_prob > best_lemma_match_prob:
                            tok.lemma = lemma
                            best_sid = sid
                            best_pred = pred
                            best_lemma_match_prob = lemma_match_prob
                        if pred == "_u_unknown":
                            if "/" in lemma:
                                tok.lemma = lemma[:lemma.rindex("/")]
                                sem_node.original_predicate = "_" + tok.lemma + pred
                            tok.is_unknown = True
                    if sem_node.carg is not None:
                        sem_node.is_surface = True
                        if tok.carg == "":
                            tok.carg = sem_node.carg
                        # For multiple CARGs, just take first one as heuristic
                if tok.carg != "":
                    if tok.lemma == "":
                        tok.lemma = tok.carg
                    else:
                        t_str = clean_token_lemma(tok.token_str, True)
                        seq = difflib.SequenceMatcher(a=tok.carg, b=t_str)
                        carg_match_prob = seq.ratio()
                        if carg_match_prob > best_lemma_match_prob:
                            tok.lemma = tok.carg
                            best_lemma_match_prob = carg_match_prob
                #if best_lemma_match_prob < 0.5 and tok.lemma != "" and tok.lemma != tok.carg:
                #    print(tok.lemma, tok.token_str)
                if best_sid >= 0 and tok.lemma != tok.carg:
                    node.semantic_nodes[best_sid].predicate = best_pred
                    node.semantic_nodes[best_sid].lemma = tok.lemma
            elif len(node.token_ids) > 1:
                matched_multi = False
                for sem_node in node.semantic_nodes:
                    if d_predicate.is_surface(sem_node.original_predicate):
                        sem_node.is_surface = True
                        lemma, pos, sense = d_predicate.split(
                            sem_node.original_predicate)
                        if "-" in lemma:
                            lemma_split = lemma.split("-")
                            lemma_split[0] += "-"
                        else:
                            #TODO "awhile"
                            lemma_split = lemma.split("+")
                        if len(lemma_split) == len(node.token_ids):
                            pred = "_" + ("_".join([pos, sense])
                                          if sense is not None else pos)
                            sem_node.predicate = pred
                            sem_node.lemma = lemma
                            for i, tok_id in enumerate(node.token_ids):
                                tok = self.token_nodes[tok_id]
                                tok.lemma = lemma_split[i]

                            matched_multi = True
                            break
                    #TODO match the carg if there is one

                if matched_multi:
                    # debug bookkeeping only; the print below is disabled
                    tokstr = [
                        self.token_nodes[tok_id].token_str
                        for tok_id in node.token_ids
                    ]
                    semstr = [
                        sem_node.original_predicate
                        for sem_node in node.semantic_nodes
                    ]
                    #print("matched", node.token_form, tokstr, semstr)

        for child_id in node.child_node_ids:
            self.process_semantic_tree(child_id, dmrs_rep, semantic_anchor)
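
# A self-contained illustration (not from the source) of the multiword
# heuristic in the multi-token branch above: ERG lemmas join the parts of a
# multiword expression with '+' (or keep a leading hyphenated part), so
# splitting recovers one part per token:
lemma = 'in+front+of'
print(lemma.split('+'))  # ['in', 'front', 'of']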