コード例 #1
0
ファイル: sieveDummy.py プロジェクト: andreasvc/groref
def sieveDummy(mention_id_list, mention_dict, cluster_dict, cluster_id_list):
    """Baseline sieve: merge each odd-positioned mention with its predecessor.

    Walks the odd indices of ``mention_id_list`` (1, 3, 5, ...) and merges
    the cluster of the mention at that index with the cluster of the mention
    one position earlier, via ``mergeClustersByMentionIDs``.

    Returns the tuple ``(mention_id_list, mention_dict, cluster_dict,
    cluster_id_list)``; the last two are whatever the merge helper handed
    back on its final call (or the inputs when no merge took place).
    """
    for odd_pos in range(1, len(mention_id_list), 2):
        current_id = mention_id_list[odd_pos]
        previous_id = mention_id_list[odd_pos - 1]
        cluster_dict, cluster_id_list = mergeClustersByMentionIDs(
            current_id,
            previous_id,
            mention_dict,
            cluster_dict,
            cluster_id_list,
        )
    return mention_id_list, mention_dict, cluster_dict, cluster_id_list
コード例 #2
0
def sievePreciseConstructs(mention_id_list, mention_dict, cluster_dict,
                           cluster_id_list, verbosity):
    """Merge clusters whose mentions are tied by a 'precise construct'.

    For every cluster ID in a snapshot of ``cluster_id_list`` the first
    mention of the cluster is taken as the anaphor.  Sentences are scanned
    from the anaphor's sentence back to sentence 1; inside a sentence the
    mentions are visited in list order, stopping at the anaphor itself.
    Every mention of each candidate's cluster is tested against the anaphor
    with these checks, any of which triggers a link:

    * appositive (``appositive``; only when the anaphor has more than two
      tokens),
    * predicate nominative (``pred_nom``),
    * relative pronoun (``get_rel_pron``; only when the anaphor's
      ``pron_type`` is ``'betr'``),
    * acronym (``acronyms``; only when both mentions have type ``'name'``),
    * demonym (lookup of a single-token mention in ``Demo``).

    On the first link the two clusters are merged with
    ``mergeClustersByMentionIDs`` and the search for this anaphor stops.

    ``verbosity == 'high'`` enables progress output on stdout.

    Returns ``(mention_id_list, mention_dict, cluster_dict,
    cluster_id_list)``.
    """
    mention_ids_per_sentence = get_mention_id_list_per_sentence(
        mention_id_list, mention_dict)
    if verbosity == 'high':
        print('Applying Precise Constructs...')
    # Iterate over a copy: cluster_id_list is reassigned inside the loop
    # whenever a merge happens.
    for cluster_id in cluster_id_list[:]:
        madeLink = False
        cluster = cluster_dict[cluster_id]
        # Only consider first mention in cluster for resolution
        anaphor = mention_dict[cluster.mentionList[0]]
        # Cycle through sentences backwards, but through mentions within a
        # sentence forwards
        for sent_id in range(anaphor.sentNum, 0, -1):
            if madeLink:
                break
            if sent_id in mention_ids_per_sentence:  # Not empty
                for candidate_mention_id in mention_ids_per_sentence[sent_id]:
                    if madeLink:
                        break
                    # Don't look ahead of anaphor
                    if candidate_mention_id == anaphor.ID:
                        break
                    candidate_cluster = cluster_dict[
                        mention_dict[candidate_mention_id].clusterID]
                    # Every mention of the candidate's cluster gets a chance
                    # to license the link, not just the candidate itself.
                    for ment_id in candidate_cluster.mentionList:
                        root = mention_dict[ment_id].tree.getroot()
                        # Debug aid (disabled):
                        # print(mention_dict[ment_id].tokenAttribs,
                        #       mention_dict[anaphor.ID].headWords,
                        #       mention_dict[anaphor.ID].tokenAttribs)
                        # Appositive: link two mentions if they are in an
                        # appositive construction.
                        # NOTE(review): appos[0] / appos[1] are used as
                        # containers of (id, root) pairs -- presumably the two
                        # sides of the construction; confirm in appositive().
                        appos = appositive(root)
                        if (appos and
                                len(mention_dict[anaphor.ID].tokenList) > 2):
                            for ids in mention_dict[ment_id].tokenAttribs:
                                if (ids['id'], ids['root']) not in appos[0]:
                                    continue
                                for ana_ids in mention_dict[
                                        anaphor.ID].tokenAttribs:
                                    if (ana_ids['id'],
                                            ana_ids['root']) in appos[1]:
                                        madeLink = True
                                        break
                                if madeLink:
                                    break
                        # Predicate nominative: Two mentions (nominal or
                        # pronominal) are in a copulative subject-object
                        # relation
                        # Same pairing scheme as the appositive check above:
                        # a candidate token in pred[0] and an anaphor token in
                        # pred[1].
                        pred = pred_nom(root)
                        if pred:
                            for ids in mention_dict[ment_id].tokenAttribs:
                                if (ids['id'], ids['root']) not in pred[0]:
                                    continue
                                for ana_ids in mention_dict[
                                        anaphor.ID].tokenAttribs:
                                    if (ana_ids['id'],
                                            ana_ids['root']) in pred[1]:
                                        madeLink = True
                                        break
                                if madeLink:
                                    break
                        # Role appositive: not implemented. An earlier attempt
                        # (comparing the candidate's head words with the
                        # anaphor's tokens for person / animate mentions) is
                        # disabled.

                        # Relative Pronoun: mention is a relative pronoun that
                        # modifies the head of the antecedent NP
                        # NOTE(review): 'betr' presumably marks a relative
                        # pronoun (Dutch "betrekkelijk") -- confirm against
                        # the code that fills pron_type.
                        if mention_dict[anaphor.ID].pron_type == 'betr':
                            rp = get_rel_pron(
                                root, mention_dict[anaphor.ID].tokenAttribs[0])
                            if len(rp) > 0:
                                # Link when the 'id' of rp[0][0] equals the id
                                # of one of the candidate mention's tokens.
                                for at in mention_dict[ment_id].tokenAttribs:
                                    if rp[0][0]['id'] == at['id']:
                                        madeLink = True
                                        break
                        # Acronym: Both mentions are tagged as NNP and one of
                        # them is an acronym of the other
                        # The acronyms are computed unconditionally; they are
                        # only consulted when both mentions have type 'name'.
                        mention_acr = acronyms(mention_dict[ment_id].tokenList)
                        anaphor_acr = acronyms(
                            mention_dict[anaphor.ID].tokenList)
                        if (mention_dict[ment_id].type == 'name'
                                and mention_dict[anaphor.ID].type == 'name'):
                            if (mention_acr
                                    in mention_dict[anaphor.ID].tokenList
                                    or anaphor_acr
                                    in mention_dict[ment_id].tokenList):
                                madeLink = True
                        # Demonym: one of the mentions is a demonym of the other
                        # Checked in both directions: a single-token mention
                        # is looked up in Demo and its entry compared with the
                        # first token of the other mention.
                        if len(mention_dict[ment_id].tokenList) == 1:
                            if mention_dict[ment_id].tokenList[0] in Demo:
                                if (Demo[mention_dict[ment_id].tokenList[0]] ==
                                        mention_dict[anaphor.ID].tokenList[0]):
                                    madeLink = True
                        if len(mention_dict[anaphor.ID].tokenList) == 1:
                            if mention_dict[anaphor.ID].tokenList[0] in Demo:
                                if (Demo[mention_dict[anaphor.ID].tokenList[0]]
                                        == mention_dict[ment_id].tokenList[0]):
                                    madeLink = True
                        if madeLink:
                            if verbosity == 'high':
                                print('Linking clusters %d and %d' %
                                      (ment_id, anaphor.ID))
                            (cluster_dict,
                             cluster_id_list) = mergeClustersByMentionIDs(
                                 ment_id, anaphor.ID, mention_dict,
                                 cluster_dict, cluster_id_list)
                            # Leave the mention loop; the madeLink guards at
                            # the top of the two outer loops then end the
                            # search for this anaphor.
                            break
    return mention_id_list, mention_dict, cluster_dict, cluster_id_list
コード例 #3
0
ファイル: sieveStringMatch.py プロジェクト: andreasvc/groref
def _relaxedTokenPrefix(mention):
    """Return the lower-cased tokens of ``mention`` up to its first head word.

    Tokens are collected in order and collection stops right after the first
    token equal to ``mention.headWords[0]``.  A mention without head words
    yields an empty list.
    """
    lowered = []
    for word in mention.tokenList:
        if len(mention.headWords) > 0:
            lowered.append(word.lower())
            if word == mention.headWords[0]:
                break
    return lowered


def _firstStringMatch(anaphor, mention_ids_per_sentence, mention_dict,
                      cluster_dict):
    """Return the ID of the first mention that string-matches ``anaphor``.

    Sentences are visited from the anaphor's sentence back to sentence 1,
    mentions within a sentence in list order up to the anaphor itself, and
    for each candidate every mention of its cluster is compared.  Returns
    ``None`` when nothing matches.
    """
    anaphor_prefix = _relaxedTokenPrefix(anaphor)
    for sent_id in range(anaphor.sentNum, 0, -1):
        if sent_id not in mention_ids_per_sentence:
            continue
        for candidate_id in mention_ids_per_sentence[sent_id]:
            if candidate_id == anaphor.ID:
                # Never look past the anaphor inside its own sentence.
                break
            candidate_cluster = cluster_dict[
                mention_dict[candidate_id].clusterID]
            for ment_id in candidate_cluster.mentionList:
                mention = mention_dict[ment_id]
                # Exact match: identical token sequences.
                is_exact = mention.tokenList == anaphor.tokenList
                # Relaxed match: identical (non-empty) text up to the head.
                mention_prefix = _relaxedTokenPrefix(mention)
                is_relaxed = (len(mention_prefix) > 0
                              and mention_prefix == anaphor_prefix)
                if is_exact or is_relaxed:
                    return ment_id
    return None


def sieveStringMatch(mention_id_list, mention_dict, cluster_dict,
                     cluster_id_list, verbosity):
    """Merge clusters whose mentions match exactly or up to the head word.

    For each cluster in a snapshot of ``cluster_id_list`` the first mention
    is the anaphor; the closest preceding mention (see
    ``_firstStringMatch``) with an exact or relaxed string match is merged
    with it through ``mergeClustersByMentionIDs``.

    ``verbosity == 'high'`` enables progress output on stdout.

    Returns ``(mention_id_list, mention_dict, cluster_dict,
    cluster_id_list)``.
    """
    mention_ids_per_sentence = get_mention_id_list_per_sentence(
        mention_id_list, mention_dict)
    if verbosity == 'high':
        print('Doing exact + relaxed string matching...')
    for cluster_id in cluster_id_list[:]:
        # Only the first mention of a cluster is resolved.
        anaphor = mention_dict[cluster_dict[cluster_id].mentionList[0]]
        matched_id = _firstStringMatch(
            anaphor, mention_ids_per_sentence, mention_dict, cluster_dict)
        if matched_id is None:
            continue
        if verbosity == 'high':
            print('Linking clusters %d and %d' % (matched_id, anaphor.ID))
        cluster_dict, cluster_id_list = mergeClustersByMentionIDs(
            matched_id, anaphor.ID, mention_dict, cluster_dict,
            cluster_id_list)
    return mention_id_list, mention_dict, cluster_dict, cluster_id_list
コード例 #4
0
def _speakerPronounMatch(mention, anaphor, root, anaphor_root):
    """Tell whether ``anaphor`` is a speaker pronoun tied to ``mention``.

    Applies only when ``mention.type`` is one of the subject labels.  If a
    token of the mention coincides (same ``id`` and ``root``) with a node
    returned by ``find_sub(root)``, all ``pos == 'pron'`` nodes below
    ``anaphor_root`` are collected.  The result is True when a token of the
    anaphor shares its ``id`` with such a pronoun node and its ``root`` is a
    first or second person pronoun.
    """
    if mention.type not in ('su', 'mwu_su', 'np_su'):
        return False
    speaker_pronouns = (
        'ik', 'mij', 'me', 'mijn',           # I
        'jij', 'je', 'jullie', 'U', 'u',     # you
        'wij', 'ons', 'onze',                # we
    )
    pronoun_attribs = []
    subjects = find_sub(root)
    for token in mention.tokenAttribs:
        for subject in subjects:
            if (token['id'] != subject.attrib['id']
                    or token['root'] != subject.attrib['root']):
                continue
            for node in anaphor_root.iter('node'):
                attribs = node.attrib
                if 'pos' in attribs and attribs['pos'] == 'pron':
                    pronoun_attribs.append(attribs)
    linked = False
    # Deliberately no early exit: every token/pronoun pair is inspected.
    for token in anaphor.tokenAttribs:
        for pron in pronoun_attribs:
            if token['id'] == pron['id'] and token['root'] in speaker_pronouns:
                linked = True
    return linked


def _findSpeakerAntecedent(anaphor, mention_ids_per_sentence, mention_dict,
                           cluster_dict):
    """Return the ID of the first mention accepted as speaker of ``anaphor``.

    Sentences are visited from the anaphor's sentence back to sentence 1,
    mentions within a sentence in list order up to the anaphor itself, and
    every mention of each candidate's cluster is tested.  Returns ``None``
    when no mention qualifies.
    """
    for sent_id in range(anaphor.sentNum, 0, -1):
        if sent_id not in mention_ids_per_sentence:
            continue
        for candidate_id in mention_ids_per_sentence[sent_id]:
            if candidate_id == anaphor.ID:
                # Never look past the anaphor inside its own sentence.
                break
            candidate_cluster = cluster_dict[
                    mention_dict[candidate_id].clusterID]
            for ment_id in candidate_cluster.mentionList:
                mention = mention_dict[ment_id]
                root = mention.tree.getroot()
                anaphor_root = mention_dict[anaphor.ID].tree.getroot()
                if _speakerPronounMatch(mention, anaphor, root, anaphor_root):
                    return ment_id
    return None


def sieveSpeakerIdentification(mention_id_list, mention_dict, cluster_dict,
        cluster_id_list, verbosity):
    """Merge first/second person pronouns with a preceding subject mention.

    For each cluster in a snapshot of ``cluster_id_list`` the first mention
    is the anaphor.  The first preceding mention accepted by
    ``_speakerPronounMatch`` is merged with it through
    ``mergeClustersByMentionIDs``.

    ``verbosity == 'high'`` enables progress output on stdout.

    Returns ``(mention_id_list, mention_dict, cluster_dict,
    cluster_id_list)``.
    """
    mention_ids_per_sentence = get_mention_id_list_per_sentence(
        mention_id_list, mention_dict)
    if verbosity == 'high':
        print('Speaker identification...')
    for cluster_id in cluster_id_list[:]:
        # Only the first mention of a cluster is resolved.
        anaphor = mention_dict[cluster_dict[cluster_id].mentionList[0]]
        speaker_id = _findSpeakerAntecedent(
            anaphor, mention_ids_per_sentence, mention_dict, cluster_dict)
        if speaker_id is None:
            continue
        if verbosity == 'high':
            print('Linking clusters %d and %d' % (speaker_id, anaphor.ID))
        cluster_dict, cluster_id_list = mergeClustersByMentionIDs(
            speaker_id, anaphor.ID, mention_dict, cluster_dict,
            cluster_id_list)
    return mention_id_list, mention_dict, cluster_dict, cluster_id_list
コード例 #5
0
def _contentLemmas(tokenAttribs):
    """Return the set of lemmas in ``tokenAttribs`` that are not stop words."""
    return {
        attrib["lemma"]
        for attrib in tokenAttribs
        if attrib["lemma"] not in stopWords
    }


def _modifierLemmas(tokenAttribs):
    """Return the set of lemmas in ``tokenAttribs`` whose ``rel`` is 'mod'."""
    return {
        attrib["lemma"] for attrib in tokenAttribs if attrib["rel"] == "mod"
    }


def _headMatchAllowsLink(strictness, anaphor, candidate_mention,
                         candidate_cluster, mention_dict):
    """Decide whether ``anaphor`` may be linked to ``candidate_mention``.

    Every constraint is computed from scratch for this candidate, so the
    outcome never depends on candidates inspected earlier:

    * entity head match: the anaphor's head words are a subset of the head
      words of some mention in the candidate's cluster;
    * relaxed head match: the anaphor's head words are a subset of the
      non-stop-word lemmas of the candidate's cluster;
    * word inclusion: the anaphor's non-stop-word lemmas are a subset of
      those of the candidate's cluster;
    * compatible modifiers: the anaphor's modifier lemmas are a subset of
      the candidate mention's modifier lemmas;
    * both named entities: both mentions have type 'name'.

    Required per strictness: 3 = entity head + word inclusion + modifiers,
    2 = entity head + word inclusion, 1 = entity head + modifiers,
    0 = relaxed head + word inclusion + both named entities.  Any other
    strictness never links.
    """
    anaphorHeads = set(anaphor.headWords)
    anaphorWords = _contentLemmas(anaphor.tokenAttribs)
    anaphorMods = _modifierLemmas(anaphor.tokenAttribs)

    clusterWords = set()
    entityHeadMatch = False
    for ment_id in candidate_cluster.mentionList:
        member = mention_dict[ment_id]
        clusterWords |= _contentLemmas(member.tokenAttribs)
        if anaphorHeads.issubset(member.headWords):
            entityHeadMatch = True

    relaxedHeadMatch = anaphorHeads <= clusterWords
    wordInclusion = anaphorWords <= clusterWords
    compModsOnly = anaphorMods <= _modifierLemmas(
        candidate_mention.tokenAttribs)
    bothNE = (candidate_mention.type.lower() == 'name'
              and anaphor.type.lower() == 'name')

    if strictness == 3:
        return entityHeadMatch and wordInclusion and compModsOnly
    if strictness == 2:
        return entityHeadMatch and wordInclusion
    if strictness == 1:
        return entityHeadMatch and compModsOnly
    if strictness == 0:
        # NOTE: an i-within-i restriction was considered here in an earlier
        # draft but was never active; the named-entity requirement is used
        # instead.
        return relaxedHeadMatch and wordInclusion and bothNE
    return False


def _findHeadMatchAntecedent(strictness, anaphor, mention_ids_per_sentence,
                             mention_dict, cluster_dict):
    """Return the ID of the first candidate that head-matches ``anaphor``.

    Sentences are visited from the anaphor's sentence back to sentence 1 and
    mentions within a sentence in list order up to the anaphor itself.
    Candidates without head words are skipped.  Returns ``None`` when no
    candidate qualifies.
    """
    for sent_id in range(anaphor.sentNum, 0, -1):
        if sent_id not in mention_ids_per_sentence:
            continue
        for candidate_mention_id in mention_ids_per_sentence[sent_id]:
            if candidate_mention_id == anaphor.ID:
                # Never look past the anaphor inside its own sentence.
                break
            candidate_mention = mention_dict[candidate_mention_id]
            if not candidate_mention.headWords:
                # Without head words, head-matching is not going to work.
                continue
            candidate_cluster = cluster_dict[candidate_mention.clusterID]
            if _headMatchAllowsLink(strictness, anaphor, candidate_mention,
                                    candidate_cluster, mention_dict):
                return candidate_mention_id
    return None


def sieveHeadMatch(
    mention_id_list,
    mention_dict,
    cluster_dict,
    cluster_id_list,
    strictness,
    verbosity,
):
    """Merge clusters on the basis of matching head words.

    For each cluster in a snapshot of ``cluster_id_list`` the first mention
    is the anaphor (clusters whose first mention has no head words are
    skipped).  The first preceding candidate that satisfies the constraints
    selected by ``strictness`` (see ``_headMatchAllowsLink``) is merged with
    the anaphor through ``mergeClustersByMentionIDs``.

    Changes with respect to the previous implementation:

    * all constraints are evaluated afresh for every candidate; previously
      the word-inclusion, modifier and named-entity flags, once set by a
      rejected candidate, stayed set for all later candidates of the same
      anaphor;
    * the strictness-0 log line now reports the clusters being merged, like
      the other strictness levels, instead of a loop variable left over from
      collecting the cluster's words.

    ``strictness`` is an integer 0-3 (3 is the most restrictive);
    ``verbosity == 'high'`` enables progress output on stdout.

    Returns ``(mention_id_list, mention_dict, cluster_dict,
    cluster_id_list)``.
    """
    mention_ids_per_sentence = get_mention_id_list_per_sentence(
        mention_id_list, mention_dict)
    if verbosity == 'high':
        print('Doing head-matching with strictness %d...' % strictness)
    for cluster_id in cluster_id_list[:]:
        # Only the first mention of a cluster is resolved.
        anaphor = mention_dict[cluster_dict[cluster_id].mentionList[0]]
        if not anaphor.headWords:
            # If no headwords, head-matching is not going to work.
            continue
        candidate_mention_id = _findHeadMatchAntecedent(
            strictness, anaphor, mention_ids_per_sentence, mention_dict,
            cluster_dict)
        if candidate_mention_id is None:
            continue
        if verbosity == 'high':
            print('Linking clusters %d and %d' % (
                cluster_dict[mention_dict[candidate_mention_id].clusterID].ID,
                anaphor.clusterID,
            ))
        cluster_dict, cluster_id_list = mergeClustersByMentionIDs(
            candidate_mention_id,
            anaphor.ID,
            mention_dict,
            cluster_dict,
            cluster_id_list,
        )
    return mention_id_list, mention_dict, cluster_dict, cluster_id_list
コード例 #6
0
def _pronounAgrees(anaphor, mention):
    """Tell whether ``mention`` is an acceptable antecedent for ``anaphor``.

    Number and gender must be equal unless the anaphor's value is
    'unknown'.  An animate anaphor requires a named-entity type of 'person',
    'misc', '' or 'unknown'; any other anaphor requires a type other than
    'person'.  When ``mention`` is itself a pronoun, person must agree as
    well (or be 'unknown' on the anaphor).
    """
    number_ok = (anaphor.number == 'unknown'
                 or mention.number == anaphor.number)
    gender_ok = (anaphor.gender == 'unknown'
                 or mention.gender == anaphor.gender)
    if anaphor.animacy == 'animate':
        ner_ok = mention.NEtype in ('person', 'misc', '', 'unknown')
    else:
        ner_ok = mention.NEtype != 'person'
    if mention.type.lower() == 'pronoun':
        person_ok = (anaphor.person == 'unknown'
                     or mention.person == anaphor.person)
        return number_ok and gender_ok and person_ok and ner_ok
    return number_ok and gender_ok and ner_ok


def _findPronounAntecedent(anaphor, mention_ids_per_sentence, mention_dict,
                           cluster_dict, verbosity):
    """Search the anaphor's sentence and the two before it for an antecedent.

    Sentences are visited backwards, mentions within a sentence in list
    order up to the anaphor itself; every mention of each candidate's
    cluster is tested with ``_pronounAgrees``.  Returns the pair
    ``(candidate_mention_id, ment_id)`` of the first success, or ``None``.
    """
    oldest_excluded = max(0, anaphor.sentNum - 3)
    for sent_id in range(anaphor.sentNum, oldest_excluded, -1):
        if sent_id not in mention_ids_per_sentence:
            continue
        for candidate_id in mention_ids_per_sentence[sent_id]:
            if candidate_id == anaphor.ID:
                # Never look past the anaphor inside its own sentence.
                break
            candidate_cluster = cluster_dict[
                    mention_dict[candidate_id].clusterID]
            if verbosity == 'high':
                print('looking at cluster %d' % candidate_cluster.ID)
            for ment_id in candidate_cluster.mentionList:
                cluster_mention = mention_dict[ment_id]
                if verbosity == 'high':
                    print(cluster_mention.tokenList,
                            {
                                'type': cluster_mention.type,
                                'numb': cluster_mention.number,
                                'gend': cluster_mention.gender,
                                'anim': cluster_mention.animacy,
                                'netype': cluster_mention.NEtype})
                if _pronounAgrees(anaphor, cluster_mention):
                    return candidate_id, ment_id
    return None


def sievePronounResolution(mention_id_list, mention_dict, cluster_dict,
        cluster_id_list, verbosity):
    """Resolve pronouns to a nearby, agreeing antecedent.

    For each cluster in a snapshot of ``cluster_id_list`` whose first
    mention is a pronoun, the first agreeing mention found by
    ``_findPronounAntecedent`` causes the candidate's cluster to be merged
    with the pronoun's cluster through ``mergeClustersByMentionIDs``.

    ``verbosity == 'high'`` enables progress output on stdout.

    Returns ``(mention_id_list, mention_dict, cluster_dict,
    cluster_id_list)``.
    """
    if verbosity == 'high':
        print('Doing pronoun resolution...')
    mention_ids_per_sentence = get_mention_id_list_per_sentence(
        mention_id_list, mention_dict)
    for cluster_id in cluster_id_list[:]:
        # Only the first mention of a cluster is resolved.
        anaphor = mention_dict[cluster_dict[cluster_id].mentionList[0]]
        if anaphor.type.lower() != 'pronoun':  # Skip non-pronouns
            continue
        if verbosity == 'high':
            print('Checking this anaphor:',
                    anaphor.tokenList, {
                    'type': anaphor.type,
                    'numb': anaphor.number,
                    'gend': anaphor.gender,
                    'anim': anaphor.animacy,
                    'pers': anaphor.person})
        found = _findPronounAntecedent(
            anaphor, mention_ids_per_sentence, mention_dict, cluster_dict,
            verbosity)
        if found is None:
            continue
        candidate_mention_id, ment_id = found
        if verbosity == 'high':
            print('Linking clusters %d and %d' % (ment_id, anaphor.ID))
        # The merge is keyed on the candidate mention; the agreeing mention
        # belongs to the same cluster.
        cluster_dict, cluster_id_list = mergeClustersByMentionIDs(
            candidate_mention_id, anaphor.ID, mention_dict, cluster_dict,
            cluster_id_list)
    return mention_id_list, mention_dict, cluster_dict, cluster_id_list