def sieveDummy(mention_id_list, mention_dict, cluster_dict, cluster_id_list):
    """Toy sieve: merge every odd-positioned mention with its predecessor.

    Walks ``mention_id_list`` at positions 1, 3, 5, ... and merges the
    cluster of each such mention with the cluster of the mention one
    position earlier, using ``mergeClustersByMentionIDs``.

    Returns:
        The tuple ``(mention_id_list, mention_dict, cluster_dict,
        cluster_id_list)``, where the last two are whatever the final
        merge call returned (or the inputs when no merge took place).
    """
    for position in range(1, len(mention_id_list), 2):
        current_id = mention_id_list[position]
        previous_id = mention_id_list[position - 1]
        merged = mergeClustersByMentionIDs(
            current_id,
            previous_id,
            mention_dict,
            cluster_dict,
            cluster_id_list,
        )
        cluster_dict, cluster_id_list = merged
    return mention_id_list, mention_dict, cluster_dict, cluster_id_list
def sievePreciseConstructs(mention_id_list, mention_dict, cluster_dict,
                           cluster_id_list, verbosity):
    """Link clusters whose mentions stand in a "precise construct".

    For every cluster, its first mention is taken as the anaphor. Sentences
    are scanned from the anaphor's sentence back to sentence 1; inside a
    sentence, mentions are visited in list order and scanning of that
    sentence stops at the anaphor itself. For each visited mention, every
    mention of its cluster is tested against the anaphor with these rules:

    * appositive (``appositive``), only when the anaphor has more than two
      tokens;
    * predicate nominative (``pred_nom``);
    * relative pronoun (``get_rel_pron``), only when the anaphor's
      ``pron_type`` is ``'betr'``;
    * acronym (``acronyms``), only when both mentions have type ``'name'``;
    * demonym, looked up in ``Demo``, in either direction.

    The first mention that passes any rule is merged with the anaphor via
    ``mergeClustersByMentionIDs`` and the search for that anaphor ends.
    A role-appositive rule is not applied.

    Args:
        mention_id_list: mention IDs, passed to
            ``get_mention_id_list_per_sentence``.
        mention_dict: mapping from mention ID to mention object.
        cluster_dict: mapping from cluster ID to cluster object.
        cluster_id_list: cluster IDs to process (a copy is iterated).
        verbosity: progress messages are printed when this is ``'high'``.

    Returns:
        ``(mention_id_list, mention_dict, cluster_dict, cluster_id_list)``.
    """
    per_sentence = get_mention_id_list_per_sentence(
        mention_id_list, mention_dict)
    if verbosity == 'high':
        print('Applying Precise Constructs...')

    def covers_both_sides(antecedent_attribs, anaphor_attribs, sides):
        """True if an antecedent token is in sides[0] and an anaphor token
        is in sides[1], comparing on (id, root) pairs."""
        antecedent_hit = any(
            (tok['id'], tok['root']) in sides[0]
            for tok in antecedent_attribs)
        return antecedent_hit and any(
            (tok['id'], tok['root']) in sides[1]
            for tok in anaphor_attribs)

    def is_demonym_of(single, other):
        """True if ``single`` is one token whose Demo entry equals the first
        token of ``other``."""
        return (len(single) == 1
                and single[0] in Demo
                and Demo[single[0]] == other[0])

    def in_precise_construct(ment_id, anaphor_id):
        """Apply every rule to one (candidate mention, anaphor) pair."""
        antecedent = mention_dict[ment_id]
        target = mention_dict[anaphor_id]
        root = antecedent.tree.getroot()
        linked = False

        # Appositive: both mentions take part in the same apposition.
        appos = appositive(root)
        if appos and len(target.tokenList) > 2:
            linked = covers_both_sides(
                antecedent.tokenAttribs, target.tokenAttribs, appos)

        # Predicate nominative: copulative subject-object relation.
        pred = pred_nom(root)
        if pred and covers_both_sides(
                antecedent.tokenAttribs, target.tokenAttribs, pred):
            linked = True

        # Relative pronoun: the anaphor is a relative pronoun whose
        # antecedent token belongs to the candidate mention.
        if target.pron_type == 'betr':
            rp = get_rel_pron(root, target.tokenAttribs[0])
            if len(rp) > 0 and any(
                    rp[0][0]['id'] == tok['id']
                    for tok in antecedent.tokenAttribs):
                linked = True

        # Acronym: both are names and one is the acronym of the other.
        mention_acr = acronyms(antecedent.tokenList)
        anaphor_acr = acronyms(target.tokenList)
        if antecedent.type == 'name' and target.type == 'name':
            if (mention_acr in target.tokenList
                    or anaphor_acr in antecedent.tokenList):
                linked = True

        # Demonym: one mention is the demonym of the other.
        if is_demonym_of(antecedent.tokenList, target.tokenList):
            linked = True
        if is_demonym_of(target.tokenList, antecedent.tokenList):
            linked = True

        return linked

    def find_antecedent(anaphor):
        """Return the ID of the first mention that links, or None."""
        for sent_id in range(anaphor.sentNum, 0, -1):
            if sent_id not in per_sentence:
                continue
            for candidate_mention_id in per_sentence[sent_id]:
                # Don't look ahead of the anaphor.
                if candidate_mention_id == anaphor.ID:
                    break
                candidate_cluster = cluster_dict[
                    mention_dict[candidate_mention_id].clusterID]
                for ment_id in candidate_cluster.mentionList:
                    if in_precise_construct(ment_id, anaphor.ID):
                        return ment_id
        return None

    for cluster_id in cluster_id_list[:]:
        # Only the first mention of a cluster is resolved.
        anaphor = mention_dict[cluster_dict[cluster_id].mentionList[0]]
        antecedent_id = find_antecedent(anaphor)
        if antecedent_id is None:
            continue
        if verbosity == 'high':
            print('Linking clusters %d and %d' % (antecedent_id, anaphor.ID))
        cluster_dict, cluster_id_list = mergeClustersByMentionIDs(
            antecedent_id, anaphor.ID, mention_dict, cluster_dict,
            cluster_id_list)

    return mention_id_list, mention_dict, cluster_dict, cluster_id_list
def sieveStringMatch(mention_id_list, mention_dict, cluster_dict,
                     cluster_id_list, verbosity):
    """Link clusters by exact or relaxed string match.

    For every cluster, its first mention is the anaphor. Sentences are
    scanned from the anaphor's sentence back to sentence 1; inside a
    sentence, mentions are visited in list order and scanning of that
    sentence stops at the anaphor. Every mention in the cluster of a
    visited mention is compared with the anaphor:

    * exact match: the two ``tokenList`` values are equal;
    * relaxed match: the lower-cased tokens up to and including the first
      head word are equal, and that list is non-empty for the candidate.

    The first matching mention is merged with the anaphor via
    ``mergeClustersByMentionIDs``.

    Args:
        mention_id_list: mention IDs, passed to
            ``get_mention_id_list_per_sentence``.
        mention_dict: mapping from mention ID to mention object.
        cluster_dict: mapping from cluster ID to cluster object.
        cluster_id_list: cluster IDs to process (a copy is iterated).
        verbosity: progress messages are printed when this is ``'high'``.

    Returns:
        ``(mention_id_list, mention_dict, cluster_dict, cluster_id_list)``.
    """
    per_sentence = get_mention_id_list_per_sentence(
        mention_id_list, mention_dict)
    if verbosity == 'high':
        print('Doing exact + relaxed string matching...')

    def relaxed_tokens(mention):
        """Lower-cased tokens up to and including the first head word.

        Empty when the mention has no tokens or no head words.
        """
        collected = []
        for token in mention.tokenList:
            if len(mention.headWords) == 0:
                continue
            collected.append(token.lower())
            if token == mention.headWords[0]:
                break
        return collected

    def find_match(anaphor, anaphor_relaxed):
        """Return the ID of the first string-matching mention, or None."""
        for sent_id in range(anaphor.sentNum, 0, -1):
            if sent_id not in per_sentence:
                continue
            for candidate_mention_id in per_sentence[sent_id]:
                # Don't look ahead of the anaphor.
                if candidate_mention_id == anaphor.ID:
                    break
                candidate_cluster = cluster_dict[
                    mention_dict[candidate_mention_id].clusterID]
                for ment_id in candidate_cluster.mentionList:
                    member = mention_dict[ment_id]
                    exact = member.tokenList == anaphor.tokenList
                    member_relaxed = relaxed_tokens(member)
                    relaxed = (len(member_relaxed) > 0
                               and member_relaxed == anaphor_relaxed)
                    if exact or relaxed:
                        return ment_id
        return None

    for cluster_id in cluster_id_list[:]:
        # Only the first mention of a cluster is resolved.
        anaphor = mention_dict[cluster_dict[cluster_id].mentionList[0]]
        matched_id = find_match(anaphor, relaxed_tokens(anaphor))
        if matched_id is None:
            continue
        if verbosity == 'high':
            print('Linking clusters %d and %d' % (
                matched_id,
                anaphor.ID,
            ))
        cluster_dict, cluster_id_list = mergeClustersByMentionIDs(
            matched_id,
            anaphor.ID,
            mention_dict,
            cluster_dict,
            cluster_id_list,
        )

    return mention_id_list, mention_dict, cluster_dict, cluster_id_list
def sieveSpeakerIdentification(mention_id_list, mention_dict, cluster_dict,
                               cluster_id_list, verbosity):
    """Link first/second-person pronouns to a preceding subject mention.

    For every cluster, its first mention is the anaphor. Sentences are
    scanned from the anaphor's sentence back to sentence 1; inside a
    sentence, mentions are visited in list order and scanning of that
    sentence stops at the anaphor. A mention from the cluster of a visited
    mention links to the anaphor when all of the following hold:

    * the mention's ``type`` is ``'su'``, ``'mwu_su'`` or ``'np_su'``;
    * one of its tokens has the same ``id`` and ``root`` as a node returned
      by ``find_sub`` for the mention's tree (presumably the subjects of
      reporting verbs -- confirm against ``find_sub``);
    * one of the anaphor's tokens is a ``pos == 'pron'`` node of the
      anaphor's tree and its ``root`` is one of the listed Dutch
      first/second-person pronouns.

    The first linking mention is merged with the anaphor via
    ``mergeClustersByMentionIDs``.

    Args:
        mention_id_list: mention IDs, passed to
            ``get_mention_id_list_per_sentence``.
        mention_dict: mapping from mention ID to mention object.
        cluster_dict: mapping from cluster ID to cluster object.
        cluster_id_list: cluster IDs to process (a copy is iterated).
        verbosity: progress messages are printed when this is ``'high'``.

    Returns:
        ``(mention_id_list, mention_dict, cluster_dict, cluster_id_list)``.
    """
    per_sentence = get_mention_id_list_per_sentence(
        mention_id_list, mention_dict)
    if verbosity == 'high':
        print('Speaker identification...')

    subject_types = ('su', 'mwu_su', 'np_su')
    # "I" forms, "you" forms and "we" forms, in that order.
    speech_pronouns = (
        'ik', 'mij', 'me', 'mijn',
        'jij', 'je', 'jullie', 'U', 'u',
        'wij', 'ons', 'onze',
    )

    def anaphor_is_speech_pronoun(anaphor, anaphor_root):
        """True if a token of the anaphor is a pronoun node with a
        first/second-person root."""
        pronoun_nodes = [
            node.attrib for node in anaphor_root.iter('node')
            if 'pos' in node.attrib and node.attrib['pos'] == 'pron'
        ]
        return any(
            tok['id'] == pron['id'] and tok['root'] in speech_pronouns
            for tok in anaphor.tokenAttribs
            for pron in pronoun_nodes)

    def is_speaker_of(ment_id, anaphor):
        """Test a single candidate mention against the anaphor."""
        speaker = mention_dict[ment_id]
        root = speaker.tree.getroot()
        anaphor_root = mention_dict[anaphor.ID].tree.getroot()
        if speaker.type not in subject_types:
            return False
        sub_verb = find_sub(root)
        is_found_subject = any(
            tok['id'] == su.attrib['id'] and tok['root'] == su.attrib['root']
            for tok in speaker.tokenAttribs
            for su in sub_verb)
        return is_found_subject and anaphor_is_speech_pronoun(
            anaphor, anaphor_root)

    def find_speaker(anaphor):
        """Return the ID of the first linking mention, or None."""
        for sent_id in range(anaphor.sentNum, 0, -1):
            if sent_id not in per_sentence:
                continue
            for candidate_mention_id in per_sentence[sent_id]:
                # Don't look ahead of the anaphor.
                if candidate_mention_id == anaphor.ID:
                    break
                candidate_cluster = cluster_dict[
                    mention_dict[candidate_mention_id].clusterID]
                for ment_id in candidate_cluster.mentionList:
                    if is_speaker_of(ment_id, anaphor):
                        return ment_id
        return None

    for cluster_id in cluster_id_list[:]:
        # Only the first mention of a cluster is resolved.
        anaphor = mention_dict[cluster_dict[cluster_id].mentionList[0]]
        speaker_id = find_speaker(anaphor)
        if speaker_id is None:
            continue
        if verbosity == 'high':
            print('Linking clusters %d and %d' % (
                speaker_id, anaphor.ID))
        cluster_dict, cluster_id_list = mergeClustersByMentionIDs(
            speaker_id, anaphor.ID, mention_dict, cluster_dict,
            cluster_id_list)

    return mention_id_list, mention_dict, cluster_dict, cluster_id_list
def _head_match_should_link(strictness, entity_head_match, relaxed_head_match,
                            word_inclusion, comp_mods_only, both_ne):
    """Decide whether the head-match constraints allow a link.

    Strictness 3 needs entity head match, word inclusion and compatible
    modifiers; 2 drops the modifier constraint; 1 drops word inclusion
    instead; 0 needs relaxed head match, word inclusion and both mentions
    being names. Any other strictness never links.
    """
    if strictness == 3:
        return entity_head_match and word_inclusion and comp_mods_only
    if strictness == 2:
        return entity_head_match and word_inclusion
    if strictness == 1:
        return entity_head_match and comp_mods_only
    if strictness == 0:
        return relaxed_head_match and word_inclusion and both_ne
    return False


def sieveHeadMatch(
        mention_id_list,
        mention_dict,
        cluster_dict,
        cluster_id_list,
        strictness,
        verbosity,
):
    """Link clusters whose head words match, at a given strictness.

    For every cluster, its first mention is the anaphor; anaphors without
    head words are skipped. Sentences are scanned from the anaphor's
    sentence back to sentence 1; inside a sentence, mentions are visited in
    list order and scanning of that sentence stops at the anaphor.
    Candidates without head words are skipped. For each remaining candidate
    these constraints are evaluated:

    * entity head match: the anaphor's head words are a subset of the head
      words of some mention in the candidate's cluster;
    * relaxed head match: the anaphor's head words are a subset of the
      non-stop-word lemmas of the candidate's cluster;
    * word inclusion: the anaphor's non-stop-word lemmas are a subset of
      those of the candidate's cluster;
    * compatible modifiers: the lemmas of the anaphor's ``rel == "mod"``
      tokens are a subset of those of the candidate mention;
    * both mentions have type ``name`` (case-insensitive).

    ``strictness`` selects which constraints must hold (see
    ``_head_match_should_link``). The first candidate that qualifies is
    merged with the anaphor via ``mergeClustersByMentionIDs``.

    Every constraint is recomputed from scratch for each candidate, so a
    constraint satisfied by one candidate can never carry over into the
    decision for a later candidate of the same anaphor.

    Args:
        mention_id_list: mention IDs, passed to
            ``get_mention_id_list_per_sentence``.
        mention_dict: mapping from mention ID to mention object.
        cluster_dict: mapping from cluster ID to cluster object.
        cluster_id_list: cluster IDs to process (a copy is iterated).
        strictness: integer 0-3 selecting the constraint combination.
        verbosity: progress messages are printed when this is ``'high'``.

    Returns:
        ``(mention_id_list, mention_dict, cluster_dict, cluster_id_list)``.
    """
    mention_ids_per_sentence = get_mention_id_list_per_sentence(
        mention_id_list, mention_dict)
    if verbosity == 'high':
        print('Doing head-matching with strictness %d...' % strictness)

    for cluster_id in cluster_id_list[:]:
        cluster = cluster_dict[cluster_id]
        # Only the first mention of a cluster is resolved.
        anaphor = mention_dict[cluster.mentionList[0]]
        if not anaphor.headWords:
            # Without head words, head-matching cannot work.
            continue

        # Anaphor-side sets are the same for every candidate: build once.
        anaphor_heads = set(anaphor.headWords)
        anaphor_words = {
            attrib["lemma"] for attrib in anaphor.tokenAttribs
            if attrib["lemma"] not in stopWords
        }
        anaphor_mods = {
            attrib["lemma"] for attrib in anaphor.tokenAttribs
            if attrib["rel"] == "mod"
        }
        anaphor_is_name = anaphor.type.lower() == 'name'

        made_link = False
        # Cycle through sentences backwards.
        for sent_id in range(anaphor.sentNum, 0, -1):
            if made_link:
                break
            if sent_id not in mention_ids_per_sentence:
                continue
            # Cycle through mentions within a sentence forwards.
            for candidate_mention_id in mention_ids_per_sentence[sent_id]:
                if candidate_mention_id == anaphor.ID:
                    # Don't look ahead of the anaphor.
                    break
                candidate_mention = mention_dict[candidate_mention_id]
                if not candidate_mention.headWords:
                    continue

                both_ne = (candidate_mention.type.lower() == 'name'
                           and anaphor_is_name)
                candidate_mods = {
                    attrib["lemma"]
                    for attrib in candidate_mention.tokenAttribs
                    if attrib["rel"] == "mod"
                }

                # Gather the cluster's words and look for a head match in
                # any of its mentions.
                candidate_cluster = cluster_dict[candidate_mention.clusterID]
                candidate_words = set()
                entity_head_match = False
                for ment_id in candidate_cluster.mentionList:
                    member = mention_dict[ment_id]
                    candidate_words.update(
                        attrib["lemma"] for attrib in member.tokenAttribs
                        if attrib["lemma"] not in stopWords)
                    if anaphor_heads.issubset(member.headWords):
                        entity_head_match = True

                relaxed_head_match = anaphor_heads <= candidate_words
                word_inclusion = anaphor_words <= candidate_words
                comp_mods_only = anaphor_mods <= candidate_mods

                if not _head_match_should_link(
                        strictness, entity_head_match, relaxed_head_match,
                        word_inclusion, comp_mods_only, both_ne):
                    continue

                if verbosity == 'high':
                    # Report cluster IDs at every strictness level, read
                    # before the merge changes them.
                    print('Linking clusters %d and %d' % (
                        candidate_cluster.ID,
                        anaphor.clusterID,
                    ))
                cluster_dict, cluster_id_list = mergeClustersByMentionIDs(
                    candidate_mention_id,
                    anaphor.ID,
                    mention_dict,
                    cluster_dict,
                    cluster_id_list,
                )
                made_link = True
                break

    return mention_id_list, mention_dict, cluster_dict, cluster_id_list
def sievePronounResolution(mention_id_list, mention_dict, cluster_dict,
                           cluster_id_list, verbosity):
    """Link pronoun anaphors to an agreeing mention in a nearby sentence.

    For every cluster whose first mention has type ``pronoun``
    (case-insensitive), that mention is the anaphor. The anaphor's sentence
    and at most the two sentences before it are scanned, nearest first;
    inside a sentence, mentions are visited in list order and scanning of
    that sentence stops at the anaphor. Each mention in the cluster of a
    visited mention is checked for agreement:

    * number and gender are equal, or the anaphor's value is ``'unknown'``;
    * NE type: for an animate anaphor it must be ``'person'``, ``'misc'``,
      ``''`` or ``'unknown'``; otherwise it must not be ``'person'``;
    * person (only when the candidate is itself a pronoun): equal, or the
      anaphor's value is ``'unknown'``.

    On the first agreeing mention, the visited mention is merged with the
    anaphor via ``mergeClustersByMentionIDs``.

    Args:
        mention_id_list: mention IDs, passed to
            ``get_mention_id_list_per_sentence``.
        mention_dict: mapping from mention ID to mention object.
        cluster_dict: mapping from cluster ID to cluster object.
        cluster_id_list: cluster IDs to process (a copy is iterated).
        verbosity: progress messages are printed when this is ``'high'``.

    Returns:
        ``(mention_id_list, mention_dict, cluster_dict, cluster_id_list)``.
    """
    verbose = verbosity == 'high'
    if verbose:
        print('Doing pronoun resolution...')
    per_sentence = get_mention_id_list_per_sentence(
        mention_id_list, mention_dict)

    def agrees(candidate, anaphor):
        """Check number, gender, NE type and (for pronouns) person."""
        number_ok = (candidate.number == anaphor.number
                     or anaphor.number == 'unknown')
        gender_ok = (candidate.gender == anaphor.gender
                     or anaphor.gender == 'unknown')
        if anaphor.animacy == 'animate':
            ner_ok = candidate.NEtype in ('person', 'misc', '', 'unknown')
        else:
            ner_ok = candidate.NEtype != 'person'
        if candidate.type.lower() != 'pronoun':
            return number_ok and gender_ok and ner_ok
        person_ok = (candidate.person == anaphor.person
                     or anaphor.person == 'unknown')
        return number_ok and gender_ok and person_ok and ner_ok

    def find_antecedent(anaphor):
        """Return (agreeing mention ID, visited mention ID) or None."""
        earliest = max(0, anaphor.sentNum - 3)
        for sent_id in range(anaphor.sentNum, earliest, -1):
            if sent_id not in per_sentence:
                continue
            for candidate_mention_id in per_sentence[sent_id]:
                # Don't look ahead of the anaphor.
                if candidate_mention_id == anaphor.ID:
                    break
                candidate_cluster = cluster_dict[
                    mention_dict[candidate_mention_id].clusterID]
                if verbose:
                    print('looking at cluster %d' % candidate_cluster.ID)
                for ment_id in candidate_cluster.mentionList:
                    cluster_mention = mention_dict[ment_id]
                    if verbose:
                        print(cluster_mention.tokenList, {
                            'type': cluster_mention.type,
                            'numb': cluster_mention.number,
                            'gend': cluster_mention.gender,
                            'anim': cluster_mention.animacy,
                            'netype': cluster_mention.NEtype})
                    if agrees(cluster_mention, anaphor):
                        return ment_id, candidate_mention_id
        return None

    for cluster_id in cluster_id_list[:]:
        # Only the first mention of a cluster is resolved.
        anaphor = mention_dict[cluster_dict[cluster_id].mentionList[0]]
        if anaphor.type.lower() != 'pronoun':
            continue
        if verbose:
            print('Checking this anaphor:', anaphor.tokenList, {
                'type': anaphor.type,
                'numb': anaphor.number,
                'gend': anaphor.gender,
                'anim': anaphor.animacy,
                'pers': anaphor.person})
        found = find_antecedent(anaphor)
        if found is None:
            continue
        agreeing_id, visited_id = found
        if verbose:
            print('Linking clusters %d and %d' % (
                agreeing_id, anaphor.ID))
        cluster_dict, cluster_id_list = mergeClustersByMentionIDs(
            visited_id, anaphor.ID, mention_dict, cluster_dict,
            cluster_id_list)

    return mention_id_list, mention_dict, cluster_dict, cluster_id_list