def context(self, mention_entity, mention, candidate_entity, candidate):
    """ Build the sieve specific, human readable logging info for a link.

    The result is NOT an already formatted string: it is a tuple whose first
    item is a ``%s`` style format and whose remaining seven items are the
    arguments for it (presumably expanded into a logger call by the caller).

    :param mention_entity: The entity of the linked mention.
    :param mention: The mention.
    :param candidate_entity: The candidate entity.
    :param candidate: The candidate of the link.

    :return: A tuple ``(format, mention form, mention root form, entity
        words, mention entity, candidate form, candidate root form,
        candidate words)``.
    """
    # Every word form of the non pronoun mentions of the candidate entity.
    candidate_words = {
        word
        for candidate_mention in candidate_entity
        if candidate_mention[constants.MENTION] != constants.PRONOUN_MENTION
        for word in self.get_all_words_forms(candidate_mention)}
    # Same for the mention entity, but dropping the extended stop words.
    entity_words = {
        word
        for n_mention in mention_entity
        if n_mention[constants.MENTION] != constants.PRONOUN_MENTION
        for word in self.get_all_words_forms(n_mention)
        if not stopwords.extended_stop_words(word)}
    # The head word of the mention is left out of the reported words.
    # (It used to be computed twice, the first result being thrown away.)
    head_word_form = rules.get_head_word_form(
        self.graph_builder, mention).lower()
    entity_words.discard(head_word_form)
    return (
        "%s -%s- %s %s| %s -%s- %s ",
        mention[FORM],
        self.graph_builder.get_root(mention)[FORM],
        entity_words,
        mention_entity,
        candidate[FORM],
        self.graph_builder.get_root(candidate)[FORM],
        candidate_words)
def same_proper_head_last_word(self, entity, candidate_entity):
    """ Check if a mention of each entity share a proper noun head that is
    the last word of their relaxed forms, without both having extra proper
    nouns.

    A pair (candidate mention, entity mention) is accepted when:

    * both head words are tagged as proper nouns,
    * both lowercase head word forms are equal,
    * each relaxed form ends with its head word form, and
    * the proper nouns of one relaxed form are all contained in the proper
      nouns of the other one.

    :param entity: The entity cluster.
    :param candidate_entity: The entity where the candidate appears.

    :return: True if any pair of mentions is accepted, False otherwise.
    """
    for cand in candidate_entity:
        # The candidate head must be a proper noun.
        cand_head = self.get_head_word(cand)
        if not pos_tags.proper_noun(cand_head[POS]):
            continue
        # The head word must close the relaxed form of the candidate.
        cand_head_form = rules.get_head_word_form(
            self.graph_builder, cand).lower()
        if not self.relaxed_form(cand).endswith(cand_head_form):
            continue
        # Proper nouns found in the relaxed form of the candidate.
        cand_proper_nouns = {
            token[FORM].lower()
            for token in self.relaxed_form_word(cand)
            if pos_tags.proper_noun(token[POS])}

        for ment in entity:
            # The mention head must be a proper noun too.
            ment_head = self.get_head_word(ment)
            if not pos_tags.proper_noun(ment_head[POS]):
                continue
            # Both heads must be the same word.
            ment_head_form = rules.get_head_word_form(
                self.graph_builder, ment).lower()
            if ment_head_form != cand_head_form:
                continue
            # The head word must close the relaxed form of the mention.
            if not self.relaxed_form(ment).endswith(ment_head_form):
                continue
            ment_proper_nouns = {
                token[FORM].lower()
                for token in self.relaxed_form_word(ment)
                if pos_tags.proper_noun(token[POS])}
            # Only rejected when BOTH sides have proper nouns the other
            # lacks, i.e. accepted when one set is included in the other.
            if cand_proper_nouns <= ment_proper_nouns or \
                    ment_proper_nouns <= cand_proper_nouns:
                return True
    return False
def word_inclusion(self, entity, mention, candidate_entity):
    """ Check if every relevant word of the mention entity is included in
    the words of the candidate entity.

    Pronoun mentions are skipped on both sides. On the mention entity side
    stop words, extended stop words, pronouns and the head word of the
    current mention are also ignored.

    :param entity: The entity cluster.
    :param mention: The current mention of the entity.
    :param candidate_entity: all the entities where the candidate appears.

    :return: True or false
    """
    # Words of the candidate side (no filtering besides pronoun mentions).
    candidate_words = set()
    for candidate_mention in candidate_entity:
        if candidate_mention[constants.MENTION] != constants.PRONOUN_MENTION:
            candidate_words.update(
                self.get_all_words_forms(candidate_mention))

    # Content words of the mention side.
    entity_words = set()
    for entity_mention in entity:
        if entity_mention[constants.MENTION] != constants.PRONOUN_MENTION:
            for word in self.get_all_words_forms(entity_mention):
                if stopwords.stop_words(word) or \
                        stopwords.extended_stop_words(word) or \
                        pronouns.all(word):
                    continue
                entity_words.add(word)

    # The head of the current mention does not take part in the check.
    head_word_form = rules.get_head_word_form(
        self.graph_builder, mention).lower()
    entity_words.discard(head_word_form)

    return entity_words.issubset(candidate_words)
def get_modifiers(self, element):
    """ Get the forms of the modifiers of a syntactic element.

    :param element: A syntactic element

    :return: Set of strings, the cleaned forms of the words of the element
        whose POS is a modifier one, excluding the cleaned head word form.
    """
    head_form = rules.clean_string(
        rules.get_head_word_form(self.graph_builder, element))
    modifiers = set()
    for token in self.get_words(element):
        if pos_tags.mod_forms(token[POS]):
            modifiers.add(rules.clean_string(token[FORM]))
    # The head itself is never reported as one of its own modifiers.
    modifiers.discard(head_form)
    return modifiers
def head_match(self, mention, entity, candidate, candidate_entity):
    """ Check if the mention and candidate heads are related, using a more
    relaxed algorithm than plain equality.

    When the mention has a mention NER tag and the candidate carries the same
    NER tag, a proper noun head of one of them is accepted if it equals, or
    (being longer than two characters) is a prefix of, any word form of the
    other one. Otherwise both lowercase head word forms must be equal.

    :param mention: The mention of reference.
    :param entity: The entity of the current mention (unused here).
    :param candidate: The candidate to evaluate.
    :param candidate_entity: The entity where the candidate appears
        (unused here).

    :return: True or False
    """
    def head_in_words(head_form, word_forms):
        # A head of one or two characters must match a word exactly; longer
        # ones may also be the beginning of a word.
        allow_prefix = len(head_form) > 2
        return any(
            word_form == head_form or
            (allow_prefix and word_form.startswith(head_form))
            for word_form in word_forms)

    mention_head = self.get_head_word(mention)
    mention_head_form = rules.get_head_word_form(
        self.graph_builder, mention).lower()
    candidate_head = self.get_head_word(candidate)
    candidate_head_form = rules.get_head_word_form(
        self.graph_builder, candidate).lower()

    mention_ner = mention.get(NER)
    if ner_tags.mention_ner(mention_ner) and \
            mention_ner == candidate.get(NER):
        if pos_tags.proper_noun(mention_head[POS]) and head_in_words(
                mention_head_form, self.get_all_words_forms(candidate)):
            return True
        if pos_tags.proper_noun(candidate_head[POS]) and head_in_words(
                candidate_head_form, self.get_all_words_forms(mention)):
            return True
    # Fallback: plain head equality. Both forms are compared in lowercase;
    # previously only the mention side was lowercased, so heads differing
    # just in case never matched.
    return mention_head_form == candidate_head_form
def _get_animacy(self, mention):
    """ Determine the animacy of the mention.

    The selectors are tried in this order: pronoun, NER tag, POS tag and,
    if enabled, the Bergsma word lists. The first one that decides wins.

    :param mention: The mention which animacy is wanted.

    :return: ANIMATE, INANIMATE or UNKNOWN constant
    """
    head_word = self.graph_builder.get_head_word(mention)
    word_form = rules.get_head_word_form(self.graph_builder, mention)
    word_ner = mention.get(NER)
    word_pos = head_word.get(POS)

    # Normalize parameters: lowercase form with every digit replaced by "0"
    # and POS tag without "$". Raw string avoids the invalid "\d" escape.
    normalized_form = re.sub(r"\d", "0", word_form.lower())
    normalized_pos = word_pos.replace("$", "")

    # Pronouns: once recognized as a pronoun no other selector is tried.
    if pos_tags.pronoun(normalized_pos) or pronouns.all(normalized_form):
        if pronouns.inanimate(normalized_form):
            return INANIMATE
        if pronouns.animate(normalized_form):
            return ANIMATE
        return UNKNOWN

    # NER
    if ner_tags.animate(word_ner):
        return ANIMATE
    if ner_tags.inanimate(word_ner):
        return INANIMATE

    # Use the mention POS to determine the feature
    if pos_tags.inanimate(word_pos):
        return INANIMATE
    if pos_tags.animate(word_pos):
        return ANIMATE

    # Bergsma Lists
    # NOTE(review): the lookup uses the raw head form, not the normalized
    # one - confirm this matches the casing of the lists.
    if self.use_bergsma_number_lists:
        if word_form in animate_words:
            return ANIMATE
        if word_form in inanimate_words:
            return INANIMATE
    return UNKNOWN
def are_speaker_speech(self, speaker, speech):
    """ Are the two mentions in a speaker - speech relation?

    The speaker annotation of the speech may be a dict (a word, compared by
    ID against the words of the speaker mention) or a plain string (whose
    space separated words are compared against the lowercase head word form
    of the speaker mention).

    :param speaker: The mention that is a speaker
    :param speech: The mention that is inside a speech.

    :return: True or False
    """
    speech_speaker = speech.get(SPEAKER, False)
    # Without speaker annotation there is no relation. (The default value
    # used to reach the string branch and fail on ``False.split``.)
    if not speech_speaker:
        return False
    # TODO check this Only heads??
    if isinstance(speech_speaker, dict):
        speaker_words_ids = [
            word[ID] for word in self.graph_builder.get_words(speaker)]
        return speech_speaker[ID] in speaker_words_ids
    speaker_head_word = rules.get_head_word_form(
        self.graph_builder, speaker).lower()
    return any(
        word.lower() == speaker_head_word
        for word in speech_speaker.split(" "))
def _get_gender(self, mention):
    """ Pass through a list of selectors to get the mention gender.

    Selectors are tried in this order and the first one that gives an answer
    wins: pronoun, statistical (only if enabled and the mention is not
    plural), person, part-of-speech, names list (if enabled) and Bergsma
    gender lists (if enabled).

    :param mention: The mention to get gender

    :return: The gender given by the first successful selector, or the
        UNKNOWN constant when none of them decides.
    """
    head_word = self.graph_builder.get_head_word(mention)
    headword_pos = head_word[POS]
    headstring = rules.get_head_word_form(self.graph_builder, mention).lower()

    # Lowercase text of the mention, cut right after its head word.
    forms_until_head = []
    for token in self.graph_builder.get_words(mention):
        forms_until_head.append(token[FORM])
        if token[ID] == head_word[ID]:
            break
    mention_string = " ".join(forms_until_head).lower()

    # Missing attributes are reported and treated as UNKNOWN.
    try:
        mention_type = mention[MENTION]
    except KeyError:
        self.logger.warning("warning: Gender without MENTION TYPE")
        mention_type = UNKNOWN
    try:
        mention_number = mention[NUMBER]
    except KeyError:
        self.logger.warning("warning: Gender without MENTION NUMBER")
        mention_number = UNKNOWN

    pronoun_gender = self._pronoun_gender(mention_string, mention_type)
    if pronoun_gender is not None:
        self.logger.debug("Gender: Pronoun")
        return pronoun_gender

    if self.use_probabilistic_gender_classification and \
            mention_number != PLURAL:
        statistic_gender = self._get_statistic_gender(headstring)
        if statistic_gender is not None:
            self.logger.debug("Gender: Statistical")
            return statistic_gender

    person_gender = self._person_gender(mention)
    if person_gender is not None:
        self.logger.debug("Gender: Person")
        return person_gender

    # From here on any falsy answer counts as "no decision".
    pos_gender = self._pos_gender(headword_pos)
    if pos_gender:
        self.logger.debug("Gender: Part-of-speech")
        return pos_gender

    if self.use_names_list:
        name_gender = self._name_gender(mention_string)
        if name_gender:
            self.logger.debug("Gender: Name list -%s-", headstring)
            return name_gender

    if self.use_bergsma_gender_lists:
        list_gender = self._list_gender(mention_string)
        if list_gender:
            self.logger.debug("Gender: List -%s-", headstring)
            return list_gender

    return UNKNOWN
def compare_heads(self, head_a, head_b):
    """ Check if two elements have the same cleaned head word form.

    :param head_a: The first element to compare.
    :param head_b: The second element to compare.

    :return: True if both cleaned head word forms are equal.
    """
    clean_head_a = rules.clean_string(
        rules.get_head_word_form(self.graph_builder, head_a))
    clean_head_b = rules.clean_string(
        rules.get_head_word_form(self.graph_builder, head_b))
    return clean_head_a == clean_head_b
def equal_speakers(self, mention, candidate):
    """ Check if mention and candidate are both speakers with the same head.

    :param mention: The mention of reference.
    :param candidate: The candidate to evaluate.

    :return: True if both are marked as speaker and their lowercase head
        word forms are equal, False otherwise.
    """
    # Both of them must be flagged as speakers to be comparable.
    if not mention.get(IS_SPEAKER, False) or \
            not candidate.get(IS_SPEAKER, False):
        return False
    mention_head = rules.get_head_word_form(
        self.graph_builder, mention).lower()
    candidate_head = rules.get_head_word_form(
        self.graph_builder, candidate).lower()
    return mention_head == candidate_head