Beispiel #1
0
    def context(self, mention_entity, mention, candidate_entity, candidate):
        """Build the sieve-specific logging context of a mention/candidate link.

        The words of both entities are collected (mentions typed as pronouns
        are skipped), extended stop words and the head word of the mention
        are removed from the mention-entity words, and everything is packed
        together with the forms of the mention, the candidate and their roots.

        :param mention_entity: The entity of the linked mention.
        :param mention: The mention.
        :param candidate_entity: The candidate entity.
        :param candidate: The candidate of the link.
        :return: A tuple whose first item is a ``%``-style format string and
            whose remaining seven items are its arguments. The string is NOT
            interpolated here (presumably the caller hands the tuple to a
            logger -- confirm against the call sites).
        """
        candidate_words = set([
            word
            for candidate_mention in candidate_entity
            if candidate_mention[constants.MENTION] != constants.PRONOUN_MENTION
            for word in self.get_all_words_forms(candidate_mention)])
        entity_words = set([
            word
            for n_mention in mention_entity
            if n_mention[constants.MENTION] != constants.PRONOUN_MENTION
            for word in self.get_all_words_forms(n_mention)
            if not stopwords.extended_stop_words(word)])
        # The head word form is computed once; the original evaluated the same
        # expression twice and threw the first result away.
        head_word_form = rules.get_head_word_form(self.graph_builder, mention).lower()
        # The head of the mention is not reported among the entity words.
        entity_words.discard(head_word_form)
        return (
            "%s -%s- %s %s| %s -%s- %s ",
            mention[FORM], self.graph_builder.get_root(mention)[FORM],
            entity_words, mention_entity, candidate[FORM],
            self.graph_builder.get_root(candidate)[FORM], candidate_words)
Beispiel #2
0
    def same_proper_head_last_word(self, entity, candidate_entity):
        """Look for a mention pair sharing a proper-noun head that closes both
        relaxed forms.

        A candidate mention and an entity mention are accepted when both heads
        are proper nouns, the lowercased head forms are equal, each relaxed
        form ends with its head form, and the proper nouns of the two relaxed
        forms do not BOTH contain words missing from the other side.

        :param entity: The entity cluster.
        :param candidate_entity: The entity where the candidate appears.
        :return: True as soon as one pair is accepted, False otherwise.
        """
        for cand in candidate_entity:
            # The candidate head has to be a proper noun.
            if not pos_tags.proper_noun(self.get_head_word(cand)[POS]):
                continue
            # The head word must be the last word of the relaxed form.
            cand_head_form = rules.get_head_word_form(
                self.graph_builder, cand).lower()
            if not self.relaxed_form(cand).endswith(cand_head_form):
                continue
            # Lowercased proper nouns found in the candidate relaxed form.
            cand_proper_nouns = {
                token[FORM].lower()
                for token in self.relaxed_form_word(cand)
                if pos_tags.proper_noun(token[POS])}

            for ment in entity:
                # The mention head has to be a proper noun as well.
                if not pos_tags.proper_noun(self.get_head_word(ment)[POS]):
                    continue
                # Both heads must be the same word.
                ment_head_form = rules.get_head_word_form(
                    self.graph_builder, ment).lower()
                if ment_head_form != cand_head_form:
                    continue
                # The head word must be the last word of the relaxed form.
                if not self.relaxed_form(ment).endswith(ment_head_form):
                    continue
                ment_proper_nouns = {
                    token[FORM].lower()
                    for token in self.relaxed_form_word(ment)
                    if pos_tags.proper_noun(token[POS])}
                # Reject the pair only when each side has proper nouns the
                # other one lacks; one-sided extras are tolerated.
                only_in_candidate = cand_proper_nouns - ment_proper_nouns
                only_in_mention = ment_proper_nouns - cand_proper_nouns
                if not (only_in_candidate and only_in_mention):
                    return True
        return False
Beispiel #3
0
    def word_inclusion(self, entity, mention, candidate_entity):
        """ Check if every relevant word of the mention entity is included in
        the words of the candidate entity.

        Mentions typed as pronouns contribute no words on either side. Stop
        words, extended stop words, pronouns and the head word of the mention
        are dropped from the mention-entity words before the comparison.

        :param entity: The entity cluster.
        :param mention: The current mention of the entity.
        :param candidate_entity: all the entities where the candidate appears.
        :return: True or false
        """
        # Every word form of the non-pronoun candidate mentions.
        candidate_words = set()
        for candidate_mention in candidate_entity:
            if candidate_mention[constants.MENTION] != constants.PRONOUN_MENTION:
                candidate_words.update(
                    self.get_all_words_forms(candidate_mention))

        # Content word forms of the non-pronoun mentions of the entity.
        entity_words = set()
        for entity_mention in entity:
            if entity_mention[constants.MENTION] != constants.PRONOUN_MENTION:
                for form in self.get_all_words_forms(entity_mention):
                    if (stopwords.stop_words(form)
                            or stopwords.extended_stop_words(form)
                            or pronouns.all(form)):
                        continue
                    entity_words.add(form)

        # The head word itself is not required to appear in the candidate.
        entity_words.discard(
            rules.get_head_word_form(self.graph_builder, mention).lower())

        # Any leftover word means the entity is not included in the candidate.
        return not (entity_words - candidate_words)
Beispiel #4
0
    def get_modifiers(self, element):
        """ Collect the cleaned forms of the modifier words of an element.

        :param element: A syntactic element
        :return: Set of strings, the cleaned forms of the words of the element
        whose POS is accepted by ``pos_tags.mod_forms``, without the cleaned
        head word form.
        """
        head_form = rules.clean_string(
            rules.get_head_word_form(self.graph_builder, element))
        modifiers = set()
        for token in self.get_words(element):
            if pos_tags.mod_forms(token[POS]):
                modifiers.add(rules.clean_string(token[FORM]))
        # The head is never reported as one of its own modifiers.
        modifiers.discard(head_form)
        return modifiers
Beispiel #5
0
    def head_match(self, mention, entity, candidate, candidate_entity):
        """Check if the mention and candidate heads are related, using a
        relaxed matching.

        When the mention has a NER tag accepted by ``ner_tags.mention_ner``
        and it equals the candidate NER tag, a proper-noun head of one side
        matches if it equals, or (being longer than two characters) is a
        prefix of, any word form of the other side. Otherwise the two
        lowercased head word forms must be equal.

        :param mention: The mention of reference.
        :param entity: The entity of the current mention (not used here).
        :param candidate: The candidate to evaluate.
        :param candidate_entity: The entity where the candidate appears (not
            used here).

        :return: True or False
        """
        mention_head = self.get_head_word(mention)
        mention_head_form = rules.get_head_word_form(
            self.graph_builder, mention).lower()
        candidate_head = self.get_head_word(candidate)
        candidate_head_form = rules.get_head_word_form(
            self.graph_builder, candidate).lower()

        mention_ner = mention.get(NER)
        if ner_tags.mention_ner(mention_ner) and mention_ner == candidate.get(NER):
            if pos_tags.proper_noun(mention_head[POS]):
                for word_form in self.get_all_words_forms(candidate):
                    if mention_head_form == word_form:
                        return True
                    # Very short heads are not trusted as prefixes.
                    if len(mention_head_form) > 2 and \
                            word_form.startswith(mention_head_form):
                        return True
            if pos_tags.proper_noun(candidate_head[POS]):
                for word_form in self.get_all_words_forms(mention):
                    if candidate_head_form == word_form:
                        return True
                    if len(candidate_head_form) > 2 and \
                            word_form.startswith(candidate_head_form):
                        return True

        # Both forms are already lowercased. The original lowercased only the
        # mention side here, so heads differing just in case never matched.
        return mention_head_form == candidate_head_form
Beispiel #6
0
    def _get_animacy(self, mention):
        """Determine the animacy of a mention.

        The checks run in this order and the first one that decides wins:
        pronouns, the NER tag of the mention, the POS of the head word and,
        when ``self.use_bergsma_number_lists`` is set, the animate/inanimate
        word lists.

        :param mention: The mention which animacy is wanted.
        :return: ANIMATE, INANIMATE or UNKNOWN constant
        """
        head_word = self.graph_builder.get_head_word(mention)
        word_form = rules.get_head_word_form(self.graph_builder, mention)
        word_ner = mention.get(NER)
        word_pos = head_word.get(POS)
        # Normalize parameters: lowercase form with every digit replaced by
        # "0", and the POS without "$".
        normalized_form = word_form.lower()
        # Raw string: "\d" in a plain literal is an invalid escape sequence.
        normalized_form = re.sub(r"\d", "0", normalized_form)
        normalized_pos = word_pos.replace("$", "")
        # Pronouns: once recognised as a pronoun the decision is final, even
        # if it is UNKNOWN.
        if pos_tags.pronoun(normalized_pos) or pronouns.all(normalized_form):
            if pronouns.inanimate(normalized_form):
                return INANIMATE
            elif pronouns.animate(normalized_form):
                return ANIMATE
            else:
                return UNKNOWN
        # NER
        if ner_tags.animate(word_ner):
            return ANIMATE
        if ner_tags.inanimate(word_ner):
            return INANIMATE

        # Use the mention POS to determine the feature
        if pos_tags.inanimate(word_pos):
            return INANIMATE
        if pos_tags.animate(word_pos):
            return ANIMATE
        # Bergsma Lists
        # NOTE(review): the lookup uses the raw head form, not the normalized
        # one, and is gated by the *number* lists flag -- confirm both are
        # intended.
        if self.use_bergsma_number_lists:
            if word_form in animate_words:
                return ANIMATE
            if word_form in inanimate_words:
                return INANIMATE
        return UNKNOWN
Beispiel #7
0
    def are_speaker_speech(self, speaker, speech):
        """ Are the two mentions in a speaker/speech relation?

        The SPEAKER annotation of ``speech`` is read. If it is a dict it is
        matched by ID against the words of ``speaker``; otherwise it is
        treated as a space-separated string and each of its words is compared,
        case-insensitively, with the head word form of ``speaker``.

        :param speaker: The mention that is a speaker
        :param speech: The mention that is inside a speech.
        :return: True or False. False when ``speech`` has no speaker.
        """
        speech_speaker = speech.get(SPEAKER, False)
        # TODO check this Only heads??
        if isinstance(speech_speaker, dict):
            speaker_words_ids = [
                word[ID] for word in self.graph_builder.get_words(speaker)
            ]
            return speech_speaker[ID] in speaker_words_ids
        if not speech_speaker:
            # No speaker annotation: the default (False) has no split(), so
            # the original raised AttributeError here instead of answering.
            return False
        speaker_head_word = rules.get_head_word_form(
            self.graph_builder, speaker).lower()
        return any(
            word.lower() == speaker_head_word
            for word in speech_speaker.split(" "))
Beispiel #8
0
    def _get_gender(self, mention):
        """ Run the gender selectors in order and return the first answer.

        Selector order: pronoun, statistical (only when enabled and the
        mention is not PLURAL), person, part-of-speech, names list (when
        enabled) and Bergsma lists (when enabled).

        :param mention: The mention to get gender
        :return: MALE, FEMALE, NEUTRAL or UNKNOWN constant.
        """
        head = self.graph_builder.get_head_word(mention)
        head_pos = head[POS]
        head_form = rules.get_head_word_form(
            self.graph_builder, mention).lower()

        # Lowercased text of the mention from its first word up to (and
        # including) the head word.
        forms_up_to_head = []
        for token in self.graph_builder.get_words(mention):
            forms_up_to_head.append(token[FORM])
            if token[ID] == head[ID]:
                break
        text_up_to_head = " ".join(forms_up_to_head).lower()

        # Missing annotations are reported and replaced by UNKNOWN.
        try:
            kind = mention[MENTION]
        except KeyError:
            self.logger.warning("warning: Gender without MENTION TYPE")
            kind = UNKNOWN
        try:
            number = mention[NUMBER]
        except KeyError:
            self.logger.warning("warning: Gender without MENTION NUMBER")
            number = UNKNOWN

        # 1. Pronouns
        result = self._pronoun_gender(text_up_to_head, kind)
        if result is not None:
            self.logger.debug("Gender: Pronoun")
            return result

        # 2. Statistical classification, skipped for plural mentions
        if self.use_probabilistic_gender_classification and number != PLURAL:
            statistical = self._get_statistic_gender(head_form)
            if statistical is not None:
                self.logger.debug("Gender: Statistical")
                return statistical

        # 3. Person information of the mention
        result = self._person_gender(mention)
        if result is not None:
            self.logger.debug("Gender: Person")
            return result

        # 4. Part-of-speech of the head word
        result = self._pos_gender(head_pos)
        if result:
            self.logger.debug("Gender: Part-of-speech")
            return result

        # 5. Names list
        if self.use_names_list:
            result = self._name_gender(text_up_to_head)
            if result:
                self.logger.debug("Gender: Name list -%s-", head_form)
                return result

        # 6. Bergsma lists
        if self.use_bergsma_gender_lists:
            result = self._list_gender(text_up_to_head)
            if result:
                self.logger.debug("Gender: List -%s-", head_form)
                return result

        # No selector was able to decide.
        return UNKNOWN
Beispiel #9
0
 def compare_heads(self, head_a, head_b):
     """Tell whether two elements have the same cleaned head word form.

     :param head_a: The first element.
     :param head_b: The second element.
     :return: True if both cleaned head word forms are equal.
     """
     cleaned_a = rules.clean_string(
         rules.get_head_word_form(self.graph_builder, head_a))
     cleaned_b = rules.clean_string(
         rules.get_head_word_form(self.graph_builder, head_b))
     return cleaned_a == cleaned_b
Beispiel #10
0
 def equal_speakers(self, mention, candidate):
     """Tell whether two speaker mentions share the same head word form.

     :param mention: The mention.
     :param candidate: The candidate.
     :return: True only if both are flagged IS_SPEAKER and their lowercased
         head word forms are equal.
     """
     # Both sides must be flagged as speakers before heads are compared.
     if not mention.get(IS_SPEAKER, False):
         return False
     if not candidate.get(IS_SPEAKER, False):
         return False
     mention_head = rules.get_head_word_form(self.graph_builder, mention)
     candidate_head = rules.get_head_word_form(self.graph_builder, candidate)
     return mention_head.lower() == candidate_head.lower()