    def extract_sub_relations(self, mention_x: MentionDataLight,
                              mention_y: MentionDataLight,
                              relation: RelationType) -> RelationType:
        """
        Check whether the input mentions have the given relation between them

        Args:
            mention_x: MentionDataLight
            mention_y: MentionDataLight
            relation: RelationType

        Returns:
            RelationType: the given relation if it holds between the mentions,
                or RelationType.NO_RELATION_FOUND otherwise
        """
        if relation is not RelationType.WORD_EMBEDDING_MATCH:
            return RelationType.NO_RELATION_FOUND

        mention_x_str = mention_x.tokens_str
        mention_y_str = mention_y.tokens_str
        if StringUtils.is_pronoun(
                mention_x_str.lower()) or StringUtils.is_pronoun(
                    mention_y_str.lower()):
            if not self.contextual:
                return RelationType.NO_RELATION_FOUND

            if mention_x.mention_context is None or mention_y.mention_context is None:
                return RelationType.NO_RELATION_FOUND

        if self.is_word_embed_match(mention_x, mention_y):
            return RelationType.WORD_EMBEDDING_MATCH

        return RelationType.NO_RELATION_FOUND
Example No. 2
def extract_vocab(mentions: List[MentionData],
                  filter_stop_words: bool) -> List[str]:
    """
    Extract the head, head lemma and mention string from all mentions to create a vocabulary list
    Args:
        mentions: list of mentions to extract the vocabulary from
        filter_stop_words: if True, exclude stop words from the vocabulary

    Returns:
        List[str]: the extracted vocabulary
    """
    vocab = set()
    for mention in mentions:
        head = mention.mention_head
        head_lemma = mention.mention_head_lemma
        tokens_str = mention.tokens_str
        if not filter_stop_words:
            vocab.add(head)
            vocab.add(head_lemma)
            vocab.add(tokens_str)
        else:
            if not StringUtils.is_stop(head):
                vocab.add(head)
            if not StringUtils.is_stop(head_lemma):
                vocab.add(head_lemma)
            if not StringUtils.is_stop(tokens_str):
                vocab.add(tokens_str)
    vocab_set = list(vocab)
    return vocab_set
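A minimal, self-contained sketch of how the vocabulary extraction above behaves; the SimpleNamespace mentions and the tiny stop-word set are stand-ins for MentionData and StringUtils.is_stop (assumptions for illustration, not the library's real types).

from types import SimpleNamespace

STOP_WORDS = {"the", "a", "of", "said"}  # hypothetical stand-in for StringUtils.is_stop

def extract_vocab_sketch(mentions, filter_stop_words):
    # Collect head, head lemma and surface string, optionally dropping stop words
    vocab = set()
    for mention in mentions:
        for value in (mention.mention_head, mention.mention_head_lemma, mention.tokens_str):
            if not filter_stop_words or value.lower() not in STOP_WORDS:
                vocab.add(value)
    return list(vocab)

mentions = [
    SimpleNamespace(mention_head="attack", mention_head_lemma="attack", tokens_str="the attack"),
    SimpleNamespace(mention_head="said", mention_head_lemma="say", tokens_str="said"),
]
print(extract_vocab_sketch(mentions, filter_stop_words=True))  # e.g. ['attack', 'the attack', 'say']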
Example No. 3
    def extract_sub_relations(self, mention_x: MentionDataLight,
                              mention_y: MentionDataLight,
                              relation: RelationType) -> RelationType:
        """
        Check whether the input mentions have the given relation between them

        Args:
            mention_x: MentionDataLight
            mention_y: MentionDataLight
            relation: RelationType

        Returns:
            RelationType: the given relation if it holds between the mentions,
                or RelationType.NO_RELATION_FOUND otherwise
        """
        if relation is not RelationType.VERBOCEAN_MATCH:
            return RelationType.NO_RELATION_FOUND

        mention_x_str = mention_x.tokens_str
        mention_y_str = mention_y.tokens_str
        if StringUtils.is_pronoun(
                mention_x_str.lower()) or StringUtils.is_pronoun(
                    mention_y_str.lower()):
            return RelationType.NO_RELATION_FOUND

        if self.is_verbocean_relation(mention_x, mention_y):
            return RelationType.VERBOCEAN_MATCH

        return RelationType.NO_RELATION_FOUND
    def is_both_data_or_time(mention1: MentionDataLight,
                             mention2: MentionDataLight) -> bool:
        """
        Check whether both phrases refer to a date or a time

        Returns:
            bool
        """
        mention1_ner = mention1.mention_ner
        mention2_ner = mention2.mention_ner

        if mention1_ner is None:
            _, _, _, mention1_ner = StringUtils.find_head_lemma_pos_ner(
                mention1.tokens_str)
        if mention2_ner is None:
            _, _, _, mention2_ner = StringUtils.find_head_lemma_pos_ner(
                mention2.tokens_str)

        is1_time_or_date = 'DATE' in mention1_ner or 'TIME' in mention1_ner
        is2_time_or_date = 'DATE' in mention2_ner or 'TIME' in mention2_ner

        result = False
        if is1_time_or_date and is2_time_or_date:
            result = True

        return result
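A tiny, self-contained illustration of the date/time check above, with plain NER label strings standing in for the mention objects (an illustrative simplification, not the library's API).

def both_date_or_time_sketch(ner1: str, ner2: str) -> bool:
    # True only when both mentions carry a DATE or TIME entity label
    is1_time_or_date = 'DATE' in ner1 or 'TIME' in ner1
    is2_time_or_date = 'DATE' in ner2 or 'TIME' in ner2
    return is1_time_or_date and is2_time_or_date

assert both_date_or_time_sketch('DATE', 'TIME')
assert not both_date_or_time_sketch('DATE', 'PERSON')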
Example No. 5
    def extract_sub_relations(self, mention_x: MentionDataLight, mention_y: MentionDataLight,
                              relation: RelationType) -> RelationType:
        """
        Check whether the input mentions have the given relation between them

        Args:
            mention_x: MentionDataLight
            mention_y: MentionDataLight
            relation: RelationType

        Returns:
            RelationType: the given relation if it holds between the mentions,
                or RelationType.NO_RELATION_FOUND otherwise
        """
        mention_x_str = mention_x.tokens_str
        mention_y_str = mention_y.tokens_str
        if StringUtils.is_pronoun(mention_x_str.lower()) or StringUtils.is_pronoun(
                mention_y_str.lower()):
            return RelationType.NO_RELATION_FOUND

        page_x = self.wordnet_impl.get_pages(mention_x)
        page_y = self.wordnet_impl.get_pages(mention_y)

        if page_x and page_y:
            if relation == RelationType.WORDNET_DERIVATIONALLY:
                return self.extract_derivation(page_x, page_y)
            if relation == RelationType.WORDNET_PARTIAL_SYNSET_MATCH:
                return self.extract_partial_synset_match(page_x, page_y)
            if relation == RelationType.WORDNET_SAME_SYNSET:
                return self.extract_same_synset_entity(page_x, page_y)

        return RelationType.NO_RELATION_FOUND
    def extract_sub_relations(self, mention_x: MentionDataLight,
                              mention_y: MentionDataLight,
                              relation: RelationType) -> RelationType:
        """
        Check whether the input mentions have the given relation between them

        Args:
            mention_x: MentionDataLight
            mention_y: MentionDataLight
            relation: RelationType

        Returns:
            RelationType: the given relation if it holds between the mentions,
                or RelationType.NO_RELATION_FOUND otherwise
        """
        mention_x_str = mention_x.tokens_str
        mention_y_str = mention_y.tokens_str

        if StringUtils.is_pronoun(
                mention_x_str.lower()) or StringUtils.is_pronoun(
                    mention_y_str.lower()):
            return RelationType.NO_RELATION_FOUND

        if relation == RelationType.EXACT_STRING:
            return self.extract_exact_string(mention_x, mention_y)
        if relation == RelationType.FUZZY_FIT:
            return self.extract_fuzzy_fit(mention_x, mention_y)
        if relation == RelationType.FUZZY_HEAD_FIT:
            return self.extract_fuzzy_head_fit(mention_x, mention_y)
        if relation == RelationType.SAME_HEAD_LEMMA:
            is_same_lemma = self.extract_same_head_lemma(mention_x, mention_y)
            if is_same_lemma != RelationType.NO_RELATION_FOUND:
                return relation

        return RelationType.NO_RELATION_FOUND
    def extract_all_relations(
            self, mention_x: MentionDataLight,
            mention_y: MentionDataLight) -> Set[RelationType]:
        """
        Check whether the mentions hold one or more of the relations supported by this class

        Args:
            mention_x: MentionDataLight
            mention_y: MentionDataLight

        Returns:
            Set[RelationType]: One or more of: RelationType.EXACT_STRING, RelationType.FUZZY_FIT,
                RelationType.FUZZY_HEAD_FIT, RelationType.SAME_HEAD_LEMMA,
                RelationType.SAME_HEAD_LEMMA_RELAX
        """
        relations = set()
        mention_x_str = mention_x.tokens_str
        mention_y_str = mention_y.tokens_str

        if StringUtils.is_pronoun(
                mention_x_str.lower()) or StringUtils.is_pronoun(
                    mention_y_str.lower()):
            relations.add(RelationType.NO_RELATION_FOUND)
            return relations

        relations.add(self.extract_exact_string(mention_x, mention_y))
        relations.add(self.extract_fuzzy_fit(mention_x, mention_y))
        relations.add(self.extract_fuzzy_head_fit(mention_x, mention_y))
        relations.add(self.extract_same_head_lemma(mention_x, mention_y))

        if len(relations) == 0:
            relations.add(RelationType.NO_RELATION_FOUND)

        return relations
Example No. 8
    def __init__(self, orig_phrase: str = None, orig_phrase_norm: str = None,
                 wiki_title: str = None, wiki_title_norm: str = None,
                 score: int = 0, pageid: int = 0, description: str = None,
                 relations: WikipediaPageExtractedRelations = None) -> None:
        """
        Object representing a Wikipedia page and its extracted fields.

        Args:
            orig_phrase (str): original search phrase
            orig_phrase_norm (str): original search phrase normalized
            wiki_title (str): page title
            wiki_title_norm (str): page title normalized
            score (int): score for getting wiki_title from orig_phrase
            pageid (int): the unique page identifier
            description (str, optional): the page description
            relations (WikipediaPageExtractedRelations): object that represents all
                                                         extracted Wikipedia relations
        """
        self.orig_phrase = orig_phrase
        if orig_phrase_norm is None:
            self.orig_phrase_norm = StringUtils.normalize_str(orig_phrase)
        else:
            self.orig_phrase_norm = orig_phrase_norm

        self.wiki_title = wiki_title.replace(DISAMBIGUATION_TITLE, '')
        if wiki_title_norm is None:
            self.wiki_title_norm = StringUtils.normalize_str(wiki_title)
        else:
            self.wiki_title_norm = wiki_title_norm

        self.score = score
        self.pageid = int(pageid)
        self.description = description
        self.relations = relations
    def is_both_opposite_personal_pronouns(phrase1: str, phrase2: str) -> bool:
        """
        Check whether both phrases are pronouns

        Returns:
            bool
        """
        result = False
        if StringUtils.is_pronoun(phrase1.lower()) and StringUtils.is_pronoun(
                phrase2.lower()):
            result = True

        return result
Example No. 10
    def extract_synonyms_and_derivation(word):
        """Return the WordNet lemma names and derivationally related forms of the given word."""
        lemma_names = set()
        derivationally_related_forms = set()
        for synset in wn.synsets(word):
            for lemma in synset.lemmas():
                lemma_name = lemma.name().replace('_', ' ')
                if not StringUtils.is_stop(lemma_name.lower()):
                    lemma_names.add(lemma_name)

                derivationally_related_forms.update(
                    [l.name().replace('_', ' ') for l in lemma.derivationally_related_forms()
                     if not StringUtils.is_stop(l.name().lower())])

        return lemma_names, derivationally_related_forms
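A hedged usage sketch of the derivation lookup above, calling NLTK's WordNet directly and skipping the stop-word filter for brevity; it assumes the NLTK 'wordnet' corpus has been downloaded.

from nltk.corpus import wordnet as wn

# Gather derivationally related forms for a sample word
related_forms = {
    related.name().replace('_', ' ')
    for synset in wn.synsets('destruction')
    for lemma in synset.lemmas()
    for related in lemma.derivationally_related_forms()
}
# 'destroy' is expected to appear among the related forms
print(related_forms)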
Example No. 11
    def __init__(
        self,
        tokens_str: str,
        mention_context: str = None,
        mention_head: str = None,
        mention_head_lemma: str = None,
        mention_pos: str = None,
        mention_ner: str = None,
    ):
        """
        Object representing a mention with only textual values
        Args:
            tokens_str: str the mention tokens joined with a space
            mention_context: str the mention surrounding context (optional)
            mention_head: str (optional)
            mention_head_lemma: str (optional)
            mention_pos: str the mention head part-of-speech tag (optional)
            mention_ner: str the mention head NER tag (optional)
        """
        self.tokens_str = tokens_str
        self.mention_context = mention_context
        if not mention_head and not mention_head_lemma:
            (
                self.mention_head,
                self.mention_head_lemma,
                self.mention_head_pos,
                self.mention_ner,
            ) = StringUtils.find_head_lemma_pos_ner(str(tokens_str))
        else:
            self.mention_head = mention_head
            self.mention_head_lemma = mention_head_lemma
            self.mention_head_pos = mention_pos
            self.mention_ner = mention_ner
Example No. 12
    def get_pages(self, mention):
        """Build (or fetch from the cache) the WordnetPage for the given mention."""
        if mention.tokens_str in self.cache:
            return self.cache[mention.tokens_str]

        head_synonyms, head_names_derivationally = self.extract_synonyms_and_derivation(
            mention.mention_head)
        head_lemma_synonyms, head_lemma_derivationally = self.extract_synonyms_and_derivation(
            mention.mention_head_lemma)
        clean_phrase = StringUtils.normalize_str(mention.tokens_str)
        all_clean_words_synonyms = self.all_clean_words_synonyms(clean_phrase)

        wordnet_page = WordnetPage(
            mention.tokens_str,
            clean_phrase,
            mention.mention_head,
            mention.mention_head_lemma,
            head_synonyms,
            head_lemma_synonyms,
            head_names_derivationally,
            head_lemma_derivationally,
            all_clean_words_synonyms,
        )

        self.cache[mention.tokens_str] = wordnet_page
        return wordnet_page
    def extract_same_head_lemma(mention_x: MentionDataLight,
                                mention_y: MentionDataLight) -> RelationType:
        """
        Check whether the input mentions have the same head lemma relation

        Args:
            mention_x: MentionDataLight
            mention_y: MentionDataLight

        Returns:
            RelationType.SAME_HEAD_LEMMA or RelationType.NO_RELATION_FOUND
        """
        if StringUtils.is_preposition(mention_x.mention_head_lemma.lower()) or \
                StringUtils.is_preposition(mention_y.mention_head_lemma.lower()):
            return RelationType.NO_RELATION_FOUND
        if mention_x.mention_head_lemma.lower() == mention_y.mention_head_lemma.lower():
            return RelationType.SAME_HEAD_LEMMA
        return RelationType.NO_RELATION_FOUND
Example No. 14
    def all_clean_words_synonyms(clean_phrase):
        """Return, for each word in the phrase, the set of its WordNet synonym lemma names."""
        words = clean_phrase.split()
        return [
            set([
                lemma.lower().replace("_", " ") for synset in wn.synsets(w)
                for lemma in synset.lemma_names()
                if not StringUtils.is_stop(lemma.lower())
            ]) for w in words
        ]
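A self-contained sketch of the per-word synonym lookup above, using NLTK's WordNet directly with a trivial stop-word set standing in for StringUtils.is_stop (assumed for illustration); it also requires the NLTK 'wordnet' corpus.

from nltk.corpus import wordnet as wn

STOP_WORDS = {'be', 'have', 'do'}  # hypothetical stand-in stop list

def clean_words_synonyms_sketch(clean_phrase):
    # One set of lower-cased WordNet lemma names per word in the phrase
    return [
        {lemma.lower().replace('_', ' ')
         for synset in wn.synsets(word)
         for lemma in synset.lemma_names()
         if lemma.lower() not in STOP_WORDS}
        for word in clean_phrase.split()
    ]

# e.g. 'purchase' should appear in clean_words_synonyms_sketch('buy house')[0]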
Example No. 15
    def extract_all_relations(
            self, mention_x: MentionDataLight,
            mention_y: MentionDataLight) -> Set[RelationType]:
        """
        Check whether the mentions hold one or more of the relations supported by this class

        Args:
            mention_x: MentionDataLight
            mention_y: MentionDataLight

        Returns:
            Set[RelationType]: One or more of: RelationType.WORDNET_SAME_SYNSET_ENTITY,
                RelationType.WORDNET_SAME_SYNSET_EVENT, RelationType.WORDNET_PARTIAL_SYNSET_MATCH,
                RelationType.WORDNET_DERIVATIONALLY
        """
        relations = set()
        mention_x_str = mention_x.tokens_str
        mention_y_str = mention_y.tokens_str
        if StringUtils.is_pronoun(
                mention_x_str.lower()) or StringUtils.is_pronoun(
                    mention_y_str.lower()):
            relations.add(RelationType.NO_RELATION_FOUND)
            return relations

        page_x = self.wordnet_impl.get_pages(mention_x)
        page_y = self.wordnet_impl.get_pages(mention_y)

        if page_x and page_y:
            deriv_rel = self.extract_derivation(page_x, page_y)
            part_syn_rel = self.extract_partial_synset_match(page_x, page_y)
            same_syn_rel = self.extract_same_synset_entity(page_x, page_y)
            if deriv_rel != RelationType.NO_RELATION_FOUND:
                relations.add(deriv_rel)
            if part_syn_rel != RelationType.NO_RELATION_FOUND:
                relations.add(part_syn_rel)
            if same_syn_rel != RelationType.NO_RELATION_FOUND:
                relations.add(same_syn_rel)

        if len(relations) == 0:
            relations.add(RelationType.NO_RELATION_FOUND)

        return relations
Example No. 16
    def extract_fuzzy_head_fit(mention_x: MentionDataLight,
                               mention_y: MentionDataLight) -> RelationType:
        """
        Check whether the input mentions have the fuzzy head fit relation

        Args:
            mention_x: MentionDataLight
            mention_y: MentionDataLight

        Returns:
            RelationType.FUZZY_HEAD_FIT or RelationType.NO_RELATION_FOUND
        """
        if StringUtils.is_preposition(mention_x.mention_head_lemma.lower()) or \
                StringUtils.is_preposition(mention_y.mention_head_lemma.lower()):
            return RelationType.NO_RELATION_FOUND

        mention_y_tokens = mention_y.tokens_str.split()
        mention_x_tokens = mention_x.tokens_str.split()
        if mention_x.mention_head in mention_y_tokens or mention_y.mention_head in mention_x_tokens:
            return RelationType.FUZZY_HEAD_FIT
        return RelationType.NO_RELATION_FOUND
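A small self-contained illustration of the fuzzy-head-fit test above, with plain strings standing in for MentionDataLight and the preposition guard omitted (a simplification for illustration).

def fuzzy_head_fit_sketch(head_x: str, tokens_x: str, head_y: str, tokens_y: str) -> bool:
    # The relation holds when either mention's head token appears among
    # the other mention's surface tokens
    return head_x in tokens_y.split() or head_y in tokens_x.split()

assert fuzzy_head_fit_sketch('attack', 'the attack', 'attack', 'terror attack in Paris')
assert not fuzzy_head_fit_sketch('meeting', 'the meeting', 'talks', 'peace talks')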
Example No. 17
    def extract_relations_from_text_v0(self, text):
        """Extract categories, links and title-parenthesis phrases from the raw Wikipedia page text."""
        self.disambiguation_links = set()
        self.categories = set()
        self.title_parenthesis = set()
        self.be_comp = set()

        self.disambiguation_links_norm = set()
        self.categories_norm = set()
        self.title_parenthesis_norm = set()
        self.be_comp_norm = set()

        ext_links = set()
        title_parenthesis = set()

        text_lines = text.split('\n')
        for line in text_lines:
            cat_links = self.extract_categories(line)
            if not self.is_part_name:
                self.is_part_name = self.is_name_part(line)
                if not self.is_part_name and [
                        s for s in PART_NAME_CATEGORIES if s in cat_links
                ]:
                    self.is_part_name = True

            self.categories.update(cat_links)
            self.categories_norm.update(
                StringUtils.normalize_string_list(cat_links))

            links, parenthesis_links = self.extract_links_and_parenthesis(line)
            ext_links.update(links)
            title_parenthesis.update(parenthesis_links)

        if self.is_disambiguation:
            self.disambiguation_links = ext_links
            self.disambiguation_links_norm = StringUtils.normalize_string_list(
                ext_links)
            self.title_parenthesis = title_parenthesis
            self.title_parenthesis_norm = StringUtils.normalize_string_list(
                title_parenthesis)
    def extract_exact_string(mention_x: MentionDataLight,
                             mention_y: MentionDataLight) -> RelationType:
        """
        Check whether the input mentions have the exact string relation

        Args:
            mention_x: MentionDataLight
            mention_y: MentionDataLight

        Returns:
            RelationType.EXACT_STRING or RelationType.NO_RELATION_FOUND
        """
        relation = RelationType.NO_RELATION_FOUND
        mention1_str = mention_x.tokens_str
        mention2_str = mention_y.tokens_str
        if StringUtils.is_preposition(mention1_str.lower()) or \
                StringUtils.is_preposition(mention2_str.lower()):
            return relation

        if mention1_str.lower() == mention2_str.lower():
            relation = RelationType.EXACT_STRING

        return relation
Example No. 19
def test_is_preposition():
    assert StringUtils.is_preposition("the") is False
    assert StringUtils.is_preposition("on")
Example No. 20
    def read_json_mention_data_line(mention_line: dict):
        """

        Args:
            mention_line: a dict parsed from the JSON line representing a single mention

        Returns:
            MentionData object
        """
        mention_data = None
        try:
            topic_id = None
            coref_chain = None
            doc_id = None
            sent_id = None
            tokens_numbers = None
            score = -1
            mention_type = None
            predicted_coref_chain = None
            mention_context = None
            is_continue = False
            is_singleton = False
            mention_pos = None
            mention_ner = None

            mention_text = mention_line['tokens_str']

            if 'topic_id' in mention_line:
                topic_id = mention_line['topic_id']

            if 'coref_chain' in mention_line:
                coref_chain = mention_line['coref_chain']

            if 'doc_id' in mention_line:
                doc_id = mention_line['doc_id']
                if '.xml' not in doc_id:
                    doc_id = doc_id + '.xml'

            if 'sent_id' in mention_line:
                sent_id = mention_line['sent_id']

            if 'tokens_number' in mention_line:
                tokens_numbers = mention_line['tokens_number']

            if 'mention_context' in mention_line:
                mention_context = mention_line['mention_context']

            if 'mention_head' in mention_line and 'mention_head_lemma' in mention_line:
                mention_head = mention_line['mention_head']
                mention_head_lemma = mention_line['mention_head_lemma']
                if 'mention_head_pos' in mention_line:
                    mention_pos = mention_line['mention_head_pos']
                if 'mention_ner' in mention_line:
                    mention_ner = mention_line['mention_ner']
            else:
                mention_head, mention_head_lemma, mention_pos, \
                    mention_ner = StringUtils.find_head_lemma_pos_ner(str(mention_text))

            if 'mention_type' in mention_line:
                mention_type = mention_line['mention_type']
            if 'score' in mention_line:
                score = mention_line['score']

            if 'is_continuous' in mention_line:
                is_continue = mention_line['is_continuous']

            if 'is_singleton' in mention_line:
                is_singleton = mention_line['is_singleton']

            if 'predicted_coref_chain' in mention_line:
                predicted_coref_chain = mention_line['predicted_coref_chain']

            mention_data = MentionData(
                topic_id, doc_id, sent_id, tokens_numbers, mention_text,
                mention_context, mention_head, mention_head_lemma, coref_chain,
                mention_type, is_continue, is_singleton, score,
                predicted_coref_chain, mention_pos, mention_ner)
        except Exception as exc:
            print('Unexpected error:', sys.exc_info()[0])
            raise Exception('failed reading json line-' + str(mention_line)) from exc

        return mention_data
Example No. 21
def test_is_stopword():
    assert StringUtils.is_stop("always")
    assert StringUtils.is_stop("sunday") is False
Example No. 22
    def read_json_mention_data_line(mention_line: dict):
        """
        Args:
            mention_line: a dict parsed from the JSON line representing a single mention

        Returns:
            MentionData object
        """
        # pylint: disable=too-many-branches

        try:
            topic_id = None
            coref_chain = None
            doc_id = None
            sent_id = None
            tokens_numbers = None
            score = -1
            mention_type = None
            predicted_coref_chain = None
            mention_context = None
            is_continue = False
            is_singleton = False
            mention_pos = None
            mention_ner = None
            mention_index = -1

            mention_text = mention_line["tokens_str"]

            if "topic_id" in mention_line:
                topic_id = mention_line["topic_id"]

            if "coref_chain" in mention_line:
                coref_chain = mention_line["coref_chain"]

            if "doc_id" in mention_line:
                doc_id = mention_line["doc_id"]
                if ".xml" not in doc_id:
                    doc_id = doc_id + ".xml"

            if "sent_id" in mention_line:
                sent_id = mention_line["sent_id"]

            if "tokens_number" in mention_line:
                tokens_numbers = mention_line["tokens_number"]

            if "mention_context" in mention_line:
                mention_context = mention_line["mention_context"]

            if "mention_head" in mention_line and "mention_head_lemma" in mention_line:
                mention_head = mention_line["mention_head"]
                mention_head_lemma = mention_line["mention_head_lemma"]
                if "mention_head_pos" in mention_line:
                    mention_pos = mention_line["mention_head_pos"]
                if "mention_ner" in mention_line:
                    mention_ner = mention_line["mention_ner"]
            else:
                (
                    mention_head,
                    mention_head_lemma,
                    mention_pos,
                    mention_ner,
                ) = StringUtils.find_head_lemma_pos_ner(str(mention_text))

            if "mention_type" in mention_line:
                mention_type = mention_line["mention_type"]
            if "score" in mention_line:
                score = mention_line["score"]

            if "is_continuous" in mention_line:
                is_continue = mention_line["is_continuous"]

            if "is_singleton" in mention_line:
                is_singleton = mention_line["is_singleton"]

            if "predicted_coref_chain" in mention_line:
                predicted_coref_chain = mention_line["predicted_coref_chain"]

            if "mention_index" in mention_line:
                mention_index = mention_line["mention_index"]

            mention_data = MentionData(
                topic_id,
                doc_id,
                sent_id,
                tokens_numbers,
                mention_text,
                mention_context,
                mention_head,
                mention_head_lemma,
                coref_chain,
                mention_type,
                is_continue,
                is_singleton,
                score,
                predicted_coref_chain,
                mention_pos,
                mention_ner,
                mention_index,
            )
        except Exception as exc:
            print("Unexpected error:", sys.exc_info()[0])
            raise Exception("failed reading json line-" + str(mention_line)) from exc

        return mention_data
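A hedged usage sketch for the reader above, assuming the mentions are stored one JSON object per line in a hypothetical mentions.json file, that the function is exposed as a static method of MentionData (as its return value suggests), and that it receives the already-parsed dict, as the key lookups in the body indicate.

import json

with open("mentions.json", encoding="utf-8") as mentions_file:  # hypothetical path
    mentions = [
        MentionData.read_json_mention_data_line(json.loads(line))
        for line in mentions_file
        if line.strip()
    ]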
Example No. 24
def test_is_pronoun():
    assert StringUtils.is_pronoun('anybody')
    assert StringUtils.is_pronoun('the') is False
Example No. 25
def test_is_determiner():
    assert StringUtils.is_determiner('the')
    assert StringUtils.is_determiner('on') is False