def arrange_resource(wd_mentions_json):
    """Build a token-id -> coref-chain lookup from within-doc mention json lines.

    Args:
        wd_mentions_json: iterable of json mention lines/objects readable by
            MentionData.read_json_mention_data_line.

    Returns:
        dict mapping each mention token's unique id (built from doc_id,
        sent_id and the token number) to that mention's coref_chain.
    """
    document_tokens_dict = {}
    for mention_json in wd_mentions_json:
        mention_data = MentionData.read_json_mention_data_line(mention_json)
        # Hoist per-mention invariants out of the token loop and iterate
        # the tokens directly instead of indexing with range(len(...)).
        doc_id = mention_data.doc_id
        sent_id = mention_data.sent_id
        for token_number in mention_data.tokens_number:
            token_map_key = MentionData.static_gen_token_unique_id(
                doc_id, sent_id, token_number)
            document_tokens_dict[token_map_key] = mention_data.coref_chain
    return document_tokens_dict
Ejemplo n.º 2
0
def load_mentions_vocab_from_files(mentions_files, filter_stop_words=False):
    """Read every mentions file and delegate vocabulary creation.

    Args:
        mentions_files: iterable of mention json file paths.
        filter_stop_words: forwarded to load_mentions_vocab.

    Returns:
        Whatever load_mentions_vocab produces for the combined mentions.
    """
    logger.info("Loading mentions files...")
    mentions = [
        mention
        for mentions_file in mentions_files
        for mention in MentionData.read_mentions_json_to_mentions_data_list(
            mentions_file)
    ]
    return load_mentions_vocab(mentions, filter_stop_words)
Ejemplo n.º 3
0
    def order_mentions_by_topics(self, mentions: str) -> List[Topic]:
        """
        Order mentions to documents topics
        Args:
            mentions: json mentions file

        Returns:
            List[Topic] of the mentions separated by their documents topics
        """
        next_free_index = 0
        topics: List[Topic] = []
        current_topic = None
        for mention_line in mentions:
            mention = MentionData.read_json_mention_data_line(mention_line)

            # When preserving input order, give a running index to mentions
            # that were not indexed yet (mention_index == -1).
            if self.keep_order and mention.mention_index == -1:
                mention.mention_index = next_free_index
                next_free_index += 1

            topic_id = mention.topic_id

            # Open a new topic on the very first mention or whenever the
            # topic id differs from the topic we last appended.
            topic_changed = bool(topics) and topic_id != topics[-1].topic_id
            if not current_topic or topic_changed:
                current_topic = Topic(topic_id)
                topics.append(current_topic)

            current_topic.mentions.append(mention)

        return topics
Ejemplo n.º 4
0
def load_mentions_vocab(mentions_files, filter_stop_words=False):
    """Load mentions from the given files and extract their vocabulary.

    Args:
        mentions_files: iterable of mention json file paths.
        filter_stop_words: forwarded to extract_vocab.

    Returns:
        The vocabulary produced by extract_vocab over all loaded mentions.
    """
    logger.info('Loading mentions files...')
    mentions = []
    for _file in mentions_files:
        mentions.extend(
            MentionData.read_mentions_json_to_mentions_data_list(_file))
    # Bug fix: this message used to be logged *before* the loading loop,
    # claiming loading was done before it had even started.
    logger.info('Done loading mentions files, starting local dump creation...')
    return extract_vocab(mentions, filter_stop_words)
Ejemplo n.º 5
0
def load_mentions_vocab(mentions_files, filter_stop_words=False):
    """Load mentions from the given files and build their vocabulary.

    Args:
        mentions_files: iterable of mention json file paths.
        filter_stop_words: forwarded to extract_vocab.

    Returns:
        The vocabulary produced by extract_vocab over all loaded mentions.
    """
    logger.info('Loading mentions files...')
    mentions = [
        mention
        for mentions_file in mentions_files
        for mention in MentionData.read_mentions_json_to_mentions_data_list(
            mentions_file)
    ]
    vocab = extract_vocab(mentions, filter_stop_words)
    logger.info('Done loading mentions files...')
    return vocab
    def extract_within_coref(self, mention: MentionData) -> List[str]:
        """Collect the within-doc coref chain of each token in *mention*.

        Stops at the first token with no entry in within_doc_coref_chain,
        appending a "-" placeholder for it. Tokens whose stored chain is
        falsy are silently skipped.
        """
        coref_chains = []
        for token_number in mention.tokens_number:
            unique_token_id = MentionData.static_gen_token_unique_id(
                str(mention.doc_id), str(mention.sent_id), str(token_number))
            if unique_token_id not in self.within_doc_coref_chain:
                # Unknown token: mark it and stop scanning further tokens.
                coref_chains.append("-")
                break
            chain = self.within_doc_coref_chain[unique_token_id]
            if chain:
                coref_chains.append(chain)
        return coref_chains
def wordnet_dump():
    """Fetch a WordNet page per mention and dump the result to a json file.

    Reads the mentions file and output path from the module-level `args`.
    """
    out_file = args.output
    mentions_file = args.mentions
    logger.info('Loading mentions files...')
    mentions = MentionData.read_mentions_json_to_mentions_data_list(mentions_file)
    logger.info('Done loading mentions files, starting local dump creation...')
    wordnet = WordnetOnline()
    # One WordNet page per mention, keyed by the page's original phrase.
    result_dump = {}
    for mention in mentions:
        mention_page = wordnet.get_pages(mention)
        result_dump[mention_page.orig_phrase] = mention_page

    with open(out_file, 'w') as out:
        json.dump(result_dump, out, default=json_dumper)

    logger.info('Wordnet Dump Created Successfully, '
                'extracted total of %d wn pages', len(result_dump))
    logger.info('Saving dump to file-%s', out_file)
Ejemplo n.º 8
0
    def order_mentions_by_topics(mentions: str) -> List[Topic]:
        """
        Order mentions to documents topics
        Args:
            mentions: json mentions file

        Returns:
            List[Topic] of the mentions separated by their documents topics
        """
        topics: List[Topic] = []
        current_topic = None
        for mention_line in mentions:
            mention = MentionData.read_json_mention_data_line(mention_line)
            topic_id = mention.topic_id

            # Open a new topic on the very first mention or whenever the
            # topic id differs from the topic we last appended.
            topic_changed = bool(topics) and topic_id != topics[-1].topic_id
            if not current_topic or topic_changed:
                current_topic = Topic(topic_id)
                topics.append(current_topic)

            current_topic.mentions.append(mention)

        return topics
def elmo_dump():
    """Load ELMo embeddings for all mentions and pickle them to a file.

    Reads the mentions path (file or directory) and output path from the
    module-level `args`.
    """
    out_file = args.output
    if os.path.isdir(args.mentions):
        # Walk the directory tree, skipping macOS Finder metadata files.
        mention_files = [
            join(dirpath, file_name)
            for dirpath, _, file_names in os.walk(args.mentions)
            for file_name in file_names
            if file_name != '.DS_Store'
        ]
    else:
        mention_files = [args.mentions]

    mentions = []
    for mentions_file in mention_files:
        mentions.extend(
            MentionData.read_mentions_json_to_mentions_data_list(mentions_file))

    elmo_ecb_embeddings = load_elmo_for_vocab(mentions)

    with open(out_file, 'wb') as dump_out:
        pickle.dump(elmo_ecb_embeddings, dump_out)

    logger.info('Saving dump to file-%s', out_file)
Ejemplo n.º 10
0
def get_wordnet_mentions():
    """Return sample MentionData objects for WordNet-related tests.

    Returns:
        list of MentionData built from three fixed json mention dicts.
    """
    mentions_json = [
        {"mention_id": "0", "tokens_str": "play", "topic_id": "1ecb"},
        {"mention_id": "1", "tokens_str": "game", "topic_id": "1ecb"},
        {"mention_id": "2", "tokens_str": "Chair", "topic_id": "1ecb"},
    ]

    # Bug fix: the loop variable was named `json`, shadowing the json module.
    return [
        MentionData.read_json_mention_data_line(mention_json)
        for mention_json in mentions_json
    ]
Ejemplo n.º 11
0
def get_compute_mentions():
    """Return sample MentionData objects for string-matching tests.

    Returns:
        list of MentionData built from three fixed json mention dicts.
    """
    mentions_json = [
        {"mention_id": "0", "tokens_str": "Exact String", "topic_id": "1ecb"},
        {"mention_id": "1", "tokens_str": "Exact Same Head String", "topic_id": "1ecb"},
        {"mention_id": "2", "tokens_str": "Nothing", "topic_id": "1ecb"},
    ]

    # Bug fix: the loop variable was named `json`, shadowing the json module.
    return [
        MentionData.read_json_mention_data_line(mention_json)
        for mention_json in mentions_json
    ]
Ejemplo n.º 12
0
def get_wiki_mentions():
    """Return sample MentionData objects for Wikipedia-related tests.

    Returns:
        list of MentionData built from three fixed json mention dicts.
    """
    mentions_json = [
        {"mention_id": "0", "tokens_str": "Ellen DeGeneres", "topic_id": "1ecb"},
        {"mention_id": "1", "tokens_str": "television host", "topic_id": "1ecb"},
        {"mention_id": "2", "tokens_str": "Los Angeles", "topic_id": "1ecb"},
    ]

    # Bug fix: the loop variable was named `json`, shadowing the json module.
    return [
        MentionData.read_json_mention_data_line(mention_json)
        for mention_json in mentions_json
    ]
Ejemplo n.º 13
0
 def add_mention(self, mention: MentionData) -> None:
     """Attach *mention* to this cluster, ignoring None mentions."""
     if mention is None:
         return
     # Stamp the cluster's chain on the mention before registering it.
     mention.predicted_coref_chain = self.coref_chain
     self.mentions.append(mention)
     self.cluster_strings.append(mention.tokens_str)
     self.mentions_corefs.add(mention.coref_chain)
 def create_ment_id(mention_x: MentionData, mention_y: MentionData) -> str:
     """Build a pair identifier by joining the two mention ids with '_'."""
     return mention_x.get_mention_id() + "_" + mention_y.get_mention_id()
Ejemplo n.º 15
0
def get_embedd_mentions():
    """Return sample MentionData objects for embedding tests.

    Returns:
        list of MentionData built from four fixed json mention dicts
        (two coreferring "Tara Reid" mentions, one location mention, and
        one context-less duplicate).
    """
    mentions_json = [
        {
            "coref_chain": "HUM16236184328979740",
            "doc_id": "0",
            "mention_context": [
                "Perennial", "party", "girl", "Tara", "Reid", "checked",
                "herself", "into", "Promises", "Treatment", "Center", ",",
                "her", "rep", "told", "People", ".",
            ],
            "mention_head": "Reid",
            "mention_head_lemma": "reid",
            "mention_head_pos": "PROPN",
            "mention_id": "0",
            "mention_index": -1,
            "mention_ner": "PERSON",
            "mention_type": "HUM",
            "predicted_coref_chain": None,
            "score": -1.0,
            "sent_id": 0,
            "tokens_number": [3, 4],
            "tokens_str": "Tara Reid",
            "topic_id": "1ecb",
        },
        {
            "coref_chain": "HUM16236184328979740",
            "doc_id": "1_12ecb.xml",
            "mention_context": [
                "Tara", "Reid", "has", "checked", "into", "Promises",
                "Treatment", "Center", ",", "a", "prominent", "rehab",
                "clinic", "in", "Los", "Angeles", ".",
            ],
            "mention_head": "Reid",
            "mention_head_lemma": "reid",
            "mention_head_pos": "PROPN",
            "mention_id": "1",
            "mention_index": -1,
            "mention_ner": "PERSON",
            "mention_type": "HUM",
            "predicted_coref_chain": None,
            "score": -1.0,
            "sent_id": 1,
            "tokens_number": [0, 1],
            "tokens_str": "Tara Reid",
            "topic_id": "1ecb",
        },
        {
            "coref_chain": "Singleton_LOC_8_1_12ecb",
            "doc_id": "1_12ecb.xml",
            "mention_context": [
                "Tara", "Reid", "has", "checked", "into", "Promises",
                "Treatment", "Center", ",", "a", "prominent", "rehab",
                "clinic", "in", "Los", "Angeles", ".",
            ],
            "mention_head": "in",
            "mention_head_lemma": "in",
            "mention_head_pos": "ADP",
            "mention_id": "2",
            "mention_index": -1,
            "mention_ner": None,
            "mention_type": "LOC",
            "predicted_coref_chain": None,
            "score": -1.0,
            "sent_id": 1,
            "tokens_number": [13, 14, 15],
            "tokens_str": "in Los Angeles",
            "topic_id": "1ecb",
        },
        {
            "coref_chain": "HUM16236184328979740",
            "doc_id": "0",
            "mention_context": None,
            "mention_head": "Reid",
            "mention_head_lemma": "reid",
            "mention_head_pos": "PROPN",
            "mention_id": "3",
            "mention_ner": "PERSON",
            "mention_type": "HUM",
            "predicted_coref_chain": None,
            "score": -1.0,
            "sent_id": 0,
            "tokens_number": [3, 4],
            "tokens_str": "Tara Reid",
            "topic_id": "1ecb",
        },
    ]

    # Bug fix: the loop variable was named `json`, shadowing the json module.
    return [
        MentionData.read_json_mention_data_line(mention_json)
        for mention_json in mentions_json
    ]