def arrange_resource(wd_mentions_json):
    """Map each mention token's unique id to its within-doc coref chain.

    Args:
        wd_mentions_json: iterable of json mention lines (within-doc mentions).

    Returns:
        dict mapping token unique id -> the mention's coref_chain value.
    """
    document_tokens_dict = dict()
    for mention_json in wd_mentions_json:
        mention_data = MentionData.read_json_mention_data_line(mention_json)
        # doc_id/sent_id are invariant per mention — hoisted out of the token loop
        doc_id = mention_data.doc_id
        sent_id = mention_data.sent_id
        for token_number in mention_data.tokens_number:
            token_map_key = MentionData.static_gen_token_unique_id(
                doc_id, sent_id, token_number)
            document_tokens_dict[token_map_key] = mention_data.coref_chain
    return document_tokens_dict
def load_mentions_vocab_from_files(mentions_files, filter_stop_words=False):
    """Read every mentions file and build the mentions vocabulary from them."""
    logger.info("Loading mentions files...")
    all_mentions = []
    for mentions_file in mentions_files:
        file_mentions = MentionData.read_mentions_json_to_mentions_data_list(mentions_file)
        all_mentions.extend(file_mentions)
    return load_mentions_vocab(all_mentions, filter_stop_words)
def order_mentions_by_topics(self, mentions: str) -> List[Topic]:
    """
    Order mentions to documents topics
    Args:
        mentions: json mentions file

    Returns:
        List[Topic] of the mentions separated by their documents topics
    """
    topics = []
    next_index = 0
    for mention_line in mentions:
        mention = MentionData.read_json_mention_data_line(mention_line)
        if self.keep_order and mention.mention_index == -1:
            # preserve reading order for mentions that carry no index yet
            mention.mention_index = next_index
            next_index += 1
        # open a new topic on the first mention or whenever the topic id changes
        if not topics or mention.topic_id != topics[-1].topic_id:
            topics.append(Topic(mention.topic_id))
        topics[-1].mentions.append(mention)
    return topics
def load_mentions_vocab(mentions_files, filter_stop_words=False):
    """Load mentions from the given files and extract their vocabulary.

    Args:
        mentions_files: iterable of json mention file paths.
        filter_stop_words: when True, filter stop words from the vocabulary.

    Returns:
        the vocabulary extracted from all loaded mentions.
    """
    logger.info('Loading mentions files...')
    mentions = []
    for _file in mentions_files:
        mentions.extend(
            MentionData.read_mentions_json_to_mentions_data_list(_file))
    # log completion only after the files have actually been read
    # (previously this was logged before the loading loop ran)
    logger.info('Done loading mentions files, starting local dump creation...')
    return extract_vocab(mentions, filter_stop_words)
def load_mentions_vocab(mentions_files, filter_stop_words=False):
    """Read all mention files and extract their vocabulary."""
    logger.info('Loading mentions files...')
    # flatten every file's mentions into a single list
    mentions = [
        mention
        for mentions_file in mentions_files
        for mention in MentionData.read_mentions_json_to_mentions_data_list(mentions_file)
    ]
    vocab = extract_vocab(mentions, filter_stop_words)
    logger.info('Done loading mentions files...')
    return vocab
def extract_within_coref(self, mention: MentionData) -> List[str]:
    """Collect within-document coref chain ids for each token of *mention*.

    Args:
        mention: mention whose tokens are looked up in
            ``self.within_doc_coref_chain``.

    Returns:
        List[str]: one chain id per token found with a truthy chain value;
        a token whose chain value is falsy appends ``"-"`` and stops the
        scan. Tokens with no entry in the map are skipped silently.
    """
    tokens = mention.tokens_number
    within_coref_token = []
    for token_id in tokens:
        token_x_id = MentionData.static_gen_token_unique_id(
            str(mention.doc_id), str(mention.sent_id), str(token_id))
        if token_x_id in self.within_doc_coref_chain:
            token_coref_chain = self.within_doc_coref_chain[token_x_id]
            if token_coref_chain:
                within_coref_token.append(token_coref_chain)
            else:
                # empty chain value: mark with "-" and abandon remaining tokens.
                # NOTE(review): the break skips the rest of the mention's tokens —
                # confirm that is intended rather than `continue`.
                within_coref_token.append("-")
                break
    return within_coref_token
def wordnet_dump():
    """Build a local wordnet dump for all mentions and write it to json."""
    out_file = args.output
    mentions_file = args.mentions
    logger.info('Loading mentions files...')
    mentions = MentionData.read_mentions_json_to_mentions_data_list(mentions_file)
    logger.info('Done loading mentions files, starting local dump creation...')
    wordnet = WordnetOnline()
    # one wordnet page per original phrase; later mentions overwrite earlier ones
    pages = (wordnet.get_pages(mention) for mention in mentions)
    result_dump = {page.orig_phrase: page for page in pages}
    with open(out_file, 'w') as out:
        json.dump(result_dump, out, default=json_dumper)
    logger.info('Wordnet Dump Created Successfully, '
                'extracted total of %d wn pages', len(result_dump))
    logger.info('Saving dump to file-%s', out_file)
def order_mentions_by_topics(mentions: str) -> List[Topic]:
    """
    Order mentions to documents topics
    Args:
        mentions: json mentions file

    Returns:
        List[Topic] of the mentions separated by their documents topics
    """
    topics = []
    for mention_line in mentions:
        mention = MentionData.read_json_mention_data_line(mention_line)
        # start a new topic on the first mention or when the topic id changes
        if not topics or mention.topic_id != topics[-1].topic_id:
            topics.append(Topic(mention.topic_id))
        topics[-1].mentions.append(mention)
    return topics
def elmo_dump():
    """Compute ELMo embeddings for the mentions vocabulary and pickle them."""
    out_file = args.output
    # accept either a single mentions file or a directory tree of mention files
    if os.path.isdir(args.mentions):
        mention_files = [
            join(dirpath, file_name)
            for dirpath, _, file_names in os.walk(args.mentions)
            for file_name in file_names
            if file_name != '.DS_Store'
        ]
    else:
        mention_files = [args.mentions]
    mentions = []
    for mention_file in mention_files:
        mentions.extend(MentionData.read_mentions_json_to_mentions_data_list(mention_file))
    elmo_ecb_embeddings = load_elmo_for_vocab(mentions)
    with open(out_file, 'wb') as f:
        pickle.dump(elmo_ecb_embeddings, f)
    logger.info('Saving dump to file-%s', out_file)
def get_wordnet_mentions():
    """Build sample mentions for wordnet tests.

    Returns:
        list of MentionData parsed from hard-coded mention dicts.
    """
    mentions_json = [
        {"mention_id": "0", "tokens_str": "play", "topic_id": "1ecb"},
        {"mention_id": "1", "tokens_str": "game", "topic_id": "1ecb"},
        {"mention_id": "2", "tokens_str": "Chair", "topic_id": "1ecb"},
    ]
    # loop variable renamed from `json`, which shadowed the json module
    return [MentionData.read_json_mention_data_line(mention_json)
            for mention_json in mentions_json]
def get_compute_mentions():
    """Build sample mentions for compute tests.

    Returns:
        list of MentionData parsed from hard-coded mention dicts.
    """
    mentions_json = [
        {"mention_id": "0", "tokens_str": "Exact String", "topic_id": "1ecb"},
        {"mention_id": "1", "tokens_str": "Exact Same Head String", "topic_id": "1ecb"},
        {"mention_id": "2", "tokens_str": "Nothing", "topic_id": "1ecb"},
    ]
    # loop variable renamed from `json`, which shadowed the json module
    return [MentionData.read_json_mention_data_line(mention_json)
            for mention_json in mentions_json]
def get_wiki_mentions():
    """Build sample mentions for wiki tests.

    Returns:
        list of MentionData parsed from hard-coded mention dicts.
    """
    mentions_json = [
        {"mention_id": "0", "tokens_str": "Ellen DeGeneres", "topic_id": "1ecb"},
        {"mention_id": "1", "tokens_str": "television host", "topic_id": "1ecb"},
        {"mention_id": "2", "tokens_str": "Los Angeles", "topic_id": "1ecb"},
    ]
    # loop variable renamed from `json`, which shadowed the json module
    return [MentionData.read_json_mention_data_line(mention_json)
            for mention_json in mentions_json]
def add_mention(self, mention: MentionData) -> None:
    """Attach *mention* to this cluster and record its string and coref id."""
    if mention is None:
        return
    mention.predicted_coref_chain = self.coref_chain
    self.mentions.append(mention)
    self.cluster_strings.append(mention.tokens_str)
    self.mentions_corefs.add(mention.coref_chain)
def create_ment_id(mention_x: MentionData, mention_y: MentionData) -> str:
    """Build a pair id by joining the two mention ids with an underscore."""
    first_id = mention_x.get_mention_id()
    second_id = mention_y.get_mention_id()
    return f"{first_id}_{second_id}"
def get_embedd_mentions():
    """Build sample mentions for embedding tests.

    Returns:
        list of MentionData parsed from hard-coded mention dicts.
    """
    mentions_json = [
        {
            "coref_chain": "HUM16236184328979740",
            "doc_id": "0",
            "mention_context": [
                "Perennial", "party", "girl", "Tara", "Reid", "checked",
                "herself", "into", "Promises", "Treatment", "Center", ",",
                "her", "rep", "told", "People", ".",
            ],
            "mention_head": "Reid",
            "mention_head_lemma": "reid",
            "mention_head_pos": "PROPN",
            "mention_id": "0",
            "mention_index": -1,
            "mention_ner": "PERSON",
            "mention_type": "HUM",
            "predicted_coref_chain": None,
            "score": -1.0,
            "sent_id": 0,
            "tokens_number": [3, 4],
            "tokens_str": "Tara Reid",
            "topic_id": "1ecb",
        },
        {
            "coref_chain": "HUM16236184328979740",
            "doc_id": "1_12ecb.xml",
            "mention_context": [
                "Tara", "Reid", "has", "checked", "into", "Promises",
                "Treatment", "Center", ",", "a", "prominent", "rehab",
                "clinic", "in", "Los", "Angeles", ".",
            ],
            "mention_head": "Reid",
            "mention_head_lemma": "reid",
            "mention_head_pos": "PROPN",
            "mention_id": "1",
            "mention_index": -1,
            "mention_ner": "PERSON",
            "mention_type": "HUM",
            "predicted_coref_chain": None,
            "score": -1.0,
            "sent_id": 1,
            "tokens_number": [0, 1],
            "tokens_str": "Tara Reid",
            "topic_id": "1ecb",
        },
        {
            "coref_chain": "Singleton_LOC_8_1_12ecb",
            "doc_id": "1_12ecb.xml",
            "mention_context": [
                "Tara", "Reid", "has", "checked", "into", "Promises",
                "Treatment", "Center", ",", "a", "prominent", "rehab",
                "clinic", "in", "Los", "Angeles", ".",
            ],
            "mention_head": "in",
            "mention_head_lemma": "in",
            "mention_head_pos": "ADP",
            "mention_id": "2",
            "mention_index": -1,
            "mention_ner": None,
            "mention_type": "LOC",
            "predicted_coref_chain": None,
            "score": -1.0,
            "sent_id": 1,
            "tokens_number": [13, 14, 15],
            "tokens_str": "in Los Angeles",
            "topic_id": "1ecb",
        },
        # NOTE: this entry deliberately has no "mention_context" list and no
        # "mention_index" key, unlike the entries above.
        {
            "coref_chain": "HUM16236184328979740",
            "doc_id": "0",
            "mention_context": None,
            "mention_head": "Reid",
            "mention_head_lemma": "reid",
            "mention_head_pos": "PROPN",
            "mention_id": "3",
            "mention_ner": "PERSON",
            "mention_type": "HUM",
            "predicted_coref_chain": None,
            "score": -1.0,
            "sent_id": 0,
            "tokens_number": [3, 4],
            "tokens_str": "Tara Reid",
            "topic_id": "1ecb",
        },
    ]
    # loop variable renamed from `json`, which shadowed the json module
    return [MentionData.read_json_mention_data_line(mention_json)
            for mention_json in mentions_json]