def _process_dataset(self, dataset: Dataset, live_objects: Dict) -> Dataset:
    transformer = SentenceTransformer(self._pretrained_model_name)

    index = {}
    mat_embeddings = []  # List[np.ndarray]

    # detokenize each sentence and run it through the sentence BERT transformer, unless it's already cached
    for (doc_id, sent_idx), df in tqdm(
            dataset.tokens.groupby([DOCUMENT_ID, SENTENCE_IDX]),
            desc="Obtaining sentence BERT embeddings",
            mininterval=10):
        detok_sentence = self._detokenizer(df[TOKEN].values.tolist())

        if detok_sentence not in self._cache:
            embedded_sentence = transformer.encode([detok_sentence],
                                                   show_progress_bar=False,
                                                   batch_size=1)[0]
            self._cache[detok_sentence] = embedded_sentence
        else:
            embedded_sentence = self._cache[detok_sentence]

        index[(doc_id, sent_idx)] = len(index)
        mat_embeddings.append(embedded_sentence.astype(np.float16))
    mat_embeddings = np.vstack(mat_embeddings)

    # and we're done
    sentence_embeddings = (index, mat_embeddings)
    dataset.set(SENTENCE_EMBEDDINGS, sentence_embeddings)
    return dataset
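# The `self._detokenizer` callable used above is defined elsewhere in this codebase. As a hedged,
# self-contained sketch of what such a callable might look like (an assumption for illustration,
# not the project's actual implementation), NLTK's TreebankWordDetokenizer turns a token list back
# into a plain sentence string, suitable both as a cache key and as SentenceTransformer input:
from nltk.tokenize.treebank import TreebankWordDetokenizer

_detok = TreebankWordDetokenizer()

def detokenize_sketch(tokens):
    # e.g. ["Two", "people", "were", "shot", "."] -> "Two people were shot."
    return _detok.detokenize(tokens)

print(detokenize_sketch(["Two", "people", "were", "shot", "."]))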
def _transform(self, dataset: Dataset, pairs: List[Tuple[Tuple, Tuple]],
               unique_mentions: Set[Tuple]):
    # obtain embeddings
    assert dataset.has(SENTENCE_EMBEDDINGS)
    sentence_embeddings = dataset.get(
        SENTENCE_EMBEDDINGS)  # type: Tuple[Dict[Tuple[str, int], int], np.array]
    embedding_index, embedding_mat = sentence_embeddings

    mentions_action = dataset.mentions_action

    # compute a mean embedding in case we need to pad somewhere
    mean_embedding = embedding_mat.mean(axis=0)

    # precompute embedding matrices for each action mention
    precomputed_sentence = {}
    precomputed_doc_start = {}
    for mention_idx in unique_mentions:
        assert len(mention_idx) == 2
        doc_id, mention_id = mention_idx

        # look up sentence embedding of the sentence containing the action mention
        sent_idx_of_action = mentions_action.loc[mention_idx, SENTENCE_IDX]
        surrounding_sent_embedding = embedding_mat[embedding_index[(
            doc_id, sent_idx_of_action)]]

        # for the document start, take n sentences from the start of the document and concatenate their embeddings
        NUM_SENTENCES_DOC_START = 3
        doc_start_sent_embeddings = []
        for i in range(NUM_SENTENCES_DOC_START):
            # there might be documents shorter than NUM_SENTENCES_DOC_START, therefore check: if there are not
            # enough sentences, pad with the mean embedding
            if (doc_id, i) in embedding_index:
                sent_embedding = embedding_mat[embedding_index[(doc_id, i)]]
            else:
                sent_embedding = mean_embedding
            doc_start_sent_embeddings.append(sent_embedding)
        doc_start_embedding = np.hstack(doc_start_sent_embeddings)

        precomputed_sentence[mention_idx] = surrounding_sent_embedding
        precomputed_doc_start[mention_idx] = doc_start_embedding

    feature_columns = []
    for vectors, feature_desc in [(precomputed_sentence, SURROUNDING_SENTENCE),
                                  (precomputed_doc_start, DOC_START)]:
        feature_column = batch_cosine_similarity(
            pairs, vectors, desc=f"{self.name} {feature_desc}")
        feature_columns.append(feature_column)
    feature_matrix = np.hstack(feature_columns)
    return feature_matrix
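# `batch_cosine_similarity` is a helper from this codebase whose implementation is not shown here.
# Below is a hedged, minimal sketch of what it plausibly computes for the dict-of-vectors call used
# above: one cosine similarity per mention pair, returned as a single feature column. The function
# body is an assumption for illustration, not the project's actual helper:
from typing import Dict, List, Tuple
import numpy as np

def batch_cosine_similarity_sketch(pairs: List[Tuple], vectors: Dict, desc: str = "") -> np.ndarray:
    similarities = []
    for a_idx, b_idx in pairs:
        a = vectors[a_idx].astype(np.float32)
        b = vectors[b_idx].astype(np.float32)
        similarities.append(float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b))))
    return np.array(similarities).reshape((-1, 1))  # one column, shape (len(pairs), 1)

# toy usage with two placeholder mention identifiers
vecs = {("doc1", 0): np.array([1.0, 0.0]), ("doc2", 3): np.array([1.0, 1.0])}
print(batch_cosine_similarity_sketch([(("doc1", 0), ("doc2", 3))], vecs))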
def _process_dataset(self, dataset: Dataset, live_objects: Dict) -> Dataset:
    dbpedia = live_objects[DBPEDIA]  # type: DbPedia

    locations = dataset.mentions_location
    assert locations is not None and DBPEDIA_URI in locations.columns, \
        "Need to entity link locations to DBpedia first!"
    linked_locations = locations.loc[locations[DBPEDIA_URI].notna(),
                                     DBPEDIA_URI]

    # look up coordinates, then reindex to make indices match
    tqdm.pandas(desc="Look up locations on DBpedia")
    with_coordinates = linked_locations.progress_apply(
        lambda uri: self._look_up_coordinates(uri, dbpedia))
    with_coordinates_reindexed = with_coordinates.reindex(locations.index)

    # look up geographic hierarchy, then reindex to make indices match
    tqdm.pandas(desc="Look up geographic hierarchy on DBpedia")
    with_hierarchy = linked_locations.progress_apply(
        lambda uri: self._look_up_geographic_hierarchy(uri, dbpedia))
    with_hierarchy_reindexed = with_hierarchy.reindex(locations.index)

    dataset.mentions_location = pd.concat(
        [locations, with_coordinates_reindexed, with_hierarchy_reindexed],
        axis=1)
    return dataset
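# `_look_up_coordinates` / `_look_up_geographic_hierarchy` and the DbPedia live object are
# project-specific and not shown. As a hedged sketch of the kind of lookup they perform, one can
# query the public DBpedia SPARQL endpoint directly for the WGS84 coordinates of a resource
# (the endpoint URL, query, and return shape here are assumptions for illustration):
from SPARQLWrapper import SPARQLWrapper, JSON

def look_up_coordinates_sketch(resource_uri: str):
    sparql = SPARQLWrapper("https://dbpedia.org/sparql")
    sparql.setQuery(f"""
        PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
        SELECT ?lat ?long WHERE {{ <{resource_uri}> geo:lat ?lat ; geo:long ?long . }} LIMIT 1
    """)
    sparql.setReturnFormat(JSON)
    bindings = sparql.query().convert()["results"]["bindings"]
    if not bindings:
        return None  # resource has no coordinates
    return float(bindings[0]["lat"]["value"]), float(bindings[0]["long"]["value"])

# e.g. look_up_coordinates_sketch("http://dbpedia.org/resource/Berlin") -> roughly (52.5, 13.4)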
def _load_dataset(self) -> Dataset:
    # load full dataset
    documents, contents, mentions = gvc_reader_utils.load_gvc_dataset(
        self._gvc_root_dir / "GVC_gold.conll",
        doc_to_subtopic_file=self._gvc_root_dir / "gvc_doc_to_event.csv")

    # look up the events for this split and which documents belong to which event, then combine the two into the
    # documents which are part of this split
    split = pd.read_csv(self._gvc_split_csv,
                        index_col=0,
                        header=None,
                        names=[EVENT_ID])
    docs_of_split = documents.loc[documents.index.get_level_values(
        SUBTOPIC).isin(split[EVENT_ID].astype(str))].set_index(DOCUMENT_ID)

    # return only instances of this split
    documents = documents.loc[documents[DOCUMENT_ID].isin(
        docs_of_split.index)].sort_index()
    contents = contents.loc[docs_of_split.index].sort_index()
    mentions_action = mentions.loc[docs_of_split.index].sort_index()

    if self._drop_0_cluster:
        mentions_action = mentions_action.loc[mentions_action[EVENT] != 0]

    dataset = Dataset(documents, contents, mentions_action)
    return dataset
def _transform(self, dataset: Dataset, pairs: List[Tuple[Tuple, Tuple]],
               unique_mentions: Set[Tuple]):
    # obtain embeddings
    assert dataset.has(ACTION_PHRASE_EMBEDDINGS)
    action_phrase_embeddings = dataset.get(
        ACTION_PHRASE_EMBEDDINGS)  # type: Tuple[Dict[Tuple[str, int], int], np.array]
    embedding_index, embedding_mat = action_phrase_embeddings

    pairs_transform = lambda idx: embedding_index[idx]
    feature_column = batch_cosine_similarity(pairs,
                                             embedding_mat,
                                             pairs_transform=pairs_transform,
                                             desc=self.name)
    feature_matrix = feature_column.reshape((-1, 1))
    return feature_matrix
def _process_dataset(self, dataset: Dataset, live_objects: Dict) -> Dataset:
    docs = dataset.documents

    # select subset of topics
    if self._num_topics is not None:
        actual_num_topics = len(docs.index.unique(TOPIC_ID))
        if self._num_topics > actual_num_topics:
            raise ValueError(
                f"This dataset only has {actual_num_topics} topics, but you asked for a subset of {self._num_topics} topics."
            )
        topics_to_use = docs.index.unique(TOPIC_ID).to_series().sample(
            self._num_topics, random_state=0).values
        selected_docs = docs.loc[docs.index.get_level_values(TOPIC_ID).isin(
            topics_to_use)]
    else:
        selected_docs = docs

    # select subset of documents per topic
    if self._num_docs_per_topic is not None:
        selected_docs = selected_docs.groupby(
            TOPIC_ID, as_index=False).apply(lambda df: df.sample(
                min(len(df), self._num_docs_per_topic), random_state=0))
        selected_docs.index = selected_docs.index.droplevel(0)

    selected_docs.sort_index(inplace=True)
    self.logger.warning(
        f"Number of documents limited to {len(selected_docs)}!")

    dataset.documents = selected_docs
    selected_doc_ids = dataset.documents[DOCUMENT_ID]

    dataset.tokens = dataset.tokens.loc[
        dataset.tokens.index.get_level_values(DOCUMENT_ID).isin(
            selected_doc_ids)]
    dataset.mentions_action = dataset.mentions_action.loc[
        dataset.mentions_action.index.get_level_values(DOCUMENT_ID).isin(
            selected_doc_ids)]
    if dataset.mentions_time is not None:
        dataset.mentions_time = dataset.mentions_time.loc[
            dataset.mentions_time.index.get_level_values(DOCUMENT_ID).isin(
                selected_doc_ids)]
    if dataset.mentions_location is not None:
        dataset.mentions_location = dataset.mentions_location.loc[
            dataset.mentions_location.index.get_level_values(
                DOCUMENT_ID).isin(selected_doc_ids)]
    if dataset.mentions_participants is not None:
        dataset.mentions_participants = dataset.mentions_participants.loc[
            dataset.mentions_participants.index.get_level_values(
                DOCUMENT_ID).isin(selected_doc_ids)]
    if dataset.mentions_other is not None:
        dataset.mentions_other = dataset.mentions_other.loc[
            dataset.mentions_other.index.get_level_values(DOCUMENT_ID).isin(
                selected_doc_ids)]
    return dataset
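# A small self-contained illustration of the per-topic subsampling pattern used above
# (groupby + sample, then dropping the extra index level that groupby().apply() introduces).
# The toy frame and column names are placeholders; exact warnings/behavior may vary by pandas version:
import pandas as pd

docs = pd.DataFrame(
    {"topic_id": ["t1", "t1", "t1", "t2", "t2"],
     "headline": ["a", "b", "c", "d", "e"]},
    index=pd.Index(["d1", "d2", "d3", "d4", "d5"], name="doc_id"))

subsampled = docs.groupby("topic_id", as_index=False).apply(
    lambda df: df.sample(min(len(df), 2), random_state=0))
subsampled.index = subsampled.index.droplevel(0)  # drop the group counter added by apply()
print(subsampled.sort_index())  # at most 2 documents per topic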
def _load_dataset(self) -> Dataset: self.logger.info("Reading raw data") # load full dataset tuples = football_reader_utils.read_split_data( self._sentence_level_data_dir, token_level_data_dir=self._token_level_data_dir) assert len(tuples) == 2, "Token-level extensions are mandatory" sentence_level_data, token_level_data = tuples # create Dataset object from sentence-level annotated data documents, tokens, _, _, _ = sentence_level_data mentions_action, mentions_participants, mentions_time, mentions_location, semantic_roles = token_level_data if self._drop_other_event_cluster: mentions_action = mentions_action.loc[ mentions_action[EVENT] != "other_event"] # We may now have some documents which do not contain any mentions. Remove those to avoid trouble in the # model code later on. documents_without_mentions = set( documents[DOCUMENT_ID].unique()) - set( mentions_action.index.get_level_values(DOCUMENT_ID).unique()) documents = documents.loc[~documents[DOCUMENT_ID]. isin(documents_without_mentions)] tokens = tokens.loc[documents[DOCUMENT_ID]].sort_index() mentions_participants = mentions_participants.loc[ ~mentions_participants.index.get_level_values(DOCUMENT_ID). isin(documents_without_mentions)] mentions_location = mentions_location.loc[ ~mentions_location.index.get_level_values(DOCUMENT_ID). isin(documents_without_mentions)] mentions_time = mentions_time.loc[~mentions_time.index. get_level_values(DOCUMENT_ID). isin(documents_without_mentions)] semantic_roles = semantic_roles.loc[~semantic_roles[DOCUMENT_ID]. isin(documents_without_mentions)] documents.sort_index(inplace=True) tokens.sort_index(inplace=True) mentions_action.sort_index(inplace=True) mentions_participants.sort_index(inplace=True) mentions_time.sort_index(inplace=True) mentions_location.sort_index(inplace=True) dataset = Dataset(documents, tokens, mentions_action, mentions_time=mentions_time, mentions_location=mentions_location, mentions_participants=mentions_participants, semantic_roles=semantic_roles) return dataset
def _load_dataset(self) -> Dataset: self.logger.info("Reading raw data") # load full dataset tuples = football_reader_utils.read_split_data( self._sentence_level_data_dir) # create Dataset object from sentence-level annotated data documents, tokens, mentions_action, _, _ = tuples[0] # add token indices for action mentions, so that the format of this dataframe matches that of the other corpora max_token_index_in_sentence = tokens.index.to_frame( index=False).groupby([DOCUMENT_ID, SENTENCE_IDX])[TOKEN_IDX].max() mentions_action_with_max_token = mentions_action.reset_index().merge( max_token_index_in_sentence, on=[DOCUMENT_ID, SENTENCE_IDX]).rename(columns={TOKEN_IDX: TOKEN_IDX_TO}) mentions_action_with_max_token[ TOKEN_IDX_TO] += 1 # remember, we use exclusive span boundaries mentions_action_with_max_token[TOKEN_IDX_FROM] = 0 mentions_action = mentions_action_with_max_token.set_index( [DOCUMENT_ID, MENTION_ID]) # We may now have some documents which do not contain any mentions. Remove those to avoid trouble in the # model code later on. documents_without_mentions = set( documents[DOCUMENT_ID].unique()) - set( mentions_action.index.get_level_values(DOCUMENT_ID).unique()) documents = documents.loc[~documents[DOCUMENT_ID]. isin(documents_without_mentions)] tokens = tokens.loc[documents[DOCUMENT_ID]].sort_index() documents.sort_index(inplace=True) tokens.sort_index(inplace=True) mentions_action.sort_index(inplace=True) dataset = Dataset(documents, tokens, mentions_action) return dataset
def _load_dataset(self) -> Dataset: self.logger.info("Reading raw data") documents, tokens, mentions, entities_events = ecb_reader_utils.read_split_data( self._path_to_data_split, self._sentence_filter_csv) # remove invalid cross-sentence mentions - there is for example one in 36_4ecbplus mentions_valid = mentions.loc[ mentions[TOKEN_IDX_FROM] < mentions[TOKEN_IDX_TO]] if len(mentions_valid) < len(mentions): self.logger.warning( f"Removed {len(mentions) - len(mentions_valid)} invalid mention(s) present in the gold data." ) mentions = mentions_valid # in 41_4ecb there is a participant mention with type "HUMAN_PART" which should be "HUMAN_PART_GPE" mentions[MENTION_TYPE] = mentions[MENTION_TYPE].replace( {"HUMAN_PART": HUMAN_PART_GPE}) if self._topics_to_load is not None: # perform topic selection topics_to_load = {str(v) for v in self._topics_to_load} topics_in_split = set( documents.index.get_level_values(TOPIC_ID).unique()) topics_not_present = topics_to_load - topics_in_split if topics_not_present: self.logger.warning( f"Cannot load these topics because they are not part of the split: {', '.join(sorted(topics_not_present))}" ) topics_to_load = list(topics_in_split & topics_to_load) if not topics_to_load: raise ValueError("At least one topic has to be selected") self.logger.info( f"Using topic(s) {', '.join(sorted(topics_to_load))}") # subselect documents = documents.loc[list(topics_to_load)].sort_index() tokens = tokens.loc[documents[DOCUMENT_ID]].sort_index() mentions = mentions.loc[documents[DOCUMENT_ID]].sort_index() # obtain action mentions mentions_action = mentions.loc[mentions[MENTION_TYPE].isin( MENTION_TYPES_ACTION)].copy() # remove documents which contain no action mentions documents_without_mentions = set( documents[DOCUMENT_ID].unique()) - set( mentions_action.index.get_level_values(DOCUMENT_ID).unique()) if documents_without_mentions: self.logger.info( f"The following documents contain no action mentions and were removed: {', '.join(sorted(documents_without_mentions))}" ) documents = documents.loc[~documents[DOCUMENT_ID]. isin(documents_without_mentions)] tokens = tokens.loc[documents[DOCUMENT_ID]].sort_index() mentions = mentions.loc[~mentions.index.get_level_values(DOCUMENT_ID). isin(documents_without_mentions)] # now divide the remainder of mentions mentions_time = mentions.loc[mentions[MENTION_TYPE].isin( MENTION_TYPES_TIME)].rename(columns={EVENT: ENTITY}) mentions_location = mentions.loc[mentions[MENTION_TYPE].isin( MENTION_TYPES_LOCATION)].rename(columns={EVENT: ENTITY}) mentions_participants = mentions.loc[mentions[MENTION_TYPE].isin( MENTION_TYPES_PARTICIPANTS)].rename(columns={EVENT: ENTITY}) assert len(mentions) == sum([ len(df) for df in [ mentions_action, mentions_time, mentions_location, mentions_participants ] ]) dataset = Dataset(documents, tokens, mentions_action, mentions_time=mentions_time, mentions_location=mentions_location, mentions_participants=mentions_participants) return dataset
def _process_dataset(self, dataset: Dataset, live_objects: Dict) -> Dataset:
    WIKIDATA_NAMESPACE = "http://www.wikidata.org/entity/"

    # determine for which QIDs we need to look up embeddings
    set_of_wikidata_qids = set()
    for df in [
            dataset.mentions_action, dataset.mentions_time,
            dataset.mentions_location, dataset.mentions_participants,
            dataset.mentions_other
    ]:
        if df is None:
            continue
        assert WIKIDATA_QID in df.columns, "Need to entity link against Wikidata first!"
        set_of_wikidata_qids |= set(
            df[WIKIDATA_QID].loc[df[WIKIDATA_QID].notna()].unique())
    wikidata_iris = {
        f"<{WIKIDATA_NAMESPACE}{qid}>": qid
        for qid in set_of_wikidata_qids
    }

    # load the relevant embedding vectors: use mmap_mode="r" to not load gigabytes of stuff into RAM
    mat_embedding = np.load(self._embedding_matrix_file, mmap_mode="r")
    num_terms = mat_embedding.shape[0]

    # Check the JSON index to find the indices of these QIDs in the pretrained embedding matrix. Use ijson to parse
    # the file incrementally, which avoids loading 3GB of JSON into RAM.
    qid_to_mat_embedding_index = {}
    qid_to_mat_embedding_subset_index = {}
    with self._json_index_file.open("rb") as f:
        for i, term in tqdm(enumerate(ijson.items(f, "item")),
                            desc="Looking up QIDs in embedding index",
                            mininterval=10,
                            total=num_terms,
                            unit="terms"):
            try:
                unicode_term = term.encode().decode("unicode_escape").strip()
            except UnicodeDecodeError as e:
                self.logger.warning(e)
                continue

            if unicode_term in wikidata_iris:
                qid = wikidata_iris.pop(unicode_term)
                qid_to_mat_embedding_index[qid] = i
                qid_to_mat_embedding_subset_index[qid] = len(
                    qid_to_mat_embedding_subset_index)

                # bail early if done
                if not wikidata_iris:
                    self.logger.info("All QIDs found!")
                    break
    if wikidata_iris:
        self.logger.warning(
            f"The following {len(wikidata_iris)} Wikidata entities were not found in the pretrained embedding index:\n"
            + pformat(wikidata_iris))

    # look up relevant embeddings
    mat_embedding_subset = mat_embedding[list(
        qid_to_mat_embedding_index.values())]

    # and we're done
    wikidata_embeddings = (qid_to_mat_embedding_subset_index,
                           mat_embedding_subset)
    dataset.set(WIKIDATA_EMBEDDINGS, wikidata_embeddings)
    return dataset
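# A self-contained sketch of the incremental JSON parsing used above: ijson.items(f, "item")
# streams the elements of a top-level JSON array one by one instead of loading the whole file.
# The in-memory file below is a stand-in for the multi-gigabyte embedding index:
import io
import ijson

fake_index_file = io.BytesIO(
    b'["<http://www.wikidata.org/entity/Q64>", "<http://www.wikidata.org/entity/Q90>"]')
for i, term in enumerate(ijson.items(fake_index_file, "item")):
    print(i, term)  # position i is the row of this term in the memory-mapped embedding matrix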
def _process_dataset(self, dataset: Dataset, live_objects: Dict) -> Dataset:
    errors = False
    mentions_el = []
    time_of_last_query = 0
    for doc_id, df in tqdm(
            dataset.tokens.groupby(DOCUMENT_ID),
            desc=f"EL with {self._entity_linker_name} on documents",
            mininterval=10):
        doc_conjoined = "".join(df[TOKEN].values)
        doc_detokenized = self._detokenizer(df[TOKEN].values.tolist())

        # obtain response from entity linker: from cache if possible, otherwise create it fresh
        if doc_detokenized not in self._entity_linker_cache:
            now = time.time()
            try:
                # apply rate limiting: make sure at least self._wait_between_requests_seconds seconds are between each request
                time_to_sleep = max(
                    0,
                    self._get_waiting_time_between_requests_seconds(
                        live_objects) - (now - time_of_last_query))
                time.sleep(time_to_sleep)

                response = self._query_entity_linker(doc_detokenized,
                                                     live_objects)
            except (ValueError, HTTPError) as e:
                self.logger.error(f"Entity linking error for {doc_id}: {e}")
                errors = True
                continue
            finally:
                time_of_last_query = now
            self._entity_linker_cache[doc_detokenized] = response
        else:
            response = self._entity_linker_cache[doc_detokenized]

        if response is None:
            self.logger.info(f"No entities found for {doc_id}.")
            continue

        response_df = self._convert_el_response_to_dataframe(
            response, live_objects)

        # we first need to map the detokenized character offsets into our tokenized character offsets
        get_alignment = get_monotonous_character_alignment_func(
            doc_conjoined, doc_detokenized)
        response_df[CHARS_START] = response_df[CHARS_START].map(get_alignment)
        # we need to work around exclusive span boundaries here
        response_df[CHARS_END] = (response_df[CHARS_END] -
                                  1).map(get_alignment) + 1

        # now, we need to move from character offsets to tokens:
        # start offsets: the first token is associated with character 0, the second token with len(token[0]) and so on
        token_start_offsets = df[TOKEN].str.len().cumsum().shift(
            1, fill_value=0)
        response_df = response_df.merge(token_start_offsets.reset_index(),
                                        left_on=CHARS_START,
                                        right_on=TOKEN)
        response_df = response_df.drop(
            columns=[CHARS_START, TOKEN, SENTENCE_IDX, DOCUMENT_ID]).rename(
                columns={TOKEN_IDX: TOKEN_IDX_FROM})

        # end offsets: We work with exclusive boundaries. If a mention lies at the end of a sentence, then its
        # TOKEN_IDX_TO needs to be +1 the index of the last token in the sentence (basically going out of bounds).
        token_end_offsets = df[TOKEN].str.len().cumsum()
        response_df = response_df.merge(token_end_offsets.reset_index(),
                                        left_on=CHARS_END,
                                        right_on=TOKEN)
        response_df = response_df.drop(columns=[CHARS_END, TOKEN]).rename(
            columns={TOKEN_IDX: TOKEN_IDX_TO})
        response_df[TOKEN_IDX_TO] = response_df[
            TOKEN_IDX_TO] + 1  # here we +1 the token index for correct exclusive boundaries

        # final dataframe format:
        # - index: doc_id, mention_id
        # - values: all the things we want to keep: support, types, similarityScore, percentageOfSecondRank, dbpedia-uri
        mentions_el_in_doc = response_df.reset_index().rename(
            columns={"index": MENTION_ID}).set_index(
                [DOCUMENT_ID, MENTION_ID])
        mentions_el.append(mentions_el_in_doc)

    if errors:
        raise ValueError("Stopping because there were errors in the process.")

    mentions_el = pd.concat(mentions_el)

    # remove invalid spans, if any exist  TODO fix the actual problem which is causing them
    mentions_el_valid = mentions_el.loc[
        mentions_el[TOKEN_IDX_FROM] < mentions_el[TOKEN_IDX_TO]]
    if len(mentions_el_valid) < len(mentions_el):
        self.logger.warning(
            f"Removed {len(mentions_el) - len(mentions_el_valid)} invalid mention(s) after DBpedia entity linking"
        )
    mentions_el = mentions_el_valid

    if self.mode not in [MODE_INTERSECT, MODE_EXTEND]:
        raise ValueError

    # set coarse entity type for each predicted entity mention
    coarse_type_to_dbpedia_type = {
        ACTION: "DBpedia:Event",
        PARTICIPANTS: "DBpedia:Agent",
        LOCATION: "DBpedia:Place",
        TIME: "DBpedia:TimePeriod"
    }
    for coarse_type, dbo_type in coarse_type_to_dbpedia_type.items():
        mentions_el.loc[mentions_el["types"].str.contains(dbo_type),
                        MENTION_TYPE_COARSE] = coarse_type
    mentions_el[MENTION_TYPE_COARSE] = mentions_el[
        MENTION_TYPE_COARSE].fillna(OTHER)

    # Enrich all gold mentions with new info from entity linking
    coarse_type_to_dataset_attr = {
        ACTION: "mentions_action",
        PARTICIPANTS: "mentions_participants",
        LOCATION: "mentions_location",
        TIME: "mentions_time",
        OTHER: "mentions_other"
    }
    for coarse_type, attr in sorted(coarse_type_to_dataset_attr.items()):
        mentions_el_of_coarse_type = mentions_el.loc[
            mentions_el[MENTION_TYPE_COARSE] == coarse_type].drop(
                columns=MENTION_TYPE_COARSE)

        # If the dataset did not contain any mentions of this type, simply assign all predicted mentions. Otherwise
        # left-join all the new columns produced by the entity linking to the gold annotations. We make sure only
        # to join entities which match the type of the gold annotations. Otherwise "The Real Housewives of Beverly
        # Hills" will be joined to "in Beverly Hills", which causes more trouble than necessary.
        dataset_mentions = getattr(dataset, attr, None)
        if dataset_mentions is None:
            new_dataset_mentions = mentions_el_of_coarse_type
        else:
            columns_keep_gold = dataset_mentions.columns
            columns_keep_system = mentions_el.columns.drop([
                TOKEN_IDX_FROM, TOKEN_IDX_TO, SENTENCE_IDX, MENTION_TEXT,
                MENTION_TYPE_COARSE
            ])
            new_dataset_mentions = left_join_predictions(
                dataset_mentions, mentions_el_of_coarse_type,
                columns_keep_gold, columns_keep_system)
        setattr(dataset, attr, new_dataset_mentions)

    if self.mode == MODE_INTERSECT:
        self.logger.info(
            "Intersected new annotations with dataset from previous pipeline stages."
        )
    elif self.mode == MODE_EXTEND:
        self.logger.info(
            "Extending dataset entities with those found during entity linking..."
        )

        # add all non-overlapping mentions found via entity linking to the dataset
        mentions_el_to_add = outer_join_predictions(mentions_el,
                                                    dataset).copy()
        for coarse_type, attr in coarse_type_to_dataset_attr.items():
            # skipping the extension for actions is of crucial importance here, otherwise we would be adding
            # additional event mentions to the dataset!
            if coarse_type == ACTION:
                continue

            mentions_el_to_add_of_coarse_type = mentions_el_to_add.loc[
                mentions_el_to_add[MENTION_TYPE_COARSE] ==
                coarse_type].drop(columns=MENTION_TYPE_COARSE)
            dataset_mentions = getattr(dataset, attr, None)
            assert dataset_mentions is not None  # this can't be None since we must have assigned something in the similar loop above
            new_dataset_mentions = pd.concat(
                [dataset_mentions,
                 mentions_el_to_add_of_coarse_type]).sort_index()
            setattr(dataset, attr, new_dataset_mentions)

    # assert that there are no "backwards spans", this has caused issues way too many times...
    for attr in coarse_type_to_dataset_attr.values():
        mentions_df = getattr(dataset, attr)
        assert mentions_df.loc[
            mentions_df[TOKEN_IDX_FROM] >= mentions_df[TOKEN_IDX_TO]].empty

    # make sure to add the mention text to each mention
    def get_mention_text_from_mention(row: pd.Series) -> str:
        return " ".join(dataset.tokens.loc[(row.name[0], row[SENTENCE_IDX],
                                            slice(row[TOKEN_IDX_FROM],
                                                  row[TOKEN_IDX_TO] - 1)),
                                           TOKEN].values)

    dataset.mentions_action[MENTION_TEXT] = dataset.mentions_action.apply(
        get_mention_text_from_mention, axis=1)
    dataset.mentions_participants[
        MENTION_TEXT] = dataset.mentions_participants.apply(
            get_mention_text_from_mention, axis=1)
    dataset.mentions_time[MENTION_TEXT] = dataset.mentions_time.apply(
        get_mention_text_from_mention, axis=1)
    dataset.mentions_location[
        MENTION_TEXT] = dataset.mentions_location.apply(
            get_mention_text_from_mention, axis=1)
    if dataset.mentions_other is not None:
        dataset.mentions_other[MENTION_TEXT] = dataset.mentions_other.apply(
            get_mention_text_from_mention, axis=1)

    return dataset
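# A toy example of the character-offset-to-token-index step in the method above: with all
# whitespace removed, the start offset of token k equals the cumulative length of tokens 0..k-1,
# so a shifted cumulative sum maps entity-linker character offsets back to token indices:
import pandas as pd

tokens = pd.Series(["Berlin", "is", "a", "city"], name="token")
token_start_offsets = tokens.str.len().cumsum().shift(1, fill_value=0)
print(token_start_offsets.tolist())  # [0, 6, 8, 9]: "is" starts at character 6 of "Berlinisacity"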
def _transform(self, dataset: Dataset, pairs: List[Tuple[Tuple, Tuple]],
               unique_mentions: Set[Tuple]):
    # obtain embeddings
    assert dataset.has(WIKIDATA_EMBEDDINGS)
    wikidata_embeddings = dataset.get(
        WIKIDATA_EMBEDDINGS)  # type: Tuple[Dict[str, int], np.array]
    embedding_index, embedding_mat = wikidata_embeddings

    # create one large dataframe of all named entities which are entity linked to Wikidata
    linked_event_components = []
    for mention_type_coarse, df in {
            ACTION: dataset.mentions_action,
            PARTICIPANTS: dataset.mentions_participants,
            TIME: dataset.mentions_time,
            LOCATION: dataset.mentions_location,
            OTHER: dataset.mentions_other
    }.items():
        if df is None:
            continue

        # keep only entities/mentions which are linked to Wikidata
        linked_subset = df.loc[df[WIKIDATA_QID].notna()]
        # drop those linked mentions for which we don't have an embedding
        with_embedding = linked_subset.loc[linked_subset[WIKIDATA_QID].isin(
            embedding_index.keys())]
        # keep only relevant columns
        only_relevant_columns = with_embedding.reindex(
            columns=[MENTION_TEXT, SENTENCE_IDX, WIKIDATA_QID])
        only_relevant_columns[MENTION_TYPE_COARSE] = mention_type_coarse
        linked_event_components.append(only_relevant_columns)
    linked_event_components = pd.concat(linked_event_components).set_index(
        MENTION_TYPE_COARSE, append=True)
    assert linked_event_components.index.is_unique

    # convert QID into index of the corresponding embedding in `embedding_mat`
    linked_event_components[WIKIDATA_QID] = linked_event_components[
        WIKIDATA_QID].map(embedding_index)
    assert linked_event_components[WIKIDATA_QID].notna().all() and \
        not linked_event_components[WIKIDATA_QID].astype(str).str.startswith("Q").any()
    linked_event_components = linked_event_components.reset_index()

    mentions_action = dataset.mentions_action
    sr = dataset.semantic_roles

    # precompute embedding matrices for each action mention
    precomputed = {}
    for mention_idx in unique_mentions:
        assert len(mention_idx) == 2
        doc_id, mention_id = mention_idx

        linked_in_doc = linked_event_components.loc[
            linked_event_components[DOCUMENT_ID] == doc_id]

        # look up embedding for action mention (rarely the case)
        linked_action_mention = linked_in_doc.loc[
            (linked_in_doc[MENTION_TYPE_COARSE] == ACTION)
            & (linked_in_doc[MENTION_ID] == mention_id)]
        if not linked_action_mention.empty:
            action_mention_embedding = embedding_mat[
                linked_action_mention[WIKIDATA_QID].values]
        else:
            action_mention_embedding = None

        # if available, create matrix of embeddings from all entity linked SRL arguments
        srl_args_of_mention = sr.loc[(sr[DOCUMENT_ID] == doc_id)
                                     & (sr[MENTION_ID] == mention_id)]
        if not srl_args_of_mention.empty:
            linked_srl_args_for_mention = srl_args_of_mention.merge(
                linked_in_doc,
                left_on=[COMPONENT_MENTION_ID, MENTION_TYPE_COARSE],
                right_on=[MENTION_ID, MENTION_TYPE_COARSE]).drop_duplicates(
                    WIKIDATA_QID)
            linked_srl_embeddings = embedding_mat[
                linked_srl_args_for_mention[WIKIDATA_QID].values]
        else:
            linked_srl_embeddings = None

        # create matrix of embeddings from all linked entities in the same sentence as the action mention
        sent_idx_of_action = mentions_action.loc[mention_idx, SENTENCE_IDX]
        linked_in_surrounding_sent = linked_in_doc.loc[
            linked_in_doc[SENTENCE_IDX] ==
            sent_idx_of_action].drop_duplicates(WIKIDATA_QID)
        if not linked_in_surrounding_sent.empty:
            surrounding_sent_embeddings = embedding_mat[
                linked_in_surrounding_sent[WIKIDATA_QID].values]
        else:
            surrounding_sent_embeddings = None

        # create matrix of embeddings from all linked entities in the context of the action mention
        NUM_SENTENCES_CONTEXT = 2
        sent_idx_from = sent_idx_of_action - NUM_SENTENCES_CONTEXT
        sent_idx_to = sent_idx_of_action + NUM_SENTENCES_CONTEXT
        linked_in_context = linked_in_doc.loc[
            (linked_in_doc[SENTENCE_IDX] >= sent_idx_from)
            & (linked_in_doc[SENTENCE_IDX] <= sent_idx_to)].drop_duplicates(
                WIKIDATA_QID)
        if not linked_in_context.empty:
            context_embeddings = embedding_mat[
                linked_in_context[WIKIDATA_QID].values]
        else:
            context_embeddings = None

        # create matrix of embeddings from linked entities at the document start
        NUM_SENTENCES_DOC_START = 3
        linked_at_doc_start = linked_in_doc.loc[
            linked_in_doc[SENTENCE_IDX] <
            NUM_SENTENCES_DOC_START].drop_duplicates(WIKIDATA_QID)
        if not linked_at_doc_start.empty:
            doc_start_embeddings = embedding_mat[
                linked_at_doc_start[WIKIDATA_QID].values]
        else:
            doc_start_embeddings = None

        precomputed[mention_idx] = {
            ACTION_MENTION: action_mention_embedding,
            SEMANTIC_ROLE_ARGS: linked_srl_embeddings,
            SURROUNDING_SENTENCE: surrounding_sent_embeddings,
            SENTENCE_CONTEXT: context_embeddings,
            DOC_START: doc_start_embeddings
        }

    # using the precomputed action mention representations, compute pairwise features
    list_of_instance_features = []
    for pair in pairs:
        a_idx, b_idx = pair
        instance_features = []

        # compute distance between action mention embeddings
        a_action_mention_mat = precomputed[a_idx][ACTION_MENTION]
        b_action_mention_mat = precomputed[b_idx][ACTION_MENTION]
        if a_action_mention_mat is None or b_action_mention_mat is None:
            instance_features.append(None)
        else:
            instance_features.append(
                cosine(a_action_mention_mat, b_action_mention_mat))

        # the order is important here, it has to match the names in __init__!
        for key in FEATURES_IN_ORDER:
            a_mat = precomputed[a_idx][key]
            b_mat = precomputed[b_idx][key]
            features_of_key = compute_pairwise_embedding_distance_features(
                a_mat, b_mat)
            instance_features += features_of_key

        instance_features = np.array(instance_features, dtype=self.dtype)
        list_of_instance_features.append(instance_features)
    feature_matrix = np.vstack(list_of_instance_features)
    return feature_matrix
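# `compute_pairwise_embedding_distance_features` is a project helper that turns two embedding
# matrices (or None) into a fixed-length list of distance features. Below is a hedged sketch of
# one plausible variant (min/mean/max pairwise cosine distance, NaN-padded when a side is
# missing); the features the project actually computes may differ:
from typing import List, Optional
import numpy as np
from scipy.spatial.distance import cdist

def pairwise_distance_features_sketch(a_mat: Optional[np.ndarray],
                                      b_mat: Optional[np.ndarray]) -> List[float]:
    if a_mat is None or b_mat is None:
        return [np.nan, np.nan, np.nan]
    distances = cdist(a_mat, b_mat, metric="cosine")  # shape: (len(a_mat), len(b_mat))
    return [float(distances.min()), float(distances.mean()), float(distances.max())]

a = np.array([[1.0, 0.0], [0.0, 1.0]])
b = np.array([[1.0, 1.0]])
print(pairwise_distance_features_sketch(a, b))  # three features per context type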
def _process_dataset(self, dataset: Dataset, live_objects: Dict) -> Dataset:
    # masking an event component entails replacing all mention tokens with a random dummy token, followed by
    # nulling all additional preprocessing columns to keep features from working off of those
    if "action" in self._mask_what:
        mentions_action = dataset.mentions_action
        tokens = self._mask_tokens(dataset.tokens, mentions_action)
        mentions_action = self._fill_columns_with_na(
            mentions_action, [DBPEDIA_URI, WIKIDATA_QID])

        dataset.mentions_action = mentions_action
        dataset.tokens = tokens

    if "participants" in self._mask_what:
        mentions_participants = dataset.mentions_participants
        semantic_roles = dataset.semantic_roles

        tokens = self._mask_tokens(dataset.tokens, mentions_participants)
        # remove all participant mentions and corresponding SRL entries
        mentions_participants = mentions_participants.iloc[0:0]
        semantic_roles = semantic_roles.loc[
            semantic_roles[MENTION_TYPE_COARSE] != PARTICIPANTS]

        dataset.mentions_participants = mentions_participants
        dataset.semantic_roles = semantic_roles
        dataset.tokens = tokens

    if "location" in self._mask_what:
        mentions_location = dataset.mentions_location
        semantic_roles = dataset.semantic_roles

        tokens = self._mask_tokens(dataset.tokens, mentions_location)
        # remove all location mentions and corresponding SRL entries
        mentions_location = mentions_location.iloc[0:0]
        semantic_roles = semantic_roles.loc[
            semantic_roles[MENTION_TYPE_COARSE] != LOCATION]

        dataset.mentions_location = mentions_location
        dataset.semantic_roles = semantic_roles
        dataset.tokens = tokens

    if "time" in self._mask_what:
        mentions_time = dataset.mentions_time
        semantic_roles = dataset.semantic_roles

        tokens = self._mask_tokens(dataset.tokens, mentions_time)
        # remove all temporal mentions and corresponding SRL entries
        mentions_time = mentions_time.iloc[0:0]
        semantic_roles = semantic_roles.loc[
            semantic_roles[MENTION_TYPE_COARSE] != TIME]

        dataset.mentions_time = mentions_time
        dataset.semantic_roles = semantic_roles
        dataset.tokens = tokens

    if "publish_date" in self._mask_what:
        documents = dataset.documents
        if PUBLISH_DATE in documents.columns:
            documents.drop(columns=PUBLISH_DATE, inplace=True)
        dataset.documents = documents

    return dataset
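# `_mask_tokens` is defined elsewhere in this codebase; the comment above describes its effect.
# A hedged, self-contained sketch of that effect (replace every token covered by a mention span
# with a dummy token; the helper name and the "[MASK]" token are assumptions for illustration):
import pandas as pd

def mask_tokens_sketch(tokens: pd.Series, spans) -> pd.Series:
    # tokens: Series of tokens of one sentence, positionally indexed
    # spans: iterable of (token_idx_from, token_idx_to) with exclusive end boundaries
    masked = tokens.copy()
    for idx_from, idx_to in spans:
        masked.iloc[idx_from:idx_to] = "[MASK]"
    return masked

sentence = pd.Series(["Police", "arrested", "the", "suspect", "."])
print(mask_tokens_sketch(sentence, [(1, 2)]).tolist())  # ['Police', '[MASK]', 'the', 'suspect', '.']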
def _process_dataset(self, dataset: Dataset, live_objects: Dict) -> Dataset:
    semantic_roles = []

    # determine sentences with action mentions
    for doc_id, mentions_action_doc in tqdm(
            dataset.mentions_action.groupby(DOCUMENT_ID),
            desc="SRL on documents",
            mininterval=10):
        # skip documents for which we already have semantic roles
        if dataset.semantic_roles is not None and doc_id in dataset.semantic_roles[
                DOCUMENT_ID].values:
            continue

        for sent_idx, mentions_action_sent in mentions_action_doc.groupby(
                SENTENCE_IDX):
            # run SRL:
            # AllenNLP SRL models accept a tokenized sentence and one verbal predicate and return argument class
            # probabilities per token, which can be converted to BIO via Viterbi. Notably, there is no possibility
            # to feed in pre-recognized argument spans, so the spans recognized by the model need to be reconciled
            # manually. Also, only _verbal_ predicates are supported.
            tokenized_sentence = dataset.tokens.loc[(doc_id, sent_idx),
                                                    TOKEN].values

            # predict SRL or obtain from cache
            if tokenized_sentence not in self._cache:
                srl_prediction = self._srl_predictor.predict_tokenized(
                    tokenized_sentence)
                self._cache[tokenized_sentence] = srl_prediction
            else:
                srl_prediction = self._cache[tokenized_sentence]

            # srl_spans: for each verbal predicate in the sentence, a list of tags and their span
            srl_spans = []  # type: List[List[Tuple[str, Tuple[int, int]]]]
            for predicate in srl_prediction["verbs"]:
                tag_spans_inclusive = bio_tags_to_spans(predicate["tags"])
                # switch from inclusive span boundaries to exclusive ones
                tag_spans = [(tag, (start, end + 1))
                             for (tag, (start, end)) in tag_spans_inclusive]
                srl_spans.append(tag_spans)

            # (start, end) token indices of each detected verb and preannotated actions in the current sentence
            srl_verb_spans = [(start, end) for predicate_spans in srl_spans
                              for (tag, (start, end)) in predicate_spans
                              if tag == "V"]
            mention_action_spans = mentions_action_sent[[
                TOKEN_IDX_FROM, TOKEN_IDX_TO
            ]].values.tolist()

            # Map verbs returned from SRL to action mentions via sentence position: We have n pre-annotated action
            # mentions and m predicates found by SRL. We want to find the best 1:1 assignment from predicate to
            # mention. We approach this as a linear assignment problem.
            map_from_preannotated_action_to_srl_predicate = span_matching(
                mention_action_spans, srl_verb_spans)

            # for those where a mapping exists:
            for i_action, i_predicate in map_from_preannotated_action_to_srl_predicate.items():
                action = mentions_action_sent.iloc[i_action]
                action_mention_id = action.name[
                    mentions_action_sent.index.names.index(MENTION_ID)]
                tag_spans = srl_spans[i_predicate]

                # map time, location, participants to annotations
                event_component_rows = []

                def find_event_component_mapping(mentions_df: pd.DataFrame,
                                                 srl_target_tags: List[str],
                                                 coarse_mention_type: str):
                    # it can happen that there is no time/location/participant annotated in a sentence; otherwise,
                    # look up mentions in the sentence
                    if doc_id not in mentions_df.index or sent_idx not in mentions_df.loc[
                            doc_id, SENTENCE_IDX]:
                        return
                    mentions_within_doc = mentions_df.loc[doc_id]
                    mentions_within_sentence = mentions_within_doc.loc[
                        mentions_within_doc[SENTENCE_IDX] == sent_idx]
                    mention_spans_within_sentence = mentions_within_sentence[[
                        TOKEN_IDX_FROM, TOKEN_IDX_TO
                    ]].values.tolist()

                    _srl_spans = [(start, end)
                                  for (tag, (start, end)) in tag_spans
                                  if tag in srl_target_tags]
                    mapping = span_matching(mention_spans_within_sentence,
                                            _srl_spans)
                    for idx_mention, idx_srl in mapping.items():
                        # 'name' is the only remaining index column here, which is MENTION_ID
                        mapped_mention_id = mentions_within_sentence.iloc[
                            idx_mention].name
                        row = {
                            MENTION_TYPE_COARSE: coarse_mention_type,
                            COMPONENT_MENTION_ID: mapped_mention_id
                        }
                        event_component_rows.append(row)

                find_event_component_mapping(dataset.mentions_location,
                                             ["ARGM-DIR", "ARGM-LOC"],
                                             LOCATION)
                find_event_component_mapping(dataset.mentions_time,
                                             ["ARGM-TMP"], TIME)
                find_event_component_mapping(dataset.mentions_participants,
                                             ["ARG0", "ARG1"], PARTICIPANTS)

                # Collect it all in a dataframe:
                # For each action mention:
                # - index-y (not an actual index): doc-id, mention-id (this is the action mention id), sent_idx <<-- redundant
                # - columns: mention-type-coarse, component-mention-id (the mention associated with this action, and its type)
                if event_component_rows:
                    event_components = pd.DataFrame(event_component_rows)
                    event_components[DOCUMENT_ID] = doc_id
                    event_components[MENTION_ID] = action_mention_id
                    semantic_roles.append(event_components)

    if len(semantic_roles) == 0:
        raise ValueError(
            "No semantic roles found. Possible reasons: (1) The dataset already has semantic roles defined. "
            "(2) The pretrained SRL predictor likely does not match the allennlp version! Check the project README for details."
        )

    # merge identified event components of each sentence and mention into one dataframe
    semantic_roles = pd.concat(semantic_roles, sort=True)

    # concatenate with existing roles
    if dataset.semantic_roles is not None:
        semantic_roles = pd.concat([semantic_roles, dataset.semantic_roles],
                                   ignore_index=True)
    dataset.semantic_roles = semantic_roles
    return dataset
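# `span_matching` is a project helper; the comment above frames it as a linear assignment problem.
# Below is a hedged sketch of that idea using scipy's Hungarian-algorithm solver with token overlap
# as (negative) cost; the project's actual cost function and tie-breaking may differ:
from typing import Dict, List, Tuple
import numpy as np
from scipy.optimize import linear_sum_assignment

def span_matching_sketch(spans_a: List[Tuple[int, int]],
                         spans_b: List[Tuple[int, int]]) -> Dict[int, int]:
    if not spans_a or not spans_b:
        return {}
    overlap = np.zeros((len(spans_a), len(spans_b)))
    for i, (a_from, a_to) in enumerate(spans_a):
        for j, (b_from, b_to) in enumerate(spans_b):
            overlap[i, j] = max(0, min(a_to, b_to) - max(a_from, b_from))
    rows, cols = linear_sum_assignment(-overlap)  # maximize total overlap
    return {int(i): int(j) for i, j in zip(rows, cols) if overlap[i, j] > 0}

# one annotated action mention at tokens [2, 4), two SRL verbs: the second verb overlaps and wins
print(span_matching_sketch([(2, 4)], [(0, 1), (3, 4)]))  # {0: 1}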