Code Example #1
    def _process_dataset(self, dataset: Dataset,
                         live_objects: Dict) -> Dataset:

        transformer = SentenceTransformer(self._pretrained_model_name)

        index = {}
        mat_embeddings = []  # List[np.array]

        # detokenize each sentence and run it through the sentence BERT transformer, unless it's already cached
        for (doc_id, sent_idx), df in tqdm(
                dataset.tokens.groupby([DOCUMENT_ID, SENTENCE_IDX]),
                desc="Obtaining sentence BERT embeddings",
                mininterval=10):
            detok_sentence = self._detokenizer(df[TOKEN].values.tolist())

            if detok_sentence not in self._cache:
                embedded_sentence = transformer.encode([detok_sentence],
                                                       show_progress_bar=False,
                                                       batch_size=1)[0]
                self._cache[detok_sentence] = embedded_sentence
            else:
                embedded_sentence = self._cache[detok_sentence]

            index[(doc_id, sent_idx)] = len(index)
            mat_embeddings.append(embedded_sentence.astype(np.float16))

        mat_embeddings = np.vstack(mat_embeddings)

        # and we're done
        sentence_embeddings = (index, mat_embeddings)
        dataset.set(SENTENCE_EMBEDDINGS, sentence_embeddings)

        return dataset
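
The (index, matrix) pair stored under SENTENCE_EMBEDDINGS above is just a lookup table of row numbers over one stacked NumPy array. A self-contained toy sketch of that layout (the embedding size of 384 is only an illustrative assumption):

    import numpy as np

    # the dict maps (doc_id, sent_idx) to a row number of the stacked float16 matrix
    index = {("doc_a", 0): 0, ("doc_a", 1): 1, ("doc_b", 0): 2}
    mat_embeddings = np.random.rand(3, 384).astype(np.float16)

    # embedding of sentence 1 in document "doc_a"
    sentence_vector = mat_embeddings[index[("doc_a", 1)]]
    assert sentence_vector.shape == (384,)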
Code Example #2
    def _transform(self, dataset: Dataset, pairs: List[Tuple[Tuple, Tuple]],
                   unique_mentions: Set[Tuple]):
        # obtain embeddings
        assert dataset.has(SENTENCE_EMBEDDINGS)
        sentence_embeddings = dataset.get(
            SENTENCE_EMBEDDINGS
        )  # type: Tuple[Dict[Tuple[str, int], int], np.array]
        embedding_index, embedding_mat = sentence_embeddings

        mentions_action = dataset.mentions_action

        # compute a mean embedding in case we need to pad somewhere
        mean_embedding = embedding_mat.mean(axis=0)

        # precompute embedding matrices for each action mention
        precomputed_sentence = {}
        precomputed_doc_start = {}
        for mention_idx in unique_mentions:
            assert len(mention_idx) == 2
            doc_id, mention_id = mention_idx

            # look up sentence embedding of the sentence containing the action mention
            sent_idx_of_action = mentions_action.loc[mention_idx, SENTENCE_IDX]
            surrounding_sent_embedding = embedding_mat[embedding_index[(
                doc_id, sent_idx_of_action)]]

            # for the document start, take n sentences from the start of the document and concatenate their embeddings
            NUM_SENTENCES_DOC_START = 3
            doc_start_sent_embeddings = []
            for i in range(NUM_SENTENCES_DOC_START):
                # there might be documents shorter than NUM_SENTENCES_DOC_START, therefore check: if there are not
                # enough sentences, pad with the mean embedding
                if (doc_id, i) in embedding_index:
                    sent_embedding = embedding_mat[embedding_index[(doc_id,
                                                                    i)]]
                else:
                    sent_embedding = mean_embedding
                doc_start_sent_embeddings.append(sent_embedding)
            doc_start_embedding = np.hstack(doc_start_sent_embeddings)

            precomputed_sentence[mention_idx] = surrounding_sent_embedding
            precomputed_doc_start[mention_idx] = doc_start_embedding

        feature_columns = []
        for vectors, feature_desc in [(precomputed_sentence,
                                       SURROUNDING_SENTENCE),
                                      (precomputed_doc_start, DOC_START)]:
            feature_column = batch_cosine_similarity(
                pairs, vectors, desc=f"{self.name} {feature_desc}")
            feature_columns.append(feature_column)
        feature_matrix = np.hstack(feature_columns)
        return feature_matrix
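
batch_cosine_similarity is a project-internal helper that is not shown here. A minimal sketch of one plausible implementation, consistent with how it is called above and with the matrix-plus-pairs_transform call in Code Example #5 (the real helper may well vectorize this):

    import numpy as np
    from scipy.spatial.distance import cosine
    from tqdm import tqdm

    def batch_cosine_similarity(pairs, vectors, pairs_transform=None, desc=None):
        # sketch only: one cosine similarity per mention pair, returned as a column vector;
        # `vectors` may be a dict keyed by mention index or a matrix indexed via pairs_transform
        if pairs_transform is None:
            pairs_transform = lambda idx: idx
        similarities = []
        for a_idx, b_idx in tqdm(pairs, desc=desc, mininterval=10):
            vec_a = vectors[pairs_transform(a_idx)]
            vec_b = vectors[pairs_transform(b_idx)]
            similarities.append(1.0 - cosine(vec_a, vec_b))
        return np.array(similarities).reshape((-1, 1))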
Code Example #3
    def _process_dataset(self, dataset: Dataset,
                         live_objects: Dict) -> Dataset:
        dbpedia = live_objects[DBPEDIA]  # type: DbPedia

        locations = dataset.mentions_location
        assert locations is not None and DBPEDIA_URI in locations.columns, "Need to entity link locations to DBpedia first!"

        linked_locations = locations.loc[locations[DBPEDIA_URI].notna(),
                                         DBPEDIA_URI]

        # look up coordinates, then reindex to make indices match
        tqdm.pandas(desc="Look up locations on DBpedia")
        with_coordinates = linked_locations.progress_apply(
            lambda uri: self._look_up_coordinates(uri, dbpedia))
        with_coordinates_reindexed = with_coordinates.reindex(locations.index)

        # look up geographic hierarchy, then reindex to make indices match
        tqdm.pandas(desc="Look up geographic hierarchy on DBpedia")
        with_hierarchy = linked_locations.progress_apply(
            lambda uri: self._look_up_geographic_hierarchy(uri, dbpedia))
        with_hierarchy_reindexed = with_hierarchy.reindex(locations.index)

        dataset.mentions_location = pd.concat(
            [locations, with_coordinates_reindexed, with_hierarchy_reindexed],
            axis=1)
        return dataset
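
The DBpedia lookups above run only on the linked subset of location mentions; reindex then realigns the partial results with the full mention index, leaving NaN for rows without a DBpedia URI, so the concat along axis=1 lines up. A toy illustration of that pattern (column name and values are made up):

    import pandas as pd

    locations = pd.DataFrame({"dbpedia-uri": ["dbr:Berlin", None, "dbr:Paris"]},
                             index=["m1", "m2", "m3"])
    linked = locations.loc[locations["dbpedia-uri"].notna(), "dbpedia-uri"]
    looked_up = linked.apply(lambda uri: f"coords({uri})")  # stand-in for the SPARQL lookup
    aligned = looked_up.reindex(locations.index)            # "m2" becomes NaN again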
Code Example #4
    def _load_dataset(self) -> Dataset:
        # load full dataset
        documents, contents, mentions = gvc_reader_utils.load_gvc_dataset(
            self._gvc_root_dir / "GVC_gold.conll",
            doc_to_subtopic_file=self._gvc_root_dir / "gvc_doc_to_event.csv")

        # look up the events for this split and which documents belong to which event, then combine the two into the
        # documents which are part of this split
        split = pd.read_csv(self._gvc_split_csv,
                            index_col=0,
                            header=None,
                            names=[EVENT_ID])
        docs_of_split = documents.loc[documents.index.get_level_values(
            SUBTOPIC).isin(split[EVENT_ID].astype(str))].set_index(DOCUMENT_ID)

        # return only instances of this split
        documents = documents.loc[documents[DOCUMENT_ID].isin(
            docs_of_split.index)].sort_index()
        contents = contents.loc[docs_of_split.index].sort_index()
        mentions_action = mentions.loc[docs_of_split.index].sort_index()

        if self._drop_0_cluster:
            mentions_action = mentions_action.loc[mentions_action[EVENT] != 0]

        dataset = Dataset(documents, contents, mentions_action)
        return dataset
Code Example #5
    def _transform(self, dataset: Dataset, pairs: List[Tuple[Tuple, Tuple]],
                   unique_mentions: Set[Tuple]):
        # obtain embeddings
        assert dataset.has(ACTION_PHRASE_EMBEDDINGS)
        action_phrase_embeddings = dataset.get(
            ACTION_PHRASE_EMBEDDINGS
        )  # type: Tuple[Dict[Tuple[str, int], int], np.array]
        embedding_index, embedding_mat = action_phrase_embeddings

        pairs_transform = lambda idx: embedding_index[idx]

        feature_column = batch_cosine_similarity(
            pairs,
            embedding_mat,
            pairs_transform=pairs_transform,
            desc=self.name)
        feature_matrix = feature_column.reshape((-1, 1))
        return feature_matrix
Code Example #6
    def _process_dataset(self, dataset: Dataset,
                         live_objects: Dict) -> Dataset:
        docs = dataset.documents

        # select subset of topics
        if self._num_topics is not None:
            actual_num_topics = len(docs.index.unique(TOPIC_ID))
            if self._num_topics > actual_num_topics:
                raise ValueError(
                    f"This dataset only has {actual_num_topics} topics, but you asked for a subset of {self._num_topics} topics."
                )

            topics_to_use = docs.index.unique(TOPIC_ID).to_series().sample(
                self._num_topics, random_state=0).values
            selected_docs = docs.loc[docs.index.get_level_values(
                TOPIC_ID).isin(topics_to_use)]
        else:
            selected_docs = docs

        # select subset of documents per topic
        if self._num_docs_per_topic is not None:
            selected_docs = selected_docs.groupby(
                TOPIC_ID, as_index=False).apply(lambda df: df.sample(
                    min(len(df), self._num_docs_per_topic), random_state=0))
            selected_docs.index = selected_docs.index.droplevel(0)
        selected_docs.sort_index(inplace=True)

        self.logger.warning(
            f"Number of documents limited to {len(selected_docs)}!")
        dataset.documents = selected_docs
        selected_doc_ids = dataset.documents[DOCUMENT_ID]

        dataset.tokens = dataset.tokens.loc[
            dataset.tokens.index.get_level_values(DOCUMENT_ID).isin(
                selected_doc_ids)]
        dataset.mentions_action = dataset.mentions_action.loc[
            dataset.mentions_action.index.get_level_values(DOCUMENT_ID).isin(
                selected_doc_ids)]

        if dataset.mentions_time is not None:
            dataset.mentions_time = dataset.mentions_time.loc[
                dataset.mentions_time.index.get_level_values(DOCUMENT_ID).isin(
                    selected_doc_ids)]

        if dataset.mentions_location is not None:
            dataset.mentions_location = dataset.mentions_location.loc[
                dataset.mentions_location.index.get_level_values(
                    DOCUMENT_ID).isin(selected_doc_ids)]

        if dataset.mentions_participants is not None:
            dataset.mentions_participants = dataset.mentions_participants.loc[
                dataset.mentions_participants.index.get_level_values(
                    DOCUMENT_ID).isin(selected_doc_ids)]

        if dataset.mentions_other is not None:
            dataset.mentions_other = dataset.mentions_other.loc[
                dataset.mentions_other.index.get_level_values(
                    DOCUMENT_ID).isin(selected_doc_ids)]
        return dataset
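
The per-topic subsampling above relies on a pandas detail: a mutating groupby(...).apply(...) prepends a group level to the result index, which is why droplevel(0) is called right afterwards. A toy illustration with made-up index names:

    import pandas as pd

    docs = pd.DataFrame(
        {"title": ["a", "b", "c", "d", "e"]},
        index=pd.MultiIndex.from_tuples(
            [("t1", "d1"), ("t1", "d2"), ("t1", "d3"), ("t2", "d4"), ("t2", "d5")],
            names=["topic-id", "doc-id"]))
    sampled = docs.groupby("topic-id", as_index=False).apply(
        lambda df: df.sample(min(len(df), 2), random_state=0))
    sampled.index = sampled.index.droplevel(0)  # drop the group level added by apply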
Code Example #7
    def _load_dataset(self) -> Dataset:
        self.logger.info("Reading raw data")
        # load full dataset
        tuples = football_reader_utils.read_split_data(
            self._sentence_level_data_dir,
            token_level_data_dir=self._token_level_data_dir)

        assert len(tuples) == 2, "Token-level extensions are mandatory"
        sentence_level_data, token_level_data = tuples

        # create Dataset object from sentence-level annotated data
        documents, tokens, _, _, _ = sentence_level_data
        mentions_action, mentions_participants, mentions_time, mentions_location, semantic_roles = token_level_data

        if self._drop_other_event_cluster:
            mentions_action = mentions_action.loc[
                mentions_action[EVENT] != "other_event"]

        # We may now have some documents which do not contain any mentions. Remove those to avoid trouble in the
        # model code later on.
        documents_without_mentions = set(
            documents[DOCUMENT_ID].unique()) - set(
                mentions_action.index.get_level_values(DOCUMENT_ID).unique())
        documents = documents.loc[~documents[DOCUMENT_ID].
                                  isin(documents_without_mentions)]
        tokens = tokens.loc[documents[DOCUMENT_ID]].sort_index()
        mentions_participants = mentions_participants.loc[
            ~mentions_participants.index.get_level_values(DOCUMENT_ID).
            isin(documents_without_mentions)]
        mentions_location = mentions_location.loc[
            ~mentions_location.index.get_level_values(DOCUMENT_ID).
            isin(documents_without_mentions)]
        mentions_time = mentions_time.loc[~mentions_time.index.
                                          get_level_values(DOCUMENT_ID).
                                          isin(documents_without_mentions)]
        semantic_roles = semantic_roles.loc[~semantic_roles[DOCUMENT_ID].
                                            isin(documents_without_mentions)]

        documents.sort_index(inplace=True)
        tokens.sort_index(inplace=True)
        mentions_action.sort_index(inplace=True)
        mentions_participants.sort_index(inplace=True)
        mentions_time.sort_index(inplace=True)
        mentions_location.sort_index(inplace=True)

        dataset = Dataset(documents,
                          tokens,
                          mentions_action,
                          mentions_time=mentions_time,
                          mentions_location=mentions_location,
                          mentions_participants=mentions_participants,
                          semantic_roles=semantic_roles)

        return dataset
Code Example #8
    def _load_dataset(self) -> Dataset:
        self.logger.info("Reading raw data")
        # load full dataset
        tuples = football_reader_utils.read_split_data(
            self._sentence_level_data_dir)

        # create Dataset object from sentence-level annotated data
        documents, tokens, mentions_action, _, _ = tuples[0]

        # add token indices for action mentions, so that the format of this dataframe matches that of the other corpora
        max_token_index_in_sentence = tokens.index.to_frame(
            index=False).groupby([DOCUMENT_ID, SENTENCE_IDX])[TOKEN_IDX].max()
        mentions_action_with_max_token = mentions_action.reset_index().merge(
            max_token_index_in_sentence,
            on=[DOCUMENT_ID,
                SENTENCE_IDX]).rename(columns={TOKEN_IDX: TOKEN_IDX_TO})
        mentions_action_with_max_token[
            TOKEN_IDX_TO] += 1  # remember, we use exclusive span boundaries
        mentions_action_with_max_token[TOKEN_IDX_FROM] = 0
        mentions_action = mentions_action_with_max_token.set_index(
            [DOCUMENT_ID, MENTION_ID])

        # We may now have some documents which do not contain any mentions. Remove those to avoid trouble in the
        # model code later on.
        documents_without_mentions = set(
            documents[DOCUMENT_ID].unique()) - set(
                mentions_action.index.get_level_values(DOCUMENT_ID).unique())
        documents = documents.loc[~documents[DOCUMENT_ID].
                                  isin(documents_without_mentions)]
        tokens = tokens.loc[documents[DOCUMENT_ID]].sort_index()

        documents.sort_index(inplace=True)
        tokens.sort_index(inplace=True)
        mentions_action.sort_index(inplace=True)

        dataset = Dataset(documents, tokens, mentions_action)

        return dataset
Code Example #9
    def _load_dataset(self) -> Dataset:
        self.logger.info("Reading raw data")
        documents, tokens, mentions, entities_events = ecb_reader_utils.read_split_data(
            self._path_to_data_split, self._sentence_filter_csv)

        # remove invalid cross-sentence mentions - there is for example one in 36_4ecbplus
        mentions_valid = mentions.loc[
            mentions[TOKEN_IDX_FROM] < mentions[TOKEN_IDX_TO]]
        if len(mentions_valid) < len(mentions):
            self.logger.warning(
                f"Removed {len(mentions) - len(mentions_valid)} invalid mention(s) present in the gold data."
            )
        mentions = mentions_valid

        # in 41_4ecb there is a participant mention with type "HUMAN_PART" which should be "HUMAN_PART_GPE"
        mentions[MENTION_TYPE] = mentions[MENTION_TYPE].replace(
            {"HUMAN_PART": HUMAN_PART_GPE})

        if self._topics_to_load is not None:
            # perform topic selection
            topics_to_load = {str(v) for v in self._topics_to_load}
            topics_in_split = set(
                documents.index.get_level_values(TOPIC_ID).unique())

            topics_not_present = topics_to_load - topics_in_split
            if topics_not_present:
                self.logger.warning(
                    f"Cannot load these topics because they are not part of the split: {', '.join(sorted(topics_not_present))}"
                )
            topics_to_load = list(topics_in_split & topics_to_load)
            if not topics_to_load:
                raise ValueError("At least one topic has to be selected")
            self.logger.info(
                f"Using topic(s) {', '.join(sorted(topics_to_load))}")

            # subselect
            documents = documents.loc[list(topics_to_load)].sort_index()
            tokens = tokens.loc[documents[DOCUMENT_ID]].sort_index()
            mentions = mentions.loc[documents[DOCUMENT_ID]].sort_index()

        # obtain action mentions
        mentions_action = mentions.loc[mentions[MENTION_TYPE].isin(
            MENTION_TYPES_ACTION)].copy()

        # remove documents which contain no action mentions
        documents_without_mentions = set(
            documents[DOCUMENT_ID].unique()) - set(
                mentions_action.index.get_level_values(DOCUMENT_ID).unique())
        if documents_without_mentions:
            self.logger.info(
                f"The following documents contain no action mentions and were removed: {', '.join(sorted(documents_without_mentions))}"
            )
        documents = documents.loc[~documents[DOCUMENT_ID].
                                  isin(documents_without_mentions)]
        tokens = tokens.loc[documents[DOCUMENT_ID]].sort_index()
        mentions = mentions.loc[~mentions.index.get_level_values(DOCUMENT_ID).
                                isin(documents_without_mentions)]

        # now divide the remainder of mentions
        mentions_time = mentions.loc[mentions[MENTION_TYPE].isin(
            MENTION_TYPES_TIME)].rename(columns={EVENT: ENTITY})
        mentions_location = mentions.loc[mentions[MENTION_TYPE].isin(
            MENTION_TYPES_LOCATION)].rename(columns={EVENT: ENTITY})
        mentions_participants = mentions.loc[mentions[MENTION_TYPE].isin(
            MENTION_TYPES_PARTICIPANTS)].rename(columns={EVENT: ENTITY})
        assert len(mentions) == sum([
            len(df) for df in [
                mentions_action, mentions_time, mentions_location,
                mentions_participants
            ]
        ])

        dataset = Dataset(documents,
                          tokens,
                          mentions_action,
                          mentions_time=mentions_time,
                          mentions_location=mentions_location,
                          mentions_participants=mentions_participants)
        return dataset
Code Example #10
    def _process_dataset(self, dataset: Dataset,
                         live_objects: Dict) -> Dataset:
        WIKIDATA_NAMESPACE = "http://www.wikidata.org/entity/"

        # determine for which QIDs we need to look up embeddings
        set_of_wikidata_qids = set()
        for df in [
                dataset.mentions_action, dataset.mentions_time,
                dataset.mentions_location, dataset.mentions_participants,
                dataset.mentions_other
        ]:
            if df is None:
                continue
            assert WIKIDATA_QID in df.columns, "Need to entity link against Wikidata first!"
            set_of_wikidata_qids |= set(
                df[WIKIDATA_QID].loc[df[WIKIDATA_QID].notna()].unique())

        wikidata_iris = {
            f"<{WIKIDATA_NAMESPACE}{qid}>": qid
            for qid in set_of_wikidata_qids
        }

        # load the relevant embedding vectors: use mmap_mode="r" to not load gigabytes of stuff into RAM
        mat_embedding = np.load(self._embedding_matrix_file, mmap_mode="r")
        num_terms = mat_embedding.shape[0]

        # Check the JSON index to find the indices of these QIDs in the pretrained embedding matrix. Use ijson to parse
        # the file incrementally, avoiding to load 3GB of JSON into RAM.
        qid_to_mat_embedding_index = {}
        qid_to_mat_embedding_subset_index = {}
        with self._json_index_file.open("rb") as f:
            for i, term in tqdm(enumerate(ijson.items(f, "item")),
                                desc="Looking up QIDs in embedding index",
                                mininterval=10,
                                total=num_terms,
                                unit="terms"):
                try:
                    unicode_term = term.encode().decode(
                        "unicode_escape").strip()
                except UnicodeDecodeError as e:
                    self.logger.warning(e)
                    continue
                if unicode_term in wikidata_iris:
                    qid = wikidata_iris.pop(unicode_term)
                    qid_to_mat_embedding_index[qid] = i
                    qid_to_mat_embedding_subset_index[qid] = len(
                        qid_to_mat_embedding_subset_index)

                    # bail early if done
                    if not wikidata_iris:
                        self.logger.info("All QIDs found!")
                        break
        if wikidata_iris:
            self.logger.warning(
                f"The following {len(wikidata_iris)} Wikidata entities were not found in the pretrained embedding index:\n"
                + pformat(wikidata_iris))

        # look up relevant embeddings
        mat_embedding_subset = mat_embedding[list(
            qid_to_mat_embedding_index.values())]

        # and we're done
        wikidata_embeddings = (qid_to_mat_embedding_subset_index,
                               mat_embedding_subset)
        dataset.set(WIKIDATA_EMBEDDINGS, wikidata_embeddings)

        return dataset
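
The lookup loop assumes the JSON index file is one big top-level JSON array of vocabulary terms; ijson.items(f, "item") then streams its elements one at a time instead of loading the whole file into RAM. A self-contained toy of that streaming pattern:

    import io
    import ijson

    fake_index = io.BytesIO(
        b'["<http://www.wikidata.org/entity/Q64>", "<http://www.wikidata.org/entity/Q90>"]')
    for i, term in enumerate(ijson.items(fake_index, "item")):
        print(i, term)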
Code Example #11
    def _process_dataset(self, dataset: Dataset,
                         live_objects: Dict) -> Dataset:
        errors = False
        mentions_el = []

        time_of_last_query = 0
        for doc_id, df in tqdm(
                dataset.tokens.groupby(DOCUMENT_ID),
                desc=f"EL with {self._entity_linker_name} on documents",
                mininterval=10):
            doc_conjoined = "".join(df[TOKEN].values)
            doc_detokenized = self._detokenizer(df[TOKEN].values.tolist())

            # obtain response from entity linker: from cache if possible, otherwise create it fresh
            if doc_detokenized not in self._entity_linker_cache:
                now = time.time()
                try:
                    # apply rate limiting: make sure at least self._wait_between_requests_seconds seconds are between each request
                    time_to_sleep = max(
                        0,
                        self._get_waiting_time_between_requests_seconds(
                            live_objects) - (now - time_of_last_query))
                    time.sleep(time_to_sleep)
                    response = self._query_entity_linker(
                        doc_detokenized, live_objects)
                except (ValueError, HTTPError) as e:
                    self.logger.error(f"Entity linking error for {doc_id}", e)
                    errors = True
                    continue
                finally:
                    time_of_last_query = now

                self._entity_linker_cache[doc_detokenized] = response
            else:
                response = self._entity_linker_cache[doc_detokenized]

            if response is None:
                self.logger.info(f"No entities found for {doc_id}.")
                continue
            response_df = self._convert_el_response_to_dataframe(
                response, live_objects)

            # we first need to map the detokenized character offsets into our tokenized character offsets
            get_alignment = get_monotonous_character_alignment_func(
                doc_conjoined, doc_detokenized)
            response_df[CHARS_START] = response_df[CHARS_START].map(
                get_alignment)
            # we need to work around exclusive span boundaries here
            response_df[CHARS_END] = (response_df[CHARS_END] -
                                      1).map(get_alignment) + 1

            # now, we need to move from character offsets to tokens:
            # start offsets: the first token is associated with character 0, the second token with len(token[0]) and so on
            token_start_offsets = df[TOKEN].str.len().cumsum().shift(
                1, fill_value=0)
            response_df = response_df.merge(token_start_offsets.reset_index(),
                                            left_on=CHARS_START,
                                            right_on=TOKEN)
            response_df = response_df.drop(
                columns=[CHARS_START, TOKEN, SENTENCE_IDX, DOCUMENT_ID
                         ]).rename(columns={TOKEN_IDX: TOKEN_IDX_FROM})
            # end offsets: We work with exclusive boundaries. If a mention lies at the end of a sentence, then its
            # TOKEN_IDX_TO needs to be +1 the index of the last token in the sentence (basically going out of bounds).
            token_end_offsets = df[TOKEN].str.len().cumsum()
            response_df = response_df.merge(token_end_offsets.reset_index(),
                                            left_on=CHARS_END,
                                            right_on=TOKEN)
            response_df = response_df.drop(columns=[CHARS_END, TOKEN]).rename(
                columns={TOKEN_IDX: TOKEN_IDX_TO})
            response_df[TOKEN_IDX_TO] = response_df[
                TOKEN_IDX_TO] + 1  # here we +1 the token index for correct exclusive boundaries

            # final dataframe format:
            #   - index: doc_id, mention_id
            #   - values: all the things we want to keep: support, types, similarityScore, percentageOfSecondRank, dbpedia-uri
            mentions_el_in_doc = response_df.reset_index().rename(
                columns={
                    "index": MENTION_ID
                }).set_index([DOCUMENT_ID, MENTION_ID])
            mentions_el.append(mentions_el_in_doc)
        if errors:
            raise ValueError(
                "Stopping because there were errors in the process.")

        mentions_el = pd.concat(mentions_el)

        # remove invalid spans, if any exist. TODO: fix the actual problem which is causing them
        mentions_el_valid = mentions_el.loc[
            mentions_el[TOKEN_IDX_FROM] < mentions_el[TOKEN_IDX_TO]]
        if len(mentions_el_valid) < len(mentions_el):
            self.logger.warning(
                f"Removed {len(mentions_el) - len(mentions_el_valid)} invalid mention(s) after DBpedia entity linking"
            )
        mentions_el = mentions_el_valid

        if self.mode not in (MODE_INTERSECT, MODE_EXTEND):
            raise ValueError(f"Unknown mode: {self.mode}")

        # set coarse entity type for each predicted entity mention
        coarse_type_to_dbpedia_type = {
            ACTION: "DBpedia:Event",
            PARTICIPANTS: "DBpedia:Agent",
            LOCATION: "DBpedia:Place",
            TIME: "DBpedia:TimePeriod"
        }
        for coarse_type, dbo_type in coarse_type_to_dbpedia_type.items():
            mentions_el.loc[mentions_el["types"].str.contains(dbo_type),
                            MENTION_TYPE_COARSE] = coarse_type
        mentions_el[MENTION_TYPE_COARSE] = mentions_el[
            MENTION_TYPE_COARSE].fillna(OTHER)

        # Enrich all gold mentions with new info from entity linking
        coarse_type_to_dataset_attr = {
            ACTION: "mentions_action",
            PARTICIPANTS: "mentions_participants",
            LOCATION: "mentions_location",
            TIME: "mentions_time",
            OTHER: "mentions_other"
        }
        for coarse_type, attr in sorted(coarse_type_to_dataset_attr.items()):
            mentions_el_of_coarse_type = mentions_el.loc[
                mentions_el[MENTION_TYPE_COARSE] == coarse_type].drop(
                    columns=MENTION_TYPE_COARSE)

            # If the dataset did not contain any mentions of this type, simply assign all predicted mentions. Otherwise
            # left-join all the new columns produced by the entity linking to the gold annotations. We make sure only
            # to join entities which match the type of the gold annotations. Otherwise "The Real Housewives of Beverly
            # Hills" will be joined to "in Beverly Hills", which causes more trouble than necessary.
            dataset_mentions = getattr(dataset, attr, None)
            if dataset_mentions is None:
                new_dataset_mentions = mentions_el_of_coarse_type
            else:
                columns_keep_gold = dataset_mentions.columns
                columns_keep_system = mentions_el.columns.drop([
                    TOKEN_IDX_FROM, TOKEN_IDX_TO, SENTENCE_IDX, MENTION_TEXT,
                    MENTION_TYPE_COARSE
                ])
                new_dataset_mentions = left_join_predictions(
                    dataset_mentions, mentions_el_of_coarse_type,
                    columns_keep_gold, columns_keep_system)
            setattr(dataset, attr, new_dataset_mentions)

        if self.mode == MODE_INTERSECT:
            self.logger.info(
                "Intersected new annotations with dataset from previous pipeline stages."
            )
        elif self.mode == MODE_EXTEND:
            self.logger.info(
                "Extending dataset entities with those found during entity linking..."
            )

            # add all non-overlapping mentions found via entity linking to the dataset
            mentions_el_to_add = outer_join_predictions(mentions_el,
                                                        dataset).copy()

            for coarse_type, attr in coarse_type_to_dataset_attr.items():
                # skipping the extension for actions is of crucial importance here, otherwise we would be adding
                # additional event mentions to the dataset!
                if coarse_type == ACTION:
                    continue
                mentions_el_to_add_of_coarse_type = mentions_el_to_add.loc[
                    mentions_el_to_add[MENTION_TYPE_COARSE] ==
                    coarse_type].drop(columns=MENTION_TYPE_COARSE)

                dataset_mentions = getattr(dataset, attr, None)
                assert dataset_mentions is not None  # this cannot happen since we must have assigned something in the similar loop above

                new_dataset_mentions = pd.concat(
                    [dataset_mentions,
                     mentions_el_to_add_of_coarse_type]).sort_index()
                setattr(dataset, attr, new_dataset_mentions)

        # assert that there are no "backwards spans", this has caused issues way too many times...
        for attr in coarse_type_to_dataset_attr.values():
            mentions_df = getattr(dataset, attr)
            assert mentions_df.loc[
                mentions_df[TOKEN_IDX_FROM] >= mentions_df[TOKEN_IDX_TO]].empty

        # make sure to add the mention text to each mention
        def get_mention_text_from_mention(row: pd.Series) -> str:
            return " ".join(dataset.tokens.loc[(
                row.name[0], row[SENTENCE_IDX],
                slice(row[TOKEN_IDX_FROM], row[TOKEN_IDX_TO] - 1)),
                                               TOKEN].values)

        dataset.mentions_action[MENTION_TEXT] = dataset.mentions_action.apply(
            get_mention_text_from_mention, axis=1)
        dataset.mentions_participants[
            MENTION_TEXT] = dataset.mentions_participants.apply(
                get_mention_text_from_mention, axis=1)
        dataset.mentions_time[MENTION_TEXT] = dataset.mentions_time.apply(
            get_mention_text_from_mention, axis=1)
        dataset.mentions_location[
            MENTION_TEXT] = dataset.mentions_location.apply(
                get_mention_text_from_mention, axis=1)
        if dataset.mentions_other is not None:
            dataset.mentions_other[
                MENTION_TEXT] = dataset.mentions_other.apply(
                    get_mention_text_from_mention, axis=1)

        return dataset
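
The switch from character offsets back to token indices hinges on cumulative token lengths over the conjoined (whitespace-free) document string; merging the entity linker's character offsets against these cumulative sums recovers TOKEN_IDX_FROM and TOKEN_IDX_TO. A toy illustration of the offset bookkeeping:

    import pandas as pd

    tokens = pd.Series(["Berlin", "is", "big", "."])
    doc_conjoined = "".join(tokens)                                    # "Berlinisbig."
    start_offsets = tokens.str.len().cumsum().shift(1, fill_value=0)   # 0, 6, 8, 11
    end_offsets = tokens.str.len().cumsum()                            # 6, 8, 11, 12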
Code Example #12
    def _transform(self, dataset: Dataset, pairs: List[Tuple[Tuple, Tuple]], unique_mentions: Set[Tuple]):
        # obtain embeddings
        assert dataset.has(WIKIDATA_EMBEDDINGS)
        wikidata_embeddings = dataset.get(WIKIDATA_EMBEDDINGS)  # type: Tuple[Dict[str, int], np.array]
        embedding_index, embedding_mat = wikidata_embeddings

        # create one large dataframe of all named entities which are entity linked to Wikidata
        linked_event_components = []
        for mention_type_coarse, df in {ACTION: dataset.mentions_action,
                                        PARTICIPANTS: dataset.mentions_participants,
                                        TIME: dataset.mentions_time,
                                        LOCATION: dataset.mentions_location,
                                        OTHER: dataset.mentions_other}.items():
            if df is None:
                continue

            # keep only entities/mentions which are linked to Wikidata
            linked_subset = df.loc[df[WIKIDATA_QID].notna()]
            # drop those linked mentions for which we don't have an embedding
            with_embedding = linked_subset.loc[linked_subset[WIKIDATA_QID].isin(embedding_index.keys())]
            # keep only relevant columns
            only_relevant_columns = with_embedding.reindex(columns=[MENTION_TEXT, SENTENCE_IDX, WIKIDATA_QID])

            only_relevant_columns[MENTION_TYPE_COARSE] = mention_type_coarse

            linked_event_components.append(only_relevant_columns)
        linked_event_components = pd.concat(linked_event_components).set_index(MENTION_TYPE_COARSE, append=True)
        assert linked_event_components.index.is_unique

        # convert QID into index of the corresponding embedding in `embedding_mat`
        linked_event_components[WIKIDATA_QID] = linked_event_components[WIKIDATA_QID].map(embedding_index)
        assert linked_event_components[WIKIDATA_QID].notna().all() and not linked_event_components[WIKIDATA_QID].astype(
            str).str.startswith("Q").any()

        linked_event_components = linked_event_components.reset_index()
        mentions_action = dataset.mentions_action
        sr = dataset.semantic_roles

        # precompute embedding matrices for each action mention
        precomputed = {}
        for mention_idx in unique_mentions:
            assert len(mention_idx) == 2
            doc_id, mention_id = mention_idx

            linked_in_doc = linked_event_components.loc[linked_event_components[DOCUMENT_ID] == doc_id]

            # look up embedding for the action mention itself (only rarely entity linked)
            linked_action_mention = linked_in_doc.loc[
                (linked_in_doc[MENTION_TYPE_COARSE] == ACTION) & (linked_in_doc[MENTION_ID] == mention_id)]
            if not linked_action_mention.empty:
                action_mention_embedding = embedding_mat[linked_action_mention[WIKIDATA_QID].values]
            else:
                action_mention_embedding = None

            # if available, create matrix of embeddings from all entity linked SRL arguments
            srl_args_of_mention = sr.loc[(sr[DOCUMENT_ID] == doc_id) & (sr[MENTION_ID] == mention_id)]
            if not srl_args_of_mention.empty:
                linked_srl_args_for_mention = srl_args_of_mention.merge(linked_in_doc,
                                                                        left_on=[COMPONENT_MENTION_ID,
                                                                                 MENTION_TYPE_COARSE],
                                                                        right_on=[MENTION_ID,
                                                                                  MENTION_TYPE_COARSE]).drop_duplicates(WIKIDATA_QID)
                linked_srl_embeddings = embedding_mat[linked_srl_args_for_mention[WIKIDATA_QID].values]
            else:
                linked_srl_embeddings = None

            # create matrix of embeddings from all linked entities in the same sentence as the action mention
            sent_idx_of_action = mentions_action.loc[mention_idx, SENTENCE_IDX]

            linked_in_surrounding_sent = linked_in_doc.loc[
                linked_in_doc[SENTENCE_IDX] == sent_idx_of_action].drop_duplicates(WIKIDATA_QID)
            if not linked_in_surrounding_sent.empty:
                surrounding_sent_embeddings = embedding_mat[linked_in_surrounding_sent[WIKIDATA_QID].values]
            else:
                surrounding_sent_embeddings = None

            # create matrix of embeddings from all linked entities in the context of the action mention
            NUM_SENTENCES_CONTEXT = 2
            sent_idx_from = sent_idx_of_action - NUM_SENTENCES_CONTEXT
            sent_idx_to = sent_idx_of_action + NUM_SENTENCES_CONTEXT

            linked_in_context = linked_in_doc.loc[(linked_in_doc[SENTENCE_IDX] >= sent_idx_from) & (
                    linked_in_doc[SENTENCE_IDX] <= sent_idx_to)].drop_duplicates(WIKIDATA_QID)
            if not linked_in_context.empty:
                context_embeddings = embedding_mat[linked_in_context[WIKIDATA_QID].values]
            else:
                context_embeddings = None

            # create matrix of embeddings from linked entities at the document start
            NUM_SENTENCES_DOC_START = 3
            linked_at_doc_start = linked_in_doc.loc[
                (linked_in_doc[SENTENCE_IDX] < NUM_SENTENCES_DOC_START)].drop_duplicates(WIKIDATA_QID)
            if not linked_at_doc_start.empty:
                doc_start_embeddings = embedding_mat[linked_at_doc_start[WIKIDATA_QID].values]
            else:
                doc_start_embeddings = None

            precomputed[mention_idx] = {ACTION_MENTION: action_mention_embedding,
                                        SEMANTIC_ROLE_ARGS: linked_srl_embeddings,
                                        SURROUNDING_SENTENCE: surrounding_sent_embeddings,
                                        SENTENCE_CONTEXT: context_embeddings,
                                        DOC_START: doc_start_embeddings}

        # using the precomputed action mention representations, compute pairwise features
        list_of_instance_features = []
        for pair in pairs:
            a_idx, b_idx = pair

            instance_features = []

            # compute distance between action mention embeddings
            a_action_mention_mat = precomputed[a_idx][ACTION_MENTION]
            b_action_mention_mat = precomputed[b_idx][ACTION_MENTION]
            if a_action_mention_mat is None or b_action_mention_mat is None:
                instance_features.append(None)
            else:
                instance_features.append(cosine(a_action_mention_mat, b_action_mention_mat))

            # the order is important here, it has to match the names in __init__!
            for key in FEATURES_IN_ORDER:
                a_mat = precomputed[a_idx][key]
                b_mat = precomputed[b_idx][key]
                features_of_key = compute_pairwise_embedding_distance_features(a_mat, b_mat)
                instance_features += features_of_key

            instance_features = np.array(instance_features, dtype=self.dtype)
            list_of_instance_features.append(instance_features)

        feature_matrix = np.vstack(list_of_instance_features)
        return feature_matrix
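
compute_pairwise_embedding_distance_features is another project-internal helper; its exact feature set is not shown and has to match the feature names declared in __init__ (see the comment above). A minimal sketch under the assumption that it reduces all pairwise cosine distances between two embedding sets to a few summary statistics:

    import numpy as np
    from scipy.spatial.distance import cdist

    def compute_pairwise_embedding_distance_features(a_mat, b_mat):
        # sketch only: NaN-pad when one side has no linked entities, otherwise summarize
        # the full pairwise cosine distance matrix between the two embedding sets
        if a_mat is None or b_mat is None:
            return [np.nan, np.nan, np.nan]
        distances = cdist(a_mat, b_mat, metric="cosine")
        return [distances.min(), distances.mean(), distances.max()]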
Code Example #13
    def _process_dataset(self, dataset: Dataset,
                         live_objects: Dict) -> Dataset:
        # masking an event component entails replacing all mention tokens with a random dummy token, followed by
        # nulling all additional preprocessing columns to keep features from working off of those
        if "action" in self._mask_what:
            mentions_action = dataset.mentions_action
            tokens = self._mask_tokens(dataset.tokens, mentions_action)
            mentions_action = self._fill_columns_with_na(
                mentions_action, [DBPEDIA_URI, WIKIDATA_QID])

            dataset.mentions_action = mentions_action
            dataset.tokens = tokens

        if "participants" in self._mask_what:
            mentions_participants = dataset.mentions_participants
            semantic_roles = dataset.semantic_roles
            tokens = self._mask_tokens(dataset.tokens, mentions_participants)

            # remove all participant mentions and corresponding SRL entries
            mentions_participants = mentions_participants.iloc[0:0]
            semantic_roles = semantic_roles.loc[
                semantic_roles[MENTION_TYPE_COARSE] != PARTICIPANTS]

            dataset.mentions_participants = mentions_participants
            dataset.semantic_roles = semantic_roles
            dataset.tokens = tokens

        if "location" in self._mask_what:
            mentions_location = dataset.mentions_location
            semantic_roles = dataset.semantic_roles
            tokens = self._mask_tokens(dataset.tokens, mentions_location)

            # remove all location mentions and corresponding SRL entries
            mentions_location = mentions_location.iloc[0:0]
            semantic_roles = semantic_roles.loc[
                semantic_roles[MENTION_TYPE_COARSE] != LOCATION]

            dataset.mentions_location = mentions_location
            dataset.semantic_roles = semantic_roles
            dataset.tokens = tokens

        if "time" in self._mask_what:
            mentions_time = dataset.mentions_time
            semantic_roles = dataset.semantic_roles
            tokens = self._mask_tokens(dataset.tokens, mentions_time)

            # remove all temporal mentions and corresponding SRL entries
            mentions_time = mentions_time.iloc[0:0]
            semantic_roles = semantic_roles.loc[
                semantic_roles[MENTION_TYPE_COARSE] != TIME]

            dataset.mentions_time = mentions_time
            dataset.semantic_roles = semantic_roles
            dataset.tokens = tokens

        if "publish_date" in self._mask_what:
            documents = dataset.documents
            if PUBLISH_DATE in documents.columns:
                documents.drop(columns=PUBLISH_DATE, inplace=True)
            dataset.documents = documents

        return dataset
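
The _mask_tokens helper is not shown. A minimal sketch of the masking step, assuming the frame layout used throughout these examples (tokens indexed by document, sentence and token index; mention spans with exclusive upper bounds); the fixed "[MASK]" placeholder stands in for the random dummy token mentioned in the comment at the top of this method:

    import pandas as pd

    def _mask_tokens(self, tokens: pd.DataFrame,
                     mentions: pd.DataFrame) -> pd.DataFrame:
        tokens = tokens.copy()
        for (doc_id, _), mention in mentions.iterrows():
            # pandas .loc slices are inclusive, hence the -1 on the exclusive upper bound
            token_slice = slice(mention[TOKEN_IDX_FROM], mention[TOKEN_IDX_TO] - 1)
            tokens.loc[(doc_id, mention[SENTENCE_IDX], token_slice), TOKEN] = "[MASK]"
        return tokens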
Code Example #14
    def _process_dataset(self,
                         dataset: Dataset,
                         live_objects: Dict) -> Dataset:
        semantic_roles = []

        # determine sentences with action mentions
        for doc_id, mentions_action_doc in tqdm(dataset.mentions_action.groupby(DOCUMENT_ID),
                                                desc="SRL on documents",
                                                mininterval=10):
            # skip documents for which we already have semantic roles
            if dataset.semantic_roles is not None and doc_id in dataset.semantic_roles[DOCUMENT_ID].values:
                continue

            for sent_idx, mentions_action_sent in mentions_action_doc.groupby(SENTENCE_IDX):
                # run SRL:
                # AllenNLP SRL models accept a tokenized sentence plus one verbal predicate and return argument class
                # probabilities per token, which can be converted to BIO tags via Viterbi decoding. Notably, there is
                # no way to feed in pre-recognized argument spans, so the spans recognized by the model need to be
                # reconciled with ours manually. Also, only _verbal_ predicates are supported.
                tokenized_sentence = dataset.tokens.loc[(doc_id, sent_idx), TOKEN].values.tolist()

                # predict SRL or obtain from cache (use a tuple as key so the token sequence is hashable)
                cache_key = tuple(tokenized_sentence)
                if cache_key not in self._cache:
                    srl_prediction = self._srl_predictor.predict_tokenized(tokenized_sentence)
                    self._cache[cache_key] = srl_prediction
                else:
                    srl_prediction = self._cache[cache_key]

                # srl_spans: for each verbal predicate in the sentence, a list of tags and their span
                srl_spans = []  # type: List[List[Tuple[str, Tuple[int, int]]]]
                for predicate in srl_prediction["verbs"]:
                    tag_spans_inclusive = bio_tags_to_spans(predicate["tags"])
                    # switch from inclusive span boundaries to exclusive ones
                    tag_spans = [(tag, (start, end + 1)) for (tag, (start, end)) in tag_spans_inclusive]
                    srl_spans.append(tag_spans)

                # (start, end) token indices of each detected verb and preannotated actions in the current sentence
                srl_verb_spans = [(start, end) for predicate_spans in srl_spans for (tag, (start, end)) in
                                  predicate_spans if tag == "V"]
                mention_action_spans = mentions_action_sent[[TOKEN_IDX_FROM, TOKEN_IDX_TO]].values.tolist()

                # Map verbs returned from SRL to action mentions via sentence position: We have n pre-annotated action
                # mentions and m predicates found by SRL. We want to find the best 1:1 assignment from predicate to
                # mention. We approach this as a linear assignment problem.
                map_from_preannotated_action_to_srl_predicate = span_matching(mention_action_spans, srl_verb_spans)

                # for those where mapping exists:
                for i_action, i_predicate in map_from_preannotated_action_to_srl_predicate.items():
                    action = mentions_action_sent.iloc[i_action]
                    action_mention_id = action.name[mentions_action_sent.index.names.index(MENTION_ID)]
                    tag_spans = srl_spans[i_predicate]

                    # map time, location, participants to annotations
                    event_component_rows = []

                    def find_event_component_mapping(mentions_df: pd.DataFrame, srl_target_tags: List[str],
                                                     coarse_mention_type: str):
                        # it can happen that there is no time/location/participant annotated in a sentence; otherwise,
                        # look up mentions in the sentence
                        if doc_id not in mentions_df.index or sent_idx not in mentions_df.loc[doc_id, SENTENCE_IDX].values:
                            return
                        mentions_within_doc = mentions_df.loc[doc_id]
                        mentions_within_sentence = mentions_within_doc.loc[mentions_within_doc[SENTENCE_IDX] == sent_idx]

                        mention_spans_within_sentence = mentions_within_sentence[[TOKEN_IDX_FROM, TOKEN_IDX_TO]].values.tolist()
                        _srl_spans = [(start, end) for (tag, (start, end)) in tag_spans if tag in srl_target_tags]
                        mapping = span_matching(mention_spans_within_sentence, _srl_spans)

                        for idx_mention, idx_srl in mapping.items():
                            # 'name' is the only remaining index column here, which is MENTION_ID
                            mapped_mention_id = mentions_within_sentence.iloc[idx_mention].name
                            row = {MENTION_TYPE_COARSE: coarse_mention_type,
                                   COMPONENT_MENTION_ID: mapped_mention_id}
                            event_component_rows.append(row)

                    find_event_component_mapping(dataset.mentions_location, ["ARGM-DIR", "ARGM-LOC"], LOCATION)
                    find_event_component_mapping(dataset.mentions_time, ["ARGM-TMP"], TIME)
                    find_event_component_mapping(dataset.mentions_participants, ["ARG0", "ARG1"], PARTICIPANTS)

                    # Collect it all in a dataframe:
                    # For each action mention:
                    #   - index-y (not an actual index): doc-id, mention-id (this is the action mention id), sent_idx <<-- redundant
                    #   - columns: mention-type-coarse, component-mention-id (the mention associated with its action, and the mention type)
                    if event_component_rows:
                        event_components = pd.DataFrame(event_component_rows)
                        event_components[DOCUMENT_ID] = doc_id
                        event_components[MENTION_ID] = action_mention_id

                        semantic_roles.append(event_components)

        if len(semantic_roles) == 0:
            raise ValueError("No semantic roles found. Possible reasons: (1) Dataset already has semantic roles defined. (2) Pretrained SRL predictor likely does not match allennlp version! Check project README for details.")

        # merge identified event components of each sentence and mention into one dataframe
        semantic_roles = pd.concat(semantic_roles, sort=True)

        # concatenate with existing roles
        if dataset.semantic_roles is not None:
            semantic_roles = pd.concat([semantic_roles, dataset.semantic_roles], ignore_index=True)

        dataset.semantic_roles = semantic_roles
        return dataset
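
span_matching is defined elsewhere in the project. A minimal sketch of the linear-assignment idea described in the comment above, assuming the score to maximize is token overlap between spans (the real helper may also handle non-overlapping spans, e.g. by distance):

    from typing import Dict, List, Tuple

    import numpy as np
    from scipy.optimize import linear_sum_assignment

    def span_matching(spans_a: List[Tuple[int, int]],
                      spans_b: List[Tuple[int, int]]) -> Dict[int, int]:
        # sketch only: best 1:1 assignment from spans_a to spans_b, maximizing token
        # overlap (exclusive upper bounds), solved as a linear assignment problem
        if not spans_a or not spans_b:
            return {}
        overlap = np.zeros((len(spans_a), len(spans_b)))
        for i, (a_from, a_to) in enumerate(spans_a):
            for j, (b_from, b_to) in enumerate(spans_b):
                overlap[i, j] = max(0, min(a_to, b_to) - max(a_from, b_from))
        row_idx, col_idx = linear_sum_assignment(-overlap)  # negate to maximize
        return {int(i): int(j) for i, j in zip(row_idx, col_idx) if overlap[i, j] > 0}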