Code example #1

    def _transform(self, dataset: Dataset, pairs: List[Tuple[Tuple, Tuple]],
                   unique_mentions: Set[Tuple]):
        # obtain embeddings
        assert dataset.has(SENTENCE_EMBEDDINGS)
        sentence_embeddings = dataset.get(
            SENTENCE_EMBEDDINGS
        )  # type: Tuple[Dict[Tuple[str, int], int], np.ndarray]
        embedding_index, embedding_mat = sentence_embeddings

        mentions_action = dataset.mentions_action

        # compute a mean embedding in case we need to pad somewhere
        mean_embedding = embedding_mat.mean(axis=0)

        # precompute embedding vectors for each action mention
        NUM_SENTENCES_DOC_START = 3
        precomputed_sentence = {}
        precomputed_doc_start = {}
        for mention_idx in unique_mentions:
            assert len(mention_idx) == 2
            doc_id, mention_id = mention_idx

            # look up the sentence embedding of the sentence containing the action mention
            sent_idx_of_action = mentions_action.loc[mention_idx, SENTENCE_IDX]
            surrounding_sent_embedding = embedding_mat[embedding_index[(doc_id, sent_idx_of_action)]]

            # for the document start, take the first NUM_SENTENCES_DOC_START sentences
            # of the document and concatenate their embeddings
            doc_start_sent_embeddings = []
            for i in range(NUM_SENTENCES_DOC_START):
                # documents may have fewer than NUM_SENTENCES_DOC_START sentences;
                # if a sentence is missing, pad with the mean embedding
                if (doc_id, i) in embedding_index:
                    sent_embedding = embedding_mat[embedding_index[(doc_id, i)]]
                else:
                    sent_embedding = mean_embedding
                doc_start_sent_embeddings.append(sent_embedding)
            doc_start_embedding = np.hstack(doc_start_sent_embeddings)

            precomputed_sentence[mention_idx] = surrounding_sent_embedding
            precomputed_doc_start[mention_idx] = doc_start_embedding

        feature_columns = []
        for vectors, feature_desc in [(precomputed_sentence, SURROUNDING_SENTENCE),
                                      (precomputed_doc_start, DOC_START)]:
            feature_column = batch_cosine_similarity(
                pairs, vectors, desc=f"{self.name} {feature_desc}")
            feature_columns.append(feature_column)
        # stack the 1-D similarity columns into an (n_pairs, 2) feature matrix
        feature_matrix = np.column_stack(feature_columns)
        return feature_matrix
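
For reference, a minimal sketch of the (embedding_index, embedding_mat) storage format that the type comment above assumes: a dict mapping a (document id, sentence index) key to a row of the embedding matrix. The toy keys and the 768-dimensional size below are illustrative assumptions, not values taken from the codebase.

import numpy as np

# hypothetical toy data: three sentence embeddings across two documents
embedding_index = {("doc-1", 0): 0, ("doc-1", 1): 1, ("doc-2", 0): 2}
embedding_mat = np.random.rand(3, 768)  # assumed dimensionality; one row per sentence

# the lookup pattern used throughout _transform above
vec = embedding_mat[embedding_index[("doc-1", 1)]]
assert vec.shape == (768,)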

Code example #2

    def _transform(self, dataset: Dataset, pairs: List[Tuple[Tuple, Tuple]],
                   unique_mentions: Set[Tuple]):
        # obtain embeddings
        assert dataset.has(ACTION_PHRASE_EMBEDDINGS)
        action_phrase_embeddings = dataset.get(
            ACTION_PHRASE_EMBEDDINGS
        )  # type: Tuple[Dict[Tuple[str, int], int], np.ndarray]
        embedding_index, embedding_mat = action_phrase_embeddings

        def pairs_transform(idx):
            return embedding_index[idx]

        feature_column = batch_cosine_similarity(
            pairs,
            embedding_mat,
            pairs_transform=pairs_transform,
            desc=self.name)
        feature_matrix = feature_column.reshape((-1, 1))
        return feature_matrix
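
Both call sites above rely on batch_cosine_similarity, which is not part of this listing. Below is a minimal sketch that is consistent with both usages: vectors may be a dict of per-mention vectors keyed directly by mention index, or an embedding matrix combined with a pairs_transform that maps a mention index to a matrix row. The real helper presumably batches the computation and uses desc as a progress-bar label; this sketch is an assumption, not the original implementation.

import numpy as np

def batch_cosine_similarity(pairs, vectors, pairs_transform=None, desc=None):
    # `desc` labels a progress bar in the original helper; unused in this sketch
    similarities = np.empty(len(pairs))
    for i, (a_idx, b_idx) in enumerate(pairs):
        if pairs_transform is not None:
            a = vectors[pairs_transform(a_idx)]
            b = vectors[pairs_transform(b_idx)]
        else:
            a = vectors[a_idx]
            b = vectors[b_idx]
        similarities[i] = a @ b / (np.linalg.norm(a) * np.linalg.norm(b))
    return similarities  # 1-D array, one similarity per pair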

Code example #3

    def _transform(self, dataset: Dataset, pairs: List[Tuple[Tuple, Tuple]],
                   unique_mentions: Set[Tuple]):
        # obtain embeddings
        assert dataset.has(WIKIDATA_EMBEDDINGS)
        wikidata_embeddings = dataset.get(WIKIDATA_EMBEDDINGS)  # type: Tuple[Dict[str, int], np.ndarray]
        embedding_index, embedding_mat = wikidata_embeddings

        # create one large dataframe of all named entities which are entity linked to Wikidata
        linked_event_components = []
        for mention_type_coarse, df in {ACTION: dataset.mentions_action,
                                        PARTICIPANTS: dataset.mentions_participants,
                                        TIME: dataset.mentions_time,
                                        LOCATION: dataset.mentions_location,
                                        OTHER: dataset.mentions_other}.items():
            if df is None:
                continue

            # keep only entities/mentions which are linked to Wikidata
            linked_subset = df.loc[df[WIKIDATA_QID].notna()]
            # drop linked mentions for which no embedding is available
            with_embedding = linked_subset.loc[linked_subset[WIKIDATA_QID].isin(embedding_index.keys())]
            # keep only relevant columns
            only_relevant_columns = with_embedding.reindex(columns=[MENTION_TEXT, SENTENCE_IDX, WIKIDATA_QID])

            only_relevant_columns[MENTION_TYPE_COARSE] = mention_type_coarse

            linked_event_components.append(only_relevant_columns)
        linked_event_components = pd.concat(linked_event_components).set_index(MENTION_TYPE_COARSE, append=True)
        assert linked_event_components.index.is_unique

        # convert QID into index of the corresponding embedding in `embedding_mat`
        linked_event_components[WIKIDATA_QID] = linked_event_components[WIKIDATA_QID].map(embedding_index)
        # after the mapping, the column must hold embedding row indices, not raw "Q..." identifiers
        assert linked_event_components[WIKIDATA_QID].notna().all()
        assert not linked_event_components[WIKIDATA_QID].astype(str).str.startswith("Q").any()

        linked_event_components = linked_event_components.reset_index()
        mentions_action = dataset.mentions_action
        sr = dataset.semantic_roles

        # precompute embedding matrices for each action mention
        NUM_SENTENCES_CONTEXT = 2
        NUM_SENTENCES_DOC_START = 3
        precomputed = {}
        for mention_idx in unique_mentions:
            assert len(mention_idx) == 2
            doc_id, mention_id = mention_idx

            linked_in_doc = linked_event_components.loc[linked_event_components[DOCUMENT_ID] == doc_id]

            # look up embedding for action mention (rarely the case)
            linked_action_mention = linked_in_doc.loc[
                (linked_in_doc[MENTION_TYPE_COARSE] == ACTION) & (linked_in_doc[MENTION_ID] == mention_id)]
            if not linked_action_mention.empty:
                action_mention_embedding = embedding_mat[linked_action_mention[WIKIDATA_QID].values]
            else:
                action_mention_embedding = None

            # if available, create matrix of embeddings from all entity linked SRL arguments
            srl_args_of_mention = sr.loc[(sr[DOCUMENT_ID] == doc_id) & (sr[MENTION_ID] == mention_id)]
            if not srl_args_of_mention.empty:
                linked_srl_args_for_mention = srl_args_of_mention.merge(
                    linked_in_doc,
                    left_on=[COMPONENT_MENTION_ID, MENTION_TYPE_COARSE],
                    right_on=[MENTION_ID, MENTION_TYPE_COARSE]).drop_duplicates(WIKIDATA_QID)
                linked_srl_embeddings = embedding_mat[linked_srl_args_for_mention[WIKIDATA_QID].values]
            else:
                linked_srl_embeddings = None

            # create matrix of embeddings from all linked entities in the same sentence as the action mention
            sent_idx_of_action = mentions_action.loc[mention_idx, SENTENCE_IDX]

            linked_in_surrounding_sent = linked_in_doc.loc[
                linked_in_doc[SENTENCE_IDX] == sent_idx_of_action].drop_duplicates(WIKIDATA_QID)
            if not linked_in_surrounding_sent.empty:
                surrounding_sent_embeddings = embedding_mat[linked_in_surrounding_sent[WIKIDATA_QID].values]
            else:
                surrounding_sent_embeddings = None

            # create matrix of embeddings from all linked entities in the context of the action mention
            sent_idx_from = sent_idx_of_action - NUM_SENTENCES_CONTEXT
            sent_idx_to = sent_idx_of_action + NUM_SENTENCES_CONTEXT

            linked_in_context = linked_in_doc.loc[
                (linked_in_doc[SENTENCE_IDX] >= sent_idx_from)
                & (linked_in_doc[SENTENCE_IDX] <= sent_idx_to)].drop_duplicates(WIKIDATA_QID)
            if not linked_in_context.empty:
                context_embeddings = embedding_mat[linked_in_context[WIKIDATA_QID].values]
            else:
                context_embeddings = None

            # create matrix of embeddings from linked entities at the document start
            linked_at_doc_start = linked_in_doc.loc[
                linked_in_doc[SENTENCE_IDX] < NUM_SENTENCES_DOC_START].drop_duplicates(WIKIDATA_QID)
            if not linked_at_doc_start.empty:
                doc_start_embeddings = embedding_mat[linked_at_doc_start[WIKIDATA_QID].values]
            else:
                doc_start_embeddings = None

            precomputed[mention_idx] = {ACTION_MENTION: action_mention_embedding,
                                        SEMANTIC_ROLE_ARGS: linked_srl_embeddings,
                                        SURROUNDING_SENTENCE: surrounding_sent_embeddings,
                                        SENTENCE_CONTEXT: context_embeddings,
                                        DOC_START: doc_start_embeddings}

        # using the precomputed action mention representations, compute pairwise features
        list_of_instance_features = []
        for pair in pairs:
            a_idx, b_idx = pair

            instance_features = []

            # compute the cosine distance between the two action mention embeddings;
            # if either mention is not linked, emit NaN for this feature
            a_action_mention_mat = precomputed[a_idx][ACTION_MENTION]
            b_action_mention_mat = precomputed[b_idx][ACTION_MENTION]
            if a_action_mention_mat is None or b_action_mention_mat is None:
                instance_features.append(np.nan)
            else:
                # the lookup above yields a (1, d) matrix; cosine expects 1-D vectors
                instance_features.append(cosine(a_action_mention_mat.ravel(),
                                                b_action_mention_mat.ravel()))

            # the order matters here: it has to match the feature names defined in __init__
            for key in FEATURES_IN_ORDER:
                a_mat = precomputed[a_idx][key]
                b_mat = precomputed[b_idx][key]
                features_of_key = compute_pairwise_embedding_distance_features(a_mat, b_mat)
                instance_features += features_of_key

            instance_features = np.array(instance_features, dtype=self.dtype)
            list_of_instance_features.append(instance_features)

        feature_matrix = np.vstack(list_of_instance_features)
        return feature_matrix
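
Two helpers used above, FEATURES_IN_ORDER and compute_pairwise_embedding_distance_features, are also outside this listing. As a rough sketch under stated assumptions: FEATURES_IN_ORDER is presumably the list of precomputed keys handled generically (everything except ACTION_MENTION, which is treated separately above), and the distance helper plausibly aggregates all pairwise cosine distances between two embedding matrices into a fixed-length feature list, padding with NaN when either side is missing. The exact aggregates in the original may differ.

import numpy as np
from scipy.spatial.distance import cdist

# assumed ordering; it must line up with the feature names declared in __init__
FEATURES_IN_ORDER = [SEMANTIC_ROLE_ARGS, SURROUNDING_SENTENCE, SENTENCE_CONTEXT, DOC_START]

def compute_pairwise_embedding_distance_features(a_mat, b_mat):
    # summarize all pairwise cosine distances between an (n, d) and an (m, d)
    # matrix as (min, mean, max); NaN-pad if either matrix is missing
    if a_mat is None or b_mat is None:
        return [np.nan, np.nan, np.nan]
    distances = cdist(a_mat, b_mat, metric="cosine")
    return [distances.min(), distances.mean(), distances.max()]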