Example #1
 def get_pair_mentions_features(self, m1, m2):
     '''Features for a pair of mentions (same speaker, speaker-mention match, string match)'''
     features_ = {"00_SameSpeaker": 1 if self.consider_speakers and m1.speaker == m2.speaker else 0,
                  "01_AntMatchMentionSpeaker": 1 if self.consider_speakers and m2.speaker_match_mention(m1) else 0,
                  "02_MentionMatchSpeaker": 1 if self.consider_speakers and m1.speaker_match_mention(m2) else 0,
                  "03_HeadsAgree": 1 if m1.heads_agree(m2) else 0,
                  "04_ExactStringMatch": 1 if m1.exact_match(m2) else 0,
                  "05_RelaxedStringMatch": 1 if m1.relaxed_match(m2) else 0,
                  "06_SentenceDistance": m2.utterances_sent - m1.utterances_sent,
                  "07_MentionDistance": m2.index - m1.index - 1,
                  "08_Overlapping": 1 if (m1.utterances_sent == m2.utterances_sent and m1.end > m2.start) else 0,
                  "09_M1Features": m1.features_,
                  "10_M2Features": m2.features_,
                  "11_DocGenre": self.genre_}
     pairwise_features = [np.array([features_["00_SameSpeaker"],
                                    features_["01_AntMatchMentionSpeaker"],
                                    features_["02_MentionMatchSpeaker"],
                                    features_["03_HeadsAgree"],
                                    features_["04_ExactStringMatch"],
                                    features_["05_RelaxedStringMatch"]]),
                          encode_distance(features_["06_SentenceDistance"]),
                          encode_distance(features_["07_MentionDistance"]),
                          np.array(features_["08_Overlapping"], ndmin=1),
                          m1.features,
                          m2.features,
                          self.genre]
     return (features_, np.concatenate(pairwise_features, axis=0))
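
The pair features above rely on encode_distance, which is not part of the snippet. The sketch below is an assumption about its behavior rather than the library's actual implementation: a 10-bucket one-hot over distance ranges plus the capped, normalized distance, giving the 11-wide vectors implied by the 11-column slices in Example #3 (features[4:15], pairs_features[:, 6:17]).

 import numpy as np

 # Assumed bucket upper bounds: 0, 1, 2, 3, 4, 5-7, 8-15, 16-31, 32-63, 64+
 _DISTANCE_BINS = np.array([1, 2, 3, 4, 5, 8, 16, 32, 64])

 def encode_distance(d):
     '''Sketch of a binned distance encoding: 11 values per distance.'''
     d = np.asarray(d)
     if d.ndim == 0:                            # single distance -> shape (11,)
         out = np.zeros(11)
         out[np.digitize(d, _DISTANCE_BINS)] = 1
         out[10] = min(float(d), 64.0) / 64.0   # capped, normalized distance
         return out
     out = np.zeros((len(d), 11))               # batch of distances -> shape (N, 11)
     out[np.arange(len(d)), np.digitize(d, _DISTANCE_BINS)] = 1
     out[:, 10] = np.minimum(d, 64).astype(float) / 64.0
     return out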
Example #2
 def set_mentions_features(self):
     '''
     Compute features for the extracted mentions
     '''
     doc_embedding = self.embed_extractor.get_document_embedding(self.utterances) if self.embed_extractor is not None else None
     for mention in self.mentions:
         one_hot_type = np.zeros((4,))
         one_hot_type[mention.mention_type] = 1
         features_ = {"01_MentionType": mention.mention_type,
                      "02_MentionLength": len(mention)-1,
                      "03_MentionNormLocation": (mention.index)/len(self.mentions),
                      "04_IsMentionNested": 1 if any((m is not mention
                                                       and m.utterances_sent == mention.utterances_sent
                                                       and m.start <= mention.start
                                                       and mention.end <= m.end)
                                                      for m in self.mentions) else 0}
         features = np.concatenate([one_hot_type,
                                    encode_distance(features_["02_MentionLength"]),
                                    np.array(features_["03_MentionNormLocation"], ndmin=1, copy=False),
                                    np.array(features_["04_IsMentionNested"], ndmin=1, copy=False)
                                   ], axis=0)
         (spans_embeddings_, words_embeddings_,
          spans_embeddings, words_embeddings) = self.embed_extractor.get_mention_embeddings(mention, doc_embedding)
         mention.features_ = features_
         mention.features = features
         mention.spans_embeddings = spans_embeddings
         mention.spans_embeddings_ = spans_embeddings_
         mention.words_embeddings = words_embeddings
         mention.words_embeddings_ = words_embeddings_
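
For orientation, the width of the per-mention vector built here is consistent with the Fs = 24 figure quoted in Example #3's docstring once the document-genre one-hot is appended. The accounting below is inferred from the column slices in Example #3 rather than stated anywhere in the snippets; in particular the 11-wide length encoding and SIZE_GENRE = 7 are assumptions.

 ONE_HOT_TYPE   = 4   # mention-type one-hot built above
 MENTION_LENGTH = 11  # assumed width of the encode_distance output
 NORM_LOCATION  = 1   # "03_MentionNormLocation"
 IS_NESTED      = 1   # "04_IsMentionNested"
 SIZE_GENRE     = 7   # inferred: pairs_features columns 46:70 hold the 24 anaphor features
 # 17 genre-free feature slots, matching SIZE_FS - SIZE_GENRE in Example #3
 assert ONE_HOT_TYPE + MENTION_LENGTH + NORM_LOCATION + IS_NESTED == 17
 # 24 slots in total, matching Fs = 24 in Example #3's docstring
 assert ONE_HOT_TYPE + MENTION_LENGTH + NORM_LOCATION + IS_NESTED + SIZE_GENRE == 24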
Example #3
    def __getitem__(self, mention_idx, debug=False):
        """
        Return:
            Definitions:
                P is the number of antecedents per mention (the number of pairs for the mention)
                S = 250 is the size of the span vector (averaged word embeddings)
                W = 8 is the number of words in a mention (tuned embeddings)
                Fp = 70 is the number of features for a pair of mentions
                Fs = 24 is the number of features of a single mention

            if there are some pairs:
                inputs = (spans, words, features, ant_spans, ant_words, ana_spans, ana_words, pairs_features)
                targets = (labels, costs, true_ants, false_ants)
            else:
                inputs = (spans, words, features)
                targets = (labels, costs, true_ants)

            inputs: Tuple of
                spans => (S,)
                words => (W,)
                features => (Fs,)
                + if there are potential antecedents (P > 0):
                    ant_spans => (P, S) or nothing if no pairs
                    ant_words => (P, W) or nothing if no pairs
                    ana_spans => (P, S) or nothing if no pairs
                    ana_words => (P, W) or nothing if no pairs
                    pairs_features => (P, Fp) or nothing if no pairs

            targets: Tuple of
                labels => (P+1,)
                costs => (P+1,)
                true_ants => (P+1,)
                + if there are potential antecedents (P > 0):
                    false_ants => (P+1,)

        """
        features_raw, label, pairs_length, pairs_start_index, spans, words = self.mentions[mention_idx]
        pairs_start_index = pairs_start_index.item()  # .item() replaces np.asscalar, removed in NumPy 1.23
        pairs_length = pairs_length.item()

        # Build features array (float) from raw features (int)
        assert features_raw.shape[0] == SIZE_FS_COMPRESSED
        features = np.zeros((SIZE_FS,))
        features[features_raw[0]] = 1
        features[4:15] = encode_distance(features_raw[1])
        features[15] = features_raw[2].astype(float) / features_raw[3].astype(float)
        features[16] = features_raw[4]
        features[features_raw[5] + 17] = 1

        if pairs_length == 0:
            spans = torch.from_numpy(spans).float()
            words = torch.from_numpy(words)
            features = torch.from_numpy(features).float()
            inputs = (spans, words, features)
            if self.no_targets:
                return inputs
            true_ant = torch.zeros(1).long() # zeros = indices of true ant
            costs = torch.from_numpy((1 - label) * self.costs['FN']).float()
            label = torch.from_numpy(label).float()
            targets = (label, costs, true_ant)
            if debug:
                print("inputs shapes: ", [a.size() for a in inputs])
                print("targets shapes: ", [a.size() for a in targets])
            return inputs, targets

        start = pairs_start_index
        end = pairs_start_index + pairs_length
        pairs = self.pairs[start:end]
        assert len(pairs) == pairs_length
        assert len(pairs[0]) == 3 # pair[i] = (pairs_ant_index, pairs_features, pairs_labels)
        pairs_ant_index, pairs_features_raw, pairs_labels = list(zip(*pairs))

        pairs_features_raw = np.stack(pairs_features_raw)
        pairs_labels = np.squeeze(np.stack(pairs_labels), axis=1)

        # Build pair features array (float) from raw features (int)
        assert pairs_features_raw[0, :].shape[0] == SIZE_FP_COMPRESSED
        pairs_features = np.zeros((len(pairs_ant_index), SIZE_FP))
        pairs_features[:, 0:6] = pairs_features_raw[:, 0:6]
        pairs_features[:, 6:17] = encode_distance(pairs_features_raw[:, 6])
        pairs_features[:, 17:28] = encode_distance(pairs_features_raw[:, 7])
        pairs_features[:, 28] = pairs_features_raw[:, 8]
        # prepare antecedent features
        ant_features_raw = np.concatenate([self.mentions[idx.item()][0][np.newaxis, :] for idx in pairs_ant_index])
        ant_features = np.zeros((pairs_length, SIZE_FS - SIZE_GENRE))
        # per-row one-hot of each antecedent's mention type
        ant_features[np.arange(pairs_length), ant_features_raw[:, 0]] = 1
        ant_features[:, 4:15] = encode_distance(ant_features_raw[:, 1])
        ant_features[:, 15] = ant_features_raw[:, 2].astype(float) / ant_features_raw[:, 3].astype(float)
        ant_features[:, 16] = ant_features_raw[:, 4]
        pairs_features[:, 29:46] = ant_features
        # the anaphor features keep the genre slots (the antecedent features above omit them)
        ana_features = np.tile(features, (pairs_length, 1))
        pairs_features[:, 46:] = ana_features

        ant_spans = np.concatenate([self.mentions[idx.item()][4][np.newaxis, :] for idx in pairs_ant_index])
        ant_words = np.concatenate([self.mentions[idx.item()][5][np.newaxis, :] for idx in pairs_ant_index])
        ana_spans = np.tile(spans, (pairs_length, 1))
        ana_words = np.tile(words, (pairs_length, 1))
        ant_spans = torch.from_numpy(ant_spans).float()
        ant_words = torch.from_numpy(ant_words)
        ana_spans = torch.from_numpy(ana_spans).float()
        ana_words = torch.from_numpy(ana_words)
        pairs_features = torch.from_numpy(pairs_features).float()

        labels_stack = np.concatenate((pairs_labels, label), axis=0)
        assert labels_stack.shape == (pairs_length + 1,)
        labels = torch.from_numpy(labels_stack).float()

        spans = torch.from_numpy(spans).float()
        words = torch.from_numpy(words)
        features = torch.from_numpy(features).float()

        inputs = (spans, words, features,
                  ant_spans, ant_words,
                  ana_spans, ana_words,
                  pairs_features)

        if self.no_targets:
            return inputs

        if label == 0:
            costs = np.concatenate((self.costs['WL'] * (1 - pairs_labels), [self.costs['FN']])) # Inverse labels: 1=>0, 0=>1
        else:
            costs = np.concatenate((self.costs['FL'] * np.ones_like(pairs_labels), [0]))
        assert costs.shape == (pairs_length + 1,)
        costs = torch.from_numpy(costs).float()

        true_ants_unpad = np.flatnonzero(labels_stack)
        if len(true_ants_unpad) == 0:
            raise ValueError("Error: no True antecedent for mention")
        true_ants = np.pad(true_ants_unpad, (0, len(pairs_labels) + 1 - len(true_ants_unpad)), 'edge')
        assert true_ants.shape == (pairs_length + 1,)
        true_ants = torch.from_numpy(true_ants).long()

        false_ants_unpad = np.flatnonzero(1 - labels_stack)
        assert len(false_ants_unpad) != 0
        false_ants = np.pad(false_ants_unpad, (0, len(pairs_labels) + 1 - len(false_ants_unpad)), 'edge')
        assert false_ants.shape == (pairs_length + 1,)
        false_ants = torch.from_numpy(false_ants).long()

        targets = (labels, costs, true_ants, false_ants)
        if debug:
            print("Mention", mention_idx)
            print("inputs shapes: ", [a.size() for a in inputs])
            print("targets shapes: ", [a.size() for a in targets])
        return inputs, targets
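
A hedged usage sketch of __getitem__ follows. The helper name inspect_item is illustrative, and it assumes an already-constructed dataset instance with no_targets == False; construction is not shown in the snippets. It only unpacks the tuples documented in the docstring and prints their shapes.

 def inspect_item(dataset, mention_idx=0):
     '''Print the tensor shapes __getitem__ returns for one mention.'''
     inputs, targets = dataset[mention_idx]
     spans, words, features = inputs[:3]        # always present: (S,), (W,), (Fs,)
     print("spans", spans.size(), "words", words.size(), "features", features.size())
     if len(inputs) == 8:                       # P > 0: pair tensors are included
         ant_spans, ant_words, ana_spans, ana_words, pairs_features = inputs[3:]
         print("pairs_features", pairs_features.size())   # expected (P, Fp)
     labels, costs, true_ants = targets[:3]     # each of shape (P+1,)
     print("labels", labels.size(), "costs", costs.size(), "true_ants", true_ants.size())
     if len(targets) == 4:                      # P > 0: false antecedents also returned
         print("false_ants", targets[3].size())

Because P varies from mention to mention, batching these items through a torch DataLoader would need a custom collate function (or per-mention padding); the snippets above do not show how that is handled.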