def get_pair_mentions_features(self, m1, m2):
    ''' Features for pair of mentions (same speakers, speaker mentioned, string match)'''
    # Hoist each pairwise signal into a named local so the dict (debug view)
    # and the numeric vector are guaranteed to agree.
    speakers_on = self.consider_speakers
    same_speaker = 1 if speakers_on and m1.speaker == m2.speaker else 0
    ant_match_speaker = 1 if speakers_on and m2.speaker_match_mention(m1) else 0
    mention_match_speaker = 1 if speakers_on and m1.speaker_match_mention(m2) else 0
    heads_agree = 1 if m1.heads_agree(m2) else 0
    exact_match = 1 if m1.exact_match(m2) else 0
    relaxed_match = 1 if m1.relaxed_match(m2) else 0
    sentence_distance = m2.utterances_sent - m1.utterances_sent
    mention_distance = m2.index - m1.index - 1
    overlapping = 1 if (m1.utterances_sent == m2.utterances_sent
                        and m1.end > m2.start) else 0

    # Human-readable version of the features (returned alongside the vector).
    features_ = {
        "00_SameSpeaker": same_speaker,
        "01_AntMatchMentionSpeaker": ant_match_speaker,
        "02_MentionMatchSpeaker": mention_match_speaker,
        "03_HeadsAgree": heads_agree,
        "04_ExactStringMatch": exact_match,
        "05_RelaxedStringMatch": relaxed_match,
        "06_SentenceDistance": sentence_distance,
        "07_MentionDistance": mention_distance,
        "08_Overlapping": overlapping,
        "09_M1Features": m1.features_,
        "10_M2Features": m2.features_,
        "11_DocGenre": self.genre_,
    }

    # Numeric version: boolean block, bucketed distances, overlap flag,
    # then the per-mention feature vectors and the document genre.
    pairwise_features = [
        np.array([same_speaker, ant_match_speaker, mention_match_speaker,
                  heads_agree, exact_match, relaxed_match]),
        encode_distance(sentence_distance),
        encode_distance(mention_distance),
        np.array(overlapping, ndmin=1),
        m1.features,
        m2.features,
        self.genre,
    ]
    return (features_, np.concatenate(pairwise_features, axis=0))
def set_mentions_features(self):
    ''' Compute features for the extracted mentions

    For every mention in ``self.mentions`` this builds:
      - ``mention.features_``: a human-readable dict of single-mention features,
      - ``mention.features``: the concatenated numeric vector
        (4-way one-hot mention type, bucketed length, normalized position,
        nested flag),
      - the four ``*_embeddings`` attributes, when an embedding extractor is
        available (``None`` otherwise).

    Fix: the original guarded ``doc_embedding`` with
    ``self.embed_extractor is not None`` but then called
    ``get_mention_embeddings`` unconditionally, raising ``AttributeError``
    whenever the extractor was absent. The guard is now applied consistently.
    '''
    if self.embed_extractor is not None:
        doc_embedding = self.embed_extractor.get_document_embedding(self.utterances)
    else:
        doc_embedding = None
    n_mentions = len(self.mentions)  # loop-invariant, hoisted
    for mention in self.mentions:
        # One-hot over the 4 mention types (assumes mention_type in 0..3 —
        # established by the fixed zeros((4,)) buffer).
        one_hot_type = np.zeros((4,))
        one_hot_type[mention.mention_type] = 1
        features_ = {
            "01_MentionType": mention.mention_type,
            "02_MentionLength": len(mention) - 1,
            "03_MentionNormLocation": (mention.index) / n_mentions,
            # Nested = strictly contained in another mention of the same sentence.
            "04_IsMentionNested": 1 if any(
                (m is not mention
                 and m.utterances_sent == mention.utterances_sent
                 and m.start <= mention.start
                 and mention.end <= m.end) for m in self.mentions) else 0,
        }
        features = np.concatenate(
            [one_hot_type,
             encode_distance(features_["02_MentionLength"]),
             np.array(features_["03_MentionNormLocation"], ndmin=1, copy=False),
             np.array(features_["04_IsMentionNested"], ndmin=1, copy=False),
             ], axis=0)
        mention.features_ = features_
        mention.features = features
        if self.embed_extractor is not None:
            (spans_embeddings_, words_embeddings_,
             spans_embeddings, words_embeddings) = \
                self.embed_extractor.get_mention_embeddings(mention, doc_embedding)
            mention.spans_embeddings = spans_embeddings
            mention.spans_embeddings_ = spans_embeddings_
            mention.words_embeddings = words_embeddings
            mention.words_embeddings_ = words_embeddings_
        else:
            # No extractor: leave the embedding slots present but empty so
            # downstream attribute access fails loudly only when actually used.
            mention.spans_embeddings = None
            mention.spans_embeddings_ = None
            mention.words_embeddings = None
            mention.words_embeddings_ = None
def __getitem__(self, mention_idx, debug=False):
    """
    Return:
        Definitions:
            P is the number of antecedent per mention (number of pairs for the mention)
            S = 250 is the size of the span vector (averaged word embeddings)
            W = 8 is the number of words in a mention (tuned embeddings)
            Fp = 70 is the number of features for a pair of mention
            Fs = 24 is the number of features of a single mention

        if there are some pairs:
            inputs = (spans, words, features, ant_spans, ant_words, ana_spans, ana_words, pairs_features)
            targets = (labels, costs, true_ants, false_ants)
        else:
            inputs = (spans, words, features)
            targets = (labels, costs, true_ants)

        inputs: Tuple of
            spans => (S,)
            words => (W,)
            features => (Fs,)
            + if there are potential antecedents (P > 0):
                ant_spans => (P, S) or nothing if no pairs
                ant_words => (P, W) or nothing if no pairs
                ana_spans => (P, S) or nothing if no pairs
                ana_words => (P, W) or nothing if no pairs
                pair_features => (P, Fp) or nothing if no pairs

        targets: Tuple of
            labels => (P+1,)
            costs => (P+1,)
            true_ant => (P+1,)
            + if there are potential antecedents (P > 0):
                false_ant => (P+1,)
    """
    # Each stored mention is a 6-tuple:
    # (compressed features, gold label, pair count, index of its first pair, span vector, word ids)
    features_raw, label, pairs_length, pairs_start_index, spans, words = self.mentions[
        mention_idx]
    pairs_start_index = pairs_start_index.item()
    pairs_length = pairs_length.item()

    # Build features array (float) from raw features (int):
    # decompress the Fs-compressed ints into the full Fs-sized float vector.
    assert features_raw.shape[0] == SIZE_FS_COMPRESSED
    features = np.zeros((SIZE_FS, ))
    features[features_raw[0]] = 1  # one-hot mention type (slots 0..3)
    features[4:15] = encode_distance(features_raw[1])  # bucketed mention length
    # normalized position = index / number of mentions
    features[15] = features_raw[2].astype(float) / features_raw[3].astype(
        float)
    features[16] = features_raw[4]  # nested flag
    features[features_raw[5] + 17] = 1  # one-hot document genre

    if pairs_length == 0:
        # Mention with no potential antecedent: singleton inputs/targets.
        spans = torch.from_numpy(spans).float()
        words = torch.from_numpy(words)
        features = torch.from_numpy(features).float()
        inputs = (spans, words, features)
        if self.no_targets:
            return inputs
        true_ant = torch.zeros(1).long()  # zeros = indices of true ant
        # Cost is a false-new penalty if the mention is in fact anaphoric.
        costs = torch.from_numpy((1 - label) * self.costs['FN']).float()
        label = torch.from_numpy(label).float()
        targets = (label, costs, true_ant)
        if debug:
            print("inputs shapes: ", [a.size() for a in inputs])
            print("targets shapes: ", [a.size() for a in targets])
        return inputs, targets

    # Slice out this mention's pairs from the flat pair store.
    start = pairs_start_index
    end = pairs_start_index + pairs_length
    pairs = self.pairs[start:end]
    assert len(pairs) == pairs_length
    assert len(
        pairs[0]
    ) == 3  # pair[i] = (pairs_ant_index, pairs_features, pairs_labels)
    pairs_ant_index, pairs_features_raw, pairs_labels = list(zip(*pairs))
    pairs_features_raw = np.stack(pairs_features_raw)
    pairs_labels = np.squeeze(np.stack(pairs_labels), axis=1)

    # Build pair features array (float) from raw features (int).
    assert pairs_features_raw[0, :].shape[0] == SIZE_FP_COMPRESSED
    pairs_features = np.zeros((len(pairs_ant_index), SIZE_FP))
    pairs_features[:, 0:6] = pairs_features_raw[:, 0:6]  # six boolean features
    pairs_features[:, 6:17] = encode_distance(pairs_features_raw[:, 6])  # sentence dist
    pairs_features[:, 17:28] = encode_distance(pairs_features_raw[:, 7])  # mention dist
    pairs_features[:, 28] = pairs_features_raw[:, 8]  # overlapping flag

    # Prepare antecedent features (single-mention features of each antecedent,
    # genre excluded — it is shared and kept once via the anaphor features below).
    ant_features_raw = np.concatenate([
        self.mentions[idx.item()][0][np.newaxis, :] for idx in pairs_ant_index
    ])
    ant_features = np.zeros((pairs_length, SIZE_FS - SIZE_GENRE))
    # NOTE(review): broadcast fancy indexing — this sets, in EVERY row, the
    # columns of ALL antecedent types (a shared multi-hot), not a per-row
    # one-hot; a per-row one-hot would be
    # ant_features[np.arange(pairs_length), ant_features_raw[:, 0]] = 1.
    # Left unchanged: pretrained weights may depend on this layout — confirm.
    ant_features[:, ant_features_raw[:, 0]] = 1
    ant_features[:, 4:15] = encode_distance(ant_features_raw[:, 1])
    ant_features[:, 15] = ant_features_raw[:, 2].astype(
        float) / ant_features_raw[:, 3].astype(float)
    ant_features[:, 16] = ant_features_raw[:, 4]
    pairs_features[:, 29:46] = ant_features

    # Here we keep the genre: the anaphor block is the full Fs vector.
    ana_features = np.tile(features, (pairs_length, 1))
    pairs_features[:, 46:] = ana_features

    # Gather the antecedents' span/word representations; repeat the anaphor's.
    ant_spans = np.concatenate([
        self.mentions[idx.item()][4][np.newaxis, :] for idx in pairs_ant_index
    ])
    ant_words = np.concatenate([
        self.mentions[idx.item()][5][np.newaxis, :] for idx in pairs_ant_index
    ])
    ana_spans = np.tile(spans, (pairs_length, 1))
    ana_words = np.tile(words, (pairs_length, 1))
    ant_spans = torch.from_numpy(ant_spans).float()
    ant_words = torch.from_numpy(ant_words)
    ana_spans = torch.from_numpy(ana_spans).float()
    ana_words = torch.from_numpy(ana_words)
    pairs_features = torch.from_numpy(pairs_features).float()

    # Labels for the P pairs followed by the "new cluster" slot.
    labels_stack = np.concatenate((pairs_labels, label), axis=0)
    assert labels_stack.shape == (pairs_length + 1, )
    labels = torch.from_numpy(labels_stack).float()

    spans = torch.from_numpy(spans).float()
    words = torch.from_numpy(words)
    features = torch.from_numpy(features).float()
    inputs = (spans, words, features, ant_spans, ant_words, ana_spans,
              ana_words, pairs_features)
    if self.no_targets:
        return inputs

    # Slack-rescaled costs: wrong-link / false-new when anaphoric,
    # false-link when the mention truly starts a new cluster.
    if label == 0:
        costs = np.concatenate(
            (self.costs['WL'] * (1 - pairs_labels),
             [self.costs['FN']]))  # Inverse labels: 1=>0, 0=>1
    else:
        costs = np.concatenate(
            (self.costs['FL'] * np.ones_like(pairs_labels), [0]))
    assert costs.shape == (pairs_length + 1, )
    costs = torch.from_numpy(costs).float()

    # Indices of true antecedents, edge-padded to length P+1 so every
    # example has a fixed-size target.
    true_ants_unpad = np.flatnonzero(labels_stack)
    if len(true_ants_unpad) == 0:
        raise ValueError("Error: no True antecedent for mention")
    true_ants = np.pad(true_ants_unpad,
                       (0, len(pairs_labels) + 1 - len(true_ants_unpad)),
                       'edge')
    assert true_ants.shape == (pairs_length + 1, )
    true_ants = torch.from_numpy(true_ants).long()

    # Same for false antecedents (must exist since labels can't all be 1).
    false_ants_unpad = np.flatnonzero(1 - labels_stack)
    assert len(false_ants_unpad) != 0
    false_ants = np.pad(false_ants_unpad,
                        (0, len(pairs_labels) + 1 - len(false_ants_unpad)),
                        'edge')
    assert false_ants.shape == (pairs_length + 1, )
    false_ants = torch.from_numpy(false_ants).long()

    targets = (labels, costs, true_ants, false_ants)
    if debug:
        print("Mention", mention_idx)
        print("inputs shapes: ", [a.size() for a in inputs])
        print("targets shapes: ", [a.size() for a in targets])
    return inputs, targets