def test_loading_from_pretrained_weights_using_model_name(
        pretrained_name, relevant_module):
    """
    Load a `T5Attention` from pretrained weights and check that its output
    matches the corresponding Hugging Face sub-module on the same input.
    """
    seed = 1234

    torch.manual_seed(seed)
    module = T5Attention.from_pretrained_module(
        pretrained_name, relevant_module=relevant_module)

    torch.manual_seed(seed)
    hf_model = AutoModel.from_pretrained(pretrained_name)
    pretrained_module = dict(hf_model.named_modules())[relevant_module]

    batch_size, seq_len = 2, 3
    dim = module.query.in_features
    hidden_states = torch.randn(batch_size, seq_len, dim)
    # Shape (batch_size, 1, 1, seq_len) after inserting the two singleton axes.
    attention_mask = torch.tensor([[1, 1, 0], [1, 0, 1]])[:, None, None, :]

    # Eval mode, so that dropout does not make the comparison non-deterministic.
    module = module.eval()
    pretrained_module = pretrained_module.eval()

    torch.manual_seed(seed)
    output = module(hidden_states,
                    mask=attention_mask.squeeze()).hidden_states

    # The Hugging Face module is given an additive mask instead: positions
    # whose mask entry is not 1 carry `min_value_of_dtype`, the rest carry 0.
    additive_mask = (~(attention_mask == 1)) * min_value_of_dtype(
        hidden_states.dtype)

    torch.manual_seed(seed)
    hf_output = pretrained_module(hidden_states, mask=additive_mask)[0]

    assert torch.allclose(output, hf_output)
def replace_masked_values_with_big_negative_number(x: torch.Tensor,
                                                   mask: torch.Tensor):
    """
    Replace the masked values in a tensor with something really negative so
    that they won't affect a max operation.

    The fill value is `min_value_of_dtype(x.dtype)`; which entries count as
    masked is decided by `replace_masked_values`.
    """
    fill_value = min_value_of_dtype(x.dtype)
    return replace_masked_values(x, mask, fill_value)
def __init__(self,
             k: int = 1,
             temperature: float = 1.0,
             filter_val: float = -float("inf")):
    """
    Store the sampler settings.

    # Parameters

    k : `int`, optional (default = `1`)
        Must be at least 1 (checked with an assertion).
    temperature : `float`, optional (default = `1.0`)
        Any falsy value (e.g. `0` or `None`) is replaced by `1.0`.
    filter_val : `float`, optional
        Accepted for interface compatibility but not read; the stored
        `self.filter_val` is always `min_value_of_dtype(torch.float)`.
    """
    assert k >= 1, "k must be >= 1"
    self.k = k
    self.temperature = temperature if temperature else 1.0
    self.filter_val = min_value_of_dtype(torch.float)
def __init__(self,
             p: float = 0.9,
             temperature: float = 1.0,
             filter_val: float = -float("inf")):
    """
    Store the sampler settings.

    # Parameters

    p : `float`, optional (default = `0.9`)
        Must be no greater than 1.0 (checked with an assertion).
    temperature : `float`, optional (default = `1.0`)
        Any falsy value (e.g. `0` or `None`) is replaced by `1.0`.
    filter_val : `float`, optional
        Accepted for interface compatibility but not read; the stored
        `self.filter_val` is always `min_value_of_dtype(torch.float)`.
    """
    # The message used to read "p must be <= 0", contradicting the check.
    assert p <= 1.0, "p must be <= 1.0"
    self.p = p
    self.temperature = temperature or 1.0
    self.filter_val = min_value_of_dtype(torch.float)
def sample_nodes(
        self, log_probs: torch.Tensor, per_node_beam_size: int,
        state: StateType) -> Tuple[torch.Tensor, torch.Tensor, StateType]:
    """
    Sample `per_node_beam_size` classes per row from the smallest
    high-probability prefix whose cumulative probability reaches `self.p`.

    # Returns

    A tuple `(selected log probabilities, selected class indices, state)`.
    The log probabilities are gathered from the *unscaled* `log_probs`, and
    `state` is returned unchanged.

    # Raises

    ValueError
        If `per_node_beam_size` exceeds `log_probs.size()[1]`.
    """
    if per_node_beam_size > log_probs.size()[1]:
        raise ValueError(
            "per_node_beam_size cannot be greater than vocabulary size")

    # Temperature scaling is skipped entirely when the temperature is 1.
    scaled_log_probs = (
        log_probs if self.temperature == 1.0 else
        torch.nn.functional.log_softmax(log_probs / self.temperature, dim=-1))

    # Sort descending so the running sum grows from the likeliest class down.
    sorted_log_probs, sort_order = torch.sort(scaled_log_probs,
                                              descending=True)

    # shape: (batch_size, num_classes)
    cumulative_probs = torch.cumsum(sorted_log_probs.exp(), dim=-1)

    # True marks classes to drop. The mask is shifted right by one so that
    # the first class at which the running sum reaches `p` is still kept.
    # shape: (batch_size, num_classes)
    drop = cumulative_probs >= self.p
    drop[..., 1:] = drop[..., :-1].clone()
    drop[..., 0] = False

    # Without replacement we need at least `per_node_beam_size` candidates.
    if not self.with_replacement:
        drop[..., :per_node_beam_size] = False

    sorted_log_probs[drop] = min_value_of_dtype(log_probs.dtype)

    # Re-normalize over the classes that were kept.
    # shape: (batch_size, num_classes)
    renormalized = torch.nn.functional.softmax(sorted_log_probs, dim=-1)

    # NOTE: these index into the *sorted* tensor, not into `log_probs`.
    # shape: (batch_size, per_node_beam_size)
    picks = torch.multinomial(renormalized,
                              per_node_beam_size,
                              replacement=self.with_replacement)

    # Map the sorted positions back to indices in the original `log_probs`.
    # shape: (batch_size, per_node_beam_size)
    selected_indices = sort_order.gather(-1, picks)

    return torch.gather(log_probs, 1,
                        selected_indices), selected_indices, state
def test_loading_from_pretrained_weights_using_model_name(self, pretrained_name):
    """
    Load a `SelfAttention` by model name and check that both its parameters
    and its output match the first Hugging Face self-attention layer.
    """
    is_distilbert = "distilbert" in pretrained_name

    torch.manual_seed(1234)
    pretrained = cached_transformers.get(pretrained_name, False)
    encoder = pretrained.transformer if is_distilbert else pretrained.encoder

    # Hacky way to get a single layer: stop on the second item yielded by
    # `modules()` and keep whatever the loop variable holds at that point.
    for position, pretrained_module in enumerate(encoder.layer.modules()):
        if position == 1:
            break

    # Drill down to the self-attention sub-module of that layer.
    if is_distilbert:
        pretrained_module = pretrained_module.attention
    else:
        pretrained_module = pretrained_module.attention.self

    torch.manual_seed(1234)
    module = SelfAttention.from_pretrained_module(pretrained_name)

    # Invert the default mapping (swap keys and values) before comparing.
    default_mapping = module._construct_default_mapping(
        pretrained_module, "huggingface", {}
    )
    mapping = {val: key for key, val in default_mapping.items()}
    assert_equal_parameters(pretrained_module, module, mapping=mapping)

    batch_size, seq_len = 2, 3
    dim = module.query.in_features
    hidden_states = torch.randn(batch_size, seq_len, dim)
    attention_mask = torch.randint(0, 2, (batch_size, 1, 1, seq_len))

    # Eval mode, so that dropout does not make the comparison non-deterministic.
    module = module.eval()
    pretrained_module = pretrained_module.eval()

    torch.manual_seed(1234)
    output = module.forward(hidden_states, attention_mask=attention_mask.squeeze())[0]

    if is_distilbert:
        torch.manual_seed(1234)
        hf_output = pretrained_module.forward(
            hidden_states, hidden_states, hidden_states, mask=attention_mask
        )[0]
    else:
        # The attention mask is processed outside the self-attention module
        # in HF bert models, so the additive form is built here.
        attention_mask = (~(attention_mask == 1)) * min_value_of_dtype(hidden_states.dtype)
        torch.manual_seed(1234)
        hf_output = pretrained_module.forward(hidden_states, attention_mask=attention_mask)[0]

    assert torch.allclose(output, hf_output)
def attention(query, key, value, mask=None, dropout=None):
    """
    Compute 'Scaled Dot Product Attention'.

    Scores are `query @ key^T / sqrt(d_k)` with `d_k = query.size(-1)`.
    Positions where `mask == 0` are filled with the minimum value of the
    score dtype before the softmax. `dropout`, when given, is applied to the
    attention weights.

    Returns `(weights @ value, weights)`, where `weights` are the
    post-dropout attention weights.
    """
    scale = math.sqrt(query.size(-1))
    scores = torch.matmul(query, key.transpose(-2, -1)) / scale

    if mask is not None:
        masked_out = mask == 0
        scores = scores.masked_fill(
            masked_out, ai2_util.min_value_of_dtype(scores.dtype))

    weights = F.softmax(scores, dim=-1)
    if dropout is not None:
        weights = dropout(weights)

    return torch.matmul(weights, value), weights
def attention(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    mask: torch.BoolTensor = None,
    dropout: Callable = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Compute 'Scaled Dot Product Attention'.

    # Parameters

    query, key, value : `torch.Tensor`
        Scores are `query @ key^T / sqrt(query.size(-1))`.
    mask : `torch.BoolTensor`, optional (default = `None`)
        Positions where the mask is `False` are filled with the minimum
        value of the score dtype before the softmax.
    dropout : `Callable`, optional (default = `None`)
        Applied to the attention weights when given.

    # Returns

    `(weights @ value, weights)`, with `weights` taken after dropout.
    """
    key_t = key.transpose(-2, -1)
    scores = torch.matmul(query, key_t) / math.sqrt(query.size(-1))

    if mask is not None:
        hidden = ~mask
        scores = scores.masked_fill(hidden,
                                    util.min_value_of_dtype(scores.dtype))

    attn_weights = F.softmax(scores, dim=-1)
    if dropout is not None:
        attn_weights = dropout(attn_weights)

    attended = torch.matmul(attn_weights, value)
    return attended, attn_weights
def attention_with_relations(query, key, value, relation_k, relation_v, mask=None, dropout=None):
    """
    Compute 'Scaled Dot Product Attention' with relation-aware logits and values.

    Logits come from `relative_attention_logits(query, key, relation_k)`.
    Positions where `mask == 0` are filled with the minimum value of the
    logit dtype before the softmax. `dropout`, when given, is applied to the
    attention weights used to combine the values.

    Returns `(relative_attention_values(weights, value, relation_v),
    pre_dropout_weights)`; the second element is always the softmax output
    before dropout.
    """
    scores = relative_attention_logits(query, key, relation_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, ai2_util.min_value_of_dtype(scores.dtype))
    p_attn_orig = F.softmax(scores, dim=-1)
    # Previously `p_attn` was only bound inside `if dropout is not None`, so
    # calling with the default `dropout=None` raised UnboundLocalError.
    p_attn = p_attn_orig if dropout is None else dropout(p_attn_orig)
    return relative_attention_values(p_attn, value, relation_v), p_attn_orig
def test_repeated_ngram_blocking_constraint_apply(self):
    """
    Apply the constraint to a hand-built state and check exactly which
    `(batch, beam, class)` positions were set to the dtype's minimum value.
    """
    ngram_size, batch_size, beam_size, num_classes = 3, 2, 2, 10
    constraint = RepeatedNGramBlockingConstraint(ngram_size)

    # One entry per (batch, beam): the current prefix and the n-grams seen so far.
    state = [
        [
            {"current_prefix": [0, 1], "seen_ngrams": {}},
            {"current_prefix": [2, 3], "seen_ngrams": {(2, 3): [4]}},
        ],
        [
            {"current_prefix": [4, 5], "seen_ngrams": {(8, 9): []}},
            {"current_prefix": [6, 7], "seen_ngrams": {(6, 7): [0, 1, 2]}},
        ],
    ]

    log_probabilities = torch.rand(batch_size, beam_size, num_classes)
    # The assertions below rely on `apply` having modified this tensor.
    constraint.apply(state, log_probabilities)

    blocked = log_probabilities == min_value_of_dtype(log_probabilities.dtype)
    disallowed_locations = torch.nonzero(blocked).tolist()

    assert len(disallowed_locations) == 4
    for expected_location in ([0, 1, 4], [1, 1, 0], [1, 1, 1], [1, 1, 2]):
        assert expected_location in disallowed_locations
def _get_best_span_yesno_followup(
    span_start_logits: torch.Tensor,
    span_end_logits: torch.Tensor,
    span_yesno_logits: torch.Tensor,
    span_followup_logits: torch.Tensor,
    max_span_length: int,
) -> torch.Tensor:
    """
    Find, per batch element, the highest-scoring span that is not longer
    than `max_span_length`, plus the yes/no and follow-up predictions read
    at the span's end token.

    # Returns

    A long tensor of shape `(batch_size, 4)` holding
    `[span_start, span_end, yesno_prediction, followup_prediction]`.

    # Raises

    ValueError
        If either span logit tensor is not 2-dimensional.
    """
    if span_start_logits.dim() != 2 or span_end_logits.dim() != 2:
        raise ValueError(
            "Input shapes must be (batch_size, passage_length)")

    batch_size, passage_length = span_start_logits.size()
    lowest_score = util.min_value_of_dtype(span_start_logits.dtype)
    best_word_span = span_start_logits.new_zeros((batch_size, 4),
                                                 dtype=torch.long)

    # The search below runs on the CPU over NumPy copies of the logits.
    start_np = span_start_logits.data.cpu().numpy()
    end_np = span_end_logits.data.cpu().numpy()
    yesno_np = span_yesno_logits.data.cpu().numpy()
    followup_np = span_followup_logits.data.cpu().numpy()

    for batch_idx in range(batch_size):
        best_start = 0
        best_score = lowest_score
        for end_idx in range(passage_length):
            # Track the best start position seen at or before `end_idx`.
            if start_np[batch_idx, best_start] < start_np[batch_idx, end_idx]:
                best_start = end_idx
            candidate = (start_np[batch_idx, best_start] +
                         end_np[batch_idx, end_idx])
            # Accept only improving spans within the length limit.
            if (candidate > best_score
                    and end_idx - best_start <= max_span_length):
                best_word_span[batch_idx, 0] = best_start
                best_word_span[batch_idx, 1] = end_idx
                best_score = candidate

    for batch_idx in range(batch_size):
        end_idx = best_word_span[batch_idx, 1]
        yesno_pred = np.argmax(yesno_np[batch_idx, end_idx])
        followup_pred = np.argmax(followup_np[batch_idx, end_idx])
        best_word_span[batch_idx, 2] = int(yesno_pred)
        best_word_span[batch_idx, 3] = int(followup_pred)

    return best_word_span
def forward(self, hidden_states, context, attention_mask=None, output_attentions=False):
    """
    Attend from `hidden_states` (queries) over `context` (keys and values).

    `attention_mask`, when given, is cast to bool and gets two singleton
    axes inserted at dim 1; positions where it is False are filled with the
    minimum value of the score dtype before the softmax.

    Returns `(context_layer,)`, or `(context_layer, attention_probs)` when
    `output_attentions` is true.
    """
    projections = (
        self.query(hidden_states),
        self.key(context),
        self.value(context),
    )
    query_layer, key_layer, value_layer = (
        self.transpose_for_scores(projected) for projected in projections)

    # Raw attention scores: scaled dot product between queries and keys.
    scale = math.sqrt(self.attention_head_size)
    attention_scores = torch.matmul(query_layer,
                                    key_layer.transpose(-1, -2)) / scale

    if attention_mask is not None:
        attention_mask = attention_mask.bool().unsqueeze(1).unsqueeze(1)
        attention_scores = attention_scores.clone().masked_fill(
            ~attention_mask,
            ai2_util.min_value_of_dtype(attention_scores.dtype))

    # Normalize the attention scores to probabilities.
    attention_probs = nn.Softmax(dim=-1)(attention_scores)

    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attention_probs = self.dropout(attention_probs)

    context_layer = torch.matmul(attention_probs, value_layer)
    context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
    # Collapse the last two axes into a single axis of size `self.head_size`.
    merged_shape = context_layer.size()[:-2] + (self.head_size, )
    context_layer = context_layer.view(*merged_shape)

    if output_attentions:
        return (context_layer, attention_probs)
    return (context_layer, )
def test_loading_from_pretrained_weights_using_model_name(
        pretrained_name, relevant_module):
    """
    Load a `SelfAttention` from pretrained weights and check that its output
    matches the corresponding Hugging Face sub-module on the same input.
    """
    seed = 1234

    torch.manual_seed(seed)
    module = SelfAttention.from_pretrained_module(
        pretrained_name, relevant_module=relevant_module)

    torch.manual_seed(seed)
    hf_modules = dict(
        AutoModel.from_pretrained(pretrained_name).named_modules())
    # Module name will exclude the top-level part (e.g. 'bert.', 'electra.')
    # for some reason, so everything up to the first dot is dropped.
    hf_module_name = relevant_module[relevant_module.index(".") + 1:]
    pretrained_module = hf_modules[hf_module_name]

    batch_size, seq_len = 2, 3
    dim = module.query.in_features
    hidden_states = torch.randn(batch_size, seq_len, dim)
    attention_mask = torch.tensor([[1, 1, 0], [1, 0, 1]])[:, None, None, :]

    # Eval mode, so that dropout does not make the comparison non-deterministic.
    module = module.eval()
    pretrained_module = pretrained_module.eval()

    torch.manual_seed(seed)
    output = module(
        hidden_states,
        attention_mask=attention_mask.squeeze()).hidden_states

    if "distilbert" in pretrained_name:
        torch.manual_seed(seed)
        hf_output = pretrained_module(hidden_states,
                                      hidden_states,
                                      hidden_states,
                                      mask=attention_mask)[0]
    else:
        # The attention mask is processed outside the self-attention module
        # in HF bert models, so the additive form is built here.
        attention_mask = (~(attention_mask == 1)) * min_value_of_dtype(
            hidden_states.dtype)
        torch.manual_seed(seed)
        hf_output = pretrained_module(hidden_states,
                                      attention_mask=attention_mask)[0]

    assert torch.allclose(output, hf_output)
def test_forward_against_huggingface_output(params_dict):
    """
    Build a `T5Attention` and a Hugging Face T5 attention module under the
    same seed and check that they produce the same hidden states.
    """
    hidden_states = torch.randn(2, 3, 6)
    attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]])

    # Translate our parameter names to the Hugging Face config names.
    hf_config = T5Config(
        d_model=params_dict["hidden_size"],
        d_kv=params_dict["key_value_proj_dim"],
        num_heads=params_dict["num_heads"],
        relative_attention_num_buckets=params_dict[
            "relative_attention_num_buckets"],
        dropout_rate=params_dict["dropout"],
    )

    torch.manual_seed(1234)
    hf_module = HFT5Attention(hf_config, has_relative_attention_bias=False)

    torch.manual_seed(1234)
    params = copy.deepcopy(params_dict)
    # Only for this test, as HF does not normalize.
    params["normalize"] = False
    t5_attention = T5Attention(**params)

    # Eval mode, so that dropout does not make the comparison non-deterministic.
    t5_attention = t5_attention.eval()
    hf_module = hf_module.eval()

    output = t5_attention.forward(hidden_states, mask=attention_mask)

    # Additive mask for the HF module, expanded to shape (2, 2, 3, 3):
    # positions whose mask entry is 0 carry `min_value_of_dtype`.
    masked_positions = (attention_mask == 0).view((2, 1, 1, 3)).expand(
        2, 2, 3, 3)
    attention_mask_hf = masked_positions * min_value_of_dtype(
        hidden_states.dtype)
    hf_output = hf_module.forward(hidden_states, mask=attention_mask_hf)

    assert torch.allclose(output.hidden_states, hf_output[0])
def apply_mask(
    values: torch.FloatTensor,
    mask: Union[torch.BoolTensor, torch.IntTensor, torch.FloatTensor]
) -> torch.FloatTensor:
    """
    Add a large negative number to `values` wherever `mask` is 0.

    # Parameters

    values : `torch.FloatTensor`
        Shape `batch_size x num_attention_heads x source_seq_len x target_seq_len`
    mask : `torch.BoolTensor`
        Shape `batch_size x target_seq_len` OR `batch_size x 1 x 1 x target_seq_len`.
        A 3D mask is also accepted and gets a singleton axis inserted at dim 1.

    # Returns

    `values + (1 - mask) * min_value_of_dtype(values.dtype)`, with the mask
    cast to the dtype of `values`.
    """
    rank = mask.dim()
    if rank == 2:
        # `batch_size x 1 x 1 x target_seq_len`, which is broadcast to
        # `batch_size x num_attention_heads x source_seq_len x target_seq_len`
        mask = mask[:, None, None, :]
    elif rank == 3:
        mask = mask[:, None, :, :]

    keep = mask.to(values.dtype)
    additive_mask = (1.0 - keep) * min_value_of_dtype(values.dtype)
    return values + additive_mask
def replace_masked_values_with_big_negative_number(x: torch.Tensor,
                                                   mask: torch.Tensor):
    """
    Call `replace_masked_values` on `x` with `min_value_of_dtype(x.dtype)`
    as the replacement value.

    `mask.dim()` should be equal to `x.dim()`.
    """
    big_negative = min_value_of_dtype(x.dtype)
    return replace_masked_values(x, mask, big_negative)
def forward(
    self,  # type: ignore
    tokens: TextFieldTensors,
    pos_tags: torch.LongTensor = None,
    metadata: List[Dict[str, Any]] = None,
    arc_tags: torch.LongTensor = None,
) -> Dict[str, torch.Tensor]:
    """
    Score every (head, child) arc and arc tag for a batch of sentences and,
    when gold `arc_tags` are given, compute the loss and update the
    unlabelled F1 metric.

    # Parameters

    tokens : TextFieldTensors, required
        The output of `TextField.as_array()`.
    pos_tags : torch.LongTensor, optional (default = None)
        The output of a `SequenceLabelField` containing POS tags.
        Required when the model has a POS tag embedding.
    metadata : List[Dict[str, Any]], optional (default = None)
        A dictionary of metadata for each batch element which has keys:
            tokens : `List[str]`, required.
                The original string tokens in the sentence.
    arc_tags : torch.LongTensor, optional (default = None)
        A torch tensor representing the sequence of integer indices denoting the parent of every
        word in the dependency parse. Has shape `(batch_size, sequence_length, sequence_length)`.
        Entries equal to -1 are treated as "no edge".

    # Returns

    An output dictionary with keys `arc_probs`, `arc_tag_probs` and `mask`;
    `tokens` when `metadata` is given; and `loss`, `arc_loss`, `tag_loss`
    when `arc_tags` is given.

    # Raises

    ConfigurationError
        If the model has a POS tag embedding but `pos_tags` is None.
    """
    embedded_text_input = self.text_field_embedder(tokens)
    if pos_tags is not None and self._pos_tag_embedding is not None:
        embedded_pos_tags = self._pos_tag_embedding(pos_tags)
        embedded_text_input = torch.cat(
            [embedded_text_input, embedded_pos_tags], -1)
    elif self._pos_tag_embedding is not None:
        raise ConfigurationError(
            "Model uses a POS embedding, but no POS tags were passed.")

    mask = get_text_field_mask(tokens)
    embedded_text_input = self._input_dropout(embedded_text_input)
    encoded_text = self.encoder(embedded_text_input, mask)

    batch_size, _, encoding_dim = encoded_text.size()

    head_sentinel = self._head_sentinel.expand(batch_size, 1, encoding_dim)
    # Concatenate the head sentinel onto the sentence representation.
    encoded_text = torch.cat([head_sentinel, encoded_text], 1)
    # The sentinel position is always unmasked, so prepend a column of ones.
    mask = torch.cat([mask.new_ones(batch_size, 1), mask], 1)
    encoded_text = self._dropout(encoded_text)

    # shape (batch_size, sequence_length, arc_representation_dim)
    head_arc_representation = self._dropout(
        self.head_arc_feedforward(encoded_text))
    child_arc_representation = self._dropout(
        self.child_arc_feedforward(encoded_text))

    # shape (batch_size, sequence_length, tag_representation_dim)
    head_tag_representation = self._dropout(
        self.head_tag_feedforward(encoded_text))
    child_tag_representation = self._dropout(
        self.child_tag_feedforward(encoded_text))
    # shape (batch_size, sequence_length, sequence_length)
    arc_scores = self.arc_attention(head_arc_representation,
                                    child_arc_representation)
    # shape (batch_size, num_tags, sequence_length, sequence_length)
    arc_tag_logits = self.tag_bilinear(head_tag_representation,
                                       child_tag_representation)
    # Switch to (batch_size, sequence_length, sequence_length, num_tags)
    arc_tag_logits = arc_tag_logits.permute(0, 2, 3, 1).contiguous()

    # Since we'll be doing some additions, using the min value will cause underflow
    # (hence the division by 10). The penalty is added along both the row
    # and the column axis, so an arc touching any masked token is penalized.
    minus_mask = ~mask * min_value_of_dtype(arc_scores.dtype) / 10
    arc_scores = arc_scores + minus_mask.unsqueeze(
        2) + minus_mask.unsqueeze(1)

    arc_probs, arc_tag_probs = self._greedy_decode(arc_scores,
                                                   arc_tag_logits, mask)

    output_dict = {
        "arc_probs": arc_probs,
        "arc_tag_probs": arc_tag_probs,
        "mask": mask
    }

    if metadata:
        output_dict["tokens"] = [meta["tokens"] for meta in metadata]

    if arc_tags is not None:
        arc_nll, tag_nll = self._construct_loss(
            arc_scores=arc_scores,
            arc_tag_logits=arc_tag_logits,
            arc_tags=arc_tags,
            mask=mask)
        output_dict["loss"] = arc_nll + tag_nll
        output_dict["arc_loss"] = arc_nll
        output_dict["tag_loss"] = tag_nll

        # Make the arc tags not have negative values anywhere
        # (by default, no edge is indicated with -1).
        arc_indices = (arc_tags != -1).float()
        # A (head, child) pair counts only when both tokens are unmasked.
        tag_mask = mask.unsqueeze(1) & mask.unsqueeze(2)

        one_minus_arc_probs = 1 - arc_probs
        # We stack scores here because the f1 measure expects a
        # distribution, rather than a single value.
        self._unlabelled_f1(
            torch.stack([one_minus_arc_probs, arc_probs], -1), arc_indices,
            tag_mask)

    return output_dict
def replace_masked_values_with_big_negative_number(x: torch.Tensor,
                                                   mask: torch.Tensor):
    """
    Return `replace_masked_values(x, mask, v)` where `v` is
    `min_value_of_dtype(x.dtype)`.
    """
    replacement = min_value_of_dtype(x.dtype)
    return replace_masked_values(x, mask, replacement)
def forward(
    self,  # type: ignore
    tokens: TextFieldTensors,
    lemmas: torch.LongTensor = None,
    upos: torch.LongTensor = None,
    xpos: torch.LongTensor = None,
    feats: torch.LongTensor = None,
    deprels: torch.LongTensor = None,
    heads: torch.LongTensor = None,
    enhanced_tags: torch.LongTensor = None,
    metadata: List[Dict[str, Any]] = None,
) -> Dict[str, torch.Tensor]:
    """
    Embed the tokens plus any available tag features, score every arc and
    arc tag, and, when gold `enhanced_tags` are given, compute the loss and
    update the enhanced attachment score metric.

    # Parameters

    tokens : TextFieldTensors, required
        The output of `TextField.as_array()`.
    lemmas, xpos, deprels : torch.LongTensor, optional (default = None)
        Tag indices. Each one is embedded and concatenated to the text
        embedding only when both the tensor and the matching embedding
        module are present.
    upos : torch.LongTensor, optional (default = None)
        Handled like the tags above, but required when the model has a
        UPOS embedding.
    feats, heads : torch.LongTensor, optional (default = None)
        Unpacked as `(batch, seq_len, max_len)`, with -1 as padding. The
        embeddings along the last axis are averaged over the non-padding
        entries before being concatenated to the text embedding.
    enhanced_tags : torch.LongTensor, optional (default = None)
        A torch tensor representing the sequence of integer indices denoting the parent of every
        word in the dependency parse. Has shape ``(batch_size, sequence_length, sequence_length)``.
    metadata : List[Dict[str, Any]], optional (default = None)
        A dictionary of metadata for each batch element. When given, the keys
        `conllu_metadata`, `ids`, `tokens`, `lemmas`, `upos_tags`,
        `xpos_tags`, `feats`, `head_tags`, `head_indices`,
        `original_to_new_indices` and `misc` are read from every element;
        `multiword_ids` and `multiword_forms` are read where present.
        NOTE(review): when `enhanced_tags` is given, `metadata` is iterated
        unconditionally for `arc_indices`, `arc_tags` and `labeled_arcs`,
        so it must not be None in that case.

    # Returns

    An output dictionary. When `enhanced_tags` is given, this is the
    dictionary returned by `self.make_output_human_readable`.

    # Raises

    ConfigurationError
        If the model has a UPOS embedding but `upos` is None.
    """
    embedded_text_input = self.text_field_embedder(tokens)
    concatenated_input = [embedded_text_input]
    if upos is not None and self._upos_tag_embedding is not None:
        concatenated_input.append(self._upos_tag_embedding(upos))
    elif self._upos_tag_embedding is not None:
        raise ConfigurationError(
            "Model uses a POS embedding, but no POS tags were passed.")

    if lemmas is not None and self._lemma_tag_embedding is not None:
        concatenated_input.append(self._lemma_tag_embedding(lemmas))
    if xpos is not None and self._xpos_tag_embedding is not None:
        concatenated_input.append(self._xpos_tag_embedding(xpos))
    if feats is not None and self._feats_tag_embedding is not None:
        batch_size, sequence_len, max_len = feats.size()
        # shape: (batch, seq_len, max_len)
        feats_mask = (feats != -1).long()
        # Padding ids (-1) become 0 here (presumably so they are valid
        # embedding indices); their vectors are zeroed by the mask below.
        feats = feats * feats_mask
        # tensor corresponding to the number of active components, e.g. morphological features
        number_active_components = feats_mask.sum(-1)
        # a padding token's summed vector will be filled with 0s and when this is divided by 0
        # it will return a NaN so we replaces 0s with 1s in the denominator tensor to avoid this.
        number_active_components[number_active_components == 0] = 1

        feats_embeddings = []
        # shape: (seq_len, max_len)
        for feat_tensor in feats:
            # shape: (seq_len, max_len, emb_dim)
            embedded_feats = self._feats_tag_embedding(feat_tensor)
            feats_embeddings.append(embedded_feats)
        # shape: (batch, seq_len, max_len, emb_dim)
        stacked_feats_tensor = torch.stack(feats_embeddings)
        tag_embedding_dim = stacked_feats_tensor.size(-1)
        # NOTE: `unsqueeze_` modifies `feats_mask` in place.
        feats_mask_expanded = feats_mask.unsqueeze_(-1).expand(
            batch_size, sequence_len, max_len, tag_embedding_dim)
        # shape: (batch, seq_len, max_len, emb_dim)
        masked_feats = stacked_feats_tensor * feats_mask_expanded
        # shape: (batch, seq_len, tag_embedding_dim)
        combined_masked_feats = masked_feats.sum(2)
        expanded_number_active_components = number_active_components.unsqueeze(
            -1).expand(batch_size, sequence_len, tag_embedding_dim)
        # divide the summed feats vectors by the number of non-padded elements
        averaged_feats = combined_masked_feats / expanded_number_active_components
        concatenated_input.append(averaged_feats)

    if deprels is not None and self._head_tag_embedding is not None:
        concatenated_input.append(self._head_tag_embedding(deprels))

    # TODO BASIC TREE
    # Same averaging scheme as for `feats` above, applied to `heads`.
    if heads is not None and self._head_information_embedding is not None:
        batch_size, sequence_len, max_len = heads.size()
        # shape: (batch, seq_len, max_len)
        head_information_mask = (heads != -1).long()
        heads = heads * head_information_mask
        # tensor corresponding to the number of active components, e.g. morphological features
        number_active_components = head_information_mask.sum(-1)
        # a padding token's summed vector will be filled with 0s and when this is divided by 0
        # it will return a NaN so we replaces 0s with 1s in the denominator tensor to avoid this.
        number_active_components[number_active_components == 0] = 1

        head_information_embeddings = []
        # shape: (seq_len, max_len)
        for head_information_tensor in heads:
            # shape: (seq_len, max_len, emb_dim)
            embedded_head_information = self._head_information_embedding(
                head_information_tensor)
            head_information_embeddings.append(embedded_head_information)
        # shape: (batch, seq_len, max_len, emb_dim)
        stacked_head_information_tensor = torch.stack(
            head_information_embeddings)
        tag_embedding_dim = stacked_head_information_tensor.size(-1)
        # NOTE: `unsqueeze_` modifies `head_information_mask` in place.
        head_information_mask_expanded = head_information_mask.unsqueeze_(
            -1).expand(batch_size, sequence_len, max_len,
                       tag_embedding_dim)
        # shape: (batch, seq_len, max_len, emb_dim)
        masked_head_information = stacked_head_information_tensor * head_information_mask_expanded
        # shape: (batch, seq_len, tag_embedding_dim)
        combined_masked_head_information = masked_head_information.sum(2)
        expanded_number_active_components = number_active_components.unsqueeze(
            -1).expand(batch_size, sequence_len, tag_embedding_dim)
        # divide the summed head information vectors by the number of non-padded elements
        averaged_head_information = combined_masked_head_information / expanded_number_active_components
        concatenated_input.append(averaged_head_information)

    # Only concatenate when at least one tag feature was added.
    if len(concatenated_input) > 1:
        embedded_text_input = torch.cat(concatenated_input, -1)

    mask = get_text_field_mask(tokens)
    embedded_text_input = self._input_dropout(embedded_text_input)
    encoded_text = self.encoder(embedded_text_input, mask)

    batch_size, _, encoding_dim = encoded_text.size()

    head_sentinel = self._head_sentinel.expand(batch_size, 1, encoding_dim)
    # Concatenate the head sentinel onto the sentence representation.
    encoded_text = torch.cat([head_sentinel, encoded_text], 1)
    # The sentinel position is always unmasked, so prepend a column of ones.
    mask = torch.cat([mask.new_ones(batch_size, 1), mask], 1)
    encoded_text = self._dropout(encoded_text)

    # shape (batch_size, sequence_length, arc_representation_dim)
    head_arc_representation = self._dropout(
        self.head_arc_feedforward(encoded_text))
    child_arc_representation = self._dropout(
        self.child_arc_feedforward(encoded_text))

    # shape (batch_size, sequence_length, tag_representation_dim)
    head_tag_representation = self._dropout(
        self.head_tag_feedforward(encoded_text))
    child_tag_representation = self._dropout(
        self.child_tag_feedforward(encoded_text))
    # shape (batch_size, sequence_length, sequence_length)
    arc_scores = self.arc_attention(head_arc_representation,
                                    child_arc_representation)
    # shape (batch_size, num_tags, sequence_length, sequence_length)
    arc_tag_logits = self.tag_bilinear(head_tag_representation,
                                       child_tag_representation)
    # Switch to (batch_size, sequence_length, sequence_length, num_tags)
    arc_tag_logits = arc_tag_logits.permute(0, 2, 3, 1).contiguous()

    # Since we'll be doing some additions, using the min value will cause underflow
    # (hence the division by 10). The penalty is added along both the row
    # and the column axis, so an arc touching any masked token is penalized.
    minus_mask = ~mask * min_value_of_dtype(arc_scores.dtype) / 10
    arc_scores = arc_scores + minus_mask.unsqueeze(
        2) + minus_mask.unsqueeze(1)

    arc_probs, arc_tag_probs = self._greedy_decode(arc_scores,
                                                   arc_tag_logits, mask)

    output_dict = {
        "arc_probs": arc_probs,
        "arc_tag_probs": arc_tag_probs,
        "mask": mask
    }

    if metadata:
        output_dict["conllu_metadata"] = [
            meta["conllu_metadata"] for meta in metadata
        ]
        output_dict["ids"] = [meta["ids"] for meta in metadata]
        output_dict["tokens"] = [meta["tokens"] for meta in metadata]
        output_dict["lemmas"] = [meta["lemmas"] for meta in metadata]
        output_dict["upos"] = [meta["upos_tags"] for meta in metadata]
        output_dict["xpos"] = [meta["xpos_tags"] for meta in metadata]
        output_dict["feats"] = [meta["feats"] for meta in metadata]
        output_dict["head_tags"] = [meta["head_tags"] for meta in metadata]
        output_dict["head_indices"] = [
            meta["head_indices"] for meta in metadata
        ]
        output_dict["original_to_new_indices"] = [
            meta["original_to_new_indices"] for meta in metadata
        ]
        output_dict["misc"] = [meta["misc"] for meta in metadata]
        # The multiword keys are optional: elements lacking them are skipped,
        # so these two lists may be shorter than the batch.
        output_dict["multiword_ids"] = [
            x["multiword_ids"] for x in metadata if "multiword_ids" in x
        ]
        output_dict["multiword_forms"] = [
            x["multiword_forms"] for x in metadata
            if "multiword_forms" in x
        ]

    if enhanced_tags is not None:
        arc_nll, tag_nll = self._construct_loss(
            arc_scores=arc_scores,
            arc_tag_logits=arc_tag_logits,
            enhanced_tags=enhanced_tags,
            mask=mask)
        output_dict["loss"] = arc_nll + tag_nll
        output_dict["arc_loss"] = arc_nll
        output_dict["tag_loss"] = tag_nll

        # get human readable output to computed enhanced graph metrics
        output_dict = self.make_output_human_readable(output_dict)

        # predicted arcs, arc_tags
        predicted_arcs = output_dict["arcs"]
        predicted_arc_tags = output_dict["arc_tags"]
        predicted_labeled_arcs = output_dict["labeled_arcs"]

        # gold arcs, arc_tags
        # NOTE(review): this iterates `metadata` without a None check.
        gold_arcs = [meta["arc_indices"] for meta in metadata]
        gold_arc_tags = [meta["arc_tags"] for meta in metadata]
        gold_labeled_arcs = [meta["labeled_arcs"] for meta in metadata]

        # A (head, child) pair counts only when both tokens are unmasked.
        tag_mask = mask.unsqueeze(1) & mask.unsqueeze(2)
        self._enhanced_attachment_scores(predicted_arcs, predicted_arc_tags, predicted_labeled_arcs, \
                                         gold_arcs, gold_arc_tags, gold_labeled_arcs, tag_mask)

    return output_dict
def forward(
    self,  # type: ignore
    tokens: TextFieldTensors,
    label: torch.LongTensor = None,
) -> Dict[str, torch.Tensor]:
    """
    Embed and encode the tokens, apply self-biattention, integrate, pool in
    four ways (max, min, mean, self-attentive) and classify.

    # Parameters

    tokens : `TextFieldTensors`, required
        The output of `TextField.as_array()`. The `"elmo"` entry, if any, is
        temporarily removed and then restored, so the dictionary is left
        with the same keys it came in with.
    label : `torch.LongTensor`, optional (default = `None`)
        A variable representing the label for each instance in the batch.

    # Returns

    An output dictionary consisting of:
        - `logits` : the output of the final output layer.
        - `class_probabilities` (`torch.FloatTensor`) :
            A tensor of shape `(batch_size, num_classes)` representing a
            distribution over the label classes for each instance.
        - `loss` (`torch.FloatTensor`, optional) :
            A scalar loss to be optimised. Present only when `label` is given.

    # Raises

    ConfigurationError
        If the model was built with ELMo but `tokens` has no `"elmo"` entry.
    """
    text_mask = util.get_text_field_mask(tokens)
    # Pop elmo tokens, since elmo embedder should not be present.
    elmo_tokens = tokens.pop("elmo", None)
    if tokens:
        embedded_text = self._text_field_embedder(tokens)
    else:
        # only using "elmo" for input
        embedded_text = None

    # Add the "elmo" key back to "tokens" if not None, since the tests and the
    # subsequent training epochs rely not being modified during forward()
    if elmo_tokens is not None:
        tokens["elmo"] = elmo_tokens

    # Create ELMo embeddings if applicable
    if self._elmo:
        if elmo_tokens is not None:
            elmo_representations = self._elmo(elmo_tokens["elmo_tokens"])[
                "elmo_representations"
            ]
            # Pop from the end is more performant with list
            # The integrator-output representation is popped first, so it
            # is the last list element whenever it is used.
            if self._use_integrator_output_elmo:
                integrator_output_elmo = elmo_representations.pop()
            if self._use_input_elmo:
                input_elmo = elmo_representations.pop()
            # Every representation produced must have been consumed.
            assert not elmo_representations
        else:
            raise ConfigurationError(
                "Model was built to use Elmo, but input text is not tokenized for Elmo."
            )

    if self._use_input_elmo:
        if embedded_text is not None:
            embedded_text = torch.cat([embedded_text, input_elmo], dim=-1)
        else:
            embedded_text = input_elmo

    dropped_embedded_text = self._embedding_dropout(embedded_text)
    pre_encoded_text = self._pre_encode_feedforward(dropped_embedded_text)
    encoded_tokens = self._encoder(pre_encoded_text, text_mask)

    # Compute biattention. This is a special case since the inputs are the same.
    attention_logits = encoded_tokens.bmm(encoded_tokens.permute(0, 2, 1).contiguous())
    attention_weights = util.masked_softmax(attention_logits, text_mask)
    encoded_text = util.weighted_sum(encoded_tokens, attention_weights)

    # Build the input to the integrator
    integrator_input = torch.cat(
        [encoded_tokens, encoded_tokens - encoded_text, encoded_tokens * encoded_text], 2
    )
    integrated_encodings = self._integrator(integrator_input, text_mask)

    # Concatenate ELMo representations to integrated_encodings if specified
    if self._use_integrator_output_elmo:
        integrated_encodings = torch.cat([integrated_encodings, integrator_output_elmo], dim=-1)

    # Simple Pooling layers
    # For max pooling the replacement value is the dtype's minimum, so the
    # replaced positions cannot win the max.
    max_masked_integrated_encodings = util.replace_masked_values(
        integrated_encodings,
        text_mask.unsqueeze(2),
        util.min_value_of_dtype(integrated_encodings.dtype),
    )
    max_pool = torch.max(max_masked_integrated_encodings, 1)[0]
    # For min pooling the replacement value is the dtype's maximum.
    min_masked_integrated_encodings = util.replace_masked_values(
        integrated_encodings,
        text_mask.unsqueeze(2),
        util.max_value_of_dtype(integrated_encodings.dtype),
    )
    min_pool = torch.min(min_masked_integrated_encodings, 1)[0]
    # NOTE(review): the numerator sums over all positions, masked ones
    # included, while the denominator is the sum of the mask.
    mean_pool = torch.sum(integrated_encodings, 1) / torch.sum(text_mask, 1, keepdim=True)

    # Self-attentive pooling layer
    # Run through linear projection. Shape: (batch_size, sequence length, 1)
    # Then remove the last dimension to get the proper attention shape (batch_size, sequence length).
    self_attentive_logits = self._self_attentive_pooling_projection(
        integrated_encodings
    ).squeeze(2)
    self_weights = util.masked_softmax(self_attentive_logits, text_mask)
    self_attentive_pool = util.weighted_sum(integrated_encodings, self_weights)

    pooled_representations = torch.cat([max_pool, min_pool, mean_pool, self_attentive_pool], 1)
    pooled_representations_dropped = self._integrator_dropout(pooled_representations)

    logits = self._output_layer(pooled_representations_dropped)
    class_probabilities = F.softmax(logits, dim=-1)

    output_dict = {"logits": logits, "class_probabilities": class_probabilities}
    if label is not None:
        loss = self.loss(logits, label)
        for metric in self.metrics.values():
            metric(logits, label)
        output_dict["loss"] = loss

    return output_dict
def forward(self, tokens: torch.Tensor, mask: torch.BoolTensor):
    """
    Run every configured convolution over `tokens`, max-pool each layer's activations
    along the sequence while ignoring windows that overlap masked positions, and
    optionally project the concatenated pooled features.

    # Parameters

    tokens : `torch.Tensor`
        Expected shape `(batch_size, num_tokens, embedding_dim)`.
    mask : `torch.BoolTensor`
        Expected shape `(batch_size, num_tokens)`. When `None`, an all-ones mask is
        created and `tokens` is left untouched.

    # Returns

    The pooled features of shape `(batch_size, num_conv_layers * num_filters)`, passed
    through `self.projection_layer` when that attribute is truthy.
    """
    if mask is None:
        # No mask was given: treat every position as real.
        # shape: (batch_size, num_tokens)
        mask = torch.ones(tokens.shape[0], tokens.shape[1], device=tokens.device).bool()
    else:
        # Zero the embeddings at masked positions before convolving.
        tokens = tokens * mask.unsqueeze(-1)

    # The conv layers consume `(batch_size, in_channels, sequence_length)` with
    # `in_channels == embedding_dim`, so swap the last two axes.
    tokens = tokens.transpose(1, 2)

    batch_size = tokens.size(0)

    # Number of unmasked tokens per sequence.
    # shape: (batch_size, 1)
    unmasked_counts = mask.sum(dim=1).unsqueeze(dim=-1)

    pooled_per_layer = []
    for layer_index in range(len(self._convolution_layers)):
        conv = getattr(self, "conv_layer_{}".format(layer_index))
        kernel_width = conv.kernel_size[0]
        pool_length = tokens.size(2) - kernel_width + 1

        # shape: (batch_size, num_filters, pool_length)
        activations = self._activation(conv(tokens))

        # Window start positions, repeated for each batch element.
        # shape: (batch_size, pool_length)
        window_starts = torch.arange(pool_length, device=activations.device)
        window_starts = window_starts.unsqueeze(0).expand(batch_size, pool_length)

        # A window starting at or after `unmasked_counts - kernel_width + 1` is
        # flagged so that it cannot win the max below.
        # shape: (batch_size, pool_length)
        flagged = window_starts >= (unmasked_counts - kernel_width + 1)
        # shape: (batch_size, num_filters, pool_length)
        flagged = flagged.unsqueeze(1).expand_as(activations)

        # Shift flagged activations by the dtype's minimum value so max pooling
        # skips them.
        # shape: (batch_size, num_filters, pool_length)
        activations = activations + (flagged * min_value_of_dtype(activations.dtype))

        # Max over the sequence axis.
        # shape: (batch_size, num_filters)
        pooled_per_layer.append(activations.max(dim=2)[0])

    # shape: (batch_size, num_filters * num_conv_layers)
    if len(pooled_per_layer) > 1:
        maxpool_output = torch.cat(pooled_per_layer, dim=1)
    else:
        maxpool_output = pooled_per_layer[0]

    # Entries still equal to the dtype minimum came only from flagged windows;
    # reset them to zero.
    sentinel = min_value_of_dtype(maxpool_output.dtype)
    maxpool_output[maxpool_output == sentinel] = 0.0

    if self.projection_layer:
        return self.projection_layer(maxpool_output)
    return maxpool_output
def _compute_coreference_scores(
    self,
    top_span_embeddings: torch.FloatTensor,
    top_antecedent_embeddings: torch.FloatTensor,
    top_partial_coreference_scores: torch.FloatTensor,
    top_antecedent_mask: torch.BoolTensor,
    top_antecedent_offsets: torch.FloatTensor,
) -> torch.FloatTensor:
    """
    Scores every (span, antecedent) pair and prepends a dummy "no antecedent" option
    whose score is always zero. A real antecedent's score is the pairwise antecedent
    score plus the supplied partial score (the unary mention scores of the span and
    of its antecedent). Factoring the score this way lets the model attribute many
    missing links to bad spans, which is what makes pruning in the forward pass
    workable.

    # Parameters

    top_span_embeddings : `torch.FloatTensor`, required.
        Embeddings of the kept spans.
        Shape: (batch_size, num_spans_to_keep, embedding_size)
    top_antecedent_embeddings : `torch.FloatTensor`, required.
        Embeddings of the candidate antecedents of each kept span.
        Shape: (batch_size, num_spans_to_keep, max_antecedents, embedding_size)
    top_partial_coreference_scores : `torch.FloatTensor`, required.
        Span mention score plus antecedent mention score; in the coarse-to-fine
        setting this also contains the coarse bilinear score.
        Shape: (batch_size, num_spans_to_keep, max_antecedents)
    top_antecedent_mask : `torch.BoolTensor`, required.
        Marks which antecedent slots are valid.
        Shape: (batch_size, num_spans_to_keep, max_antecedents)
    top_antecedent_offsets : `torch.FloatTensor`, required.
        Distance from each span to each of its antecedents, counted in considered
        spans (not in words).
        Shape: (batch_size, num_spans_to_keep, max_antecedents)

    # Returns

    coreference_scores : `torch.FloatTensor`
        Unnormalised scores for the dummy option followed by each considered
        antecedent.
        Shape: (batch_size, num_spans_to_keep, max_antecedents + 1)
    """
    # Shape: (batch_size, num_spans_to_keep, max_antecedents, embedding_size)
    pair_embeddings = self._compute_span_pair_embeddings(
        top_span_embeddings, top_antecedent_embeddings, top_antecedent_offsets
    )

    # Shape: (batch_size, num_spans_to_keep, max_antecedents)
    pair_hidden = self._antecedent_feedforward(pair_embeddings)
    antecedent_scores = self._antecedent_scorer(pair_hidden).squeeze(-1)
    antecedent_scores += top_partial_coreference_scores

    # Invalid antecedent slots receive the dtype's minimum value.
    masked_fill_value = util.min_value_of_dtype(antecedent_scores.dtype)
    antecedent_scores = util.replace_masked_values(
        antecedent_scores, top_antecedent_mask, masked_fill_value
    )

    # Zero score for the "no antecedent" choice.
    # Shape: (batch_size, num_spans_to_keep, 1)
    dummy_scores = antecedent_scores.new_zeros(
        antecedent_scores.size(0), antecedent_scores.size(1), 1
    )

    # Shape: (batch_size, num_spans_to_keep, max_antecedents + 1)
    return torch.cat([dummy_scores, antecedent_scores], -1)
def __init__(self, temperature: float = 1.0, filter_val: float = -float("inf")) -> None:
    """
    Record the sampling temperature and the fill value used for filtering.

    # Parameters

    temperature : `float`, optional (default = `1.0`)
        Stored on the instance exactly as passed.
    filter_val : `float`, optional (default = `-inf`)
        Not read by this constructor; `self.filter_val` is always set to
        `min_value_of_dtype(torch.float)` regardless of what is passed here.
    """
    self.temperature = temperature
    # NOTE(review): presumably a finite stand-in for -inf -- confirm that callers
    # do not rely on the `filter_val` argument taking effect.
    finite_fill = min_value_of_dtype(torch.float)
    self.filter_val = finite_fill