def test_advanced_inputs(self):
    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    config.use_cache = False
    inputs_dict["input_ids"][:, -2:] = config.pad_token_id
    decoder_input_ids, decoder_attn_mask, causal_mask = _prepare_bart_decoder_inputs(
        config, inputs_dict["input_ids"]
    )
    model = BartModel(config).to(torch_device).eval()

    decoder_features_with_created_mask = model(**inputs_dict)[0]
    decoder_features_with_passed_mask = model(
        decoder_attention_mask=invert_mask(decoder_attn_mask),
        decoder_input_ids=decoder_input_ids,
        **inputs_dict,
    )[0]
    _assert_tensors_equal(decoder_features_with_passed_mask, decoder_features_with_created_mask)

    useless_mask = torch.zeros_like(decoder_attn_mask)
    decoder_features = model(decoder_attention_mask=useless_mask, **inputs_dict)[0]
    self.assertTrue(isinstance(decoder_features, torch.Tensor))  # no hidden states or attentions
    self.assertEqual(
        decoder_features.size(),
        (self.model_tester.batch_size, self.model_tester.seq_length, config.d_model),
    )
    if decoder_attn_mask.min().item() < -1e3:  # some tokens were masked
        self.assertFalse((decoder_features_with_created_mask == decoder_features).all().item())

    # Test different encoder attention masks
    decoder_features_with_long_encoder_mask = model(
        inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"].long()
    )[0]
    _assert_tensors_equal(decoder_features_with_long_encoder_mask, decoder_features_with_created_mask)
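# A minimal sketch of the helpers the test above leans on. These are assumptions
# modelled on the corresponding utilities in transformers' modeling_bart.py and its
# test suite, not this repository's exact definitions.
import torch


def invert_mask(attention_mask):
    """Turn a 1-for-keep / 0-for-pad mask into the boolean padding mask
    (True where a position should be ignored) that the encoder/decoder expect."""
    assert attention_mask.dim() == 2
    return attention_mask.eq(0)


def _assert_tensors_equal(a, b, atol=1e-12):
    """Fail loudly if two tensors differ beyond a small tolerance."""
    assert torch.allclose(a, b, atol=atol), f"max diff: {(a - b).abs().max().item()}"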
def forward(self, input_ids, attention_mask=None, output_attentions=False,
            output_hidden_states=False, return_tuple=False, visual=None):
    # check attention mask and invert
    if attention_mask is not None:
        attention_mask = invert_mask(attention_mask)

    inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
    if self.visual is not None:
        # prepend the precomputed visual features to the token embeddings and feed
        # zero-filled placeholder ids for the prefix to the position embeddings
        visual = self.visual
        inputs_embeds = torch.cat([visual, inputs_embeds], dim=1)
        visual_zeros = torch.zeros(
            (visual.size(0), visual.size(1)), dtype=input_ids.dtype, device=input_ids.device
        )
        embed_pos = self.embed_positions(torch.cat([visual_zeros, input_ids], dim=1))
    else:
        # no visual prefix: standard text-only position embeddings
        embed_pos = self.embed_positions(input_ids)
    x = inputs_embeds + embed_pos
    x = self.layernorm_embedding(x)
    x = F.dropout(x, p=self.dropout, training=self.training)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    encoder_states = [] if output_hidden_states else None
    all_attentions = () if output_attentions else None
    for encoder_layer in self.layers:
        if output_hidden_states:
            encoder_states.append(x)
        # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
        dropout_probability = random.uniform(0, 1)
        if self.training and (dropout_probability < self.layerdrop):  # skip the layer
            attn = None
        else:
            x, attn = encoder_layer(x, attention_mask, output_attentions=output_attentions)

        if output_attentions:
            all_attentions = all_attentions + (attn,)

    if self.layer_norm:
        x = self.layer_norm(x)
    if output_hidden_states:
        encoder_states.append(x)
        # T x B x C -> B x T x C
        encoder_states = tuple(hidden_state.transpose(0, 1) for hidden_state in encoder_states)

    # T x B x C -> B x T x C
    x = x.transpose(0, 1)

    if return_tuple:
        return tuple(v for v in [x, encoder_states, all_attentions] if v is not None)
    return BaseModelOutput(last_hidden_state=x, hidden_states=encoder_states, attentions=all_attentions)
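# A standalone shape sketch of the visual-prefix trick used in the encoder above:
# precomputed image features are prepended to the token embeddings, and zero-filled
# placeholder ids of the same length are prepended to input_ids so that
# embed_positions sees the full (visual + text) sequence. All sizes below are made
# up for illustration only.
import torch

batch, num_visual, src_len, d_model = 2, 4, 7, 16
visual = torch.randn(batch, num_visual, d_model)       # assumed image-feature shape
token_embeds = torch.randn(batch, src_len, d_model)    # stands in for embed_tokens(input_ids) * embed_scale
input_ids = torch.randint(5, 100, (batch, src_len))

inputs_embeds = torch.cat([visual, token_embeds], dim=1)              # (batch, num_visual + src_len, d_model)
visual_zeros = torch.zeros(batch, num_visual, dtype=input_ids.dtype)  # placeholder ids for the prefix
position_input = torch.cat([visual_zeros, input_ids], dim=1)          # what embed_positions receives

assert inputs_embeds.shape == (batch, num_visual + src_len, d_model)
assert position_input.shape == (batch, num_visual + src_len)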
def forward(self, input_ids, encoder_hidden_states, encoder_padding_mask,
            decoder_padding_mask, decoder_causal_mask, decoder_cached_states=None,
            use_cache=False, **unused):
    """
    Includes several features from "Jointly Learning to Align and Translate with
    Transformer Models" (Garg et al., EMNLP 2019).

    Args:
        input_ids (LongTensor): previous decoder outputs of shape
            `(batch, tgt_len)`, for teacher forcing
        encoder_hidden_states: output from the encoder, used for
            encoder-side attention
        encoder_padding_mask: for ignoring pad tokens
        decoder_cached_states (list of per-layer state dicts or None): cached
            states used to speed up incremental generation

    Returns:
        tuple:
            - the decoder's features of shape `(batch, tgt_len, embed_dim)`
            - the next cache (or None when `use_cache` is False)
            - hidden states
            - self-attentions
            - cross-attentions
    """
    # check attention mask and invert
    if encoder_padding_mask is not None:
        encoder_padding_mask = invert_mask(encoder_padding_mask)

    # embed positions
    positions = self.embed_positions(input_ids, use_cache=use_cache)

    if use_cache:
        # during generation only the newest token is fed through the layers
        input_ids = input_ids[:, -1:]
        positions = positions[:, -1:]  # happens after we embed them
        # assert input_ids.ne(self.padding_idx).any()

    x = self.embed_tokens(input_ids) * self.embed_scale
    x += positions
    x = self.layernorm_embedding(x)
    x = F.dropout(x, p=self.dropout, training=self.training)

    # (BS, seq_len, model_dim) -> (seq_len, BS, model_dim): the layers expect time-first tensors
    x = x.transpose(0, 1)
    encoder_hidden_states = encoder_hidden_states.transpose(0, 1)

    # decoder layers
    all_hidden_states = ()
    all_self_attns = ()
    all_cross_attns = ()
    next_decoder_cache = []
    for idx, decoder_layer in enumerate(self.layers):
        if self.output_hidden_states:
            all_hidden_states += (x,)
        # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
        dropout_probability = random.uniform(0, 1)
        if self.training and (dropout_probability < self.layerdrop):
            continue

        layer_state = decoder_cached_states[idx] if decoder_cached_states is not None else None

        x, layer_self_attn, layer_cross_attention, layer_past = decoder_layer(
            x,
            encoder_hidden_states,
            encoder_attn_mask=encoder_padding_mask,
            decoder_padding_mask=decoder_padding_mask,
            layer_state=layer_state,
            causal_mask=decoder_causal_mask,
        )

        if use_cache:
            next_decoder_cache.append(layer_past.copy())

        if self.layer_norm and (idx == len(self.layers) - 1):  # last layer of mbart
            x = self.layer_norm(x)
        if self.output_attentions:
            all_self_attns += (layer_self_attn,)
            all_cross_attns += (layer_cross_attention,)

    # Convert back to the standard output format: (seq_len, BS, model_dim) -> (BS, seq_len, model_dim)
    all_hidden_states = [hidden_state.transpose(0, 1) for hidden_state in all_hidden_states]
    x = x.transpose(0, 1)
    encoder_hidden_states = encoder_hidden_states.transpose(0, 1)

    if use_cache:
        next_cache = ((encoder_hidden_states, encoder_padding_mask), next_decoder_cache)
    else:
        next_cache = None
    return x, next_cache, all_hidden_states, list(all_self_attns), list(all_cross_attns)
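# A minimal sketch of the additive causal mask that `decoder_causal_mask` is expected
# to carry (an assumption based on the standard BART implementation this decoder
# follows): entry (i, j) is -inf for j > i, so each position can only attend to itself
# and earlier positions.
import torch

tgt_len = 5
causal_mask = torch.triu(torch.full((tgt_len, tgt_len), float("-inf")), diagonal=1)
# row i holds zeros up to column i and -inf afterwards; it is added to the
# self-attention scores before the softmax.
print(causal_mask)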