class BasicNATDecoder(FairseqDecoder):
    """
    - causal attention without mask
    - diagonal attention mask
    - position-self attention
    """

    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, final_norm=True):
        super().__init__(dictionary)
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed
        self.mask_future = not args.attn_use_future
        self.mask_self = not args.attn_use_self

        input_embed_dim = args.decoder_embed_dim
        embed_dim = args.decoder_embed_dim
        self.no_encoder_attn = no_encoder_attn
        self.output_embed_dim = args.decoder_output_dim
        self.max_target_positions = args.max_target_positions
        self.embed_scale = math.sqrt(embed_dim)
        self.bridge = LengthPredictorBridge(args, dictionary=dictionary,
                                            max_offset=args.bridge_max_offset)
        self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False) \
            if embed_dim != input_embed_dim else None
        self.embed_tokens = None
        self.decoder_layers = None
        self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \
            if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None
        self.normalize = args.decoder_normalize_before and final_norm
        if self.normalize:
            self.layer_norm = LayerNorm(embed_dim)
        self.adaptive_softmax = None
        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                self.output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not args.share_decoder_input_output_embed:
            self.embed_out = OutLinear(self.output_embed_dim, len(dictionary), bias=False,
                                       out_norm=args.out_norm)
        else:
            self.embed_tokens = embed_tokens
        self.register_buffer('version', torch.Tensor([2]))

        # appended for the basic non-autoregressive decoder
        self.use_enc_last = args.use_enc_last
        self._build_inner_layers(args)

    def forward(self, prev_output_tokens, encoder_out=None, **unused):
        x, extra = self.extract_features(prev_output_tokens, encoder_out, **unused)
        x = self.output_layer(x)
        return x, extra

    def extract_features(self, prev_output_tokens, encoder_out=None, **unused):
        """
        Similar to *forward* but only return features.

        Returns:
            tuple:
                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
                - a dictionary with any model-specific outputs:
                  length-predict module output, position-search module output,
                  position-predict module output, and decoder module output
        """
        # length-prediction bridge produces the decoder inputs
        inputs_dict = self._bridging(encoder_out=encoder_out, prev_output_tokens=prev_output_tokens)
        x = inputs_dict['inputs']

        if self.project_in_dim is not None:
            x = self.project_in_dim(x)

        # add positional encodings and dropout
        positions = positional_encodings_like(x)
        if positions is not None:
            x += positions
        x = F.dropout(x, p=self.dropout, training=self.training)

        # the decoder-layer loop lives in self._decoding (a commented-out copy of
        # that loop used to sit here and has been dropped as dead code)
        x, inputs_dict = self._decoding(x, encoder_out, inputs_dict)
        return x, inputs_dict

    def output_layer(self, features, **kwargs):
        """Project features to the vocabulary size."""
        if self.adaptive_softmax is None:
            # project back to size of vocabulary
            if self.share_input_output_embed:
                return self.embed_tokens(features)
            else:
                return self.embed_out(features)
        else:
            return features

    def max_positions(self):
        """Maximum output length supported by the decoder."""
        return self.max_target_positions

    def upgrade_state_dict_named(self, state_dict, name):
        """Upgrade a (possibly old) state dict for new versions of fairseq."""
        for i in range(len(self.decoder_layers)):
            # update layer norms
            layer_norm_map = {
                '0': 'self_attn_layer_norm',
                '1': 'encoder_attn_layer_norm',
                '2': 'final_layer_norm'
            }
            for old, new in layer_norm_map.items():
                for m in ('weight', 'bias'):
                    k = '{}.layers.{}.layer_norms.{}.{}'.format(name, i, old, m)
                    if k in state_dict:
                        state_dict['{}.layers.{}.{}.{}'.format(name, i, new, m)] = state_dict[k]
                        del state_dict[k]
        if utils.item(state_dict.get('{}.version'.format(name), torch.Tensor([1]))[0]) < 2:
            # earlier checkpoints did not normalize after the stack of layers
            self.layer_norm = None
            self.normalize = False
            state_dict['{}.version'.format(name)] = torch.Tensor([1])
        return state_dict

    def get_normalized_probs(self, net_output, log_probs, sample, adaptive_softmax=True):
        """Get normalized probabilities (or log probs) from a net's output."""
        if adaptive_softmax:
            if hasattr(self, 'adaptive_softmax') and self.adaptive_softmax is not None:
                if sample is not None:
                    assert 'target' in sample
                    target = sample['target']
                else:
                    target = None
                out = self.adaptive_softmax.get_log_prob(net_output[0], target=target)
                return out.exp_() if not log_probs else out

        # decide whether net_output already carries the logits directly
        logits = net_output[0] if isinstance(net_output, list) else net_output
        if log_probs:
            return utils.log_softmax(logits, dim=-1, onnx_trace=self.onnx_trace)
        else:
            return utils.softmax(logits, dim=-1, onnx_trace=self.onnx_trace)

    # appended for the non-autoregressive decoder
    def _build_inner_layers(self, args):
        self.decoder_layers = nn.ModuleList([])
        self.decoder_layers.extend([
            BasicNATDecoderLayer(args, args.decoder_pos_attn, self.no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])

    def _bridging(self, prev_output_tokens, encoder_out=None):
        return self.bridge.forward(encoder_out=encoder_out, prev_output_tokens=prev_output_tokens)

    def _decoding(self, x, enc_dict, inputs_dict, **unused):
        # B x T x C -> T x B x C
        x = x.transpose(0, 1)
        attn = None
        inner_states = [x]
        attn_states = [attn]
        self_attn_masks = self._buffered_nat_mask(x)
        self_attn_padding_mask = (1 - inputs_dict[LengthPredictorBridge.DECODE_MASK_KEY]).byte()
        encoder_padding_mask = enc_dict['encoder_padding_mask'] if enc_dict is not None else None
        for idx, layer in enumerate(self.decoder_layers):
            enc = enc_dict['encoder_out'] if self.use_enc_last else enc_dict['encoder_history'][idx + 1]
            x, attn = layer(x,
                            encoder_out=enc,
                            encoder_padding_mask=encoder_padding_mask,
                            self_attn_mask=self_attn_masks,
                            self_attn_padding_mask=self_attn_padding_mask,
                            **unused)
            inner_states.append(x)
            attn_states.append(attn)

        if self.normalize:
            x = self.layer_norm(x)

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        if self.project_out_dim is not None:
            x = self.project_out_dim(x)

        inputs_dict['attn'] = attn
        inputs_dict['inner_states'] = inner_states
        inputs_dict['attn_states'] = attn_states
        return x, inputs_dict

    def _buffered_nat_mask(self, key):
        sequence_length = key.size(0)
        self_masks = key.data.new(sequence_length, sequence_length).fill_(0).float()
        if self.mask_future:
            # keep -inf strictly above the diagonal (the original discarded the
            # result of the out-of-place triu call, leaving the mask all -inf)
            self_masks = self_masks.fill_(float('-inf')).triu(1)
        if self.mask_self:
            diag_masks = torch.eye(sequence_length)
            if key.is_cuda:
                diag_masks = diag_masks.cuda(key.get_device())
            # put -inf on the diagonal; adding float('-inf') * diag_masks would
            # turn the zero off-diagonal entries into NaN
            self_masks = self_masks.masked_fill(diag_masks > 0, float('-inf'))
        return self_masks
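# --- Illustrative sketch (not part of the model above) ------------------------
# The class docstring names three attention variants; a minimal, hedged sketch
# of how the additive self-attention mask in `_buffered_nat_mask` can be built
# for a toy length. `demo_nat_self_attn_mask` and its arguments are hypothetical
# helpers introduced only for illustration and are not used by BasicNATDecoder.
def demo_nat_self_attn_mask(seq_len=4, mask_future=False, mask_self=True):
    import torch

    # start from an all-zero additive mask (nothing blocked)
    mask = torch.zeros(seq_len, seq_len)
    if mask_future:
        # standard causal mask: -inf strictly above the diagonal
        mask = torch.triu(torch.full((seq_len, seq_len), float('-inf')), 1)
    if mask_self:
        # block each position from attending to itself (diagonal -inf)
        mask = mask.masked_fill(torch.eye(seq_len, dtype=torch.bool), float('-inf'))
    return mask

# Example: with mask_future=False and mask_self=True the result is 0 everywhere
# except -inf on the diagonal, i.e. bidirectional attention without self-links.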
class FConvDecoder(FairseqIncrementalDecoder): """Convolutional decoder""" def __init__( self, dictionary, embed_dim=512, embed_dict=None, out_embed_dim=256, max_positions=1024, convolutions=((512, 3),) * 20, attention=True, dropout=0.1, share_embed=False, positional_embeddings=True, adaptive_softmax_cutoff=None, normalization_constant=0.5, left_pad=False, ): super().__init__(dictionary) self._init(dictionary, embed_dim=embed_dim, embed_dict=embed_dict, out_embed_dim=out_embed_dim, max_positions=max_positions, convolutions=convolutions, attention=attention, dropout=dropout, share_embed=share_embed, positional_embeddings=positional_embeddings, adaptive_softmax_cutoff=adaptive_softmax_cutoff, normalization_constant=normalization_constant, left_pad=left_pad,) def _init(self, dictionary, embed_dim=512, embed_dict=None, out_embed_dim=256, max_positions=1024, convolutions=((512, 3),) * 20, attention=True, dropout=0.1, share_embed=False, positional_embeddings=True, adaptive_softmax_cutoff=None, normalization_constant=0.5, left_pad=False,): self.register_buffer('version', torch.Tensor([2])) self.dropout = dropout self.normalization_constant = normalization_constant self.left_pad = left_pad convolutions = extend_conv_spec(convolutions) in_channels = convolutions[0][0] if isinstance(attention, bool): # expand True into [True, True, ...] and do the same with False attention = [attention] * len(convolutions) if not isinstance(attention, list) or len(attention) != len(convolutions): raise ValueError('Attention is expected to be a list of booleans of ' 'length equal to the number of layers.') num_embeddings = len(dictionary) padding_idx = dictionary.pad() self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx) if embed_dict: self.embed_tokens = utils.load_embedding(embed_dict, self.dictionary, self.embed_tokens) self.embed_positions = PositionalEmbedding( max_positions, embed_dim, padding_idx, left_pad=self.left_pad, ) if positional_embeddings else None self.fc1 = Linear(embed_dim, in_channels, dropout=dropout) self.projections = nn.ModuleList() self.convolutions = nn.ModuleList() self.attention = nn.ModuleList() self.residuals = [] layer_in_channels = [in_channels] for i, (out_channels, kernel_size, residual) in enumerate(convolutions): if residual == 0: residual_dim = out_channels else: residual_dim = layer_in_channels[-residual] self.projections.append(Linear(residual_dim, out_channels) if residual_dim != out_channels else None) self.convolutions.append( LinearizedConv1d(in_channels, out_channels * 2, kernel_size, padding=(kernel_size - 1), dropout=dropout) ) self.attention.append(AttentionLayer(out_channels, embed_dim, self.normalization_constant) if attention[i] else None) self.residuals.append(residual) in_channels = out_channels layer_in_channels.append(out_channels) self.adaptive_softmax = None self.fc2 = self.fc3 = None if adaptive_softmax_cutoff is not None: assert not share_embed self.adaptive_softmax = AdaptiveSoftmax(num_embeddings, in_channels, adaptive_softmax_cutoff, dropout=dropout) else: self.fc2 = Linear(in_channels, out_embed_dim) if share_embed: assert out_embed_dim == embed_dim, \ "Shared embed weights implies same dimensions " \ " out_embed_dim={} vs embed_dim={}".format(out_embed_dim, embed_dim) self.fc3 = nn.Linear(out_embed_dim, num_embeddings) self.fc3.weight = self.embed_tokens.weight else: self.fc3 = Linear(out_embed_dim, num_embeddings, dropout=dropout) def forward(self, prev_output_tokens, encoder_out_dict=None, incremental_state=None): if encoder_out_dict is 
not None: encoder_out = encoder_out_dict['encoder_out'] encoder_padding_mask = encoder_out_dict['encoder_padding_mask'] # split and transpose encoder outputs encoder_a, encoder_b = self._split_encoder_out(encoder_out, incremental_state) if self.embed_positions is not None: pos_embed = self.embed_positions(prev_output_tokens, incremental_state) else: pos_embed = 0 if incremental_state is not None: prev_output_tokens = prev_output_tokens[:, -1:] x = self._embed_tokens(prev_output_tokens, incremental_state) # embed tokens and combine with positional embeddings x += pos_embed x = F.dropout(x, p=self.dropout, training=self.training) target_embedding = x # project to size of convolution x = self.fc1(x) # B x T x C -> T x B x C x = self._transpose_if_training(x, incremental_state) # temporal convolutions avg_attn_scores = None num_attn_layers = len(self.attention) residuals = [x] for proj, conv, attention, res_layer in zip(self.projections, self.convolutions, self.attention, self.residuals): if res_layer > 0: residual = residuals[-res_layer] residual = residual if proj is None else proj(residual) else: residual = None x = F.dropout(x, p=self.dropout, training=self.training) x = conv(x, incremental_state) x = F.glu(x, dim=2) # attention if attention is not None: x = self._transpose_if_training(x, incremental_state) x, attn_scores = attention(x, target_embedding, (encoder_a, encoder_b), encoder_padding_mask) attn_scores = attn_scores / num_attn_layers if avg_attn_scores is None: avg_attn_scores = attn_scores else: avg_attn_scores.add_(attn_scores) x = self._transpose_if_training(x, incremental_state) # residual if residual is not None: x = (x + residual) * math.sqrt(self.normalization_constant) residuals.append(x) # T x B x C -> B x T x C x = self._transpose_if_training(x, incremental_state) # project back to size of vocabulary if not using adaptive softmax if self.fc2 is not None and self.fc3 is not None: x = self.fc2(x) x = F.dropout(x, p=self.dropout, training=self.training) x = self.fc3(x) return x, avg_attn_scores def get_normalized_probs(self, net_output, log_probs, sample): """Get normalized probabilities (or log probs) from a net's output.""" if self.adaptive_softmax is not None: assert sample is not None and 'target' in sample out = self.adaptive_softmax.get_log_prob(net_output[0], sample['target']) return out.exp_() if not log_probs else out else: return super().get_normalized_probs(net_output, log_probs, sample) def reorder_incremental_state(self, incremental_state, new_order): super().reorder_incremental_state(incremental_state, new_order) encoder_out = utils.get_incremental_state(self, incremental_state, 'encoder_out') if encoder_out is not None: encoder_out = tuple(eo.index_select(0, new_order) for eo in encoder_out) utils.set_incremental_state(self, incremental_state, 'encoder_out', encoder_out) def max_positions(self): """Maximum output length supported by the decoder.""" return self.embed_positions.max_positions() if self.embed_positions is not None else float('inf') def upgrade_state_dict(self, state_dict): if state_dict.get('decoder.version', torch.Tensor([1]))[0] < 2: # old models use incorrect weight norm dimension for i, conv in enumerate(self.convolutions): # reconfigure weight norm nn.utils.remove_weight_norm(conv) self.convolutions[i] = nn.utils.weight_norm(conv, dim=0) state_dict['decoder.version'] = torch.Tensor([1]) return state_dict def _embed_tokens(self, tokens, incremental_state): if incremental_state is not None: # keep only the last token for
incremental forward pass tokens = tokens[:, -1:] return self.embed_tokens(tokens) def _split_encoder_out(self, encoder_out, incremental_state): """Split and transpose encoder outputs. This is cached when doing incremental inference. """ cached_result = utils.get_incremental_state(self, incremental_state, 'encoder_out') if cached_result is not None: return cached_result # transpose only once to speed up attention layers encoder_a, encoder_b = encoder_out encoder_a = encoder_a.transpose(1, 2).contiguous() result = (encoder_a, encoder_b) if incremental_state is not None: utils.set_incremental_state(self, incremental_state, 'encoder_out', result) return result def _transpose_if_training(self, x, incremental_state): if incremental_state is None: x = x.transpose(0, 1) return x
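# --- Illustrative sketch (not part of the model above) ------------------------
# Each FConvDecoder layer doubles the channel dimension with a convolution and
# then halves it with a gated linear unit, before the scaled residual connection
# `(x + residual) * sqrt(normalization_constant)`. A minimal sketch of that
# per-layer arithmetic on random data; `demo_glu_block` is a hypothetical helper,
# and a plain nn.Conv1d stands in for the LinearizedConv1d used by the decoder.
def demo_glu_block(seq_len=7, channels=8, kernel_size=3):
    import math
    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    x = torch.randn(seq_len, 1, channels)        # T x B x C, as inside the decoder
    conv = nn.Conv1d(channels, channels * 2, kernel_size, padding=kernel_size - 1)

    residual = x
    h = conv(x.permute(1, 2, 0))                 # B x 2C x (T + kernel_size - 1)
    h = h[:, :, :seq_len]                        # keep the causal (left-context) outputs
    h = h.permute(2, 0, 1)                       # back to T x B x 2C
    h = F.glu(h, dim=2)                          # gate: T x B x C
    out = (h + residual) * math.sqrt(0.5)        # scaled residual, as in forward()
    return out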
class transformer_with_copyDecoder(FairseqIncrementalDecoder): """ transformer_with_copy decoder consisting of *args.decoder_layers* layers. Each layer is a :class:`transformer_with_copyDecoderLayer`. Args: args (argparse.Namespace): parsed command-line arguments dictionary (~fairseq.data.Dictionary): decoding dictionary embed_tokens (torch.nn.Embedding): output embedding no_encoder_attn (bool, optional): whether to attend to encoder outputs (default: False). final_norm (bool, optional): apply layer norm to the output of the final decoder layer (default: True). """ def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, final_norm=True): super().__init__(dictionary) self.dropout = args.dropout self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim output_embed_dim = args.decoder_output_dim padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt( embed_dim) # todo: try with input_embed_dim self.project_in_dim = Linear( input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None self.embed_positions = PositionalEmbedding( args.max_target_positions, embed_dim, padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.layers = nn.ModuleList([]) self.layers.extend([ transformer_with_copyDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) self.copy_attention = MultiheadOnlyAttention( embed_dim, 1, dropout=0, ) self.copy_or_generate = nn.Sequential(nn.Linear(embed_dim, 1), nn.Sigmoid()) self.adaptive_softmax = None self.project_out_dim = Linear(embed_dim, output_embed_dim, bias=False) \ if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim**-0.5) self.register_buffer('version', torch.Tensor([2])) self.normalize = args.decoder_normalize_before and final_norm if self.normalize: self.layer_norm = LayerNorm(embed_dim) def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None): """ Args: prev_output_tokens (LongTensor): previous decoder outputs of shape `(batch, tgt_len)`, for input feeding/teacher forcing encoder_out (Tensor, optional): output from the encoder, used for encoder-side attention incremental_state (dict): dictionary used for storing state during :ref:`Incremental decoding` Returns: tuple: - the last decoder layer's output of shape `(batch, tgt_len, vocab)` - the last decoder layer's attention weights of shape `(batch, tgt_len, src_len)` """ # embed positions positions = self.embed_positions( prev_output_tokens, incremental_state=incremental_state, ) if self.embed_positions is not None else None if incremental_state is not None: prev_output_tokens = prev_output_tokens[:, -1:] if positions is not None: positions = positions[:, -1:] # embed tokens and positions x = self.embed_scale * 
self.embed_tokens(prev_output_tokens) if self.project_in_dim is not None: x = self.project_in_dim(x) if positions is not None: x += positions x = F.dropout(x, p=self.dropout, training=self.training) # B x T x C -> T x B x C x = x.transpose(0, 1) inner_states = [x] # decoder layers for layer in self.layers: x, _ = layer( x, encoder_out['encoder_out'] if encoder_out is not None else None, encoder_out['encoder_padding_mask'] if encoder_out is not None else None, incremental_state, self_attn_mask=self.buffered_future_mask(x) if incremental_state is None else None, ) inner_states.append(x) if self.normalize: x = self.layer_norm(x) _, copy = self.copy_attention( query=x, key=encoder_out['encoder_out'] if encoder_out is not None else None, value=encoder_out['encoder_out'] if encoder_out is not None else None, key_padding_mask=encoder_out['encoder_padding_mask'] if encoder_out is not None else None, incremental_state=incremental_state, static_kv=True, need_weights=True, ) copy_or_generate = self.copy_or_generate(x).transpose(0, 1) # T x B x C -> B x T x C x = x.transpose(0, 1) if self.project_out_dim is not None: x = self.project_out_dim(x) if self.adaptive_softmax is None: # project back to size of vocabulary if self.share_input_output_embed: x = F.linear(x, self.embed_tokens.weight) else: x = F.linear(x, self.embed_out) return x, { 'attn': copy, 'inner_states': inner_states, 'copy_or_generate': copy_or_generate } def get_normalized_probs(self, net_output, log_probs, sample): """Get normalized probabilities (or log probs) from a net's output.""" # print('enter normalized.') if 'net_input' in sample.keys(): enc_seq_ids = sample['net_input']['src_tokens'] else: enc_seq_ids = sample['src_tokens'] # wvocab_size = net_output[0].size(2) # batch_size = enc_seq_ids.size(0) # seq_len = enc_seq_ids.size(1) # one_hot = torch.zeros(batch_size, seq_len, wvocab_size).cuda().scatter_(dim=2, index=enc_seq_ids.unsqueeze(-1), value=1) # # copy_probs = torch.matmul(net_output[1]['attn'], one_hot) # final_dist = vocab_dist.scatter_add(1, encoder_batch_extend_vocab, attn_dist) if hasattr(self, 'adaptive_softmax') and self.adaptive_softmax is not None: if sample is not None: assert 'target' in sample target = sample['target'] else: target = None out = self.adaptive_softmax.get_log_prob(net_output[0], target=target) return out.exp_() if not log_probs else out logits = net_output[0] if log_probs: generate = utils.softmax( logits, dim=-1, onnx_trace=self.onnx_trace) * net_output[1]['copy_or_generate'] copy = net_output[1]['attn'] * (1 - net_output[1]['copy_or_generate']) enc_seq_ids = enc_seq_ids.unsqueeze(1).repeat( 1, net_output[1]['attn'].size(1), 1) final = generate.scatter_add(2, enc_seq_ids, copy) final = torch.log(final + 1e-15) return final else: generate = utils.log_softmax( logits, dim=-1, onnx_trace=self.onnx_trace) * net_output[1]['copy_or_generate'] copy = net_output[1]['attn'] * (1 - net_output[1]['copy_or_generate']) enc_seq_ids = enc_seq_ids.unsqueeze(1).repeat( 1, net_output[1]['attn'].size(1), 1) final = generate.scatter_add(2, enc_seq_ids, copy) return final def max_positions(self): """Maximum output length supported by the decoder.""" if self.embed_positions is None: return self.max_target_positions return min(self.max_target_positions, self.embed_positions.max_positions()) def buffered_future_mask(self, tensor): dim = tensor.size(0) if not hasattr( self, '_future_mask' ) or self._future_mask is None or self._future_mask.device != tensor.device: self._future_mask = torch.triu( 
utils.fill_with_neg_inf(tensor.new(dim, dim)), 1) if self._future_mask.size(0) < dim: self._future_mask = torch.triu( utils.fill_with_neg_inf(self._future_mask.resize_(dim, dim)), 1) return self._future_mask[:dim, :dim] def upgrade_state_dict_named(self, state_dict, name): """Upgrade a (possibly old) state dict for new versions of fairseq.""" if isinstance(self.embed_positions, SinusoidalPositionalEmbedding): weights_key = '{}.embed_positions.weights'.format(name) if weights_key in state_dict: del state_dict[weights_key] state_dict['{}.embed_positions._float_tensor'.format( name)] = torch.FloatTensor(1) for i in range(len(self.layers)): # update layer norms layer_norm_map = { '0': 'self_attn_layer_norm', '1': 'encoder_attn_layer_norm', '2': 'final_layer_norm' } for old, new in layer_norm_map.items(): for m in ('weight', 'bias'): k = '{}.layers.{}.layer_norms.{}.{}'.format( name, i, old, m) if k in state_dict: state_dict['{}.layers.{}.{}.{}'.format( name, i, new, m)] = state_dict[k] del state_dict[k] if utils.item( state_dict.get('{}.version'.format(name), torch.Tensor( [1]))[0]) < 2: # earlier checkpoints did not normalize after the stack of layers self.layer_norm = None self.normalize = False state_dict['{}.version'.format(name)] = torch.Tensor([1]) return state_dict
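# --- Illustrative sketch (not part of the model above) ------------------------
# get_normalized_probs above mixes the generation distribution with the copy
# attention by scattering attention mass onto the source token ids, roughly
#   final = p_gen * softmax(logits) + (1 - p_gen) * attn  (added at the src ids),
# where p_gen plays the role of the `copy_or_generate` gate. A minimal sketch of
# that mixing on toy sizes; `demo_copy_mixture` is a hypothetical helper for
# illustration only.
def demo_copy_mixture(batch=2, tgt_len=3, src_len=4, vocab=10):
    import torch
    import torch.nn.functional as F

    logits = torch.randn(batch, tgt_len, vocab)
    attn = F.softmax(torch.randn(batch, tgt_len, src_len), dim=-1)   # copy attention
    p_gen = torch.sigmoid(torch.randn(batch, tgt_len, 1))            # generate gate
    src_tokens = torch.randint(0, vocab, (batch, src_len))

    generate = F.softmax(logits, dim=-1) * p_gen
    copy = attn * (1 - p_gen)
    index = src_tokens.unsqueeze(1).repeat(1, tgt_len, 1)            # B x T_tgt x T_src
    final = generate.scatter_add(2, index, copy)                     # B x T_tgt x V
    return final                                                     # rows sum to ~1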
class TransformerDecoder(nn.Module): """ Transformer decoder consisting of *args.decoder_layers* layers. Each layer is a :class:`TransformerDecoderLayer`. Args: args (argparse.Namespace): parsed command-line arguments dictionary (~fairseq.data.Dictionary): decoding dictionary embed_tokens (torch.nn.Embedding): output embedding no_encoder_attn (bool, optional): whether to attend to encoder outputs (default: False). """ def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): super().__init__() self.register_buffer('version', torch.Tensor([3])) self.dictionary = dictionary self.onnx_trace = False self.dropout = args.dropout self.decoder_layerdrop = args.decoder_layerdrop self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim self.output_embed_dim = args.decoder_output_dim self.padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim) self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None self.embed_positions = PositionalEmbedding( args.max_target_positions, embed_dim, self.padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.cross_self_attention = getattr(args, 'cross_self_attention', False) self.layer_wise_attention = getattr(args, 'layer_wise_attention', False) self.layers = nn.ModuleList([]) self.layers.extend([ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) self.adaptive_softmax = None self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \ if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), self.output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), self.output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5) if args.decoder_normalize_before and not getattr(args, 'no_decoder_final_norm', False): self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None if getattr(args, 'layernorm_embedding', False): self.layernorm_embedding = LayerNorm(embed_dim) else: self.layernorm_embedding = None def forward( self, prev_output_tokens, encoder_out=None, incremental_state=None, features_only=False, **extra_args ): """ Args: prev_output_tokens (LongTensor): previous decoder outputs of shape `(batch, tgt_len)`, for teacher forcing encoder_out (optional): output from the encoder, used for encoder-side attention incremental_state (dict): dictionary used for storing state during :ref:`Incremental decoding` features_only (bool, optional): only return features without applying output layer (default: False). 
Returns: tuple: - the decoder's output of shape `(batch, tgt_len, vocab)` - a dictionary with any model-specific outputs """ x, extra = self.extract_features( prev_output_tokens, encoder_out=encoder_out, incremental_state=incremental_state, **extra_args ) if not features_only: x = self.output_layer(x) return x, extra def extract_features( self, prev_output_tokens, encoder_out=None, incremental_state=None, full_context_alignment=False, alignment_layer=None, alignment_heads=None, **unused, ): """ Similar to *forward* but only return features. Includes several features from "Jointly Learning to Align and Translate with Transformer Models" (Garg et al., EMNLP 2019). Args: full_context_alignment (bool, optional): don't apply auto-regressive mask to self-attention (default: False). alignment_layer (int, optional): return mean alignment over heads at this layer (default: last layer). alignment_heads (int, optional): only average alignment over this many heads (default: all heads). Returns: tuple: - the decoder's features of shape `(batch, tgt_len, embed_dim)` - a dictionary with any model-specific outputs """ if alignment_layer is None: alignment_layer = len(self.layers) - 1 # embed positions positions = self.embed_positions( prev_output_tokens, incremental_state=incremental_state, ) if self.embed_positions is not None else None if incremental_state is not None: prev_output_tokens = prev_output_tokens[:, -1:] if positions is not None: positions = positions[:, -1:] # embed tokens and positions x = self.embed_scale * self.embed_tokens(prev_output_tokens) if self.project_in_dim is not None: x = self.project_in_dim(x) if positions is not None: x += positions if self.layernorm_embedding: x = self.layernorm_embedding(x) x = F.dropout(x, p=self.dropout, training=self.training) # B x T x C -> T x B x C x = x.transpose(0, 1) self_attn_padding_mask = None if self.cross_self_attention or prev_output_tokens.eq(self.padding_idx).any(): self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx) # decoder layers attn = None inner_states = [x] for idx, layer in enumerate(self.layers): encoder_state = None if encoder_out is not None: if self.layer_wise_attention: encoder_state = encoder_out.encoder_states[idx] else: encoder_state = encoder_out.encoder_out if incremental_state is None and not full_context_alignment: self_attn_mask = self.buffered_future_mask(x) else: self_attn_mask = None # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) dropout_probability = random.uniform(0, 1) if not self.training or (dropout_probability > self.decoder_layerdrop): x, layer_attn = layer( x, encoder_state, encoder_out.encoder_padding_mask if encoder_out is not None else None, incremental_state, self_attn_mask=self_attn_mask, self_attn_padding_mask=self_attn_padding_mask, need_attn=(idx == alignment_layer), need_head_weights=(idx == alignment_layer), ) inner_states.append(x) if layer_attn is not None and idx == alignment_layer: attn = layer_attn.float() if attn is not None: if alignment_heads is not None: attn = attn[:alignment_heads] # average probabilities over heads attn = attn.mean(dim=0) if self.layer_norm: x = self.layer_norm(x) # T x B x C -> B x T x C x = x.transpose(0, 1) if self.project_out_dim is not None: x = self.project_out_dim(x) return x, {'attn': attn, 'inner_states': inner_states} def output_layer(self, features, **kwargs): """Project features to the vocabulary size.""" if self.adaptive_softmax is None: # project back to size of vocabulary if self.share_input_output_embed: return 
F.linear(features, self.embed_tokens.weight) else: return F.linear(features, self.embed_out) else: return features def get_normalized_probs(self, net_output, log_probs, sample): """Get normalized probabilities (or log probs) from a net's output.""" if hasattr(self, 'adaptive_softmax') and self.adaptive_softmax is not None: if sample is not None: assert 'target' in sample target = sample['target'] else: target = None out = self.adaptive_softmax.get_log_prob(net_output[0], target=target) return out.exp_() if not log_probs else out logits = net_output[0] if log_probs: return utils.log_softmax(logits, dim=-1, onnx_trace=self.onnx_trace) else: return utils.softmax(logits, dim=-1, onnx_trace=self.onnx_trace) def max_positions(self): """Maximum output length supported by the decoder.""" if self.embed_positions is None: return self.max_target_positions return min(self.max_target_positions, self.embed_positions.max_positions()) def buffered_future_mask(self, tensor): dim = tensor.size(0) if ( not hasattr(self, '_future_mask') or self._future_mask is None or self._future_mask.device != tensor.device or self._future_mask.size(0) < dim ): self._future_mask = torch.triu(utils.fill_with_neg_inf(tensor.new(dim, dim)), 1) return self._future_mask[:dim, :dim] def upgrade_state_dict(self, state_dict): """Upgrade a (possibly old) state dict for new versions of fairseq.""" return state_dict def prepare_for_onnx_export_(self): self.onnx_trace = True def upgrade_state_dict_named(self, state_dict, name): """Upgrade a (possibly old) state dict for new versions of fairseq.""" if isinstance(self.embed_positions, SinusoidalPositionalEmbedding): weights_key = '{}.embed_positions.weights'.format(name) if weights_key in state_dict: del state_dict[weights_key] state_dict['{}.embed_positions._float_tensor'.format(name)] = torch.FloatTensor(1) for i in range(len(self.layers)): # update layer norms layer_norm_map = { '0': 'self_attn_layer_norm', '1': 'encoder_attn_layer_norm', '2': 'final_layer_norm' } for old, new in layer_norm_map.items(): for m in ('weight', 'bias'): k = '{}.layers.{}.layer_norms.{}.{}'.format(name, i, old, m) if k in state_dict: state_dict['{}.layers.{}.{}.{}'.format(name, i, new, m)] = state_dict[k] del state_dict[k] version_key = '{}.version'.format(name) if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) <= 2: # earlier checkpoints did not normalize after the stack of layers self.layer_norm = None self.normalize = False state_dict[version_key] = torch.Tensor([1]) return state_dict def reorder_incremental_state(self, incremental_state, new_order): """Reorder incremental state. This should be called when the order of the input has changed from the previous time step. A typical use case is beam search, where the input order changes between time steps based on the selection of beams. """ seen = set() def apply_reorder_incremental_state(module): if module != self and hasattr(module, 'reorder_incremental_state') \ and module not in seen: seen.add(module) module.reorder_incremental_state(incremental_state, new_order) self.apply(apply_reorder_incremental_state) def set_beam_size(self, beam_size): """Sets the beam size in the decoder and all children.""" if getattr(self, '_beam_size', -1) != beam_size: seen = set() def apply_set_beam_size(module): if module != self and hasattr(module, 'set_beam_size') \ and module not in seen: seen.add(module) module.set_beam_size(beam_size) self.apply(apply_set_beam_size) self._beam_size = beam_size
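# --- Illustrative sketch (not part of the model above) ------------------------
# extract_features above applies LayerDrop (https://arxiv.org/abs/1909.11556):
# during training each decoder layer is skipped independently with probability
# `decoder_layerdrop`. A minimal sketch of that control flow with the layers
# reduced to their indices; `demo_layerdrop` is a hypothetical helper for
# illustration only.
def demo_layerdrop(num_layers=6, layerdrop=0.2, training=True, seed=0):
    import random

    random.seed(seed)
    kept = []
    for idx in range(num_layers):
        dropout_probability = random.uniform(0, 1)
        # same condition as in extract_features: always run at inference time,
        # otherwise keep the layer only if the draw exceeds the drop rate
        if not training or dropout_probability > layerdrop:
            kept.append(idx)
    return kept   # indices of the layers that would actually be executed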
class TransformerDecoder(FairseqIncrementalDecoder): """ Transformer decoder consisting of *args.decoder_layers* layers. Each layer is a :class:`TransformerDecoderLayer`. Args: args (argparse.Namespace): parsed command-line arguments dictionary (~fairseq.data.Dictionary): decoding dictionary embed_tokens (torch.nn.Embedding): output embedding no_encoder_attn (bool, optional): whether to attend to encoder outputs (default: False). final_norm (bool, optional): apply layer norm to the output of the final decoder layer (default: True). """ def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, final_norm=True): super().__init__(dictionary) self.dropout = args.dropout self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim self.output_embed_dim = args.decoder_output_dim padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt( embed_dim) # todo: try with input_embed_dim self.project_in_dim = Linear( input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None self.embed_positions = PositionalEmbedding( args.max_target_positions, embed_dim, padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.layers = nn.ModuleList([]) self.layers.extend([ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) self.adaptive_softmax = None self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \ if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), self.output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), self.output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim**-0.5) self.register_buffer('version', torch.Tensor([2])) self.normalize = args.decoder_normalize_before and final_norm if self.normalize: self.layer_norm = LayerNorm(embed_dim) self.onnx_trace = False self.decoder_max_order = args.decoder_max_order self.clamp_value = getattr(args, 'clamp_value', 0.01) self.gs_clamp = args.gs_clamp def set_perm_order(self, perm_order=0): assert isinstance(perm_order, int) and 0 <= perm_order <= 5 for layer in self.layers: layer.set_perm_order(perm_order) def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused): """ Args: prev_output_tokens (LongTensor): previous decoder outputs of shape `(batch, tgt_len)`, for input feeding/teacher forcing encoder_out (Tensor, optional): output from the encoder, used for encoder-side attention incremental_state (dict): dictionary used for storing state during :ref:`Incremental decoding` Returns: tuple: - the decoder's output of shape `(batch, tgt_len, vocab)` - a dictionary with any model-specific outputs """ x, extra = self.extract_features(prev_output_tokens, encoder_out, incremental_state) x = self.output_layer(x, encoder_out) return x, extra def extract_features(self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused): """ Similar to 
*forward* but only return features. Returns: tuple: - the decoder's features of shape `(batch, tgt_len, embed_dim)` - a dictionary with any model-specific outputs """ # embed positions positions = self.embed_positions( prev_output_tokens, incremental_state=incremental_state, ) if self.embed_positions is not None else None if incremental_state is not None: prev_output_tokens = prev_output_tokens[:, -1:] if positions is not None: positions = positions[:, -1:] # embed tokens and positions x = self.embed_scale * self.embed_tokens(prev_output_tokens) if self.project_in_dim is not None: x = self.project_in_dim(x) if positions is not None: x += positions x = F.dropout(x, p=self.dropout, training=self.training) # B x T x C -> T x B x C x = x.transpose(0, 1) attn = None inner_states = [x] # decoder layers for layer in self.layers: x, attn = layer( x, encoder_out['encoder_out'] if encoder_out is not None else None, encoder_out['encoder_padding_mask'] if encoder_out is not None else None, incremental_state, self_attn_mask=self.buffered_future_mask(x) if incremental_state is None else None, ) inner_states.append(x) if self.normalize: x = self.layer_norm(x) # T x B x C -> B x T x C x = x.transpose(0, 1) if self.project_out_dim is not None: x = self.project_out_dim(x) return x, {'attn': attn, 'inner_states': inner_states} def output_layer(self, features, encoder_out, **kwargs): """Project features to the vocabulary size.""" if self.adaptive_softmax is None: # project back to size of vocabulary if self.share_input_output_embed: return [ F.linear(features, self.embed_tokens.weight), encoder_out['encoder_pred_order'] ] else: return F.linear(features, self.embed_out) else: return features def max_positions(self): """Maximum output length supported by the decoder.""" if self.embed_positions is None: return self.max_target_positions return min(self.max_target_positions, self.embed_positions.max_positions()) def buffered_future_mask(self, tensor): dim = tensor.size(0) if not hasattr( self, '_future_mask' ) or self._future_mask is None or self._future_mask.device != tensor.device: self._future_mask = torch.triu( utils.fill_with_neg_inf(tensor.new(dim, dim)), 1) if self._future_mask.size(0) < dim: self._future_mask = torch.triu( utils.fill_with_neg_inf(self._future_mask.resize_(dim, dim)), 1) return self._future_mask[:dim, :dim] def upgrade_state_dict_named(self, state_dict, name): """Upgrade a (possibly old) state dict for new versions of fairseq.""" if isinstance(self.embed_positions, SinusoidalPositionalEmbedding): weights_key = '{}.embed_positions.weights'.format(name) if weights_key in state_dict: del state_dict[weights_key] state_dict['{}.embed_positions._float_tensor'.format( name)] = torch.FloatTensor(1) for i in range(len(self.layers)): # update layer norms layer_norm_map = { '0': 'self_attn_layer_norm', '1': 'encoder_attn_layer_norm', '2': 'final_layer_norm' } for old, new in layer_norm_map.items(): for m in ('weight', 'bias'): k = '{}.layers.{}.layer_norms.{}.{}'.format( name, i, old, m) if k in state_dict: state_dict['{}.layers.{}.{}.{}'.format( name, i, new, m)] = state_dict[k] del state_dict[k] if utils.item( state_dict.get('{}.version'.format(name), torch.Tensor( [1]))[0]) < 2: # earlier checkpoints did not normalize after the stack of layers self.layer_norm = None self.normalize = False state_dict['{}.version'.format(name)] = torch.Tensor([1]) return state_dict def get_normalized_probs(self, net_output, log_probs, sample, gs_tau=0.5, gs_hard=False): """Get normalized probabilities (or log probs) 
from a net's output.""" if hasattr(self, 'adaptive_softmax') and self.adaptive_softmax is not None: if sample is not None: assert 'target' in sample target = sample['target'] else: target = None out = self.adaptive_softmax.get_log_prob(net_output[0], target=target) return out.exp_() if not log_probs else out logits = net_output[0][0] orders = net_output[0][1] if log_probs: return (utils.log_softmax(logits, dim=-1, onnx_trace=self.onnx_trace), *self.gumbel_softmax( orders, gs_tau=gs_tau, gs_hard=gs_hard, dim=-1)) else: return (utils.softmax(logits, dim=-1, onnx_trace=self.onnx_trace), *self.gumbel_softmax( orders, gs_tau=gs_tau, gs_hard=gs_hard, dim=-1)) def gumbel_softmax(self, logits, gs_tau=0.5, gs_hard=False, dim=-1): if not gs_hard: prob = utils.softmax(logits, dim=-1, onnx_trace=self.onnx_trace) prob_clamp = torch.clamp( prob, self.clamp_value, 1. - (self.decoder_max_order - 1) * self.clamp_value) logprob = torch.log(prob_clamp if self.gs_clamp else prob) gs = F.gumbel_softmax( logprob, tau=gs_tau, hard=False, ) else: prob = utils.softmax(logits, dim=-1, onnx_trace=self.onnx_trace) prob_clamp = torch.clamp( prob, self.clamp_value, 1. - (self.decoder_max_order - 1) * self.clamp_value) max_idx = torch.argmax(logits, -1, keepdim=True) one_hot = logits.new_zeros(logits.size()) gs = one_hot.scatter(-1, max_idx, 1) return gs, prob, prob_clamp
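# --- Illustrative sketch (not part of the model above) ------------------------
# gumbel_softmax above draws a soft sample over permutation orders from the
# order logits: probabilities are optionally clamped away from 0 and 1,
# converted back to log-space, and passed through F.gumbel_softmax. A minimal
# sketch of the soft (gs_hard=False) branch; `demo_clamped_gumbel` is a
# hypothetical helper, and `max_order` / `clamp_value` mirror the decoder's
# decoder_max_order / clamp_value only by assumption.
def demo_clamped_gumbel(max_order=6, clamp_value=0.01, gs_tau=0.5):
    import torch
    import torch.nn.functional as F

    logits = torch.randn(2, max_order)                 # order logits for two positions
    prob = F.softmax(logits, dim=-1)
    prob_clamp = torch.clamp(prob, clamp_value, 1. - (max_order - 1) * clamp_value)
    gs = F.gumbel_softmax(torch.log(prob_clamp), tau=gs_tau, hard=False)
    return gs, prob, prob_clamp                        # each row of gs sums to 1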
class TransformerDecoder(FairseqIncrementalDecoder): """ Transformer decoder consisting of *args.decoder_layers* layers. Each layer is a :class:`TransformerDecoderLayer`. Args: args (argparse.Namespace): parsed command-line arguments dictionary (~fairseq.data.Dictionary): decoding dictionary embed_tokens (torch.nn.Embedding): output embedding no_encoder_attn (bool, optional): whether to attend to encoder outputs (default: False). """ def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): super().__init__(dictionary) self.register_buffer('version', torch.Tensor([3])) self.dropout = args.dropout self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim self.output_embed_dim = args.decoder_output_dim padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt( embed_dim) # todo: try with input_embed_dim # calculate copy probability p(z=1) batch self.copy = args.copy self.project_in_dim = Linear( input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None self.embed_positions = PositionalEmbedding( args.max_target_positions, embed_dim, padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.layers = nn.ModuleList([]) self.layers.extend([ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) if self.copy: self.copy_attn = MultiheadAttention( embed_dim, 1, dropout=args.attention_dropout, encoder_decoder_attention=True, ) self.linear_copy = Linear(embed_dim, 1) self.adaptive_softmax = None self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \ if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), self.output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), self.output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim**-0.5) if args.decoder_normalize_before and not getattr( args, 'no_decoder_final_norm', False): self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused): """ Args: prev_output_tokens (LongTensor): previous decoder outputs of shape `(batch, tgt_len)`, for teacher forcing encoder_out (Tensor, optional): output from the encoder, used for encoder-side attention incremental_state (dict): dictionary used for storing state during :ref:`Incremental decoding` Returns: tuple: - the decoder's output of shape `(batch, tgt_len, vocab)` - a dictionary with any model-specific outputs """ x, extra = self.extract_features(prev_output_tokens, encoder_out, incremental_state) x = self.output_layer(x) return x, extra def extract_features(self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused): """ Similar to *forward* but only return features. 
Returns: tuple: - the decoder's features of shape `(batch, tgt_len, embed_dim)` - a dictionary with any model-specific outputs """ # embed positions positions = self.embed_positions( prev_output_tokens, incremental_state=incremental_state, ) if self.embed_positions is not None else None if incremental_state is not None: prev_output_tokens = prev_output_tokens[:, -1:] if positions is not None: positions = positions[:, -1:] # embed tokens and positions x = self.embed_scale * self.embed_tokens(prev_output_tokens) if self.project_in_dim is not None: x = self.project_in_dim(x) if positions is not None: x += positions x = F.dropout(x, p=self.dropout, training=self.training) # B x T x C -> T x B x C x = x.transpose(0, 1) attn = None inner_states = [x] # decoder layers for layer in self.layers: x, attn = layer( x, encoder_out['encoder_out'] if encoder_out is not None else None, encoder_out['encoder_padding_mask'] if encoder_out is not None else None, incremental_state, self_attn_mask=self.buffered_future_mask(x) if incremental_state is None else None, ) inner_states.append(x) if self.layer_norm: x = self.layer_norm(x) copy_x, copy_attn = None, None if self.copy: copy_x, copy_attn = self.copy_attn( query=x, key=encoder_out['encoder_out'] if encoder_out is not None else None, value=encoder_out['encoder_out'] if encoder_out is not None else None, key_padding_mask=encoder_out['encoder_padding_mask'] if encoder_out is not None else None, incremental_state=incremental_state, static_kv=True, need_weights=True, ) # copy_x = copy_x.transpose(0, 1) p_copy = None if self.copy: # p_copy = torch.sigmoid(self.linear_copy(copy_attn)) p_copy = torch.sigmoid(self.linear_copy(x)).transpose(0, 1) # T x B x C -> B x T x C x = x.transpose(0, 1) if self.project_out_dim is not None: x = self.project_out_dim(x) # return x, {'attn': attn, 'inner_states': inner_states, 'p_copy': p_copy} return x, { 'attn': attn, 'inner_states': inner_states, 'p_copy': p_copy, 'copy_attn': copy_attn } def output_layer(self, features, **kwargs): """Project features to the vocabulary size.""" if self.adaptive_softmax is None: # project back to size of vocabulary if self.share_input_output_embed: return F.linear(features, self.embed_tokens.weight) else: return F.linear(features, self.embed_out) else: return features def get_normalized_probs(self, net_output, log_probs, sample): """Get normalized probabilities (or log probs) from a net's output.""" if hasattr(self, 'adaptive_softmax') and self.adaptive_softmax is not None: if sample is not None: assert 'target' in sample target = sample['target'] else: target = None out = self.adaptive_softmax.get_log_prob(net_output[0], target=target) return out.exp_() if not log_probs else out logits = net_output[0] is_copy = 'p_copy' in net_output[1].keys( ) and net_output[1]['p_copy'] is not None # print(net_output[1]['attn']) if is_copy and False: p_copy = net_output[1]['p_copy'] if 'net_input' in sample.keys(): enc_seq_ids = sample['net_input']['src_tokens'] else: # for decode step enc_seq_ids = sample['src_tokens'] enc_seq_ids = enc_seq_ids.unsqueeze(1).repeat( 1, net_output[1]['copy_attn'].size(1), 1) generate_prob = utils.softmax( logits, dim=-1, onnx_trace=self.onnx_trace) * (1 - p_copy) copy_prob = net_output[1]['copy_attn'] * p_copy final = generate_prob.scatter_add(2, enc_seq_ids, copy_prob) if log_probs: return torch.log(final + 1e-15) else: return final else: if log_probs: return utils.log_softmax(logits, dim=-1, onnx_trace=self.onnx_trace) else: return utils.softmax(logits, dim=-1, 
onnx_trace=self.onnx_trace) def max_positions(self): """Maximum output length supported by the decoder.""" if self.embed_positions is None: return self.max_target_positions return min(self.max_target_positions, self.embed_positions.max_positions()) def buffered_future_mask(self, tensor): dim = tensor.size(0) if not hasattr( self, '_future_mask' ) or self._future_mask is None or self._future_mask.device != tensor.device or self._future_mask.size( 0) < dim: self._future_mask = torch.triu( utils.fill_with_neg_inf(tensor.new(dim, dim)), 1) return self._future_mask[:dim, :dim] def upgrade_state_dict_named(self, state_dict, name): """Upgrade a (possibly old) state dict for new versions of fairseq.""" if isinstance(self.embed_positions, SinusoidalPositionalEmbedding): weights_key = '{}.embed_positions.weights'.format(name) if weights_key in state_dict: del state_dict[weights_key] state_dict['{}.embed_positions._float_tensor'.format( name)] = torch.FloatTensor(1) for i in range(len(self.layers)): # update layer norms layer_norm_map = { '0': 'self_attn_layer_norm', '1': 'encoder_attn_layer_norm', '2': 'final_layer_norm' } for old, new in layer_norm_map.items(): for m in ('weight', 'bias'): k = '{}.layers.{}.layer_norms.{}.{}'.format( name, i, old, m) if k in state_dict: state_dict['{}.layers.{}.{}.{}'.format( name, i, new, m)] = state_dict[k] del state_dict[k] version_key = '{}.version'.format(name) if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) <= 2: # earlier checkpoints did not normalize after the stack of layers self.layer_norm = None self.normalize = False state_dict[version_key] = torch.Tensor([1]) return state_dict
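# --- Illustrative sketch (not part of the model above) ------------------------
# buffered_future_mask caches one upper-triangular -inf matrix and re-slices it
# for shorter sequences, so the causal mask is only rebuilt when the device
# changes or a longer target arrives. A minimal sketch of that caching pattern
# as a closure; `demo_future_mask_factory` is a hypothetical stand-in for
# illustration only.
def demo_future_mask_factory():
    import torch

    cache = {'mask': None}

    def future_mask(tensor):
        dim = tensor.size(0)                 # target length (tensor is T x B x C)
        m = cache['mask']
        if m is None or m.device != tensor.device or m.size(0) < dim:
            # -inf strictly above the diagonal, 0 elsewhere
            cache['mask'] = torch.triu(
                tensor.new_full((dim, dim), float('-inf')), 1)
        return cache['mask'][:dim, :dim]

    return future_mask

# Example: demo_future_mask_factory()(torch.zeros(5, 2, 8)) yields a 5x5 additive mask.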