def __init__(self, args, embed_tokens, dictionary):
    super().__init__()
    self.share_input_output_embed = args.share_decoder_input_output_embed
    self.embed_tokens = embed_tokens
    self.output_embed_dim = args.decoder_output_dim
    embed_dim = args.decoder_embed_dim
    self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \
        if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None
    self.adaptive_softmax = None
    if args.adaptive_softmax_cutoff is not None:
        assert not isinstance(embed_tokens, nn.ModuleList)
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary), self.output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif not self.share_input_output_embed:
        self.embed_tokens = nn.Parameter(torch.Tensor(len(dictionary), self.output_embed_dim))
        nn.init.normal_(self.embed_tokens, mean=0, std=self.output_embed_dim ** -0.5)
    if args.decoder_normalize_before and not getattr(args, 'no_decoder_final_norm', False):
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
def build_output_projection(self, args, dictionary, embed_tokens):
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary), self.output_embed_dim,
            utils.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif self.share_input_output_embed:
        self.output_projection = nn.Linear(
            self.embed_tokens.weight.shape[1],
            self.embed_tokens.weight.shape[0],
            bias=False,
        )
        self.output_projection.weight = self.embed_tokens.weight
    else:
        self.output_projection = nn.Linear(
            self.output_embed_dim, len(dictionary), bias=False
        )
        nn.init.normal_(
            self.output_projection.weight, mean=0, std=self.output_embed_dim ** -0.5
        )
    num_base_layers = getattr(args, "base_layers", 0)
    for i in range(num_base_layers):
        self.layers.insert(
            ((i + 1) * args.decoder_layers) // (num_base_layers + 1), BaseLayer(args)
        )
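# Illustration (not part of the snippets above): the insertion index formula in
# build_output_projection spreads `num_base_layers` BaseLayer modules evenly
# through an existing stack of `decoder_layers` layers. A minimal, standalone
# sketch of that arithmetic, using hypothetical layer counts:
def base_layer_positions(decoder_layers, num_base_layers):
    # Position at which the (i+1)-th BaseLayer is inserted into the combined stack.
    return [
        ((i + 1) * decoder_layers) // (num_base_layers + 1)
        for i in range(num_base_layers)
    ]

# e.g. 12 decoder layers with 2 BaseLayers -> inserted at indices [4, 8]
assert base_layer_positions(12, 2) == [4, 8]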
def __init__(
    self, dictionary, embed_dim=512, hidden_size=512, out_embed_dim=512,
    num_layers=1, dropout_in=0.1, dropout_out=0.1, attention=True,
    encoder_output_units=512, pretrained_embed=None,
    share_input_output_embed=False, adaptive_softmax_cutoff=None,
    max_target_positions=DEFAULT_MAX_TARGET_POSITIONS, residuals=False,
):
    super().__init__(dictionary)
    self.dropout_in = dropout_in
    self.dropout_out = dropout_out
    self.hidden_size = hidden_size
    self.share_input_output_embed = share_input_output_embed
    self.need_attn = True
    self.max_target_positions = max_target_positions
    self.residuals = residuals
    self.num_layers = num_layers
    self.adaptive_softmax = None
    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    if pretrained_embed is None:
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
    else:
        self.embed_tokens = pretrained_embed
    self.encoder_output_units = encoder_output_units
    if encoder_output_units != hidden_size and encoder_output_units != 0:
        self.encoder_hidden_proj = Linear(encoder_output_units, hidden_size)
        self.encoder_cell_proj = Linear(encoder_output_units, hidden_size)
    else:
        self.encoder_hidden_proj = self.encoder_cell_proj = None
    # disable input feeding if there is no encoder
    # input feeding is described in arxiv.org/abs/1508.04025
    input_feed_size = 0 if encoder_output_units == 0 else hidden_size
    self.layers = nn.ModuleList([
        LSTMCell(
            input_size=input_feed_size + embed_dim if layer == 0 else hidden_size,
            hidden_size=hidden_size,
        )
        for layer in range(num_layers)
    ])
    if attention:
        # TODO make bias configurable
        self.attention = AttentionLayer(hidden_size, encoder_output_units, hidden_size, bias=False)
    else:
        self.attention = None
    if hidden_size != out_embed_dim:
        self.additional_fc = Linear(hidden_size, out_embed_dim)
    if adaptive_softmax_cutoff is not None:
        # setting adaptive_softmax dropout to dropout_out for now but can be redefined
        self.adaptive_softmax = AdaptiveSoftmax(
            num_embeddings, hidden_size, adaptive_softmax_cutoff, dropout=dropout_out
        )
    elif not self.share_input_output_embed:
        self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)
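# Illustration (not part of the snippets above): with input feeding, the first
# LSTMCell consumes the token embedding concatenated with the previous step's
# attention context, which is why its input_size is input_feed_size + embed_dim.
# A minimal shape check with made-up sizes:
import torch
import torch.nn as nn

embed_dim, hidden_size = 32, 64
input_feed_size = hidden_size                  # 0 when there is no encoder
cell = nn.LSTMCell(input_size=input_feed_size + embed_dim, hidden_size=hidden_size)

emb = torch.randn(8, embed_dim)                # current target-token embeddings
input_feed = torch.zeros(8, input_feed_size)   # attention context from the previous step
h, c = cell(torch.cat([emb, input_feed], dim=1))
assert h.shape == (8, hidden_size)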
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, left_pad=False):
    super().__init__(dictionary)
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed
    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions
    self.fix_emb = args.fix_dec_emb
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = PositionalEmbedding(
        args.max_target_positions, embed_dim, self.padding_idx,
        left_pad=left_pad,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None
    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerDecoderLayer(args, no_encoder_attn)
        for _ in range(args.decoder_layers)
    ])
    self.adaptive_softmax = None
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary), args.decoder_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.dropout,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=embed_dim ** -0.5)
def __init__(self, args, max_batch, dictionary, embed_tokens, DecoderLayers, layer_norm):
    self.layers = DecoderLayers
    self.layer_norm = layer_norm
    self.cross_self_attention = False
    self.adaptive_softmax = None
    self.share_input_output_embed = args.share_decoder_input_output_embed
    self.output_embed_dim = args.decoder_output_dim
    self.embed_tokens = embed_tokens
    self.pre_padding_len = torch.empty(0)
    self.positions = torch.zeros(1).long().cuda()
    self.pre_padding_len = None
    self.max_target_positions = args.max_target_positions
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary), self.output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(len(dictionary), self.output_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5)
def __init__(
    self, dictionary, embed_dim=512, hidden_size=512, out_embed_dim=512,
    num_layers=1, dropout_in=0.1, dropout_out=0.1, attention=True,
    encoder_embed_dim=512, encoder_output_units=512, pretrained_embed=None,
    share_input_output_embed=False, adaptive_softmax_cutoff=None,
):
    super().__init__(dictionary)
    self.dropout_in = dropout_in
    self.dropout_out = dropout_out
    self.hidden_size = hidden_size
    self.share_input_output_embed = share_input_output_embed
    self.need_attn = True
    self.adaptive_softmax = None
    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    if pretrained_embed is None:
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
    else:
        self.embed_tokens = pretrained_embed
    self.encoder_output_units = encoder_output_units
    assert encoder_output_units == hidden_size, \
        'encoder_output_units ({}) != hidden_size ({})'.format(encoder_output_units, hidden_size)
    # TODO another Linear layer if not equal
    self.layers = nn.ModuleList([
        LSTMCell(
            input_size=encoder_output_units + embed_dim if layer == 0 else hidden_size,
            hidden_size=hidden_size,
        )
        for layer in range(num_layers)
    ])
    self.attention = AttentionLayer(encoder_output_units, hidden_size) if attention else None
    if hidden_size != out_embed_dim:
        self.additional_fc = Linear(hidden_size, out_embed_dim)
    if adaptive_softmax_cutoff is not None:
        # setting adaptive_softmax dropout to dropout_out for now but can be redefined
        self.adaptive_softmax = AdaptiveSoftmax(
            num_embeddings, embed_dim, adaptive_softmax_cutoff, dropout=dropout_out)
    elif not self.share_input_output_embed:
        self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)
    self.actor = Actor(hidden_size, hidden_size)
def __init__(self, args, dictionary, embed_tokens, embed_scale=None, no_encoder_attn=False,
             left_pad=False, final_norm=True):
    super().__init__(dictionary)
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed
    input_embed_dim = embed_tokens.embedding_dim
    self.embed_dim = args.decoder_embed_dim
    output_embed_dim = args.decoder_output_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(self.embed_dim) if embed_scale is None else embed_scale
    self.project_in_dim = nn.Linear(input_embed_dim, self.embed_dim, bias=False) \
        if self.embed_dim != input_embed_dim else None
    self.embed_positions = PositionalEmbedding(
        args.max_target_positions, self.embed_dim, self.padding_idx,
        # learned=args.decoder_learned_pos,
    ) if not args.no_dec_token_positional_embeddings else None
    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerDecoderLayer(args, no_encoder_attn)
        for _ in range(args.decoder_layers)
    ])
    self.adaptive_softmax = None
    self.project_out_dim = nn.Linear(self.embed_dim, output_embed_dim, bias=False) \
        if self.embed_dim != output_embed_dim and not args.tie_adaptive_weights else None
    self.load_softmax = not getattr(args, 'remove_head', False)
    if self.load_softmax:
        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary), output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), output_embed_dim))
            # nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5)
    self.register_buffer('version', torch.Tensor([2]))
    self.normalize = args.decoder_normalize_before and final_norm
    if self.normalize:
        self.layer_norm = BertLayerNorm(self.embed_dim)
    if not self.share_input_output_embed:
        self.embed_out.requires_grad = False
    else:
        self.embed_tokens.requires_grad = False
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, left_pad=False, final_norm=True):
    super().__init__(dictionary)
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed
    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    output_embed_dim = args.decoder_output_dim
    padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim
    self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None
    self.embed_positions = PositionalEmbedding(
        args.max_target_positions, embed_dim, padding_idx,
        left_pad=left_pad,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None
    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerDecoderLayer(args, no_encoder_attn)
        for _ in range(args.decoder_layers)
    ])
    self.adaptive_softmax = None
    self.mytask = args.task
    self.project_out_dim = Linear(embed_dim, output_embed_dim, bias=False) \
        if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary), output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), output_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5)
    self.register_buffer('version', torch.Tensor([2]))
    self.normalize = args.decoder_normalize_before and final_norm
    if self.normalize:
        self.layer_norm = LayerNorm(embed_dim)
    self.copy_attention = args.copy_attention
    self.attention_dropout = args.attention_dropout
    if self.copy_attention:
        self.copy_attn_layer = MultiheadAttention(
            embed_dim, args.copy_attention_heads, dropout=args.copy_attention_dropout)
        self.copy_alpha_linear = nn.Linear(embed_dim, 1)
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
    super().__init__(dictionary)
    self.register_buffer('version', torch.Tensor([3]))
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed
    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    self.output_embed_dim = args.decoder_output_dim
    padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim
    self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None
    self.embed_positions = PositionalEmbedding(
        args.max_target_positions, embed_dim, padding_idx,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None
    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerDecoderLayer(args, no_encoder_attn)
        for _ in range(args.decoder_layers)
    ])
    self.adaptive_softmax = None
    self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \
        if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary), self.output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), self.output_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5)
    if args.scale_norm:
        scale = embed_dim ** 0.5
    else:
        scale = None
    if args.decoder_normalize_before and not getattr(args, 'no_decoder_final_norm', False):
        self.layer_norm = LayerNorm(embed_dim, scale=scale)
    else:
        self.layer_norm = None
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, final_norm=True):
    super().__init__(dictionary)
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed
    self.mask_future = not args.attn_use_future
    self.mask_self = not args.attn_use_self
    input_embed_dim = args.decoder_embed_dim
    embed_dim = args.decoder_embed_dim
    self.no_encoder_attn = no_encoder_attn
    self.output_embed_dim = args.decoder_output_dim
    self.max_target_positions = args.max_target_positions
    self.embed_scale = math.sqrt(embed_dim)
    self.bridge = LengthPredictorBridge(args, dictionary=dictionary, max_offset=args.bridge_max_offset)
    self.project_in_dim = Linear(
        input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None
    self.embed_tokens = None
    self.decoder_layers = None
    self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \
        if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None
    self.normalize = args.decoder_normalize_before and final_norm
    if self.normalize:
        self.layer_norm = LayerNorm(embed_dim)
    self.adaptive_softmax = None
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary), self.output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif not args.share_decoder_input_output_embed:
        self.embed_out = OutLinear(self.output_embed_dim, len(dictionary), bias=False, out_norm=args.out_norm)
    else:
        self.embed_tokens = embed_tokens
    self.register_buffer('version', torch.Tensor([2]))
    # append for basic non-auto
    self.use_enc_last = args.use_enc_last
    self._build_inner_layers(args)
def __init__(self, args, src_dict, dst_dict, embed_tokens):
    super().__init__(dst_dict)
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed
    embed_dim = embed_tokens.embedding_dim
    padding_idx = embed_tokens.padding_idx
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = fairseq_transformer.PositionalEmbedding(
        1024, embed_dim, padding_idx,
        learned=args.decoder_learned_pos)
    self.layers = nn.ModuleList([])
    self.layers.extend([
        fairseq_transformer.TransformerDecoderLayer(args)
        for i in range(args.decoder_layers)
    ])
    self.adaptive_softmax = None
    self.bottleneck_layer = None
    out_embed_dim = embed_dim
    if args.decoder_out_embed_dim is not None:
        assert (
            not args.share_all_embeddings
            and not args.share_decoder_input_output_embed
        ), "--decoder-out-embed-dim is incompatible with sharing output embeddings!"
        self.bottleneck_layer = fairseq_transformer.Linear(
            embed_dim, args.decoder_out_embed_dim)
        out_embed_dim = args.decoder_out_embed_dim
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dst_dict), out_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.dropout,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(len(dst_dict), out_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=out_embed_dim ** -0.5)
    self.vocab_reduction_module = None
    if args.vocab_reduction_params:
        assert (
            self.adaptive_softmax is None
        ), "vocabulary reduction not compatible with adaptive softmax!"
        self.vocab_reduction_module = vocab_reduction.VocabReduction(
            src_dict, dst_dict, args.vocab_reduction_params, fp16=args.fp16)
    self.onnx_trace = False
def __init__(
    self, dictionary, embed_dim=512, hidden_size=512, out_embed_dim=512,
    num_layers=1, dropout_in=0.1, dropout_out=0.1, encoder_output_units=0,
    attn_type=None, attn_dim=0, need_attn=False, residual=False,
    pretrained_embed=None, share_input_output_embed=False,
    adaptive_softmax_cutoff=None,
):
    super().__init__(dictionary)
    self.dropout_in = dropout_in
    self.dropout_out = dropout_out
    self.hidden_size = hidden_size
    self.share_input_output_embed = share_input_output_embed
    if attn_type is None or attn_type.lower() == 'none':
        # no attention, no encoder output needed (language model case)
        need_attn = False
        encoder_output_units = 0
    self.need_attn = need_attn
    self.residual = residual
    self.adaptive_softmax = None
    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    if pretrained_embed is None:
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
    else:
        self.embed_tokens = pretrained_embed
    self.encoder_output_units = encoder_output_units
    self.layers = nn.ModuleList([
        LSTMCell(
            input_size=encoder_output_units + (embed_dim if layer == 0 else hidden_size),
            hidden_size=hidden_size,
        )
        for layer in range(num_layers)
    ])
    if attn_type is None or attn_type.lower() == 'none':
        self.attention = None
    elif attn_type.lower() == 'bahdanau':
        self.attention = speech_attention.BahdanauAttention(
            hidden_size, encoder_output_units, attn_dim,
        )
    elif attn_type.lower() == 'luong':
        self.attention = speech_attention.LuongAttention(
            hidden_size, encoder_output_units,
        )
    else:
        raise ValueError('unrecognized attention type.')
    if hidden_size + encoder_output_units != out_embed_dim:
        self.additional_fc = Linear(hidden_size + encoder_output_units, out_embed_dim)
    if adaptive_softmax_cutoff is not None:
        # setting adaptive_softmax dropout to dropout_out for now but can be redefined
        self.adaptive_softmax = AdaptiveSoftmax(
            num_embeddings, hidden_size, adaptive_softmax_cutoff, dropout=dropout_out)
    elif not self.share_input_output_embed:
        self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)
def __init__(
    self,
    dictionary: Dictionary,
    embed_dim: int = 512,
    hidden_size: int = 512,
    out_embed_dim: int = 512,
    num_layers: int = 1,
    dropout_in: float = 0.1,
    dropout_out: float = 0.1,
    attention: bool = True,
    encoder_embed_dim: int = 512,
    encoder_output_units: int = 512,
    pretrained_embed: Optional[nn.Embedding] = None,
    share_input_output_embed: bool = False,
    adaptive_softmax_cutoff: Optional[int] = None,
):
    super().__init__(dictionary)
    self.dropout_in = dropout_in
    self.dropout_out = dropout_out
    self.hidden_size = hidden_size
    self.share_input_output_embed = share_input_output_embed
    self.need_attn = True
    self.adaptive_softmax = None
    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    if pretrained_embed is None:
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
    else:
        self.embed_tokens = pretrained_embed
    self.encoder_output_units = encoder_output_units
    self.layers = nn.ModuleList([
        LSTMCell(
            input_size=hidden_size + embed_dim if layer == 0 else hidden_size,
            hidden_size=hidden_size,
        )
        for layer in range(num_layers)
    ])
    self.attention = AttentionLayer(hidden_size, encoder_output_units, hidden_size) if attention else None
    if hidden_size != out_embed_dim:
        self.additional_fc = Linear(hidden_size, out_embed_dim)
    if adaptive_softmax_cutoff is not None:
        # setting adaptive_softmax dropout to dropout_out for now but can be redefined
        self.adaptive_softmax = AdaptiveSoftmax(
            num_embeddings, embed_dim, adaptive_softmax_cutoff, dropout=dropout_out)
    elif not self.share_input_output_embed:
        self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)
def __init__(self, args, embed_tokens, no_encoder_attn=False, left_pad=False):
    super().__init__()
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed
    self.fuse_dropout_add = args.fuse_dropout_add
    self.fuse_relu_dropout = args.fuse_relu_dropout
    embed_dim = embed_tokens.embedding_dim
    padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = PositionalEmbedding(
        args.max_target_positions, embed_dim, padding_idx,
        left_pad=left_pad,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None
    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerDecoderLayer(args, no_encoder_attn)
        for _ in range(args.decoder_layers)
    ])
    self.adaptive_softmax = None
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            args.tgt_vocab_size, args.decoder_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.dropout)
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(args.tgt_vocab_size, embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=embed_dim ** -0.5)
    else:
        self.embed_out = self.embed_tokens.weight
    self.normalize = args.decoder_normalize_before
    if self.normalize:
        self.layer_norm = FusedLayerNorm(embed_dim) if args.fuse_layer_norm else nn.LayerNorm(embed_dim)
def __init__(
    self, dictionary, embed_dim=512, hidden_size=512, out_embed_dim=512,
    num_layers=1, dropout_in=0.1, dropout_out=0.1, attention=True,
    encoder_output_units=512, pretrained_embed=None,
    share_input_output_embed=False, adaptive_softmax_cutoff=None,
):
    super().__init__(dictionary)
    self.dropout_in = dropout_in
    self.dropout_out = dropout_out
    self.hidden_size = hidden_size
    self.share_input_output_embed = share_input_output_embed
    self.need_attn = True
    self.adaptive_softmax = None
    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    if pretrained_embed is None:
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
    else:
        self.embed_tokens = pretrained_embed
    self.encoder_output_units = encoder_output_units
    if encoder_output_units != hidden_size:
        self.encoder_hidden_proj = Linear(encoder_output_units, hidden_size)
        self.encoder_cell_proj = Linear(encoder_output_units, hidden_size)
    else:
        self.encoder_hidden_proj = self.encoder_cell_proj = None
    self.layers = nn.ModuleList([
        LSTMCell(
            input_size=hidden_size + embed_dim if layer == 0 else hidden_size,
            hidden_size=hidden_size,
        )
        for layer in range(num_layers)
    ])
    if attention:
        # TODO make bias configurable
        self.attention = AttentionLayer(hidden_size, encoder_output_units, hidden_size, bias=False)
    else:
        self.attention = None
    if hidden_size != out_embed_dim:
        self.additional_fc = Linear(hidden_size, out_embed_dim)
    if adaptive_softmax_cutoff is not None:
        # setting adaptive_softmax dropout to dropout_out for now but can be redefined
        self.adaptive_softmax = AdaptiveSoftmax(
            num_embeddings, hidden_size, adaptive_softmax_cutoff, dropout=dropout_out)
    elif not self.share_input_output_embed:
        self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, left_pad=False):
    super().__init__(dictionary)
    self.dropout = nn.Dropout(p=args.dropout)
    self.share_input_output_embed = args.share_decoder_input_output_embed
    self.fuse_dropout_add = args.fuse_dropout_add
    self.fuse_relu_dropout = args.fuse_relu_dropout
    embed_dim = embed_tokens.embedding_dim
    padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = Scale(embed_dim)
    self.embed_positions = PositionalEmbedding(
        args.max_target_positions, embed_dim, padding_idx,
        left_pad=left_pad,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None
    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerDecoderLayer(args, no_encoder_attn)
        for _ in range(args.decoder_layers)
    ])
    self.adaptive_softmax = None
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary), args.decoder_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.dropout)
    else:
        self.embed_out = nn.Linear(embed_dim, len(dictionary), bias=False)
        nn.init.normal_(self.embed_out.weight, mean=0, std=embed_dim ** -0.5)
    self.normalize = args.decoder_normalize_before
    if self.normalize:
        self.layer_norm = FusedLayerNorm(embed_dim)
def __init__(self, args, dictionary, embed_tokens, left_pad=False):
    super().__init__(dictionary)
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed
    embed_dim = embed_tokens.embedding_dim
    padding_idx = embed_tokens.padding_idx
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = fairseq_transformer.PositionalEmbedding(
        1024, embed_dim, padding_idx,
        left_pad=left_pad,
        learned=args.decoder_learned_pos,
    )
    self.layers = nn.ModuleList([])
    self.layers.extend([
        fairseq_transformer.TransformerDecoderLayer(args)
        for i in range(args.decoder_layers)
    ])
    self.adaptive_softmax = None
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary), args.decoder_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.dropout,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(len(dictionary), embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=embed_dim ** -0.5)
def __init__(
    self, args, dictionary, embed_tokens, no_encoder_attn=False, final_norm=True
):
    super().__init__(dictionary)
    self.dropout_module = FairseqDropout(
        args.dropout, module_name=self.__class__.__name__
    )
    self.share_input_output_embed = args.share_decoder_input_output_embed
    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    output_embed_dim = args.decoder_output_dim
    padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim
    self.project_in_dim = (
        Linear(input_embed_dim, embed_dim, bias=False)
        if embed_dim != input_embed_dim
        else None
    )
    self.embed_positions = (
        PositionalEmbedding(
            args.max_target_positions, embed_dim, padding_idx,
            learned=args.decoder_learned_pos,
        )
        if not args.no_token_positional_embeddings
        else None
    )
    self.layers = nn.ModuleList([])
    self.layers.extend(
        [
            LightConvDecoderLayer(
                args, no_encoder_attn, kernel_size=args.decoder_kernel_size_list[i]
            )
            for i in range(args.decoder_layers)
        ]
    )
    self.adaptive_softmax = None
    self.project_out_dim = (
        Linear(embed_dim, output_embed_dim, bias=False)
        if embed_dim != output_embed_dim and not args.tie_adaptive_weights
        else None
    )
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary), output_embed_dim,
            utils.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(len(dictionary), output_embed_dim)
        )
        nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5)
    self.register_buffer("version", torch.Tensor([2]))
    self.normalize = args.decoder_normalize_before and final_norm
    if self.normalize:
        self.layer_norm = LayerNorm(embed_dim)
def __init__(
    self, dictionary, embed_dim=512, hidden_size=512, out_embed_dim=512,
    num_layers=1, dropout_in=0.1, dropout_out=0.1, encoder_output_units=0,
    attn_type=None, attn_dim=0, need_attn=False, residual=False,
    pretrained_embed=None, share_input_output_embed=False,
    adaptive_softmax_cutoff=None,
    max_target_positions=DEFAULT_MAX_TARGET_POSITIONS,
    scheduled_sampling_rate_scheduler=None,
):
    super().__init__(dictionary)
    self.dropout_in_module = FairseqDropout(
        dropout_in * 1.0, module_name=self.__class__.__name__
    )
    self.dropout_out_module = FairseqDropout(
        dropout_out * 1.0, module_name=self.__class__.__name__
    )
    self.hidden_size = hidden_size
    self.share_input_output_embed = share_input_output_embed
    if attn_type is None or str(attn_type).lower() == "none":
        # no attention, no encoder output needed (language model case)
        need_attn = False
        encoder_output_units = 0
    self.need_attn = need_attn
    self.residual = residual
    self.max_target_positions = max_target_positions
    self.num_layers = num_layers
    self.adaptive_softmax = None
    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    if pretrained_embed is None:
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
    else:
        self.embed_tokens = pretrained_embed
    self.encoder_output_units = encoder_output_units
    self.layers = nn.ModuleList(
        [
            LSTMCell(
                input_size=encoder_output_units + (embed_dim if layer == 0 else hidden_size),
                hidden_size=hidden_size,
            )
            for layer in range(num_layers)
        ]
    )
    if attn_type is None or str(attn_type).lower() == "none":
        self.attention = None
    elif str(attn_type).lower() == "bahdanau":
        self.attention = speech_attention.BahdanauAttention(
            hidden_size, encoder_output_units, attn_dim,
        )
    elif str(attn_type).lower() == "luong":
        self.attention = speech_attention.LuongAttention(
            hidden_size, encoder_output_units,
        )
    else:
        raise ValueError("unrecognized attention type.")
    if hidden_size + encoder_output_units != out_embed_dim:
        self.additional_fc = Linear(
            hidden_size + encoder_output_units, out_embed_dim
        )
    if adaptive_softmax_cutoff is not None:
        # setting adaptive_softmax dropout to dropout_out for now but can be redefined
        self.adaptive_softmax = AdaptiveSoftmax(
            num_embeddings, hidden_size, adaptive_softmax_cutoff,
            dropout=dropout_out,
        )
    elif not self.share_input_output_embed:
        self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)
    self.scheduled_sampling_rate_scheduler = scheduled_sampling_rate_scheduler
def __init__(
    self, dictionary, embed_dim=512, embed_dict=None, out_embed_dim=256,
    max_positions=1024, convolutions=((512, 3),) * 20, attention=True,
    dropout=0.1, share_embed=False, positional_embeddings=True,
    adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0,
):
    super().__init__(dictionary)
    self.register_buffer('version', torch.Tensor([2]))
    self.dropout = dropout
    self.need_attn = True
    convolutions = extend_conv_spec(convolutions)
    in_channels = convolutions[0][0]
    if isinstance(attention, bool):
        # expand True into [True, True, ...] and do the same with False
        attention = [attention] * len(convolutions)
    if not isinstance(attention, list) or len(attention) != len(convolutions):
        raise ValueError(
            'Attention is expected to be a list of booleans of '
            'length equal to the number of layers.')
    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
    if embed_dict:
        self.embed_tokens = utils.load_embedding(embed_dict, self.dictionary, self.embed_tokens)
    self.embed_positions = PositionalEmbedding(
        max_positions, embed_dim, padding_idx,
    ) if positional_embeddings else None
    self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
    self.projections = nn.ModuleList()
    self.convolutions = nn.ModuleList()
    self.attention = nn.ModuleList()
    self.residuals = []
    layer_in_channels = [in_channels]
    for i, (out_channels, kernel_size, residual) in enumerate(convolutions):
        if residual == 0:
            residual_dim = out_channels
        else:
            residual_dim = layer_in_channels[-residual]
        self.projections.append(
            Linear(residual_dim, out_channels) if residual_dim != out_channels else None)
        self.convolutions.append(
            LinearizedConv1d(in_channels, out_channels * 2, kernel_size,
                             padding=(kernel_size - 1), dropout=dropout))
        self.attention.append(
            AttentionLayer(out_channels, embed_dim) if attention[i] else None)
        self.residuals.append(residual)
        in_channels = out_channels
        layer_in_channels.append(out_channels)
    self.adaptive_softmax = None
    self.fc2 = self.fc3 = None
    if adaptive_softmax_cutoff is not None:
        assert not share_embed
        self.adaptive_softmax = AdaptiveSoftmax(
            num_embeddings, in_channels, adaptive_softmax_cutoff,
            dropout=adaptive_softmax_dropout)
    else:
        self.fc2 = Linear(in_channels, out_embed_dim)
        if share_embed:
            assert out_embed_dim == embed_dim, \
                "Shared embed weights implies same dimensions " \
                " out_embed_dim={} vs embed_dim={}".format(out_embed_dim, embed_dim)
            self.fc3 = nn.Linear(out_embed_dim, num_embeddings)
            self.fc3.weight = self.embed_tokens.weight
        else:
            self.fc3 = Linear(out_embed_dim, num_embeddings, dropout=dropout)
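# Illustration (not part of the snippets above): the loop in the FConv-style
# decoder keeps `layer_in_channels` so a layer with residual=r draws its skip
# connection from the input of the layer r positions back. A minimal sketch of
# that bookkeeping, with hypothetical (out_channels, kernel, residual) specs:
def residual_source_dims(convolutions):
    layer_in_channels = [convolutions[0][0]]  # fc1 projects embeddings to this width
    sources = []
    for out_channels, _kernel_size, residual in convolutions:
        sources.append(out_channels if residual == 0 else layer_in_channels[-residual])
        layer_in_channels.append(out_channels)
    return sources

# e.g. three layers where the last one skips two layers back:
assert residual_source_dims([(256, 3, 1), (512, 3, 1), (512, 3, 2)]) == [256, 256, 256]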
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))
    self._future_mask = torch.empty(0)
    self.dropout = args.dropout
    self.decoder_layerdrop = args.decoder_layerdrop
    self.share_input_output_embed = args.share_decoder_input_output_embed
    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    self.embed_dim = embed_dim
    self.output_embed_dim = args.decoder_output_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)
    self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False)
                           if embed_dim != input_embed_dim else None)
    self.embed_positions = (PositionalEmbedding(
        args.max_target_positions, embed_dim, self.padding_idx,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None)
    self.cross_self_attention = getattr(args, "cross_self_attention", False)
    self.layer_wise_attention = getattr(args, "layer_wise_attention", False)
    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerDecoderLayer(args, no_encoder_attn)
        for _ in range(args.decoder_layers)
    ])
    self.num_layers = len(self.layers)
    self.adaptive_softmax = None
    self.project_out_dim = (Linear(embed_dim, self.output_embed_dim, bias=False)
                            if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights
                            else None)
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary), self.output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(len(dictionary), self.output_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5)
    if args.decoder_normalize_before and not getattr(args, "no_decoder_final_norm", False):
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
    if getattr(args, "layernorm_embedding", False):
        self.layernorm_embedding = LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None
    self.tgt_drop = args.tgt_drop
    self.drop_method = args.drop_method
    if self.drop_method == 'drop_tag':
        self.mask = dictionary.indices['<dropped>']
    elif self.drop_method == 'unk_tag':
        self.mask = dictionary.indices['<unk>']
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
    self.args = args
    super().__init__(args, dictionary, embed_tokens)
    self.register_buffer("version", torch.Tensor([3]))
    self._future_mask = torch.empty(0)
    self.dropout = args.dropout
    self.decoder_layerdrop = args.decoder_layerdrop
    self.share_input_output_embed = args.share_decoder_input_output_embed
    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    self.embed_dim = embed_dim
    self.output_embed_dim = args.decoder_output_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)
    if not args.adaptive_input and args.quant_noise_pq > 0:
        self.quant_noise = apply_quant_noise_(
            nn.Linear(embed_dim, embed_dim, bias=False),
            args.quant_noise_pq,
            args.quant_noise_pq_block_size,
        )
    else:
        self.quant_noise = None
    self.project_in_dim = (
        Linear(input_embed_dim, embed_dim, bias=False)
        if embed_dim != input_embed_dim
        else None
    )
    self.embed_positions = (
        PositionalEmbedding(
            args.max_target_positions, embed_dim, self.padding_idx,
            learned=args.decoder_learned_pos,
        )
        if not args.no_token_positional_embeddings
        else None
    )
    if getattr(args, "layernorm_embedding", False):
        self.layernorm_embedding = LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None
    self.cross_self_attention = getattr(args, "cross_self_attention", False)
    if self.decoder_layerdrop > 0.0:
        self.layers = LayerDropModuleList(p=self.decoder_layerdrop)
    else:
        self.layers = nn.ModuleList([])
    self.layers.extend([
        self.build_decoder_layer(args, no_encoder_attn)
        for _ in range(args.decoder_layers)
    ])
    self.num_layers = len(self.layers)
    if args.decoder_normalize_before and not getattr(args, "no_decoder_final_norm", False):
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
    self.project_out_dim = (
        Linear(embed_dim, self.output_embed_dim, bias=False)
        if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights
        else None
    )
    self.adaptive_softmax = None
    self.output_projection = None
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary), self.output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif self.share_input_output_embed:
        self.output_projection = nn.Linear(
            self.embed_tokens.weight.shape[1],
            self.embed_tokens.weight.shape[0],
            bias=False,
        )
        self.output_projection.weight = self.embed_tokens.weight
    else:
        self.output_projection = nn.Linear(
            self.output_embed_dim, len(dictionary), bias=False
        )
        nn.init.normal_(
            self.output_projection.weight, mean=0, std=self.output_embed_dim ** -0.5
        )
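# Illustration (not part of the snippets above): several of these decoders tie
# the output projection to the input embedding, so logits are x @ E^T and no
# extra vocabulary-sized matrix is learned. A minimal standalone sketch of that
# weight-sharing pattern, with made-up sizes:
import torch
import torch.nn as nn

vocab_size, embed_dim = 1000, 16
embed_tokens = nn.Embedding(vocab_size, embed_dim)
output_projection = nn.Linear(embed_dim, vocab_size, bias=False)
output_projection.weight = embed_tokens.weight  # shared Parameter, as in the tied branch

x = torch.randn(2, 5, embed_dim)   # (batch, target_len, embed_dim) decoder features
logits = output_projection(x)      # (batch, target_len, vocab_size)
assert logits.shape == (2, 5, vocab_size)
assert output_projection.weight.data_ptr() == embed_tokens.weight.data_ptr()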
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, left_pad=False, final_norm=True):
    super().__init__(dictionary)
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed
    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    output_embed_dim = args.decoder_output_dim
    padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim
    self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None
    self.embed_positions = PositionalEmbedding(
        args.max_target_positions, embed_dim, padding_idx,
        left_pad=left_pad,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None
    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerDecoderLayer(args, no_encoder_attn)
        for _ in range(args.decoder_layers)
    ])
    """ MODIFIED: add copying mechanism as a separate multi-head attention """
    if args.use_copy_scores:
        assert not no_encoder_attn, \
            "copy scores cannot be computed if " \
            "there is no encoder-decoder attention"
        # Number of heads in copy attention layer is an optional argument
        self.copy_attention_heads = (args.decoder_attention_heads
                                     if args.copy_attention_heads == 0
                                     else args.copy_attention_heads)
        self.copy_attention = MultiheadAttention(
            embed_dim, self.copy_attention_heads,
            dropout=args.attention_dropout,
        )
        # (NOTE) For computing alpha.
        self.copy_balancing_layer = Linear(input_embed_dim, 1)
        if args.decode_with_edit_labels:
            raise NotImplementedError
    else:
        self.copy_attention = None
        self.copy_balancing_layer = None
    # Alpha scheduler & diagnostic checker
    self.alpha_warmup = args.alpha_warmup
    self.num_batches = 0
    self.num_copies = 0
    self.mean_alpha = 0.0
    # Zero out generative probability of a word if also in source sentence
    self.pad_copied_words = args.pad_copied_words
    self.adaptive_softmax = None
    self.project_out_dim = Linear(embed_dim, output_embed_dim, bias=False) \
        if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary), output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif not self.share_input_output_embed:
        """ MODIFIED: require share_input_output_embed for copying mechanism """
        raise NotImplementedError(
            "copying mechanism requires share_input_output_embed"
        )
    self.register_buffer('version', torch.Tensor([2]))
    self.normalize = args.decoder_normalize_before and final_norm
    if self.normalize:
        self.layer_norm = LayerNorm(embed_dim)
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, src_dict=None):
    super().__init__(dictionary)
    self.register_buffer('version', torch.Tensor([3]))
    self.dropout = args.dropout
    self.decoder_layerdrop = args.decoder_layerdrop
    self.share_input_output_embed = args.share_decoder_input_output_embed
    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    self.output_embed_dim = args.decoder_output_dim
    self.src_dict = src_dict
    self.alignment_layer = getattr(args, 'alignment_layer', 2)
    if hasattr(args, "alignment_task") and args.alignment_task == 'supalign':
        self.add_sup_align_module = True
    else:
        self.add_sup_align_module = False
    self.padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)
    self.project_in_dim = Linear(
        input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None
    self.embed_positions = PositionalEmbedding(
        args.max_target_positions, embed_dim, self.padding_idx,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None
    self.cross_self_attention = getattr(args, 'cross_self_attention', False)
    self.layer_wise_attention = getattr(args, 'layer_wise_attention', False)
    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerDecoderLayer(
            args, no_encoder_attn,
            add_suphead=(self.add_sup_align_module and idx == self.alignment_layer))
        for idx in range(args.decoder_layers)
    ])
    self.adaptive_softmax = None
    self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \
        if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary), self.output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(len(dictionary), self.output_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5)
    if args.decoder_normalize_before and not getattr(args, 'no_decoder_final_norm', False):
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
    if getattr(args, 'layernorm_embedding', False):
        self.layernorm_embedding = LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))
    self._future_mask = torch.empty(0)
    if args.PRUNE_BOOL:
        self.decoder_self_attn_path = args.DECODER_SELF_ATTN_PATH
        self.encoder_decoder_attn_path = args.ENCODER_DECODER_ATTN_PATH
        # (no_layers, 1, no_head, 1024, 1024)
        self.decoder_self_attn_pattern = torch.from_numpy(np.load(self.decoder_self_attn_path))
        # (no_layers, 1, no_head, 1024, 1024)
        self.encoder_decoder_attn_pattern = torch.from_numpy(np.load(self.encoder_decoder_attn_path))
        if args.CUDA:
            self.decoder_self_attn_pattern = self.decoder_self_attn_pattern.cuda()
            self.encoder_decoder_attn_pattern = self.encoder_decoder_attn_pattern.cuda()
    self.dropout = args.dropout
    self.decoder_layerdrop = args.decoder_layerdrop
    self.share_input_output_embed = args.share_decoder_input_output_embed
    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    self.embed_dim = embed_dim
    self.output_embed_dim = args.decoder_output_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)
    self.project_in_dim = (
        Linear(input_embed_dim, embed_dim, bias=False)
        if embed_dim != input_embed_dim
        else None
    )
    self.embed_positions = (
        PositionalEmbedding(
            args.max_target_positions, embed_dim, self.padding_idx,
            learned=args.decoder_learned_pos,
        )
        if not args.no_token_positional_embeddings
        else None
    )
    self.cross_self_attention = getattr(args, "cross_self_attention", False)
    self.layer_wise_attention = getattr(args, "layer_wise_attention", False)
    self.layers = nn.ModuleList([])
    if args.PRUNE_BOOL:
        self.layers.extend(
            [
                TransformerDecoderLayer(
                    args,
                    self.decoder_self_attn_pattern[i],
                    self.encoder_decoder_attn_pattern[i],
                    no_encoder_attn,
                )
                for i in range(args.decoder_layers)
            ]
        )
    else:
        self.layers.extend(
            [
                TransformerDecoderLayer(args, None, None, no_encoder_attn)
                for i in range(args.decoder_layers)
            ]
        )
    self.num_layers = len(self.layers)
    self.adaptive_softmax = None
    self.project_out_dim = (
        Linear(embed_dim, self.output_embed_dim, bias=False)
        if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights
        else None
    )
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary), self.output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(len(dictionary), self.output_embed_dim)
        )
        nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5)
    if args.decoder_normalize_before and not getattr(args, "no_decoder_final_norm", False):
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
    if getattr(args, "layernorm_embedding", False):
        self.layernorm_embedding = LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None
def __init__(self, args, dictionary, embed_tokens, left_pad=False, final_norm=True):
    super().__init__(dictionary)
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed
    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    output_embed_dim = args.decoder_output_dim
    model_dim = args.decoder_model_dim
    padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim
    self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False, uniform=False) \
        if embed_dim != input_embed_dim else None
    self.embed_positions = PositionalEmbedding(
        args.max_target_positions, embed_dim, padding_idx,
        left_pad=left_pad,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None
    self.tgt_embed = nn.Parameter(nn.init.normal_(
        torch.Tensor(embed_dim), mean=0, std=embed_dim ** -0.5
    )) if args.src_tgt_embed else None
    self.input_layer = ReformerInputLayer(args)
    self.layers = nn.ModuleList([])
    self.layers.extend([
        ReformerDecoderLayer(args)
        for _ in range(args.decoder_layers)
    ])
    self.adaptive_softmax = None
    self.project_out_dim = Linear(model_dim, output_embed_dim, bias=False, uniform=False) \
        if model_dim != output_embed_dim else None
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary), output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), output_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5)
    self.register_buffer('version', torch.Tensor([2]))
    self.normalize = args.decoder_normalize_before and final_norm
    if self.normalize:
        self.layer_norm = LayerNorm(model_dim)
    # output_layer function to compress model output before softmax
    # Target x Source x Batch x Channel -> T x B x C
    self.output_layer = ReformerOutputLayer(args)
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, left_pad=False, final_norm=True):
    super().__init__(dictionary)
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed
    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    output_embed_dim = args.decoder_output_dim
    padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim
    self.project_in_dim = Linear(
        input_embed_dim, embed_dim, bias=False, uniform=False) if embed_dim != input_embed_dim else None
    self.embed_positions = PositionalEmbedding(
        args.max_target_positions, embed_dim, padding_idx,
        left_pad=left_pad,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None
    self.decoder_mix_layers = getattr(args, "decoder_mix_layers", 3)
    self.layers = nn.ModuleList([])
    self.layers.extend([
        MANDecoderLayer(args, no_encoder_attn, mix=True, no_man=args.no_man_decoder)
    ])
    self.adaptive_softmax = None
    self.project_out_dim = Linear(
        embed_dim, output_embed_dim, bias=False, uniform=False) if embed_dim != output_embed_dim else None
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary), output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(len(dictionary), output_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5)
    self.register_buffer('version', torch.Tensor([2]))
    self.normalize = args.decoder_normalize_before and final_norm
    if self.normalize:
        self.layer_norm = LayerNorm(embed_dim)
def __init__(self, args, dictionary, embed_tokens):
    self.args = args
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))
    self._future_mask = torch.empty(0)
    self.dropout_module = FairseqDropout(args.dropout, module_name=self.__class__.__name__)
    self.dropword_module = FairseqFeatureDropout(args.word_dropout, module_name=self.__class__.__name__)
    self.decoder_layerdrop = args.decoder_layerdrop
    self.share_input_output_embed = args.share_decoder_input_output_embed
    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    self.output_embed_dim = args.decoder_output_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)
    self.project_in_dim = (
        Linear(input_embed_dim, embed_dim, bias=False)
        if embed_dim != input_embed_dim
        else None
    )
    self.embed_positions = (
        PositionalEmbedding(
            args.max_target_positions, embed_dim, self.padding_idx,
            learned=args.decoder_learned_pos,
        )
        if not args.no_token_positional_embeddings
        else None
    )
    assert not args.layernorm_embedding or not args.decoder_normalize_before
    if args.layernorm_embedding:
        self.layernorm_embedding = LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None
    if self.decoder_layerdrop > 0.0:
        self.layers = LayerDropModuleList(p=self.decoder_layerdrop)
    else:
        self.layers = nn.ModuleList([])
    self.layers.extend([self.build_decoder_layer(i, args) for i in range(args.decoder_layers)])
    self.num_layers = len(self.layers)
    if args.decoder_normalize_before:
        self.layer_norm = LayerNorm(embed_dim)
        self.proj_layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
        self.proj_layer_norm = None
    self.project_out_dim = (
        Linear(embed_dim, self.output_embed_dim, bias=False)
        if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights
        else None
    )
    self.adaptive_softmax = None
    self.output_projection = None
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary), self.output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif self.share_input_output_embed:
        self.output_projection = nn.Linear(
            self.embed_tokens.weight.shape[1],
            self.embed_tokens.weight.shape[0],
            bias=False,
        )
        self.output_projection.weight = self.embed_tokens.weight
    else:
        self.output_projection = nn.Linear(
            self.output_embed_dim, len(dictionary), bias=False
        )
        nn.init.normal_(
            self.output_projection.weight, mean=0, std=self.output_embed_dim ** -0.5
        )
def __init__(self, args, dictionary, embed_tokens, left_pad=False):
    super().__init__(dictionary)
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed
    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    self.unk_idx = dictionary.unk()
    self.eos_idx = dictionary.eos()
    self.max_target_positions = args.max_target_positions
    self.output_dim = args.decoder_embed_dim
    self.self_target = args.self_target
    self.future_target = args.future_target
    self.past_target = args.past_target
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
    self.input_dropout = torch.tensor(
        args.input_dropout) if args.input_dropout > 0 else None
    self.embed_positions = PositionalEmbedding(
        args.max_target_positions, embed_dim, self.padding_idx,
        left_pad=left_pad,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None
    self.forward_layers = nn.ModuleList([
        TransformerDecoderLayer(args)
        for _ in range(args.decoder_layers)
    ])
    self.backward_layers = nn.ModuleList([
        TransformerDecoderLayer(args)
        for _ in range(args.decoder_layers)
    ]) if not args.single_tower else self.forward_layers
    self.single_tower = args.single_tower
    self.full_attn_layer = None
    self.full_linear_layer = None
    if self.self_target:
        if args.linear_final_layer:
            self.full_linear_layer = Linear(embed_dim * 2, embed_dim, args.linear_final_layer_bias)
        else:
            self.full_attn_layer = BidirectionalTransformerDecoderLayer(args)
    self.load_softmax = not getattr(args, 'remove_head', False)
    self.embed_out = None
    self.adaptive_softmax = None
    if self.load_softmax:
        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary), args.decoder_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dictionary), embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=embed_dim ** -0.5)
    else:
        self.share_input_output_embed = False
def __init__(self, args, dictionary, embed_tokens, classification_head=None):
    super().__init__(dictionary)
    self.onnx_trace = False
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed
    self.embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions
    self.self_target = args.self_target
    self.future_target = args.future_target
    self.past_target = args.past_target
    self.char_inputs = args.char_inputs
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(self.embed_dim)
    self.embed_positions = PositionalEmbedding(
        args.max_target_positions, self.embed_dim, self.padding_idx,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None
    self.forward_layers = nn.ModuleList([
        TransformerDecoderLayer(
            args,
            no_encoder_attn=True,
            add_bias_kv=not args.no_bias_kv,
            add_zero_attn=args.no_bias_kv,
        )
        for _ in range(args.decoder_layers)
    ])
    self.backward_layers = nn.ModuleList([
        TransformerDecoderLayer(
            args,
            no_encoder_attn=True,
            add_bias_kv=not args.no_bias_kv,
            add_zero_attn=args.no_bias_kv,
        )
        for _ in range(args.decoder_layers)
    ])
    self.full_attn_layer = None
    self.full_linear_layer = None
    if self.self_target:
        if args.linear_final_layer:
            self.full_linear_layer = Linear(self.embed_dim * 2, self.embed_dim, args.linear_final_layer_bias)
        else:
            self.full_attn_layer = BidirectionalTransformerDecoderLayer(args)
    self.load_softmax = not getattr(args, 'remove_head', False)
    self.embed_out = None
    self.adaptive_softmax = None
    self.classification_head = classification_head
    if self.load_softmax:
        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary), args.decoder_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dictionary), self.embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=self.embed_dim ** -0.5)