def __init__( self, dictionary, embed_dim=512, transformer_config=DEFAULT_ENC_TRANSFORMER_CONFIG, conv_config=DEFAULT_DEC_CONV_CONFIG, encoder_output_dim=512, ): super().__init__(dictionary) vocab_size = len(dictionary) self.padding_idx = dictionary.pad() self.embed_tokens = Embedding(vocab_size, embed_dim, self.padding_idx) self.conv_layers = nn.ModuleList() if conv_config is not None: for i in range(len(conv_config)): out_channels, kernel_size, layer_norm = conv_config[i] if i == 0: conv_layer = LinearizedConv1d(embed_dim, out_channels, kernel_size, padding=kernel_size - 1) else: conv_layer = LinearizedConv1d( conv_config[i - 1][0], out_channels, kernel_size, padding=kernel_size - 1, ) self.conv_layers.append(conv_layer) if layer_norm: self.conv_layers.append(nn.LayerNorm(out_channels)) self.conv_layers.append(nn.ReLU()) self.layers = nn.ModuleList() if conv_config is not None: if conv_config[-1][0] != transformer_config[0][0]: self.layers.append( Linear(conv_config[-1][0], transformer_config[0][0])) self.layers.append( TransformerDecoderLayer( prepare_transformer_decoder_params(*transformer_config[0]))) for i in range(1, len(transformer_config)): if transformer_config[i - 1][0] != transformer_config[i][0]: self.layers.append( Linear(transformer_config[i - 1][0], transformer_config[i][0])) self.layers.append( TransformerDecoderLayer( prepare_transformer_decoder_params( *transformer_config[i]))) self.fc_out = Linear(transformer_config[-1][0], vocab_size)
def build_decoder_layer(self, args, no_encoder_attn=False): decoder_embed_dim = args.decoder_embed_dim args.decoder_embed_dim = (decoder_embed_dim + self.lang_embed_dim + args.encoder_embed_dim) res = TransformerDecoderLayer(args, no_encoder_attn=True) args.decoder_embed_dim = decoder_embed_dim return res
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): super().__init__(dictionary) self.dropout = args.decoder_dropout self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim self.output_embed_dim = args.decoder_output_dim args.encoder_embed_dim = embed_dim self.layerdrop = args.decoder_layerdrop padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt( embed_dim) # todo: try with input_embed_dim self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None) self.embed_positions = (PositionalEmbedding( args.max_target_positions, embed_dim, padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None) args = copy.deepcopy(args) args.dropout = args.decoder_dropout args.attention_dropout = args.decoder_attention_dropout args.activation_dropout = args.decoder_activation_dropout self.layers = nn.ModuleList([]) self.layers.extend([ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) self.project_out_dim = (Linear( embed_dim, self.output_embed_dim, bias=False) if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None) if not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), self.output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim**-0.5) if args.decoder_normalize_before and not getattr( args, "no_decoder_final_norm", False): self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): super().__init__(dictionary) self.register_buffer('version', torch.Tensor([3])) self.dropout = args.dropout self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim self.output_embed_dim = args.decoder_output_dim padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt(embed_dim) # todo: try with input_embed_dim self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None self.embed_positions = PositionalEmbedding( args.max_target_positions, embed_dim, padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.layers = nn.ModuleList([]) self.layers.extend([ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) self.adaptive_softmax = None self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \ if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), self.output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), self.output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5) if args.scale_norm: scale = embed_dim**0.5 else: scale = None if args.decoder_normalize_before and not getattr(args, 'no_decoder_final_norm', False): self.layer_norm = LayerNorm(embed_dim, scale=scale) else: self.layer_norm = None
def build_decoder_layer(self, args, i=None, no_encoder_attn=False): modular_layer_indices = eval(args.decoder_modular_layer_indices) if type(modular_layer_indices) is int: modular_layer_indices = [modular_layer_indices] if i in modular_layer_indices: return TransformerModularDecoderLayer(args, no_encoder_attn) else: return TransformerDecoderLayer(args, no_encoder_attn)
def build_decoder_layer(self, args, no_encoder_attn=False): layer = TransformerDecoderLayer(args, no_encoder_attn) checkpoint = getattr(args, "checkpoint_activations", False) if checkpoint: offload_to_cpu = getattr(args, "offload_activations", False) layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu) # if we are checkpointing, enforce that FSDP always wraps the # checkpointed layer, regardless of layer size min_params_to_wrap = ( getattr(args, "min_params_to_wrap", DEFAULT_MIN_PARAMS_TO_WRAP) if not checkpoint else 0 ) layer = fsdp_wrap(layer, min_num_params=min_params_to_wrap) return layer
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): super().__init__(args, dictionary, embed_tokens, no_encoder_attn=no_encoder_attn) self.dictionary = dictionary self.bos = dictionary.bos() self.unk = dictionary.unk() self.eos = dictionary.eos() self.sampling_for_deletion = getattr(args, "sampling_for_deletion", False) self.embed_mask_ins = Embedding(256, self.output_embed_dim * 2, None) self.embed_word_del = Embedding(2, self.output_embed_dim, None) # del_word, ins_mask, ins_word self.early_exit = [int(i) for i in args.early_exit.split(",")] assert len(self.early_exit) == 3 # copy layers for mask-predict/deletion self.layers_msk = None if getattr(args, "no_share_maskpredictor", False): self.layers_msk = nn.ModuleList([ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(self.early_exit[1]) ]) self.layers_del = None if getattr(args, "no_share_discriminator", False): self.layers_del = nn.ModuleList([ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(self.early_exit[0]) ]) if getattr(args, "share_discriminator_maskpredictor", False): assert getattr(args, "no_share_discriminator", False), "must set saperate discriminator" self.layers_msk = self.layers_del
def __init__(self, args, src_dict, padding_idx=1): super().__init__(None) self._future_mask = torch.empty(0) self.args = args self.padding_idx = src_dict.pad() if src_dict else padding_idx self.n_frames_per_step = args.n_frames_per_step self.out_dim = args.output_frame_dim * args.n_frames_per_step self.dropout_module = FairseqDropout( args.dropout, module_name=self.__class__.__name__ ) self.embed_positions = PositionalEmbedding( args.max_target_positions, args.decoder_embed_dim, self.padding_idx ) self.pos_emb_alpha = nn.Parameter(torch.ones(1)) self.prenet = nn.Sequential( Prenet( self.out_dim, args.prenet_layers, args.prenet_dim, args.prenet_dropout ), nn.Linear(args.prenet_dim, args.decoder_embed_dim), ) self.n_transformer_layers = args.decoder_transformer_layers self.transformer_layers = nn.ModuleList( TransformerDecoderLayer(args) for _ in range(self.n_transformer_layers) ) if args.decoder_normalize_before: self.layer_norm = LayerNorm(args.decoder_embed_dim) else: self.layer_norm = None self.feat_proj = nn.Linear(args.decoder_embed_dim, self.out_dim) self.eos_proj = nn.Linear(args.decoder_embed_dim, 1) self.postnet = Postnet( self.out_dim, args.postnet_conv_dim, args.postnet_conv_kernel_size, args.postnet_layers, args.postnet_dropout, ) self.ctc_proj = None if getattr(args, "ctc_weight", 0.0) > 0.0: self.ctc_proj = nn.Linear(self.out_dim, len(src_dict)) self.apply(decoder_init)
def build_decoder_layer(self, args, no_encoder_attn=False): layer = TransformerDecoderLayer(args, no_encoder_attn) if getattr(args, "checkpoint_activations", False): layer = checkpoint_wrapper(layer) return layer
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, src_dict=None): super().__init__(dictionary) self.register_buffer('version', torch.Tensor([3])) self.dropout = args.dropout self.decoder_layerdrop = args.decoder_layerdrop self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim self.output_embed_dim = args.decoder_output_dim self.src_dict = src_dict self.alignment_layer = getattr(args, 'alignment_layer', 2) if hasattr(args, "alignment_task") and args.alignment_task == 'supalign': self.add_sup_align_module = True else: self.add_sup_align_module = False self.padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt( embed_dim) self.project_in_dim = Linear( input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None self.embed_positions = PositionalEmbedding( args.max_target_positions, embed_dim, self.padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.cross_self_attention = getattr(args, 'cross_self_attention', False) self.layer_wise_attention = getattr(args, 'layer_wise_attention', False) self.layers = nn.ModuleList([]) self.layers.extend([ TransformerDecoderLayer( args, no_encoder_attn, add_suphead=(self.add_sup_align_module and idx == self.alignment_layer)) for idx in range(args.decoder_layers) ]) self.adaptive_softmax = None self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \ if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), self.output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), self.output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim**-0.5) if args.decoder_normalize_before and not getattr( args, 'no_decoder_final_norm', False): self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None if getattr(args, 'layernorm_embedding', False): self.layernorm_embedding = LayerNorm(embed_dim) else: self.layernorm_embedding = None
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): super().__init__(dictionary) self.register_buffer("version", torch.Tensor([3])) self._future_mask = torch.empty(0) if args.PRUNE_BOOL: self.decoder_self_attn_path = args.DECODER_SELF_ATTN_PATH self.encoder_decoder_attn_path = args.ENCODER_DECODER_ATTN_PATH self.decoder_self_attn_pattern = torch.from_numpy(np.load(self.decoder_self_attn_path)) #(no_layers, 1, no_head, 1024, 1024) self.encoder_decoder_attn_pattern = torch.from_numpy(np.load(self.encoder_decoder_attn_path)) #(no_layers, 1, no_head, 1024, 1024) if args.CUDA: self.decoder_self_attn_pattern = self.decoder_self_attn_pattern.cuda() self.encoder_decoder_attn_pattern = self.encoder_decoder_attn_pattern.cuda() self.dropout = args.dropout self.decoder_layerdrop = args.decoder_layerdrop self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim self.embed_dim = embed_dim self.output_embed_dim = args.decoder_output_dim self.padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim) self.project_in_dim = ( Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None ) self.embed_positions = ( PositionalEmbedding( args.max_target_positions, embed_dim, self.padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None ) self.cross_self_attention = getattr(args, "cross_self_attention", False) self.layer_wise_attention = getattr(args, "layer_wise_attention", False) self.layers = nn.ModuleList([]) if args.PRUNE_BOOL: self.layers.extend( [ TransformerDecoderLayer(args, self.decoder_self_attn_pattern[i], self.encoder_decoder_attn_pattern[i], no_encoder_attn) for i in range(args.decoder_layers) ] ) else: self.layers.extend( [ TransformerDecoderLayer(args, None, None, no_encoder_attn) for i in range(args.decoder_layers) ] ) self.num_layers = len(self.layers) self.adaptive_softmax = None self.project_out_dim = ( Linear(embed_dim, self.output_embed_dim, bias=False) if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None ) if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), self.output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), self.output_embed_dim) ) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5) if args.decoder_normalize_before and not getattr( args, "no_decoder_final_norm", False ): self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None if getattr(args, "layernorm_embedding", False): self.layernorm_embedding = LayerNorm(embed_dim) else: self.layernorm_embedding = None
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): super().__init__(dictionary) self.register_buffer("version", torch.Tensor([3])) self._future_mask = torch.empty(0) self.dropout = args.dropout self.decoder_layerdrop = args.decoder_layerdrop self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim self.embed_dim = embed_dim self.output_embed_dim = args.decoder_output_dim self.padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt( embed_dim) self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None) self.embed_positions = (PositionalEmbedding( args.max_target_positions, embed_dim, self.padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None) self.cross_self_attention = getattr(args, "cross_self_attention", False) self.layer_wise_attention = getattr(args, "layer_wise_attention", False) self.layers = nn.ModuleList([]) self.layers.extend([ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) self.num_layers = len(self.layers) self.adaptive_softmax = None self.project_out_dim = (Linear( embed_dim, self.output_embed_dim, bias=False) if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None) if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), self.output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), self.output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim**-0.5) if args.decoder_normalize_before and not getattr( args, "no_decoder_final_norm", False): self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None if getattr(args, "layernorm_embedding", False): self.layernorm_embedding = LayerNorm(embed_dim) else: self.layernorm_embedding = None self.tgt_drop = args.tgt_drop self.drop_method = args.drop_method if self.drop_method == 'drop_tag': self.mask = dictionary.indices['<dropped>'] elif self.drop_method == 'unk_tag': self.mask = dictionary.indices['<unk>']
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): super().__init__(dictionary) self.register_buffer("version", torch.Tensor([3])) self._future_mask = torch.empty(0) # self.dropout = [0.05, 0.1, 0.25, 0.3] self.dropout = [0, 0, 0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3] # self.dropout = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.3] self.index = None self.decoder_layerdrop = args.decoder_layerdrop self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim self.embed_dim = embed_dim self.output_embed_dim = args.decoder_output_dim self.padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim) # self.embedding_hidden_mapping_out = SlimmableLinear([int(embed_dim / 4), int(embed_dim * 2 / 4), int(embed_dim * 3 / 4), embed_dim], # [embed_dim, embed_dim, embed_dim, embed_dim]) # self.embedding_hidden_mapping_in = SlimmableLinear([embed_dim, embed_dim, embed_dim, embed_dim], # [int(embed_dim / 4), int(embed_dim * 2 / 4), int(embed_dim * 3 / 4), embed_dim]) self.embedding_hidden_mapping_in = SlimmableLinear([embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim], [int(embed_dim * 4 / 16), int(embed_dim * 5 / 16), int(embed_dim * 6 / 16), int(embed_dim * 7 / 16), int(embed_dim * 8 / 16), int(embed_dim * 9 / 16), int(embed_dim * 10 / 16), int(embed_dim * 11 / 16), int(embed_dim * 12 / 16), int(embed_dim * 13 / 16), int(embed_dim * 14 / 16), int(embed_dim * 15 / 16), embed_dim]) self.embedding_hidden_mapping_out = SlimmableLinear([int(embed_dim * 4 / 16), int(embed_dim * 5 / 16), int(embed_dim * 6 / 16), int(embed_dim * 7 / 16), int(embed_dim * 8 / 16), int(embed_dim * 9 / 16), int(embed_dim * 10 / 16), int(embed_dim * 11 / 16), int(embed_dim * 12 / 16), int(embed_dim * 13 / 16), int(embed_dim * 14 / 16), int(embed_dim * 15 / 16), embed_dim], [embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim, embed_dim]) self.project_in_dim = ( Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None ) self.embed_positions = ( PositionalEmbedding( args.max_target_positions, embed_dim, self.padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None ) self.cross_self_attention = getattr(args, "cross_self_attention", False) self.layer_wise_attention = getattr(args, "layer_wise_attention", False) self.layers = nn.ModuleList([]) self.layers.extend( [ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ] ) self.num_layers = len(self.layers) self.adaptive_softmax = None self.project_out_dim = ( Linear(embed_dim, self.output_embed_dim, bias=False) if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None ) if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), self.output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), self.output_embed_dim) ) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5) if args.decoder_normalize_before and not getattr( args, "no_decoder_final_norm", False ): self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None if getattr(args, "layernorm_embedding", False): self.layernorm_embedding = LayerNorm(embed_dim) else: self.layernorm_embedding = None
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): super().__init__(dictionary) self.register_buffer('version', torch.Tensor([3])) self.dropout = args.dropout self.decoder_layerdrop = args.decoder_layerdrop self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim self.output_embed_dim = args.decoder_output_dim self.padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt( embed_dim) self.project_in_dim = Linear( input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None self.embed_positions = PositionalEmbedding( args.max_target_positions, embed_dim, self.padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.cross_self_attention = getattr(args, 'cross_self_attention', False) self.layer_wise_attention = getattr(args, 'layer_wise_attention', False) self.layers = nn.ModuleList([]) self.layers.extend([ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) self.adaptive_softmax = None self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \ if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), self.output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), self.output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim**-0.5) if args.decoder_normalize_before and not getattr( args, 'no_decoder_final_norm', False): self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None if getattr(args, 'layernorm_embedding', False): self.layernorm_embedding = LayerNorm(embed_dim) else: self.layernorm_embedding = None self.ffn_net_src = nn.Sequential( Linear(args.bert_hidden_size, args.ffn_net_hidden_size), nn.ReLU(), Linear(args.ffn_net_hidden_size, 512)) self.ffn_net_cand = nn.Sequential( Linear(args.bert_hidden_size, args.ffn_net_hidden_size), nn.ReLU(), Linear(args.ffn_net_hidden_size, 512)) self.ffn_net_ce = nn.Sequential( Linear(args.bert_hidden_size, args.ffn_net_hidden_size), nn.ReLU(), Linear(args.ffn_net_hidden_size, args.bert_vocab_size))
def build_decoder_layer(self, args, no_encoder_attn=False): return TransformerDecoderLayer(args, no_encoder_attn)
def __init__( self, cfg: Wav2Vec2Seq2SeqConfig, dictionary, embed_tokens, no_encoder_attn=False, ): super().__init__(dictionary) self.dropout = cfg.decoder_dropout self.share_input_output_embed = cfg.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = cfg.decoder_embed_dim self.output_embed_dim = cfg.decoder_embed_dim self.layerdrop = cfg.decoder_layerdrop padding_idx = embed_tokens.padding_idx self.max_target_positions = cfg.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt( embed_dim) # todo: try with input_embed_dim self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None) self.embed_positions = (PositionalEmbedding( cfg.max_target_positions, embed_dim, padding_idx, learned=cfg.decoder_learned_pos, ) if not cfg.no_token_positional_embeddings else None) # TODO: update this when transformer gets converted to dataclass configs transformer_cfg = copy.deepcopy(cfg) with open_dict(transformer_cfg): transformer_cfg.dropout = transformer_cfg.decoder_dropout transformer_cfg.attention_dropout = ( transformer_cfg.decoder_attention_dropout) transformer_cfg.activation_dropout = ( transformer_cfg.decoder_activation_dropout) self.layers = nn.ModuleList([]) self.layers.extend([ TransformerDecoderLayer(transformer_cfg, no_encoder_attn) for _ in range(transformer_cfg.decoder_layers) ]) if not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), self.output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim**-0.5) if transformer_cfg.decoder_normalize_before: self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): super().__init__(dictionary) self.register_buffer('version', torch.Tensor([3])) self.dropout = args.dropout self.decoder_layerdrop = args.decoder_layerdrop self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim self.output_embed_dim = args.decoder_output_dim self.padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt( embed_dim) self.project_in_dim = Linear( input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None self.embed_positions = PositionalEmbedding( args.max_target_positions, embed_dim, self.padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.cross_self_attention = getattr(args, 'cross_self_attention', False) self.layer_wise_attention = getattr(args, 'layer_wise_attention', False) self.layers = nn.ModuleList([]) self.layers.extend([ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \ if embed_dim != self.output_embed_dim else None if not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), self.output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim**-0.5) if args.decoder_normalize_before and not getattr( args, 'no_decoder_final_norm', False): self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None if getattr(args, 'layernorm_embedding', False): self.layernorm_embedding = LayerNorm(embed_dim) else: self.layernorm_embedding = None