def __init__(self, audio_encoder, video_encoder, decoder):
    super().__init__()
    self.audio_encoder = audio_encoder
    self.video_encoder = video_encoder
    # positional arguments: embed_dim, num_heads, kdim, vdim, dropout
    self.av_attn = MultiheadAttention(
        512, 8, 512, 512, 0.15, encoder_decoder_attention=True
    )
    self.decoder = decoder
def __init__(self, args):
    super().__init__()
    self.embed_dim = args.encoder_embed_dim
    self.self_attn = MultiheadAttention(
        self.embed_dim,
        args.encoder_attention_heads,
        dropout=args.attention_dropout,
    )
    self.self_attn_layer_norm = LayerNorm(self.embed_dim)
    self.dropout = args.dropout
    self.activation_fn = utils.get_activation_fn(
        activation=getattr(args, 'activation_fn', 'relu'))
    self.activation_dropout = getattr(args, 'activation_dropout', 0)
    if self.activation_dropout == 0:
        # for backwards compatibility with models that use args.relu_dropout
        self.activation_dropout = getattr(args, 'relu_dropout', 0)
    self.normalize_before = args.encoder_normalize_before
    self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
    self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
    self.final_layer_norm = LayerNorm(self.embed_dim)
def __init__(self, args):
    super().__init__()
    self.embed_dim = args.encoder_embed_dim
    if args.max_relative_length == -1:
        self.self_attn = MultiheadAttention(
            self.embed_dim,
            args.encoder_attention_heads,
            dropout=args.attention_dropout,
        )
    else:
        self.self_attn = RelativeMultiheadAttention(
            self.embed_dim,
            args.encoder_attention_heads,
            args.max_relative_length,
            dropout=args.attention_dropout,
            k_only=args.k_only,
        )
    self.dropout = args.dropout
    self.relu_dropout = args.relu_dropout
    self.normalize_before = args.encoder_normalize_before
    self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
    self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
    self.layer_norms = nn.ModuleList([LayerNorm(self.embed_dim) for i in range(2)])
def __init__(self, args):
    super().__init__()
    self.embed_dim = args.encoder_embed_dim
    self.self_attn = MultiheadAttention(
        self.embed_dim,
        args.encoder_attention_heads,
        dropout=args.attention_dropout,
    )
    self.dropout = args.dropout
    self.relu_dropout = args.relu_dropout
    self.fuse_dropout_add = args.fuse_dropout_add
    self.fuse_relu_dropout = args.fuse_relu_dropout
    self.normalize_before = args.encoder_normalize_before
    self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
    self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
    self.maybe_ln1 = MaybeLayerNorm(self.embed_dim, self.normalize_before,
                                    fuse=args.fuse_layer_norm)
    self.maybe_ln2 = MaybeLayerNorm(self.embed_dim, self.normalize_before,
                                    fuse=args.fuse_layer_norm)
def __init__(self, args):
    super().__init__()
    self.embed_dim = args.decoder_embed_dim
    self.self_attn = MultiheadAttention(
        self.embed_dim,
        args.decoder_attention_heads,
        dropout=args.attention_dropout,
    )
    self.dropout = args.dropout
    self.relu_dropout = args.relu_dropout
    self.normalize_before = args.decoder_normalize_before
    '''
    self.encoder_attn = MultiheadAttention(
        self.embed_dim,
        args.decoder_attention_heads,
        dropout=args.attention_dropout,
    )
    '''
    self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
    self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)
    self.layer_norms = nn.ModuleList(
        [LayerNorm(self.embed_dim) for i in range(3)])
def __init__(
    self,
    embed_dim,
    attention_heads,
    self_attention=True,
    attention_dropout=0.1,
    dropout=0.3,
    normalize_before=False,
    # activation_fn='relu',
    activation_dropout=0,
):
    super().__init__()
    self.embed_dim = embed_dim
    self.self_attn = MultiheadAttention(self.embed_dim,
                                        attention_heads,
                                        dropout=attention_dropout,
                                        self_attention=self_attention)
    self.attn_layer_norm = LayerNorm(self.embed_dim)
    self.dropout = dropout
    # self.activation_fn = activation_fn
    # self.activation_dropout = activation_dropout
    self.normalize_before = normalize_before
def __init__(self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False):
    super().__init__(args, no_encoder_attn=no_encoder_attn,
                     add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn)
    self.add_context = args.context_position in ["both", "decoder"]
    self.context_attention_type = args.context_decoder_attention_type
    if self.add_context:
        self.context_attn = MultiheadAttention(
            self.embed_dim,
            args.decoder_attention_heads,
            dropout=args.attention_dropout,
            encoder_decoder_attention=True,
        )
        self.context_gating_wi = Linear(self.embed_dim, self.embed_dim)
        self.context_gating_ws = Linear(self.embed_dim, self.embed_dim)
        self.context_attn_layer_norm = LayerNorm(self.embed_dim)
def __init__(self, args, dictionary, embed_tokens):
    encoder_layers = args.encoder_layers
    args.encoder_layers = 0
    super().__init__(args, dictionary, embed_tokens)
    self.layers = nn.ModuleList([])
    self.layers.extend(
        [EndorsementDetectorEncoderLayer(args) for i in range(1)])
    args.encoder_layers = encoder_layers
    self.eds_fc1 = Linear(args.encoder_embed_dim, args.encoder_embed_dim, bias=False)
    self.eds_fc2 = Linear(args.encoder_embed_dim, args.encoder_embed_dim, bias=False)
    self.eds_layer_norm = LayerNorm(args.encoder_embed_dim)
    self.self_attn = MultiheadAttention(args.encoder_embed_dim,
                                        args.encoder_attention_heads,
                                        dropout=0,
                                        self_attention=True)
def build_self_attention(self, embed_dim, args, add_bias_kv=False, add_zero_attn=False): return MultiheadAttention( embed_dim, args.decoder_attention_heads, dropout=args.attention_dropout, add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, self_attention=not getattr(args, "cross_self_attention", False), q_noise=self.quant_noise, qn_block_size=self.quant_noise_block_size, positional_embeddings_in_attention=getattr( args, "positional_embeddings_in_attention", False), symmetric_kv_context_params=getattr( args, "decoder_self_symmetric_kv_context_params", False), symmetric_kv_positional_params=getattr( args, "decoder_self_symmetric_kv_positional_params", False), )
def __init__(self, args):
    super().__init__()
    self.embed_dim = args.encoder_embed_dim
    self.self_attn = MultiheadAttention(
        self.embed_dim,
        args.encoder_attention_heads,
        dropout=args.attention_dropout,
    )
    self.dropout = args.dropout
    self.relu_dropout = args.relu_dropout
    self.normalize_before = args.encoder_normalize_before
    self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
    self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
    n_layernorm = 2
    self.fc_factor = 1.0
    self.macaron = getattr(args, "macaron", False)
    if self.macaron:
        self.macaron_fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
        self.macaron_fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
        self.fc_factor = 0.5
        n_layernorm += 1
    self.layer_norms = nn.ModuleList([LayerNorm(self.embed_dim) for i in range(n_layernorm)])
def __init__(
    self,
    embedding_dim: float = 768,
    ffn_embedding_dim: float = 3072,
    num_attention_heads: float = 8,
    dropout: float = 0.1,
    attention_dropout: float = 0.1,
    activation_dropout: float = 0.1,
    activation_fn: str = 'relu',
    attn_scale_factor: int = 1,
    export: bool = False,
    # newly added
    encoder_normalize_before: bool = False,
) -> None:
    super().__init__()
    # Initialize parameters
    self.embedding_dim = embedding_dim
    self.dropout = dropout
    self.activation_dropout = activation_dropout
    # Initialize blocks
    self.activation_fn = utils.get_activation_fn(activation_fn)
    self.self_attn = MultiheadAttention(
        self.embedding_dim,
        num_attention_heads,
        dropout=attention_dropout,
        bias=True,
        scale_factor=attn_scale_factor,
    )
    # newly added
    self.normalize_before = encoder_normalize_before
    # layer norm associated with the self attention layer
    self.self_attn_layer_norm = LayerNorm(self.embedding_dim, export=export)
    self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
    self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)
    # layer norm associated with the position wise feed-forward NN
    self.final_layer_norm = LayerNorm(self.embedding_dim, export=export)
def __init__(self, args, layer_id=-1):
    super().__init__()
    self.embed_dim = args.encoder_embed_dim
    # beg 20191115: multi-hop attention configuration for this layer
    self.layer_id = layer_id
    # stored as attributes so the values can be reused in forward()
    self.attn_type = args.encoder_attn_type
    self.spec_layers = [
        int(i) for i in args.encoder_spec_attn_layers.split(',') if i != ''
    ]
    if self.attn_type == 'MHDA' and self.layer_id in self.spec_layers:
        self.self_attn = MultiHopDependentAttention(
            self.embed_dim,
            args.encoder_attention_heads,
            dropout=args.attention_dropout,
            self_attention=True)
        print('Self Attention [@Encoder Layer-{}] is MHDA.'.format(
            self.layer_id))
    else:
        self.self_attn = MultiheadAttention(self.embed_dim,
                                            args.encoder_attention_heads,
                                            dropout=args.attention_dropout,
                                            self_attention=True)
        print(
            'Self Attention [@Encoder Layer-{}] is vanilla multi-head attention.'
            .format(self.layer_id))
    # end 20191115
    self.self_attn_layer_norm = LayerNorm(self.embed_dim)
    self.dropout = args.dropout
    self.activation_fn = utils.get_activation_fn(
        activation=getattr(args, 'activation_fn', 'relu'))
    self.activation_dropout = getattr(args, 'activation_dropout', 0)
    if self.activation_dropout == 0:
        # for backwards compatibility with models that use args.relu_dropout
        self.activation_dropout = getattr(args, 'relu_dropout', 0)
    self.normalize_before = args.encoder_normalize_before
    self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
    self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
    self.final_layer_norm = LayerNorm(self.embed_dim)
def __init__( self, embedding_dim: float = 768, ffn_embedding_dim: float = 3072, num_attention_heads: float = 8, dropout: float = 0.1, attention_dropout: float = 0.1, activation_dropout: float = 0.1, activation_fn: str = "relu", layer_norm_first: bool = False, ) -> None: super().__init__() # Initialize parameters self.embedding_dim = embedding_dim self.dropout = dropout self.activation_dropout = activation_dropout # Initialize blocks self.activation_fn = utils.get_activation_fn(activation_fn) self.self_attn = MultiheadAttention( self.embedding_dim, num_attention_heads, dropout=attention_dropout, self_attention=True, ) self.dropout1 = nn.Dropout(dropout) self.dropout2 = nn.Dropout(self.activation_dropout) self.dropout3 = nn.Dropout(dropout) self.layer_norm_first = layer_norm_first # layer norm associated with the self attention layer self.self_attn_layer_norm = LayerNorm(self.embedding_dim) self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim) self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim) # layer norm associated with the position wise feed-forward NN self.final_layer_norm = LayerNorm(self.embedding_dim)
def __init__(self, args, dictionary, embed_tokens):
    super().__init__(dictionary)
    self.register_buffer('version', torch.Tensor([3]))
    self.dropout = args.dropout
    self.context_size = args.context_size
    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_source_positions = args.max_source_positions
    self.conv_layer_norm = LayerNorm(embed_dim)
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = PositionalEmbedding(
        args.max_source_positions,
        embed_dim,
        self.padding_idx,
        learned=args.encoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None
    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerEncoderLayer(args) for i in range(args.encoder_layers)
    ])
    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
    self.conv = nn.Conv1d(in_channels=embed_dim,
                          out_channels=embed_dim,
                          kernel_size=(self.context_size, ),
                          padding=(self.context_size - 1) // 2)
    self.self_attn = MultiheadAttention(embed_dim,
                                        args.encoder_attention_heads,
                                        dropout=args.attention_dropout,
                                        self_attention=True)
def __init__(
    self,
    embedding_dim: float = 768,
    ffn_embedding_dim: float = 3072,
    num_attention_heads: float = 8,
    dropout: float = 0.1,
    attention_dropout: float = 0.1,
    activation_dropout: float = 0.1,
    activation_fn: str = 'relu',
    add_bias_kv: bool = False,
    add_zero_attn: bool = False,
    export: bool = False,
) -> None:
    super().__init__()
    # Initialize parameters
    self.embedding_dim = embedding_dim
    self.dropout = dropout
    self.activation_dropout = activation_dropout
    # Initialize blocks
    self.activation_fn = utils.get_activation_fn(activation_fn)
    self.self_attn = MultiheadAttention(
        self.embedding_dim,
        num_attention_heads,
        dropout=attention_dropout,
        add_bias_kv=add_bias_kv,
        add_zero_attn=add_zero_attn,
        self_attention=True,
    )
    # layer norm associated with the self attention layer
    self.self_attn_layer_norm = LayerNorm(self.embedding_dim, export=export)
    self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
    self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)
    # layer norm associated with the position wise feed-forward NN
    self.final_layer_norm = LayerNorm(self.embedding_dim, export=export)
def build_self_attention(self, embed_dim, args):
    return MultiheadAttention(
        embed_dim,
        args.encoder_attention_heads,
        dropout=args.attention_dropout,
        self_attention=True,
        q_noise=self.quant_noise,
        qn_block_size=self.quant_noise_block_size,
        sigsoftmax=args.sigsoftmax,
        mix_softmax=args.mix_softmax,
        mix_type=args.mix_type,
        temperature=args.temperature,
        pre_drop_mix=args.pre_drop_mix,
        pre_mix=args.pre_mix,
        fix_head_dim=args.fix_head_dim,
        use_div_reg=args.use_div_reg,
        synth_attn_type=args.synth_attn_type,
        synth_hidden_dim=args.synth_hidden_dim,
        synth_factor_dim=args.synth_factor_dim,
        synth_trainable_random=args.synth_trainable_random,
        synth_max_len_seq=args.tokens_per_sample,
    )
def build_self_attention(self, embed_dim, args, add_bias_kv=False, add_zero_attn=False):
    collaborative_heads = "decoder" in args.collaborative_heads
    key_dim = args.key_dim or embed_dim
    if collaborative_heads:
        print(f"Decoder uses collaborative heads with key_dim={key_dim}.")
    return MultiheadAttention(
        embed_dim,
        args.decoder_attention_heads,
        collaborative_heads=collaborative_heads,
        kdim=key_dim,
        dropout=args.attention_dropout,
        add_bias_kv=add_bias_kv,
        add_zero_attn=add_zero_attn,
        self_attention=False,  # not getattr(args, "cross_self_attention", False),
        q_noise=self.quant_noise,
        qn_block_size=self.quant_noise_block_size,
    )
def __init__(self, args, index):
    super().__init__()
    self.embed_dim = args.encoder_embed_dim
    self.self_attn_layer_norm = LayerNorm(self.embed_dim)
    self.dropout = args.dropout
    self.activation_fn = utils.get_activation_fn(
        activation=getattr(args, 'activation_fn', 'relu')
    )
    if args.encoder_branch_type is None:
        self.self_attn = MultiheadAttention(
            self.embed_dim,
            args.encoder_attention_heads,
            dropout=args.attention_dropout,
            self_attention=True,
        )
    else:
        layers = []
        embed_dims = []
        heads = []
        num_types = len(args.encoder_branch_type)
        for layer_type in args.encoder_branch_type:
            embed_dims.append(int(layer_type.split(':')[2]))
            heads.append(int(layer_type.split(':')[3]))
            layers.append(self.get_layer(args, index, embed_dims[-1], heads[-1], layer_type))
        assert sum(embed_dims) == self.embed_dim, (sum(embed_dims), self.embed_dim)
        self.self_attn = MultiBranch(layers, embed_dims)
    self.activation_dropout = getattr(args, 'activation_dropout', 0)
    if self.activation_dropout == 0:
        # for backwards compatibility with models that use args.relu_dropout
        self.activation_dropout = getattr(args, 'relu_dropout', 0)
    self.normalize_before = args.encoder_normalize_before
    self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim, init=args.ffn_init)
    self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim, init=args.ffn_init)
    self.final_layer_norm = LayerNorm(self.embed_dim)
def __init__(self, args):
    super().__init__()
    self.embed_dim = args.encoder_embed_dim
    self.context_size = args.context_size
    self.self_attn = MultiheadAttention(
        self.embed_dim,
        args.encoder_attention_heads,
        dropout=args.attention_dropout,
        self_attention=True,
    )
    self.self_attn_layer_norm = LayerNorm(self.embed_dim)
    self.dropout = args.dropout
    self.activation_fn = utils.get_activation_fn(
        activation=getattr(args, 'activation_fn', 'relu')
    )
    self.activation_dropout = getattr(args, 'activation_dropout', 0)
    if self.activation_dropout == 0:
        # for backwards compatibility with models that use args.relu_dropout
        self.activation_dropout = getattr(args, 'relu_dropout', 0)
    self.normalize_before = args.encoder_normalize_before
    self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
    self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
    self.final_layer_norm = LayerNorm(self.embed_dim)
    # three convolutional layers with different context sizes; each is a
    # depthwise-separable convolution to reduce the number of model parameters
    self.conv1 = nn.Conv1d(in_channels=self.embed_dim, out_channels=self.embed_dim,
                           groups=self.embed_dim, kernel_size=(self.context_size, ),
                           padding=(self.context_size - 1) // 2)
    self.conv1_sep = nn.Conv1d(in_channels=self.embed_dim, out_channels=self.embed_dim,
                               kernel_size=1)
    self.conv2 = nn.Conv1d(in_channels=self.embed_dim, out_channels=self.embed_dim,
                           groups=self.embed_dim, kernel_size=(self.context_size + 2, ),
                           padding=(self.context_size + 2 - 1) // 2)
    self.conv2_sep = nn.Conv1d(in_channels=self.embed_dim, out_channels=self.embed_dim,
                               kernel_size=1)
    self.conv3 = nn.Conv1d(in_channels=self.embed_dim, out_channels=self.embed_dim,
                           groups=self.embed_dim, kernel_size=(self.context_size + 4, ),
                           padding=(self.context_size + 4 - 1) // 2)
    self.conv3_sep = nn.Conv1d(in_channels=self.embed_dim, out_channels=self.embed_dim,
                               kernel_size=1)
    self.leakyrelu1 = nn.LeakyReLU(negative_slope=0.01, inplace=True)
    self.leakyrelu2 = nn.LeakyReLU(negative_slope=0.01, inplace=True)
    self.leakyrelu3 = nn.LeakyReLU(negative_slope=0.01, inplace=True)
    self.leakyrelu4 = nn.LeakyReLU(negative_slope=0.01, inplace=True)
    # conv4 is a standard convolution layer that projects the concatenation of
    # the three branches back to embed_dim
    self.conv4 = nn.Conv1d(in_channels=self.embed_dim * 3, out_channels=self.embed_dim,
                           kernel_size=(self.context_size, ),
                           padding=(self.context_size - 1) // 2)
    self.conv_layer_norm = LayerNorm(self.embed_dim)
def build_self_attention(self, embed_dim, args, index, add_bias_kv=False, add_zero_attn=False):
    if args.decoder_branch_type is None:
        return MultiheadAttention(
            embed_dim,
            args.decoder_attention_heads,
            dropout=args.attention_dropout,
            add_bias_kv=add_bias_kv,
            add_zero_attn=add_zero_attn,
            self_attention=not getattr(args, "cross_self_attention", False),
            q_noise=self.quant_noise,
            qn_block_size=self.quant_noise_block_size,
        )
    else:
        embed_dims = []
        for layer_type in args.decoder_branch_type:
            embed_dims.append(int(layer_type.split(':')[2]))
        return MultiBranch(self.self_attn_branches, embed_dims)
def build_self_attention(self, embed_dim, args, add_bias_kv=False, add_zero_attn=False): return MultiheadAttention( embed_dim, args.decoder_attention_heads, dropout=args.attention_dropout, add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, self_attention=not getattr(args, "cross_self_attention", False), q_noise=self.quant_noise, qn_block_size=self.quant_noise_block_size, # relative_pos_type=("masked" # if getattr(args, "use_relative_pos_embeddings", False) # else None), relative_pos_type=None, ### max_relative_pos=getattr(args, "max_relative_pos", 128), heads_share_embeddings=getattr(args, "heads_share_embeddings", False), add_pos_embeddings_to_values=getattr( args, "add_pos_embeddings_to_values", False))
def get_layer(self, args, index, out_dim, num_heads, layer_type, add_bias_kv, add_zero_attn):
    kernel_size = layer_type.split(':')[1]
    if kernel_size == 'default':
        kernel_size = args.decoder_kernel_size_list[index]
    else:
        kernel_size = int(kernel_size)
    layer_type = layer_type.split(':')[0]
    if layer_type == 'lightweight':
        layer = LightweightConv(out_dim, kernel_size,
                                padding_l=kernel_size - 1,
                                weight_softmax=args.weight_softmax,
                                num_heads=num_heads,
                                weight_dropout=args.weight_dropout)
    elif layer_type == 'dynamic':
        layer = DynamicConv(out_dim, kernel_size,
                            padding_l=kernel_size - 1,
                            weight_softmax=args.weight_softmax,
                            num_heads=num_heads,
                            weight_dropout=args.weight_dropout)
    elif layer_type == 'attn':
        layer = MultiheadAttention(
            out_dim,
            num_heads,
            dropout=args.attention_dropout,
            add_bias_kv=add_bias_kv,
            add_zero_attn=add_zero_attn,
            self_attention=not getattr(args, "cross_self_attention", False),
            q_noise=self.quant_noise,
            qn_block_size=self.quant_noise_block_size,
        )
    else:
        raise NotImplementedError
    return layer
def __init__(self, args):
    super().__init__()
    embed_dim = args.encoder_embed_dim
    self.embed_dim = args.encoder_embed_dim
    self.self_attn = MultiheadAttention(
        self.embed_dim,
        args.encoder_attention_heads,
        dropout=args.attention_dropout,
        self_attention=True,
    )
    # slimmable widths: 4/16, 5/16, ..., 15/16 of the full dimension, plus the full dimension
    self.linear_list = [int(embed_dim * k / 16) for k in range(4, 16)] + [embed_dim]
    self.ffn_list = ([int(args.encoder_ffn_embed_dim * k / 16) for k in range(4, 16)]
                     + [args.encoder_ffn_embed_dim])
    # self.self_attn_layer_norm = SlimmableLayernorm([int(self.embed_dim / 4), int(self.embed_dim * 2 / 4),
    #                                                 int(self.embed_dim * 3 / 4), self.embed_dim])
    self.self_attn_layer_norm = SlimmableLayernorm(self.linear_list)
    self.dropout = [0.3] * len(self.linear_list)
    self.activation_fn = utils.get_activation_fn(
        activation=getattr(args, "activation_fn", "relu")
    )
    self.activation_dropout = getattr(args, "activation_dropout", 0)
    if self.activation_dropout == 0:
        # for backwards compatibility with models that use args.relu_dropout
        self.activation_dropout = getattr(args, "relu_dropout", 0)
    self.normalize_before = args.encoder_normalize_before
    # self.fc1 = SLinear([int(self.embed_dim / 4), int(self.embed_dim * 2 / 4), int(self.embed_dim * 3 / 4), self.embed_dim],
    #                    [int(args.encoder_ffn_embed_dim / 4), int(args.encoder_ffn_embed_dim * 2 / 4), int(args.encoder_ffn_embed_dim * 3 / 4), args.encoder_ffn_embed_dim])
    # self.fc2 = SLinear([int(args.encoder_ffn_embed_dim / 4), int(args.encoder_ffn_embed_dim * 2 / 4), int(args.encoder_ffn_embed_dim * 3 / 4), args.encoder_ffn_embed_dim],
    #                    [int(self.embed_dim / 4), int(self.embed_dim * 2 / 4), int(self.embed_dim * 3 / 4), self.embed_dim])
    # self.final_layer_norm = SlimmableLayernorm([int(self.embed_dim / 4), int(self.embed_dim * 2 / 4), int(self.embed_dim * 3 / 4), self.embed_dim])
    self.fc1 = SLinear(embed_dim, args.encoder_ffn_embed_dim)
    self.fc2 = SLinear(args.encoder_ffn_embed_dim, embed_dim)
    self.final_layer_norm = SlimmableLayernorm(self.linear_list)
    self.epison = True
    self.epison_value = 3.0
def __init__(self, args):
    super().__init__()
    self.embed_dim = args.encoder_embed_dim
    self.num_branches = args.encoder_branches
    self.num_pffn_branches = args.encoder_pffn_branches
    self.join_pffn = args.join_pffn
    self.branch_dropout = args.branch_dropout
    self.pffn_branch_dropout = args.pffn_branch_dropout
    self.enable_head_dropout = args.enable_head_dropout
    self.self_attn_branches = nn.ModuleList([
        MultiheadAttention(
            self.embed_dim,
            args.encoder_attention_heads,
            dropout=args.attention_dropout,
            self_attention=True,
            head_dropout=self.branch_dropout if self.enable_head_dropout else None,
        ) for _ in range(self.num_branches)
    ])
    self.self_attn_layer_norm = LayerNorm(self.embed_dim)
    self.dropout = args.dropout
    self.activation_fn = utils.get_activation_fn(
        activation=getattr(args, 'activation_fn', 'relu'))
    self.activation_dropout = getattr(args, 'activation_dropout', 0)
    if self.activation_dropout == 0:
        # for backwards compatibility with models that use args.relu_dropout
        self.activation_dropout = getattr(args, 'relu_dropout', 0)
    self.normalize_before = args.encoder_normalize_before
    self.fc1_branches = nn.ModuleList([
        Linear(self.embed_dim, args.encoder_ffn_embed_dim)
        for _ in range(self.num_pffn_branches)
    ])
    self.fc2_branches = nn.ModuleList([
        Linear(args.encoder_ffn_embed_dim, self.embed_dim)
        for _ in range(self.num_pffn_branches)
    ])
    self.final_layer_norm = LayerNorm(self.embed_dim)
def __init__(
    self,
    embedding_dim: float = 768,
    ffn_embedding_dim: float = 3072,
    num_attention_heads: float = 8,
    dropout: float = 0.1,
    attention_dropout: float = 0.1,
    activation_dropout: float = 0.1,
    encoder_normalize_before: bool = False,
    use_bert_layer_norm: bool = False,
    use_gelu: bool = True,
) -> None:
    super().__init__()
    # Initialize parameters
    self.embedding_dim = embedding_dim
    self.dropout = dropout
    self.activation_dropout = activation_dropout
    self.normalize_before = encoder_normalize_before
    # Initialize blocks
    self.activation_fn = gelu if use_gelu else F.relu
    self.self_attn = MultiheadAttention(self.embedding_dim,
                                        num_attention_heads,
                                        dropout=attention_dropout)
    # layer norm associated with the self attention layer
    self.self_attn_layer_norm = (BertLayerNorm(self.embedding_dim)
                                 if use_bert_layer_norm
                                 else LayerNorm(self.embedding_dim, eps=1e-12))
    self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
    self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)
    # layer norm associated with the position wise feed-forward NN
    self.final_layer_norm = (BertLayerNorm(self.embedding_dim)
                             if use_bert_layer_norm
                             else LayerNorm(self.embedding_dim, eps=1e-12))
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
    decoder_layers = args.decoder_layers
    args.decoder_layers = 0
    super().__init__(args, dictionary, embed_tokens, no_encoder_attn)
    self.layers = nn.ModuleList([])
    self.layers.extend([
        EndorsementDetectorDecoderLayer(args, no_encoder_attn)
        for _ in range(1)
    ])
    args.decoder_layers = decoder_layers
    self.padding_idx = embed_tokens.padding_idx
    self.proj_prob = Linear(args.decoder_embed_dim, 1, bias=True)
    self.eds_fc1 = Linear(args.decoder_embed_dim, args.decoder_embed_dim, bias=False)
    self.eds_fc2 = Linear(args.decoder_embed_dim, args.decoder_embed_dim, bias=False)
    self.eds_layer_norm = LayerNorm(args.decoder_embed_dim)
    self.self_attn = MultiheadAttention(args.decoder_embed_dim,
                                        args.decoder_attention_heads,
                                        dropout=0,
                                        self_attention=True)
class TransformerDecoderLayer(nn.Module):
    """Decoder layer block.

    In the original paper each operation (multi-head attention, encoder
    attention or FFN) is postprocessed with: `dropout -> add residual ->
    layernorm`. In the tensor2tensor code they suggest that learning is more
    robust when preprocessing each layer with layernorm and postprocessing
    with: `dropout -> add residual`. We default to the approach in the paper,
    but the tensor2tensor approach can be enabled by setting
    *args.decoder_normalize_before* to ``True``.

    Args:
        args (argparse.Namespace): parsed command-line arguments
        no_encoder_attn (bool, optional): whether to attend to encoder outputs
            (default: False).
    """

    def __init__(self, args, no_encoder_attn=False):
        super().__init__()
        self.embed_dim = args.decoder_embed_dim
        self.self_attn = MultiheadAttention(
            self.embed_dim,
            args.decoder_attention_heads,
            dropout=args.attention_dropout,
        )
        self.dropout = args.dropout
        self.relu_dropout = args.relu_dropout
        self.normalize_before = args.decoder_normalize_before
        self.self_attn_layer_norm = LayerNorm(self.embed_dim)
        if no_encoder_attn:
            self.encoder_attn = None
            self.encoder_attn_layer_norm = None
        else:
            self.encoder_attn = MultiheadAttention(
                self.embed_dim,
                args.decoder_attention_heads,
                dropout=args.attention_dropout,
            )
            self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)
        self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
        self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)
        self.final_layer_norm = LayerNorm(self.embed_dim)
        self.need_attn = True
        self.onnx_trace = False

    def prepare_for_onnx_export_(self):
        self.onnx_trace = True

    def forward(self, x, encoder_out, encoder_padding_mask, incremental_state,
                prev_self_attn_state=None, prev_attn_state=None,
                self_attn_mask=None, self_attn_padding_mask=None):
        """
        Args:
            x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
            encoder_padding_mask (ByteTensor): binary ByteTensor of shape
                `(batch, src_len)` where padding elements are indicated by ``1``.

        Returns:
            encoded output of shape `(batch, src_len, embed_dim)`
        """
        residual = x
        x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True)
        if prev_self_attn_state is not None:
            if incremental_state is None:
                incremental_state = {}
            prev_key, prev_value = prev_self_attn_state
            saved_state = {"prev_key": prev_key, "prev_value": prev_value}
            self.self_attn._set_input_buffer(incremental_state, saved_state)
        x, _ = self.self_attn(
            query=x,
            key=x,
            value=x,
            key_padding_mask=self_attn_padding_mask,
            incremental_state=incremental_state,
            need_weights=False,
            attn_mask=self_attn_mask,
        )
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        x = self.maybe_layer_norm(self.self_attn_layer_norm, x, after=True)

        attn = None
        if self.encoder_attn is not None:
            residual = x
            x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, before=True)
            if prev_attn_state is not None:
                if incremental_state is None:
                    incremental_state = {}
                prev_key, prev_value = prev_attn_state
                saved_state = {"prev_key": prev_key, "prev_value": prev_value}
                self.encoder_attn._set_input_buffer(incremental_state, saved_state)
            x, attn = self.encoder_attn(
                query=x,
                key=encoder_out,
                value=encoder_out,
                key_padding_mask=encoder_padding_mask,
                incremental_state=incremental_state,
                static_kv=True,
                need_weights=(not self.training and self.need_attn),
            )
            x = F.dropout(x, p=self.dropout, training=self.training)
            x = residual + x
            x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, after=True)

        residual = x
        x = self.maybe_layer_norm(self.final_layer_norm, x, before=True)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, p=self.relu_dropout, training=self.training)
        x = self.fc2(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        x = self.maybe_layer_norm(self.final_layer_norm, x, after=True)

        if self.onnx_trace:
            saved_state = self.self_attn._get_input_buffer(incremental_state)
            self_attn_state = saved_state["prev_key"], saved_state["prev_value"]
            return x, attn, self_attn_state
        return x, attn

    def maybe_layer_norm(self, layer_norm, x, before=False, after=False):
        assert before ^ after
        if after ^ self.normalize_before:
            return layer_norm(x)
        else:
            return x

    def make_generation_fast_(self, need_attn=False, **kwargs):
        self.need_attn = need_attn
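# A minimal usage sketch (not from the source): instantiate the decoder layer
# above from a hypothetical argparse-style namespace and run one forward step.
# The attribute names mirror those read in __init__; the concrete sizes and
# tensors are assumptions for illustration, and the fairseq-style
# MultiheadAttention/Linear/LayerNorm used by the class are assumed importable.
import torch
from argparse import Namespace

_args = Namespace(
    decoder_embed_dim=512, decoder_attention_heads=8, decoder_ffn_embed_dim=2048,
    dropout=0.1, attention_dropout=0.1, relu_dropout=0.1,
    decoder_normalize_before=False,
)
_layer = TransformerDecoderLayer(_args)
_x = torch.randn(10, 2, 512)                      # (tgt_len, batch, embed_dim)
_enc_out = torch.randn(15, 2, 512)                # (src_len, batch, embed_dim)
_enc_pad = torch.zeros(2, 15, dtype=torch.bool)   # True would mark padding positions
_out, _attn = _layer(_x, _enc_out, _enc_pad, incremental_state=None)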
def forward(
    self,
    query,
    key: Optional[Tensor],
    value: Optional[Tensor],
    key_padding_mask: Optional[Tensor] = None,
    incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
    need_weights: bool = True,
    static_kv: bool = False,
    attn_mask: Optional[Tensor] = None,
    before_softmax: bool = False,
    need_head_weights: bool = False,
) -> Tuple[Tensor, Optional[Tensor]]:
    """Input shape: Time x Batch x Channel

    Args:
        key_padding_mask (ByteTensor, optional): mask to exclude keys that
            are pads, of shape `(batch, src_len)`, where padding elements
            are indicated by 1s.
        need_weights (bool, optional): return the attention weights,
            averaged over heads (default: False).
        attn_mask (ByteTensor, optional): typically used to implement causal
            attention, where the mask prevents the attention from looking
            forward in time (default: None).
        before_softmax (bool, optional): return the raw attention weights
            and values before the attention softmax.
        need_head_weights (bool, optional): return the attention weights for
            each head. Implies *need_weights*. Default: return the average
            attention weights over all heads.
    """
    if need_head_weights:
        need_weights = True

    is_tpu = query.device.type == "xla"

    tgt_len, bsz, embed_dim = query.size()
    assert embed_dim == self.embed_dim
    assert list(query.size()) == [tgt_len, bsz, embed_dim]

    if (
        not self.onnx_trace
        and not is_tpu  # don't use PyTorch version on TPUs
        and incremental_state is None
        and not static_kv
        # A workaround for quantization to work. Otherwise JIT compilation
        # treats bias in linear module as method.
        and not torch.jit.is_scripting()
    ):
        assert key is not None and value is not None
        return F.multi_head_attention_forward(
            query, key, value,
            self.embed_dim, self.num_heads,
            torch.empty([0]),
            torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)),
            self.bias_k, self.bias_v,
            self.add_zero_attn,
            self.dropout_module.p,
            self.out_proj.weight, self.out_proj.bias,
            self.training or self.dropout_module.apply_during_inference,
            key_padding_mask,
            need_weights,
            attn_mask,
            use_separate_proj_weight=True,
            q_proj_weight=self.q_proj.weight,
            k_proj_weight=self.k_proj.weight,
            v_proj_weight=self.v_proj.weight,
        )

    if incremental_state is not None:
        saved_state = self._get_input_buffer(incremental_state)
        if saved_state is not None and "prev_key" in saved_state:
            # previous time steps are cached - no need to recompute
            # key and value if they are static
            if static_kv:
                assert self.encoder_decoder_attention and not self.self_attention
                key = value = None
    else:
        saved_state = None

    if self.self_attention:
        q = self.q_proj(query)
        k = self.k_proj(query)
        v = self.v_proj(query)
    elif self.encoder_decoder_attention:
        # encoder-decoder attention
        q = self.q_proj(query)
        if key is None:
            assert value is None
            k = v = None
        else:
            k = self.k_proj(key)
            v = self.v_proj(key)
    else:
        assert key is not None and value is not None
        q = self.q_proj(query)
        k = self.k_proj(key)
        v = self.v_proj(value)
    q *= self.scaling

    if self.bias_k is not None:
        assert self.bias_v is not None
        k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
        v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
        if attn_mask is not None:
            attn_mask = torch.cat(
                [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
            )
        if key_padding_mask is not None:
            key_padding_mask = torch.cat(
                [
                    key_padding_mask,
                    key_padding_mask.new_zeros(key_padding_mask.size(0), 1),
                ],
                dim=1,
            )

    q = (
        q.contiguous()
        .view(tgt_len, bsz * self.num_heads, self.head_dim)
        .transpose(0, 1)
    )
    if k is not None:
        k = (
            k.contiguous()
            .view(-1, bsz * self.num_heads, self.head_dim)
            .transpose(0, 1)
        )
    if v is not None:
        v = (
            v.contiguous()
            .view(-1, bsz * self.num_heads, self.head_dim)
            .transpose(0, 1)
        )

    if saved_state is not None:
        # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
        if "prev_key" in saved_state:
            _prev_key = saved_state["prev_key"]
            assert _prev_key is not None
            prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim)
            if static_kv:
                k = prev_key
            else:
                assert k is not None
                k = torch.cat([prev_key, k], dim=1)
        if "prev_value" in saved_state:
            _prev_value = saved_state["prev_value"]
            assert _prev_value is not None
            prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim)
            if static_kv:
                v = prev_value
            else:
                assert v is not None
                v = torch.cat([prev_value, v], dim=1)
        prev_key_padding_mask: Optional[Tensor] = None
        if "prev_key_padding_mask" in saved_state:
            prev_key_padding_mask = saved_state["prev_key_padding_mask"]
        assert k is not None and v is not None
        key_padding_mask = MultiheadAttention._append_prev_key_padding_mask(
            key_padding_mask=key_padding_mask,
            prev_key_padding_mask=prev_key_padding_mask,
            batch_size=bsz,
            src_len=k.size(1),
            static_kv=static_kv,
        )
        saved_state["prev_key"] = k.view(bsz, self.num_heads, -1, self.head_dim)
        saved_state["prev_value"] = v.view(bsz, self.num_heads, -1, self.head_dim)
        saved_state["prev_key_padding_mask"] = key_padding_mask
        # In this branch incremental_state is never None
        assert incremental_state is not None
        incremental_state = self._set_input_buffer(incremental_state, saved_state)
    assert k is not None
    src_len = k.size(1)

    # This is part of a workaround to get around fork/join parallelism
    # not supporting Optional types.
    if key_padding_mask is not None and key_padding_mask.dim() == 0:
        key_padding_mask = None

    if key_padding_mask is not None:
        assert key_padding_mask.size(0) == bsz
        assert key_padding_mask.size(1) == src_len

    if self.add_zero_attn:
        assert v is not None
        src_len += 1
        k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
        v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
        if attn_mask is not None:
            attn_mask = torch.cat(
                [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
            )
        if key_padding_mask is not None:
            key_padding_mask = torch.cat(
                [
                    key_padding_mask,
                    torch.zeros(key_padding_mask.size(0), 1).type_as(key_padding_mask),
                ],
                dim=1,
            )

    attn_weights = torch.bmm(q, k.transpose(1, 2))

    assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]

    if attn_mask is not None:
        attn_mask = attn_mask.unsqueeze(0)
        if self.onnx_trace:
            attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1)
        attn_weights += attn_mask

    if key_padding_mask is not None:
        # don't attend to padding symbols
        attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
        if not is_tpu:
            attn_weights = attn_weights.masked_fill(
                key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
                float("-inf"),
            )
        else:
            attn_weights = attn_weights.transpose(0, 2)
            attn_weights = attn_weights.masked_fill(key_padding_mask, float("-inf"))
            attn_weights = attn_weights.transpose(0, 2)
        attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

    attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)

    if before_softmax:
        return attn_weights, v

    attn_weights_float = utils.softmax(
        attn_weights, dim=-1, onnx_trace=self.onnx_trace
    )
    attn_weights = attn_weights_float.type_as(attn_weights)
    attn_probs = self.dropout_module(attn_weights)

    assert v is not None
    attn = torch.bmm(attn_probs, v)
    assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
    if self.onnx_trace and attn.size(1) == 1:
        # when ONNX tracing a single decoder step (sequence length == 1)
        # the transpose is a no-op copy before view, thus unnecessary
        attn = attn.contiguous().view(tgt_len, bsz, embed_dim)
    else:
        attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
    attn = self.out_proj(attn)
    attn_weights: Optional[Tensor] = None
    if need_weights:
        attn_weights = attn_weights_float.view(
            bsz, self.num_heads, tgt_len, src_len
        ).transpose(1, 0)
        if not need_head_weights:
            # average attention weights over heads
            attn_weights = attn_weights.mean(dim=0)

    return attn, attn_weights
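# A minimal calling sketch for the forward() above (illustrative, not from the
# source): self-attention over a (time, batch, channel) input, followed by one
# incremental decoding step that reuses cached keys/values. The constructor
# arguments and shapes below are assumptions.
import torch

mha = MultiheadAttention(512, 8, dropout=0.1, self_attention=True)
x = torch.randn(10, 2, 512)                  # (tgt_len, batch, embed_dim)
out, weights = mha(query=x, key=x, value=x, need_weights=True)

incremental_state = {}                       # per-layer cache of keys/values
step = torch.randn(1, 2, 512)                # one new time step
out_step, _ = mha(query=step, key=step, value=step,
                  incremental_state=incremental_state, need_weights=False)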
def __init__(self, args, no_encoder_attn=False, mix=False, no_man=False):
    super().__init__()
    self.embed_dim = args.decoder_embed_dim
    self.mix = mix
    self.no_man = no_man
    self.decoder_fc1 = []
    self.decoder_fc2 = []
    self.decoder_self_attn = []
    self.decoder_d_self_attn = []
    self.decoder_ma = []
    self.normalize = []
    self.dropout = args.dropout
    self.relu_dropout = args.relu_dropout
    self.normalize_before = args.decoder_normalize_before
    index = 0
    cnt = 0
    self.decoder_seq = args.decoder_seq
    # print('args.decoder_seq', args.decoder_seq)
    for i in range(len(args.decoder_seq)):
        # ff
        if args.decoder_seq[i] == 1:
            t = Linear(self.embed_dim, args.decoder_para[i] * self.embed_dim)
            self.add_module('sublayer_%d' % cnt, t)
            cnt += 1
            self.decoder_fc1.append(t)
            t = Linear(args.decoder_para[i] * self.embed_dim, self.embed_dim)
            self.add_module('sublayer_%d' % cnt, t)
            cnt += 1
            self.decoder_fc2.append(t)
            t = LayerNorm(self.embed_dim)
            self.add_module('sublayer_%d' % cnt, t)
            cnt += 1
            self.normalize.append(t)
        # sa
        elif args.decoder_seq[i] == 2:
            t = MultiheadAttention(
                self.embed_dim,
                args.decoder_para[i],
                dropout=args.attention_dropout,
            )
            self.add_module('sublayer_%d' % cnt, t)
            cnt += 1
            self.decoder_self_attn.append(t)
            t = LayerNorm(self.embed_dim)
            self.add_module('sublayer_%d' % cnt, t)
            cnt += 1
            self.normalize.append(t)
        # dsa
        elif args.decoder_seq[i] == 3:
            t = MultiheadAttention(self.embed_dim,
                                   args.decoder_para[i],
                                   dropout=args.attention_dropout,
                                   re_weight_m=1,
                                   max_len=args.max_target_positions)
            self.add_module('sublayer_%d' % cnt, t)
            cnt += 1
            self.decoder_d_self_attn.append(t)
            t = LayerNorm(self.embed_dim)
            self.add_module('sublayer_%d' % cnt, t)
            cnt += 1
            self.normalize.append(t)
        # ma
        elif args.decoder_seq[i] == 4:
            if no_encoder_attn:
                self.decoder_ma.append([])
                self.normalize.append([])
            else:
                t = MultiheadAttention(
                    self.embed_dim,
                    args.decoder_para[i],
                    dropout=args.attention_dropout,
                )
                self.add_module('sublayer_%d' % cnt, t)
                cnt += 1
                self.decoder_ma.append(t)
                t = LayerNorm(self.embed_dim)
                self.add_module('sublayer_%d' % cnt, t)
                cnt += 1
                self.normalize.append(t)
        index += 1
    """
    for i in self.decoder_fc1:
        i.cuda()
    for i in self.decoder_fc2:
        i.cuda()
    for i in self.decoder_self_attn:
        i.cuda()
    for i in self.decoder_d_self_attn:
        i.cuda()
    for i in self.decoder_ma:
        i.cuda()
    for i in self.normalize:
        i.cuda()
    """
    # if no_encoder_attn:
    #     self.encoder_attn = None
    #     self.encoder_attn_layer_norm = None
    self.need_attn = True
    self.onnx_trace = False
def __init__(self, args, no_encoder_attn=False, kernel_size=0):
    super().__init__()
    self.embed_dim = args.decoder_embed_dim
    self.conv_dim = args.decoder_conv_dim
    if args.decoder_glu:
        self.linear1 = Linear(self.embed_dim, 2 * self.conv_dim)
        self.act = nn.GLU()
    else:
        self.linear1 = Linear(self.embed_dim, self.conv_dim)
        self.act = None
    if args.decoder_conv_type == "lightweight":
        self.conv = LightweightConv(
            self.conv_dim,
            kernel_size,
            padding_l=kernel_size - 1,
            weight_softmax=args.weight_softmax,
            num_heads=args.decoder_attention_heads,
            weight_dropout=args.weight_dropout,
        )
    elif args.decoder_conv_type == "dynamic":
        self.conv = DynamicConv(
            self.conv_dim,
            kernel_size,
            padding_l=kernel_size - 1,
            weight_softmax=args.weight_softmax,
            num_heads=args.decoder_attention_heads,
            weight_dropout=args.weight_dropout,
        )
    else:
        raise NotImplementedError
    self.linear2 = Linear(self.conv_dim, self.embed_dim)
    self.dropout_module = FairseqDropout(
        args.dropout, module_name=self.__class__.__name__
    )
    self.relu_dropout_module = FairseqDropout(
        args.relu_dropout, module_name=self.__class__.__name__
    )
    self.input_dropout_module = FairseqDropout(
        args.input_dropout, module_name=self.__class__.__name__
    )
    self.normalize_before = args.decoder_normalize_before
    self.conv_layer_norm = LayerNorm(self.embed_dim)
    if no_encoder_attn:
        self.encoder_attn = None
        self.encoder_attn_layer_norm = None
    else:
        self.encoder_attn = MultiheadAttention(
            self.embed_dim,
            args.decoder_attention_heads,
            dropout=args.attention_dropout,
            encoder_decoder_attention=True,
        )
        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)
    self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
    self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)
    self.final_layer_norm = LayerNorm(self.embed_dim)
    self.need_attn = True