Example #1
    def __init__(self, audio_encoder, video_encoder, decoder):
        super().__init__()
        self.audio_encoder = audio_encoder
        self.video_encoder = video_encoder

        # MultiheadAttention(embed_dim, num_heads, kdim, vdim, dropout, ...)
        self.av_attn = MultiheadAttention(
                512,   # embed_dim
                8,     # num_heads
                512,   # kdim
                512,   # vdim
                0.15,  # dropout
                encoder_decoder_attention=True
                )

        self.decoder = decoder
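
A minimal usage sketch for the cross-modal attention above (not part of the scraped example; it assumes fairseq's MultiheadAttention, whose leading positional parameters are embed_dim, num_heads, kdim, vdim and dropout, and the Time x Batch x Channel layout documented in the forward() shown later in this collection):

import torch
from fairseq.modules import MultiheadAttention  # assumed import path

av_attn = MultiheadAttention(512, 8, kdim=512, vdim=512, dropout=0.15,
                             encoder_decoder_attention=True)
audio = torch.randn(50, 4, 512)   # (time, batch, embed_dim) -- queries
video = torch.randn(30, 4, 512)   # (time, batch, embed_dim) -- keys/values
fused, attn_weights = av_attn(query=audio, key=video, value=video)
# fused: (50, 4, 512); attn_weights: (4, 50, 30), averaged over heads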
Example #2
 def __init__(self, args):
     super().__init__()
     self.embed_dim = args.encoder_embed_dim
     self.self_attn = MultiheadAttention(
         self.embed_dim,
         args.encoder_attention_heads,
         dropout=args.attention_dropout,
     )
     self.self_attn_layer_norm = LayerNorm(self.embed_dim)
     self.dropout = args.dropout
     self.activation_fn = utils.get_activation_fn(
         activation=getattr(args, 'activation_fn', 'relu'))
     self.activation_dropout = getattr(args, 'activation_dropout', 0)
     if self.activation_dropout == 0:
         # for backwards compatibility with models that use args.relu_dropout
         self.activation_dropout = getattr(args, 'relu_dropout', 0)
     self.normalize_before = args.encoder_normalize_before
     self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
     self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
     self.final_layer_norm = LayerNorm(self.embed_dim)
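
For reference, a hedged sketch of the kind of args namespace this constructor expects (field names are taken directly from the attribute accesses above; the values and the enclosing layer class name are placeholders):

from argparse import Namespace

args = Namespace(
    encoder_embed_dim=512,
    encoder_attention_heads=8,
    encoder_ffn_embed_dim=2048,
    attention_dropout=0.1,
    dropout=0.3,
    activation_fn='relu',
    activation_dropout=0.0,
    encoder_normalize_before=False,
)
# layer = TransformerEncoderLayer(args)  # hypothetical name for the layer class shown in Example #2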
Example #3
    def __init__(self, args):
        super().__init__()
        self.embed_dim = args.encoder_embed_dim
        if args.max_relative_length == -1:
            self.self_attn = MultiheadAttention(
                self.embed_dim, args.encoder_attention_heads,
                dropout=args.attention_dropout,
            )
        else:
            self.self_attn = RelativeMultiheadAttention(
                self.embed_dim, args.encoder_attention_heads,
                args.max_relative_length, dropout=args.attention_dropout, k_only=args.k_only,
            )

        self.dropout = args.dropout
        self.relu_dropout = args.relu_dropout
        self.normalize_before = args.encoder_normalize_before
        self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
        self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
        self.layer_norms = nn.ModuleList([LayerNorm(self.embed_dim) for i in range(2)])
Example #4
 def __init__(self, args):
     super().__init__()
     self.embed_dim = args.encoder_embed_dim
     self.self_attn = MultiheadAttention(
         self.embed_dim,
         args.encoder_attention_heads,
         dropout=args.attention_dropout,
     )
     self.dropout = args.dropout
     self.relu_dropout = args.relu_dropout
     self.fuse_dropout_add = args.fuse_dropout_add
     self.fuse_relu_dropout = args.fuse_relu_dropout
     self.normalize_before = args.encoder_normalize_before
     self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
     self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
     self.maybe_ln1 = MaybeLayerNorm(self.embed_dim,
                                     self.normalize_before,
                                     fuse=args.fuse_layer_norm)
     self.maybe_ln2 = MaybeLayerNorm(self.embed_dim,
                                     self.normalize_before,
                                     fuse=args.fuse_layer_norm)
Example #5
 def __init__(self, args):
     super().__init__()
     self.embed_dim = args.decoder_embed_dim
     self.self_attn = MultiheadAttention(
         self.embed_dim,
         args.decoder_attention_heads,
         dropout=args.attention_dropout,
     )
     self.dropout = args.dropout
     self.relu_dropout = args.relu_dropout
     self.normalize_before = args.decoder_normalize_before
     '''
     self.encoder_attn = MultiheadAttention(
         self.embed_dim, args.decoder_attention_heads,
         dropout=args.attention_dropout,
     )
     '''
     self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
     self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)
     self.layer_norms = nn.ModuleList(
         [LayerNorm(self.embed_dim) for i in range(3)])
Example #6
 def __init__(
     self,
     embed_dim,
     attention_heads,
     self_attention=True,
     attention_dropout=0.1,
     dropout=0.3,
     normalize_before=False,
     # activation_fn='relu', activation_dropout=0
 ):
     super().__init__()
     self.embed_dim = embed_dim
     self.self_attn = MultiheadAttention(self.embed_dim,
                                         attention_heads,
                                         dropout=attention_dropout,
                                         self_attention=self_attention)
     self.attn_layer_norm = LayerNorm(self.embed_dim)
     self.dropout = dropout
      # self.activation_fn = activation_fn
     # self.activation_dropout = activation_dropout
     self.normalize_before = normalize_before
Example #7
 def __init__(self,
              args,
              no_encoder_attn=False,
              add_bias_kv=False,
              add_zero_attn=False):
     super().__init__(args,
                      no_encoder_attn=no_encoder_attn,
                      add_bias_kv=add_bias_kv,
                      add_zero_attn=add_zero_attn)
     self.add_context = args.context_position in ["both", "decoder"]
     self.context_attention_type = args.context_decoder_attention_type
     if self.add_context:
         self.context_attn = MultiheadAttention(
             self.embed_dim,
             args.decoder_attention_heads,
             dropout=args.attention_dropout,
             encoder_decoder_attention=True,
         )
         self.context_gating_wi = Linear(self.embed_dim, self.embed_dim)
         self.context_gating_ws = Linear(self.embed_dim, self.embed_dim)
         self.context_attn_layer_norm = LayerNorm(self.embed_dim)
Example #8
    def __init__(self, args, dictionary, embed_tokens):
        encoder_layers = args.encoder_layers
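        # temporarily set encoder_layers to 0 so the parent constructor builds no
        # layers; a single EndorsementDetectorEncoderLayer is installed below instead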
        args.encoder_layers = 0
        super().__init__(args, dictionary, embed_tokens)
        self.layers = nn.ModuleList([])
        self.layers.extend(
            [EndorsementDetectorEncoderLayer(args) for i in range(1)])
        args.encoder_layers = encoder_layers

        self.eds_fc1 = Linear(args.encoder_embed_dim,
                              args.encoder_embed_dim,
                              bias=False)
        self.eds_fc2 = Linear(args.encoder_embed_dim,
                              args.encoder_embed_dim,
                              bias=False)
        self.eds_layer_norm = LayerNorm(args.encoder_embed_dim)

        self.self_attn = MultiheadAttention(args.encoder_embed_dim,
                                            args.encoder_attention_heads,
                                            dropout=0,
                                            self_attention=True)
Example #9
 def build_self_attention(self,
                          embed_dim,
                          args,
                          add_bias_kv=False,
                          add_zero_attn=False):
     return MultiheadAttention(
         embed_dim,
         args.decoder_attention_heads,
         dropout=args.attention_dropout,
         add_bias_kv=add_bias_kv,
         add_zero_attn=add_zero_attn,
         self_attention=not getattr(args, "cross_self_attention", False),
         q_noise=self.quant_noise,
         qn_block_size=self.quant_noise_block_size,
         positional_embeddings_in_attention=getattr(
             args, "positional_embeddings_in_attention", False),
         symmetric_kv_context_params=getattr(
             args, "decoder_self_symmetric_kv_context_params", False),
         symmetric_kv_positional_params=getattr(
             args, "decoder_self_symmetric_kv_positional_params", False),
     )
Example #10
 def __init__(self, args):
     super().__init__()
     self.embed_dim = args.encoder_embed_dim
     self.self_attn = MultiheadAttention(
         self.embed_dim, args.encoder_attention_heads,
         dropout=args.attention_dropout,
     )
     self.dropout = args.dropout
     self.relu_dropout = args.relu_dropout
     self.normalize_before = args.encoder_normalize_before
     self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
     self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
     n_layernorm = 2
     self.fc_factor = 1.0
     self.macaron = getattr(args, "macaron", False)
     if self.macaron:
         self.macaron_fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
         self.macaron_fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
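          # macaron-style layer: a second feed-forward block is added and both FFN
          # outputs are scaled by fc_factor = 0.5 (half-step FFNs)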
         self.fc_factor = 0.5
         n_layernorm += 1
     self.layer_norms = nn.ModuleList([LayerNorm(self.embed_dim) for i in range(n_layernorm)])
Example #11
    def __init__(
        self,
        embedding_dim: float = 768,
        ffn_embedding_dim: float = 3072,
        num_attention_heads: float = 8,
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.1,
        activation_fn: str = 'relu',
        attn_scale_factor: int = 1,
        export: bool = False,
        # newly added
        encoder_normalize_before: bool = False,
    ) -> None:

        super().__init__()
        # Initialize parameters
        self.embedding_dim = embedding_dim
        self.dropout = dropout
        self.activation_dropout = activation_dropout
        # Initialize blocks
        self.activation_fn = utils.get_activation_fn(activation_fn)
        self.self_attn = MultiheadAttention(
            self.embedding_dim,
            num_attention_heads,
            dropout=attention_dropout,
            bias=True,
            scale_factor=attn_scale_factor,
        )

        # newly added
        self.normalize_before = encoder_normalize_before
        
        # layer norm associated with the self attention layer
        self.self_attn_layer_norm = LayerNorm(self.embedding_dim, export=export)
        self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
        self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)

        # layer norm associated with the position wise feed-forward NN
        self.final_layer_norm = LayerNorm(self.embedding_dim, export=export)
Example #12
    def __init__(self, args, layer_id=-1):
        super().__init__()
        self.embed_dim = args.encoder_embed_dim

        # beg 20191115 multi-hop attention configuration in layer
        self.layer_id = layer_id
        self.attn_type = args.encoder_attn_type  # stored as a class attribute so the value can be reused in the forward method
        self.spec_layers = [
            int(i) for i in args.encoder_spec_attn_layers.split(',') if i != ''
        ]
        if self.attn_type == 'MHDA' and self.layer_id in self.spec_layers:
            self.self_attn = MultiHopDependentAttention(
                self.embed_dim,
                args.encoder_attention_heads,
                dropout=args.attention_dropout,
                self_attention=True)
            print('Self Attention [@Encoder Layer-{}] is MHDA.'.format(
                self.layer_id))
        else:
            self.self_attn = MultiheadAttention(self.embed_dim,
                                                args.encoder_attention_heads,
                                                dropout=args.attention_dropout,
                                                self_attention=True)
            print(
                'Self Attention [@Encoder Layer-{}] is vanilla multi-head attention.'
                .format(self.layer_id))
        # end 20191115

        self.self_attn_layer_norm = LayerNorm(self.embed_dim)
        self.dropout = args.dropout
        self.activation_fn = utils.get_activation_fn(
            activation=getattr(args, 'activation_fn', 'relu'))
        self.activation_dropout = getattr(args, 'activation_dropout', 0)
        if self.activation_dropout == 0:
            # for backwards compatibility with models that use args.relu_dropout
            self.activation_dropout = getattr(args, 'relu_dropout', 0)
        self.normalize_before = args.encoder_normalize_before
        self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
        self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
        self.final_layer_norm = LayerNorm(self.embed_dim)
Example #13
    def __init__(
        self,
        embedding_dim: float = 768,
        ffn_embedding_dim: float = 3072,
        num_attention_heads: float = 8,
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.1,
        activation_fn: str = "relu",
        layer_norm_first: bool = False,
    ) -> None:

        super().__init__()
        # Initialize parameters
        self.embedding_dim = embedding_dim
        self.dropout = dropout
        self.activation_dropout = activation_dropout

        # Initialize blocks
        self.activation_fn = utils.get_activation_fn(activation_fn)
        self.self_attn = MultiheadAttention(
            self.embedding_dim,
            num_attention_heads,
            dropout=attention_dropout,
            self_attention=True,
        )

        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(self.activation_dropout)
        self.dropout3 = nn.Dropout(dropout)

        self.layer_norm_first = layer_norm_first

        # layer norm associated with the self attention layer
        self.self_attn_layer_norm = LayerNorm(self.embedding_dim)
        self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
        self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)

        # layer norm associated with the position wise feed-forward NN
        self.final_layer_norm = LayerNorm(self.embedding_dim)
Example #14
    def __init__(self, args, dictionary, embed_tokens):
        super().__init__(dictionary)
        self.register_buffer('version', torch.Tensor([3]))

        self.dropout = args.dropout
        self.context_size = args.context_size

        embed_dim = embed_tokens.embedding_dim
        self.padding_idx = embed_tokens.padding_idx
        self.max_source_positions = args.max_source_positions
        self.conv_layer_norm = LayerNorm(embed_dim)

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim)
        self.embed_positions = PositionalEmbedding(
            args.max_source_positions,
            embed_dim,
            self.padding_idx,
            learned=args.encoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerEncoderLayer(args) for i in range(args.encoder_layers)
        ])

        if args.encoder_normalize_before:
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None

        self.conv = nn.Conv1d(in_channels=embed_dim,
                              out_channels=embed_dim,
                              kernel_size=(self.context_size, ),
                              padding=(self.context_size - 1) // 2)

        self.self_attn = MultiheadAttention(embed_dim,
                                            args.encoder_attention_heads,
                                            dropout=args.attention_dropout,
                                            self_attention=True)
Example #15
    def __init__(
        self,
        embedding_dim: float = 768,
        ffn_embedding_dim: float = 3072,
        num_attention_heads: float = 8,
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.1,
        activation_fn: str = 'relu',
        add_bias_kv: bool = False,
        add_zero_attn: bool = False,
        export: bool = False,
    ) -> None:

        super().__init__()
        # Initialize parameters
        self.embedding_dim = embedding_dim
        self.dropout = dropout
        self.activation_dropout = activation_dropout

        # Initialize blocks
        self.activation_fn = utils.get_activation_fn(activation_fn)

        self.self_attn = MultiheadAttention(
            self.embedding_dim,
            num_attention_heads,
            dropout=attention_dropout,
            add_bias_kv=add_bias_kv,
            add_zero_attn=add_zero_attn,
            self_attention=True
        )

        # layer norm associated with the self attention layer
        self.self_attn_layer_norm = LayerNorm(self.embedding_dim, export=export)
        self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
        self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)

        # layer norm associated with the position wise feed-forward NN
        self.final_layer_norm = LayerNorm(self.embedding_dim, export=export)
Example #16
 def build_self_attention(self, embed_dim, args):
     return MultiheadAttention(
         embed_dim,
         args.encoder_attention_heads,
         dropout=args.attention_dropout,
         self_attention=True,
         q_noise=self.quant_noise,
         qn_block_size=self.quant_noise_block_size,
         sigsoftmax=args.sigsoftmax,
         mix_softmax=args.mix_softmax,
         mix_type=args.mix_type,
         temperature=args.temperature,
         pre_drop_mix=args.pre_drop_mix,
         pre_mix=args.pre_mix,
         fix_head_dim=args.fix_head_dim,
         use_div_reg=args.use_div_reg,
         synth_attn_type=args.synth_attn_type,
         synth_hidden_dim=args.synth_hidden_dim,
         synth_factor_dim=args.synth_factor_dim,
         synth_trainable_random=args.synth_trainable_random,
         synth_max_len_seq=args.tokens_per_sample,
     )
Example #17
 def build_self_attention(self,
                          embed_dim,
                          args,
                          add_bias_kv=False,
                          add_zero_attn=False):
     collaborative_heads = "decoder" in args.collaborative_heads
     key_dim = args.key_dim or embed_dim
     if collaborative_heads:
         print(f"Decoder uses collaborative heads with key_dim={key_dim}.")
     return MultiheadAttention(
         embed_dim,
         args.decoder_attention_heads,
         collaborative_heads=collaborative_heads,
         kdim=key_dim,
         dropout=args.attention_dropout,
         add_bias_kv=add_bias_kv,
         add_zero_attn=add_zero_attn,
          self_attention=False,  # not getattr(args, "cross_self_attention", False),
         q_noise=self.quant_noise,
         qn_block_size=self.quant_noise_block_size,
     )
Example #18
    def __init__(self, args, index):
        super().__init__()
        self.embed_dim = args.encoder_embed_dim
        

        self.self_attn_layer_norm = LayerNorm(self.embed_dim)
        self.dropout = args.dropout
        self.activation_fn = utils.get_activation_fn(
            activation=getattr(args, 'activation_fn', 'relu')
        )

        if args.encoder_branch_type is None:
            self.self_attn = MultiheadAttention(
                self.embed_dim, args.encoder_attention_heads,
                dropout=args.attention_dropout, self_attention=True,
            )
        else:
            layers = []
            embed_dims = []
            heads = []
            num_types = len(args.encoder_branch_type)
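            # each branch spec is assumed to follow "layer_type:kernel_size:embed_dim:num_heads";
            # fields 2 and 3 are parsed here, fields 0 and 1 are consumed by self.get_layer()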
            for layer_type in args.encoder_branch_type:
                embed_dims.append(int(layer_type.split(':')[2]))
                heads.append(int(layer_type.split(':')[3]))
                layers.append(self.get_layer(args, index, embed_dims[-1], heads[-1], layer_type))
            assert sum(embed_dims) == self.embed_dim, (sum(embed_dims), self.embed_dim)
            
            self.self_attn = MultiBranch(layers, embed_dims)

        self.activation_dropout = getattr(args, 'activation_dropout', 0)
        if self.activation_dropout == 0:
            # for backwards compatibility with models that use args.relu_dropout
            self.activation_dropout = getattr(args, 'relu_dropout', 0)

        self.normalize_before = args.encoder_normalize_before
        self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim, init=args.ffn_init)
        self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim, init=args.ffn_init)
        self.final_layer_norm = LayerNorm(self.embed_dim)
Example #19
    def __init__(self, args):
        super().__init__()
        self.embed_dim = args.encoder_embed_dim
        self.context_size = args.context_size
        self.self_attn = MultiheadAttention(
            self.embed_dim, args.encoder_attention_heads,
            dropout=args.attention_dropout, self_attention=True, 
        )
        
        self.self_attn_layer_norm = LayerNorm(self.embed_dim)
        self.dropout = args.dropout
        self.activation_fn = utils.get_activation_fn(
            activation=getattr(args, 'activation_fn', 'relu')
        )
        self.activation_dropout = getattr(args, 'activation_dropout', 0)
        if self.activation_dropout == 0:
            # for backwards compatibility with models that use args.relu_dropout
            self.activation_dropout = getattr(args, 'relu_dropout', 0)
        self.normalize_before = args.encoder_normalize_before
        self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
        self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
        self.final_layer_norm = LayerNorm(self.embed_dim)
        # define the three convolutional layers with different context sizes;
        # the convolutions are implemented as separable convolutions to reduce the number of model parameters
        self.conv1 = nn.Conv1d(in_channels=self.embed_dim, out_channels=self.embed_dim, groups=self.embed_dim, kernel_size=(self.context_size, ), padding=(self.context_size-1)//2)
        self.conv1_sep = nn.Conv1d(in_channels=self.embed_dim, out_channels=self.embed_dim, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=self.embed_dim, out_channels=self.embed_dim, groups=self.embed_dim, kernel_size=(self.context_size+2, ), padding=(self.context_size+2-1)//2)
        self.conv2_sep = nn.Conv1d(in_channels=self.embed_dim, out_channels=self.embed_dim, kernel_size=1)
        self.conv3 = nn.Conv1d(in_channels=self.embed_dim, out_channels=self.embed_dim, groups=self.embed_dim, kernel_size=(self.context_size+4, ), padding=(self.context_size+4-1)//2)
        self.conv3_sep = nn.Conv1d(in_channels=self.embed_dim, out_channels=self.embed_dim, kernel_size=1)
        self.leakyrelu1 = nn.LeakyReLU(negative_slope=0.01, inplace=True)
        self.leakyrelu2 = nn.LeakyReLU(negative_slope=0.01, inplace=True)
        self.leakyrelu3 = nn.LeakyReLU(negative_slope=0.01, inplace=True)
        self.leakyrelu4 = nn.LeakyReLU(negative_slope=0.01, inplace=True)
        # conv4 is a standard convolution layer that projects the concatenated branch outputs back to embed_dim
        self.conv4 = nn.Conv1d(in_channels=self.embed_dim*3, out_channels=self.embed_dim, kernel_size=(self.context_size, ), padding=(self.context_size-1)//2)

        self.conv_layer_norm = LayerNorm(self.embed_dim)
Example #20
 def build_self_attention(self,
                          embed_dim,
                          args,
                          index,
                          add_bias_kv=False,
                          add_zero_attn=False):
     if args.decoder_branch_type is None:
         return MultiheadAttention(
             embed_dim,
             args.decoder_attention_heads,
             dropout=args.attention_dropout,
             add_bias_kv=add_bias_kv,
             add_zero_attn=add_zero_attn,
             self_attention=not getattr(args, "cross_self_attention",
                                        False),
             q_noise=self.quant_noise,
             qn_block_size=self.quant_noise_block_size,
         )
     else:
         embed_dims = []
         for layer_type in args.decoder_branch_type:
             embed_dims.append(int(layer_type.split(':')[2]))
         return MultiBranch(self.self_attn_branches, embed_dims)
Example #21
 def build_self_attention(self,
                          embed_dim,
                          args,
                          add_bias_kv=False,
                          add_zero_attn=False):
     return MultiheadAttention(
         embed_dim,
         args.decoder_attention_heads,
         dropout=args.attention_dropout,
         add_bias_kv=add_bias_kv,
         add_zero_attn=add_zero_attn,
         self_attention=not getattr(args, "cross_self_attention", False),
         q_noise=self.quant_noise,
         qn_block_size=self.quant_noise_block_size,
         # relative_pos_type=("masked"
         #                    if getattr(args, "use_relative_pos_embeddings", False)
         #                    else None),
          relative_pos_type=None,  # relative positions disabled here (see the commented-out default above)
         max_relative_pos=getattr(args, "max_relative_pos", 128),
         heads_share_embeddings=getattr(args, "heads_share_embeddings",
                                        False),
         add_pos_embeddings_to_values=getattr(
             args, "add_pos_embeddings_to_values", False))
Example #22
 def get_layer(self, args, index, out_dim, num_heads, layer_type,
               add_bias_kv, add_zero_attn):
     kernel_size = layer_type.split(':')[1]
     if kernel_size == 'default':
         kernel_size = args.decoder_kernel_size_list[index]
     else:
         kernel_size = int(kernel_size)
     layer_type = layer_type.split(':')[0]
     if layer_type == 'lightweight':
         layer = LightweightConv(out_dim,
                                 kernel_size,
                                 padding_l=kernel_size - 1,
                                 weight_softmax=args.weight_softmax,
                                 num_heads=num_heads,
                                 weight_dropout=args.weight_dropout)
     elif layer_type == 'dynamic':
         layer = DynamicConv(out_dim,
                             kernel_size,
                             padding_l=kernel_size - 1,
                             weight_softmax=args.weight_softmax,
                             num_heads=num_heads,
                             weight_dropout=args.weight_dropout)
     elif layer_type == 'attn':
         layer = MultiheadAttention(
             out_dim,
             num_heads,
             dropout=args.attention_dropout,
             add_bias_kv=add_bias_kv,
             add_zero_attn=add_zero_attn,
             self_attention=not getattr(args, "cross_self_attention",
                                        False),
             q_noise=self.quant_noise,
             qn_block_size=self.quant_noise_block_size,
         )
     else:
         raise NotImplementedError
     return layer
Example #23
    def __init__(self, args):
        super().__init__()
        embed_dim = args.encoder_embed_dim
        self.embed_dim = args.encoder_embed_dim
        self.self_attn = MultiheadAttention(
            self.embed_dim,
            args.encoder_attention_heads,
            dropout=args.attention_dropout,
            self_attention=True,
        )
#        self.self_attn_layer_norm = SlimmableLayernorm([int(self.embed_dim / 4), int(self.embed_dim * 2 / 4), int(self.embed_dim * 3/ 4), self.embed_dim])
        self.self_attn_layer_norm = SlimmableLayernorm(
            [int(embed_dim * i / 16) for i in range(4, 16)] + [embed_dim])
        self.dropout = [0.3] * 13
        self.activation_fn = utils.get_activation_fn(
            activation=getattr(args, "activation_fn", "relu")
        )
        self.activation_dropout = getattr(args, "activation_dropout", 0)
        if self.activation_dropout == 0:
            # for backwards compatibility with models that use args.relu_dropout
            self.activation_dropout = getattr(args, "relu_dropout", 0)
        self.normalize_before = args.encoder_normalize_before
#        self.fc1 = SLinear([int(self.embed_dim / 4), int(self.embed_dim * 2 / 4), int(self.embed_dim * 3 / 4), self.embed_dim], 
#                           [int(args.encoder_ffn_embed_dim / 4), int(args.encoder_ffn_embed_dim * 2 / 4), int(args.encoder_ffn_embed_dim * 3 / 4), args.encoder_ffn_embed_dim])
#        self.fc2 = SLinear([int(args.encoder_ffn_embed_dim / 4), int(args.encoder_ffn_embed_dim * 2 / 4), int(args.encoder_ffn_embed_dim * 3 / 4), args.encoder_ffn_embed_dim], 
#                           [int(self.embed_dim / 4), int(self.embed_dim * 2 / 4), int(self.embed_dim * 3 / 4) , self.embed_dim])
#        self.final_layer_norm = SlimmableLayernorm([int(self.embed_dim / 4), int(self.embed_dim * 2 / 4), int(self.embed_dim * 3 / 4), self.embed_dim])
        self.fc1 = SLinear(embed_dim, args.encoder_ffn_embed_dim)
        self.fc2 = SLinear(args.encoder_ffn_embed_dim, embed_dim)

        self.final_layer_norm = SlimmableLayernorm(
            [int(embed_dim * i / 16) for i in range(4, 16)] + [embed_dim])

        self.linear_list = [int(embed_dim * i / 16) for i in range(4, 16)] + [embed_dim]

        self.ffn_list = ([int(args.encoder_ffn_embed_dim * i / 16) for i in range(4, 16)]
                         + [args.encoder_ffn_embed_dim])

        self.epison = True
        self.epison_value = 3.0
Example #24
 def __init__(self, args):
     super().__init__()
     self.embed_dim = args.encoder_embed_dim
     self.num_branches = args.encoder_branches
     self.num_pffn_branches = args.encoder_pffn_branches
     self.join_pffn = args.join_pffn
     self.branch_dropout = args.branch_dropout
     self.pffn_branch_dropout = args.pffn_branch_dropout
     self.enable_head_dropout = args.enable_head_dropout
     self.self_attn_branches = nn.ModuleList([
         MultiheadAttention(
             self.embed_dim,
             args.encoder_attention_heads,
             dropout=args.attention_dropout,
             self_attention=True,
             head_dropout=self.branch_dropout
             if self.enable_head_dropout else None,
         ) for _ in range(self.num_branches)
     ])
     self.self_attn_layer_norm = LayerNorm(self.embed_dim)
     self.dropout = args.dropout
     self.activation_fn = utils.get_activation_fn(
         activation=getattr(args, 'activation_fn', 'relu'))
     self.activation_dropout = getattr(args, 'activation_dropout', 0)
     if self.activation_dropout == 0:
         # for backwards compatibility with models that use args.relu_dropout
         self.activation_dropout = getattr(args, 'relu_dropout', 0)
     self.normalize_before = args.encoder_normalize_before
     self.fc1_branches = nn.ModuleList([
         Linear(self.embed_dim, args.encoder_ffn_embed_dim)
         for _ in range(self.num_pffn_branches)
     ])
     self.fc2_branches = nn.ModuleList([
         Linear(args.encoder_ffn_embed_dim, self.embed_dim)
         for _ in range(self.num_pffn_branches)
     ])
     self.final_layer_norm = LayerNorm(self.embed_dim)
Example #25
    def __init__(
        self,
        embedding_dim: float = 768,
        ffn_embedding_dim: float = 3072,
        num_attention_heads: float = 8,
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.1,
        encoder_normalize_before: bool = False,
        use_bert_layer_norm: bool = False,
        use_gelu: bool = True,
    ) -> None:

        super().__init__()
        # Initialize parameters
        self.embedding_dim = embedding_dim
        self.dropout = dropout
        self.activation_dropout = activation_dropout
        self.normalize_before = encoder_normalize_before

        # Initialize blocks
        self.activation_fn = gelu if use_gelu else F.relu
        self.self_attn = MultiheadAttention(self.embedding_dim,
                                            num_attention_heads,
                                            dropout=attention_dropout)

        # layer norm associated with the self attention layer
        self.self_attn_layer_norm = (BertLayerNorm(self.embedding_dim)
                                     if use_bert_layer_norm else LayerNorm(
                                         self.embedding_dim, eps=1e-12))
        self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
        self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)

        # layer norm associated with the position wise feed-forward NN
        self.final_layer_norm = (BertLayerNorm(self.embedding_dim)
                                 if use_bert_layer_norm else LayerNorm(
                                     self.embedding_dim, eps=1e-12))
Example #26
    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
        decoder_layers = args.decoder_layers
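        # same trick as the encoder variant above: build the parent with zero layers,
        # install a single EndorsementDetectorDecoderLayer, then restore the setting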
        args.decoder_layers = 0
        super().__init__(args, dictionary, embed_tokens, no_encoder_attn)
        self.layers = nn.ModuleList([])
        self.layers.extend([
            EndorsementDetectorDecoderLayer(args, no_encoder_attn)
            for _ in range(1)
        ])
        args.decoder_layers = decoder_layers
        self.padding_idx = embed_tokens.padding_idx
        self.proj_prob = Linear(args.decoder_embed_dim, 1, bias=True)
        self.eds_fc1 = Linear(args.decoder_embed_dim,
                              args.decoder_embed_dim,
                              bias=False)
        self.eds_fc2 = Linear(args.decoder_embed_dim,
                              args.decoder_embed_dim,
                              bias=False)
        self.eds_layer_norm = LayerNorm(args.decoder_embed_dim)

        self.self_attn = MultiheadAttention(args.decoder_embed_dim,
                                            args.decoder_attention_heads,
                                            dropout=0,
                                            self_attention=True)
Example #27
class TransformerDecoderLayer(nn.Module):
    """Decoder layer block.

    In the original paper each operation (multi-head attention, encoder
    attention or FFN) is postprocessed with: `dropout -> add residual ->
    layernorm`. In the tensor2tensor code they suggest that learning is more
    robust when preprocessing each layer with layernorm and postprocessing with:
    `dropout -> add residual`. We default to the approach in the paper, but the
    tensor2tensor approach can be enabled by setting
    *args.decoder_normalize_before* to ``True``.

    Args:
        args (argparse.Namespace): parsed command-line arguments
        no_encoder_attn (bool, optional): whether to attend to encoder outputs
            (default: False).
    """
    def __init__(self, args, no_encoder_attn=False):
        super().__init__()
        self.embed_dim = args.decoder_embed_dim
        self.self_attn = MultiheadAttention(
            self.embed_dim,
            args.decoder_attention_heads,
            dropout=args.attention_dropout,
        )
        self.dropout = args.dropout
        self.relu_dropout = args.relu_dropout
        self.normalize_before = args.decoder_normalize_before

        self.self_attn_layer_norm = LayerNorm(self.embed_dim)

        if no_encoder_attn:
            self.encoder_attn = None
            self.encoder_attn_layer_norm = None
        else:
            self.encoder_attn = MultiheadAttention(
                self.embed_dim,
                args.decoder_attention_heads,
                dropout=args.attention_dropout,
            )
            self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)

        self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
        self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)

        self.final_layer_norm = LayerNorm(self.embed_dim)
        self.need_attn = True

        self.onnx_trace = False

    def prepare_for_onnx_export_(self):
        self.onnx_trace = True

    def forward(self,
                x,
                encoder_out,
                encoder_padding_mask,
                incremental_state,
                prev_self_attn_state=None,
                prev_attn_state=None,
                self_attn_mask=None,
                self_attn_padding_mask=None):
        """
        Args:
            x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
            encoder_padding_mask (ByteTensor): binary ByteTensor of shape
                `(batch, src_len)` where padding elements are indicated by ``1``.

        Returns:
            encoded output of shape `(seq_len, batch, embed_dim)`
        """
        residual = x
        x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True)
        if prev_self_attn_state is not None:
            if incremental_state is None:
                incremental_state = {}
            prev_key, prev_value = prev_self_attn_state
            saved_state = {"prev_key": prev_key, "prev_value": prev_value}
            self.self_attn._set_input_buffer(incremental_state, saved_state)
        x, _ = self.self_attn(
            query=x,
            key=x,
            value=x,
            key_padding_mask=self_attn_padding_mask,
            incremental_state=incremental_state,
            need_weights=False,
            attn_mask=self_attn_mask,
        )
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        x = self.maybe_layer_norm(self.self_attn_layer_norm, x, after=True)

        attn = None
        if self.encoder_attn is not None:
            residual = x
            x = self.maybe_layer_norm(self.encoder_attn_layer_norm,
                                      x,
                                      before=True)
            if prev_attn_state is not None:
                if incremental_state is None:
                    incremental_state = {}
                prev_key, prev_value = prev_attn_state
                saved_state = {"prev_key": prev_key, "prev_value": prev_value}
                self.encoder_attn._set_input_buffer(incremental_state,
                                                    saved_state)
            x, attn = self.encoder_attn(
                query=x,
                key=encoder_out,
                value=encoder_out,
                key_padding_mask=encoder_padding_mask,
                incremental_state=incremental_state,
                static_kv=True,
                need_weights=(not self.training and self.need_attn),
            )
            x = F.dropout(x, p=self.dropout, training=self.training)
            x = residual + x
            x = self.maybe_layer_norm(self.encoder_attn_layer_norm,
                                      x,
                                      after=True)

        residual = x
        x = self.maybe_layer_norm(self.final_layer_norm, x, before=True)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, p=self.relu_dropout, training=self.training)
        x = self.fc2(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        x = self.maybe_layer_norm(self.final_layer_norm, x, after=True)
        if self.onnx_trace:
            saved_state = self.self_attn._get_input_buffer(incremental_state)
            self_attn_state = saved_state["prev_key"], saved_state[
                "prev_value"]
            return x, attn, self_attn_state
        return x, attn

    def maybe_layer_norm(self, layer_norm, x, before=False, after=False):
        assert before ^ after
        if after ^ self.normalize_before:
            return layer_norm(x)
        else:
            return x

    def make_generation_fast_(self, need_attn=False, **kwargs):
        self.need_attn = need_attn
Example #28
    def forward(
            self,
            query,
            key: Optional[Tensor],
            value: Optional[Tensor],
            key_padding_mask: Optional[Tensor] = None,
            incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
            need_weights: bool = True,
            static_kv: bool = False,
            attn_mask: Optional[Tensor] = None,
            before_softmax: bool = False,
            need_head_weights: bool = False,
    ) -> Tuple[Tensor, Optional[Tensor]]:
        """Input shape: Time x Batch x Channel
        Args:
            key_padding_mask (ByteTensor, optional): mask to exclude
                keys that are pads, of shape `(batch, src_len)`, where
                padding elements are indicated by 1s.
            need_weights (bool, optional): return the attention weights,
                averaged over heads (default: True).
            attn_mask (ByteTensor, optional): typically used to
                implement causal attention, where the mask prevents the
                attention from looking forward in time (default: None).
            before_softmax (bool, optional): return the raw attention
                weights and values before the attention softmax.
            need_head_weights (bool, optional): return the attention
                weights for each head. Implies *need_weights*. Default:
                return the average attention weights over all heads.
        """
        if need_head_weights:
            need_weights = True

        is_tpu = query.device.type == "xla"

        tgt_len, bsz, embed_dim = query.size()
        assert embed_dim == self.embed_dim
        assert list(query.size()) == [tgt_len, bsz, embed_dim]

        if (
                not self.onnx_trace
                and not is_tpu  # don't use PyTorch version on TPUs
                and incremental_state is None
                and not static_kv
                # A workaround for quantization to work. Otherwise JIT compilation
                # treats bias in linear module as method.
                and not torch.jit.is_scripting()
        ):
            assert key is not None and value is not None
            return F.multi_head_attention_forward(
                query,
                key,
                value,
                self.embed_dim,
                self.num_heads,
                torch.empty([0]),
                torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)),
                self.bias_k,
                self.bias_v,
                self.add_zero_attn,
                self.dropout_module.p,
                self.out_proj.weight,
                self.out_proj.bias,
                self.training or self.dropout_module.apply_during_inference,
                key_padding_mask,
                need_weights,
                attn_mask,
                use_separate_proj_weight=True,
                q_proj_weight=self.q_proj.weight,
                k_proj_weight=self.k_proj.weight,
                v_proj_weight=self.v_proj.weight,
            )

        if incremental_state is not None:
            saved_state = self._get_input_buffer(incremental_state)
            if saved_state is not None and "prev_key" in saved_state:
                # previous time steps are cached - no need to recompute
                # key and value if they are static
                if static_kv:
                    assert self.encoder_decoder_attention and not self.self_attention
                    key = value = None
        else:
            saved_state = None

        if self.self_attention:
            q = self.q_proj(query)
            k = self.k_proj(query)
            v = self.v_proj(query)
        elif self.encoder_decoder_attention:
            # encoder-decoder attention
            q = self.q_proj(query)
            if key is None:
                assert value is None
                k = v = None
            else:
                k = self.k_proj(key)
                v = self.v_proj(key)

        else:
            assert key is not None and value is not None
            q = self.q_proj(query)
            k = self.k_proj(key)
            v = self.v_proj(value)
        q *= self.scaling

        if self.bias_k is not None:
            assert self.bias_v is not None
            k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
            v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
            if attn_mask is not None:
                attn_mask = torch.cat(
                    [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
                )
            if key_padding_mask is not None:
                key_padding_mask = torch.cat(
                    [
                        key_padding_mask,
                        key_padding_mask.new_zeros(key_padding_mask.size(0), 1),
                    ],
                    dim=1,
                )

        q = (
            q.contiguous()
                .view(tgt_len, bsz * self.num_heads, self.head_dim)
                .transpose(0, 1)
        )
        if k is not None:
            k = (
                k.contiguous()
                    .view(-1, bsz * self.num_heads, self.head_dim)
                    .transpose(0, 1)
            )
        if v is not None:
            v = (
                v.contiguous()
                    .view(-1, bsz * self.num_heads, self.head_dim)
                    .transpose(0, 1)
            )

        if saved_state is not None:
            # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
            if "prev_key" in saved_state:
                _prev_key = saved_state["prev_key"]
                assert _prev_key is not None
                prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim)
                if static_kv:
                    k = prev_key
                else:
                    assert k is not None
                    k = torch.cat([prev_key, k], dim=1)
            if "prev_value" in saved_state:
                _prev_value = saved_state["prev_value"]
                assert _prev_value is not None
                prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim)
                if static_kv:
                    v = prev_value
                else:
                    assert v is not None
                    v = torch.cat([prev_value, v], dim=1)
            prev_key_padding_mask: Optional[Tensor] = None
            if "prev_key_padding_mask" in saved_state:
                prev_key_padding_mask = saved_state["prev_key_padding_mask"]
            assert k is not None and v is not None
            key_padding_mask = MultiheadAttention._append_prev_key_padding_mask(
                key_padding_mask=key_padding_mask,
                prev_key_padding_mask=prev_key_padding_mask,
                batch_size=bsz,
                src_len=k.size(1),
                static_kv=static_kv,
            )

            saved_state["prev_key"] = k.view(bsz, self.num_heads, -1, self.head_dim)
            saved_state["prev_value"] = v.view(bsz, self.num_heads, -1, self.head_dim)
            saved_state["prev_key_padding_mask"] = key_padding_mask
            # In this branch incremental_state is never None
            assert incremental_state is not None
            incremental_state = self._set_input_buffer(incremental_state, saved_state)
        assert k is not None
        src_len = k.size(1)

        # This is part of a workaround to get around fork/join parallelism
        # not supporting Optional types.
        if key_padding_mask is not None and key_padding_mask.dim() == 0:
            key_padding_mask = None

        if key_padding_mask is not None:
            assert key_padding_mask.size(0) == bsz
            assert key_padding_mask.size(1) == src_len

        if self.add_zero_attn:
            assert v is not None
            src_len += 1
            k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
            v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
            if attn_mask is not None:
                attn_mask = torch.cat(
                    [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
                )
            if key_padding_mask is not None:
                key_padding_mask = torch.cat(
                    [
                        key_padding_mask,
                        torch.zeros(key_padding_mask.size(0), 1).type_as(
                            key_padding_mask
                        ),
                    ],
                    dim=1,
                )

        attn_weights = torch.bmm(q, k.transpose(1, 2))

        assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]

        if attn_mask is not None:
            attn_mask = attn_mask.unsqueeze(0)
            if self.onnx_trace:
                attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1)
            attn_weights += attn_mask

        if key_padding_mask is not None:
            # don't attend to padding symbols
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            if not is_tpu:
                attn_weights = attn_weights.masked_fill(
                    key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
                    float("-inf"),
                )
            else:
                attn_weights = attn_weights.transpose(0, 2)
                attn_weights = attn_weights.masked_fill(key_padding_mask, float("-inf"))
                attn_weights = attn_weights.transpose(0, 2)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)
        if before_softmax:
            return attn_weights, v

        attn_weights_float = utils.softmax(
            attn_weights, dim=-1, onnx_trace=self.onnx_trace
        )
        attn_weights = attn_weights_float.type_as(attn_weights)
        attn_probs = self.dropout_module(attn_weights)

        assert v is not None
        attn = torch.bmm(attn_probs, v)
        assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
        if self.onnx_trace and attn.size(1) == 1:
            # when ONNX tracing a single decoder step (sequence length == 1)
            # the transpose is a no-op copy before view, thus unnecessary
            attn = attn.contiguous().view(tgt_len, bsz, embed_dim)
        else:
            attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
        attn = self.out_proj(attn)
        attn_weights: Optional[Tensor] = None
        if need_weights:
            attn_weights = attn_weights_float.view(
                bsz, self.num_heads, tgt_len, src_len
            ).transpose(1, 0)
            if not need_head_weights:
                # average attention weights over heads
                attn_weights = attn_weights.mean(dim=0)

        return attn, attn_weights
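
A hedged usage sketch for the forward() above (assuming the module was built with self_attention=True and that inputs follow the Time x Batch x Channel layout from the docstring):

import torch
from fairseq.modules import MultiheadAttention  # assumed import path

mha = MultiheadAttention(embed_dim=512, num_heads=8, self_attention=True)
x = torch.randn(10, 2, 512)                 # (tgt_len, batch, embed_dim)
pad = torch.zeros(2, 10, dtype=torch.bool)  # (batch, src_len); True marks padding
pad[1, 7:] = True                           # the last three positions of sample 1 are pads

out, weights = mha(query=x, key=x, value=x, key_padding_mask=pad)
# out: (10, 2, 512); weights: (2, 10, 10) -- attention averaged over heads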
Example #29
    def __init__(self, args, no_encoder_attn=False, mix=False, no_man=False):
        super().__init__()
        self.embed_dim = args.decoder_embed_dim
        self.mix = mix
        self.no_man = no_man

        self.decoder_fc1 = []
        self.decoder_fc2 = []
        self.decoder_self_attn = []
        self.decoder_d_self_attn = []
        self.decoder_ma = []
        self.normalize = []
        self.dropout = args.dropout
        self.relu_dropout = args.relu_dropout
        self.normalize_before = args.decoder_normalize_before

        index = 0
        cnt = 0
        self.decoder_seq = args.decoder_seq
        #print('args.decoder_seq', args.decoder_seq)
        for i in range(len(args.decoder_seq)):
            # ff: position-wise feed-forward sublayer (two Linear layers)
            if args.decoder_seq[i] == 1:
                t = Linear(self.embed_dim,
                           args.decoder_para[i] * self.embed_dim)
                self.add_module('sublayer_%d' % cnt, t)
                cnt += 1
                self.decoder_fc1.append(t)

                t = Linear(args.decoder_para[i] * self.embed_dim,
                           self.embed_dim)
                self.add_module('sublayer_%d' % cnt, t)
                cnt += 1
                self.decoder_fc2.append(t)

                t = LayerNorm(self.embed_dim)
                self.add_module('sublayer_%d' % cnt, t)
                cnt += 1
                self.normalize.append(t)
            # sa: self-attention sublayer
            elif args.decoder_seq[i] == 2:
                t = MultiheadAttention(
                    self.embed_dim,
                    args.decoder_para[i],
                    dropout=args.attention_dropout,
                )
                self.add_module('sublayer_%d' % cnt, t)
                cnt += 1
                self.decoder_self_attn.append(t)

                t = LayerNorm(self.embed_dim)
                self.add_module('sublayer_%d' % cnt, t)
                cnt += 1
                self.normalize.append(t)
            # dsa: self-attention variant with re-weighting (re_weight_m=1)
            elif args.decoder_seq[i] == 3:
                t = MultiheadAttention(self.embed_dim,
                                       args.decoder_para[i],
                                       dropout=args.attention_dropout,
                                       re_weight_m=1,
                                       max_len=args.max_target_positions)
                self.add_module('sublayer_%d' % cnt, t)
                cnt += 1
                self.decoder_d_self_attn.append(t)

                t = LayerNorm(self.embed_dim)
                self.add_module('sublayer_%d' % cnt, t)
                cnt += 1
                self.normalize.append(t)
            # ma: encoder-decoder (cross) attention sublayer; skipped when no_encoder_attn
            elif args.decoder_seq[i] == 4:
                if no_encoder_attn:
                    self.decoder_ma.append([])
                    self.normalize.append([])

                else:
                    t = MultiheadAttention(
                        self.embed_dim,
                        args.decoder_para[i],
                        dropout=args.attention_dropout,
                    )
                    self.add_module('sublayer_%d' % cnt, t)
                    cnt += 1
                    self.decoder_ma.append(t)

                    t = LayerNorm(self.embed_dim)
                    self.add_module('sublayer_%d' % cnt, t)
                    cnt += 1
                    self.normalize.append(t)

                index += 1
        """        
        for i in self.decoder_fc1:
            i.cuda()
        for i in self.decoder_fc2:
            i.cuda()
        for i in self.decoder_self_attn:
            i.cuda()
        for i in self.decoder_d_self_attn:
            i.cuda()
        for i in self.decoder_ma:
            i.cuda()
        for i in self.normalize:
            i.cuda()
        """
        #if no_encoder_attn:
        #    self.encoder_attn = None
        #    self.encoder_attn_layer_norm = None

        self.need_attn = True

        self.onnx_trace = False
Example #30
    def __init__(self, args, no_encoder_attn=False, kernel_size=0):
        super().__init__()
        self.embed_dim = args.decoder_embed_dim
        self.conv_dim = args.decoder_conv_dim
        if args.decoder_glu:
            self.linear1 = Linear(self.embed_dim, 2 * self.conv_dim)
            self.act = nn.GLU()
        else:
            self.linear1 = Linear(self.embed_dim, self.conv_dim)
            self.act = None
        if args.decoder_conv_type == "lightweight":
            self.conv = LightweightConv(
                self.conv_dim,
                kernel_size,
                padding_l=kernel_size - 1,
                weight_softmax=args.weight_softmax,
                num_heads=args.decoder_attention_heads,
                weight_dropout=args.weight_dropout,
            )
        elif args.decoder_conv_type == "dynamic":
            self.conv = DynamicConv(
                self.conv_dim,
                kernel_size,
                padding_l=kernel_size - 1,
                weight_softmax=args.weight_softmax,
                num_heads=args.decoder_attention_heads,
                weight_dropout=args.weight_dropout,
            )
        else:
            raise NotImplementedError
        self.linear2 = Linear(self.conv_dim, self.embed_dim)

        self.dropout_module = FairseqDropout(
            args.dropout, module_name=self.__class__.__name__
        )
        self.relu_dropout_module = FairseqDropout(
            args.relu_dropout, module_name=self.__class__.__name__
        )
        self.input_dropout_module = FairseqDropout(
            args.input_dropout, module_name=self.__class__.__name__
        )
        self.normalize_before = args.decoder_normalize_before

        self.conv_layer_norm = LayerNorm(self.embed_dim)

        if no_encoder_attn:
            self.encoder_attn = None
            self.encoder_attn_layer_norm = None
        else:
            self.encoder_attn = MultiheadAttention(
                self.embed_dim,
                args.decoder_attention_heads,
                dropout=args.attention_dropout,
                encoder_decoder_attention=True,
            )
            self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)

        self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
        self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)

        self.final_layer_norm = LayerNorm(self.embed_dim)
        self.need_attn = True