Example #1
0
    def __init__(self,
                 embed_dim,
                 conv_dim,
                 num_heads,
                 kernel_size,
                 weight_dropout=0.1,
                 dropout=0.3,
                 input_dropout=0.0,
                 weight_softmax=True,
                 encoder_glu=False,
                 normalize_before=False):
        """Set up the dynamic-convolution sublayer and its projections.

        Input is projected from ``embed_dim`` to ``conv_dim`` (doubled when
        a GLU gate is used), run through a DynamicConv, then projected back
        to ``embed_dim``.
        """
        super().__init__()
        self.embed_dim = embed_dim
        self.conv_dim = conv_dim

        # A GLU halves its input channels, so the first projection must
        # produce twice as many features when gating is enabled.
        in_proj_dim = 2 * conv_dim if encoder_glu else conv_dim
        self.linear1 = Linear(embed_dim, in_proj_dim)
        self.act = nn.GLU() if encoder_glu else None

        self.conv = DynamicConv(conv_dim,
                                kernel_size,
                                padding_l=kernel_size - 1,
                                weight_softmax=weight_softmax,
                                num_heads=num_heads,
                                weight_dropout=weight_dropout)
        self.linear2 = Linear(conv_dim, embed_dim)

        self.dropout = dropout
        self.input_dropout = input_dropout
        self.normalize_before = normalize_before
        self.conv_layer_norm = LayerNorm(embed_dim)
Example #2
0
    def __init__(self, args, no_encoder_attn=False, kernel_size=0):
        """Build a conv-based decoder layer from parsed command-line args.

        Sublayers: (lightweight|dynamic) convolution, optional
        encoder-decoder attention, and a position-wise feed-forward block.
        """
        super().__init__()
        self.embed_dim = args.decoder_embed_dim
        self.conv_dim = args.decoder_conv_dim

        # GLU gating halves its channel count, so project to twice conv_dim.
        if args.decoder_glu:
            self.linear1 = Linear(self.embed_dim, 2 * self.conv_dim)
            self.act = nn.GLU()
        else:
            self.linear1 = Linear(self.embed_dim, self.conv_dim)
            self.act = None

        # Both convolution flavours share the same constructor signature.
        conv_classes = {"lightweight": LightweightConv, "dynamic": DynamicConv}
        if args.decoder_conv_type not in conv_classes:
            raise NotImplementedError
        self.conv = conv_classes[args.decoder_conv_type](
            self.conv_dim,
            kernel_size,
            padding_l=kernel_size - 1,
            weight_softmax=args.weight_softmax,
            num_heads=args.decoder_attention_heads,
            weight_dropout=args.weight_dropout,
        )
        self.linear2 = Linear(self.conv_dim, self.embed_dim)

        module_name = self.__class__.__name__
        self.dropout_module = FairseqDropout(
            args.dropout, module_name=module_name)
        self.relu_dropout_module = FairseqDropout(
            args.relu_dropout, module_name=module_name)
        self.input_dropout_module = FairseqDropout(
            args.input_dropout, module_name=module_name)
        self.normalize_before = args.decoder_normalize_before

        self.conv_layer_norm = LayerNorm(self.embed_dim)

        if no_encoder_attn:
            self.encoder_attn = None
            self.encoder_attn_layer_norm = None
        else:
            self.encoder_attn = MultiheadAttention(
                self.embed_dim,
                args.decoder_attention_heads,
                dropout=args.attention_dropout,
                encoder_decoder_attention=True,
            )
            self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)

        self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
        self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)

        self.final_layer_norm = LayerNorm(self.embed_dim)
        self.need_attn = True
Example #3
0
    def __init__(self, args, no_encoder_attn=False, kernel_size=0, use_linear_se=False):
        """Build a conv-based decoder layer, optionally with LinearSE projections.

        ``use_linear_se`` swaps every linear projection (linear1/linear2,
        fc1/fc2) for the LinearSE variant; everything else is unchanged.
        """
        super().__init__()
        self.embed_dim = args.decoder_embed_dim
        self.conv_dim = args.decoder_conv_dim

        # Pick the projection class once instead of branching at each site.
        proj = LinearSE if use_linear_se else Linear

        # GLU halves its channel count, so widen the first projection for it.
        if args.decoder_glu:
            self.linear1 = proj(self.embed_dim, 2*self.conv_dim)
            self.act = nn.GLU()
        else:
            self.linear1 = proj(self.embed_dim, self.conv_dim)
            self.act = None

        conv_types = {'lightweight': LightweightConv, 'dynamic': DynamicConv}
        if args.decoder_conv_type not in conv_types:
            raise NotImplementedError
        self.conv = conv_types[args.decoder_conv_type](
            self.conv_dim, kernel_size, padding_l=kernel_size-1,
            weight_softmax=args.weight_softmax,
            num_heads=args.decoder_attention_heads,
            weight_dropout=args.weight_dropout)
        self.linear2 = proj(self.conv_dim, self.embed_dim)

        self.dropout = args.dropout
        self.relu_dropout = args.relu_dropout
        self.input_dropout = args.input_dropout
        self.normalize_before = args.decoder_normalize_before

        self.conv_layer_norm = LayerNorm(self.embed_dim)

        if no_encoder_attn:
            self.encoder_attn = None
            self.encoder_attn_layer_norm = None
        else:
            self.encoder_attn = MultiheadAttention(
                self.embed_dim, args.decoder_attention_heads,
                dropout=args.attention_dropout, encoder_decoder_attention=True
            )
            self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)

        self.fc1 = proj(self.embed_dim, args.decoder_ffn_embed_dim)
        self.fc2 = proj(args.decoder_ffn_embed_dim, self.embed_dim)

        self.final_layer_norm = LayerNorm(self.embed_dim)
        self.need_attn = True
Example #4
0
 def get_layer(self, args, index, out_dim, num_heads, layer_type):
     """Instantiate one encoder sublayer from a ``kind:size`` type string.

     The part after ``:`` is either ``default`` (use the per-layer size
     from ``args.encoder_kernel_size_list``) or an explicit kernel size.
     """
     size_spec = layer_type.split(':')[1]
     if size_spec == 'default':
         kernel_size = args.encoder_kernel_size_list[index]
     else:
         kernel_size = int(size_spec)
     # Odd kernels pad symmetrically; even ones split the padding unevenly.
     if kernel_size % 2 == 1:
         padding_l = kernel_size // 2
     else:
         padding_l = ((kernel_size - 1) // 2, kernel_size // 2)
     conv_kwargs = dict(padding_l=padding_l,
                        weight_softmax=args.weight_softmax,
                        num_heads=num_heads,
                        weight_dropout=args.weight_dropout)
     if 'lightweight' in layer_type:
         return LightweightConv(out_dim, kernel_size, **conv_kwargs)
     if 'dynamic' in layer_type:
         return DynamicConv(out_dim, kernel_size, **conv_kwargs)
     if 'attn' in layer_type:
         return MultiheadAttention(
             out_dim,
             num_heads,
             dropout=args.attention_dropout,
             self_attention=True,
             q_noise=self.quant_noise,
             qn_block_size=self.quant_noise_block_size,
         )
     raise NotImplementedError
Example #5
0
    def __init__(self, args, kernel_size=0):
        """Build a conv-based encoder layer from parsed command-line args."""
        super().__init__()
        self.embed_dim = args.encoder_embed_dim
        self.conv_dim = args.encoder_conv_dim
        # Odd kernels pad symmetrically; even ones split the padding unevenly.
        if kernel_size % 2 == 1:
            padding_l = kernel_size // 2
        else:
            padding_l = ((kernel_size - 1) // 2, kernel_size // 2)

        # GLU halves its channel count, so widen the first projection for it.
        if args.encoder_glu:
            self.linear1 = Linear(self.embed_dim, 2*self.conv_dim)
            self.act = nn.GLU()
        else:
            self.linear1 = Linear(self.embed_dim, self.conv_dim)
            self.act = None

        conv_types = {'lightweight': LightweightConv, 'dynamic': DynamicConv}
        if args.encoder_conv_type not in conv_types:
            raise NotImplementedError
        self.conv = conv_types[args.encoder_conv_type](
            self.conv_dim, kernel_size, padding_l=padding_l,
            weight_softmax=args.weight_softmax,
            num_heads=args.encoder_attention_heads,
            weight_dropout=args.weight_dropout)
        self.linear2 = Linear(self.conv_dim, self.embed_dim)

        self.dropout = args.dropout
        self.relu_dropout = args.relu_dropout
        self.input_dropout = args.input_dropout
        self.normalize_before = args.encoder_normalize_before
        self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
        self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
        # One norm for the conv sublayer, one for the feed-forward sublayer.
        self.layer_norms = nn.ModuleList([LayerNorm(self.embed_dim) for _ in range(2)])
 def get_layer(self, args, index, out_dim, num_heads, layer_type, add_bias_kv, add_zero_attn):
     """Instantiate one decoder sublayer from a ``kind:size`` type string.

     The part after ``:`` is either ``default`` (use the per-layer size
     from ``args.decoder_kernel_size_list``) or an explicit kernel size.
     """
     parts = layer_type.split(':')
     if parts[1] == 'default':
         kernel_size = args.decoder_kernel_size_list[index]
     else:
         kernel_size = int(parts[1])
     kind = parts[0]
     if kind == 'lightweight':
         return LightweightConv(
             out_dim, kernel_size, padding_l=kernel_size-1,
             weight_softmax=args.weight_softmax, num_heads=num_heads,
             weight_dropout=args.weight_dropout, with_linear=args.conv_linear,
         )
     if kind == 'dynamic':
         # Only the dynamic variant takes the extra GLU flag here.
         return DynamicConv(
             out_dim, kernel_size, padding_l=kernel_size-1,
             weight_softmax=args.weight_softmax, num_heads=num_heads,
             weight_dropout=args.weight_dropout, with_linear=args.conv_linear,
             glu=args.decoder_glu,
         )
     if kind == 'attn':
         return MultiheadAttention(
             embed_dim=out_dim,
             num_heads=num_heads,
             dropout=args.attention_dropout,
             add_bias_kv=add_bias_kv,
             add_zero_attn=add_zero_attn,
             self_attention=True,
         )
     raise NotImplementedError
Example #7
0
 def get_layer(self, args, index, out_dim, num_heads, layer_type,
               add_bias_kv, add_zero_attn):
     """Instantiate one decoder sublayer from a ``kind:size`` type string.

     The part after ``:`` is either ``default`` (use the per-layer size
     from ``args.decoder_kernel_size_list``) or an explicit kernel size.
     """
     parts = layer_type.split(':')
     if parts[1] == 'default':
         kernel_size = args.decoder_kernel_size_list[index]
     else:
         kernel_size = int(parts[1])
     kind = parts[0]
     # Both convolution flavours take identical constructor arguments.
     conv_classes = {'lightweight': LightweightConv, 'dynamic': DynamicConv}
     if kind in conv_classes:
         return conv_classes[kind](out_dim,
                                   kernel_size,
                                   padding_l=kernel_size - 1,
                                   weight_softmax=args.weight_softmax,
                                   num_heads=num_heads,
                                   weight_dropout=args.weight_dropout)
     if kind == 'attn':
         return MultiheadAttention(
             out_dim,
             num_heads,
             dropout=args.attention_dropout,
             add_bias_kv=add_bias_kv,
             add_zero_attn=add_zero_attn,
             # Plain self-attention unless cross-self-attention is enabled.
             self_attention=not getattr(args, "cross_self_attention",
                                        False),
             q_noise=self.quant_noise,
             qn_block_size=self.quant_noise_block_size,
         )
     raise NotImplementedError
Example #8
0
class LightConvDecoderLayer(nn.Module):
    """Decoder layer block.

    Applies, in order: a (lightweight or dynamic) convolution sublayer,
    an optional encoder-decoder attention sublayer, and a position-wise
    feed-forward sublayer.  Each sublayer has a residual connection, with
    LayerNorm applied before or after it depending on
    ``args.decoder_normalize_before``.

    Args:
        args (argparse.Namespace): parsed command-line arguments
        no_encoder_attn (bool, optional): whether to attend to encoder outputs.
            Default: ``False``
        kernel_size: kernel size of the convolution
    """

    def __init__(self, args, no_encoder_attn=False, kernel_size=0):
        super().__init__()
        self.embed_dim = args.decoder_embed_dim
        self.conv_dim = args.decoder_conv_dim
        # GLU halves its channel count, so the first projection is doubled
        # when gating is enabled.
        if args.decoder_glu:
            self.linear1 = Linear(self.embed_dim, 2 * self.conv_dim)
            self.act = nn.GLU()
        else:
            self.linear1 = Linear(self.embed_dim, self.conv_dim)
            self.act = None
        # padding_l = kernel_size - 1 pads only on the left side —
        # presumably to keep the convolution causal for decoding (confirm
        # against the conv implementations).
        if args.decoder_conv_type == "lightweight":
            self.conv = LightweightConv(
                self.conv_dim,
                kernel_size,
                padding_l=kernel_size - 1,
                weight_softmax=args.weight_softmax,
                num_heads=args.decoder_attention_heads,
                weight_dropout=args.weight_dropout,
            )
        elif args.decoder_conv_type == "dynamic":
            self.conv = DynamicConv(
                self.conv_dim,
                kernel_size,
                padding_l=kernel_size - 1,
                weight_softmax=args.weight_softmax,
                num_heads=args.decoder_attention_heads,
                weight_dropout=args.weight_dropout,
            )
        else:
            raise NotImplementedError
        self.linear2 = Linear(self.conv_dim, self.embed_dim)

        self.dropout_module = FairseqDropout(
            args.dropout, module_name=self.__class__.__name__
        )
        self.relu_dropout_module = FairseqDropout(
            args.relu_dropout, module_name=self.__class__.__name__
        )
        self.input_dropout_module = FairseqDropout(
            args.input_dropout, module_name=self.__class__.__name__
        )
        self.normalize_before = args.decoder_normalize_before

        self.conv_layer_norm = LayerNorm(self.embed_dim)

        if no_encoder_attn:
            self.encoder_attn = None
            self.encoder_attn_layer_norm = None
        else:
            self.encoder_attn = MultiheadAttention(
                self.embed_dim,
                args.decoder_attention_heads,
                dropout=args.attention_dropout,
                encoder_decoder_attention=True,
            )
            self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)

        self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
        self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)

        self.final_layer_norm = LayerNorm(self.embed_dim)
        self.need_attn = True

    def forward(
        self,
        x,
        encoder_out,
        encoder_padding_mask,
        incremental_state,
        prev_conv_state=None,
        prev_attn_state=None,
        conv_mask=None,
        conv_padding_mask=None,
    ):
        """
        Args:
            x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
            encoder_padding_mask (ByteTensor): binary ByteTensor of shape
                `(batch, src_len)` where padding elements are indicated by ``1``.
            prev_conv_state: cached conv input buffer to restore before the
                conv sublayer runs (incremental decoding).
            prev_attn_state: cached ``(prev_key, prev_value)`` pair to restore
                into the encoder attention before it runs.

        Note: ``conv_mask`` and ``conv_padding_mask`` are accepted but unused.

        Returns:
            tuple ``(x, attn)``: the transformed output and the encoder
            attention weights (``None`` when there is no encoder attention
            or weights were not requested).
        """
        # --- convolution sublayer (residual + maybe-norm) ---
        residual = x
        x = self.maybe_layer_norm(self.conv_layer_norm, x, before=True)
        if prev_conv_state is not None:
            # Restore the cached conv input buffer for incremental decoding.
            if incremental_state is None:
                incremental_state = {}
            self.conv._set_input_buffer(incremental_state, prev_conv_state)
        x = self.input_dropout_module(x)
        x = self.linear1(x)
        if self.act is not None:
            x = self.act(x)
        x = self.conv(x, incremental_state=incremental_state)
        x = self.linear2(x)
        x = self.dropout_module(x)
        x = residual + x
        x = self.maybe_layer_norm(self.conv_layer_norm, x, after=True)

        # --- encoder-decoder attention sublayer (skipped if disabled) ---
        attn = None
        if self.encoder_attn is not None:
            residual = x
            x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, before=True)
            if prev_attn_state is not None:
                # Restore cached attention key/value for incremental decoding.
                if incremental_state is None:
                    incremental_state = {}
                prev_key, prev_value = prev_attn_state
                saved_state = {"prev_key": prev_key, "prev_value": prev_value}
                self.encoder_attn._set_input_buffer(incremental_state, saved_state)
            x, attn = self.encoder_attn(
                query=x,
                key=encoder_out,
                value=encoder_out,
                key_padding_mask=encoder_padding_mask,
                incremental_state=incremental_state,
                static_kv=True,
                # Only compute weights at inference time, when requested.
                need_weights=(not self.training and self.need_attn),
            )
            x = self.dropout_module(x)
            x = residual + x
            x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, after=True)

        # --- position-wise feed-forward sublayer ---
        residual = x
        x = self.maybe_layer_norm(self.final_layer_norm, x, before=True)
        x = F.relu(self.fc1(x))
        x = self.relu_dropout_module(x)
        x = self.fc2(x)
        x = self.dropout_module(x)
        x = residual + x
        x = self.maybe_layer_norm(self.final_layer_norm, x, after=True)
        return x, attn

    def maybe_layer_norm(self, layer_norm, x, before=False, after=False):
        """Apply ``layer_norm`` at exactly one of the two call sites per
        sublayer, depending on the pre-/post-norm configuration."""
        assert before ^ after
        # Normalize when (before and normalize_before) or
        # (after and not normalize_before); otherwise pass through.
        if after ^ self.normalize_before:
            return layer_norm(x)
        else:
            return x

    def make_generation_fast_(self, need_attn=False, **kwargs):
        """Toggle whether attention weights are computed during generation."""
        self.need_attn = need_attn

    def extra_repr(self):
        """Summarize dropout rates and norm placement for ``repr()``."""
        return (
            "dropout={}, relu_dropout={}, input_dropout={}, normalize_before={}".format(
                self.dropout_module.p,
                self.relu_dropout_module.p,
                self.input_dropout_module.p,
                self.normalize_before,
            )
        )
Example #9
0
class DynamicConvDecoderLayer(nn.Module):
    """Decoder layer built around a single DynamicConv sublayer.

    The input is projected to ``conv_dim`` (optionally gated by a GLU),
    convolved, projected back to ``embed_dim``, and added to the residual,
    with layer norm applied before or after per ``normalize_before``.
    There is no encoder attention in this layer.
    """

    def __init__(self,
                 embed_dim,
                 conv_dim,
                 num_heads,
                 kernel_size,
                 weight_dropout=0.1,
                 dropout=0.3,
                 input_dropout=0.0,
                 weight_softmax=True,
                 encoder_glu=False,
                 normalize_before=False):
        super().__init__()
        self.embed_dim = embed_dim
        self.conv_dim = conv_dim

        # The GLU gate halves its input channels, so widen the projection.
        in_proj_dim = 2 * conv_dim if encoder_glu else conv_dim
        self.linear1 = Linear(embed_dim, in_proj_dim)
        self.act = nn.GLU() if encoder_glu else None

        self.conv = DynamicConv(conv_dim,
                                kernel_size,
                                padding_l=kernel_size - 1,
                                weight_softmax=weight_softmax,
                                num_heads=num_heads,
                                weight_dropout=weight_dropout)
        self.linear2 = Linear(conv_dim, embed_dim)

        self.dropout = dropout
        self.input_dropout = input_dropout
        self.normalize_before = normalize_before
        self.conv_layer_norm = LayerNorm(embed_dim)

    def forward(self,
                x,
                incremental_state=None,
                prev_conv_state=None,
                **unused):
        """Run the convolution sublayer with a residual connection.

        Returns a ``(output, None)`` pair; the ``None`` slot mirrors the
        attention weights returned by attention-based decoder layers.
        """
        shortcut = x
        x = self.maybe_layer_norm(x, before=True)
        if prev_conv_state is not None:
            # Seed the conv's input buffer for incremental decoding.
            if incremental_state is None:
                incremental_state = {}
            self.conv._set_input_buffer(incremental_state, prev_conv_state)
        x = F.dropout(x, p=self.input_dropout, training=self.training)
        x = self.linear1(x)
        if self.act is not None:
            x = self.act(x)
        x = self.conv(x, incremental_state=incremental_state)
        x = self.linear2(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = shortcut + x
        x = self.maybe_layer_norm(x, after=True)

        return x, None

    def maybe_layer_norm(self, x, before=False, after=False):
        """Apply conv_layer_norm at exactly one of the two call sites,
        depending on the pre-/post-norm configuration."""
        assert before ^ after
        normalize_now = self.normalize_before if before else not self.normalize_before
        return self.conv_layer_norm(x) if normalize_now else x

    def extra_repr(self):
        """Summarize dropout rates and norm placement for ``repr()``."""
        return 'dropout={}, input_dropout={}, normalize_before={}'.format(
            self.dropout, self.input_dropout, self.normalize_before)