Example #1
    def __init__(
        self,
        attention_dropout,
        decoder_attention_heads,
        self_attention_heads,
        decoder_conv_dim,
        # ARBABU: need to remove these two type parameters
        decoder_conv_type,
        attention_type,
        self_attention_type,
        decoder_embed_dim,
        decoder_ffn_embed_dim,
        decoder_glu,
        decoder_normalize_before,
        dropout,
        input_dropout,
        relu_dropout,
        need_attention,
        convolution_type,
        conv=None,
        self_attention=None,
        attention=None,
    ):
        super().__init__()
        self.embed_dim = decoder_embed_dim
        self.conv_dim = decoder_conv_dim
        if decoder_glu:
            self.linear1 = Linear(self.embed_dim, 2 * self.conv_dim)
            self.act = nn.GLU()
        else:
            self.linear1 = Linear(self.embed_dim, self.conv_dim)
            self.act = PlaceholderIdentity()
        self.conv = conv
        self.linear2 = Linear(self.conv_dim, self.embed_dim)

        self.dropout = dropout
        self.relu_dropout = relu_dropout
        self.input_dropout = input_dropout
        self.normalize_before = decoder_normalize_before
        self.conv_layer_norm = LayerNorm(self.embed_dim)

        if attention is None:
            self.no_encoder_attn = True
            self.encoder_attn = PlaceholderAttentionIdentity()
            self.encoder_attn_layer_norm = PlaceholderIdentity()
        else:
            self.no_encoder_attn = False
            self.encoder_attn = attention
            self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)
        if self_attention is None:
            self.has_self_attn = False
            self.self_attn = PlaceholderAttentionIdentity()
        else:
            self.has_self_attn = True
            self.self_attn = self_attention
        self.fc1 = Linear(self.embed_dim, decoder_ffn_embed_dim)
        self.fc2 = Linear(decoder_ffn_embed_dim, self.embed_dim)

        self.final_layer_norm = LayerNorm(self.embed_dim)
        self.need_attn = need_attention
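
A minimal, standalone sketch of why the GLU branch above projects to `2 * self.conv_dim`: `nn.GLU` gates one half of the last dimension with the sigmoid of the other half, so the output width comes back down to `conv_dim`, matching the non-GLU branch. This uses plain `torch.nn` modules rather than the pytext `Linear` wrapper from the example.

import torch
import torch.nn as nn

embed_dim, conv_dim = 128, 128
x = torch.randn(10, 4, embed_dim)  # (seq_len, batch, embed_dim)

# decoder_glu=True path: project to twice the conv width, then gate with GLU
glu_path = nn.Sequential(nn.Linear(embed_dim, 2 * conv_dim), nn.GLU(dim=-1))
# decoder_glu=False path: plain projection, identity activation
plain_path = nn.Linear(embed_dim, conv_dim)

assert glu_path(x).shape == plain_path(x).shape == (10, 4, conv_dim)
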
Example #2
    def __init__(
        self,
        dropout,
        encoder_conv_dim,
        encoder_conv_type,
        self_attention_type,
        encoder_embed_dim,
        encoder_ffn_embed_dim,
        self_attention_heads,
        encoder_glu,
        encoder_normalize_before,
        input_dropout,
        relu_dropout,
        convolution_type,
        conv=None,
        self_attention=None,
    ):
        super().__init__()
        self.embed_dim = encoder_embed_dim
        self.conv_dim = encoder_conv_dim

        if encoder_glu:
            self.linear1 = Linear(self.embed_dim, 2 * self.conv_dim)
            self.act = nn.GLU()
        else:
            self.linear1 = Linear(self.embed_dim, self.conv_dim)
            self.act = None
        self.conv = conv
        self.linear2 = Linear(self.conv_dim, self.embed_dim)
        self.dropout = dropout
        self.relu_dropout = relu_dropout
        self.input_dropout = input_dropout
        self.normalize_before = encoder_normalize_before
        self.fc1 = Linear(self.embed_dim, encoder_ffn_embed_dim)
        self.fc2 = Linear(encoder_ffn_embed_dim, self.embed_dim)

        self.layer_norm1 = LayerNorm(self.embed_dim)
        self.layer_norm2 = LayerNorm(self.embed_dim)
        if self_attention is None:
            self.has_self_attn = False
            self.self_attn = PlaceholderAttentionIdentity()
        else:
            self.has_self_attn = True
            self.self_attn = self_attention
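
The encoder layer follows the same injection pattern as the decoder layer in Example #1: optional submodules (`conv`, `self_attention`) arrive through the constructor, and a missing self-attention is replaced by a placeholder so the attribute always holds an `nn.Module`. Below is a rough sketch of that pattern, with a hypothetical `IdentityAttention` standing in for pytext's `PlaceholderAttentionIdentity` (an assumption for illustration, not the real implementation).

import torch
import torch.nn as nn

class IdentityAttention(nn.Module):
    """Hypothetical stand-in: returns the input unchanged and no weights."""
    def forward(self, query, key=None, value=None, **kwargs):
        return query, None

self_attention = None  # e.g. self_attention_type was left unset in the config
self_attn = IdentityAttention() if self_attention is None else self_attention
has_self_attn = self_attention is not None

x = torch.randn(10, 4, 128)
out, _ = self_attn(x, key=x, value=x)
assert torch.equal(out, x)  # the placeholder leaves the input untouched
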
Example #3
class LightConvDecoderLayer(PyTextSeq2SeqModule):
    class Config(ConfigBase):
        attention_dropout: float = 0.0
        decoder_attention_heads: int = 1
        self_attention_heads: int = 1
        decoder_conv_dim: int = 128
        decoder_conv_type: Union[
            LightweightConv.Config, PlaceholderIdentity.Config
        ] = LightweightConv.Config()
        attention_type: Optional[
            MultiheadAttention.Config
        ] = MultiheadAttention.Config()
        self_attention_type: Optional[MultiheadAttention.Config] = None
        decoder_embed_dim: int = 128
        decoder_ffn_embed_dim: int = 512
        decoder_glu: bool = True
        decoder_normalize_before: bool = False
        dropout: float = 0.1
        input_dropout: float = 0.1
        relu_dropout: float = 0.0
        need_attention: bool = True
        convolution_type: str = "causal"

    @classmethod
    def from_config(cls, config, kernel_size):
        conv = create_module(
            config.decoder_conv_type,
            input_size=config.decoder_conv_dim,
            kernel_size=kernel_size,
            convolution_type=config.convolution_type,
        )
        if config.attention_type is not None:
            attention = create_module(
                config.attention_type,
                config.decoder_embed_dim,
                config.decoder_attention_heads,
            )
        else:
            attention = None
        if config.self_attention_type is not None:
            self_attention = create_module(
                config.self_attention_type,
                config.decoder_embed_dim,
                config.self_attention_heads,
            )
        else:
            self_attention = None
        return cls(
            **config._asdict(),
            conv=conv,
            self_attention=self_attention,
            attention=attention
        )

    def __init__(
        self,
        attention_dropout,
        decoder_attention_heads,
        self_attention_heads,
        decoder_conv_dim,
        # ARBABU: need to remove these two type parameters
        decoder_conv_type,
        attention_type,
        self_attention_type,
        decoder_embed_dim,
        decoder_ffn_embed_dim,
        decoder_glu,
        decoder_normalize_before,
        dropout,
        input_dropout,
        relu_dropout,
        need_attention,
        convolution_type,
        conv=None,
        self_attention=None,
        attention=None,
    ):
        super().__init__()
        self.embed_dim = decoder_embed_dim
        self.conv_dim = decoder_conv_dim
        if decoder_glu:
            self.linear1 = Linear(self.embed_dim, 2 * self.conv_dim)
            self.act = nn.GLU()
        else:
            self.linear1 = Linear(self.embed_dim, self.conv_dim)
            self.act = PlaceholderIdentity()
        self.conv = conv
        self.linear2 = Linear(self.conv_dim, self.embed_dim)

        self.dropout = dropout
        self.relu_dropout = relu_dropout
        self.input_dropout = input_dropout
        self.normalize_before = decoder_normalize_before
        self.conv_layer_norm = LayerNorm(self.embed_dim)

        if attention is None:
            self.no_encoder_attn = True
            self.encoder_attn = PlaceholderAttentionIdentity()
            self.encoder_attn_layer_norm = PlaceholderIdentity()
        else:
            self.no_encoder_attn = False
            self.encoder_attn = attention
            self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)
        if self_attention is None:
            self.has_self_attn = False
            self.self_attn = PlaceholderAttentionIdentity()
        else:
            self.has_self_attn = True
            self.self_attn = self_attention
        self.fc1 = Linear(self.embed_dim, decoder_ffn_embed_dim)
        self.fc2 = Linear(decoder_ffn_embed_dim, self.embed_dim)

        self.final_layer_norm = LayerNorm(self.embed_dim)
        self.need_attn = need_attention

    def forward(
        self,
        x,
        encoder_out: Tensor,
        encoder_padding_mask: Optional[Tensor],
        decoder_padding_mask: Optional[Tensor],
        incremental_state: Optional[Dict[str, Tensor]],
    ):
        """
        Args:
            x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
            encoder_padding_mask (ByteTensor): binary ByteTensor of shape
                `(batch, src_len)` where padding elements are indicated by ``1``.

        Returns:
            encoded output of shape `(batch, src_len, embed_dim)`
        """
        residual = x
        normalize = self.maybe_layer_norm(before=True)
        if normalize:
            x = self.conv_layer_norm(x)
        if self.has_self_attn:
            x, _ = self.self_attn(
                x,
                key=x,
                value=x,
                key_padding_mask=decoder_padding_mask,
                need_weights=False,
                incremental_state=incremental_state,
            )
            x = residual + x
            residual = x
        x = F.dropout(x, p=self.input_dropout, training=self.training)
        x = self.linear1(x)
        x = self.act(x)
        if decoder_padding_mask is not None:
            x = x.masked_fill(decoder_padding_mask.transpose(0, 1).unsqueeze(2), 0)
        x = self.conv(x, incremental_state=incremental_state)
        x = self.linear2(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        normalize = self.maybe_layer_norm(after=True)
        if normalize:
            x = self.conv_layer_norm(x)

        attn: Optional[Tensor] = None
        if not self.no_encoder_attn:
            residual = x
            normalize = self.maybe_layer_norm(before=True)
            if normalize:
                x = self.encoder_attn_layer_norm(x)
            x, attn = self.encoder_attn(
                query=x,
                key=encoder_out,
                value=encoder_out,
                key_padding_mask=encoder_padding_mask,
                incremental_state=incremental_state,
                need_weights=(not self.training and self.need_attn),
            )
            x = F.dropout(x, p=self.dropout, training=self.training)
            x = residual + x
            normalize = self.maybe_layer_norm(after=True)
            if normalize:
                x = self.encoder_attn_layer_norm(x)

        residual = x
        normalize = self.maybe_layer_norm(before=True)
        if normalize:
            x = self.final_layer_norm(x)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, p=self.relu_dropout, training=self.training)
        x = self.fc2(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        normalize = self.maybe_layer_norm(after=True)
        if normalize:
            x = self.final_layer_norm(x)
        return x, attn

    def maybe_layer_norm(self, before: bool = False, after: bool = False):
        """This a utility function which helps to control the layer norm behavior
        `before` and `after` specific components using one variable in config.
        If self.normalize_before is set to True, output is true only when `before`
        is True
        """
        assert before ^ after, "Incorrect usage"
        return after ^ self.normalize_before
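
    # Resulting behavior of `after ^ self.normalize_before`:
    #   normalize_before=True  (pre-norm):  normalize before each block, not after
    #   normalize_before=False (post-norm): normalize after each block, not before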

    def reorder_incremental_state(
        self, incremental_state: Dict[str, Tensor], new_order: Tensor
    ):
        self.self_attn.reorder_incremental_state(incremental_state, new_order)
        self.encoder_attn.reorder_incremental_state(incremental_state, new_order)
        self.conv.reorder_incremental_state(incremental_state, new_order)

    def extra_repr(self):
        return (
            "dropout={}, relu_dropout={}, input_dropout={}, normalize_before={}".format(
                self.dropout,
                self.relu_dropout,
                self.input_dropout,
                self.normalize_before,
            )
        )
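
For completeness, a hedged usage sketch of Example #3's entry point. It assumes the class definition above and its pytext dependencies (`LightweightConv`, `MultiheadAttention`, `create_module`, the placeholder modules) are already in scope; exact import paths vary across pytext versions, so they are not shown here.

import torch

# Build the layer from its default Config (decoder_embed_dim=128, decoder_glu=True, ...)
config = LightConvDecoderLayer.Config()
layer = LightConvDecoderLayer.from_config(config, kernel_size=3)

seq_len, src_len, batch = 7, 11, 2
x = torch.randn(seq_len, batch, config.decoder_embed_dim)
encoder_out = torch.randn(src_len, batch, config.decoder_embed_dim)

out, attn = layer(
    x,
    encoder_out=encoder_out,
    encoder_padding_mask=None,   # no source padding in this toy batch
    decoder_padding_mask=None,   # no target padding either
    incremental_state=None,      # full-sequence (non-incremental) decoding
)
assert out.shape == x.shape      # the layer preserves (seq_len, batch, embed_dim)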