def __init__(
     self,
     size,
     self_attn,
     src_attn,
     feed_forward,
     dropout_rate,
     normalize_before=True,
     concat_after=False,
 ):
     """Build a decoder layer from already-constructed sub-modules.

     Args:
         size: Feature dimension given to the three ``LayerNorm`` modules
             and to the optional concat projections.
         self_attn: Module stored as ``self.self_attn``.
         src_attn: Module stored as ``self.src_attn``.
         feed_forward: Module stored as ``self.feed_forward``.
         dropout_rate: Probability handed to ``nn.Dropout``.
         normalize_before: Flag stored as ``self.normalize_before``; it is
             only recorded here.
         concat_after: When true, two ``nn.Linear(2 * size, size)``
             projections are created as ``concat_linear1`` and
             ``concat_linear2``.
     """
     super(DecoderLayer, self).__init__()

     # Plain (non-module) configuration values.
     self.size = size
     self.normalize_before = normalize_before
     self.concat_after = concat_after

     # Sub-modules, in a fixed order: attention blocks, feed-forward,
     # the three layer norms, dropout, then the optional projections.
     self.self_attn = self_attn
     self.src_attn = src_attn
     self.feed_forward = feed_forward
     for norm_name in ("norm1", "norm2", "norm3"):
         setattr(self, norm_name, LayerNorm(size))
     self.dropout = nn.Dropout(dropout_rate)

     if concat_after:
         doubled = 2 * size
         self.concat_linear1 = nn.Linear(doubled, size)
         self.concat_linear2 = nn.Linear(doubled, size)
Example #2
0
    def __init__(self, size, self_attn, feed_forward, dropout_rate):
        """Assemble a decoder layer from ready-made sub-modules.

        Args:
            size: Feature dimension given to both ``LayerNorm`` modules and
                stored as ``self.size``.
            self_attn: Module stored as ``self.self_attn``.
            feed_forward: Module stored as ``self.feed_forward``.
            dropout_rate: Probability handed to ``nn.Dropout``.
        """
        super().__init__()

        self.size = size

        # Sub-modules, in a fixed order: attention, feed-forward, norms,
        # dropout.
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        for norm_name in ("norm1", "norm2"):
            setattr(self, norm_name, LayerNorm(size))
        self.dropout = nn.Dropout(dropout_rate)
    def __init__(
        self,
        idim,
        enc_arch,
        input_layer="linear",
        repeat_block=0,
        self_attn_type="selfattn",
        positional_encoding_type="abs_pos",
        positionwise_layer_type="linear",
        positionwise_activation_type="relu",
        conv_mod_activation_type="relu",
        normalize_before=True,
        padding_idx=-1,
    ):
        """Construct a Transformer encoder object.

        All architecture options are forwarded to ``build_blocks``, which
        returns the input layer, the encoder blocks and the output size.

        Args:
            idim: Input dimension, forwarded to ``build_blocks``.
            enc_arch: Encoder architecture description, forwarded to
                ``build_blocks``.
            input_layer: Input layer type, forwarded to ``build_blocks``.
            repeat_block: Forwarded to ``build_blocks``.
            self_attn_type: Forwarded to ``build_blocks``.
            positional_encoding_type: Forwarded to ``build_blocks``.
            positionwise_layer_type: Forwarded to ``build_blocks``.
            positionwise_activation_type: Forwarded to ``build_blocks``.
            conv_mod_activation_type: Forwarded to ``build_blocks``.
            normalize_before: When true, a final ``LayerNorm`` over the
                encoder output size is created as ``self.after_norm``.
            padding_idx: Forwarded to ``build_blocks``.
        """
        super().__init__()

        block_options = dict(
            repeat_block=repeat_block,
            self_attn_type=self_attn_type,
            positional_encoding_type=positional_encoding_type,
            positionwise_layer_type=positionwise_layer_type,
            positionwise_activation_type=positionwise_activation_type,
            conv_mod_activation_type=conv_mod_activation_type,
            padding_idx=padding_idx,
        )
        embed, encoders, enc_out = build_blocks(
            "encoder", idim, input_layer, enc_arch, **block_options
        )
        self.embed = embed
        self.encoders = encoders
        self.enc_out = enc_out

        self.normalize_before = normalize_before

        # The final norm only exists in the pre-norm configuration.
        if normalize_before:
            self.after_norm = LayerNorm(enc_out)
Example #4
0
    def __init__(
        self,
        odim,
        edim,
        jdim,
        dec_arch,
        input_layer="embed",
        repeat_block=0,
        joint_activation_type="tanh",
        positional_encoding_type="abs_pos",
        positionwise_layer_type="linear",
        positionwise_activation_type="relu",
        dropout_rate_embed=0.0,
        blank=0,
    ):
        """Construct a Decoder object for transformer-transducer models.

        Args:
            odim: Output dimension; forwarded to ``build_blocks`` and
                ``JointNetwork`` and stored as ``self.odim``.
            edim: Forwarded to ``JointNetwork`` (second argument).
            jdim: Forwarded to ``JointNetwork`` (fourth argument).
            dec_arch: Decoder architecture description, forwarded to
                ``build_blocks``.
            input_layer: Input layer type, forwarded to ``build_blocks``.
            repeat_block: Forwarded to ``build_blocks``.
            joint_activation_type: Forwarded to ``JointNetwork``.
            positional_encoding_type: Forwarded to ``build_blocks``.
            positionwise_layer_type: Forwarded to ``build_blocks``.
            positionwise_activation_type: Forwarded to ``build_blocks``.
            dropout_rate_embed: Forwarded to ``build_blocks``.
            blank: Stored as ``self.blank`` and also passed to
                ``build_blocks`` as ``padding_idx``.
        """
        torch.nn.Module.__init__(self)

        block_options = dict(
            repeat_block=repeat_block,
            positional_encoding_type=positional_encoding_type,
            positionwise_layer_type=positionwise_layer_type,
            positionwise_activation_type=positionwise_activation_type,
            dropout_rate_embed=dropout_rate_embed,
            padding_idx=blank,
        )
        embed, decoders, ddim = build_blocks(
            "decoder", odim, input_layer, dec_arch, **block_options
        )

        # Sub-modules, in a fixed order: input layer, decoder blocks,
        # final norm, joint network.
        self.embed = embed
        self.decoders = decoders
        self.after_norm = LayerNorm(ddim)
        self.joint_network = JointNetwork(
            odim, edim, ddim, jdim, joint_activation_type
        )

        # Plain bookkeeping attributes.
        self.dunits = ddim
        self.odim = odim
        self.blank = blank
 def __init__(
     self,
     odim,
     selfattention_layer_type="selfattn",
     attention_dim=256,
     attention_heads=4,
     conv_wshare=4,
     conv_kernel_length=11,
     conv_usebias=False,
     linear_units=2048,
     num_blocks=6,
     dropout_rate=0.1,
     positional_dropout_rate=0.1,
     self_attention_dropout_rate=0.0,
     src_attention_dropout_rate=0.0,
     input_layer="embed",
     use_output_layer=True,
     pos_enc_class=PositionalEncoding,
     normalize_before=True,
     concat_after=False,
 ):
     """Construct a Decoder object.

     Args:
         odim: Output dimension; size of the input ``Embedding``/``Linear``
             and, when ``use_output_layer`` is true, of the output
             projection.
         selfattention_layer_type: One of ``"selfattn"``, ``"lightconv"``,
             ``"lightconv2d"``, ``"dynamicconv"`` or ``"dynamicconv2d"``.
         attention_dim: Model dimension used by every sub-module.
         attention_heads: Head count for ``MultiHeadedAttention``.
         conv_wshare: First argument of the convolution self-attention
             modules.
         conv_kernel_length: Kernel length argument of the convolution
             self-attention modules.
         conv_usebias: Passed as ``use_bias`` to the convolution modules.
         linear_units: Hidden size of ``PositionwiseFeedForward``.
         num_blocks: Number of decoder layers built by ``repeat``.
         dropout_rate: Dropout rate for the layers and feed-forward blocks.
         positional_dropout_rate: Dropout rate given to ``pos_enc_class``.
         self_attention_dropout_rate: Dropout rate of the self-attention
             (or convolution) module.
         src_attention_dropout_rate: Dropout rate of the source attention.
         input_layer: ``"embed"``, ``"linear"`` or a ``torch.nn.Module``.
         use_output_layer: When false, ``self.output_layer`` is ``None``.
         pos_enc_class: Positional-encoding class, called as
             ``pos_enc_class(attention_dim, positional_dropout_rate)``.
         normalize_before: Passed to every ``DecoderLayer``; when true a
             final ``LayerNorm`` is created as ``self.after_norm``.
         concat_after: Passed to every ``DecoderLayer``.

     Raises:
         NotImplementedError: If ``input_layer`` is not supported.
         ValueError: If ``selfattention_layer_type`` is not supported.
     """
     torch.nn.Module.__init__(self)
     self._register_load_state_dict_pre_hook(_pre_hook)
     if input_layer == "embed":
         self.embed = torch.nn.Sequential(
             torch.nn.Embedding(odim, attention_dim),
             pos_enc_class(attention_dim, positional_dropout_rate),
         )
     elif input_layer == "linear":
         self.embed = torch.nn.Sequential(
             torch.nn.Linear(odim, attention_dim),
             torch.nn.LayerNorm(attention_dim),
             torch.nn.Dropout(dropout_rate),
             torch.nn.ReLU(),
             pos_enc_class(attention_dim, positional_dropout_rate),
         )
     elif isinstance(input_layer, torch.nn.Module):
         self.embed = torch.nn.Sequential(
             input_layer, pos_enc_class(attention_dim, positional_dropout_rate)
         )
     else:
         raise NotImplementedError(
             "only `embed`, `linear` or torch.nn.Module is supported."
         )
     self.normalize_before = normalize_before

     # Convolutional self-attention variants share one constructor
     # signature, so only the class and the log line differ per type.
     conv_self_attn = {
         "lightconv": (
             LightweightConvolution,
             "decoder self-attention layer type = lightweight convolution",
         ),
         "lightconv2d": (
             LightweightConvolution2D,
             "decoder self-attention layer "
             "type = lightweight convolution 2-dimentional",
         ),
         "dynamicconv": (
             DynamicConvolution,
             "decoder self-attention layer type = dynamic convolution",
         ),
         "dynamicconv2d": (
             DynamicConvolution2D,
             "decoder self-attention layer type = dynamic convolution 2-dimentional",
         ),
     }
     if selfattention_layer_type == "selfattn":
         logging.info("decoder self-attention layer type = self-attention")

         def build_self_attn(lnum):
             return MultiHeadedAttention(
                 attention_heads, attention_dim, self_attention_dropout_rate
             )

     elif selfattention_layer_type in conv_self_attn:
         conv_class, log_message = conv_self_attn[selfattention_layer_type]
         logging.info(log_message)

         def build_self_attn(lnum):
             return conv_class(
                 conv_wshare,
                 attention_dim,
                 self_attention_dropout_rate,
                 conv_kernel_length,
                 lnum,
                 use_kernel_mask=True,
                 use_bias=conv_usebias,
             )

     else:
         # Fail here instead of leaving `self.decoders` undefined.
         raise ValueError(
             "unknown decoder_attn_layer: " + str(selfattention_layer_type)
         )

     # Fresh sub-modules are created for every layer, in the order
     # self-attention, source attention, feed-forward.
     self.decoders = repeat(
         num_blocks,
         lambda lnum: DecoderLayer(
             attention_dim,
             build_self_attn(lnum),
             MultiHeadedAttention(
                 attention_heads, attention_dim, src_attention_dropout_rate
             ),
             PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
             dropout_rate,
             normalize_before,
             concat_after,
         ),
     )
     self.selfattention_layer_type = selfattention_layer_type
     if self.normalize_before:
         self.after_norm = LayerNorm(attention_dim)
     if use_output_layer:
         self.output_layer = torch.nn.Linear(attention_dim, odim)
     else:
         self.output_layer = None
Example #6
0
    def __init__(
        self,
        idim,
        attention_dim=256,
        attention_heads=4,
        linear_units=2048,
        num_blocks=6,
        dropout_rate=0.1,
        positional_dropout_rate=0.1,
        attention_dropout_rate=0.0,
        input_layer="conv2d",
        normalize_before=True,
        concat_after=False,
        positionwise_layer_type="linear",
        positionwise_conv_kernel_size=1,
        macaron_style=False,
        pos_enc_layer_type="abs_pos",
        selfattention_layer_type="selfattn",
        activation_type="swish",
        use_cnn_module=False,
        cnn_module_kernel=31,
        padding_idx=-1,
    ):
        """Construct an Encoder object.

        Args:
            idim: Input dimension given to the input layer.
            attention_dim: Model dimension used by every sub-module.
            attention_heads: Head count for the self-attention module.
            linear_units: Hidden size of the position-wise layer.
            num_blocks: Number of encoder layers built by ``repeat``.
            dropout_rate: Dropout rate for the layers and input layer.
            positional_dropout_rate: Dropout rate given to the positional
                encoding class.
            attention_dropout_rate: Dropout rate of the self-attention.
            input_layer: ``"linear"``, ``"conv2d"``, ``"vgg2l"``,
                ``"embed"``, a ``torch.nn.Module`` or ``None``.
            normalize_before: Passed to every ``EncoderLayer``; when true a
                final ``LayerNorm`` is created as ``self.after_norm``.
            concat_after: Passed to every ``EncoderLayer``.
            positionwise_layer_type: ``"linear"``, ``"conv1d"`` or
                ``"conv1d-linear"``.
            positionwise_conv_kernel_size: Kernel size for the conv1d
                position-wise variants.
            macaron_style: When true, each layer gets a second position-wise
                module; otherwise ``None`` is passed in its place.
            pos_enc_layer_type: ``"abs_pos"``, ``"scaled_abs_pos"`` or
                ``"rel_pos"`` (the latter requires ``"rel_selfattn"``).
            selfattention_layer_type: ``"selfattn"`` or ``"rel_selfattn"``
                (the latter requires ``"rel_pos"``).
            activation_type: Name resolved through ``get_activation``.
            use_cnn_module: When true, each layer gets a
                ``ConvolutionModule``; otherwise ``None`` is passed.
            cnn_module_kernel: Kernel size given to ``ConvolutionModule``.
            padding_idx: ``padding_idx`` of the ``Embedding`` used when
                ``input_layer == "embed"``.

        Raises:
            ValueError: On an unknown ``pos_enc_layer_type``,
                ``input_layer`` or ``selfattention_layer_type``.
            NotImplementedError: On an unknown ``positionwise_layer_type``.
            AssertionError: If ``"rel_pos"`` and ``"rel_selfattn"`` are not
                used together.
        """
        super(Encoder, self).__init__()

        activation = get_activation(activation_type)
        if pos_enc_layer_type == "abs_pos":
            pos_enc_class = PositionalEncoding
        elif pos_enc_layer_type == "scaled_abs_pos":
            pos_enc_class = ScaledPositionalEncoding
        elif pos_enc_layer_type == "rel_pos":
            assert selfattention_layer_type == "rel_selfattn", (
                "pos_enc_layer_type 'rel_pos' requires "
                "selfattention_layer_type 'rel_selfattn'"
            )
            pos_enc_class = RelPositionalEncoding
        else:
            # str() keeps the intended ValueError for non-string values
            # instead of a TypeError from the concatenation.
            raise ValueError("unknown pos_enc_layer: " + str(pos_enc_layer_type))

        if input_layer == "linear":
            self.embed = torch.nn.Sequential(
                torch.nn.Linear(idim, attention_dim),
                torch.nn.LayerNorm(attention_dim),
                torch.nn.Dropout(dropout_rate),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif input_layer == "conv2d":
            self.embed = Conv2dSubsampling(
                idim,
                attention_dim,
                dropout_rate,
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif input_layer == "vgg2l":
            self.embed = VGG2L(idim, attention_dim)
        elif input_layer == "embed":
            self.embed = torch.nn.Sequential(
                torch.nn.Embedding(idim,
                                   attention_dim,
                                   padding_idx=padding_idx),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif isinstance(input_layer, torch.nn.Module):
            self.embed = torch.nn.Sequential(
                input_layer,
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif input_layer is None:
            self.embed = torch.nn.Sequential(
                pos_enc_class(attention_dim, positional_dropout_rate))
        else:
            raise ValueError("unknown input_layer: " + str(input_layer))
        self.normalize_before = normalize_before

        # self-attention module definition
        if selfattention_layer_type == "selfattn":
            logging.info("encoder self-attention layer type = self-attention")
            encoder_selfattn_layer = MultiHeadedAttention
            encoder_selfattn_layer_args = (
                attention_heads,
                attention_dim,
                attention_dropout_rate,
            )
        elif selfattention_layer_type == "rel_selfattn":
            assert pos_enc_layer_type == "rel_pos", (
                "selfattention_layer_type 'rel_selfattn' requires "
                "pos_enc_layer_type 'rel_pos'"
            )
            encoder_selfattn_layer = RelPositionMultiHeadedAttention
            encoder_selfattn_layer_args = (
                attention_heads,
                attention_dim,
                attention_dropout_rate,
            )
        else:
            raise ValueError("unknown encoder_attn_layer: " +
                             str(selfattention_layer_type))

        # feed-forward module definition
        if positionwise_layer_type == "linear":
            positionwise_layer = PositionwiseFeedForward
            positionwise_layer_args = (
                attention_dim,
                linear_units,
                dropout_rate,
                activation,
            )
        elif positionwise_layer_type == "conv1d":
            positionwise_layer = MultiLayeredConv1d
            positionwise_layer_args = (
                attention_dim,
                linear_units,
                positionwise_conv_kernel_size,
                dropout_rate,
            )
        elif positionwise_layer_type == "conv1d-linear":
            positionwise_layer = Conv1dLinear
            positionwise_layer_args = (
                attention_dim,
                linear_units,
                positionwise_conv_kernel_size,
                dropout_rate,
            )
        else:
            raise NotImplementedError(
                "Support only linear, conv1d or conv1d-linear.")

        # convolution module definition
        convolution_layer = ConvolutionModule
        convolution_layer_args = (attention_dim, cnn_module_kernel, activation)

        # Each layer gets freshly built sub-modules; the optional macaron
        # feed-forward and convolution slots are None when disabled.
        self.encoders = repeat(
            num_blocks,
            lambda lnum: EncoderLayer(
                attention_dim,
                encoder_selfattn_layer(*encoder_selfattn_layer_args),
                positionwise_layer(*positionwise_layer_args),
                positionwise_layer(*positionwise_layer_args)
                if macaron_style else None,
                convolution_layer(*convolution_layer_args)
                if use_cnn_module else None,
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)
Example #7
0
    def __init__(
        self,
        idim,
        selfattention_layer_type="selfattn",
        attention_dim=256,
        attention_heads=4,
        conv_wshare=4,
        conv_kernel_length=11,
        conv_usebias=False,
        linear_units=2048,
        num_blocks=6,
        dropout_rate=0.1,
        positional_dropout_rate=0.1,
        attention_dropout_rate=0.0,
        input_layer="conv2d",
        pos_enc_class=PositionalEncoding,
        normalize_before=True,
        concat_after=False,
        positionwise_layer_type="linear",
        positionwise_conv_kernel_size=1,
        padding_idx=-1,
    ):
        """Construct an Encoder object.

        Args:
            idim: Input dimension given to the input layer.
            selfattention_layer_type: One of ``"selfattn"``,
                ``"lightconv"``, ``"lightconv2d"``, ``"dynamicconv"`` or
                ``"dynamicconv2d"``.
            attention_dim: Model dimension used by every sub-module.
            attention_heads: Head count for ``MultiHeadedAttention``.
            conv_wshare: First argument of the convolution self-attention
                modules.
            conv_kernel_length: Kernel length argument of the convolution
                self-attention modules.
            conv_usebias: Passed as ``use_bias`` to the convolution modules.
            linear_units: Forwarded to ``self.get_positionwise_layer``.
            num_blocks: Number of encoder layers built by ``repeat``.
            dropout_rate: Dropout rate for the layers and input layer.
            positional_dropout_rate: Dropout rate given to ``pos_enc_class``.
            attention_dropout_rate: Dropout rate of the self-attention
                (or convolution) module.
            input_layer: ``"linear"``, ``"conv2d"``,
                ``"conv2d-scaled-pos-enc"``, ``"conv2d6"``, ``"conv2d8"``,
                ``"vgg2l"``, ``"embed"``, a ``torch.nn.Module`` or ``None``.
            pos_enc_class: Positional-encoding class, called as
                ``pos_enc_class(attention_dim, positional_dropout_rate)``.
            normalize_before: Passed to every ``EncoderLayer``; when true a
                final ``LayerNorm`` is created as ``self.after_norm``.
            concat_after: Passed to every ``EncoderLayer``.
            positionwise_layer_type: Forwarded to
                ``self.get_positionwise_layer``.
            positionwise_conv_kernel_size: Forwarded to
                ``self.get_positionwise_layer``.
            padding_idx: ``padding_idx`` of the ``Embedding`` used when
                ``input_layer == "embed"``.

        Raises:
            ValueError: On an unknown ``input_layer`` or
                ``selfattention_layer_type``.
        """
        super(Encoder, self).__init__()
        self._register_load_state_dict_pre_hook(_pre_hook)

        if input_layer == "linear":
            self.embed = torch.nn.Sequential(
                torch.nn.Linear(idim, attention_dim),
                torch.nn.LayerNorm(attention_dim),
                torch.nn.Dropout(dropout_rate),
                torch.nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif input_layer == "conv2d":
            self.embed = Conv2dSubsampling(idim, attention_dim, dropout_rate)
        elif input_layer == "conv2d-scaled-pos-enc":
            self.embed = Conv2dSubsampling(
                idim,
                attention_dim,
                dropout_rate,
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif input_layer == "conv2d6":
            self.embed = Conv2dSubsampling6(idim, attention_dim, dropout_rate)
        elif input_layer == "conv2d8":
            self.embed = Conv2dSubsampling8(idim, attention_dim, dropout_rate)
        elif input_layer == "vgg2l":
            self.embed = VGG2L(idim, attention_dim)
        elif input_layer == "embed":
            self.embed = torch.nn.Sequential(
                torch.nn.Embedding(idim,
                                   attention_dim,
                                   padding_idx=padding_idx),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif isinstance(input_layer, torch.nn.Module):
            self.embed = torch.nn.Sequential(
                input_layer,
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif input_layer is None:
            self.embed = torch.nn.Sequential(
                pos_enc_class(attention_dim, positional_dropout_rate))
        else:
            # str() keeps the intended ValueError for non-string values
            # instead of a TypeError from the concatenation.
            raise ValueError("unknown input_layer: " + str(input_layer))
        self.normalize_before = normalize_before
        positionwise_layer, positionwise_layer_args = self.get_positionwise_layer(
            positionwise_layer_type,
            attention_dim,
            linear_units,
            dropout_rate,
            positionwise_conv_kernel_size,
        )

        # Convolutional self-attention variants share one constructor
        # signature, so only the class and the log line differ per type.
        conv_self_attn = {
            "lightconv": (
                LightweightConvolution,
                "encoder self-attention layer type = lightweight convolution",
            ),
            "lightconv2d": (
                LightweightConvolution2D,
                "encoder self-attention layer "
                "type = lightweight convolution 2-dimentional",
            ),
            "dynamicconv": (
                DynamicConvolution,
                "encoder self-attention layer type = dynamic convolution",
            ),
            "dynamicconv2d": (
                DynamicConvolution2D,
                "encoder self-attention layer type = dynamic convolution 2-dimentional",
            ),
        }
        if selfattention_layer_type == "selfattn":
            logging.info("encoder self-attention layer type = self-attention")

            def build_self_attn(lnum):
                return MultiHeadedAttention(
                    attention_heads, attention_dim, attention_dropout_rate
                )

        elif selfattention_layer_type in conv_self_attn:
            conv_class, log_message = conv_self_attn[selfattention_layer_type]
            logging.info(log_message)

            def build_self_attn(lnum):
                return conv_class(
                    conv_wshare,
                    attention_dim,
                    attention_dropout_rate,
                    conv_kernel_length,
                    lnum,
                    use_bias=conv_usebias,
                )

        else:
            # Fail here instead of leaving `self.encoders` undefined.
            raise ValueError(
                "unknown encoder_attn_layer: " + str(selfattention_layer_type)
            )

        # Fresh sub-modules are created for every layer, self-attention
        # first and the position-wise module second.
        self.encoders = repeat(
            num_blocks,
            lambda lnum: EncoderLayer(
                attention_dim,
                build_self_attn(lnum),
                positionwise_layer(*positionwise_layer_args),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)