def __init__(self,
                 idim,
                 selfattention_layer_type="selfattn",
                 attention_dim=256,
                 attention_heads=4,
                 linear_units=2048,
                 num_blocks=6,
                 dropout_rate=0.1,
                 positional_dropout_rate=0.1,
                 attention_dropout_rate=0.0,
                 input_layer="conv2d",
                 pos_enc_class=PositionalEncoding,
                 normalize_before=True,
                 concat_after=False,
                 positionwise_layer_type="linear",
                 positionwise_conv_kernel_size=1):
        super(Encoder, self).__init__()

        self.conv_subsampling_factor = 1
        if input_layer == "conv2d":
            # assumes ESPnet's Conv2dSubsampling is imported; it subsamples the
            # time axis by 4 and applies positional encoding internally
            self.embed = Conv2dSubsampling(idim, attention_dim, dropout_rate)
            self.conv_subsampling_factor = 4
        elif isinstance(input_layer, torch.nn.Module):
            self.embed = torch.nn.Sequential(
                input_layer,
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        else:
            raise ValueError("unknown input_layer: " + str(input_layer))
        self.normalize_before = normalize_before
        positionwise_layer, positionwise_layer_args = self.get_positionwise_layer(
            positionwise_layer_type,
            attention_dim,
            linear_units,
            dropout_rate,
            positionwise_conv_kernel_size,
        )
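        # stack of num_blocks identical Transformer encoder layers:
        # multi-headed self-attention followed by the position-wise layer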
        self.encoders = repeat(
            num_blocks,
            lambda lnum: EncoderLayer(
                attention_dim,
                MultiHeadedAttention(attention_heads, attention_dim,
                                     attention_dropout_rate),
                positionwise_layer(*positionwise_layer_args),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)
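
A minimal construction sketch for the Encoder above (construction only; idim=83 and the Linear front-end are illustrative values, not taken from the snippet):

import torch

frontend = torch.nn.Linear(83, 256)  # hypothetical front-end projecting features to attention_dim
encoder = Encoder(
    idim=83,
    attention_dim=256,
    attention_heads=4,
    num_blocks=6,
    input_layer=frontend,  # a torch.nn.Module is wrapped together with positional encoding
)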
Example #2
 def __init__(
         self,
         odim,
         selfattention_layer_type="selfattn",
         attention_dim=256,
         attention_heads=4,
         conv_wshare=4,
         conv_kernel_length=11,
         conv_usebias=False,
         linear_units=2048,
         num_blocks=6,
         dropout_rate=0.1,
         positional_dropout_rate=0.1,
         self_attention_dropout_rate=0.0,
         src_attention_dropout_rate=0.0,
         input_layer="embed",
         use_output_layer=True,
         pos_enc_class=PositionalEncoding,
         normalize_before=True,
         concat_after=False,
 ):
     """Construct an Decoder object."""
     torch.nn.Module.__init__(self)
     if input_layer == "embed":
         self.embed = torch.nn.Sequential(
             torch.nn.Embedding(odim, attention_dim),
             pos_enc_class(attention_dim, positional_dropout_rate),
         )
     elif input_layer == "linear":
         self.embed = torch.nn.Sequential(
             torch.nn.Linear(odim, attention_dim),
             torch.nn.LayerNorm(attention_dim),
             torch.nn.Dropout(dropout_rate),
             torch.nn.ReLU(),
             pos_enc_class(attention_dim, positional_dropout_rate),
         )
     elif isinstance(input_layer, torch.nn.Module):
         self.embed = torch.nn.Sequential(
             input_layer, pos_enc_class(attention_dim, positional_dropout_rate)
         )
     else:
         raise NotImplementedError("only `embed`, `linear`, or torch.nn.Module is supported.")
     self.normalize_before = normalize_before
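     # each decoder block: masked self-attention, source (encoder-decoder)
     # attention over the encoder output, and a position-wise feed-forward layer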
     self.decoders = repeat(
         num_blocks,
         lambda lnum: DecoderLayer(
             attention_dim,
             MultiHeadedAttention(attention_heads, attention_dim, self_attention_dropout_rate),
             MultiHeadedAttention(attention_heads, attention_dim, src_attention_dropout_rate),
             PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
             dropout_rate,
             normalize_before,
             concat_after,
         ),
     )
     self.selfattention_layer_type = selfattention_layer_type
     if self.normalize_before:
         self.after_norm = LayerNorm(attention_dim)
     if use_output_layer:
         self.output_layer = torch.nn.Linear(attention_dim, odim)
     else:
         self.output_layer = None
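
A minimal construction sketch for the Decoder above (construction only; odim=5000 is an illustrative vocabulary size, not taken from the snippet):

decoder = Decoder(
    odim=5000,
    attention_dim=256,
    attention_heads=4,
    num_blocks=6,
    input_layer="embed",    # token IDs go through nn.Embedding plus positional encoding
    use_output_layer=True,  # adds the final Linear(attention_dim, odim) projection
)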
Example #3
    def __init__(self,
                 idim,
                 attention_dim=256,
                 attention_heads=4,
                 linear_units=2048,
                 num_blocks=6,
                 dropout_rate=0.1,
                 positional_dropout_rate=0.1,
                 attention_dropout_rate=0.0,
                 input_layer="conv2d",
                 normalize_before=True,
                 concat_after=False,
                 positionwise_conv_kernel_size=1,
                 macaron_style=False,
                 use_cnn_module=False,
                 cnn_module_kernel=31,
                 zero_triu=False):
        """Construct a Conformer object."""
        super(Conformer, self).__init__()

        activation = Swish()

        self.conv_subsampling_factor = 1

        if isinstance(input_layer, torch.nn.Module):
            self.embed = torch.nn.Sequential(
                input_layer,
                RelPositionalEncoding(attention_dim, positional_dropout_rate),
            )
        elif input_layer is None:
            self.embed = torch.nn.Sequential(
                RelPositionalEncoding(attention_dim, positional_dropout_rate))
        else:
            raise ValueError("unknown input_layer: " + input_layer)

        self.normalize_before = normalize_before

        # self-attention module definition
        encoder_selfattn_layer = RelPositionMultiHeadedAttention
        encoder_selfattn_layer_args = (attention_heads, attention_dim,
                                       attention_dropout_rate, zero_triu)

        # feed-forward module definition
        positionwise_layer = MultiLayeredConv1d
        positionwise_layer_args = (
            attention_dim,
            linear_units,
            positionwise_conv_kernel_size,
            dropout_rate,
        )

        # convolution module definition
        convolution_layer = ConvolutionModule
        convolution_layer_args = (attention_dim, cnn_module_kernel, activation)

        self.encoders = repeat(
            num_blocks,
            lambda lnum: EncoderLayer(
                attention_dim,
                encoder_selfattn_layer(*encoder_selfattn_layer_args),
                positionwise_layer(*positionwise_layer_args),
                positionwise_layer(*positionwise_layer_args) if macaron_style else None,
                convolution_layer(*convolution_layer_args) if use_cnn_module else None,
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)
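
A minimal construction sketch for the Conformer above; this constructor only accepts a torch.nn.Module or None as input_layer, so a front-end module is passed explicitly (idim=83 and the Linear front-end are illustrative values, not taken from the snippet):

import torch

frontend = torch.nn.Linear(83, 256)  # hypothetical front-end projecting features to attention_dim
conformer = Conformer(
    idim=83,
    attention_dim=256,
    input_layer=frontend,
    macaron_style=True,    # add the second, macaron-style feed-forward module in each block
    use_cnn_module=True,   # enable the Conformer convolution module
    cnn_module_kernel=31,
)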