def __init__(
    self,
    size,
    self_attn,
    src_attn,
    feed_forward,
    dropout_rate,
    normalize_before=True,
    concat_after=False,
):
    """Construct a DecoderLayer object."""
    super(DecoderLayer, self).__init__()
    self.size = size
    self.self_attn = self_attn
    self.src_attn = src_attn
    self.feed_forward = feed_forward
    self.norm1 = LayerNorm(size)
    self.norm2 = LayerNorm(size)
    self.norm3 = LayerNorm(size)
    self.dropout = nn.Dropout(dropout_rate)
    self.normalize_before = normalize_before
    self.concat_after = concat_after
    if self.concat_after:
        self.concat_linear1 = nn.Linear(size + size, size)
        self.concat_linear2 = nn.Linear(size + size, size)
def __init__(self, size, self_attn, feed_forward, dropout_rate):
    """Construct a DecoderLayer object."""
    super().__init__()
    self.self_attn = self_attn
    self.feed_forward = feed_forward
    self.norm1 = LayerNorm(size)
    self.norm2 = LayerNorm(size)
    self.dropout = nn.Dropout(dropout_rate)
    self.size = size
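# A minimal usage sketch for the first DecoderLayer above (the variant with both
# self-attention and source attention). The import paths follow the usual ESPnet-1
# module layout and are an assumption; only the constructor arguments shown above
# are taken from this code.
import torch

from espnet.nets.pytorch_backend.transformer.attention import MultiHeadedAttention
from espnet.nets.pytorch_backend.transformer.decoder_layer import DecoderLayer
from espnet.nets.pytorch_backend.transformer.positionwise_feed_forward import (
    PositionwiseFeedForward,
)

adim, heads, units, dropout = 256, 4, 2048, 0.1
layer = DecoderLayer(
    adim,
    MultiHeadedAttention(heads, adim, dropout),    # self-attention over target tokens
    MultiHeadedAttention(heads, adim, dropout),    # source attention over encoder memory
    PositionwiseFeedForward(adim, units, dropout),
    dropout,
)
print(sum(p.numel() for p in layer.parameters()))  # rough size check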
def __init__(
    self,
    idim,
    enc_arch,
    input_layer="linear",
    repeat_block=0,
    self_attn_type="selfattn",
    positional_encoding_type="abs_pos",
    positionwise_layer_type="linear",
    positionwise_activation_type="relu",
    conv_mod_activation_type="relu",
    normalize_before=True,
    padding_idx=-1,
):
    """Construct a Transformer encoder object."""
    super().__init__()

    self.embed, self.encoders, self.enc_out = build_blocks(
        "encoder",
        idim,
        input_layer,
        enc_arch,
        repeat_block=repeat_block,
        self_attn_type=self_attn_type,
        positional_encoding_type=positional_encoding_type,
        positionwise_layer_type=positionwise_layer_type,
        positionwise_activation_type=positionwise_activation_type,
        conv_mod_activation_type=conv_mod_activation_type,
        padding_idx=padding_idx,
    )

    self.normalize_before = normalize_before

    if self.normalize_before:
        self.after_norm = LayerNorm(self.enc_out)
def __init__(
    self,
    odim,
    edim,
    jdim,
    dec_arch,
    input_layer="embed",
    repeat_block=0,
    joint_activation_type="tanh",
    positional_encoding_type="abs_pos",
    positionwise_layer_type="linear",
    positionwise_activation_type="relu",
    dropout_rate_embed=0.0,
    blank=0,
):
    """Construct a Decoder object for transformer-transducer models."""
    torch.nn.Module.__init__(self)

    self.embed, self.decoders, ddim = build_blocks(
        "decoder",
        odim,
        input_layer,
        dec_arch,
        repeat_block=repeat_block,
        positional_encoding_type=positional_encoding_type,
        positionwise_layer_type=positionwise_layer_type,
        positionwise_activation_type=positionwise_activation_type,
        dropout_rate_embed=dropout_rate_embed,
        padding_idx=blank,
    )

    self.after_norm = LayerNorm(ddim)

    self.joint_network = JointNetwork(odim, edim, ddim, jdim, joint_activation_type)

    self.dunits = ddim
    self.odim = odim
    self.blank = blank
def __init__(
    self,
    odim,
    selfattention_layer_type="selfattn",
    attention_dim=256,
    attention_heads=4,
    conv_wshare=4,
    conv_kernel_length=11,
    conv_usebias=False,
    linear_units=2048,
    num_blocks=6,
    dropout_rate=0.1,
    positional_dropout_rate=0.1,
    self_attention_dropout_rate=0.0,
    src_attention_dropout_rate=0.0,
    input_layer="embed",
    use_output_layer=True,
    pos_enc_class=PositionalEncoding,
    normalize_before=True,
    concat_after=False,
):
    """Construct a Decoder object."""
    torch.nn.Module.__init__(self)
    self._register_load_state_dict_pre_hook(_pre_hook)
    if input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(odim, attention_dim),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(odim, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer, pos_enc_class(attention_dim, positional_dropout_rate)
        )
    else:
        raise NotImplementedError("only `embed` or torch.nn.Module is supported.")
    self.normalize_before = normalize_before

    if selfattention_layer_type == "selfattn":
        logging.info("decoder self-attention layer type = self-attention")
        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                MultiHeadedAttention(
                    attention_heads, attention_dim, self_attention_dropout_rate
                ),
                MultiHeadedAttention(
                    attention_heads, attention_dim, src_attention_dropout_rate
                ),
                PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
    elif selfattention_layer_type == "lightconv":
        logging.info("decoder self-attention layer type = lightweight convolution")
        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                LightweightConvolution(
                    conv_wshare,
                    attention_dim,
                    self_attention_dropout_rate,
                    conv_kernel_length,
                    lnum,
                    use_kernel_mask=True,
                    use_bias=conv_usebias,
                ),
                MultiHeadedAttention(
                    attention_heads, attention_dim, src_attention_dropout_rate
                ),
                PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
    elif selfattention_layer_type == "lightconv2d":
        logging.info(
            "decoder self-attention layer "
            "type = lightweight convolution 2-dimensional"
        )
        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                LightweightConvolution2D(
                    conv_wshare,
                    attention_dim,
                    self_attention_dropout_rate,
                    conv_kernel_length,
                    lnum,
                    use_kernel_mask=True,
                    use_bias=conv_usebias,
                ),
                MultiHeadedAttention(
                    attention_heads, attention_dim, src_attention_dropout_rate
                ),
                PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
    elif selfattention_layer_type == "dynamicconv":
        logging.info("decoder self-attention layer type = dynamic convolution")
        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                DynamicConvolution(
                    conv_wshare,
                    attention_dim,
                    self_attention_dropout_rate,
                    conv_kernel_length,
                    lnum,
                    use_kernel_mask=True,
                    use_bias=conv_usebias,
                ),
                MultiHeadedAttention(
                    attention_heads, attention_dim, src_attention_dropout_rate
                ),
                PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
    elif selfattention_layer_type == "dynamicconv2d":
        logging.info(
            "decoder self-attention layer type = dynamic convolution 2-dimensional"
        )
        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                DynamicConvolution2D(
                    conv_wshare,
                    attention_dim,
                    self_attention_dropout_rate,
                    conv_kernel_length,
                    lnum,
                    use_kernel_mask=True,
                    use_bias=conv_usebias,
                ),
                MultiHeadedAttention(
                    attention_heads, attention_dim, src_attention_dropout_rate
                ),
                PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
    self.selfattention_layer_type = selfattention_layer_type
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
    if use_output_layer:
        self.output_layer = torch.nn.Linear(attention_dim, odim)
    else:
        self.output_layer = None
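# A minimal usage sketch for the Decoder constructor above. The import path is an
# assumption based on the usual ESPnet-1 layout, and only __init__ is shown here,
# so no forward/scoring call is made; the keyword arguments mirror the defaults
# listed in the signature.
from espnet.nets.pytorch_backend.transformer.decoder import Decoder

odim = 500  # hypothetical output vocabulary size
decoder = Decoder(
    odim,
    selfattention_layer_type="selfattn",
    attention_dim=256,
    attention_heads=4,
    linear_units=2048,
    num_blocks=6,
    dropout_rate=0.1,
)
# With use_output_layer=True (the default), the final projection maps back to odim.
assert decoder.output_layer.out_features == odim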
def __init__(
    self,
    idim,
    attention_dim=256,
    attention_heads=4,
    linear_units=2048,
    num_blocks=6,
    dropout_rate=0.1,
    positional_dropout_rate=0.1,
    attention_dropout_rate=0.0,
    input_layer="conv2d",
    normalize_before=True,
    concat_after=False,
    positionwise_layer_type="linear",
    positionwise_conv_kernel_size=1,
    macaron_style=False,
    pos_enc_layer_type="abs_pos",
    selfattention_layer_type="selfattn",
    activation_type="swish",
    use_cnn_module=False,
    cnn_module_kernel=31,
    padding_idx=-1,
):
    """Construct an Encoder object."""
    super(Encoder, self).__init__()

    activation = get_activation(activation_type)

    if pos_enc_layer_type == "abs_pos":
        pos_enc_class = PositionalEncoding
    elif pos_enc_layer_type == "scaled_abs_pos":
        pos_enc_class = ScaledPositionalEncoding
    elif pos_enc_layer_type == "rel_pos":
        assert selfattention_layer_type == "rel_selfattn"
        pos_enc_class = RelPositionalEncoding
    else:
        raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)

    if input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(idim, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer == "conv2d":
        self.embed = Conv2dSubsampling(
            idim,
            attention_dim,
            dropout_rate,
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer == "vgg2l":
        self.embed = VGG2L(idim, attention_dim)
    elif input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(idim, attention_dim, padding_idx=padding_idx),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer,
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer is None:
        self.embed = torch.nn.Sequential(
            pos_enc_class(attention_dim, positional_dropout_rate)
        )
    else:
        raise ValueError("unknown input_layer: " + input_layer)
    self.normalize_before = normalize_before

    # self-attention module definition
    if selfattention_layer_type == "selfattn":
        logging.info("encoder self-attention layer type = self-attention")
        encoder_selfattn_layer = MultiHeadedAttention
        encoder_selfattn_layer_args = (
            attention_heads,
            attention_dim,
            attention_dropout_rate,
        )
    elif selfattention_layer_type == "rel_selfattn":
        assert pos_enc_layer_type == "rel_pos"
        encoder_selfattn_layer = RelPositionMultiHeadedAttention
        encoder_selfattn_layer_args = (
            attention_heads,
            attention_dim,
            attention_dropout_rate,
        )
    else:
        raise ValueError("unknown encoder_attn_layer: " + selfattention_layer_type)

    # feed-forward module definition
    if positionwise_layer_type == "linear":
        positionwise_layer = PositionwiseFeedForward
        positionwise_layer_args = (
            attention_dim,
            linear_units,
            dropout_rate,
            activation,
        )
    elif positionwise_layer_type == "conv1d":
        positionwise_layer = MultiLayeredConv1d
        positionwise_layer_args = (
            attention_dim,
            linear_units,
            positionwise_conv_kernel_size,
            dropout_rate,
        )
    elif positionwise_layer_type == "conv1d-linear":
        positionwise_layer = Conv1dLinear
        positionwise_layer_args = (
            attention_dim,
            linear_units,
            positionwise_conv_kernel_size,
            dropout_rate,
        )
    else:
        raise NotImplementedError("Support only linear or conv1d.")

    # convolution module definition
    convolution_layer = ConvolutionModule
    convolution_layer_args = (attention_dim, cnn_module_kernel, activation)

    self.encoders = repeat(
        num_blocks,
        lambda lnum: EncoderLayer(
            attention_dim,
            encoder_selfattn_layer(*encoder_selfattn_layer_args),
            positionwise_layer(*positionwise_layer_args),
            positionwise_layer(*positionwise_layer_args) if macaron_style else None,
            convolution_layer(*convolution_layer_args) if use_cnn_module else None,
            dropout_rate,
            normalize_before,
            concat_after,
        ),
    )
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
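# A minimal usage sketch for the Conformer-style Encoder above. The import path and
# the forward(xs, masks) interface are assumptions based on the usual ESPnet-1
# layout (only __init__ appears in this excerpt); the constructor arguments come
# from the signature shown.
import torch

from espnet.nets.pytorch_backend.conformer.encoder import Encoder

idim = 80  # e.g. log-mel feature dimension
encoder = Encoder(
    idim,
    attention_dim=256,
    attention_heads=4,
    num_blocks=6,
    input_layer="conv2d",
    macaron_style=True,
    pos_enc_layer_type="rel_pos",
    selfattention_layer_type="rel_selfattn",
    use_cnn_module=True,
    cnn_module_kernel=31,
)
xs = torch.randn(2, 100, idim)           # (batch, time, feature)
masks = torch.ones(2, 1, 100).bool()     # (batch, 1, time) padding mask
xs_out, masks_out = encoder(xs, masks)   # conv2d front-end subsamples the time axis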
def __init__(
    self,
    idim,
    selfattention_layer_type="selfattn",
    attention_dim=256,
    attention_heads=4,
    conv_wshare=4,
    conv_kernel_length=11,
    conv_usebias=False,
    linear_units=2048,
    num_blocks=6,
    dropout_rate=0.1,
    positional_dropout_rate=0.1,
    attention_dropout_rate=0.0,
    input_layer="conv2d",
    pos_enc_class=PositionalEncoding,
    normalize_before=True,
    concat_after=False,
    positionwise_layer_type="linear",
    positionwise_conv_kernel_size=1,
    padding_idx=-1,
):
    """Construct an Encoder object."""
    super(Encoder, self).__init__()
    self._register_load_state_dict_pre_hook(_pre_hook)

    if input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(idim, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer == "conv2d":
        self.embed = Conv2dSubsampling(idim, attention_dim, dropout_rate)
    elif input_layer == "conv2d-scaled-pos-enc":
        self.embed = Conv2dSubsampling(
            idim,
            attention_dim,
            dropout_rate,
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer == "conv2d6":
        self.embed = Conv2dSubsampling6(idim, attention_dim, dropout_rate)
    elif input_layer == "conv2d8":
        self.embed = Conv2dSubsampling8(idim, attention_dim, dropout_rate)
    elif input_layer == "vgg2l":
        self.embed = VGG2L(idim, attention_dim)
    elif input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(idim, attention_dim, padding_idx=padding_idx),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer,
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer is None:
        self.embed = torch.nn.Sequential(
            pos_enc_class(attention_dim, positional_dropout_rate)
        )
    else:
        raise ValueError("unknown input_layer: " + input_layer)
    self.normalize_before = normalize_before
    positionwise_layer, positionwise_layer_args = self.get_positionwise_layer(
        positionwise_layer_type,
        attention_dim,
        linear_units,
        dropout_rate,
        positionwise_conv_kernel_size,
    )
    if selfattention_layer_type == "selfattn":
        logging.info("encoder self-attention layer type = self-attention")
        self.encoders = repeat(
            num_blocks,
            lambda lnum: EncoderLayer(
                attention_dim,
                MultiHeadedAttention(
                    attention_heads, attention_dim, attention_dropout_rate
                ),
                positionwise_layer(*positionwise_layer_args),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
    elif selfattention_layer_type == "lightconv":
        logging.info("encoder self-attention layer type = lightweight convolution")
        self.encoders = repeat(
            num_blocks,
            lambda lnum: EncoderLayer(
                attention_dim,
                LightweightConvolution(
                    conv_wshare,
                    attention_dim,
                    attention_dropout_rate,
                    conv_kernel_length,
                    lnum,
                    use_bias=conv_usebias,
                ),
                positionwise_layer(*positionwise_layer_args),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
    elif selfattention_layer_type == "lightconv2d":
        logging.info(
            "encoder self-attention layer "
            "type = lightweight convolution 2-dimensional"
        )
        self.encoders = repeat(
            num_blocks,
            lambda lnum: EncoderLayer(
                attention_dim,
                LightweightConvolution2D(
                    conv_wshare,
                    attention_dim,
                    attention_dropout_rate,
                    conv_kernel_length,
                    lnum,
                    use_bias=conv_usebias,
                ),
                positionwise_layer(*positionwise_layer_args),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
    elif selfattention_layer_type == "dynamicconv":
        logging.info("encoder self-attention layer type = dynamic convolution")
        self.encoders = repeat(
            num_blocks,
            lambda lnum: EncoderLayer(
                attention_dim,
                DynamicConvolution(
                    conv_wshare,
                    attention_dim,
                    attention_dropout_rate,
                    conv_kernel_length,
                    lnum,
                    use_bias=conv_usebias,
                ),
                positionwise_layer(*positionwise_layer_args),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
    elif selfattention_layer_type == "dynamicconv2d":
        logging.info(
            "encoder self-attention layer type = dynamic convolution 2-dimensional"
        )
        self.encoders = repeat(
            num_blocks,
            lambda lnum: EncoderLayer(
                attention_dim,
                DynamicConvolution2D(
                    conv_wshare,
                    attention_dim,
                    attention_dropout_rate,
                    conv_kernel_length,
                    lnum,
                    use_bias=conv_usebias,
                ),
                positionwise_layer(*positionwise_layer_args),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
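# A minimal usage sketch for the Transformer Encoder above, again with an assumed
# ESPnet-1 import path and an assumed forward(xs, masks) interface; the constructor
# arguments are the ones listed in the signature shown.
import torch

from espnet.nets.pytorch_backend.transformer.encoder import Encoder

idim = 83  # hypothetical input feature dimension
encoder = Encoder(
    idim,
    selfattention_layer_type="selfattn",
    attention_dim=256,
    attention_heads=4,
    linear_units=2048,
    num_blocks=6,
    input_layer="conv2d",
)
xs = torch.randn(2, 120, idim)         # (batch, time, feature)
masks = torch.ones(2, 1, 120).bool()   # (batch, 1, time) padding mask
hs, hs_mask = encoder(xs, masks)       # conv2d front-end subsamples time by roughly 4x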