Ejemplo n.º 1
0
 def __init__(self, embed_dim=512 // 2, frame_dim=80, dropout=0.5):
     """Five-layer 1-D conv stack: frame_dim -> embed_dim (x4) -> frame_dim.

     Every layer uses kernel size 5 with "same"-style padding, tanh init
     gain, and is followed by BatchNorm1d.

     Args:
         embed_dim: channel width of the hidden conv layers.
         frame_dim: input/output channel count (e.g. mel bins).
         dropout: dropout probability used by self.dropout.
     """
     super().__init__()
     self.conv = nn.ModuleList()
     # BUG FIX: the `dropout` argument was previously ignored — the
     # original hard-coded nn.Dropout(0.5).
     self.dropout = nn.Dropout(dropout)
     # Input layer: frame_dim -> embed_dim.
     self.conv.append(
         nn.Sequential(
             ConvNorm(frame_dim,
                      embed_dim,
                      kernel_size=5,
                      stride=1,
                      padding=int((5 - 1) / 2),
                      dilation=1,
                      w_init_gain='tanh'), nn.BatchNorm1d(embed_dim)))
     # Three identical hidden layers: embed_dim -> embed_dim.
     for i in range(1, 4):
         self.conv.append(
             nn.Sequential(
                 ConvNorm(embed_dim,
                          embed_dim,
                          kernel_size=5,
                          stride=1,
                          padding=int((5 - 1) / 2),
                          dilation=1,
                          w_init_gain='tanh'), nn.BatchNorm1d(embed_dim)))
     # Output layer: embed_dim -> frame_dim.
     self.conv.append(
         nn.Sequential(
             ConvNorm(embed_dim,
                      frame_dim,
                      kernel_size=5,
                      stride=1,
                      padding=int((5 - 1) / 2),
                      dilation=1,
                      w_init_gain='tanh'), nn.BatchNorm1d(frame_dim)))
Ejemplo n.º 2
0
    def __init__(self, hparams):
        """Postnet: a stack of 1-D convolutions refining the decoder's mel
        output — n_mel -> embedding dim (tanh) repeated, then back to
        n_mel with linear init gain. Each conv is followed by BatchNorm1d.
        """
        super(Postnet, self).__init__()
        kernel = hparams.postnet_kernel_size
        pad = int((kernel - 1) / 2)
        embed = hparams.postnet_embedding_dim
        n_mel = hparams.n_mel_channels

        def _block(in_ch, out_ch, gain):
            # One conv + batch-norm unit with "same"-style padding.
            return nn.Sequential(
                ConvNorm(in_ch, out_ch, kernel_size=kernel, stride=1,
                         padding=pad, dilation=1, w_init_gain=gain),
                nn.BatchNorm1d(out_ch))

        layers = [_block(n_mel, embed, 'tanh')]
        layers += [_block(embed, embed, 'tanh')
                   for _ in range(hparams.postnet_n_convolutions - 2)]
        layers.append(_block(embed, n_mel, 'linear'))
        self.convolutions = nn.ModuleList(layers)
Ejemplo n.º 3
0
    def __init__(self, hparams, supervised=False):
        """GMVAE latent encoder: two conv/batch-norm blocks, a BiLSTM, and
        linear projections to a Gaussian posterior (mean, variance), plus
        fc3/fc4 mapping a latent sample back up to the embedding dim.

        Args:
            hparams: hyper-parameter container; reads n_mel_channels,
                latent_embedding_dim, latent_kernel_size, latent_out_dim.
            supervised: stored as a flag; its effect is not visible in
                this block.
        """
        super(GMVAE_revised, self).__init__()
        self.latent_embedding_dim = hparams.latent_embedding_dim
        self.supervised = supervised
        convolutions = []
        # Conv 1: mel channels -> latent embedding dim ("same" padding).
        conv_layer_1 = nn.Sequential(
            ConvNorm(hparams.n_mel_channels,
                     hparams.latent_embedding_dim,
                     kernel_size=hparams.latent_kernel_size,
                     stride=1,
                     padding=int((hparams.latent_kernel_size - 1) / 2),
                     dilation=1,
                     w_init_gain='relu'),
            nn.BatchNorm1d(hparams.latent_embedding_dim))
        convolutions.append(conv_layer_1)

        # Conv 2: latent embedding dim -> latent embedding dim.
        conv_layer_2 = nn.Sequential(
            ConvNorm(hparams.latent_embedding_dim,
                     hparams.latent_embedding_dim,
                     kernel_size=hparams.latent_kernel_size,
                     stride=1,
                     padding=int((hparams.latent_kernel_size - 1) / 2),
                     dilation=1,
                     w_init_gain='relu'),
            nn.BatchNorm1d(hparams.latent_embedding_dim))
        convolutions.append(conv_layer_2)

        self.convolutions = nn.ModuleList(convolutions)

        # Bidirectional, so each direction gets half the embedding dim and
        # the concatenated output stays at latent_embedding_dim.
        self.lstm = nn.LSTM(hparams.latent_embedding_dim,
                            int(hparams.latent_embedding_dim / 2),
                            1,
                            batch_first=True,
                            bidirectional=True)

        # self.mean_pool = nn.AvgPool1d(hparams.latent_kernel_size, stride=1)
        #
        # self.mean_pool_out_size = hparams.latent_embedding_dim - hparams.latent_kernel_size + 1

        self.linear_projection = LinearNorm(
            hparams.latent_embedding_dim,
            int(hparams.latent_embedding_dim / 2))

        # Posterior parameter heads (mean and variance/log-variance —
        # which one is decided by the caller; not visible here).
        self.linear_projection_mean = LinearNorm(
            int(hparams.latent_embedding_dim / 2), hparams.latent_out_dim)

        self.linear_projection_variance = LinearNorm(
            int(hparams.latent_embedding_dim / 2), hparams.latent_out_dim)

        # Decoder side: latent sample -> embedding dimension.
        self.fc3 = nn.Linear(hparams.latent_out_dim,
                             int(hparams.latent_embedding_dim / 2))

        self.fc4 = nn.Linear(int(hparams.latent_embedding_dim / 2),
                             hparams.latent_embedding_dim)
Ejemplo n.º 4
0
    def __init__(self, hparams):
        """Encoder: a stack of conv/batch-norm layers followed by a
        single-layer bidirectional LSTM over the embedding dimension."""
        super(Encoder, self).__init__()

        dim = hparams.encoder_embedding_dim
        pad = int((hparams.encoder_kernel_size - 1) / 2)
        self.convolutions = nn.ModuleList([
            nn.Sequential(
                ConvNorm(dim, dim,
                         kernel_size=hparams.encoder_kernel_size,
                         stride=1, padding=pad, dilation=1,
                         w_init_gain='relu'),
                nn.BatchNorm1d(dim))
            for _ in range(hparams.encoder_n_convolutions)
        ])

        # Bidirectional, so each direction gets half the embedding dim.
        self.lstm = nn.LSTM(dim, int(dim / 2), 1,
                            batch_first=True, bidirectional=True)
Ejemplo n.º 5
0
    def __init__(self, hparams):
        """Encoder whose conv width depends on where the speaker embedding
        is concatenated ('before' the encoder or 'inside' it).

        Args:
            hparams: reads encoder_speaker_embed_dim,
                encoder_concat_speaker_embed, encoder_embedding_dim,
                symbols_embedding_dim, encoder_kernel_size,
                encoder_n_convolutions.

        Raises:
            ValueError: if encoder_concat_speaker_embed is not "before"
                or "inside".
        """
        super(Encoder, self).__init__()
        self.encoder_speaker_embed_dim = hparams.encoder_speaker_embed_dim
        self.encoder_concat_speaker_embed = hparams.encoder_concat_speaker_embed
        if self.encoder_concat_speaker_embed == 'before':
            self.conv_dim = hparams.encoder_embedding_dim
        elif self.encoder_concat_speaker_embed == 'inside':
            self.conv_dim = hparams.symbols_embedding_dim
        else:
            # BUG FIX: the original printed a message and used a bare
            # `raise`, which outside an `except` block raises
            # "RuntimeError: No active exception to re-raise". Raise a
            # descriptive ValueError instead (message grammar also fixed).
            raise ValueError(
                f'encoder_concat_speaker_embed has invalid value '
                f'{hparams.encoder_concat_speaker_embed}, valid values are '
                f'"before","inside".')

        convolutions = []
        for _ in range(hparams.encoder_n_convolutions):
            conv_layer = nn.Sequential(
                ConvNorm(self.conv_dim,
                         self.conv_dim,
                         kernel_size=hparams.encoder_kernel_size,
                         stride=1,
                         padding=int((hparams.encoder_kernel_size - 1) / 2),
                         dilation=1,
                         w_init_gain='relu'), nn.BatchNorm1d(self.conv_dim))
            convolutions.append(conv_layer)
        self.convolutions = nn.ModuleList(convolutions)

        # Bidirectional, so each direction gets half the embedding dim.
        self.lstm = nn.LSTM(hparams.encoder_embedding_dim,
                            int(hparams.encoder_embedding_dim / 2),
                            1,
                            batch_first=True,
                            bidirectional=True)
        self.LReLU = nn.LeakyReLU(negative_slope=0.01)  # LeakyReLU
    def __init__(self, hparams):
        """NeuralConcatenativeSpeechSynthesis: wires up audio/text prenets,
        a symbol embedding table, two attention seq2seq stages (audio->text
        alignment and text->text pseudo-alignment), a recurrent mel decoder,
        and a linear postnet.

        Args:
            hparams: hyper-parameter container (channel dims, kernel sizes,
                strides, vocabulary size).
        """
        super(NeuralConcatenativeSpeechSynthesis, self).__init__()
        self.n_mel_channels = hparams.n_mel_channels
        # Prenets for the glued audio and the target audio.
        self.audio_prenet = Prenet(hparams.n_mel_channels,
                                   [hparams.prenet_dim, hparams.prenet_dim],
                                   hparams)
        self.target_audio_prenet = TargetPrenet(
            hparams.n_mel_channels, [hparams.prenet_dim, hparams.prenet_dim])
        # Strided conv over symbol embeddings (text prenet).
        self.text_prenet = ConvNorm(hparams.symbols_embedding_dim,
                                    hparams.symbols_embedding_dim,
                                    kernel_size=hparams.decoder_kernel_size,
                                    stride=hparams.text_stride)

        self.embedding = nn.Embedding(hparams.n_symbols,
                                      hparams.symbols_embedding_dim)
        # Text to audio seq2seq(alignment 1 module)
        self.glued_mel_encoder = AudioEncoder(hparams.prenet_dim,
                                              hparams.encoder_rnn_dim)
        self.glued_text_decoder = AttentionDecoder(
            hparams.symbols_embedding_dim, hparams.decoder_rnn_dim,
            hparams.encoder_rnn_dim)
        # Text to text seq2seq(Pseudo alignment 2)
        self.target_text_decoder = AttentionDecoder(
            hparams.symbols_embedding_dim, hparams.decoder_rnn_dim,
            hparams.decoder_rnn_dim)
        # Decoder
        self.decoder = RecurrentDecoder(hparams.prenet_dim,
                                        hparams.mel_decoder_rnn_dim,
                                        hparams.prenet_dim,
                                        hparams.n_mel_channels, hparams)
        # Linear postnet mapping decoder features back to mel channels.
        self.postnet = LinearNorm(hparams.prenet_dim, hparams.n_mel_channels)
    def __init__(self, hparams):
        """Encoder combining a Tacotron-2 conv/BiLSTM front end with
        Transformer-TTS self-attention layers and sinusoidal positional
        embeddings.

        Args:
            hparams: reads encoder_embedding_dim, encoder_kernel_size,
                encoder_n_convolutions and n_attention.
        """
        super(Encoder, self).__init__()

        convolutions = []
        for _ in range(hparams.encoder_n_convolutions):
            conv_layer = nn.Sequential(
                ConvNorm(hparams.encoder_embedding_dim,
                         hparams.encoder_embedding_dim,
                         kernel_size=hparams.encoder_kernel_size, stride=1,
                         padding=int((hparams.encoder_kernel_size - 1) / 2),
                         dilation=1, w_init_gain='relu'),
                nn.BatchNorm1d(hparams.encoder_embedding_dim))
            convolutions.append(conv_layer)
        self.convolutions = nn.ModuleList(convolutions)

        # Bidirectional, so each direction gets half the embedding dim.
        self.lstm = nn.LSTM(hparams.encoder_embedding_dim,
                            int(hparams.encoder_embedding_dim / 2), 1,
                            batch_first=True, bidirectional=True)

        # Transformer-TTS
        self.pos_emb = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(1024, 512, padding_idx=0),
                                                    freeze=True)
        self.pos_dropout = nn.Dropout(p=0.1)
        # Learned scale applied to the positional encoding.
        self.alpha = nn.Parameter(torch.ones(1))
        self.layers = clones(SelfAttention(hparams.encoder_embedding_dim), hparams.n_attention)
        self.ffns = clones(FFN(hparams.encoder_embedding_dim), hparams.n_attention)
        # BUG FIX: nn.LayerNorm's second positional argument is `eps`; the
        # original passed encoder_embedding_dim (e.g. 512) as eps, which
        # effectively disables the normalization. Use the default eps.
        self.norm = nn.LayerNorm(hparams.encoder_embedding_dim)
        self.concat_after = LinearNorm(hparams.encoder_embedding_dim + hparams.encoder_embedding_dim,
                                       hparams.encoder_embedding_dim)
        self.linear_norm = LinearNorm(hparams.encoder_embedding_dim, hparams.encoder_embedding_dim)
        self.pos_linear = Linear(hparams.encoder_embedding_dim, hparams.encoder_embedding_dim)
Ejemplo n.º 8
0
 def __init__(self, attention_n_filters, attention_kernel_size,
              attention_dim):
     """Location layer for location-sensitive attention: a 2-channel conv
     over the stacked attention weights followed by a bias-free linear
     projection to the attention dimension."""
     super(LocationLayer, self).__init__()
     self.location_conv = ConvNorm(
         2, attention_n_filters,
         kernel_size=attention_kernel_size,
         padding=int((attention_kernel_size - 1) / 2),
         bias=False)
     self.location_dense = LinearNorm(
         attention_n_filters, attention_dim,
         bias=False, w_init_gain='tanh')
Ejemplo n.º 9
0
    def __init__(self, hparams):
        """Tacotron-style attention decoder with an extra F0 prenet.

        Builds: a ConvNorm prenet for the F0 contour, the mel prenet, an
        attention LSTM cell, a location-sensitive Attention layer, a
        decoder LSTM cell, the mel linear projection, and the stop-token
        gate layer.

        Args:
            hparams: hyper-parameter container; the attended context
                consumed here is the encoder output concatenated with the
                emotion and speaker embeddings (see encoder_embedding_dim
                below).
        """
        super(Decoder, self).__init__()
        self.n_mel_channels = hparams.n_mel_channels
        self.n_frames_per_step = hparams.n_frames_per_step
        # Attention context width: encoder output plus emotion and
        # speaker embeddings concatenated along the feature axis.
        self.encoder_embedding_dim = hparams.encoder_embedding_dim \
                                     + hparams.emotion_embedding_dim + hparams.speaker_embedding_dim
        self.attention_rnn_dim = hparams.attention_rnn_dim
        self.decoder_rnn_dim = hparams.decoder_rnn_dim
        self.prenet_dim = hparams.prenet_dim
        self.max_decoder_steps = hparams.max_decoder_steps
        self.gate_threshold = hparams.gate_threshold
        self.p_attention_dropout = hparams.p_attention_dropout
        self.p_decoder_dropout = hparams.p_decoder_dropout
        self.p_teacher_forcing = hparams.p_teacher_forcing

        # 1-D conv embedding of the scalar F0 track; kernel//2 padding
        # keeps the sequence length for odd kernel sizes.
        self.prenet_f0 = ConvNorm(1,
                                  hparams.prenet_f0_dim,
                                  kernel_size=hparams.prenet_f0_kernel_size,
                                  padding=max(
                                      0,
                                      int(hparams.prenet_f0_kernel_size / 2)),
                                  bias=False,
                                  stride=1,
                                  dilation=1)

        self.prenet = Prenet(
            hparams.n_mel_channels * hparams.n_frames_per_step,
            [hparams.prenet_dim, hparams.prenet_dim])

        self.attention_rnn = nn.LSTMCell(
            hparams.prenet_dim + self.encoder_embedding_dim,
            hparams.attention_rnn_dim)

        self.attention_layer = Attention(
            hparams.attention_rnn_dim, self.encoder_embedding_dim,
            hparams.attention_dim, hparams.attention_location_n_filters,
            hparams.attention_location_kernel_size)

        # NOTE(review): nn.LSTMCell's third positional argument is `bias`;
        # the literal 1 is truthy, so this matches the default behavior.
        self.decoder_rnn = nn.LSTMCell(
            hparams.attention_rnn_dim + self.encoder_embedding_dim,
            hparams.decoder_rnn_dim, 1)

        self.linear_projection = LinearNorm(
            hparams.decoder_rnn_dim + self.encoder_embedding_dim,
            hparams.n_mel_channels * hparams.n_frames_per_step)

        # Stop-token predictor (single logit per decoder step).
        self.gate_layer = LinearNorm(hparams.decoder_rnn_dim +
                                     self.encoder_embedding_dim,
                                     1,
                                     bias=True,
                                     w_init_gain='sigmoid')
Ejemplo n.º 10
0
    def __init__(self, config):
        """Postnet: a stack of conv/batch-norm layers refining the
        predicted mel spectrogram — mel -> embedding dim (tanh) repeated,
        then back to mel channels with linear init gain."""
        super(Postnet, self).__init__()
        n_mel = config["n_mel_channels"]
        embed = config["postnet_embedding_dim"]
        kernel = config["postnet_kernel_size"]
        pad = int((kernel - 1) / 2)

        # (in_channels, out_channels, init gain) for each layer in order.
        specs = ([(n_mel, embed, 'tanh')]
                 + [(embed, embed, 'tanh')] * (config["postnet_n_convolutions"] - 2)
                 + [(embed, n_mel, 'linear')])

        self.convolutions = nn.ModuleList()
        for in_ch, out_ch, gain in specs:
            self.convolutions.append(
                nn.Sequential(
                    ConvNorm(in_ch, out_ch,
                             kernel_size=kernel,
                             stride=1,
                             padding=pad,
                             dilation=1,
                             w_init_gain=gain),
                    nn.BatchNorm1d(out_ch)))
    def __init__(self, in_dim, sizes, hparams):
        """Prenet: a chain of bias-free linear layers (in_dim -> sizes[0]
        -> ... -> sizes[-1]) plus a conv/batch-norm block over
        prenet_dim channels."""
        super(Prenet, self).__init__()
        dims = [in_dim] + sizes[:-1]
        layers = [LinearNorm(d_in, d_out, bias=False)
                  for d_in, d_out in zip(dims, sizes)]
        self.layers = nn.ModuleList(layers)

        self.convolutions = nn.Sequential(
            ConvNorm(hparams.prenet_dim, hparams.prenet_dim,
                     kernel_size=hparams.audio_kernel_size,
                     stride=hparams.audio_stride,
                     w_init_gain='relu'),
            nn.BatchNorm1d(hparams.prenet_dim))
Ejemplo n.º 12
0
    def __init__(self, hparams):
        """Encoder whose conv-stack input/output dimensions adapt to where
        the speaker embedding is concatenated: 'before_conv' (widen the
        first conv's input) or 'before_lstm' (leave room in the last
        conv's output).

        Args:
            hparams: reads encoder_speaker_embed_dim, n_speakers,
                encoder_concat_speaker_embed, encoder_conv_hidden_dim,
                symbols_embedding_dim, encoder_kernel_size,
                encoder_n_convolutions, encoder_LSTM_dim.

        Raises:
            NotImplementedError: if encoder_concat_speaker_embed is not
                "before_conv" or "before_lstm".
        """
        super(Encoder, self).__init__()
        self.encoder_speaker_embed_dim = hparams.encoder_speaker_embed_dim
        if self.encoder_speaker_embed_dim:
            self.encoder_speaker_embedding = nn.Embedding(
                hparams.n_speakers, self.encoder_speaker_embed_dim)

        self.encoder_concat_speaker_embed = hparams.encoder_concat_speaker_embed
        self.encoder_conv_hidden_dim = hparams.encoder_conv_hidden_dim

        convolutions = []
        # Idiom fix: use a named loop index instead of repurposing `_`.
        for i in range(hparams.encoder_n_convolutions):
            if i == 0:
                # First conv: input is the symbol embedding, widened by the
                # speaker embedding when it is concatenated before the convs.
                if self.encoder_concat_speaker_embed == 'before_conv':
                    input_dim = hparams.symbols_embedding_dim + self.encoder_speaker_embed_dim
                elif self.encoder_concat_speaker_embed == 'before_lstm':
                    input_dim = hparams.symbols_embedding_dim
                else:
                    # BUG FIX: the message listed the wrong valid values
                    # ("before","inside") and contained a grammar slip
                    # ("is has"); the accepted values are the ones checked
                    # above.
                    raise NotImplementedError(
                        f'encoder_concat_speaker_embed has invalid value '
                        f'{hparams.encoder_concat_speaker_embed}, valid values '
                        f'are "before_conv","before_lstm".')
            else:
                input_dim = self.encoder_conv_hidden_dim

            if i == hparams.encoder_n_convolutions - 1:  # last conv
                # Last conv: leave room for the speaker embedding when it is
                # concatenated just before the LSTM.
                if self.encoder_concat_speaker_embed == 'before_conv':
                    output_dim = hparams.encoder_LSTM_dim
                elif self.encoder_concat_speaker_embed == 'before_lstm':
                    output_dim = hparams.encoder_LSTM_dim - self.encoder_speaker_embed_dim
            else:
                output_dim = self.encoder_conv_hidden_dim

            conv_layer = nn.Sequential(
                ConvNorm(input_dim,
                         output_dim,
                         kernel_size=hparams.encoder_kernel_size,
                         stride=1,
                         padding=int((hparams.encoder_kernel_size - 1) / 2),
                         dilation=1,
                         w_init_gain='relu'), nn.BatchNorm1d(output_dim))
            convolutions.append(conv_layer)
        self.convolutions = nn.ModuleList(convolutions)

        # Bidirectional, so each direction gets half the LSTM dim.
        self.lstm = nn.LSTM(hparams.encoder_LSTM_dim,
                            int(hparams.encoder_LSTM_dim / 2),
                            1,
                            batch_first=True,
                            bidirectional=True)
        self.LReLU = nn.LeakyReLU(negative_slope=0.01)  # LeakyReLU
Ejemplo n.º 13
0
 def __init__(self, attention_n_filters, attention_kernel_size,
              attention_dim):
     """LocationLayer: embeds the stacked attention weights (2 input
     channels) through a conv and a bias-free linear projection, for
     location-sensitive attention.

     Args:
         attention_n_filters: number of conv filters over the weights.
         attention_kernel_size: conv kernel size ("same"-style padding).
         attention_dim: output dimension of the linear projection.
     """
     super(LocationLayer, self).__init__()  ## initialize the nn.Module base class (required in every subclass __init__)
     padding = int((attention_kernel_size - 1) / 2)
     self.location_conv = ConvNorm(
         2,
         attention_n_filters,  ## ConvNorm is a class defined in layers.py
         kernel_size=attention_kernel_size,
         padding=padding,
         bias=False,
         stride=1,
         dilation=1)
     self.location_dense = LinearNorm(
         attention_n_filters,
         attention_dim,  ## LinearNorm is a class defined in layers.py
         bias=False,
         w_init_gain='tanh')
Ejemplo n.º 14
0
 def __init__(self,
              embed_dim=512 // 2,
              pre_layers=3,
              kernel_size=5,
              dropout=0.5):
     """Stack of `pre_layers` identical conv + batch-norm blocks, each
     mapping embed_dim -> embed_dim with "same"-style padding, plus a
     shared dropout module."""
     super().__init__()
     self.conv = nn.ModuleList()
     self.dropout = nn.Dropout(dropout)
     pad = int((kernel_size - 1) / 2)
     for _ in range(pre_layers):
         block = nn.Sequential(
             ConvNorm(embed_dim,
                      embed_dim,
                      kernel_size=kernel_size,
                      stride=1,
                      padding=pad,
                      dilation=1,
                      w_init_gain='relu'),
             nn.BatchNorm1d(embed_dim))
         self.conv.append(block)
Ejemplo n.º 15
0
    def __init__(self, config):
        """Encoder: stacked conv/batch-norm layers followed by a
        single-layer bidirectional LSTM."""
        super(Encoder, self).__init__()

        dim = config["encoder_embedding_dim"]
        ksize = config["encoder_kernel_size"]

        stack = []
        for _ in range(config["encoder_n_convolutions"]):
            stack.append(
                nn.Sequential(
                    ConvNorm(dim, dim,
                             kernel_size=ksize,
                             stride=1,
                             padding=int((ksize - 1) / 2),
                             dilation=1,
                             w_init_gain='relu'),
                    nn.BatchNorm1d(dim)))
        self.convolutions = nn.ModuleList(stack)

        # Bidirectional, so each direction gets half the embedding dim.
        self.lstm = nn.LSTM(dim, int(dim / 2), 1,
                            batch_first=True, bidirectional=True)
Ejemplo n.º 16
0
    def __init__(self, hparams):
        """Structure_CNN: parallel conv + batch-norm branches with kernel
        sizes 3, 5 and 7 over a 1401-dim input, producing 100 channels
        each (no padding, so each branch shortens the sequence by k-1).

        Args:
            hparams: accepted for interface consistency; not used in this
                constructor.
        """
        super(Structure_CNN, self).__init__()

        # Cleanup: removed unused locals (C = 512, Ci = 1) and dead
        # commented-out code from the original.
        in_dim = 1401        # input feature dimension
        out_channels = 100   # channels produced by each branch
        kernel_sizes = [3, 5, 7]

        convolutions = []
        for k in kernel_sizes:
            conv_layer = nn.Sequential(
                ConvNorm(in_dim,
                         out_channels,
                         kernel_size=k,
                         stride=1,
                         dilation=1,
                         w_init_gain='relu'),
                nn.BatchNorm1d(out_channels))
            convolutions.append(conv_layer)
        self.convolutions = nn.ModuleList(convolutions)
Ejemplo n.º 17
0
    def __init__(self, hparams):
        """Mellotron-style decoder with configurable dropout types,
        attention variants and an optional F0 prenet.

        Builds (driven by hparams flags): an F0 ConvNorm prenet unless
        disable_f0, the mel Prenet, an attention RNN cell (plain dropout
        or zoneout), an Attention or GMMAttention layer, a decoder RNN
        cell (plain dropout or zoneout), an optional extra linear
        projection, the mel projection, and the stop-token gate layer.

        Args:
            hparams: hyper-parameter container (see attribute assignments
                below for the fields read).

        Raises:
            NotImplementedError: if hparams.attention_type is not 0 or 1.
        """
        super(Decoder, self).__init__()
        self.mellotron = hparams.mellotron
        self.disable_f0 = hparams.disable_f0
        self.n_mel_channels = hparams.n_mel_channels
        self.n_frames_per_step = hparams.n_frames_per_step
        # Attended context width: encoder LSTM output plus token and
        # speaker embeddings concatenated along the feature axis.
        self.encoder_dim = hparams.encoder_LSTM_dim + hparams.token_embedding_size + hparams.speaker_embedding_dim
        self.attention_rnn_dim = hparams.attention_rnn_dim
        self.decoder_rnn_dim = hparams.decoder_rnn_dim
        self.prenet_dim = hparams.prenet_dim
        self.prenet_layers = hparams.prenet_layers
        self.prenet_batchnorm = hparams.prenet_batchnorm
        self.p_prenet_dropout = hparams.p_prenet_dropout
        self.max_decoder_steps = hparams.max_decoder_steps
        self.gate_threshold = hparams.gate_threshold
        self.AttRNN_extra_decoder_input = hparams.AttRNN_extra_decoder_input
        self.AttRNN_hidden_dropout_type = hparams.AttRNN_hidden_dropout_type
        self.p_AttRNN_hidden_dropout = hparams.p_AttRNN_hidden_dropout
        self.p_AttRNN_cell_dropout = hparams.p_AttRNN_cell_dropout
        self.DecRNN_hidden_dropout_type = hparams.DecRNN_hidden_dropout_type
        self.p_DecRNN_hidden_dropout = hparams.p_DecRNN_hidden_dropout
        self.p_DecRNN_cell_dropout = hparams.p_DecRNN_cell_dropout
        self.p_teacher_forcing = hparams.p_teacher_forcing
        self.teacher_force_till = hparams.teacher_force_till
        self.num_att_mixtures = hparams.num_att_mixtures
        self.extra_projection = hparams.extra_projection
        self.normalize_attention_input = hparams.normalize_attention_input
        self.normalize_AttRNN_output = hparams.normalize_AttRNN_output
        self.attention_type = hparams.attention_type
        self.attention_layers = hparams.attention_layers
        self.low_vram_inference = hparams.low_vram_inference
        self.context_frames = hparams.context_frames
        self.hide_startstop_tokens = hparams.hide_startstop_tokens

        attention_rnn_in_dim = hparams.prenet_dim + self.encoder_dim
        if not self.disable_f0:
            # 1-D conv embedding of the scalar F0 track.
            self.prenet_f0 = ConvNorm(
                1,
                hparams.prenet_f0_dim,
                kernel_size=hparams.prenet_f0_kernel_size,
                padding=max(0, int(hparams.prenet_f0_kernel_size / 2)),
                bias=False,
                stride=1,
                dilation=1)
            attention_rnn_in_dim += hparams.prenet_f0_dim

        self.prenet = Prenet(
            hparams.n_mel_channels * hparams.n_frames_per_step *
            self.context_frames, [hparams.prenet_dim] * hparams.prenet_layers,
            self.p_prenet_dropout, self.prenet_batchnorm)

        if self.AttRNN_extra_decoder_input:
            attention_rnn_in_dim += hparams.decoder_rnn_dim

        if self.AttRNN_hidden_dropout_type == 'dropout':
            self.attention_rnn = nn.LSTMCell(
                attention_rnn_in_dim,  # input_size
                hparams.attention_rnn_dim)  # hidden_size)
        elif self.AttRNN_hidden_dropout_type == 'zoneout':
            self.attention_rnn = LSTMCellWithZoneout(
                attention_rnn_in_dim,  # input_size
                hparams.attention_rnn_dim,
                zoneout_prob=self.p_DecRNN_hidden_dropout
            )  # hidden_size, zoneout)
            self.p_AttRNN_hidden_dropout = 0.0  # zoneout assigned inside LSTMCellWithZoneout so don't need normal dropout

        if self.attention_type == 0:
            self.attention_layer = Attention(
                hparams.attention_rnn_dim, self.encoder_dim,
                hparams.attention_dim, hparams.attention_location_n_filters,
                hparams.attention_location_kernel_size)
        elif self.attention_type == 1:
            self.attention_layer = GMMAttention(
                hparams.num_att_mixtures, hparams.attention_layers,
                hparams.attention_rnn_dim, self.encoder_dim,
                hparams.attention_dim, hparams.attention_location_n_filters,
                hparams.attention_location_kernel_size, hparams)
        else:
            # BUG FIX: the original raised `NotImplementedException`, which
            # is not a Python builtin — reaching this branch produced a
            # NameError instead of the intended error.
            raise NotImplementedError(
                "attention_type invalid, valid values are... 0 and 1")

        if self.DecRNN_hidden_dropout_type == 'dropout':
            self.decoder_rnn = nn.LSTMCell(
                hparams.attention_rnn_dim + self.encoder_dim,  # input_size
                hparams.decoder_rnn_dim,
                1)  # hidden_size, bias)
        elif self.DecRNN_hidden_dropout_type == 'zoneout':
            self.decoder_rnn = LSTMCellWithZoneout(
                hparams.attention_rnn_dim + self.encoder_dim,  # input_size
                hparams.decoder_rnn_dim,
                1,
                zoneout_prob=self.p_DecRNN_hidden_dropout
            )  # hidden_size, zoneout)
            self.p_DecRNN_hidden_dropout = 0.0  # zoneout assigned inside LSTMCellWithZoneout so don't need normal dropout

        if self.extra_projection:
            # Optional extra linear layer before the mel projection.
            self.linear_projection_pre = LinearNorm(
                hparams.decoder_rnn_dim + self.encoder_dim,
                hparams.decoder_rnn_dim + self.encoder_dim)

        self.linear_projection = LinearNorm(
            hparams.decoder_rnn_dim + self.encoder_dim,
            hparams.n_mel_channels * hparams.n_frames_per_step)

        # Stop-token predictor (single logit per decoder step).
        self.gate_layer = LinearNorm(hparams.decoder_rnn_dim +
                                     self.encoder_dim,
                                     1,
                                     bias=True,
                                     w_init_gain='sigmoid')