def __init__(self, tacotron_hyperparams):
     super(DoubleDecoderConsistency, self).__init__()
     self.n_mel_channels = tacotron_hyperparams['n_mel_channels']
     self.n_frames_per_step_ddc = tacotron_hyperparams[
         'number_frames_step_ddc']  # MAIN DIFFERENCE FROM THE DEFAULT DECODER
     self.encoder_embedding_dim = tacotron_hyperparams[
         'encoder_embedding_dim']
     self.attention_rnn_dim = tacotron_hyperparams[
         'attention_rnn_dim']  # 1024
     self.decoder_rnn_dim = tacotron_hyperparams['decoder_rnn_dim']  # 1024
     self.prenet_dim = tacotron_hyperparams['prenet_dim']
     self.max_decoder_steps = tacotron_hyperparams['max_decoder_steps']
     # Sigmoid threshold on the gate output above which decoding stops.
     self.gate_threshold = tacotron_hyperparams['gate_threshold']
     self.p_attention_dropout = tacotron_hyperparams['p_attention_dropout']
     self.p_decoder_dropout = tacotron_hyperparams['p_decoder_dropout']
     # Define the prenet: the input dim is n_mel_channels times the number of frames
     # predicted per decoder step (the DDC decoder emits several frames per step).
     # There are two fully connected layers:
     self.prenet = Prenet_dropout(
         tacotron_hyperparams['n_mel_channels'] *
         tacotron_hyperparams['number_frames_step_ddc'], [
             tacotron_hyperparams['prenet_dim'],
             tacotron_hyperparams['prenet_dim']
         ])
     # input_size: 256 + 512 (prenet output + attention_context) / hidden_size: 1024
     self.attention_rnn = nn.LSTMCell(
         tacotron_hyperparams['prenet_dim'] +
         tacotron_hyperparams['encoder_embedding_dim'],
         tacotron_hyperparams['attention_rnn_dim'])
     # Computes the alignments and returns attention_weights and attention_context.
     self.attention_layer = AttentionNet(
         tacotron_hyperparams['attention_rnn_dim'],
         tacotron_hyperparams['encoder_embedding_dim'],
         tacotron_hyperparams['attention_dim'],
         tacotron_hyperparams['attention_location_n_filters'],
         tacotron_hyperparams['attention_location_kernel_size'])
     # input_size: 1024 + 512 (attention RNN output + attention_context), hidden_size: 1024
     self.decoder_rnn = nn.LSTMCell(
         tacotron_hyperparams['attention_rnn_dim'] +
         tacotron_hyperparams['encoder_embedding_dim'],
         tacotron_hyperparams['decoder_rnn_dim'], bias=True)
     # input: 1024 (decoder LSTM output) + 512 (attention_context); out_dim: n_mel_channels * frames per step.
     # Last linear projection that generates the output decoder spectral frame(s).
     self.linear_projection = linear_module(
         tacotron_hyperparams['decoder_rnn_dim'] +
         tacotron_hyperparams['encoder_embedding_dim'],
         tacotron_hyperparams['n_mel_channels'] *
         tacotron_hyperparams['number_frames_step_ddc'])
     # Gate layer: predicts the stop token used to decide whether to continue decoding.
     self.gate_layer = linear_module(
         tacotron_hyperparams['decoder_rnn_dim'] +
         tacotron_hyperparams['encoder_embedding_dim'],
         1,
         bias=True,
         w_init_gain='sigmoid')
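
# Illustrative sketch (not part of the original class): how the dimensions above chain
# together in one decoding step, using plain nn.LSTMCell / nn.Linear stand-ins. The
# concrete values (256/512/1024, 80 mels, r=2) are assumptions taken from the comments
# above, not the actual Prenet_dropout / AttentionNet / linear_module implementations.
import torch
import torch.nn as nn

prenet_dim, enc_dim, attn_rnn_dim, dec_rnn_dim = 256, 512, 1024, 1024
n_mel, r = 80, 2  # r = assumed number of frames per decoder step (DDC)

attention_rnn = nn.LSTMCell(prenet_dim + enc_dim, attn_rnn_dim)
decoder_rnn = nn.LSTMCell(attn_rnn_dim + enc_dim, dec_rnn_dim)
linear_projection = nn.Linear(dec_rnn_dim + enc_dim, n_mel * r)
gate_layer = nn.Linear(dec_rnn_dim + enc_dim, 1)

B = 4
prenet_out = torch.zeros(B, prenet_dim)        # prenet output for the current step
attention_context = torch.zeros(B, enc_dim)    # context vector from the attention layer

# First LSTM cell: prenet output concatenated with the attention context.
attn_h, attn_c = attention_rnn(torch.cat((prenet_out, attention_context), dim=-1))
# (The attention layer would recompute attention_context from attn_h here.)

# Second LSTM cell: attention RNN output concatenated with the context.
dec_h, dec_c = decoder_rnn(torch.cat((attn_h, attention_context), dim=-1))

# Final projections: r mel frames and one stop-token logit per step.
decoder_out = torch.cat((dec_h, attention_context), dim=-1)
mel_frames = linear_projection(decoder_out)    # shape (B, n_mel * r)
gate_logit = gate_layer(decoder_out)           # shape (B, 1); sigmoid + gate_threshold stops decoding
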
Example #2
 def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
              attention_location_n_filters, attention_location_kernel_size):
     super(AttentionNet, self).__init__()
     self.query_layer = linear_module(attention_rnn_dim,
                                      attention_dim,
                                      bias=False,
                                      w_init_gain='tanh')
     # Projects the encoder outputs (memory) into a 128-D hidden representation
     self.memory_layer = linear_module(embedding_dim,
                                       attention_dim,
                                       bias=False,
                                       w_init_gain='tanh')
     # Projects the combined attention features into a 1-D scalar score
     self.v = linear_module(attention_dim, 1, bias=False)
     # Convolutional layers to obtain location features and projecting them into 128-D hidden representation
     self.location_layer = location_layer(attention_location_n_filters,
                                          attention_location_kernel_size,
                                          attention_dim)
     self.score_mask_value = -float("inf")
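
# Illustrative sketch (assumption): how query_layer, memory_layer, location_layer and v
# are typically combined into Tacotron 2-style location-sensitive attention energies.
# The Conv1d/Linear stand-in for location_layer and all tensor shapes are assumptions;
# only the layer roles and score_mask_value come from the code above.
import torch
import torch.nn as nn
import torch.nn.functional as F

B, T = 4, 100                                   # batch size, encoder time steps (assumed)
attn_rnn_dim, enc_dim, attn_dim = 1024, 512, 128
n_filters, kernel = 32, 31                      # assumed location-feature conv settings

query_layer = nn.Linear(attn_rnn_dim, attn_dim, bias=False)
memory_layer = nn.Linear(enc_dim, attn_dim, bias=False)
v = nn.Linear(attn_dim, 1, bias=False)
# Stand-in for location_layer: conv over the previous/cumulative weights, then a projection.
location_conv = nn.Conv1d(2, n_filters, kernel, padding=(kernel - 1) // 2, bias=False)
location_dense = nn.Linear(n_filters, attn_dim, bias=False)

query = torch.zeros(B, attn_rnn_dim)            # attention RNN hidden state
memory = torch.zeros(B, T, enc_dim)             # encoder outputs
attn_weights_cat = torch.zeros(B, 2, T)         # previous + cumulative attention weights

processed_loc = location_dense(location_conv(attn_weights_cat).transpose(1, 2))   # (B, T, attn_dim)
energies = v(torch.tanh(query_layer(query).unsqueeze(1)
                        + memory_layer(memory)
                        + processed_loc)).squeeze(-1)                             # (B, T)

mask = torch.zeros(B, T, dtype=torch.bool)      # True where encoder positions are padding
energies = energies.masked_fill(mask, -float("inf"))                              # score_mask_value
attention_weights = F.softmax(energies, dim=1)
attention_context = torch.bmm(attention_weights.unsqueeze(1), memory).squeeze(1)  # (B, enc_dim)
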
Example #3
 def __init__(self, in_dim, sizes):
     super(Prenet, self).__init__()
     # Prepend in_dim to all layer sizes but the last one, i.e. the result is a list of
     # the in_dim element concatenated with the layer sizes (e.g. [80, 256]).
     in_sizes = [in_dim] + sizes[:-1]
     self.layers = nn.ModuleList([
         linear_module(in_size, out_size, bias=False)
         for (in_size, out_size) in zip(in_sizes, sizes)
     ])
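
# Illustrative sketch (assumption): the usual Tacotron 2 prenet forward pass for the
# layers built above. Only the layer list comes from the code; the linear -> ReLU ->
# dropout ordering and p=0.5 are assumptions based on common Tacotron 2 practice.
import torch
import torch.nn as nn
import torch.nn.functional as F

def prenet_forward(layers, x, p_dropout=0.5):
    # Apply linear -> ReLU -> dropout for each layer. Dropout is conventionally kept
    # active even at inference time in Tacotron 2, hence training=True.
    for linear in layers:
        x = F.dropout(F.relu(linear(x)), p=p_dropout, training=True)
    return x

# Usage with the dimensions from the comment above (in_dim=80, sizes=[256, 256]):
layers = nn.ModuleList([nn.Linear(i, o, bias=False)
                        for i, o in zip([80, 256], [256, 256])])
out = prenet_forward(layers, torch.zeros(4, 80))   # shape (4, 256)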