def __init__(self, tacotron_hyperparams):
    super(DoubleDecoderConsistency, self).__init__()
    self.n_mel_channels = tacotron_hyperparams['n_mel_channels']
    # MAIN DIFFERENCE WITH THE DEFAULT DECODER: several frames are predicted per decoder step.
    self.n_frames_per_step_ddc = tacotron_hyperparams['number_frames_step_ddc']
    self.encoder_embedding_dim = tacotron_hyperparams['encoder_embedding_dim']
    self.attention_rnn_dim = tacotron_hyperparams['attention_rnn_dim']  # 1024
    self.decoder_rnn_dim = tacotron_hyperparams['decoder_rnn_dim']  # 1024
    self.prenet_dim = tacotron_hyperparams['prenet_dim']
    self.max_decoder_steps = tacotron_hyperparams['max_decoder_steps']
    # Threshold on the gate output that decides whether to stop decoding.
    self.gate_threshold = tacotron_hyperparams['gate_threshold']
    self.p_attention_dropout = tacotron_hyperparams['p_attention_dropout']
    self.p_decoder_dropout = tacotron_hyperparams['p_decoder_dropout']
    # Prenet: two fully connected layers. Its input dimension is n_mel_channels times the
    # number of frames predicted per step (more than one frame per step for the DDC decoder).
    self.prenet = Prenet_dropout(
        tacotron_hyperparams['n_mel_channels'] *
        tacotron_hyperparams['number_frames_step_ddc'],
        [
            tacotron_hyperparams['prenet_dim'],
            tacotron_hyperparams['prenet_dim']
        ])
    # input_size: 256 + 512 (prenet output + attention_context) / hidden_size: 1024
    self.attention_rnn = nn.LSTMCell(
        tacotron_hyperparams['prenet_dim'] +
        tacotron_hyperparams['encoder_embedding_dim'],
        tacotron_hyperparams['attention_rnn_dim'])
    # Returns attention_weights and attention_context. Computes the alignments.
    self.attention_layer = AttentionNet(
        tacotron_hyperparams['attention_rnn_dim'],
        tacotron_hyperparams['encoder_embedding_dim'],
        tacotron_hyperparams['attention_dim'],
        tacotron_hyperparams['attention_location_n_filters'],
        tacotron_hyperparams['attention_location_kernel_size'])
    # input_size: 1024 + 512 (attention RNN output + attention_context), hidden_size: 1024
    self.decoder_rnn = nn.LSTMCell(
        tacotron_hyperparams['attention_rnn_dim'] +
        tacotron_hyperparams['encoder_embedding_dim'],
        tacotron_hyperparams['decoder_rnn_dim'], 1)  # third positional argument is the LSTMCell bias flag
    # (LSTM output) 1024 + (attention_context) 512 -> n_mel_channels * n_frames_per_step_ddc.
    # Final linear projection that generates the output decoder spectral frames.
    self.linear_projection = linear_module(
        tacotron_hyperparams['decoder_rnn_dim'] +
        tacotron_hyperparams['encoder_embedding_dim'],
        tacotron_hyperparams['n_mel_channels'] *
        tacotron_hyperparams['number_frames_step_ddc'])
    # Decides whether to continue decoding (stop-token prediction).
    self.gate_layer = linear_module(
        tacotron_hyperparams['decoder_rnn_dim'] +
        tacotron_hyperparams['encoder_embedding_dim'],
        1, bias=True, w_init_gain='sigmoid')
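# A minimal sketch (not part of the repository) of the hyperparameter values assumed by the
# dimension comments above; the exact dict used for training may differ.
#
# tacotron_hyperparams_example = {
#     'n_mel_channels': 80,
#     'number_frames_step_ddc': 2,       # DDC decoder predicts several frames per step
#     'encoder_embedding_dim': 512,
#     'attention_rnn_dim': 1024,
#     'decoder_rnn_dim': 1024,
#     'prenet_dim': 256,
#     'max_decoder_steps': 1000,
#     'gate_threshold': 0.5,
#     'p_attention_dropout': 0.1,
#     'p_decoder_dropout': 0.1,
#     'attention_dim': 128,
#     'attention_location_n_filters': 32,
#     'attention_location_kernel_size': 31,
# }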
def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
             attention_location_n_filters, attention_location_kernel_size):
    super(AttentionNet, self).__init__()
    # Projects the query (attention RNN hidden state) into the 128-D attention hidden representation.
    self.query_layer = linear_module(attention_rnn_dim, attention_dim,
                                     bias=False, w_init_gain='tanh')
    # Projects the encoder outputs (memory) into the 128-D attention hidden representation.
    self.memory_layer = linear_module(embedding_dim, attention_dim,
                                      bias=False, w_init_gain='tanh')
    # Projects the attention hidden representation into a 1-D scalar energy value.
    self.v = linear_module(attention_dim, 1, bias=False)
    # Convolutional layers that extract location features from previous attention weights
    # and project them into the 128-D attention hidden representation.
    self.location_layer = location_layer(attention_location_n_filters,
                                         attention_location_kernel_size,
                                         attention_dim)
    self.score_mask_value = -float("inf")
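# Sketch (an assumption mirroring standard Tacotron 2 location-sensitive attention) of how the
# layers above are typically combined into alignment energies; the actual forward pass of
# AttentionNet may differ in detail:
#
#   processed_query = self.query_layer(query.unsqueeze(1))           # (B, 1, attention_dim)
#   processed_location = self.location_layer(attention_weights_cat)  # (B, T_in, attention_dim)
#   processed_memory = self.memory_layer(memory)                     # (B, T_in, attention_dim)
#   energies = self.v(torch.tanh(
#       processed_query + processed_location + processed_memory)).squeeze(-1)  # (B, T_in)
#   energies.masked_fill_(mask, self.score_mask_value)               # mask padded encoder steps
#   attention_weights = F.softmax(energies, dim=1)
#   attention_context = torch.bmm(attention_weights.unsqueeze(1), memory).squeeze(1)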
def __init__(self, in_dim, sizes):
    super(Prenet, self).__init__()
    # Input size of each layer: in_dim prepended to all sizes but the last one
    # (e.g. in_dim=80 and sizes=[256, 256] gives in_sizes=[80, 256]).
    in_sizes = [in_dim] + sizes[:-1]
    self.layers = nn.ModuleList([
        linear_module(in_size, out_size, bias=False)
        for (in_size, out_size) in zip(in_sizes, sizes)
    ])
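# Sketch (an assumption following the usual Tacotron 2 prenet) of how these layers are typically
# applied in forward(); in Tacotron 2 the dropout stays active at inference time as well:
#
#   def forward(self, x):
#       for linear in self.layers:
#           x = F.dropout(F.relu(linear(x)), p=0.5, training=True)
#       return x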