def inference_decode(enc_outputs, seq_len, embeddings, out_dim):
    tiled_enc_outputs = tf.contrib.seq2seq.tile_batch(enc_outputs, hp.beam_width)
    tiled_seq_len = tf.contrib.seq2seq.tile_batch(seq_len, hp.beam_width)
    beam_batch_size = tf.shape(tiled_enc_outputs)[0]

    # Start tokens, end token
    start_tokens = tf.tile([hp.START_TOKEN], [beam_batch_size // hp.beam_width])
    end_token = hp.END_TOKEN

    dec_prenet_outputs = DecoderPrenetWrapper(GRUCell(hp.embed_size),
                                              is_training=False,
                                              prenet_sizes=hp.embed_size,
                                              dropout_prob=hp.dropout)
    attention_mechanism = BahdanauAttention(hp.embed_size,
                                            tiled_enc_outputs,
                                            normalize=True,
                                            memory_sequence_length=tiled_seq_len,
                                            probability_fn=tf.nn.softmax)
    attn_cell = AttentionWrapper(dec_prenet_outputs,
                                 attention_mechanism,
                                 alignment_history=True,
                                 output_attention=False)
    concat_cell = ConcatOutputAndAttentionWrapper(attn_cell)
    decoder_cell = MultiRNNCell([
        OutputProjectionWrapper(concat_cell, hp.embed_size),
        ResidualWrapper(GRUCell(hp.embed_size)),
        ResidualWrapper(GRUCell(hp.embed_size))
    ], state_is_tuple=True)
    output_cell = OutputProjectionWrapper(decoder_cell, out_dim)
    initial_state = output_cell.zero_state(batch_size=beam_batch_size, dtype=tf.float32)

    decoder = BeamSearchDecoder(cell=output_cell,
                                embedding=embeddings,
                                start_tokens=start_tokens,
                                end_token=end_token,
                                initial_state=initial_state,
                                beam_width=hp.beam_width)
    outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, maximum_iterations=hp.max_len)
    return outputs
def training_decode(enc_outputs, seq_len, helper, out_dim):
    dec_prenet_outputs = DecoderPrenetWrapper(GRUCell(hp.embed_size),
                                              is_training=True,
                                              prenet_sizes=hp.embed_size,
                                              dropout_prob=hp.dropout)
    attention_mechanism = BahdanauAttention(hp.embed_size,
                                            enc_outputs,
                                            normalize=True,
                                            memory_sequence_length=seq_len,
                                            probability_fn=tf.nn.softmax)
    attn_cell = AttentionWrapper(dec_prenet_outputs,
                                 attention_mechanism,
                                 alignment_history=True,
                                 output_attention=False)
    concat_cell = ConcatOutputAndAttentionWrapper(attn_cell)
    decoder_cell = MultiRNNCell([
        OutputProjectionWrapper(concat_cell, hp.embed_size),
        ResidualWrapper(GRUCell(hp.embed_size)),
        ResidualWrapper(GRUCell(hp.embed_size))
    ], state_is_tuple=True)
    output_cell = OutputProjectionWrapper(decoder_cell, out_dim)
    initial_state = output_cell.zero_state(batch_size=tf.shape(enc_outputs)[0], dtype=tf.float32)

    decoder = BasicDecoder(cell=output_cell, helper=helper, initial_state=initial_state)
    (outputs, _), last_state, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder=decoder, maximum_iterations=hp.max_len)

    # For the attention plot
    alignments = tf.transpose(last_state[0].alignment_history.stack(), [1, 2, 0])
    return outputs, alignments
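# A minimal sketch (not from the source) of how the two decode paths above
# might be wired so that training and beam-search inference share weights via
# variable-scope reuse. `enc_outputs`, `seq_len`, `dec_inputs`, `target_lengths`,
# `embeddings`, and `out_dim` are hypothetical names standing in for tensors
# built elsewhere in the graph.
with tf.variable_scope('decoder'):
    train_helper = tf.contrib.seq2seq.TrainingHelper(dec_inputs, target_lengths)
    train_outputs, alignments = training_decode(enc_outputs, seq_len,
                                                train_helper, out_dim)
with tf.variable_scope('decoder', reuse=True):
    beam_outputs = inference_decode(enc_outputs, seq_len, embeddings, out_dim)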
def initialize(self, inputs, input_lengths,
               mel_targets_pos=None, linear_targets_pos=None,
               mel_targets_neg=None, linear_targets_neg=None,
               labels_pos=None, labels_neg=None,
               reference_mel_pos=None, reference_mel_neg=None):
    is_training = linear_targets_pos is not None
    is_teacher_force_generating = mel_targets_pos is not None
    batch_size = tf.shape(inputs)[0]
    hp = self._hparams

    ## Text encoding scope
    with tf.variable_scope('text_encoder', reuse=tf.AUTO_REUSE) as scope:
        # Initialize text embeddings
        embedding_table = tf.get_variable(
            'text_embedding', [len(symbols), 256], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, 256]

        # Text encoder
        prenet_outputs = prenet(embedded_inputs, is_training)  # [N, T_in, 128]
        encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training)  # [N, T_in, 256]
        content_inputs = encoder_outputs

    ## Reference encoding scope
    with tf.variable_scope('audio_encoder', reuse=tf.AUTO_REUSE) as scope:
        if hp.use_gst:
            # Global style tokens (GST)
            gst_tokens = tf.get_variable(
                'style_tokens', [hp.num_gst, 256 // hp.num_heads], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            self.gst_tokens = gst_tokens

        if is_training:
            reference_mel_pos = mel_targets_pos
            reference_mel_neg = mel_targets_neg

        if reference_mel_pos is not None:
            # Reference encoders for the positive and negative samples
            refnet_outputs_pos = reference_encoder(
                reference_mel_pos,
                filters=[32, 32, 64, 64, 128, 128],
                kernel_size=(3, 3),
                strides=(2, 2),
                encoder_cell=GRUCell(128),
                is_training=is_training)  # [N, 128]
            self.refnet_outputs_pos = refnet_outputs_pos

            refnet_outputs_neg = reference_encoder(
                reference_mel_neg,
                filters=[32, 32, 64, 64, 128, 128],
                kernel_size=(3, 3),
                strides=(2, 2),
                encoder_cell=GRUCell(128),
                is_training=is_training)  # [N, 128]
            self.refnet_outputs_neg = refnet_outputs_neg

            # Extract style features
            ref_style = style_encoder(reference_mel_neg,
                                      filters=[32, 32, 64, 64],
                                      kernel_size=(3, 3),
                                      strides=(2, 2),
                                      is_training=False)
            self.ref_style = ref_style

            if hp.use_gst:
                # Multi-head attention over the style tokens
                style_attention_pos = MultiheadAttention(
                    tf.tanh(tf.expand_dims(refnet_outputs_pos, axis=1)),  # [N, 1, 128]
                    tf.tile(tf.expand_dims(gst_tokens, axis=0),
                            [batch_size, 1, 1]),  # [N, hp.num_gst, 256/hp.num_heads]
                    num_heads=hp.num_heads,
                    num_units=128,
                    attention_type=hp.style_att_type)
                style_attention_neg = MultiheadAttention(
                    tf.tanh(tf.expand_dims(refnet_outputs_neg, axis=1)),  # [N, 1, 128]
                    tf.tile(tf.expand_dims(gst_tokens, axis=0),
                            [batch_size, 1, 1]),  # [N, hp.num_gst, 256/hp.num_heads]
                    num_heads=hp.num_heads,
                    num_units=128,
                    attention_type=hp.style_att_type)

                # Apply tanh to compress both encoder state and style embedding
                # to the same scale.
                style_embeddings_pos = style_attention_pos.multi_head_attention()  # [N, 1, 256]
                style_embeddings_neg = style_attention_neg.multi_head_attention()  # [N, 1, 256]
            else:
                style_embeddings_pos = tf.expand_dims(refnet_outputs_pos, axis=1)  # [N, 1, 128]
                style_embeddings_neg = tf.expand_dims(refnet_outputs_neg, axis=1)
        else:
            print("Use random weight for GST.")

        # Add the style embedding to every text encoder state.
        ## Tile the style embeddings so they match the text sequence shape;
        ## format: _content_style
        style_embeddings_pos = tf.tile(
            style_embeddings_pos, [1, shape_list(encoder_outputs)[1], 1])  # [N, T_in, 128]
        style_embeddings_neg = tf.tile(
            style_embeddings_neg, [1, shape_list(encoder_outputs)[1], 1])  # [N, T_in, 128]

        ## Permute the encoder outputs, e.g. pos2pos is positive content with
        ## positive style, pos2neg is positive content with negative style.
        encoder_outputs_pos = tf.concat([encoder_outputs, style_embeddings_pos], axis=-1)
        encoder_outputs_neg = tf.concat([encoder_outputs, style_embeddings_neg], axis=-1)

    # Decoding scope
    with tf.variable_scope('generator', reuse=tf.AUTO_REUSE) as scope:
        # RNN attention
        attention_cell_pos = AttentionWrapper(
            DecoderPrenetWrapper(GRUCell(256), is_training),
            BahdanauAttention(256, encoder_outputs_pos, memory_sequence_length=input_lengths),
            alignment_history=True,
            output_attention=False)  # [N, T_in, 256]
        attention_cell_neg = AttentionWrapper(
            DecoderPrenetWrapper(GRUCell(256), is_training),
            BahdanauAttention(256, encoder_outputs_neg, memory_sequence_length=input_lengths),
            alignment_history=True,
            output_attention=False)  # [N, T_in, 256]

        # Concatenate attention context vector and RNN cell output.
        concat_cell_pos = ConcatOutputAndAttentionWrapper(attention_cell_pos)
        concat_cell_neg = ConcatOutputAndAttentionWrapper(attention_cell_neg)

        # Decoder (layers specified bottom to top):
        decoder_cell_pos = MultiRNNCell([
            OutputProjectionWrapper(concat_cell_pos, 256),
            ResidualWrapper(ZoneoutWrapper(LSTMCell(256), 0.1)),
            ResidualWrapper(ZoneoutWrapper(LSTMCell(256), 0.1))
        ], state_is_tuple=True)  # [N, T_in, 256]
        decoder_cell_neg = MultiRNNCell([
            OutputProjectionWrapper(concat_cell_neg, 256),
            ResidualWrapper(ZoneoutWrapper(LSTMCell(256), 0.1)),
            ResidualWrapper(ZoneoutWrapper(LSTMCell(256), 0.1))
        ], state_is_tuple=True)  # [N, T_in, 256]

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell_pos = OutputProjectionWrapper(decoder_cell_pos,
                                                  hp.num_mels * hp.outputs_per_step)
        decoder_init_state_pos = output_cell_pos.zero_state(batch_size=batch_size, dtype=tf.float32)
        output_cell_neg = OutputProjectionWrapper(decoder_cell_neg,
                                                  hp.num_mels * hp.outputs_per_step)
        decoder_init_state_neg = output_cell_neg.zero_state(batch_size=batch_size, dtype=tf.float32)

        if is_training or is_teacher_force_generating:
            helper_pos = TacoTrainingHelper(inputs, mel_targets_pos, hp.num_mels, hp.outputs_per_step)
            helper_neg = TacoTrainingHelper(inputs, mel_targets_neg, hp.num_mels, hp.outputs_per_step)
        else:
            # Each decoder branch needs its own test helper.
            helper_pos = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)
            helper_neg = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

        (decoder_outputs_pos, _), final_decoder_state_pos, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell_pos, helper_pos, decoder_init_state_pos),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]
        (decoder_outputs_neg, _), final_decoder_state_neg, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell_neg, helper_neg, decoder_init_state_neg),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry
        mel_outputs_pos = tf.reshape(decoder_outputs_pos, [batch_size, -1, hp.num_mels])  # [N, T_out, M]
        mel_outputs_neg = tf.reshape(decoder_outputs_neg, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

        # Add post-processing CBHG:
        post_outputs_pos = post_cbhg(mel_outputs_pos, hp.num_mels, is_training)  # [N, T_out, 256]
        linear_outputs_pos = tf.layers.dense(post_outputs_pos, hp.num_freq)  # [N, T_out, F]
        post_outputs_neg = post_cbhg(mel_outputs_neg, hp.num_mels, is_training)  # [N, T_out, 256]
        linear_outputs_neg = tf.layers.dense(post_outputs_neg, hp.num_freq)  # [N, T_out, F]

        ## Grab alignments from the final decoder states:
        alignments_pos = tf.transpose(final_decoder_state_pos[0].alignment_history.stack(), [1, 2, 0])
        alignments_neg = tf.transpose(final_decoder_state_neg[0].alignment_history.stack(), [1, 2, 0])

        # Extract style features for the fake (generated) sample
        rec_style = style_encoder(mel_outputs_neg,
                                  filters=[32, 32, 64, 64],
                                  kernel_size=(3, 3),
                                  strides=(2, 2),
                                  is_training=False)
        self.rec_style = rec_style

    # Discriminator scope
    with tf.variable_scope('discriminator', reuse=tf.AUTO_REUSE) as scope:
        self.real_logit = discriminator(content_inputs, reference_mel_pos, is_training=is_training)
        self.fake_logit_pos = discriminator(content_inputs, mel_outputs_pos, is_training=is_training)
        self.fake_logit_neg = discriminator(content_inputs, mel_outputs_neg, is_training=is_training)

    self.inputs = inputs
    self.input_lengths = input_lengths
    self.mel_outputs_pos = mel_outputs_pos
    self.mel_outputs_neg = mel_outputs_neg
    self.encoder_outputs = encoder_outputs
    self.style_embeddings_pos = style_embeddings_pos
    self.style_embeddings_neg = style_embeddings_neg
    self.linear_outputs_pos = linear_outputs_pos
    self.linear_outputs_neg = linear_outputs_neg
    self.alignments_pos = alignments_pos
    self.alignments_neg = alignments_neg
    self.mel_targets_pos = mel_targets_pos
    self.mel_targets_neg = mel_targets_neg
    self.linear_targets_pos = linear_targets_pos
    self.linear_targets_neg = linear_targets_neg
    self.reference_mel_pos = reference_mel_pos
    self.reference_mel_neg = reference_mel_neg
    log('Initialized Tacotron model. Dimensions: ')
    log('  text embedding: %d' % embedded_inputs.shape[-1])
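# The discriminator logits stored above imply an adversarial objective, but the
# loss code itself is not shown in these snippets. A sketch assuming standard
# sigmoid-cross-entropy GAN losses over `model.real_logit`,
# `model.fake_logit_pos`, and `model.fake_logit_neg` (`model` is a hypothetical
# instance of the class above); the actual formulation may differ.
def _bce(logits, target_is_real):
    labels = tf.ones_like(logits) if target_is_real else tf.zeros_like(logits)
    return tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits))

d_loss = (_bce(model.real_logit, True)
          + _bce(model.fake_logit_pos, False)
          + _bce(model.fake_logit_neg, False))
g_loss = _bce(model.fake_logit_pos, True) + _bce(model.fake_logit_neg, True)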
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None,
               reference_mels=None):
    """
    Initializes the model for inference.
    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
        lengths of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
        number of steps in the output time series, M is num_mels, and values are entries in
        the mel spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out
        is number of steps in the output time series, F is num_freq, and values are entries
        in the linear spectrogram. Only needed for training.
      reference_mels: the reference encoder inputs
    """
    with tf.variable_scope('inference') as scope:
        is_training = linear_targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings for character inputs: [N, T_in]
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), 256], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, 256]

        # Encoder
        prenet_outputs = prenet(embedded_inputs, is_training)  # [N, T_in, 128]
        encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training)  # [N, T_in, 256]

        # Whether to use Global Style Tokens
        if is_training:
            reference_mels = mel_targets

        if hp.use_gst:
            gst_tokens = tf.get_variable(
                'style_tokens', [hp.num_tokens, 256 // hp.num_heads], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            self.gst_tokens = gst_tokens

            # Reference encoder
            _, reference_encoder_outputs = reference_encoder(
                inputs=reference_mels,
                filters=[32, 32, 64, 64, 128, 128],
                kernel_size=(3, 3),
                strides=(2, 2),
                is_training=is_training)  # [N, 128]

            # Style token layer using multi-head attention
            style_attention = MultiHeadAttention(
                num_heads=hp.num_heads,
                num_units=128,
                attention_type=hp.attention_type)
            style_embedding = tf.nn.tanh(
                style_attention.multi_head_attention(
                    query=tf.expand_dims(reference_encoder_outputs, axis=1),  # [N, 1, 128]
                    value=tf.tile(tf.expand_dims(gst_tokens, axis=0),
                                  [batch_size, 1, 1])  # [N, num_tokens, 256/num_heads]
                ))  # [N, 1, 128]

            # Add the style embedding to the encoder outputs
            T_in = shape_list(encoder_outputs)[1]
            style_embedding = tf.tile(style_embedding, [1, T_in, 1])
            encoder_outputs = tf.concat([encoder_outputs, style_embedding], axis=-1)

        # Attention
        attention_cell = AttentionWrapper(
            DecoderPrenetWrapper(GRUCell(256), is_training),
            BahdanauAttention(256, encoder_outputs),
            alignment_history=True,
            output_attention=False)  # [N, T_in, 256]

        # Concatenate attention context vector and RNN cell output into a 512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)  # [N, T_in, 512]

        # Decoder (layers specified bottom to top);
        # decoder cells changed from GRU to LSTM, with zoneout added.
        decoder_cell = MultiRNNCell([
            OutputProjectionWrapper(concat_cell, 256),
            ResidualWrapper(ZoneoutWrapper(LSTMCell(256), 0.1, is_training)),
            ResidualWrapper(ZoneoutWrapper(LSTMCell(256), 0.1, is_training))
        ], state_is_tuple=True)  # [N, T_in, 256]

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        if is_training:
            helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry
        mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

        # Add post-processing CBHG:
        post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)  # [N, T_out, 256]
        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        log('Initialized Tacotron model. Dimensions: ')
        log('  embedding:               %d' % embedded_inputs.shape[-1])
        log('  prenet out:              %d' % prenet_outputs.shape[-1])
        log('  encoder out:             %d' % encoder_outputs.shape[-1])
        log('  attention out:           %d' % attention_cell.output_size)
        log('  concat attn & out:       %d' % concat_cell.output_size)
        log('  decoder out (%d frames):  %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
        log('  decoder cell out:        %d' % decoder_cell.output_size)
        log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
        log('  postnet out:             %d' % post_outputs.shape[-1])
        log('  linear out:              %d' % linear_outputs.shape[-1])
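# `ZoneoutWrapper` is used in several snippets above but never defined there.
# Below is a minimal sketch of zoneout (Krueger et al., 2017) as a TF 1.x
# RNNCell wrapper; each repository these snippets come from ships its own
# version, so treat this only as an illustration of the technique.
class ZoneoutWrapper(tf.nn.rnn_cell.RNNCell):
    """Zoneout: randomly preserve previous-state units instead of updating them."""

    def __init__(self, cell, zoneout_prob, is_training=True):
        super(ZoneoutWrapper, self).__init__()
        self._cell = cell
        self._p = zoneout_prob
        self._is_training = is_training

    @property
    def state_size(self):
        return self._cell.state_size

    @property
    def output_size(self):
        return self._cell.output_size

    def zero_state(self, batch_size, dtype):
        return self._cell.zero_state(batch_size, dtype)

    def __call__(self, inputs, state, scope=None):
        output, new_state = self._cell(inputs, state, scope)

        def zone(new, old):
            if self._is_training:
                # Keep each old unit with probability p; take the new one otherwise.
                keep = tf.cast(tf.random_uniform(tf.shape(new)) < self._p, new.dtype)
                return keep * old + (1.0 - keep) * new
            # Deterministic interpolation at inference time.
            return self._p * old + (1.0 - self._p) * new

        if isinstance(new_state, tf.nn.rnn_cell.LSTMStateTuple):
            new_state = tf.nn.rnn_cell.LSTMStateTuple(
                zone(new_state.c, state.c), zone(new_state.h, state.h))
        else:
            new_state = zone(new_state, state)
        return output, new_state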
def initialize(self, inputs, input_lengths, mel_targets=None, gta=False):
    """
    Initializes the model for inference.
    Sets "mel_outputs" and "alignments" fields.

    Args:
      - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
        lengths of each sequence in inputs.
      - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
        number of steps in the output time series, M is num_mels, and values are entries in
        the mel spectrogram. Only needed for training.
    """
    with tf.variable_scope('inference') as scope:
        is_training = mel_targets is not None and not gta
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

        # Encoder
        enc_conv_outputs = enc_conv_layers(embedded_inputs, is_training,
                                           kernel_size=hp.enc_conv_kernel_size,
                                           channels=hp.enc_conv_channels)
        # The paper doesn't specify what to do with the final encoder state,
        # so we simply drop it.
        encoder_outputs, encoder_states = bidirectional_LSTM(enc_conv_outputs, input_lengths,
                                                             'encoder_LSTM',
                                                             is_training=is_training,
                                                             size=hp.encoder_lstm_units,
                                                             zoneout=hp.zoneout_rate)

        # Attention
        attention_cell = AttentionWrapper(
            DecoderPrenetWrapper(
                ZoneoutLSTMCell(hp.attention_dim, is_training,  # separate LSTM for the attention mechanism,
                                zoneout_factor_cell=hp.zoneout_rate,  # based on the original Tacotron architecture
                                zoneout_factor_output=hp.zoneout_rate),
                is_training),
            LocationSensitiveAttention(hp.attention_dim, encoder_outputs),
            alignment_history=True,
            output_attention=False,
            name='attention_cell')

        # Concat prenet output with the context vector
        concat_cell = ConcatPrenetAndAttentionWrapper(attention_cell)

        # Decoder layers (attention prenet + 2 unidirectional LSTM cells)
        decoder_cell = unidirectional_LSTM(concat_cell, is_training,
                                           layers=hp.decoder_layers,
                                           size=hp.decoder_lstm_units,
                                           zoneout=hp.zoneout_rate)

        # Concat LSTM output with the context vector
        concat_decoder_cell = ConcatLSTMOutputAndAttentionWrapper(decoder_cell)

        # Linear projection to mel-spectrogram dimension (times the number of outputs per step)
        output_cell = OutputProjectionWrapper(concat_decoder_cell,
                                              hp.num_mels * hp.outputs_per_step)

        # Define the helper for our decoder
        if is_training or gta:
            self.helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
        else:
            self.helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

        # Only limit decoder time steps during inference (consult hparams.py to modify the value)
        max_iterations = None if is_training else hp.max_iters

        # Initial decoder state
        decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        # Decode
        (decoder_output, _), final_decoder_state, self.stop_token_loss = dynamic_decode(
            CustomDecoder(output_cell, self.helper, decoder_init_state),
            impute_finished=True,  # cut out padded parts (enabled)
            maximum_iterations=max_iterations)

        # Reshape outputs to be one output per entry
        decoder_output = tf.reshape(decoder_output, [batch_size, -1, hp.num_mels])

        # Compute the residual using the post-net
        residual = postnet(decoder_output, is_training,
                           kernel_size=hp.postnet_kernel_size,
                           channels=hp.postnet_channels)

        # Project the residual to the same dimension as the mel spectrogram
        projected_residual = projection(residual, shape=hp.num_mels,
                                        scope='residual_projection')

        # Compute the mel spectrogram
        mel_outputs = decoder_output + projected_residual

        # Grab alignments from the final decoder state
        alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.decoder_output = decoder_output
        self.alignments = alignments
        self.mel_outputs = mel_outputs
        self.mel_targets = mel_targets
        log('Initialized Tacotron model. Dimensions: ')
        log('  embedding:               {}'.format(embedded_inputs.shape))
        log('  enc conv out:            {}'.format(enc_conv_outputs.shape))
        log('  encoder out:             {}'.format(encoder_outputs.shape))
        log('  decoder out:             {}'.format(decoder_output.shape))
        log('  residual out:            {}'.format(residual.shape))
        log('  projected residual out:  {}'.format(projected_residual.shape))
        log('  mel out:                 {}'.format(mel_outputs.shape))
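# `projection` above is not defined in these snippets; judging from its call
# site (`projection(residual, shape=hp.num_mels, scope='residual_projection')`)
# it is presumably a plain linear layer. A sketch under that assumption only:
def projection(inputs, shape, scope):
    # Linear projection of the postnet residual back to the mel dimension.
    with tf.variable_scope(scope):
        return tf.layers.dense(inputs, units=shape, activation=None)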
def initialize(self, inputs, input_lengths, inputs_jp=None, mel_targets=None,
               linear_targets=None):
    '''Initializes the model for inference.
    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
        lengths of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
        number of steps in the output time series, M is num_mels, and values are entries in
        the mel spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out
        is number of steps in the output time series, F is num_freq, and values are entries
        in the linear spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
        is_training = linear_targets is not None
        is_teacher_force_generating = mel_targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        # embedding_table = tf.get_variable(
        #     'text_embedding', [len(symbols), hp.embed_depth], dtype=tf.float32,
        #     initializer=tf.truncated_normal_initializer(stddev=0.5))
        # embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, 256]

        if hp.use_gst:
            # Global style tokens (GST)
            gst_tokens = tf.get_variable(
                'style_tokens', [hp.num_gst, hp.style_embed_depth // hp.num_heads],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            self.gst_tokens = gst_tokens

        # Encoder
        # prenet_outputs = prenet(embedded_inputs, is_training)
        prenet_outputs = prenet(inputs, is_training)  # [N, T_in, 128]
        encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training)  # [N, T_in, 256]

        if inputs_jp is not None:
            # Reference encoder
            refnet_outputs = reference_encoder(
                inputs_jp,
                filters=hp.reference_filters,
                kernel_size=(3, 3),
                strides=(2, 2),
                encoder_cell=GRUCell(hp.reference_depth),
                is_training=is_training)  # [N, 128]
            self.refnet_outputs = refnet_outputs

            if hp.use_gst:
                # Style attention
                style_attention = MultiheadAttention(
                    tf.expand_dims(refnet_outputs, axis=1),  # [N, 1, 128]
                    tf.tanh(tf.tile(tf.expand_dims(gst_tokens, axis=0),
                                    [batch_size, 1, 1])),  # [N, hp.num_gst, 256/hp.num_heads]
                    num_heads=hp.num_heads,
                    num_units=hp.style_att_dim,
                    attention_type=hp.style_att_type)

                style_embeddings = style_attention.multi_head_attention()  # [N, 1, 256]
            else:
                style_embeddings = tf.expand_dims(refnet_outputs, axis=1)  # [N, 1, 128]
        else:
            print("Use random weight for GST.")
            random_weights = tf.random_uniform([hp.num_heads, hp.num_gst],
                                               maxval=1.0, dtype=tf.float32)
            random_weights = tf.nn.softmax(random_weights, name="random_weights")
            style_embeddings = tf.matmul(random_weights, tf.nn.tanh(gst_tokens))
            style_embeddings = tf.reshape(
                style_embeddings,
                [1, 1] + [hp.num_heads * gst_tokens.get_shape().as_list()[1]])

        # Add style embedding to every text encoder state
        style_embeddings = tf.tile(style_embeddings,
                                   [1, shape_list(encoder_outputs)[1], 1])  # [N, T_in, 128]
        encoder_outputs = tf.concat([encoder_outputs, style_embeddings], axis=-1)

        # Attention
        attention_cell = AttentionWrapper(
            GRUCell(hp.attention_depth),
            BahdanauAttention(hp.attention_depth, encoder_outputs,
                              memory_sequence_length=input_lengths),
            alignment_history=True,
            output_attention=False)  # [N, T_in, 256]

        # Concatenate attention context vector and RNN cell output.
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell([
            OutputProjectionWrapper(concat_cell, hp.rnn_depth),
            ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1)),
            ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1))
        ], state_is_tuple=True)  # [N, T_in, 256]

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        if is_training or is_teacher_force_generating:
            helper = TacoTrainingHelper(inputs, mel_targets, hp)
        else:
            helper = TacoTestHelper(batch_size, hp)

        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry
        mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

        # Add post-processing CBHG:
        post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)  # [N, T_out, 256]
        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_outputs = mel_outputs
        self.encoder_outputs = encoder_outputs
        self.style_embeddings = style_embeddings
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.inputs_jp = inputs_jp
        log('Initialized Tacotron model. Dimensions: ')
        log('  style embedding:         %d' % style_embeddings.shape[-1])
        log('  prenet out:              %d' % prenet_outputs.shape[-1])
        log('  encoder out:             %d' % encoder_outputs.shape[-1])
        log('  attention out:           %d' % attention_cell.output_size)
        log('  concat attn & out:       %d' % concat_cell.output_size)
        log('  decoder cell out:        %d' % decoder_cell.output_size)
        log('  decoder out (%d frames):  %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
        log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
        log('  postnet out:             %d' % post_outputs.shape[-1])
        log('  linear out:              %d' % linear_outputs.shape[-1])
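# Every variant in this section predicts r frames per decoder step and then
# reshapes [N, T_out/r, M*r] into [N, T_out, M]. A tiny NumPy check of that
# reshape (illustrative values only, not from the source):
import numpy as np

N, T_dec, M, r = 1, 3, 4, 2   # batch, decoder steps, num_mels, outputs_per_step
decoder_outputs = np.arange(N * T_dec * M * r).reshape(N, T_dec, M * r)

# Same operation as tf.reshape(decoder_outputs, [batch_size, -1, num_mels]):
mel_outputs = decoder_outputs.reshape(N, -1, M)   # [N, T_dec * r, M] = [1, 6, 4]

# Each decoder step contributes r consecutive mel frames:
assert (mel_outputs[0, 0] == decoder_outputs[0, 0, :M]).all()
assert (mel_outputs[0, 1] == decoder_outputs[0, 0, M:]).all()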
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None,
               pml_targets=None, gta=False, locked_alignments=None, logs_enabled=True):
    '''Initializes the model for inference.
    Sets "pml_outputs" and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
        lengths of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
        number of steps in the output time series, M is num_mels, and values are entries in
        the mel spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out
        is number of steps in the output time series, F is num_freq, and values are entries
        in the linear spectrogram. Only needed for training.
      pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is
        number of steps in the PML vocoder feature trajectories, P is pml_dimension, and
        values are PML vocoder features. Only needed for training.
      gta: boolean flag that is set to True when ground truth alignment is required
      locked_alignments: when explicit attention alignment is required, the locked
        alignments are passed in this parameter and the attention alignments are locked to
        these values
      logs_enabled: boolean flag that defaults to True; if False, no construction logs are
        output
    '''
    with tf.variable_scope('inference') as scope:
        is_training = pml_targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, embed_depth=256]

        # Encoder
        prenet_outputs = prenet(embedded_inputs, is_training,
                                hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
        encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training,
                                       hp.encoder_depth)  # [N, T_in, encoder_depth=256]

        # Attention
        attention_cell = AttentionWrapper(
            GRUCell(hp.attention_depth),
            BahdanauAttention(hp.attention_depth, encoder_outputs),
            alignment_history=True,
            output_attention=False)  # [N, T_in, attention_depth=256]

        # Apply prenet before concatenation in AttentionWrapper.
        attention_cell = DecoderPrenetWrapper(attention_cell, is_training, hp.prenet_depths)

        # Concatenate attention context vector and RNN cell output into a
        # 2*attention_depth=512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)  # [N, T_in, 2*attention_depth=512]

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell([
            OutputProjectionWrapper(concat_cell, hp.decoder_depth),
            ResidualWrapper(GRUCell(hp.decoder_depth)),
            ResidualWrapper(GRUCell(hp.decoder_depth))
        ], state_is_tuple=True)  # [N, T_in, decoder_depth=256]

        # Project onto r PML feature vectors (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(decoder_cell, hp.pml_dimension * hp.outputs_per_step)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        if is_training or gta:
            helper = TacoTrainingHelper(inputs, pml_targets, hp.pml_dimension, hp.outputs_per_step)
        else:
            helper = TacoTestHelper(batch_size, hp.pml_dimension, hp.outputs_per_step)

        (multi_decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

        # Reshape outputs to be one output per entry
        decoder_outputs = tf.reshape(multi_decoder_outputs,
                                     [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

        # Postnet: predicts a residual
        postnet_outputs = postnet(decoder_outputs,
                                  layers=hp.postnet_conv_layers,
                                  conv_width=hp.postnet_conv_width,
                                  channels=hp.postnet_conv_channels,
                                  is_training=is_training)
        pml_outputs = decoder_outputs + postnet_outputs

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.pml_outputs = pml_outputs
        self.alignments = alignments
        self.pml_targets = pml_targets
        log('Initialized Tacotron model. Dimensions: ')
        log('  embedding:               %d' % embedded_inputs.shape[-1])
        log('  prenet out:              %d' % prenet_outputs.shape[-1])
        log('  encoder out:             %d' % encoder_outputs.shape[-1])
        log('  attention out:           %d' % attention_cell.output_size)
        log('  concat attn & out:       %d' % concat_cell.output_size)
        log('  decoder cell out:        %d' % decoder_cell.output_size)
        log('  decoder out (%d frames):  %d' % (hp.outputs_per_step, multi_decoder_outputs.shape[-1]))
        log('  decoder out (1 frame):   %d' % pml_outputs.shape[-1])
def initialize(self, inputs, input_lengths, num_speakers, speaker_id,
               mel_targets=None, linear_targets=None, loss_coeff=None,
               rnn_decoder_test_mode=False, is_randomly_initialized=False):
    is_training = linear_targets is not None
    self.is_randomly_initialized = is_randomly_initialized

    with tf.variable_scope('inference') as scope:
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, embed_depth=256]

        self.num_speakers = num_speakers
        if self.num_speakers > 1:
            if hp.speaker_embedding_size != 1:
                speaker_embed_table = tf.get_variable(
                    'speaker_embedding',
                    [self.num_speakers, hp.speaker_embedding_size], dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
                # [N, T_in, speaker_embedding_size]
                speaker_embed = tf.nn.embedding_lookup(speaker_embed_table, speaker_id)

            if hp.model_type == 'deepvoice':
                if hp.speaker_embedding_size == 1:
                    before_highway = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_prenet_sizes[-1], "before_highway")
                    encoder_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_rnn_size * 2, "encoder_rnn_init_state")
                    attention_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.attention_state_size, "attention_rnn_init_state")
                    decoder_rnn_init_states = [
                        get_embed(speaker_id, self.num_speakers, hp.dec_rnn_size,
                                  "decoder_rnn_init_states{}".format(idx + 1))
                        for idx in range(hp.dec_layer_num)]
                else:
                    deep_dense = lambda x, dim: \
                        tf.layers.dense(x, dim, activation=tf.nn.softsign)

                    before_highway = deep_dense(speaker_embed, hp.enc_prenet_sizes[-1])
                    encoder_rnn_init_state = deep_dense(speaker_embed, hp.enc_rnn_size * 2)
                    attention_rnn_init_state = deep_dense(speaker_embed, hp.attention_state_size)
                    decoder_rnn_init_states = [
                        deep_dense(speaker_embed, hp.dec_rnn_size)
                        for _ in range(hp.dec_layer_num)]

                speaker_embed = None  # deepvoice does not use speaker_embed directly
            elif hp.model_type == 'simple':
                before_highway = None
                encoder_rnn_init_state = None
                attention_rnn_init_state = None
                decoder_rnn_init_states = None
            else:
                raise Exception(" [!] Unknown multi-speaker model type: {}".format(hp.model_type))
        else:
            speaker_embed = None
            before_highway = None
            encoder_rnn_init_state = None
            attention_rnn_init_state = None
            decoder_rnn_init_states = None

        # Encoder
        prenet_outputs = prenet(embedded_inputs, is_training,
                                hp.enc_prenet_sizes, hp.dropout_prob,
                                scope='prenet')  # [N, T_in, prenet_depths[-1]=128]
        encoder_outputs = cbhg(prenet_outputs, input_lengths, is_training,  # [N, T_in, encoder_depth=256]
                               hp.enc_bank_size, hp.enc_bank_channel_size,
                               hp.enc_maxpool_width, hp.enc_highway_depth,
                               hp.enc_rnn_size, hp.enc_proj_sizes, hp.enc_proj_width,
                               scope="encoder_cbhg",
                               before_highway=before_highway,
                               encoder_rnn_init_state=encoder_rnn_init_state)

        # Attention
        # For manual control of attention
        self.is_manual_attention = tf.placeholder(tf.bool, shape=(), name='is_manual_attention')
        self.manual_alignments = tf.placeholder(tf.float32, shape=[None, None, None],
                                                name="manual_alignments")

        dec_prenet_outputs = DecoderPrenetWrapper(
            GRUCell(hp.attention_state_size), speaker_embed,
            is_training, hp.dec_prenet_sizes, hp.dropout_prob)

        if hp.attention_type == 'bah_mon':
            attention_mechanism = BahdanauMonotonicAttention(hp.attention_size, encoder_outputs)
        elif hp.attention_type == 'bah_norm':
            attention_mechanism = BahdanauAttention(hp.attention_size, encoder_outputs, normalize=True)
        elif hp.attention_type == 'luong_scaled':
            attention_mechanism = LuongAttention(hp.attention_size, encoder_outputs, scale=True)
        elif hp.attention_type == 'luong':
            attention_mechanism = LuongAttention(hp.attention_size, encoder_outputs)
        elif hp.attention_type == 'bah':
            attention_mechanism = BahdanauAttention(hp.attention_size, encoder_outputs)
        elif hp.attention_type.startswith('ntm2'):
            shift_width = int(hp.attention_type.split('-')[-1])
            attention_mechanism = NTMAttention2(hp.attention_size, encoder_outputs,
                                                shift_width=shift_width)
        else:
            raise Exception(" [!] Unknown attention type: {}".format(hp.attention_type))

        attention_cell = AttentionWrapper(
            dec_prenet_outputs, attention_mechanism,
            self.is_manual_attention, self.manual_alignments,
            initial_cell_state=attention_rnn_init_state,
            alignment_history=True,
            output_attention=False)

        # Concatenate attention context vector and RNN cell output into a
        # 2*attention_depth=512D vector.
        # [N, T_in, attention_size+attention_state_size]
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell, embed_to_concat=speaker_embed)

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell([
            OutputProjectionWrapper(concat_cell, hp.dec_rnn_size),
            ResidualWrapper(GRUCell(hp.dec_rnn_size)),
            ResidualWrapper(GRUCell(hp.dec_rnn_size)),
        ], state_is_tuple=True)  # [N, T_in, decoder_depth=256]

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.reduction_factor)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        if hp.model_type == "deepvoice":
            # decoder_init_state[0] : AttentionWrapperState
            #   = cell_state + attention + time + alignments + alignment_history
            # decoder_init_state[0][0] = attention_rnn_init_state (already applied)
            decoder_init_state = list(decoder_init_state)

            for idx, cell in enumerate(decoder_rnn_init_states):
                shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                shape2 = cell.get_shape().as_list()
                if shape1 != shape2:
                    raise Exception(" [!] Shape {} and {} should be equal".format(shape1, shape2))
                decoder_init_state[idx + 1] = cell

            decoder_init_state = tuple(decoder_init_state)

        if is_training:
            helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                        hp.reduction_factor, rnn_decoder_test_mode)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.reduction_factor)

        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry
        mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

        # Add post-processing CBHG:
        # [N, T_out, postnet_depth=256]
        post_outputs = cbhg(mel_outputs, None, is_training,
                            hp.post_bank_size, hp.post_bank_channel_size,
                            hp.post_maxpool_width, hp.post_highway_depth,
                            hp.post_rnn_size, hp.post_proj_sizes, hp.post_proj_width,
                            scope='post_cbhg')

        if speaker_embed is not None and hp.model_type == 'simple':
            expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
            tiled_speaker_embedding = tf.tile(expanded_speaker_emb,
                                              [1, tf.shape(post_outputs)[1], 1])
            # [N, T_out, 256 + alpha]
            post_outputs = tf.concat([tiled_speaker_embedding, post_outputs], axis=-1)

        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.speaker_id = speaker_id
        self.input_lengths = input_lengths
        self.loss_coeff = loss_coeff
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.final_decoder_state = final_decoder_state

        log('=' * 40)
        log(' model_type: %s' % hp.model_type)
        log('=' * 40)

        log('Initialized Tacotron model. Dimensions: ')
        log('  embedding:               %d' % embedded_inputs.shape[-1])
        if speaker_embed is not None:
            log('  speaker embedding:       %d' % speaker_embed.shape[-1])
        else:
            log('  speaker embedding:       None')
        log('  prenet out:              %d' % prenet_outputs.shape[-1])
        log('  encoder out:             %d' % encoder_outputs.shape[-1])
        log('  attention out:           %d' % attention_cell.output_size)
        log('  concat attn & out:       %d' % concat_cell.output_size)
        log('  decoder cell out:        %d' % decoder_cell.output_size)
        log('  decoder out (%d frames):  %d' % (hp.reduction_factor, decoder_outputs.shape[-1]))
        log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
        log('  postnet out:             %d' % post_outputs.shape[-1])
        log('  linear out:              %d' % linear_outputs.shape[-1])
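# `get_embed` is called above but not defined in these snippets. Judging from
# its call sites (one trainable vector per speaker, used to seed RNN states),
# it is most likely a thin embedding lookup. A sketch under that assumption:
def get_embed(speaker_id, num_speakers, dim, name):
    # Assumed implementation: a per-speaker trainable table whose rows are
    # looked up by speaker_id, e.g. to initialize encoder/decoder RNN states.
    embed_table = tf.get_variable(
        name, [num_speakers, dim], dtype=tf.float32,
        initializer=tf.truncated_normal_initializer(stddev=0.5))
    return tf.nn.embedding_lookup(embed_table, speaker_id)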
def initialize(self, inputs, input_lengths, num_speakers, speaker_id,
               mel_targets=None, linear_targets=None, loss_coeff=None,
               rnn_decoder_test_mode=False, is_randomly_initialized=False):
    # This ends up True at test time as well -- is that intended???
    is_training2 = linear_targets is not None
    is_training = not rnn_decoder_test_mode
    self.is_randomly_initialized = is_randomly_initialized

    with tf.variable_scope('inference') as scope:
        hp = self._hparams
        batch_size = tf.shape(inputs)[0]

        # Embeddings (256)
        char_embed_table = tf.get_variable(
            'embedding', [len(symbols), hp.embedding_size], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))

        zero_pad = True
        if zero_pad:
            # Logic borrowed from a transformer implementation:
            # the embedding for <PAD> (index 0) is fixed to zeros and never
            # updated during training, i.e. the first row (<PAD>) of the
            # variable created by get_variable above is never used.
            char_embed_table = tf.concat(
                (tf.zeros(shape=[1, hp.embedding_size]), char_embed_table[1:, :]), 0)

        # [N, T_in, embedding_size]
        char_embedded_inputs = tf.nn.embedding_lookup(char_embed_table, inputs)

        self.num_speakers = num_speakers
        if self.num_speakers > 1:
            if hp.speaker_embedding_size != 1:  # speaker_embedding_size = f(16)
                speaker_embed_table = tf.get_variable(
                    'speaker_embedding',
                    [self.num_speakers, hp.speaker_embedding_size], dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
                # [N, T_in, speaker_embedding_size]
                speaker_embed = tf.nn.embedding_lookup(speaker_embed_table, speaker_id)

            if hp.model_type == 'deepvoice':
                if hp.speaker_embedding_size == 1:
                    before_highway = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_prenet_sizes[-1],
                        "before_highway")  # 'enc_prenet_sizes': [f(256), f(128)]
                    encoder_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_rnn_size * 2, "encoder_rnn_init_state")
                    attention_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.attention_state_size, "attention_rnn_init_state")
                    decoder_rnn_init_states = [
                        get_embed(speaker_id, self.num_speakers, hp.dec_rnn_size,
                                  "decoder_rnn_init_states{}".format(idx + 1))
                        for idx in range(hp.dec_layer_num)]
                else:
                    deep_dense = lambda x, dim: tf.layers.dense(
                        x, dim, activation=tf.nn.softsign)  # softsign: x / (abs(x) + 1)

                    before_highway = deep_dense(speaker_embed, hp.enc_prenet_sizes[-1])
                    encoder_rnn_init_state = deep_dense(speaker_embed, hp.enc_rnn_size * 2)
                    attention_rnn_init_state = deep_dense(speaker_embed, hp.attention_state_size)
                    decoder_rnn_init_states = [
                        deep_dense(speaker_embed, hp.dec_rnn_size)
                        for _ in range(hp.dec_layer_num)]

                speaker_embed = None  # deepvoice does not use speaker_embed directly
            elif hp.model_type == 'simple':
                # The simple model feeds speaker_embed into DecoderPrenetWrapper
                # and ConcatOutputAndAttentionWrapper, concatenating it in each.
                before_highway = None
                encoder_rnn_init_state = None
                attention_rnn_init_state = None
                decoder_rnn_init_states = None
            else:
                raise Exception(" [!] Unknown multi-speaker model type: {}".format(hp.model_type))
        else:  # the self.num_speakers == 1 case
            speaker_embed = None
            before_highway = None
            encoder_rnn_init_state = None  # init state of the bidirectional GRU
            attention_rnn_init_state = None
            decoder_rnn_init_states = None

        ##############
        # Encoder
        ##############
        # [N, T_in, enc_prenet_sizes[-1]]
        prenet_outputs = prenet(
            char_embedded_inputs, is_training,
            hp.enc_prenet_sizes, hp.dropout_prob,
            scope='prenet')  # 'enc_prenet_sizes': [f(256), f(128)], dropout_prob = 0.5 ==> (N, T_in, 128)

        # enc_rnn_size = 128
        encoder_outputs = cbhg(prenet_outputs, input_lengths, is_training,
                               hp.enc_bank_size, hp.enc_bank_channel_size,
                               hp.enc_maxpool_width, hp.enc_highway_depth,
                               hp.enc_rnn_size, hp.enc_proj_sizes, hp.enc_proj_width,
                               scope="encoder_cbhg",
                               before_highway=before_highway,
                               encoder_rnn_init_state=encoder_rnn_init_state)

        ##############
        # Attention
        ##############
        # For manual control of attention
        self.is_manual_attention = tf.placeholder(tf.bool, shape=(), name='is_manual_attention')
        self.manual_alignments = tf.placeholder(tf.float32, shape=[None, None, None],
                                                name="manual_alignments")

        # single: attention_size = 128
        if hp.attention_type == 'bah_mon':
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, normalize=False)
        elif hp.attention_type == 'bah_mon_norm':  # added by hccho
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, normalize=True)
        elif hp.attention_type == 'loc_sen':  # location-sensitive attention
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'gmm':  # GMM attention
            attention_mechanism = GmmAttention(
                hp.attention_size, memory=encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'bah_mon_norm_hccho':
            attention_mechanism = BahdanauMonotonicAttention_hccho(
                hp.attention_size, encoder_outputs, normalize=True)
        elif hp.attention_type == 'bah_norm':
            attention_mechanism = BahdanauAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, normalize=True)
        elif hp.attention_type == 'luong_scaled':
            attention_mechanism = LuongAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, scale=True)
        elif hp.attention_type == 'luong':
            attention_mechanism = LuongAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'bah':
            attention_mechanism = BahdanauAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths)
        else:
            raise Exception(" [!] Unknown attention type: {}".format(hp.attention_type))

        # Combine the cell and attention_mechanism into an AttentionWrapper.
        # carpedm20 re-implemented AttentionWrapper from the TensorFlow sources,
        # whereas Keith Ito simply used the stock TensorFlow AttentionWrapper.
        # Note output_attention=False, and attention_layer_size is not set,
        # so the attention output is the raw context vector.
        attention_cell = AttentionWrapper(
            GRUCell(hp.attention_state_size), attention_mechanism,
            self.is_manual_attention, self.manual_alignments,
            initial_cell_state=attention_rnn_init_state,
            alignment_history=True,
            output_attention=False)

        # attention_state_size = 256
        dec_prenet_outputs = DecoderPrenetWrapper(
            attention_cell, speaker_embed, is_training,
            hp.dec_prenet_sizes, hp.dropout_prob)  # dec_prenet_sizes = [f(256), f(128)]

        # Concatenate attention context vector and RNN cell output into a 512D vector.
        # [N, T_in, attention_size+attention_state_size]
        # From the AttentionWrapperState that dec_prenet_outputs passes to the
        # next cell (attention, cell_state, ...), concatenate attention and
        # output and emit that as the new output. Since the output equals the
        # cell_state, this is concat [ output(=cell_state) | attention ].
        concat_cell = ConcatOutputAndAttentionWrapper(
            dec_prenet_outputs,
            embed_to_concat=speaker_embed)  # builds a new output as concat(output, attention, speaker_embed)

        # Decoder (layers specified bottom to top): dec_rnn_size = 256
        cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)
                 ]  # OutputProjectionWrapper does not seem to be mentioned in the paper...
        for _ in range(hp.dec_layer_num):  # hp.dec_layer_num = 2
            cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size)))

        # [N, T_in, 256]
        decoder_cell = MultiRNNCell(cells, state_is_tuple=True)

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        # Could this be modified to also emit a stop token, i.e.
        # (hp.num_mels + 1) * hp.reduction_factor???
        output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.reduction_factor)

        # Calling zero_state here also picks up the value already supplied to
        # the AttentionWrapper above.
        decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        if hp.model_type == "deepvoice":
            # decoder_init_state[0] : AttentionWrapperState
            #   = cell_state + attention + time + alignments + alignment_history
            # decoder_init_state[0][0] = attention_rnn_init_state (already applied:
            #   initial_cell_state was already passed to the AttentionWrapper)
            decoder_init_state = list(decoder_init_state)

            for idx, cell in enumerate(decoder_rnn_init_states):
                shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                shape2 = cell.get_shape().as_list()
                if shape1 != shape2:
                    raise Exception(" [!] Shape {} and {} should be equal".format(shape1, shape2))
                decoder_init_state[idx + 1] = cell

            decoder_init_state = tuple(decoder_init_state)

        if is_training2:
            # rnn_decoder_test_mode = True in test mode, False in train mode;
            # inputs is only used to compute the batch size.
            helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                        hp.reduction_factor, rnn_decoder_test_mode)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.reduction_factor)

        (decoder_outputs, _), final_decoder_state, _ = \
            tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(output_cell, helper, decoder_init_state),
                maximum_iterations=hp.max_iters)  # max_iters=200

        # [N, T_out, M]
        mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])

        # Add post-processing CBHG:
        # [N, T_out, 256]
        # post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
        post_outputs = cbhg(mel_outputs, None, is_training,
                            hp.post_bank_size, hp.post_bank_channel_size,
                            hp.post_maxpool_width, hp.post_highway_depth,
                            hp.post_rnn_size, hp.post_proj_sizes, hp.post_proj_width,
                            scope='post_cbhg')

        if speaker_embed is not None and hp.model_type == 'simple':
            expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
            tiled_speaker_embedding = tf.tile(expanded_speaker_emb,
                                              [1, tf.shape(post_outputs)[1], 1])
            # [N, T_out, 256 + alpha]
            post_outputs = tf.concat([tiled_speaker_embedding, post_outputs], axis=-1)

        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F(1025)]

        # Grab alignments from the final decoder state.
        # Because the MultiRNNCell has 3 layers, final_decoder_state is a tuple
        # of length 3 ==> use final_decoder_state[0].
        alignments = tf.transpose(
            final_decoder_state[0].alignment_history.stack(),
            [1, 2, 0])  # [batch_size, text length (encoder), target length (decoder)]

        self.inputs = inputs
        self.speaker_id = speaker_id
        self.input_lengths = input_lengths
        self.loss_coeff = loss_coeff
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.final_decoder_state = final_decoder_state

        log('=' * 40)
        log(' model_type: %s' % hp.model_type)
        log('=' * 40)

        log('Initialized Tacotron model. Dimensions: ')
        log('  embedding:               %d' % char_embedded_inputs.shape[-1])
        if speaker_embed is not None:
            log('  speaker embedding:       %d' % speaker_embed.shape[-1])
        else:
            log('  speaker embedding:       None')
        log('  prenet out:              %d' % prenet_outputs.shape[-1])
        log('  encoder out:             %d' % encoder_outputs.shape[-1])
        log('  attention out:           %d' % attention_cell.output_size)
        log('  concat attn & out:       %d' % concat_cell.output_size)
        log('  decoder cell out:        %d' % decoder_cell.output_size)
        log('  decoder out (%d frames):  %d' % (hp.reduction_factor, decoder_outputs.shape[-1]))
        log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
        log('  postnet out:             %d' % post_outputs.shape[-1])
        log('  linear out:              %d' % linear_outputs.shape[-1])
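# The comment in the function above asks whether the output projection could
# also emit a stop token. A fragment sketching that idea (purely illustrative,
# not in the source; `decoder_cell`, `decoder_outputs`, `batch_size`, and `hp`
# refer to the corresponding names in the function above):
output_cell = OutputProjectionWrapper(decoder_cell, (hp.num_mels + 1) * hp.reduction_factor)
# ... run dynamic_decode as before, then split the widened frames:
frames = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels + 1])
mel_outputs = frames[:, :, :hp.num_mels]        # [N, T_out, M]
stop_token_logits = frames[:, :, hp.num_mels]   # [N, T_out]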
def decoder(inputs, encoder_outputs, is_training, batch_size, mel_targets):
    """
    Decoder
    Prenet -> Attention RNN
    Postprocessing CBHG

    @param inputs           int32 Tensor with shape [N, T_in] where N is batch size, T_in is
                            number of steps in the input time series, and values are character IDs
    @param encoder_outputs  outputs from the encoder with shape [N, T_in, prenet_depth=256]
    @param is_training      flag for training or eval
    @param batch_size       number of samples per batch
    @param mel_targets      float32 Tensor with shape [N, T_out, M] where N is batch size,
                            T_out is number of steps in the output time series, M is num_mels,
                            and values are entries in the mel spectrogram

    @return linear_outputs, mel_outputs and alignments
    """
    if is_training:
        helper = TacoTrainingHelper(inputs, mel_targets, hparams.num_mels, hparams.outputs_per_step)
    else:
        helper = TacoTestHelper(batch_size, hparams.num_mels, hparams.outputs_per_step)

    # Attention
    attention_cell = AttentionWrapper(
        GRUCell(hparams.attention_depth),
        BahdanauAttention(hparams.attention_depth, encoder_outputs),
        alignment_history=True,
        output_attention=False)  # [N, T_in, attention_depth=256]

    # Apply prenet before concatenation in AttentionWrapper.
    attention_cell = DecoderPrenetWrapper(attention_cell, is_training, hparams.prenet_depths)

    # Concatenate attention context vector and RNN cell output into a
    # 2*attention_depth=512D vector.
    concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)  # [N, T_in, 2*attention_depth=512]

    # Decoder (layers specified bottom to top):
    decoder_cell = MultiRNNCell([
        OutputProjectionWrapper(concat_cell, hparams.decoder_depth),
        ResidualWrapper(GRUCell(hparams.decoder_depth)),
        ResidualWrapper(GRUCell(hparams.decoder_depth))
    ], state_is_tuple=True)  # [N, T_in, decoder_depth=256]

    # Project onto r mel spectrograms (predict r outputs at each RNN step):
    output_cell = OutputProjectionWrapper(decoder_cell,
                                          hparams.num_mels * hparams.outputs_per_step)
    decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

    (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
        BasicDecoder(output_cell, helper, decoder_init_state),
        maximum_iterations=hparams.max_iters)  # [N, T_out/r, M*r]

    # Reshape outputs to be one output per entry
    mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hparams.num_mels])  # [N, T_out, M]

    # Add post-processing CBHG:
    post_outputs = post_cbhg(mel_outputs, hparams.num_mels, is_training,  # [N, T_out, postnet_depth=256]
                             hparams.postnet_depth)
    linear_outputs = tf.layers.dense(post_outputs, hparams.num_freq)  # [N, T_out, F]

    # Grab alignments from the final decoder state:
    alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

    log('Decoder Network ...')
    log('  attention out:           %d' % attention_cell.output_size)
    log('  concat attn & out:       %d' % concat_cell.output_size)
    log('  decoder cell out:        %d' % decoder_cell.output_size)
    log('  decoder out (%d frames):  %d' % (hparams.outputs_per_step, decoder_outputs.shape[-1]))
    log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
    log('  postnet out:             %d' % post_outputs.shape[-1])
    log('  linear out:              %d' % linear_outputs.shape[-1])

    return linear_outputs, mel_outputs, alignments
def initialize(self, text_inputs, input_lengths, speaker_ids, mel_targets=None, linear_targets=None): '''Initializes the model for inference. Sets "mel_outputs", "linear_outputs", and "alignments" fields. Args: text_inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. speaker_ids: int32 Tensor containing ids of specific speakers mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number of steps in the output time series, F is num_freq, and values are entries in the linear spectrogram. Only needed for training. ''' with tf.variable_scope('inference'): is_training = linear_targets is not None batch_size = tf.shape(text_inputs)[0] hp = self._hparams vocab_size = len(symbols) embedded_inputs = embedding(text_inputs, vocab_size, hp.embedding_dim) # [N, T_in, embd_size] # extract speaker embedding if multi-speaker with tf.variable_scope('speaker'): if hp.num_speakers > 1: speaker_embedding = tf.get_variable('speaker_embed', shape=(hp.num_speakers, hp.speaker_embed_dim), dtype=tf.float32) # TODO: what about special initializer=tf.truncated_normal_initializer(stddev=0.5)? speaker_embd = tf.nn.embedding_lookup(speaker_embedding, speaker_ids) else: speaker_embd = None # Encoder prenet_outputs = prenet(inputs=embedded_inputs, drop_rate=hp.drop_rate if is_training else 0.0, is_training=is_training, layer_sizes=hp.encoder_prenet, scope="prenet") # [N, T_in, 128] encoder_outputs = cbhg(prenet_outputs, input_lengths, speaker_embd=speaker_embd, is_training=is_training, K=hp.encoder_cbhg_banks, c=hp.encoder_cbhg_bank_sizes, # [N, T_in, 256] scope='encoder_cbhg') # Attention Mechanism attention_cell = attention_decoder(encoder_outputs, hp.attention_dim, input_lengths, is_training, speaker_embd=speaker_embd, attention_type=hp.attention_type) # Decoder (layers specified bottom to top): decoder_cell = MultiRNNCell([ OutputProjectionWrapper(attention_cell, hp.decoder_dim), # 256 ResidualWrapper(GRUCell(hp.decoder_dim)), # 256 ResidualWrapper(GRUCell(hp.decoder_dim)) # 256 ], state_is_tuple=True) # [N, T_in, 256] # Project onto r mel spectrograms (predict r outputs at each RNN step): output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step) decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32) if is_training: helper = TacoTrainingHelper(text_inputs, mel_targets, hp.num_mels, hp.outputs_per_step) else: helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step) (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(output_cell, helper, decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out/r, M*r] # Reshape outputs to be one output per entry mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels]) # [N, T_out, M] # Add post-processing post_outputs = cbhg(mel_outputs, None, speaker_embd=None, is_training=is_training, K=hp.post_cbhg_banks, c=hp.post_cbhg_bank_sizes + [hp.num_mels], scope='post_cbhg') # [N, T_out, 256] linear_outputs = tf.layers.dense(post_outputs, hp.num_freq) # [N, T_out, F] # Grab alignments from the final 
decoder state: alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) self.inputs = text_inputs self.input_lengths = input_lengths self.mel_outputs = mel_outputs self.linear_outputs = linear_outputs self.audio = audio.inv_spectrogram_tensorflow(linear_outputs) self.alignments = alignments self.mel_targets = mel_targets self.linear_targets = linear_targets log('Initialized Tacotron model. Dimensions: ') log(' embedding: %d' % embedded_inputs.shape[-1]) log(' prenet out: %d' % prenet_outputs.shape[-1]) log(' encoder out: %d' % encoder_outputs.shape[-1]) # TODO: later work around for getting info back? # log(' attention out: %d' % attention_cell.output_size) log(' concat attn & out: %d' % attention_cell.output_size) log(' decoder cell out: %d' % decoder_cell.output_size) log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1])) log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) log(' postnet out: %d' % post_outputs.shape[-1]) log(' linear out: %d' % linear_outputs.shape[-1])
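To make the reduction-factor bookkeeping above concrete, here is a minimal numpy sketch (illustrative only, not part of the model code) of how the [N, T_out/r, M*r] decoder output reshapes into one mel frame per step:

# The decoder emits r stacked mel frames per RNN step; a plain reshape
# recovers one frame per row. Toy sizes, numpy only.
import numpy as np

N, M, r, steps = 2, 3, 2, 4  # batch, num_mels, outputs_per_step, decoder steps
decoder_out = np.arange(N * steps * M * r).reshape(N, steps, M * r).astype(np.float32)
mel = decoder_out.reshape(N, -1, M)  # [N, steps*r, M]
assert mel.shape == (N, steps * r, M)
# frame t of item n lives at decoder_out[n, t // r, (t % r)*M : (t % r + 1)*M]
t, n = 3, 0
assert np.allclose(mel[n, t], decoder_out[n, t // r, (t % r) * M:(t % r + 1) * M])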
def decode(self, encoder_outputs, batch_size):
    # Attention
    attention_cell = AttentionWrapper(
        DecoderPrenetWrapper(GRUCell(self._hparams.get('attention_depth')),
                             self._is_training,
                             self._hparams.get('prenet_depths')),
        BahdanauAttention(self._hparams.get('attention_depth'), encoder_outputs),
        alignment_history=True,
        output_attention=False)  # [N, T_in, attention_depth=256]

    # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
    concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)  # [N, T_in, 2*attention_depth=512]

    # Decoder (layers specified bottom to top):
    decoder_cell = MultiRNNCell([
        OutputProjectionWrapper(concat_cell, self._hparams.get('decoder_depth')),
        ResidualWrapper(GRUCell(self._hparams.get('decoder_depth'))),
        ResidualWrapper(GRUCell(self._hparams.get('decoder_depth')))
    ], state_is_tuple=True)  # [N, T_in, decoder_depth=256]

    # Project onto r mel spectrograms (predict r outputs at each RNN step):
    output_cell = OutputProjectionWrapper(
        decoder_cell,
        self._hparams.get('num_mels') * self._hparams.get('outputs_per_step'))
    decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

    (decoder_outputs, _), final_decoder_state, _ = dynamic_decode(
        BasicDecoder(output_cell, self._helper, decoder_init_state),
        maximum_iterations=self._hparams.get('max_iters'))  # [N, T_out/r, M*r]

    mel_outputs = tf.reshape(decoder_outputs,
                             [batch_size, -1, self._hparams.get('num_mels')])

    # Post-processing CBHG
    kwargs = {
        'K': self._hparams.get('decoder_K'),
        'bank_num_filters': self._hparams.get('decoder_bank_num_filters'),
        'pooling_stride': self._hparams.get('decoder_pooling_stride'),
        'pooling_width': self._hparams.get('decoder_pooling_width'),
        'proj_num_filters': self._hparams.get('decoder_proj_num_filters'),
        'proj_filter_width': self._hparams.get('decoder_proj_filter_width'),
        'num_highway_layers': self._hparams.get('decoder_num_highway_layers'),
        'highway_depth': self._hparams.get('decoder_highway_depth'),
        'gru_num_cells': self._hparams.get('decoder_gru_num_cells')
    }
    post_out = cbhg(mel_outputs, None, self._is_training, 'post_cbhg', **kwargs)
    lin_outputs = tf.layers.dense(post_out, self._hparams.get('num_freq'))

    return mel_outputs, lin_outputs, final_decoder_state
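The ConcatOutputAndAttentionWrapper used above simply concatenates the RNN cell output with the attention context vector. A numpy stand-in for the shape arithmetic, assuming attention_depth=256 (illustrative, not the TF wrapper itself):

import numpy as np

attention_depth = 256
cell_output = np.random.randn(4, attention_depth)        # [N, 256]
attention_context = np.random.randn(4, attention_depth)  # [N, 256]
concat = np.concatenate([cell_output, attention_context], axis=-1)
assert concat.shape == (4, 2 * attention_depth)          # [N, 512]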
def attention_decoder(inputs, memory, num_units=None, is_training=True, alignment_history=True, scope="attention_decoder", reuse=None):
    '''Applies a GRU to `inputs`, while attending `memory`.

    Args:
      inputs: A 3d tensor with shape of [N, T', C']. Decoder inputs.
      num_units: An int. Attention size.
      memory: A 3d tensor with shape of [N, T, C]. Outputs of encoder network.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer by the same name.

    Returns:
      A tuple of the decoder outputs (a 3d tensor with shape [N, T', out_dim]) and
      the AttentionWrapper final state.
    '''
    # Assumes a module-level hparams object `hp`, as elsewhere in this file.
    with tf.variable_scope(scope, reuse=reuse):
        if num_units is None:
            num_units = inputs.get_shape().as_list()[-1]
        batch_size = inputs.get_shape().as_list()[0]
        attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(num_units, memory)
        decoder_cell = DecoderPrenetWrapper(GRUCell(num_units), is_training)
        cell_with_attention = tf.contrib.seq2seq.AttentionWrapper(
            decoder_cell,
            attention_mechanism,
            num_units,
            alignment_history=alignment_history,
            output_attention=False)

        # Concatenate attention context vector and RNN cell output into a 512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(cell_with_attention)
        decoder_cell = MultiRNNCell([
            OutputProjectionWrapper(concat_cell, num_units),
            ResidualWrapper(GRUCell(num_units)),
            ResidualWrapper(GRUCell(num_units))
        ], state_is_tuple=True)

        # Outputs => (N, T', hp.n_mels*hp.r)
        out_dim = inputs.get_shape().as_list()[-1]
        output_cell = OutputProjectionWrapper(decoder_cell, out_dim)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        if is_training:
            helper = TacoTrainingHelper(memory, inputs, hp.n_mels * hp.r, hp.r)
        else:
            helper = TacoTestHelper(batch_size, hp.n_mels * hp.r, hp.r)

        (decoder_outputs, _), final_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry
        # mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.n_mels])  # [N, T_out, M]
        return decoder_outputs, final_state[0]
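For intuition, a numpy sketch of the additive (Bahdanau) score that BahdanauAttention computes internally; the weight names W_q, W_k and v here are hypothetical stand-ins for TF's internal variables:

import numpy as np

T, C, A = 5, 8, 4                       # memory length, encoder dim, attention units
memory = np.random.randn(T, C)          # encoder outputs for one utterance
query = np.random.randn(C)              # current decoder cell state
W_q, W_k = np.random.randn(A, C), np.random.randn(A, C)
v = np.random.randn(A)
scores = np.tanh(memory @ W_k.T + query @ W_q.T) @ v  # [T] additive scores
weights = np.exp(scores - scores.max())
weights /= weights.sum()                # softmax over encoder steps
context = weights @ memory              # [C] attention context vector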
class Tacotron(): def __init__(self, hparams): self._hparams = hparams def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None): '''Initializes the model for inference. Sets "mel_outputs", "linear_outputs", and "alignments" fields. Args: inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number of steps in the output time series, F is num_freq, and values are entries in the linear spectrogram. Only needed for training. ''' with tf.variable_scope('inference') as scope: is_training = linear_targets is not None self.batch_size = tf.shape(inputs)[0] hp = self._hparams # Embeddings embedding_table = tf.get_variable( 'embedding', [len(symbols), 256], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) # [N, T_in, 256] # Encoder prenet_outputs = prenet(embedded_inputs, is_training) # [N, T_in, 128] encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training) # [N, T_in, 256] # Attention attention_cell = AttentionWrapper( DecoderPrenetWrapper(GRUCell(256), is_training), BahdanauAttention(256, encoder_outputs), alignment_history=True, output_attention=False) # [N, T_in, 256] # Concatenate attention context vector and RNN cell output into a 512D vector. concat_cell = ConcatOutputAndAttentionWrapper( attention_cell) # [N, T_in, 512] # Decoder (layers specified bottom to top): decoder_cell = MultiRNNCell([ OutputProjectionWrapper(concat_cell, 256), ResidualWrapper(GRUCell(256)), ResidualWrapper(GRUCell(256)) ], state_is_tuple=True) # [N, T_in, 256] # Project onto r mel spectrograms (predict r outputs at each RNN step): self.output_cell = OutputProjectionWrapper( decoder_cell, hp.num_mels * hp.outputs_per_step) self.decoder_init_state = self.output_cell.zero_state( batch_size=self.batch_size, dtype=tf.float32) if is_training: self.helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step) else: self.helper = TacoTestHelper(self.batch_size, hp.num_mels, hp.outputs_per_step) (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(self.output_cell, self.helper, self.decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out/r, M*r] # Reshape outputs to be one output per entry mel_outputs = tf.reshape( decoder_outputs, [self.batch_size, -1, hp.num_mels]) # [N, T_out, M] # Add post-processing CBHG: post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training) # [N, T_out, 256] linear_outputs = tf.layers.dense(post_outputs, hp.num_freq) # [N, T_out, F] # Grab alignments from the final decoder state: alignments = tf.transpose( final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) self.inputs = inputs self.input_lengths = input_lengths self.mel_outputs = mel_outputs self.linear_outputs = linear_outputs self.alignments = alignments self.mel_targets = mel_targets self.linear_targets = linear_targets log('Initialized Tacotron model. 
Dimensions: ') log(' embedding: %d' % embedded_inputs.shape[-1]) log(' prenet out: %d' % prenet_outputs.shape[-1]) log(' encoder out: %d' % encoder_outputs.shape[-1]) log(' attention out: %d' % attention_cell.output_size) log(' concat attn & out: %d' % concat_cell.output_size) log(' decoder cell out: %d' % decoder_cell.output_size) log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1])) log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) log(' postnet out: %d' % post_outputs.shape[-1]) log(' linear out: %d' % linear_outputs.shape[-1]) def update(self, hparams): with tf.variable_scope('inference') as scope: self._hparams = hparams hp = self._hparams (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(self.output_cell, self.helper, self.decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out/r, M*r] # Reshape outputs to be one output per entry mel_outputs = tf.reshape( decoder_outputs, [self.batch_size, -1, hp.num_mels]) # [N, T_out, M] # Add post-processing CBHG: post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training=False, is_updating=True) # [N, T_out, 256] linear_outputs = tf.layers.dense(post_outputs, hp.num_freq, reuse=True) # [N, T_out, F] # Grab alignments from the final decoder state: alignments = tf.transpose( final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) self.mel_outputs = mel_outputs self.linear_outputs = linear_outputs self.alignments = alignments log('Updated Tacotron model.') def add_loss(self): '''Adds loss to the model. Sets "loss" field. initialize must have been called.''' with tf.variable_scope('loss') as scope: hp = self._hparams self.mel_loss = tf.reduce_mean( tf.abs(self.mel_targets - self.mel_outputs)) l1 = tf.abs(self.linear_targets - self.linear_outputs) # Prioritize loss for frequencies under 3000 Hz. n_priority_freq = int(3000 / (hp.sample_rate * 0.5) * hp.num_freq) self.linear_loss = 0.5 * tf.reduce_mean(l1) + 0.5 * tf.reduce_mean( l1[:, :, 0:n_priority_freq]) self.loss = self.mel_loss + self.linear_loss def add_optimizer(self, global_step): '''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called. Args: global_step: int32 scalar Tensor representing current global step in training ''' with tf.variable_scope('optimizer') as scope: hp = self._hparams if hp.decay_learning_rate: self.learning_rate = _learning_rate_decay( hp.initial_learning_rate, global_step) else: self.learning_rate = tf.convert_to_tensor( hp.initial_learning_rate) optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.adam_beta1, hp.adam_beta2) gradients, variables = zip(*optimizer.compute_gradients(self.loss)) self.gradients = gradients clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0) # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See: # https://github.com/tensorflow/tensorflow/issues/1122 with tf.control_dependencies( tf.get_collection(tf.GraphKeys.UPDATE_OPS)): self.optimize = optimizer.apply_gradients( zip(clipped_gradients, variables), global_step=global_step)
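The priority weighting in add_loss can be checked in isolation. A numpy sketch of the same arithmetic, assuming linearly spaced bins from 0 to sample_rate/2 (the hparam values below are examples, not from any specific config):

import numpy as np

sample_rate, num_freq = 20000, 1025
n_priority_freq = int(3000 / (sample_rate * 0.5) * num_freq)  # = 307 bins below ~3 kHz
targets = np.random.randn(2, 10, num_freq)
outputs = np.random.randn(2, 10, num_freq)
l1 = np.abs(targets - outputs)
# Plain L1 over all bins plus L1 over the low-frequency bins, each weighted 0.5.
linear_loss = 0.5 * l1.mean() + 0.5 * l1[:, :, :n_priority_freq].mean()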
def __init__(self, hparams, is_training=False, with_target=True, reuse=False):
    self.with_target = with_target
    self.hparams = hparams
    self.is_training = is_training
    self.inputs = tf.placeholder(tf.int32, (None, None), name='graphemes_ph')
    self.input_lengths = tf.placeholder(tf.int32, [None], name='grapheme_seq_len_ph')
    if with_target:
        self.targets = tf.placeholder(tf.int32, (None, None), name='phonemes_ph')
        self.target_lengths = tf.placeholder(tf.int32, [None], name='phoneme_seq_len_ph')
    with tf.variable_scope('g2p', reuse=reuse):
        embedding_table = tf.get_variable(
            'embedding', [hparams.graphemes_num, hparams.embedding_dim],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        outputs = tf.nn.embedding_lookup(embedding_table, self.inputs)
        if hparams.with_conv:
            for i in range(hparams.conv_num):
                outputs = conv1d(outputs, hparams.conv_width, hparams.conv_channels,
                                 tf.nn.relu, is_training, hparams.dropout_rate,
                                 'conv_%d' % i)
        forward_cell = rnn_cell(hparams.encoder_lstm_units // 2, hparams, is_training)
        backward_cell = rnn_cell(hparams.encoder_lstm_units // 2, hparams, is_training)
        outputs, encoder_state = tf.nn.bidirectional_dynamic_rnn(
            forward_cell, backward_cell, outputs,
            sequence_length=self.input_lengths,
            dtype=tf.float32,
            scope='bilstm')
        # Concatenate forward and backward outputs:
        encoder_outputs = tf.concat(outputs, axis=2)
        decoder_cell = MultiRNNCell([
            rnn_cell(hparams.decoder_lstm_units, hparams, is_training),
            rnn_cell(hparams.decoder_lstm_units, hparams, is_training)
        ], state_is_tuple=True)
        decoder_embeddings = tf.get_variable(
            name='decoder_embeddings',
            shape=[hparams.phonemes_num, hparams.decoder_embedding_dim],
            dtype=tf.float32)
        if is_training:
            batch_size = tf.shape(self.inputs)[0]
            attention_cell = self.create_attention_cell(
                hparams.attention_depth, encoder_outputs, self.input_lengths,
                decoder_cell, alignment_history=False)
            attention_cell = OutputProjectionWrapper(attention_cell, hparams.phonemes_num)
            targets_shifted = self.targets[:, :-1]
            targets_emb = tf.nn.embedding_lookup(decoder_embeddings, targets_shifted)
            helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=targets_emb, sequence_length=self.target_lengths)
            # decoder_initial_state = attention_cell.zero_state(batch_size, tf.float32).clone(cell_state=encoder_state)
            decoder_initial_state = attention_cell.zero_state(batch_size, tf.float32)
            decoder = tf.contrib.seq2seq.BasicDecoder(attention_cell, helper,
                                                      decoder_initial_state)
            outputs, final_state, _ = tf.contrib.seq2seq.dynamic_decode(decoder)
            self.decoded_best = tf.identity(outputs.sample_id, name='predicted_1best')
            self.logits = outputs.rnn_output
            self.probs = tf.nn.softmax(self.logits, name='probs')
        else:
            if self.hparams.beam_width == 1:
                batch_size = tf.shape(self.inputs)[0]
                attention_cell = self.create_attention_cell(
                    hparams.attention_depth, encoder_outputs, self.input_lengths,
                    decoder_cell, alignment_history=False)
                attention_cell = OutputProjectionWrapper(attention_cell, hparams.phonemes_num)
                helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                    embedding=decoder_embeddings,
                    start_tokens=tf.fill([batch_size], hparams.phonemes_num - 2),
                    end_token=hparams.phonemes_num - 1)
                decoder_initial_state = attention_cell.zero_state(batch_size, tf.float32)
                decoder = tf.contrib.seq2seq.BasicDecoder(attention_cell, helper,
                                                          decoder_initial_state)
                outputs, final_state, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder, maximum_iterations=self.hparams.max_phoneme_seq_len)
                self.decoded_best = tf.identity(outputs.sample_id, name='predicted_1best')
                self.logits = outputs.rnn_output
                self.probs = tf.nn.softmax(self.logits, name='probs')
            else:
                batch_size = tf.shape(self.inputs)[0]
                start_tokens = tf.fill([batch_size], hparams.phonemes_num - 2)
                batch_size = batch_size * hparams.beam_width
                encoder_outputs = tf.contrib.seq2seq.tile_batch(
                    encoder_outputs, multiplier=hparams.beam_width)
                input_lengths_tile = tf.contrib.seq2seq.tile_batch(
                    self.input_lengths, multiplier=hparams.beam_width)
                encoder_state = tf.contrib.seq2seq.tile_batch(
                    encoder_state, multiplier=hparams.beam_width)
                attention_cell = self.create_attention_cell(
                    hparams.attention_depth, encoder_outputs, input_lengths_tile,
                    decoder_cell, alignment_history=False)
                attention_cell = OutputProjectionWrapper(attention_cell, hparams.phonemes_num)
                decoder_initial_state = attention_cell.zero_state(batch_size, tf.float32)
                decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                    cell=attention_cell,
                    embedding=decoder_embeddings,
                    start_tokens=start_tokens,
                    end_token=hparams.phonemes_num - 1,
                    initial_state=decoder_initial_state,
                    beam_width=hparams.beam_width,
                    output_layer=None,
                    length_penalty_weight=hparams.length_penalty)
                outputs, final_state, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder, maximum_iterations=hparams.max_iters)
                self.logits = tf.no_op()
                print('**Warning! You will not be able to build a lattice with beam_width > 1')
                self.probs = tf.no_op()
                # best beam
                self.decoded_best = tf.identity(outputs.predicted_ids[:, :, 0],
                                                name='predicted_1best')
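tile_batch, used in the beam-search branch above, repeats each batch item beam_width times along axis 0, so the decoder sees an "effective batch" of N * beam_width. A numpy equivalent for intuition only:

import numpy as np

beam_width = 3
encoder_outputs = np.array([[[1.0, 1.0]], [[2.0, 2.0]]])  # [N=2, T=1, C=2]
tiled = np.repeat(encoder_outputs, beam_width, axis=0)    # [N*beam_width, T, C]
assert tiled.shape == (2 * beam_width, 1, 2)
assert np.allclose(tiled[0], tiled[2])  # the first item's three beams share its memory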
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None, pml_targets=None, is_training=False, gta=False, eal=False, locked_alignments=None, logs_enabled=True, flag_trainAlign=False, flag_trainJoint=False, alignScale=1.0): '''Initializes the model for inference. Sets "mel_outputs", "linear_outputs", and "alignments" fields. Args: inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number of steps in the output time series, F is num_freq, and values are entries in the linear spectrogram. Only needed for training. pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is number of steps in the PML vocoder features trajectories, P is pml_dimension, and values are PML vocoder features. Only needed for training. is_training: boolean flag that is set to True during training gta: boolean flag that is set to True when ground truth alignment is required locked_alignments: when explicit attention alignment is required, the locked alignments are passed in this parameter and the attention alignments are locked to these values logs_enabled: boolean flag that defaults to True, if False no construction logs output ''' # fix the alignments shape to (batch_size, encoder_steps, decoder_steps) if not already including # batch dimension locked_alignments_ = locked_alignments self.flag_trainAlign = flag_trainAlign self.flag_trainJoint = flag_trainJoint self.alignScale = alignScale if locked_alignments_ is not None: if is_training and eal: pass elif np.ndim(locked_alignments_) < 3: locked_alignments_ = np.expand_dims(locked_alignments_, 0) with tf.variable_scope('inference') as scope: batch_size = tf.shape(inputs)[0] hp = self._hparams # Embeddings embedding_table = tf.get_variable( 'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) embedded_inputs = tf.nn.embedding_lookup( embedding_table, inputs) # [N, T_in, embed_depth=256] # Encoder prenet_outputs = prenet( embedded_inputs, is_training, hp.prenet_depths) # [N, T_in, prenet_depths[-1]=128] encoder_outputs = encoder_cbhg( prenet_outputs, input_lengths, is_training, # [N, T_in, encoder_depth=256] hp.encoder_depth) # Attention attention_cell = LockableAttentionWrapper( GRUCell(hp.attention_depth), LocationSensitiveAttention(hp.attention_depth, encoder_outputs), alignment_history=True, locked_alignments=locked_alignments_, output_attention=False, name='attention_wrapper', flag_trainAlign=self.flag_trainAlign, flag_trainJoint=self.flag_trainJoint ) # [N, T_in, attention_depth=256] # Apply prenet before concatenation in AttentionWrapper. prenet_cell = DecoderPrenetWrapper(attention_cell, is_training, hp.prenet_depths) # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector. 
concat_cell = ConcatOutputAndAttentionWrapper( prenet_cell) # [N, T_in, 2*attention_depth=512] # Decoder (layers specified bottom to top): decoder_cell = MultiRNNCell( [ OutputProjectionWrapper(concat_cell, hp.decoder_depth), ResidualWrapper(GRUCell(hp.decoder_depth)), ResidualWrapper(GRUCell(hp.decoder_depth)) ], state_is_tuple=True) # [N, T_in, decoder_depth=256] # Project onto r PML feature vectors (predict r outputs at each RNN step): output_cell = OutputProjectionWrapper( decoder_cell, hp.pml_dimension * hp.outputs_per_step) decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32) if is_training: if gta: helper = TacoTrainingHelper(inputs, pml_targets, hp.pml_dimension, hp.outputs_per_step) elif eal: helper = TacoTrainingHelper_EAL(inputs, pml_targets, hp.pml_dimension, hp.outputs_per_step) elif hp.scheduled_sampling: helper = TacoScheduledOutputTrainingHelper( inputs, pml_targets, hp.pml_dimension, hp.outputs_per_step, hp.scheduled_sampling_probability) else: log('For training, one of these should be true: gta, eal, hp.scheduled_sampling' ) else: if gta: helper = TacoTrainingHelper(inputs, pml_targets, hp.pml_dimension, hp.outputs_per_step) elif eal: helper = TacoTrainingHelper_EAL(inputs, pml_targets, hp.pml_dimension, hp.outputs_per_step) else: helper = TacoTestHelper(batch_size, hp.pml_dimension, hp.outputs_per_step) (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(output_cell, helper, decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out/r, P*r] # Reshape outputs to be one output per entry pml_intermediates = tf.reshape( decoder_outputs, [batch_size, -1, hp.pml_dimension]) # [N, T_out, P] # Add post-processing CBHG: post_outputs = post_cbhg( pml_intermediates, hp.pml_dimension, is_training, # [N, T_out, postnet_depth=256] hp.postnet_depth) pml_outputs = tf.layers.dense(post_outputs, hp.pml_dimension) # [N, T_out, P] # Grab alignments from the final decoder state: alignments = tf.transpose( final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) self.inputs = inputs self.input_lengths = input_lengths self.pml_intermediates = pml_intermediates self.pml_outputs = pml_outputs self.alignments = alignments self.pml_targets = pml_targets self.attention_cell = attention_cell self.locked_alignments = locked_alignments_ if logs_enabled: log('Initialized Tacotron model. Dimensions: ') log(' Train mode: {}'.format(is_training)) log(' GTA mode: {}'.format(gta)) log(' EAL mode: {}'.format(eal)) log(' Embedding: {}'.format( embedded_inputs.shape[-1])) log(' Prenet out: {}'.format( prenet_outputs.shape[-1])) log(' Encoder out: {}'.format( encoder_outputs.shape[-1])) log(' Attention out: {}'.format( attention_cell.output_size)) log(' Concat attn & out: {}'.format( concat_cell.output_size)) log(' Decoder cell out: {}'.format( decoder_cell.output_size)) log(' Decoder out ({} frames): {}'.format( hp.outputs_per_step, decoder_outputs.shape[-1])) log(' Decoder out (1 frame): {}'.format( pml_intermediates.shape[-1])) log(' Postnet out: {}'.format( post_outputs.shape[-1])) log(' PML out: {}'.format( pml_outputs.shape[-1]))
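The alignment transpose that all of these models share can be illustrated with plain arrays: alignment_history.stack() yields [T_out, N, T_in], and transposing with [1, 2, 0] gives one (encoder steps x decoder steps) attention image per utterance. A minimal sketch:

import numpy as np

T_out, N, T_in = 7, 2, 5
stacked = np.random.rand(T_out, N, T_in)        # what .stack() produces
alignments = np.transpose(stacked, [1, 2, 0])   # [N, T_in, T_out]
assert alignments.shape == (N, T_in, T_out)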
def initialize(self, inputs, vgg19_model_path, mel_targets=None, linear_targets=None): """Initializes the model for inference. Sets "mel_outputs", "linear_outputs", and "alignments" fields. Args: inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs vgg19_model_path: File path to the npy file containing pretrained weights of the VGG19 model mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number of steps in the output time series, F is num_freq, and values are entries in the linear spectrogram. Only needed for training. """ with tf.variable_scope('inference') as _: is_training = linear_targets is not None batch_size = tf.shape(inputs)[0] hp = self._hparams # VGG19 self.vgg19_pretrained = Vgg19(vgg19_model_path) vgg_output = tf.map_fn(self.__preprocess_before_vgg19, inputs) last_fc_output_size = tf.shape(vgg_output)[1] input_lengths = tf.tile([last_fc_output_size], [batch_size]) # Encoder prenet_outputs = prenet(vgg_output, is_training, hp.prenet_depths) # [N, T_in, prenet_depths[-1]=128] encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training, # [N, T_in, encoder_depth=256] hp.encoder_depth) # Attention attention_cell = AttentionWrapper( GRUCell(hp.attention_depth), BahdanauAttention(hp.attention_depth, encoder_outputs), alignment_history=True, output_attention=False) # [N, T_in, attention_depth=256] # Apply prenet before concatenation in AttentionWrapper. attention_cell = DecoderPrenetWrapper(attention_cell, is_training, hp.prenet_depths) # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector. 
concat_cell = ConcatOutputAndAttentionWrapper(attention_cell) # [N, T_in, 2*attention_depth=512] # Decoder (layers specified bottom to top): decoder_cell = MultiRNNCell([ OutputProjectionWrapper(concat_cell, hp.decoder_depth), ResidualWrapper(GRUCell(hp.decoder_depth)), ResidualWrapper(GRUCell(hp.decoder_depth)) ], state_is_tuple=True) # [N, T_in, decoder_depth=256] # Project onto r mel spectrograms (predict r outputs at each RNN step): output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step) decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32) if is_training: helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step) else: helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step) (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(output_cell, helper, decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out/r, M*r] # Reshape outputs to be one output per entry mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels]) # [N, T_out, M] # Add post-processing CBHG: post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training, # [N, T_out, postnet_depth=256] hp.postnet_depth) linear_outputs = tf.layers.dense(post_outputs, hp.num_freq) # [N, T_out, F] # Grab alignments from the final decoder state: alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) self.inputs = inputs self.input_lengths = input_lengths self.mel_outputs = mel_outputs self.linear_outputs = linear_outputs self.alignments = alignments self.mel_targets = mel_targets self.linear_targets = linear_targets log('Initialized Tacotron model. Dimensions: ') log(' prenet out: %d' % prenet_outputs.shape[-1]) log(' encoder out: %d' % encoder_outputs.shape[-1]) log(' attention out: %d' % attention_cell.output_size) log(' concat attn & out: %d' % concat_cell.output_size) log(' decoder cell out: %d' % decoder_cell.output_size) log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1])) log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) log(' postnet out: %d' % post_outputs.shape[-1]) log(' linear out: %d' % linear_outputs.shape[-1])
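Since image inputs have no natural token lengths, the VGG19 variant above assigns every batch item the same pseudo sequence length via tf.tile. A numpy stand-in with hypothetical sizes:

import numpy as np

batch_size, last_fc_output_size = 4, 196
input_lengths = np.tile([last_fc_output_size], [batch_size])  # [N], all equal
assert input_lengths.shape == (batch_size,) and input_lengths[0] == 196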
def __init__(self, voice, is_training, eval_batch_size=1, write_debug_files=False, voice_path=VOICE_PATH, tf_device=DEFAULT_DEVICE): self.voice = voice self.voice_path = voice_path % voice self.write_debug_files = write_debug_files self.hpfn = '%s/hparams.json' % self.voice_path with codecs.open(self.hpfn, 'r', 'utf8') as hpf: self.hp = json.loads(hpf.read()) self.batch_size = self.hp[ 'batch_size'] if is_training else eval_batch_size max_num_frames = self.hp['max_iters'] * self.hp[ 'outputs_per_step'] * self.hp['frame_shift_ms'] * self.hp[ 'sample_rate'] / 1000 n_fft, hop_length, win_length = audio.stft_parameters(self.hp) self.max_mfc_frames = 1 + int((max_num_frames - n_fft) / hop_length) with tf.device(tf_device): # self.inputs = tf.placeholder(dtype = tf.int32, shape = [None, self.hp['max_inp_len']]) # self.input_lengths = tf.placeholder(dtype = tf.int32, shape = [None]) self.inputs = tf.placeholder( dtype=tf.int32, shape=[self.batch_size, self.hp['max_inp_len']]) self.input_lengths = tf.placeholder(dtype=tf.int32, shape=[self.batch_size]) logging.debug('inputs: %s' % self.inputs) logging.debug('input_lengths: %s' % self.input_lengths) # self.mel_targets = tf.placeholder(tf.float32, [None, self.max_mfc_frames, self.hp['num_mels']], 'mel_targets') # self.linear_targets = tf.placeholder(tf.float32, [None, self.max_mfc_frames, self.hp['num_freq']], 'linear_targets') # self.target_lengths = tf.placeholder(tf.int32, [None], 'target_lengths') self.mel_targets = tf.placeholder( tf.float32, [self.batch_size, self.max_mfc_frames, self.hp['num_mels']], 'mel_targets') self.linear_targets = tf.placeholder( tf.float32, [self.batch_size, self.max_mfc_frames, self.hp['num_freq']], 'linear_targets') self.target_lengths = tf.placeholder(tf.int32, [self.batch_size], 'target_lengths') logging.debug('mel_targets: %s' % self.mel_targets) logging.debug('linear_targets: %s' % self.linear_targets) logging.debug('targets_lengths: %s' % self.target_lengths) # Embeddings embedding_table = tf.get_variable( 'embedding', [len(self.hp['alphabet']), self.hp['embed_depth']], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) logging.debug('embedding_table: %s' % embedding_table) embedded_inputs = tf.nn.embedding_lookup( embedding_table, self.inputs) # [N, max_inp_len, 256] logging.debug('embedded_inputs: %s' % embedded_inputs) # Encoder prenet_outputs = _create_prenet( embedded_inputs, is_training, self.hp['prenet_depths']) # [N, max_inp_len, 128] logging.debug('prenet_outputs: %s' % prenet_outputs) encoder_outputs = _create_encoder_cbhg( prenet_outputs, self.input_lengths, is_training, # [N, max_inp_len, 256] self.hp['encoder_depth']) logging.debug('encoder_outputs: %s' % encoder_outputs) # Attention attention_cell = AttentionWrapper( GRUCell(self.hp['attention_depth']), BahdanauAttention(self.hp['attention_depth'], encoder_outputs), alignment_history=True, output_attention=False) # [N, T_in, attention_depth=256] logging.debug('attention_cell: %s' % attention_cell) # Apply prenet before concatenation in AttentionWrapper. attention_cell = DecoderPrenetWrapper(attention_cell, is_training, self.hp['prenet_depths']) logging.debug('attention_cell: %s' % attention_cell) # Concatenate attention context vector and RNN cell output into a 512D vector. 
concat_cell = ConcatOutputAndAttentionWrapper( attention_cell) # [N, max_inp_len, 512] logging.debug('concat_cell: %s' % concat_cell) # Decoder (layers specified bottom to top): decoder_cell = MultiRNNCell( [ OutputProjectionWrapper(concat_cell, 256), ResidualWrapper(GRUCell(256)), ResidualWrapper(GRUCell(256)) ], state_is_tuple=True) # [N, max_inp_len, 256] logging.debug('decoder_cell: %s' % decoder_cell) # T_in -> max_inp_len # M -> hp.num_mels # r -> hp.outputs_per_step # mel_targets -> frame_targets # max_iters -> max_iters # Project onto r mel spectrograms (predict r outputs at each RNN step): output_cell = OutputProjectionWrapper( decoder_cell, self.hp['num_mels'] * self.hp['outputs_per_step']) logging.debug('output_cell: %s' % output_cell) decoder_init_state = output_cell.zero_state(batch_size=self.batch_size, dtype=tf.float32) logging.debug('decoder_init_state: %s' % repr(decoder_init_state)) if is_training: helper = TacoTrainingHelper(self.inputs, self.mel_targets, self.hp['num_mels'], self.hp['outputs_per_step'], self.target_lengths) else: helper = TacoTestHelper(self.batch_size, self.hp['num_mels'], self.hp['outputs_per_step']) logging.debug('helper: %s' % helper) (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(output_cell, helper, decoder_init_state), maximum_iterations=self.hp['max_iters']) # [N, T_out/r, M*r] logging.debug('decoder_outputs: %s' % decoder_outputs) logging.debug('final_decoder_state: %s' % repr(final_decoder_state)) # Reshape outputs to be one output per entry self.mel_outputs = tf.reshape( decoder_outputs, [self.batch_size, -1, self.hp['num_mels']]) # [N, T_out, M] logging.debug('mel_outputs: %s' % self.mel_outputs) # Add post-processing CBHG: post_outputs = _create_post_cbhg( self.mel_outputs, # [N, T_out, postnet_depth=256] self.hp['num_mels'], is_training, self.hp['postnet_depth']) logging.debug('post_outputs: %s' % post_outputs) self.linear_outputs = tf.layers.dense( post_outputs, self.hp['num_freq']) # [N, T_out, F] logging.debug('linear_outputs: %s' % self.linear_outputs) # Grab alignments from the final decoder state: self.alignments = tf.transpose( final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) logging.debug('alignments: %s' % self.alignments) if is_training: self.global_step = tf.Variable(0, name='global_step', trainable=False) with tf.variable_scope('loss') as scope: mel_loss = tf.reduce_mean( tf.abs(self.mel_targets - self.mel_outputs)) l1 = tf.abs(self.linear_targets - self.linear_outputs) # Prioritize loss for frequencies under 3000 Hz. n_priority_freq = int(3000 / (self.hp['sample_rate'] * 0.5) * self.hp['num_freq']) linear_loss = 0.5 * tf.reduce_mean(l1) + 0.5 * tf.reduce_mean( l1[:, :, 0:n_priority_freq]) self.loss = mel_loss + linear_loss with tf.variable_scope('optimizer') as scope: learning_rate = tf.train.exponential_decay( self.hp['initial_learning_rate'], self.global_step, self.hp['learning_rate_decay_halflife'], 0.5) optimizer = tf.train.AdamOptimizer(learning_rate, self.hp['adam_beta1'], self.hp['adam_beta2']) gradients, variables = zip( *optimizer.compute_gradients(self.loss)) clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0) # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. 
See: # https://github.com/tensorflow/tensorflow/issues/1122 with tf.control_dependencies( tf.get_collection(tf.GraphKeys.UPDATE_OPS)): self.optimize = optimizer.apply_gradients( zip(clipped_gradients, variables), global_step=self.global_step) self.saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=2) self.sess = tf.Session() self.cpfn = '%s/cp' % self.voice_path # find latest checkpoint latest_cp = tf.train.latest_checkpoint(self.cpfn) self.epoch_start = 0 if latest_cp: logging.debug('restoring variables from %s ...' % latest_cp) self.saver.restore(self.sess, latest_cp) # extract epoch number from filename self.epoch_start = int( os.path.basename(latest_cp).split('-')[0][2:]) + 1 else: self.cpfn = '%s/model' % self.voice_path if os.path.exists('%s.index' % self.cpfn): logging.debug('restoring variables from %s ...' % self.cpfn) self.saver.restore(self.sess, self.cpfn) else: if is_training: logging.debug( 'couldn\'t restore variables from %s -> initializing fresh training run.' % self.cpfn) self.sess.run(tf.global_variables_initializer()) else: raise Exception("couldn't load model from %s" % self.cpfn)
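The frame-budget arithmetic in __init__ is worth a worked example: the longest waveform the decoder can emit is max_iters * outputs_per_step frames of frame_shift_ms each, and the STFT frame count for that many samples is 1 + (num_samples - n_fft) // hop_length. The hparam values below are hypothetical, not from any shipped voice:

max_iters, outputs_per_step, frame_shift_ms, sample_rate = 200, 5, 12.5, 16000
max_num_frames = max_iters * outputs_per_step * frame_shift_ms * sample_rate / 1000  # 200000 samples
n_fft, hop_length = 2048, 200  # hop = frame_shift_ms * sample_rate / 1000
max_mfc_frames = 1 + int((max_num_frames - n_fft) / hop_length)
print(max_mfc_frames)  # 990 for these example values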
def initialize(self, inputs, input_lengths, num_speakers, speaker_id, mel_targets=None, linear_targets=None, loss_coeff=None, rnn_decoder_test_mode=False, is_randomly_initialized=False):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
        lengths of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
        is_training = linear_targets is not None
        self.batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), hp.embedding_dim],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, 512]

        # Encoder
        encoder_outputs = conv_and_lstm(
            embedded_inputs,
            input_lengths,
            conv_layers=hp.encoder_conv_layers,
            conv_width=hp.encoder_conv_width,
            conv_channels=hp.encoder_conv_channels,
            lstm_units=hp.encoder_lstm_units,
            is_training=is_training,
            scope='encoder')  # [N, T_in, 512]

        # Attention
        # For manual control of attention
        self.is_manual_attention = tf.placeholder(tf.bool, shape=(), name='is_manual_attention')
        self.manual_alignments = tf.placeholder(tf.float32, shape=[None, None, None],
                                                name="manual_alignments")
        attention_cell = AttentionWrapper(
            DecoderPrenetWrapper(LSTMBlockCell(hp.attention_depth), is_training),
            LocationSensitiveAttention(hp.attention_depth, encoder_outputs),
            alignment_history=True,
            output_attention=False)  # [N, T_in, 128]

        # Concatenate attention context vector and RNN cell output into a 512D vector.
concat_cell = ConcatOutputAndAttentionWrapper(attention_cell) # [N, T_in, 512] # Decoder (layers specified bottom to top): decoder_cell = MultiRNNCell([ concat_cell, LSTMBlockCell(hp.decoder_lstm_units), LSTMBlockCell(hp.decoder_lstm_units) ], state_is_tuple=True) # [N, T_in, 1024] # Project onto r mel spectrograms (predict r outputs at each RNN step): output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step) if is_training: helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step) else: helper = TacoTestHelper(self.batch_size, hp.num_mels, hp.outputs_per_step) decoder_init_state = output_cell.zero_state(batch_size=self.batch_size, dtype=tf.float32) (multi_decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(output_cell, helper, decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out/r, M*r] # Reshape outputs to be one output per entry [N, T_out, M] decoder_outputs = tf.reshape(multi_decoder_outputs, [self.batch_size, -1, hp.num_mels]) # Postnet: predicts a residual postnet_outputs = postnet( decoder_outputs, layers=hp.postnet_conv_layers, conv_width=hp.postnet_conv_width, channels=hp.postnet_conv_channels, is_training=is_training) mel_outputs = decoder_outputs + postnet_outputs # Convert to linear using a similar architecture as the encoder: expand_outputs = conv_and_lstm( mel_outputs, None, conv_layers=hp.expand_conv_layers, conv_width=hp.expand_conv_width, conv_channels=hp.expand_conv_channels, lstm_units=hp.expand_lstm_units, is_training=is_training, scope='expand') # [N, T_in, 512] linear_outputs = tf.layers.dense(expand_outputs, hp.num_freq) # [N, T_out, F] # Grab alignments from the final decoder state: alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) self.inputs = inputs self.input_lengths = input_lengths self.decoder_outputs = decoder_outputs self.mel_outputs = mel_outputs self.linear_outputs = linear_outputs self.alignments = alignments self.mel_targets = mel_targets self.linear_targets = linear_targets log('Initialized Tacotron model. Dimensions: ') log(' embedding: %d' % embedded_inputs.shape[-1]) log(' encoder out: %d' % encoder_outputs.shape[-1]) log(' attention out: %d' % attention_cell.output_size) log(' concat attn & out: %d' % concat_cell.output_size) log(' decoder cell out: %d' % decoder_cell.output_size) log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1])) log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) log(' expand out: %d' % expand_outputs.shape[-1]) log(' linear out: %d' % linear_outputs.shape[-1])
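Unlike the CBHG-based models above, this variant treats the postnet as a residual: it predicts a correction added to the decoder's coarse output, so a zero postnet leaves the prediction unchanged. A numpy illustration:

import numpy as np

decoder_outputs = np.random.randn(2, 10, 80)        # [N, T_out, num_mels]
postnet_residual = 0.1 * np.random.randn(2, 10, 80) # stand-in for postnet(decoder_outputs)
mel_outputs = decoder_outputs + postnet_residual
assert np.allclose(mel_outputs - decoder_outputs, postnet_residual)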
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
        lengths of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
        is_training = linear_targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), 256],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, 256]

        # Global style tokens (GST). When using h attention heads, we set the token
        # embedding size to 256/h and concatenate the attention outputs of each head.
        gst_tokens = tf.get_variable(
            'style_tokens', [hp.num_gst, 256 // hp.num_heads],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))

        # Encoder
        prenet_outputs = prenet(embedded_inputs, is_training)  # [N, T_in, 128]
        encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training)  # [N, T_in, 256]

        if is_training:
            # Reference encoder
            reference_embedding = reference_encoder(
                mel_targets,
                filters=[32, 32, 64, 64, 128, 128],
                kernel_size=(3, 3),
                strides=(2, 2),
                is_training=is_training)

            # Style token layer
            style_embedding = multi_head_attention(
                num_heads=hp.num_heads,
                queries=tf.expand_dims(reference_embedding, axis=1),  # [N, 1, 128]
                memory=tf.tile(tf.expand_dims(gst_tokens, axis=0),
                               [batch_size, 1, 1]),  # [N, hp.num_gst, 256 // hp.num_heads]
                num_units=128)
        else:
            # TODO Add support for reference mode and more effective style control during inference.
            # Randomly select style embedding from gst_tokens for simplicity.
            random_index = tf.random_uniform([batch_size], maxval=hp.num_gst, dtype=tf.int32)
            style_embedding = tf.nn.embedding_lookup(gst_tokens, random_index)

        # Add style embedding to every text encoder state, applying tanh to
        # compress both encoder state and style embedding to the same scale.
        encoder_outputs += tf.nn.tanh(style_embedding)

        # Attention
        attention_cell = AttentionWrapper(
            DecoderPrenetWrapper(GRUCell(256), is_training),
            BahdanauAttention(256, encoder_outputs, memory_sequence_length=input_lengths),
            alignment_history=True,
            output_attention=False)  # [N, T_in, 256]

        # Concatenate attention context vector and RNN cell output into a 512D vector.
concat_cell = ConcatOutputAndAttentionWrapper(attention_cell) # [N, T_in, 512] # Decoder (layers specified bottom to top): decoder_cell = MultiRNNCell([ OutputProjectionWrapper(concat_cell, 256), ResidualWrapper(ZoneoutWrapper(LSTMBlockCell(256), (0.1, 0.1), is_training)), ResidualWrapper(ZoneoutWrapper(LSTMBlockCell(256), (0.1, 0.1), is_training)), ], state_is_tuple=True) # [N, T_in, 256] # Project onto r mel spectrograms (predict r outputs at each RNN step): output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step) decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32) if is_training: helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step) else: helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step) if is_training: (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(output_cell, helper, decoder_init_state)) else: (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(output_cell, helper, decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out/r, M*r] # Reshape outputs to be one output per entry mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels]) # [N, T_out, M] # Add post-processing CBHG: post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training) # [N, T_out, 256] linear_outputs = tf.layers.dense(post_outputs, hp.num_freq) # [N, T_out, F] # Grab alignments from the final decoder state: alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) self.inputs = inputs self.input_lengths = input_lengths self.mel_outputs = mel_outputs self.linear_outputs = linear_outputs self.alignments = alignments self.mel_targets = mel_targets self.linear_targets = linear_targets tf.logging.info('Initialized Tacotron model. Dimensions: ') tf.logging.info(' embedding: %d' % embedded_inputs.shape[-1]) tf.logging.info(' prenet out: %d' % prenet_outputs.shape[-1]) tf.logging.info(' encoder out: %d' % encoder_outputs.shape[-1]) tf.logging.info(' attention out: %d' % attention_cell.output_size) tf.logging.info(' concat attn & out: %d' % concat_cell.output_size) tf.logging.info(' decoder cell out: %d' % decoder_cell.output_size) tf.logging.info(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1])) tf.logging.info(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) tf.logging.info(' postnet out: %d' % post_outputs.shape[-1]) tf.logging.info(' linear out: %d' % linear_outputs.shape[-1])
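At inference time this GST model picks one learned token at random per utterance and tanh-squashes it before the broadcast add. A numpy stand-in (sizes chosen so the broadcast works; in the real graph the token and encoder depths must agree):

import numpy as np

num_gst, token_dim, batch_size, T_in = 10, 64, 2, 7
gst_tokens = np.random.randn(num_gst, token_dim)
random_index = np.random.randint(0, num_gst, size=batch_size)
style_embedding = gst_tokens[random_index]               # [N, token_dim], the embedding lookup
encoder_outputs = np.random.randn(batch_size, T_in, token_dim)
encoder_outputs = encoder_outputs + np.tanh(style_embedding)[:, None, :]  # added to every step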
def initialize(self, inputs, input_lengths, num_speakers, speaker_id=None, mel_targets=None, linear_targets=None, is_training=False, loss_coeff=None, stop_token_targets=None):
    with tf.variable_scope('Embedding') as scope:
        hp = self._hparams
        batch_size = tf.shape(inputs)[0]

        # Embeddings(256)
        char_embed_table = tf.get_variable(
            'inputs_embedding', [len(symbols), hp.embedding_size],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        zero_pad = True
        if zero_pad:
            # Logic borrowed from a transformer implementation: the embedding for <PAD>
            # (index 0) is pinned to zeros and never trained, i.e. the first row of the
            # variable created above is never used.
            char_embed_table = tf.concat(
                (tf.zeros(shape=[1, hp.embedding_size]), char_embed_table[1:, :]), 0)

        # [N, T_in, embedding_size]
        char_embedded_inputs = tf.nn.embedding_lookup(char_embed_table, inputs)

        self.num_speakers = num_speakers
        if self.num_speakers > 1:
            speaker_embed_table = tf.get_variable(
                'speaker_embedding',
                [self.num_speakers, hp.speaker_embedding_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            # [N, T_in, speaker_embedding_size]
            speaker_embed = tf.nn.embedding_lookup(speaker_embed_table, speaker_id)
            deep_dense = lambda x, dim, name: tf.layers.dense(
                x, dim, activation=tf.nn.softsign, name=name)  # softsign: x / (abs(x) + 1)
            encoder_rnn_init_state = deep_dense(
                speaker_embed, hp.encoder_lstm_units * 4,
                'encoder_init_dense')  # hp.encoder_lstm_units = 256
            decoder_rnn_init_states = [
                deep_dense(speaker_embed, hp.decoder_lstm_units * 2,
                           'decoder_init_dense_{}'.format(i))
                for i in range(hp.decoder_layers)
            ]  # hp.decoder_lstm_units = 1024
            speaker_embed = None
        else:  # single-speaker case (self.num_speakers == 1)
            speaker_embed = None
            encoder_rnn_init_state = None  # init state of the bidirectional encoder RNN
            attention_rnn_init_state = None
            decoder_rnn_init_states = None

    with tf.variable_scope('Encoder') as scope:
        ##############
        # Encoder
        ##############
        x = char_embedded_inputs
        for i in range(hp.enc_conv_num_layers):
            x = tf.layers.conv1d(x,
                                 filters=hp.enc_conv_channels,
                                 kernel_size=hp.enc_conv_kernel_size,
                                 padding='same',
                                 activation=tf.nn.relu,
                                 name='Encoder_{}'.format(i))
            x = tf.layers.batch_normalization(x, training=is_training)
            x = tf.layers.dropout(x,
                                  rate=hp.dropout_prob,
                                  training=is_training,
                                  name='dropout_{}'.format(i))

        if encoder_rnn_init_state is not None:
            initial_state_fw_c, initial_state_fw_h, initial_state_bw_c, initial_state_bw_h = tf.split(
                encoder_rnn_init_state, 4, 1)
            initial_state_fw = LSTMStateTuple(initial_state_fw_c, initial_state_fw_h)
            initial_state_bw = LSTMStateTuple(initial_state_bw_c, initial_state_bw_h)
        else:  # single mode
            initial_state_fw, initial_state_bw = None, None

        cell_fw = ZoneoutLSTMCell(hp.encoder_lstm_units, is_training,
                                  zoneout_factor_cell=hp.tacotron_zoneout_rate,
                                  zoneout_factor_output=hp.tacotron_zoneout_rate,
                                  name='encoder_fw_LSTM')
        cell_bw = ZoneoutLSTMCell(hp.encoder_lstm_units, is_training,
                                  zoneout_factor_cell=hp.tacotron_zoneout_rate,
                                  zoneout_factor_output=hp.tacotron_zoneout_rate,
                                  name='encoder_bw_LSTM')
        encoder_conv_output = x
        outputs, states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw, cell_bw, encoder_conv_output,
            sequence_length=input_lengths,
            initial_state_fw=initial_state_fw,
            initial_state_bw=initial_state_bw,
            dtype=tf.float32)

        # encoder_outputs = [N, T, 2*encoder_lstm_units] = [N, T, 512]
        encoder_outputs = tf.concat(outputs, axis=2)  # Concat and return forward + backward outputs

    with tf.variable_scope('Decoder') as scope:
        ##############
        # Attention
        ##############
        if hp.attention_type == 'bah_mon':
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, normalize=False)
        elif hp.attention_type == 'bah_mon_norm':  # added by hccho
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, normalize=True)
        elif hp.attention_type == 'loc_sen':  # Location Sensitive Attention
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_size, encoder_outputs, hparams=hp,
                is_training=is_training, mask_encoder=hp.mask_encoder,
                memory_sequence_length=input_lengths, smoothing=hp.smoothing,
                cumulate_weights=hp.cumulative_weights)
        elif hp.attention_type == 'gmm':  # GMM Attention
            attention_mechanism = GmmAttention(
                hp.attention_size, memory=encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'bah_norm':
            attention_mechanism = BahdanauAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, normalize=True)
        elif hp.attention_type == 'luong_scaled':
            attention_mechanism = LuongAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, scale=True)
        elif hp.attention_type == 'luong':
            attention_mechanism = LuongAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'bah':
            attention_mechanism = BahdanauAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths)
        else:
            raise Exception(" [!] Unknown attention type: {}".format(hp.attention_type))

        decoder_lstm = [
            ZoneoutLSTMCell(hp.decoder_lstm_units, is_training,
                            zoneout_factor_cell=hp.tacotron_zoneout_rate,
                            zoneout_factor_output=hp.tacotron_zoneout_rate,
                            name='decoder_LSTM_{}'.format(i + 1))
            for i in range(hp.decoder_layers)
        ]
        decoder_lstm = tf.contrib.rnn.MultiRNNCell(decoder_lstm, state_is_tuple=True)
        # Calling zero_state here also covers the state entries managed by the AttentionWrapper.
        decoder_init_state = decoder_lstm.zero_state(batch_size=batch_size, dtype=tf.float32)

        if hp.model_type == "multi-speaker":
            decoder_init_state = list(decoder_init_state)
            for idx, cell in enumerate(decoder_rnn_init_states):
                shape1 = decoder_init_state[idx][0].get_shape().as_list()
                shape2 = cell.get_shape().as_list()
                if shape1[1] * 2 != shape2[1]:
                    raise Exception(" [!] Shape {} and {} should be equal".format(shape1, shape2))
                c, h = tf.split(cell, 2, 1)
                decoder_init_state[idx] = LSTMStateTuple(c, h)
            decoder_init_state = tuple(decoder_init_state)

        # Note output_attention=False, and attention_layer_size is left unset,
        # so the attention output is the raw context vector.
        attention_cell = AttentionWrapper(
            decoder_lstm,
            attention_mechanism,
            initial_cell_state=decoder_init_state,
            alignment_history=True,
            output_attention=False)

        # attention_state_size = 256
        # Decoder input -> prenet -> decoder_lstm -> concat[output, attention]
        dec_prenet_outputs = DecoderWrapper(attention_cell, is_training,
                                            hp.dec_prenet_sizes, hp.dropout_prob,
                                            hp.inference_prenet_dropout)
        dec_outputs_cell = OutputProjectionWrapper(
            dec_prenet_outputs, (hp.num_mels + 1) * hp.reduction_factor)

        if is_training:
            # inputs is only used to derive batch_size
            helper = TacoTrainingHelper(mel_targets, hp.num_mels, hp.reduction_factor)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.reduction_factor)

        decoder_init_state = dec_outputs_cell.zero_state(batch_size=batch_size,
                                                         dtype=tf.float32)
        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(dec_outputs_cell, helper, decoder_init_state),
            maximum_iterations=int(hp.max_n_frame / hp.reduction_factor))  # max_iters=200

        decoder_mel_outputs = tf.reshape(
            decoder_outputs[:, :, :hp.num_mels * hp.reduction_factor],
            [batch_size, -1, hp.num_mels])  # [N,iters,400] -> [N,5*iters,80]
        stop_token_outputs = tf.reshape(
            decoder_outputs[:, :, hp.num_mels * hp.reduction_factor:],
            [batch_size, -1])  # [N,iters]

        # Postnet
        x = decoder_mel_outputs
        for i in range(hp.postnet_num_layers):
            activation = tf.nn.tanh if i != (hp.postnet_num_layers - 1) else None
            x = tf.layers.conv1d(x,
                                 filters=hp.postnet_channels,
                                 kernel_size=hp.postnet_kernel_size,
                                 padding='same',
                                 activation=activation,
                                 name='Postnet_{}'.format(i))
            x = tf.layers.batch_normalization(x, training=is_training)
            x = tf.layers.dropout(x,
                                  rate=hp.dropout_prob,
                                  training=is_training,
                                  name='Postnet_dropout_{}'.format(i))
        residual = tf.layers.dense(x, hp.num_mels, name='residual_projection')
        mel_outputs = decoder_mel_outputs + residual

        # Add post-processing CBHG:
        # mel_outputs: (N, T, num_mels)
        post_outputs = cbhg(mel_outputs, None, is_training,
                            hp.post_bank_size, hp.post_bank_channel_size,
                            hp.post_maxpool_width, hp.post_highway_depth,
                            hp.post_rnn_size, hp.post_proj_sizes, hp.post_proj_width,
                            scope='post_cbhg')
        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq,
                                         name='linear_spectrogram_projection')  # [N, T_out, F(1025)]

        # Grab alignments from the final decoder state:
        # [batch_size, text length (encoder), target length (decoder)]
        alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.speaker_id = speaker_id
        self.input_lengths = input_lengths
        self.loss_coeff = loss_coeff
        self.decoder_mel_outputs = decoder_mel_outputs
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.final_decoder_state = final_decoder_state
        self.stop_token_targets = stop_token_targets
        self.stop_token_outputs = stop_token_outputs
        self.all_vars = tf.trainable_variables()

        log('=' * 40)
        log(' model_type: %s' % hp.model_type)
        log('=' * 40)
        log('Initialized Tacotron model. Dimensions: ')
        log('    embedding:                  %d' % char_embedded_inputs.shape[-1])
        log('    encoder conv out:           %d' % encoder_conv_output.shape[-1])
        log('    encoder out:                %d' % encoder_outputs.shape[-1])
        log('    attention out:              %d' % attention_cell.output_size)
        log('    decoder prenet lstm concat out: %d' % dec_prenet_outputs.output_size)
        log('    decoder cell out:           %d' % dec_outputs_cell.output_size)
        log('    decoder out (%d frames):    %d' % (hp.reduction_factor, decoder_outputs.shape[-1]))
        log('    decoder mel out:            %d' % decoder_mel_outputs.shape[-1])
        log('    mel out:                    %d' % mel_outputs.shape[-1])
        log('    postnet out:                %d' % post_outputs.shape[-1])
        log('    linear out:                 %d' % linear_outputs.shape[-1])
        log('  Tacotron Parameters {:.3f} Million.'.format(
            np.sum([np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000))
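The multi-speaker init-state trick above maps the speaker embedding through a softsign dense layer to 2*decoder_lstm_units values, then splits them into the LSTM's (c, h) pair. A numpy equivalent of the tf.split(cell, 2, 1) call:

import numpy as np

decoder_lstm_units, batch = 4, 2
dense_out = np.random.randn(batch, 2 * decoder_lstm_units)  # stand-in for deep_dense output
c, h = np.split(dense_out, 2, axis=1)                       # becomes LSTMStateTuple(c, h)
assert c.shape == h.shape == (batch, decoder_lstm_units)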
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None): with tf.variable_scope('inference') as scope: is_training = linear_targets is not None batch_size = tf.shape(inputs)[0] hp = self._hparams # Embedding embedding_table = tf.get_variable( 'embedding', [hp.len_symbols, hp.embed_depth], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) embedded_inputs = tf.nn.embedding_lookup( embedding_table, inputs) # [N, T_in, embed_depth=256] # Encoder prenet_outputs = prenet( embedded_inputs, is_training, hp.prenet_depths) # [N, T_in, prenet_depths=128] encoder_outputs = encoder_cbhg( prenet_outputs, input_lengths, is_training, # [N, T_in, encoder_depth=256] hp.encoder_depth) # Attention attention_cell = AttentionWrapper( GRUCell(hp.attention_depth), BahdanauAttention(hp.attention_depth, encoder_outputs), alignment_history=True, output_attention=False) # [N, T_in, attention_depth=256] # Apply prenet before concatenation in AttentionWrapper. attention_cell = DecoderPrenetWrapper(attention_cell, is_training, hp.prenet_depths) # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector. concat_cell = ConcatOutputAndAttentionWrapper( attention_cell) # [N, T_in, 2*attention_depth=512] # Decoder (layers specified bottom to top): decoder_cell = MultiRNNCell( [ OutputProjectionWrapper(concat_cell, hp.decoder_depth), ResidualWrapper(GRUCell(hp.decoder_depth)), ResidualWrapper(GRUCell(hp.decoder_depth)) ], state_is_tuple=True) # [N, T_in, decoder_depth=256] # Project onto r mel spectrograms (predict r outputs at each RNN step): output_cell = OutputProjectionWrapper( decoder_cell, hp.num_mels * hp.outputs_per_step) decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32) if is_training: helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step) else: helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step) (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(output_cell, helper, decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out/r, M*r] # Reshape outputs to be one output per entry mel_outputs = tf.reshape( decoder_outputs, [batch_size, -1, hp.num_mels]) # [N, T_out, M] # Add post-processing CBHG: post_outputs = post_cbhg( mel_outputs, hp.num_mels, is_training, # [N, T_out, postnet_depth=256] hp.postnet_depth) linear_outputs = tf.layers.dense(post_outputs, hp.num_freq) # [N, T_out, F] # Grab alignments from the final decoder state: alignments = tf.transpose( final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) self.inputs = inputs self.input_lengths = input_lengths self.mel_outputs = mel_outputs self.linear_outputs = linear_outputs self.alignments = alignments self.mel_targets = mel_targets self.linear_targets = linear_targets log('Initialized Tacotron model. Dimensions: ') log(' embedding: %d' % embedded_inputs.shape[-1]) log(' prenet out: %d' % prenet_outputs.shape[-1]) log(' encoder out: %d' % encoder_outputs.shape[-1]) log(' attention out: %d' % attention_cell.output_size) log(' concat attn & out: %d' % concat_cell.output_size) log(' decoder cell out: %d' % decoder_cell.output_size) log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1])) log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) log(' postnet out: %d' % post_outputs.shape[-1]) log(' linear out: %d' % linear_outputs.shape[-1])
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None, pml_targets=None,
               is_training=False, gta=False, locked_alignments=None, logs_enabled=True):
    """
    Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
        inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
        input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
            lengths of each sequence in inputs.
        mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is the
            number of steps in the output time series, M is num_mels, and values are entries in
            the mel spectrogram. Only needed for training.
        linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is the
            number of steps in the output time series, F is num_freq, and values are entries in
            the linear spectrogram. Only needed for training.
        pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is the
            number of steps in the PML vocoder feature trajectories, P is pml_dimension, and
            values are PML vocoder features. Only needed for training.
        is_training: boolean flag that is set to True during training
        gta: boolean flag that is set to True when ground truth alignment is required
        locked_alignments: when explicit attention alignment is required, the locked alignments
            are passed in this parameter and the attention alignments are locked to these values
        logs_enabled: boolean flag that defaults to True; if False, no construction logs are output
    """
    # Fix the alignments shape to (batch_size, encoder_steps, decoder_steps)
    # if it does not already include a batch dimension.
    locked_alignments_ = locked_alignments

    if locked_alignments_ is not None:
        if np.ndim(locked_alignments_) < 3:
            locked_alignments_ = np.expand_dims(locked_alignments_, 0)

    with tf.variable_scope('inference') as scope:
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, embed_depth=256]

        # Encoder
        encoder_outputs = conv_and_gru(  # [N, T_in, 2*encoder_gru_units=512]
            embedded_inputs,
            input_lengths,
            conv_layers=hp.encoder_conv_layers,
            conv_width=hp.encoder_conv_width,
            conv_channels=hp.encoder_conv_channels,
            gru_units_unidirectional=hp.encoder_gru_units,
            is_training=is_training,
            scope='encoder',
        )

        # Attention
        attention_cell = LockableAttentionWrapper(
            GRUCell(hp.attention_depth),
            LocationSensitiveAttention(hp.attention_depth, encoder_outputs),
            alignment_history=True,
            locked_alignments=locked_alignments_,
            output_attention=False,
            name='attention_wrapper')  # [N, T_in, attention_depth=256]

        # Concatenate attention context vector and RNN cell output into a
        # 2*attention_depth=512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)  # [N, T_in, 2*attention_depth=512]

        # Decoder (layers specified bottom to top):
        cells = [GRUCell(hp.decoder_gru_units) for _ in range(hp.decoder_gru_layers)]
        decoder_cell = MultiRNNCell([concat_cell] + cells,
                                    state_is_tuple=True)  # [N, T_in, decoder_depth=1024]

        # Project onto r PML feature vectors (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(decoder_cell, hp.pml_dimension * hp.outputs_per_step)

        if is_training or gta:
            if hp.scheduled_sampling:
                helper = TacoScheduledOutputTrainingHelper(
                    inputs, pml_targets, hp.pml_dimension, hp.outputs_per_step,
                    hp.scheduled_sampling_probability)
            else:
                helper = TacoTrainingHelper(inputs, pml_targets, hp.pml_dimension,
                                            hp.outputs_per_step)
        else:
            helper = TacoTestHelper(batch_size, hp.pml_dimension, hp.outputs_per_step)

        decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

        # Reshape outputs to be one output per entry
        pml_intermediates = tf.reshape(decoder_outputs,
                                       [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

        # Add post-processing conv and GRU layer:
        expand_outputs = conv_and_gru(  # [N, T_out, 2*expand_gru_units=512]
            pml_intermediates,
            None,
            conv_layers=hp.expand_conv_layers,
            conv_width=hp.expand_conv_width,
            conv_channels=hp.expand_conv_channels,
            gru_units_unidirectional=hp.expand_gru_units,
            is_training=is_training,
            scope='expand',
        )
        pml_outputs = tf.layers.dense(expand_outputs, hp.pml_dimension)  # [N, T_out, P]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.pml_intermediates = pml_intermediates
        self.pml_outputs = pml_outputs
        self.alignments = alignments
        self.pml_targets = pml_targets

        if logs_enabled:
            log('Initialized Tacotron model. Dimensions: ')
            log('  Train mode:              {}'.format(is_training))
            log('  GTA mode:                {}'.format(gta))  # was logging is_training for both modes
            log('  Embedding:               {}'.format(embedded_inputs.shape[-1]))
            log('  Encoder out:             {}'.format(encoder_outputs.shape[-1]))
            log('  Attention out:           {}'.format(attention_cell.output_size))
            log('  Concat attn & out:       {}'.format(concat_cell.output_size))
            log('  Decoder cell out:        {}'.format(decoder_cell.output_size))
            log('  Decoder out ({} frames): {}'.format(hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  Decoder out (1 frame):   {}'.format(pml_intermediates.shape[-1]))
            log('  Expand out:              {}'.format(expand_outputs.shape[-1]))
            log('  PML out:                 {}'.format(pml_outputs.shape[-1]))
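The locked-alignments shape fix-up at the top of this variant is easy to exercise in isolation. A sketch with dummy data (NumPy only; the array sizes are invented for illustration):

import numpy as np

locked = np.random.rand(60, 120).astype(np.float32)  # dummy [T_enc, T_dec] alignment matrix
if np.ndim(locked) < 3:
    locked = np.expand_dims(locked, 0)               # -> [1, T_enc, T_dec], batch dim added
print(locked.shape)                                  # (1, 60, 120)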
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None, reference_mel=None):
    with tf.variable_scope('inference') as scope:
        is_training = linear_targets is not None
        is_teacher_force_generating = mel_targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'text_embedding', [len(symbols), 256], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, 256]

        if hp.use_gst:
            # Global style tokens (GST)
            gst_tokens = tf.get_variable(
                'style_tokens', [hp.num_gst, 256 // hp.num_heads], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            self.gst_tokens = gst_tokens

        # Encoder
        prenet_outputs = prenet(embedded_inputs, is_training)  # [N, T_in, 128]
        encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training)  # [N, T_in, 256]

        if is_training:
            reference_mel = mel_targets

        if reference_mel is not None:
            # Reference encoder
            refnet_outputs = reference_encoder(
                reference_mel,
                filters=[32, 32, 64, 64, 128, 128],
                kernel_size=(3, 3),
                strides=(2, 2),
                encoder_cell=GRUCell(128),
                is_training=is_training)  # [N, 128]
            self.refnet_outputs = refnet_outputs

            if hp.use_gst:
                # Style attention
                style_attention = MultiheadAttention(
                    tf.tanh(tf.expand_dims(refnet_outputs, axis=1)),  # [N, 1, 128]
                    tf.tile(tf.expand_dims(gst_tokens, axis=0),
                            [batch_size, 1, 1]),  # [N, hp.num_gst, 256/hp.num_heads]
                    num_heads=hp.num_heads,
                    num_units=128,
                    attention_type=hp.style_att_type)
                # Apply tanh to compress both encoder state and style embedding to the same scale.
                style_embeddings = style_attention.multi_head_attention()  # [N, 1, 256]
            else:
                style_embeddings = tf.expand_dims(refnet_outputs, axis=1)  # [N, 1, 128]
        else:
            print("Use random weight for GST.")
            random_weights = tf.random_uniform([hp.num_heads, hp.num_gst], maxval=1.0,
                                               dtype=tf.float32)
            random_weights = tf.nn.softmax(random_weights, name="random_weights")
            style_embeddings = tf.matmul(random_weights, tf.nn.tanh(gst_tokens))
            style_embeddings = tf.reshape(
                style_embeddings,
                [1, 1] + [hp.num_heads * gst_tokens.get_shape().as_list()[1]])

        # Add style embedding to every text encoder state
        style_embeddings = tf.tile(style_embeddings,
                                   [1, shape_list(encoder_outputs)[1], 1])  # [N, T_in, 128]
        encoder_outputs = tf.concat([encoder_outputs, style_embeddings], axis=-1)

        # Attention
        attention_cell = AttentionWrapper(
            DecoderPrenetWrapper(GRUCell(256), is_training),
            BahdanauAttention(256, encoder_outputs, memory_sequence_length=input_lengths),
            alignment_history=True,
            output_attention=False)  # [N, T_in, 256]

        # Concatenate attention context vector and RNN cell output.
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell([
            OutputProjectionWrapper(concat_cell, 256),
            ResidualWrapper(ZoneoutWrapper(LSTMCell(256), 0.1)),
            ResidualWrapper(ZoneoutWrapper(LSTMCell(256), 0.1))
        ], state_is_tuple=True)  # [N, T_in, 256]

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        if is_training or is_teacher_force_generating:
            helper = TrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
        else:
            helper = TestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry
        mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

        # Add post-processing CBHG:
        post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)  # [N, T_out, 256]
        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F]

        # # Grab alignments from the final decoder state:
        # alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_outputs = mel_outputs
        self.encoder_outputs = encoder_outputs
        self.style_embeddings = style_embeddings
        self.linear_outputs = linear_outputs
        # self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.reference_mel = reference_mel
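The broadcast-and-concatenate step that injects the style embedding into every encoder state is the same in all the GST variants here. A minimal standalone sketch (TensorFlow 1.x; `tf.shape` replaces the repo's `shape_list` helper, and the function name is hypothetical):

import tensorflow as tf

def add_style(encoder_outputs, style_embedding):
    """encoder_outputs: [N, T_in, D_enc]; style_embedding: [N, 1, D_style]."""
    t_in = tf.shape(encoder_outputs)[1]
    tiled = tf.tile(style_embedding, [1, t_in, 1])       # [N, T_in, D_style]
    return tf.concat([encoder_outputs, tiled], axis=-1)  # [N, T_in, D_enc + D_style]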
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None, reference_mel=None):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
        lengths of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is the
        number of steps in the output time series, M is num_mels, and values are entries in the
        mel spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is the
        number of steps in the output time series, F is num_freq, and values are entries in the
        linear spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
        is_training = linear_targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'text_embedding', [len(symbols), hp.embed_depth], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, 256]

        # Global style tokens (GST)
        gst_tokens = tf.get_variable(
            'style_tokens', [hp.num_gst, hp.style_embed_depth // hp.num_heads], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        self.gst_tokens = gst_tokens

        # Encoder
        encoder_outputs = encoder(embedded_inputs, input_lengths, is_training,
                                  512, 5, 256)  # [N, T_in, 256]

        if is_training:
            reference_mel = mel_targets

        if reference_mel is not None:
            # Reference encoder
            refnet_outputs = reference_encoder(
                reference_mel,
                filters=hp.ref_filters,
                kernel_size=(3, 3),
                strides=(2, 2),
                encoder_cell=GRUCell(hp.ref_depth),
                is_training=is_training)  # [N, 128]
            self.refnet_outputs = refnet_outputs

            # Style attention
            style_attention = MultiheadAttention(
                tf.expand_dims(refnet_outputs, axis=1),  # [N, 1, 128]
                tf.tanh(tf.tile(tf.expand_dims(gst_tokens, axis=0),
                                [batch_size, 1, 1])),  # [N, hp.num_gst, 256/hp.num_heads]
                num_heads=hp.num_heads,
                num_units=hp.style_att_dim,
                attention_type=hp.style_att_type)
            embedded_tokens = style_attention.multi_head_attention()  # [N, 1, 256]
        else:
            # Despite the name, these are fixed one-hot weights that select token
            # hp.gst_index (1-based) for every head.
            random_weights = tf.constant(
                hp.num_heads * [[0] * (hp.gst_index - 1) + [1] + [0] * (hp.num_gst - hp.gst_index)],
                dtype=tf.float32)
            random_weights = tf.nn.softmax(random_weights, name="random_weights")
            # gst_tokens = tf.tile(gst_tokens, [1, hp.num_heads])
            embedded_tokens = tf.matmul(random_weights, tf.nn.tanh(gst_tokens))
            embedded_tokens = hp.gst_scale * embedded_tokens
            embedded_tokens = tf.reshape(
                embedded_tokens,
                [1, 1] + [hp.num_heads * gst_tokens.get_shape().as_list()[1]])

        # Add style embedding to every text encoder state
        style_embeddings = tf.tile(embedded_tokens,
                                   [1, shape_list(encoder_outputs)[1], 1])  # [N, T_in, 128]
        encoder_outputs = tf.concat([encoder_outputs, style_embeddings], axis=-1)

        # Attention
        attention_mechanism = LocationSensitiveAttention(
            128, encoder_outputs, hparams=hp, is_training=is_training,
            mask_encoder=True, memory_sequence_length=input_lengths,
            smoothing=False, cumulate_weights=True)

        decoder_lstm = [
            ZoneoutLSTMCell(1024, is_training,
                            zoneout_factor_cell=0.1,
                            zoneout_factor_output=0.1,
                            name='decoder_LSTM_{}'.format(i + 1))
            for i in range(2)
        ]
        decoder_lstm = MultiRNNCell(decoder_lstm, state_is_tuple=True)
        decoder_init_state = decoder_lstm.zero_state(
            batch_size=batch_size, dtype=tf.float32)  # not present in TensorFlow 1

        attention_cell = AttentionWrapper(
            decoder_lstm,
            attention_mechanism,
            initial_cell_state=decoder_init_state,
            alignment_history=True,
            output_attention=False)  # attention_state_size = 256

        # Decoder input -> prenet -> decoder_lstm -> concat[output, attention]
        # dec_outputs = DecoderPrenetWrapper(attention_cell, is_training, hp.prenet_depths)
        dec_outputs_cell = OutputProjectionWrapper(attention_cell,
                                                   hp.num_mels * hp.outputs_per_step)

        if is_training:
            helper = TacoTrainingHelper(inputs, mel_targets, hp)
        else:
            helper = TacoTestHelper(batch_size, hp)

        decoder_init_state = dec_outputs_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(dec_outputs_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry
        decoder_mel_outputs = tf.reshape(
            decoder_outputs[:, :, :hp.num_mels * hp.outputs_per_step],
            [batch_size, -1, hp.num_mels])  # [N, T_out, M]

        # Postnet: five conv1d layers, tanh on all but the last.
        x = decoder_mel_outputs
        for i in range(5):
            activation = tf.nn.tanh if i != 4 else None
            x = tf.layers.conv1d(x, filters=512, kernel_size=5, padding='same',
                                 activation=activation, name='Postnet_{}'.format(i))
            x = tf.layers.batch_normalization(x, training=is_training)
            x = tf.layers.dropout(x, rate=0.5, training=is_training,
                                  name='Postnet_dropout_{}'.format(i))

        residual = tf.layers.dense(x, hp.num_mels, name='residual_projection')
        mel_outputs = decoder_mel_outputs + residual

        # Add post-processing CBHG:
        # mel_outputs: (N, T, num_mels)
        post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F=1025]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.decoder_mel_outputs = decoder_mel_outputs
        self.mel_outputs = mel_outputs
        self.encoder_outputs = encoder_outputs
        self.style_embeddings = style_embeddings
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.reference_mel = reference_mel
        self.all_vars = tf.trainable_variables()

        log('Initialized Tacotron model. Dimensions: ')
        log('  text embedding:          %d' % embedded_inputs.shape[-1])
        log('  style embedding:         %d' % style_embeddings.shape[-1])
        # log('  prenet out:              %d' % prenet_outputs.shape[-1])
        log('  encoder out:             %d' % encoder_outputs.shape[-1])
        log('  attention out:           %d' % attention_cell.output_size)
        # log('  concat attn & out:       %d' % concat_cell.output_size)
        log('  decoder cell out:        %d' % dec_outputs_cell.output_size)
        log('  decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
        log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
        log('  postnet out:             %d' % post_outputs.shape[-1])
        log('  linear out:              %d' % linear_outputs.shape[-1])
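Despite being named `random_weights`, the inference branch of this variant selects a single style token per head with fixed one-hot weights. The construction is clearer in plain NumPy (a sketch; `gst_index` is 1-based as in the code, and the helper name is hypothetical):

import numpy as np

def one_hot_token_weights(num_heads, num_gst, gst_index):
    # One row per attention head, with a single 1 at the chosen (1-based) token index.
    row = [0] * (gst_index - 1) + [1] + [0] * (num_gst - gst_index)
    return np.array(num_heads * [row], dtype=np.float32)

print(one_hot_token_weights(num_heads=4, num_gst=10, gst_index=3))

Note that the subsequent tf.nn.softmax softens the one-hot rows into a peaked but non-degenerate distribution over tokens, so the selected token dominates without the others being zeroed out entirely.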
def initialize(self, inputs, input_lengths, mel_targets=None, mel_lengths=None, linear_targets=None):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
        lengths of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is the
        number of steps in the output time series, M is num_mels, and values are entries in the
        mel spectrogram. Only needed for training.
      mel_lengths: int32 Tensor with shape [N] giving the lengths of each sequence in
        mel_targets. Only needed when use_vae is enabled.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is the
        number of steps in the output time series, F is num_freq, and values are entries in the
        linear spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
        is_training = linear_targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, embed_depth=256]

        # Encoder
        prenet_outputs = prenet(embedded_inputs, is_training,
                                hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
        encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training,
                                       hp.encoder_depth)  # [N, T_in, encoder_depth=256]

        if hp.use_vae:
            style_embeddings, mu, log_var = VAE(
                inputs=mel_targets,
                input_lengths=mel_lengths,
                filters=hp.filters,
                kernel_size=(3, 3),
                strides=(2, 2),
                num_units=hp.vae_dim,
                is_training=is_training,
                scope='vae')
            self.mu = mu
            self.log_var = log_var

            style_embeddings = tf.layers.dense(style_embeddings, hp.encoder_depth)
            style_embeddings = tf.expand_dims(style_embeddings, axis=1)
            style_embeddings = tf.tile(style_embeddings,
                                       [1, shape_list(encoder_outputs)[1], 1])  # [N, T_in, 256]
            encoder_outputs = encoder_outputs + style_embeddings

        # Attention
        attention_cell = AttentionWrapper(
            GRUCell(hp.attention_depth),
            BahdanauAttention(hp.attention_depth, encoder_outputs),
            alignment_history=True,
            output_attention=False)  # [N, T_in, attention_depth=256]

        # Apply prenet before concatenation in AttentionWrapper.
        attention_cell = DecoderPrenetWrapper(attention_cell, is_training, hp.prenet_depths)

        # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)  # [N, T_in, 2*attention_depth=512]

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell([
            OutputProjectionWrapper(concat_cell, hp.decoder_depth),
            ResidualWrapper(GRUCell(hp.decoder_depth)),
            ResidualWrapper(GRUCell(hp.decoder_depth))
        ], state_is_tuple=True)  # [N, T_in, decoder_depth=256]

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        if is_training:
            helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry
        mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

        # Add post-processing CBHG:
        post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training,
                                 hp.postnet_depth)  # [N, T_out, postnet_depth=256]
        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.mel_lengths = mel_lengths
        self.linear_targets = linear_targets

        log('Initialized Tacotron model. Dimensions: ')
        log('  embedding:               %d' % embedded_inputs.shape[-1])
        log('  prenet out:              %d' % prenet_outputs.shape[-1])
        log('  encoder out:             %d' % encoder_outputs.shape[-1])
        log('  attention out:           %d' % attention_cell.output_size)
        log('  concat attn & out:       %d' % concat_cell.output_size)
        log('  decoder cell out:        %d' % decoder_cell.output_size)
        log('  decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
        log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
        log('  postnet out:             %d' % post_outputs.shape[-1])
        log('  linear out:              %d' % linear_outputs.shape[-1])
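This VAE variant stores `mu` and `log_var` on the model, but the corresponding loss term lives elsewhere. A standard KL-divergence regularizer against a unit Gaussian would look like the sketch below (the function name is hypothetical; only the `mu`/`log_var` tensors come from the code above):

import tensorflow as tf

def vae_kl_loss(mu, log_var):
    # KL(q(z|x) || N(0, I)) for a diagonal-Gaussian posterior, averaged over the batch.
    return -0.5 * tf.reduce_mean(
        tf.reduce_sum(1.0 + log_var - tf.square(mu) - tf.exp(log_var), axis=-1))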
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
        lengths of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is the
        number of steps in the output time series, M is num_mels, and values are entries in the
        mel spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is the
        number of steps in the output time series, F is num_freq, and values are entries in the
        linear spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
        is_training = linear_targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings: unused in this variant, since the inputs are already
        # real-valued spectrogram frames rather than character IDs.
        # embedding_table = tf.get_variable(
        #     'embedding', [len(symbols), 256], dtype=tf.float32,
        #     initializer=tf.truncated_normal_initializer(stddev=0.5))
        # embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, 256]
        # embedded_inputs = inputs

        # Encoder
        # n_fft = (self._hparams.num_src_freq - 1) * 2
        # in_layer_size = n_fft
        in_layer_size = self._hparams.num_src_freq
        prenet_outputs = prenet(inputs, is_training,
                                layer_sizes=[in_layer_size, 128])  # [N, T_in, 128]
        encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training)  # [N, T_in, 256]

        # Attention
        attention_cell = AttentionWrapper(
            DecoderPrenetWrapper(GRUCell(256), is_training),
            BahdanauAttention(256, encoder_outputs),
            alignment_history=True,
            output_attention=False)  # [N, T_in, 256]

        # Concatenate attention context vector and RNN cell output into a 512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)  # [N, T_in, 512]

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell([
            OutputProjectionWrapper(concat_cell, 256),
            ResidualWrapper(GRUCell(256)),
            ResidualWrapper(GRUCell(256))
        ], state_is_tuple=True)  # [N, T_in, 256]

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        if is_training:
            helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry
        mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

        # Add post-processing CBHG:
        post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)  # [N, T_out, 256]
        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets

        log('Initialized Tacotron model. Dimensions: ')
        log('  input:                   %d' % inputs.shape[-1])
        log('  prenet out:              %d' % prenet_outputs.shape[-1])
        log('  encoder out:             %d' % encoder_outputs.shape[-1])
        log('  attention out:           %d' % attention_cell.output_size)
        log('  concat attn & out:       %d' % concat_cell.output_size)
        log('  decoder cell out:        %d' % decoder_cell.output_size)
        log('  decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
        log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
        log('  postnet out:             %d' % post_outputs.shape[-1])
        log('  linear out:              %d' % linear_outputs.shape[-1])
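Because this variant feeds spectrogram frames straight into the prenet (no embedding lookup), its input placeholder differs from the character-ID models. A sketch (hedged: `hparams` is a stand-in for the hyperparameter object; `num_src_freq` is the hparam the code above reads):

import tensorflow as tf

# float32 frames with num_src_freq bins, instead of int32 character IDs
inputs = tf.placeholder(tf.float32, [None, None, hparams.num_src_freq], name='spec_inputs')
input_lengths = tf.placeholder(tf.int32, [None], name='input_lengths')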
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None,
               stop_token_targets=None):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
        lengths of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is the
        number of steps in the output time series, M is num_mels, and values are entries in the
        mel spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is the
        number of steps in the output time series, F is num_freq, and values are entries in the
        linear spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
        is_training = linear_targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, embed_depth=256]

        with tf.variable_scope('Encoder') as scope:
            x = embedded_inputs

            # 3 conv layers
            for i in range(3):
                x = tf.layers.conv1d(x, filters=512, kernel_size=5, padding='same',
                                     activation=tf.nn.relu, name='Encoder_{}'.format(i))
                x = tf.layers.batch_normalization(x, training=is_training)
                x = tf.layers.dropout(x, rate=0.5, training=is_training,
                                      name='dropout_{}'.format(i))
            encoder_conv_output = x

            # Bidirectional LSTM
            cell_fw = ZoneoutLSTMCell(256, is_training, zoneout_factor_cell=0.1,
                                      zoneout_factor_output=0.1, name='encoder_fw_LSTM')
            cell_bw = ZoneoutLSTMCell(256, is_training, zoneout_factor_cell=0.1,
                                      zoneout_factor_output=0.1, name='encoder_bw_LSTM')
            outputs, states = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, encoder_conv_output,
                sequence_length=input_lengths, dtype=tf.float32)

            # encoder_outputs = [N, T, 2*encoder_lstm_units] = [N, T, 512]
            encoder_outputs = tf.concat(outputs, axis=2)  # Concat forward and backward outputs

        with tf.variable_scope('Decoder') as scope:
            if hp.attention_type == 'loc_sen':  # Location-sensitive attention
                attention_mechanism = LocationSensitiveAttention(
                    128, encoder_outputs, hparams=hp, is_training=is_training,
                    mask_encoder=True, memory_sequence_length=input_lengths,
                    smoothing=False, cumulate_weights=True)
            elif hp.attention_type == 'gmm':  # GMM attention
                attention_mechanism = GmmAttention(
                    128, memory=encoder_outputs, memory_sequence_length=input_lengths)
            elif hp.attention_type == 'step_bah':
                attention_mechanism = BahdanauStepwiseMonotonicAttention(
                    128, encoder_outputs, memory_sequence_length=input_lengths,
                    mode="parallel")
            elif hp.attention_type == 'mon_bah':
                attention_mechanism = BahdanauMonotonicAttention(
                    128, encoder_outputs, memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'loung':  # Luong attention; the hparam value is spelled 'loung'
                attention_mechanism = LuongAttention(
                    128, encoder_outputs, memory_sequence_length=input_lengths)

            # attention_mechanism = LocationSensitiveAttention(
            #     128, encoder_outputs, hparams=hp, is_training=is_training, mask_encoder=True,
            #     memory_sequence_length=input_lengths, smoothing=False, cumulate_weights=True)
            # mask_encoder: whether to mask encoder padding while computing location-sensitive
            #     attention. Set to True for better prosody but slower convergence.
            # cumulate_weights: whether to cumulate (sum) all previous attention weights
            #     or simply feed previous weights (recommended: True).

            decoder_lstm = [
                ZoneoutLSTMCell(1024, is_training, zoneout_factor_cell=0.1,
                                zoneout_factor_output=0.1,
                                name='decoder_LSTM_{}'.format(i + 1))
                for i in range(2)
            ]
            decoder_lstm = tf.contrib.rnn.MultiRNNCell(decoder_lstm, state_is_tuple=True)
            # decoder_init_state = decoder_lstm.zero_state(
            #     batch_size=batch_size, dtype=tf.float32)  # not present in TensorFlow 1

            attention_cell = AttentionWrapper(decoder_lstm, attention_mechanism,
                                              alignment_history=True,
                                              output_attention=False)  # attention_state_size = 256

            # Decoder input -> prenet -> decoder_lstm -> concat[output, attention]
            dec_outputs = DecoderPrenetWrapper(attention_cell, is_training, hp.prenet_depths)
            dec_outputs_cell = OutputProjectionWrapper(dec_outputs,
                                                       hp.num_mels * hp.outputs_per_step)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

            decoder_init_state = dec_outputs_cell.zero_state(batch_size=batch_size,
                                                             dtype=tf.float32)

            (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(dec_outputs_cell, helper, decoder_init_state),
                maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            decoder_mel_outputs = tf.reshape(
                decoder_outputs[:, :, :hp.num_mels * hp.outputs_per_step],
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]
            # stop_token_outputs = tf.reshape(
            #     decoder_outputs[:, :, hp.num_mels * hp.outputs_per_step:],
            #     [batch_size, -1])  # [N, iters]

            # Postnet
            x = decoder_mel_outputs
            for i in range(5):
                activation = tf.nn.tanh if i != 4 else None
                x = tf.layers.conv1d(x, filters=512, kernel_size=5, padding='same',
                                     activation=activation, name='Postnet_{}'.format(i))
                x = tf.layers.batch_normalization(x, training=is_training)
                x = tf.layers.dropout(x, rate=0.5, training=is_training,
                                      name='Postnet_dropout_{}'.format(i))

            residual = tf.layers.dense(x, hp.num_mels, name='residual_projection')
            mel_outputs = decoder_mel_outputs + residual

            # Add post-processing CBHG:
            # mel_outputs: (N, T, num_mels)
            post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training, hp.postnet_depth)
            linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F=1025]

            # Grab alignments from the final decoder state:
            # [batch_size, text length (encoder), target length (decoder)]
            alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_mel_outputs = decoder_mel_outputs
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            # self.stop_token_targets = stop_token_targets
            # self.stop_token_outputs = stop_token_outputs
            self.all_vars = tf.trainable_variables()

            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            # log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            # log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % dec_outputs_cell.output_size)
            log('  decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
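The if/elif chain over `hp.attention_type` can be factored into a helper so that new mechanisms register in one place. A hedged refactoring sketch, using only the constructors and arguments already present in the variant above (the helper name is an invention):

def build_attention_mechanism(hp, encoder_outputs, input_lengths, is_training):
    # Dispatch on the same hparam values the variant above recognizes.
    if hp.attention_type == 'loc_sen':
        return LocationSensitiveAttention(
            128, encoder_outputs, hparams=hp, is_training=is_training, mask_encoder=True,
            memory_sequence_length=input_lengths, smoothing=False, cumulate_weights=True)
    if hp.attention_type == 'gmm':
        return GmmAttention(128, memory=encoder_outputs, memory_sequence_length=input_lengths)
    if hp.attention_type == 'step_bah':
        return BahdanauStepwiseMonotonicAttention(
            128, encoder_outputs, memory_sequence_length=input_lengths, mode='parallel')
    if hp.attention_type == 'mon_bah':
        return BahdanauMonotonicAttention(
            128, encoder_outputs, memory_sequence_length=input_lengths, normalize=True)
    if hp.attention_type == 'loung':  # spelling matches the hparam value used above
        return LuongAttention(128, encoder_outputs, memory_sequence_length=input_lengths)
    raise ValueError('unknown attention_type: %s' % hp.attention_type)

A side benefit of the explicit `raise` is that a typo in the hparam fails loudly instead of leaving `attention_mechanism` unbound, as the original chain would.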
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
        lengths of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is the
        number of steps in the output time series, M is num_mels, and values are entries in the
        mel spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is the
        number of steps in the output time series, F is num_freq, and values are entries in the
        linear spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
        is_training = linear_targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        symbols_length = 149  # based on the previous length of the symbol list
        embedding_table = tf.get_variable(
            'embedding', [symbols_length, hp.embed_depth], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, embed_depth=256]

        # Encoder
        prenet_outputs = prenet(embedded_inputs, is_training,
                                hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
        encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training,
                                       hp.encoder_depth)  # [N, T_in, encoder_depth=256]

        # Attention
        attention_cell = AttentionWrapper(
            GRUCell(hp.attention_depth),
            BahdanauAttention(hp.attention_depth, encoder_outputs),
            alignment_history=True,
            output_attention=False)  # [N, T_in, attention_depth=256]

        # Apply prenet before concatenation in AttentionWrapper.
        attention_cell = DecoderPrenetWrapper(attention_cell, is_training, hp.prenet_depths)

        # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)  # [N, T_in, 2*attention_depth=512]

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell([
            OutputProjectionWrapper(concat_cell, hp.decoder_depth),
            ResidualWrapper(GRUCell(hp.decoder_depth)),
            ResidualWrapper(GRUCell(hp.decoder_depth))
        ], state_is_tuple=True)  # [N, T_in, decoder_depth=256]

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        # Inference-only variant: always decode with the test helper.
        helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry
        mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

        # Add post-processing CBHG:
        post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training,
                                 hp.postnet_depth)  # [N, T_out, postnet_depth=256]
        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
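Every variant here recovers attention alignments with the same transpose. The shape bookkeeping, shown with dummy NumPy data (sizes invented for illustration):

import numpy as np

history = np.zeros([120, 2, 60], dtype=np.float32)  # stacked alignment_history: [T_out, N, T_in]
alignments = np.transpose(history, [1, 2, 0])       # -> [N, T_in, T_out]
print(alignments.shape)                             # (2, 60, 120): one encoder-by-decoder map per utterance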
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None, pml_targets=None,
               is_training=False, gta=False):
    """
    Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
        inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
        input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
            lengths of each sequence in inputs.
        mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is the
            number of steps in the output time series, M is num_mels, and values are entries in
            the mel spectrogram. Only needed for training.
        linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is the
            number of steps in the output time series, F is num_freq, and values are entries in
            the linear spectrogram. Only needed for training.
        pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is the
            number of steps in the PML vocoder feature trajectories, P is pml_dimension, and
            values are PML vocoder features. Only needed for training.
        is_training: boolean flag that is set to True during training
        gta: boolean flag that is set to True when ground truth alignment is required
    """
    with tf.variable_scope('inference') as scope:
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, embed_depth=256]

        # Encoder
        encoder_outputs = conv_and_lstm(  # [N, T_in, 2*encoder_gru_units=512]
            embedded_inputs,
            input_lengths,
            conv_layers=hp.encoder_conv_layers,
            conv_width=hp.encoder_conv_width,
            conv_channels=hp.encoder_conv_channels,
            lstm_units_unidirectional=hp.encoder_gru_units,
            is_training=is_training,
            scope='encoder',
        )

        # Attention
        attention_cell = AttentionWrapper(  # [N, T_in, attention_depth=256]
            DecoderPrenetWrapper(LSTMBlockCell(hp.attention_depth), is_training,
                                 hp.prenet_depths),
            LocationSensitiveAttention(hp.attention_depth, encoder_outputs),
            alignment_history=True,
            output_attention=False)

        # Concatenate attention context vector and RNN cell output into a
        # 2*attention_depth=512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)  # [N, T_in, 2*attention_depth=512]

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell([
            concat_cell,
            LSTMBlockCell(hp.decoder_gru_units),
            LSTMBlockCell(hp.decoder_gru_units)
        ], state_is_tuple=True)  # [N, T_in, decoder_depth=1024]

        # Project onto r PML feature vectors (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(decoder_cell, hp.pml_dimension * hp.outputs_per_step)

        if is_training or gta:
            helper = TacoTrainingHelper(inputs, pml_targets, hp.pml_dimension, hp.outputs_per_step)
        else:
            helper = TacoTestHelper(batch_size, hp.pml_dimension, hp.outputs_per_step)

        decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

        # Reshape outputs to be one output per entry
        pml_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.pml_outputs = pml_outputs
        self.alignments = alignments
        self.pml_targets = pml_targets

        log('Initialized Tacotron model. Dimensions: ')
        log('  Train mode:              {}'.format(is_training))
        log('  GTA mode:                {}'.format(gta))  # was logging is_training for both modes
        log('  Embedding:               {}'.format(embedded_inputs.shape[-1]))
        log('  Encoder out:             {}'.format(encoder_outputs.shape[-1]))
        log('  Attention out:           {}'.format(attention_cell.output_size))
        log('  Concat attn & out:       {}'.format(concat_cell.output_size))
        log('  Decoder cell out:        {}'.format(decoder_cell.output_size))
        log('  Decoder out ({} frames): {}'.format(hp.outputs_per_step, decoder_outputs.shape[-1]))
        log('  PML out:                 {}'.format(pml_outputs.shape[-1]))
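For this PML variant, ground-truth-aligned (GTA) synthesis reuses the training helper while keeping inference-mode batch statistics. A usage sketch (hedged: `Tacotron` and `hparams` are stand-ins for the enclosing class and hyperparameter object, as before):

import tensorflow as tf

inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
input_lengths = tf.placeholder(tf.int32, [None], name='input_lengths')
pml_targets = tf.placeholder(tf.float32, [None, None, hparams.pml_dimension])  # hparams assumed

model = Tacotron(hparams)  # hypothetical constructor
model.initialize(inputs, input_lengths, pml_targets=pml_targets,
                 is_training=False, gta=True)  # teacher-forced decoding from pml_targets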