def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None):
    """Build the synthesis-mode graph and expose its tensors on ``self``.

    The graph is always built with ``is_training = False`` and uses
    ``TacoTestHelper``, so decoding is capped at ``hp.max_iters`` steps.

    Args:
        inputs: symbol ids looked up in the ``inputs_embedding`` table; the
            first dimension is used as the batch size.
        input_lengths: passed to the encoder cell together with the
            embedded inputs.
        mel_targets: not used to build the graph; only stored on ``self``.
        stop_token_targets: not used to build the graph; only stored on
            ``self``.

    Sets ``inputs``, ``input_lengths``, ``decoder_output``, ``alignments``,
    ``stop_token_prediction``, ``stop_token_targets``, ``mel_outputs``,
    ``mel_targets`` and ``helper`` on ``self``.
    """
    with tf.variable_scope('inference'):
        # This variant never trains; every sub-module receives this flag.
        is_training = False
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embedding table indexed by input symbol id.
        symbol_embeddings = tf.get_variable(
            'inputs_embedding',
            [len(symbols), hp.embedding_dim],
            dtype=tf.float32)
        embedded_inputs = tf.nn.embedding_lookup(symbol_embeddings, inputs)

        # Encoder: convolutional stack followed by an LSTM.
        encoder_convolutions = EncoderConvolutions(
            is_training,
            kernel_size=(5,),
            channels=512,
            scope='encoder_convolutions')
        encoder_rnn = EncoderRNN(
            is_training,
            size=hp.encoder_lstm_units,
            zoneout=hp.tacotron_zoneout_rate,
            scope='encoder_LSTM')
        encoder_cell = TacotronEncoderCell(encoder_convolutions, encoder_rnn)
        encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

        # Decoder building blocks.
        # Prenet with layer sizes [256, 256].
        prenet = Prenet(is_training,
                        layer_sizes=[256, 256],
                        scope='decoder_prenet')

        # Attention over the encoder outputs.
        print("PARAMS ATTENTION", hp.attention_dim, encoder_outputs)
        attention_mechanism = LocationSensitiveAttention(hp.attention_dim,
                                                         encoder_outputs)

        decoder_lstm = DecoderRNN(
            is_training,
            layers=hp.decoder_layers,
            size=hp.decoder_lstm_units,
            zoneout=hp.tacotron_zoneout_rate,
            scope='decoder_lstm')

        # Projects each decoder step to num_mels * outputs_per_step values.
        frame_projection = FrameProjection(
            hp.num_mels * hp.outputs_per_step,
            scope='linear_transform')

        # Stop-token projection.
        stop_projection = StopProjection(is_training,
                                         scope='stop_token_projection')

        # Assemble the decoder cell from the pieces above.
        decoder_cell = TacotronDecoderCell(
            prenet,
            attention_mechanism,
            decoder_lstm,
            frame_projection,
            stop_projection,
            mask_finished=hp.mask_finished)

        # Synthesis helper (no teacher forcing).
        self.helper = TacoTestHelper(batch_size, hp.num_mels,
                                     hp.outputs_per_step)

        # Start the decoder from its zero state.
        initial_state = decoder_cell.zero_state(batch_size=batch_size,
                                                dtype=tf.float32)

        if is_training:
            iteration_cap = None
        else:
            iteration_cap = hp.max_iters

        decoder = CustomDecoder(decoder_cell, self.helper, initial_state)
        decoded, final_decoder_state, _ = dynamic_decode(
            decoder,
            impute_finished=hp.impute_finished,
            maximum_iterations=iteration_cap)
        frames_prediction, stop_token_prediction, _ = decoded

        # Reshape so that there is one output frame per entry.
        decoder_output = tf.reshape(frames_prediction,
                                    [batch_size, -1, hp.num_mels])
        stop_token_prediction = tf.reshape(stop_token_prediction,
                                           [batch_size, -1])

        # Postnet convolutions applied to the decoder output.
        postnet = Postnet(
            is_training,
            kernel_size=hp.postnet_kernel_size,
            channels=hp.postnet_channels,
            scope='postnet_convolutions')
        postnet_features = postnet(decoder_output)

        # Project the postnet features back to num_mels channels.
        postnet_projection = FrameProjection(hp.num_mels,
                                             scope='postnet_projection')
        mel_residual = postnet_projection(postnet_features)

        # Final mel spectrogram: decoder output plus the projected residual.
        mel_outputs = decoder_output + mel_residual

        # Alignments come from the final decoder state's attention history.
        alignment_stack = final_decoder_state.alignment_history.stack()
        alignments = tf.transpose(alignment_stack, [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.decoder_output = decoder_output
        self.alignments = alignments
        self.stop_token_prediction = stop_token_prediction
        self.stop_token_targets = stop_token_targets
        self.mel_outputs = mel_outputs
        self.mel_targets = mel_targets
def initialize(self,
               inputs,
               inputs_tone_stress,
               speaker_labels,
               language_labels,
               input_lengths,
               mel_targets=None,
               stop_token_targets=None,
               linear_targets=None,
               targets_lengths=None,
               gta=False,
               global_step=None,
               is_training=False,
               is_evaluating=False,
               split_infos=None):
    """
    Initializes the model for inference
    sets "mel_outputs" and "alignments" fields.

    The batch is split into ``hp.tacotron_num_gpus`` towers; one copy of the
    network is built per tower and the per-tower tensors are stored in the
    ``self.tower_*`` lists.

    Args:
        - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
          steps in the input time series, and values are character IDs
        - inputs_tone_stress: ids looked up in the tone/stress embedding table; split and
          reshaped the same way as inputs.
        - speaker_labels: speaker ids, split along axis 0 per tower and looked up in the
          speaker embedding table.
        - language_labels: language ids, looked up in the language embedding table; the
          result is concatenated to the encoder outputs on the last axis.
        - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
          lengths of each sequence in inputs.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
          number of steps in the output time series, M is num_mels, and values are entries in
          the mel spectrogram. Only needed for training.
        - stop_token_targets: stop token targets; must accompany mel_targets unless gta.
        - linear_targets: linear spectrogram targets; required when training with
          hp.predict_linear, rejected in GTA mode.
        - targets_lengths: target lengths; required when training with hp.mask_decoder.
        - gta: ground-truth-aligned mode flag; disables the linear post-processing branch.
        - global_step: required when training with 'scheduled' teacher forcing.
        - is_training / is_evaluating: mutually exclusive mode flags; when both are False
          the synthesis helper is used and decoding is capped at hp.max_iters.
        - split_infos: 2-D array-like whose columns 0..3 are passed to split_func for
          inputs, mel, stop-token and linear targets respectively. Required.

    Raises:
        ValueError: on inconsistent target arguments or missing split_infos.
        RuntimeError: on missing targets_lengths for masking, or when both
            is_training and is_evaluating are set.
        AssertionError: on an unknown teacher forcing mode, or a missing
            global_step in scheduled mode.
    """
    if mel_targets is None and stop_token_targets is not None:
        raise ValueError(
            'no multi targets were provided but token_targets were given')
    if mel_targets is not None and stop_token_targets is None and not gta:
        raise ValueError(
            'Mel targets are provided without corresponding token_targets')
    if (not gta and self._hparams.predict_linear
            and linear_targets is None and is_training):
        raise ValueError(
            'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
        )
    if gta and linear_targets is not None:
        raise ValueError(
            'Linear spectrogram prediction is not supported in GTA mode!')
    if is_training and self._hparams.mask_decoder and targets_lengths is None:
        raise RuntimeError(
            'Model set to mask paddings but no targets lengths provided for the mask!'
        )
    if is_training and is_evaluating:
        raise RuntimeError(
            'Model can not be in training and evaluation modes at the same time!'
        )
    # split_infos is subscripted unconditionally below; fail with a clear
    # message instead of "'NoneType' object is not subscriptable".
    if split_infos is None:
        raise ValueError(
            'split_infos must be provided to split the batch across devices!')

    hp = self._hparams

    split_device = ('/cpu:0'
                    if hp.tacotron_num_gpus > 1 or hp.split_on_cpu else
                    '/gpu:{}'.format(hp.tacotron_gpu_start_idx))
    with tf.device(split_device):
        lout_int = [tf.int32] * hp.tacotron_num_gpus
        lout_float = [tf.float32] * hp.tacotron_num_gpus

        tower_input_lengths = tf.split(
            input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0)
        tower_targets_lengths = tf.split(
            targets_lengths, num_or_size_splits=hp.tacotron_num_gpus,
            axis=0) if targets_lengths is not None else None
        tower_speaker_labels = tf.split(
            speaker_labels, num_or_size_splits=hp.tacotron_num_gpus, axis=0)

        p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]],
                              lout_int)
        p_inputs_tone_stress = tf.py_func(
            split_func, [inputs_tone_stress, split_infos[:, 0]], lout_int)
        p_language_labels = tf.py_func(
            split_func, [language_labels, split_infos[:, 0]], lout_int)
        p_mel_targets = tf.py_func(
            split_func, [mel_targets, split_infos[:, 1]],
            lout_float) if mel_targets is not None else None
        p_stop_token_targets = tf.py_func(
            split_func, [stop_token_targets, split_infos[:, 2]],
            lout_float) if stop_token_targets is not None else None
        p_linear_targets = tf.py_func(
            split_func, [linear_targets, split_infos[:, 3]],
            lout_float) if linear_targets is not None else None

        tower_inputs = []
        tower_inputs_tone_stress = []
        tower_language_labels = []
        tower_mel_targets = []
        tower_stop_token_targets = []
        tower_linear_targets = []

        batch_size = tf.shape(inputs)[0]
        mel_channels = hp.num_mels
        linear_channels = hp.num_freq
        # py_func outputs lose their static shape, so restore the known
        # dimensions explicitly.
        for i in range(hp.tacotron_num_gpus):
            tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
            tower_inputs_tone_stress.append(
                tf.reshape(p_inputs_tone_stress[i], [batch_size, -1]))
            tower_language_labels.append(
                tf.reshape(p_language_labels[i], [batch_size, -1]))
            if p_mel_targets is not None:
                tower_mel_targets.append(
                    tf.reshape(p_mel_targets[i],
                               [batch_size, -1, mel_channels]))
            if p_stop_token_targets is not None:
                tower_stop_token_targets.append(
                    tf.reshape(p_stop_token_targets[i], [batch_size, -1]))
            if p_linear_targets is not None:
                tower_linear_targets.append(
                    tf.reshape(p_linear_targets[i],
                               [batch_size, -1, linear_channels]))

    self.tower_decoder_output = []
    self.tower_alignments = []
    self.tower_stop_token_prediction = []
    self.tower_mel_outputs = []
    self.tower_linear_outputs = []
    self.tower_predict_speaker_labels = []

    # Per-tower phoneme embeddings, tone/stress embeddings and the
    # concatenated input embeddings (kept only for the shape log below).
    tower_embedded_inputs_phoneme = []
    tower_embedded_inputs_tone_stress = []
    tower_embedded_inputs_concat = []
    tower_enc_conv_output_shape = []
    tower_encoder_outputs = []
    tower_residual = []
    tower_projected_residual = []

    # These checks and the post-processing switch do not depend on the
    # tower, so evaluate them once before building any tower.
    assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
    if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
        assert global_step is not None

    # GTA is only used for predicting mels to train the Wavenet vocoder, so
    # post processing is omitted when doing GTA synthesis.
    post_condition = hp.predict_linear and not gta

    # 1. Declare GPU Devices
    gpus = [
        "/gpu:{}".format(i)
        for i in range(hp.tacotron_gpu_start_idx,
                       hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)
    ]
    for i in range(hp.tacotron_num_gpus):
        with tf.device(
                tf.train.replica_device_setter(ps_tasks=1,
                                               ps_device="/cpu:0",
                                               worker_device=gpus[i])):
            with tf.variable_scope('inference'):
                # Phoneme embeddings
                # ==> [batch_size, sequence_length, embedding_dim]
                # (original author's note: 512)
                self.phoneme_embedding_table = tf.get_variable(
                    'inputs_phoneme_embedding',
                    [len(symbols), hp.phoneme_embedding_dim],
                    dtype=tf.float32)
                embedded_inputs_phoneme = tf.nn.embedding_lookup(
                    self.phoneme_embedding_table, tower_inputs[i])

                # Tone and stress embeddings
                # ==> [batch_size, sequence_length, embedding_dim]
                # (original author's note: 16)
                self.tone_stress_embedding_table = tf.get_variable(
                    'inputs_tone_stress_embedding',
                    [tone_stress_symbols_max_no, hp.tone_stress_embedding_dim],
                    dtype=tf.float32)
                embedded_inputs_tone_stress = tf.nn.embedding_lookup(
                    self.tone_stress_embedding_table,
                    tower_inputs_tone_stress[i])

                # Concatenate on the feature axis (original note: 512 + 16).
                embedded_inputs_concat = tf.concat(
                    [embedded_inputs_phoneme, embedded_inputs_tone_stress],
                    axis=-1)

                self.speaker_embedding_table = tf.get_variable(
                    'speaker_embedding', [hp.speaker_num, hp.speaker_dim],
                    dtype=tf.float32)
                embedded_speaker_label = tf.nn.embedding_lookup(
                    self.speaker_embedding_table, tower_speaker_labels[i])

                self.language_embedding_table = tf.get_variable(
                    'language_embedding', [hp.language_num, hp.language_dim],
                    dtype=tf.float32)
                embedded_language_label = tf.nn.embedding_lookup(
                    self.language_embedding_table, tower_language_labels[i])

                # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                encoder_cell = TacotronEncoderCell(
                    EncoderConvolutions(is_training,
                                        hparams=hp,
                                        scope='encoder_convolutions'),
                    EncoderRNN(is_training,
                               size=hp.encoder_lstm_units,
                               zoneout=hp.tacotron_zoneout_rate,
                               scope='encoder_LSTM'))

                encoder_outputs = encoder_cell(embedded_inputs_concat,
                                               tower_input_lengths[i])

                # For shape visualization purpose
                enc_conv_output_shape = encoder_cell.conv_output_shape

                # Debug handles exposed on self (overwritten by each tower).
                self.input_seq_len = tf.shape(encoder_outputs)[1]
                self.language_len = tf.shape(embedded_language_label)[1]
                self.language_id_print = tower_language_labels[i]
                self.tone_stress_print = tower_inputs_tone_stress[i]

                # Language embedding is appended to every encoder step.
                LID_encoder_outputs = tf.concat(
                    [encoder_outputs, embedded_language_label], axis=-1)

                # Adversarial speaker classifier: takes the encoder output
                # and predicts the speaker label.
                speaker_classify = Speaker_Classifier(
                    is_training,
                    layer_size=hp.softmax_hidden_layer,
                    speaker_size=hp.speaker_num)
                predict_speaker_labels = speaker_classify(
                    LID_encoder_outputs, hp.grad_rev_scale)

                # Variational AutoEncoder: only run on real mel targets when
                # training; otherwise feed an all-zero residual encoding.
                if is_training:
                    VAE_cell = VAECell(
                        VAEConvolutions(is_training,
                                        hparams=hp,
                                        scope='VAE_convolutions'),
                        VAERNN(is_training,
                               layers=hp.VAE_lstm_num_layers,
                               size=hp.VAE_lstm_layer_size,
                               zoneout=hp.tacotron_zoneout_rate,
                               scope='VAE_LSTM'), hp.VAE_pool_size,
                        hp.VAE_D_size)
                    residual_encoding, self.kl_div, self.D_mean, self.D_var = VAE_cell(
                        tower_mel_targets[i], hp.tacotron_batch_size)
                elif is_evaluating:
                    residual_encoding, self.kl_div = tf.zeros(
                        [hp.tacotron_batch_size, hp.VAE_D_size],
                        dtype=tf.float32), 0
                else:
                    residual_encoding = tf.zeros(
                        [hp.tacotron_synthesis_batch_size, hp.VAE_D_size],
                        dtype=tf.float32)

                self.residual_encoding = residual_encoding

                # Decoder Parts
                # Attention Decoder Prenet
                prenet = Prenet(is_training,
                                layers_sizes=hp.prenet_layers,
                                drop_rate=hp.tacotron_dropout_rate,
                                scope='decoder_prenet')
                # Attention Mechanism
                attention_mechanism = LocationSensitiveAttention(
                    hp.attention_dim,
                    LID_encoder_outputs,
                    hparams=hp,
                    mask_encoder=hp.mask_encoder,
                    memory_sequence_length=tf.reshape(
                        tower_input_lengths[i], [-1]),
                    smoothing=hp.smoothing,
                    cumulate_weights=hp.cumulative_weights)
                # Decoder LSTM Cells
                decoder_lstm = DecoderRNN(is_training,
                                          layers=hp.decoder_layers,
                                          size=hp.decoder_lstm_units,
                                          zoneout=hp.tacotron_zoneout_rate,
                                          scope='decoder_LSTM')
                # Frames Projection layer
                frame_projection = FrameProjection(
                    hp.num_mels * hp.outputs_per_step,
                    scope='linear_transform_projection')
                # <stop_token> projection layer
                stop_projection = StopProjection(
                    is_training or is_evaluating,
                    shape=hp.outputs_per_step,
                    scope='stop_token_projection')

                # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                # The language embedding is not passed here; it already
                # reaches the decoder through LID_encoder_outputs.
                decoder_cell = TacotronDecoderCell(
                    prenet,
                    attention_mechanism,
                    decoder_lstm,
                    embedded_speaker_label,
                    residual_encoding,
                    frame_projection,
                    stop_projection)

                # Define the helper for our decoder
                if is_training or is_evaluating or gta:
                    self.helper = TacoTrainingHelper(
                        batch_size, tower_mel_targets[i], hp, gta,
                        is_evaluating, global_step)
                else:
                    self.helper = TacoTestHelper(batch_size, hp)

                # initial decoder state
                decoder_init_state = decoder_cell.zero_state(
                    batch_size=batch_size, dtype=tf.float32)

                # Only use max iterations at synthesis time
                max_iters = hp.max_iters if not (
                    is_training or is_evaluating) else None

                # Decode
                (frames_prediction, stop_token_prediction,
                 _), final_decoder_state, _ = dynamic_decode(
                     CustomDecoder(decoder_cell, self.helper,
                                   decoder_init_state),
                     impute_finished=False,
                     maximum_iterations=max_iters,
                     swap_memory=hp.tacotron_swap_with_cpu)

                # Reshape outputs to be one output per entry
                # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                decoder_output = tf.reshape(frames_prediction,
                                            [batch_size, -1, hp.num_mels])
                stop_token_prediction = tf.reshape(stop_token_prediction,
                                                   [batch_size, -1])

                # Postnet
                postnet = Postnet(is_training,
                                  hparams=hp,
                                  scope='postnet_convolutions')

                # Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
                residual = postnet(decoder_output)

                # Project residual to same dimension as mel spectrogram
                # ==> [batch_size, decoder_steps * r, num_mels]
                residual_projection = FrameProjection(
                    hp.num_mels, scope='postnet_projection')
                projected_residual = residual_projection(residual)

                # Compute the mel spectrogram
                mel_outputs = decoder_output + projected_residual

                if post_condition:
                    # Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs.
                    post_cbhg = CBHG(hp.cbhg_kernels,
                                     hp.cbhg_conv_channels,
                                     hp.cbhg_pool_size,
                                     [hp.cbhg_projection, hp.num_mels],
                                     hp.cbhg_projection_kernel_size,
                                     hp.cbhg_highwaynet_layers,
                                     hp.cbhg_highway_units,
                                     hp.cbhg_rnn_units,
                                     is_training,
                                     name='CBHG_postnet')

                    # [batch_size, decoder_steps(mel_frames), cbhg_channels]
                    post_outputs = post_cbhg(mel_outputs, None)

                    # Linear projection of extracted features to make linear spectrogram
                    linear_specs_projection = FrameProjection(
                        hp.num_freq, scope='cbhg_linear_specs_projection')

                    # [batch_size, decoder_steps(linear_frames), num_freq]
                    linear_outputs = linear_specs_projection(post_outputs)

                # Grab alignments from the final decoder state
                alignments = tf.transpose(
                    final_decoder_state.alignment_history.stack(), [1, 2, 0])

                self.tower_decoder_output.append(decoder_output)
                self.tower_alignments.append(alignments)
                self.tower_stop_token_prediction.append(stop_token_prediction)
                self.tower_mel_outputs.append(mel_outputs)
                self.tower_predict_speaker_labels.append(
                    predict_speaker_labels)
                tower_embedded_inputs_phoneme.append(embedded_inputs_phoneme)
                tower_embedded_inputs_tone_stress.append(
                    embedded_inputs_tone_stress)
                tower_embedded_inputs_concat.append(embedded_inputs_concat)
                tower_enc_conv_output_shape.append(enc_conv_output_shape)
                tower_encoder_outputs.append(LID_encoder_outputs)
                tower_residual.append(residual)
                tower_projected_residual.append(projected_residual)

                if post_condition:
                    self.tower_linear_outputs.append(linear_outputs)
        log('initialisation done {}'.format(gpus[i]))

    if is_training:
        # Teacher forcing ratio as held by the last tower's helper.
        self.ratio = self.helper._ratio
    self.tower_inputs = tower_inputs
    self.tower_inputs_tone_stress = tower_inputs_tone_stress
    self.tower_input_lengths = tower_input_lengths
    self.tower_mel_targets = tower_mel_targets
    self.tower_linear_targets = tower_linear_targets
    self.tower_targets_lengths = tower_targets_lengths
    self.tower_stop_token_targets = tower_stop_token_targets
    self.tower_speaker_labels = tower_speaker_labels
    self.tower_language_labels = tower_language_labels

    self.all_vars = tf.trainable_variables()

    log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
    log(' Train mode: {}'.format(is_training))
    log(' Eval mode: {}'.format(is_evaluating))
    log(' GTA mode: {}'.format(gta))
    log(' Synthesis mode: {}'.format(not (is_training or is_evaluating)))
    log(' Input: {}'.format(inputs.shape))
    # The tower lists hold exactly tacotron_num_gpus entries, so iterate over
    # that count only; adding tacotron_gpu_start_idx to the range would index
    # past the end of every list whenever the start index is non-zero.
    for i in range(hp.tacotron_num_gpus):
        log(' device: {}'.format(hp.tacotron_gpu_start_idx + i))
        log(' phoneme embedding: {}'.format(
            tower_embedded_inputs_phoneme[i].shape))
        log(' tone stress embedding: {}'.format(
            tower_embedded_inputs_tone_stress[i].shape))
        log(' concat embedding: {}'.format(
            tower_embedded_inputs_concat[i].shape))
        log(' enc conv out: {}'.format(tower_enc_conv_output_shape[i]))
        log(' encoder out: {}'.format(tower_encoder_outputs[i].shape))
        log(' decoder out: {}'.format(self.tower_decoder_output[i].shape))
        log(' residual out: {}'.format(tower_residual[i].shape))
        log(' projected residual out: {}'.format(
            tower_projected_residual[i].shape))
        log(' mel out: {}'.format(self.tower_mel_outputs[i].shape))
        if post_condition:
            log(' linear out: {}'.format(
                self.tower_linear_outputs[i].shape))
        log(' <stop_token> out: {}'.format(
            self.tower_stop_token_prediction[i].shape))

    # Written as 1000000 because the 1_000_000 literal form is a syntax
    # error on older Python versions.
    log(' Tacotron Parameters {:.3f} Million.'.format(
        np.sum([np.prod(v.get_shape().as_list())
                for v in self.all_vars]) / 1000000))
def initialize(self,
               inputs,
               input_lengths,
               mel_targets=None,
               linear_targets=None,
               pml_targets=None,
               is_training=False,
               gta=False,
               locked_alignments=None,
               logs_enabled=True):
    """
    Initializes the model for inference.

    Sets "pml_intermediates", "pml_outputs", and "alignments" fields (along
    with "inputs", "input_lengths" and "pml_targets").

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Not referenced by this method; accepted for interface compatibility.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Not referenced by this method; accepted for interface compatibility.
      pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is number of
        steps in the PML vocoder features trajectories, P is pml_dimension, and values are PML vocoder
        features. Only needed for training (it is fed to the training helper when is_training or gta).
      is_training: boolean flag that is set to True during training
      gta: boolean flag that is set to True when ground truth alignment is required
      locked_alignments: when explicit attention alignment is required, the locked alignments are passed in this
        parameter and the attention alignments are locked to these values
      logs_enabled: boolean flag that defaults to True, if False no construction logs output
    """
    # Fix the alignments shape to (batch_size, encoder_steps, decoder_steps)
    # if it does not already include the batch dimension.
    locked_alignments_ = locked_alignments
    if locked_alignments_ is not None and np.ndim(locked_alignments_) < 3:
        locked_alignments_ = np.expand_dims(locked_alignments_, 0)

    with tf.variable_scope('inference'):
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), hp.embedding_dim],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(
            embedding_table, inputs)  # [N, T_in, embed_depth=256]

        # Encoder
        encoder_outputs = conv_and_gru(  # [N, T_in, 2*encoder_gru_units=512]
            embedded_inputs,
            input_lengths,
            conv_layers=hp.encoder_conv_layers,
            conv_width=hp.encoder_conv_width,
            conv_channels=hp.encoder_conv_channels,
            gru_units_unidirectional=hp.encoder_gru_units,
            is_training=is_training,
            scope='encoder',
        )

        # Attention
        attention_cell = LockableAttentionWrapper(
            GRUCell(hp.attention_depth),
            LocationSensitiveAttention(hp.attention_depth, encoder_outputs),
            alignment_history=True,
            locked_alignments=locked_alignments_,
            output_attention=False,
            name='attention_wrapper')  # [N, T_in, attention_depth=256]

        # Concatenate attention context vector and RNN cell output into a
        # 2*attention_depth=512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(
            attention_cell)  # [N, T_in, 2*attention_depth=512]

        # Decoder (layers specified bottom to top):
        cells = [
            GRUCell(hp.decoder_gru_units)
            for _ in range(hp.decoder_gru_layers)
        ]
        decoder_cell = MultiRNNCell(
            [concat_cell] + cells,
            state_is_tuple=True)  # [N, T_in, decoder_depth=1024]

        # Project onto r PML feature vectors (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(
            decoder_cell, hp.pml_dimension * hp.outputs_per_step)

        if is_training or gta:
            if hp.scheduled_sampling:
                helper = TacoScheduledOutputTrainingHelper(
                    inputs, pml_targets, hp.pml_dimension,
                    hp.outputs_per_step, hp.scheduled_sampling_probability)
            else:
                helper = TacoTrainingHelper(inputs, pml_targets,
                                            hp.pml_dimension,
                                            hp.outputs_per_step)
        else:
            helper = TacoTestHelper(batch_size, hp.pml_dimension,
                                    hp.outputs_per_step)

        decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                    dtype=tf.float32)

        (decoder_outputs,
         _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
             BasicDecoder(output_cell, helper, decoder_init_state),
             maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

        # Reshape outputs to be one output per entry
        pml_intermediates = tf.reshape(
            decoder_outputs,
            [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

        # Add Post-Processing Conv and GRU layer:
        expand_outputs = conv_and_gru(  # [N, T_in, 2*expand_gru_units=512]
            pml_intermediates,
            None,
            conv_layers=hp.expand_conv_layers,
            conv_width=hp.expand_conv_width,
            conv_channels=hp.expand_conv_channels,
            gru_units_unidirectional=hp.expand_gru_units,
            is_training=is_training,
            scope='expand',
        )

        pml_outputs = tf.layers.dense(expand_outputs,
                                      hp.pml_dimension)  # [N, T_out, P]

        # Grab alignments from the final decoder state; element 0 of the
        # MultiRNNCell state belongs to the attention-wrapped bottom cell.
        alignments = tf.transpose(
            final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.pml_intermediates = pml_intermediates
        self.pml_outputs = pml_outputs
        self.alignments = alignments
        self.pml_targets = pml_targets

        if logs_enabled:
            log('Initialized Tacotron model. Dimensions: ')
            log(' Train mode: {}'.format(is_training))
            # Report the gta flag itself; this line previously echoed
            # is_training, so the log was wrong whenever the two differed.
            log(' GTA mode: {}'.format(gta))
            log(' Embedding: {}'.format(embedded_inputs.shape[-1]))
            log(' Encoder out: {}'.format(encoder_outputs.shape[-1]))
            log(' Attention out: {}'.format(attention_cell.output_size))
            log(' Concat attn & out: {}'.format(concat_cell.output_size))
            log(' Decoder cell out: {}'.format(decoder_cell.output_size))
            log(' Decoder out ({} frames): {}'.format(
                hp.outputs_per_step, decoder_outputs.shape[-1]))
            log(' Decoder out (1 frame): {}'.format(
                pml_intermediates.shape[-1]))
            log(' Expand out: {}'.format(expand_outputs.shape[-1]))
            log(' PML out: {}'.format(pml_outputs.shape[-1]))
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False, style_transfer=True, global_step=None, is_training=False, is_evaluating=False, split_infos=None): """ Initializes the model for inference sets "mel_outputs" and "alignments" fields. Args: - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. """ if mel_targets is None and stop_token_targets is not None: raise ValueError( 'no multi targets were provided but token_targets were given') if mel_targets is not None and stop_token_targets is None and not gta and not style_transfer: raise ValueError( 'Mel targets are provided without corresponding token_targets') if not gta and self._hparams.predict_linear == True and linear_targets is None and is_training: raise ValueError( 'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!' ) if gta and linear_targets is not None: raise ValueError( 'Linear spectrogram prediction is not supported in GTA mode!') if is_training and self._hparams.mask_decoder and targets_lengths is None: raise RuntimeError( 'Model set to mask paddings but no targets lengths provided for the mask!' ) if is_training and is_evaluating: raise RuntimeError( 'Model can not be in training and evaluation modes at the same time!' 
) if style_transfer and (not is_training) and (not is_evaluating): assert self._hparams.tacotron_style_reference_audio is not None or \ (self._hparams.tacotron_style_alignment is not None and len(self._hparams.tacotron_style_alignment) == self._hparams.tacotron_n_style_token) split_device = '/cpu:0' if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:{}'.format( self._hparams.tacotron_gpu_start_idx) with tf.device(split_device): hp = self._hparams lout_int = [tf.int32] * hp.tacotron_num_gpus lout_float = [tf.float32] * hp.tacotron_num_gpus tower_input_lengths = tf.split( input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0) tower_targets_lengths = tf.split( targets_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0) if targets_lengths is not None else targets_lengths p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]], lout_int) p_mel_targets = tf.py_func( split_func, [mel_targets, split_infos[:, 1]], lout_float) if mel_targets is not None else mel_targets p_stop_token_targets = tf.py_func( split_func, [stop_token_targets, split_infos[:, 2]], lout_float ) if stop_token_targets is not None else stop_token_targets p_linear_targets = tf.py_func( split_func, [linear_targets, split_infos[:, 3]], lout_float) if linear_targets is not None else linear_targets tower_inputs = [] tower_mel_targets = [] tower_stop_token_targets = [] tower_linear_targets = [] batch_size = tf.shape(inputs)[0] mel_channels = hp.num_mels linear_channels = hp.num_freq for i in range(hp.tacotron_num_gpus): tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1])) if p_mel_targets is not None: tower_mel_targets.append( tf.reshape(p_mel_targets[i], [batch_size, -1, mel_channels])) if p_stop_token_targets is not None: tower_stop_token_targets.append( tf.reshape(p_stop_token_targets[i], [batch_size, -1])) if p_linear_targets is not None: tower_linear_targets.append( tf.reshape(p_linear_targets[i], [batch_size, -1, linear_channels])) 
self.tower_decoder_output = [] self.tower_alignments = [] self.tower_stop_token_prediction = [] self.tower_mel_outputs = [] self.tower_linear_outputs = [] tower_embedded_inputs = [] tower_enc_conv_output_shape = [] tower_encoder_outputs = [] tower_residual = [] tower_projected_residual = [] # 1. Declare GPU Devices gpus = [ "/gpu:{}".format(i) for i in range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus) ] for i in range(hp.tacotron_num_gpus): with tf.device( tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0", worker_device=gpus[i])): with tf.variable_scope('inference') as scope: assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled') if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training: assert global_step is not None # GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis post_condition = hp.predict_linear and not gta # Embeddings ==> [batch_size, sequence_length, embedding_dim] self.embedding_table = tf.get_variable( 'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32) embedded_inputs = tf.nn.embedding_lookup( self.embedding_table, tower_inputs[i]) # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units] encoder_cell = TacotronEncoderCell( EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'), EncoderRNN(is_training, size=hp.encoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM')) encoder_outputs = encoder_cell(embedded_inputs, tower_input_lengths[i]) # For shape visualization purpose enc_conv_output_shape = encoder_cell.conv_output_shape # style token layers self.style_embedding_table = tf.get_variable( 'style_token_embedding', [hp.tacotron_n_style_token, hp.embedding_dim], dtype=tf.float32, initializer=tf.truncated_normal_initializer( stddev=0.5)) # in order to synthese audio in random weights, style_encoder_outputs[-1]==style_embedding_table[-1] if (is_training or 
is_evaluating) and style_transfer: style_encoder_cell = TacotronReferenceEncoderCell( ReferenceEncoder( hp, layer_sizes=hp.tacotron_reference_layer_size, is_training=is_training, activation=tf.nn.relu), tf.nn.rnn_cell.GRUCell( num_units=hp.tacotron_reference_gru_hidden_size ), StyleTokenLayer( output_size=hp. tacotron_style_encoder_outputs_size, is_training=is_training), hparams=hp) # style_encoder_outputs: [batch_size,1,hp.encoder_lstm_units*2] style_encoder_outputs = style_encoder_cell( tower_mel_targets[i], input_lengths=tower_targets_lengths[i], style_token_embedding=self.style_embedding_table) # [batch_size,seq_len,hp.encoder_lstm_units*2] # encoder_outputs = encoder_outputs + style_encoder_outputs # element-wise add seq_len = tf.shape(encoder_outputs)[1] style_encoder_outputs = tf.tile( style_encoder_outputs, multiples=[1, seq_len, 1]) encoder_outputs = tf.concat( [encoder_outputs, style_encoder_outputs], axis=-1) # concat elif (not is_training) and ( not is_evaluating ) and style_transfer: # synthesis with style transfer if hp.tacotron_style_alignment is not None and \ len(hp.tacotron_style_alignment) == hp.tacotron_n_style_token: # random weights # [n_style_tokens,] style_alignment = tf.convert_to_tensor( hp.tacotron_style_alignment, dtype=tf.float32) # [n_style_tokens,1]*[n_style_tokens, embed_size] -> [n_style_tokens,embed_size] -> [embed_size,] style_context = tf.reduce_sum( tf.expand_dims(style_alignment, axis=-1) * self.style_embedding_table, axis=0) # [1,embed_size] style_context = tf.expand_dims(style_context, axis=0) # [batch_size,1,embed_size], all synthesis audio should be one style style_context = tf.tile( tf.expand_dims(style_context, axis=0), multiples=[batch_size, 1, 1]) # encoder_outputs = encoder_outputs + style_context # element-wise add seq_len = tf.shape(encoder_outputs)[1] style_context = tf.tile(style_context, multiples=[1, seq_len, 1]) # dense? 
# style_context = tf.layers.dense(style_context, units=hp.tacotron_style_encoder_outputs_size, # activation=tf.nn.tanh) encoder_outputs = tf.concat( [encoder_outputs, style_context], axis=-1) # concat elif len(tower_mel_targets ) > 0 and tower_mel_targets[i] is not None: # reference audio style_encoder_cell = TacotronReferenceEncoderCell( ReferenceEncoder(hp, layer_sizes=hp. tacotron_reference_layer_size, is_training=is_training, activation=tf.nn.relu), tf.nn.rnn_cell.GRUCell( num_units=hp. tacotron_reference_gru_hidden_size), StyleTokenLayer( output_size=hp. tacotron_style_encoder_outputs_size, is_training=is_training), hparams=hp) # style_encoder_outputs: [batch_size,1,hp.encoder_lstm_units*2(hp.tacotron_style_encoder_outputs_size)] style_encoder_outputs = style_encoder_cell( tower_mel_targets[i], # audio style reference audio input_lengths=tower_targets_lengths[i], style_token_embedding=self. style_embedding_table) # [batch_size,seq_len,hp.encoder_lstm_units*2] # encoder_outputs = encoder_outputs + style_encoder_outputs # element-wise add seq_len = tf.shape(encoder_outputs)[1] style_encoder_outputs = tf.tile( style_encoder_outputs, multiples=[1, seq_len, 1]) encoder_outputs = tf.concat( [encoder_outputs, style_encoder_outputs], axis=-1) # concat # Decoder Parts # Attention Decoder Prenet prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet') # Attention Mechanism attention_mechanism = LocationSensitiveAttention( hp.attention_dim, encoder_outputs, hparams=hp, mask_encoder=hp.mask_encoder, memory_sequence_length=tf.reshape( tower_input_lengths[i], [-1]), smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights) # Decoder LSTM Cells decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers, size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_LSTM') # Frames Projection layer frame_projection = FrameProjection( hp.num_mels * hp.outputs_per_step, 
scope='linear_transform_projection') # <stop_token> projection layer stop_projection = StopProjection( is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection') # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding) # only generate r frames(num_mels*r) in one decoder step, [batch_size,1,num_mels*r] # then put this decoder_cell with helper into dynamic_decode to generate [batch_size,decoder_steps,num_mels*r] decoder_cell = TacotronDecoderCell(prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection) # Define the helper for our decoder if is_training or is_evaluating or gta: self.helper = TacoTrainingHelper( batch_size, tower_mel_targets[i], hp, gta, is_evaluating, global_step) else: self.helper = TacoTestHelper(batch_size, hp) # initial decoder state decoder_init_state = decoder_cell.zero_state( batch_size=batch_size, dtype=tf.float32) # Only use max iterations at synthesis time max_iters = hp.max_iters if not ( is_training or is_evaluating) else None # Decode (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode( CustomDecoder(decoder_cell, self.helper, decoder_init_state), impute_finished=False, maximum_iterations=max_iters, swap_memory=hp.tacotron_swap_with_cpu) # Reshape outputs to be one output per entry # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels] decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels]) stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1]) # Postnet postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions') # Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels] residual = postnet(decoder_output) # Project residual to same dimension as mel spectrogram # ==> [batch_size, decoder_steps * r, num_mels] residual_projection = FrameProjection( hp.num_mels, scope='postnet_projection') projected_residual = 
residual_projection(residual) # Compute the mel spectrogram mel_outputs = decoder_output + projected_residual if post_condition: # Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs. post_cbhg = CBHG(hp.cbhg_kernels, hp.cbhg_conv_channels, hp.cbhg_pool_size, [hp.cbhg_projection, hp.num_mels], hp.cbhg_projection_kernel_size, hp.cbhg_highwaynet_layers, hp.cbhg_highway_units, hp.cbhg_rnn_units, is_training, name='CBHG_postnet') # [batch_size, decoder_steps(mel_frames), cbhg_channels] post_outputs = post_cbhg(mel_outputs, None) # Linear projection of extracted features to make linear spectrogram linear_specs_projection = FrameProjection( hp.num_freq, scope='cbhg_linear_specs_projection') # [batch_size, decoder_steps(linear_frames), num_freq] linear_outputs = linear_specs_projection(post_outputs) # Grab alignments from the final decoder state alignments = tf.transpose( final_decoder_state.alignment_history.stack(), [1, 2, 0]) self.tower_decoder_output.append(decoder_output) self.tower_alignments.append(alignments) self.tower_stop_token_prediction.append( stop_token_prediction) self.tower_mel_outputs.append(mel_outputs) tower_embedded_inputs.append(embedded_inputs) tower_enc_conv_output_shape.append(enc_conv_output_shape) tower_encoder_outputs.append(encoder_outputs) tower_residual.append(residual) tower_projected_residual.append(projected_residual) if post_condition: self.tower_linear_outputs.append(linear_outputs) log('initialisation done {}'.format(gpus[i])) if is_training: self.ratio = self.helper._ratio self.tower_inputs = tower_inputs self.tower_input_lengths = tower_input_lengths self.tower_mel_targets = tower_mel_targets self.tower_linear_targets = tower_linear_targets self.tower_targets_lengths = tower_targets_lengths self.tower_stop_token_targets = tower_stop_token_targets self.all_vars = tf.trainable_variables() log('Initialized Tacotron model. Dimensions (? 
= dynamic shape): ') log(' Train mode: {}'.format(is_training)) log(' Eval mode: {}'.format(is_evaluating)) log(' GTA mode: {}'.format(gta)) log(' Synthesis mode: {}'.format(not ( is_training or is_evaluating))) log(' Input: {}'.format(inputs.shape)) for i in range(hp.tacotron_num_gpus + hp.tacotron_gpu_start_idx): log(' device: {}'.format(i)) log(' embedding: {}'.format( tower_embedded_inputs[i].shape)) log(' enc conv out: {}'.format( tower_enc_conv_output_shape[i])) log(' encoder out: {}'.format( tower_encoder_outputs[i].shape)) log(' decoder out: {}'.format( self.tower_decoder_output[i].shape)) log(' residual out: {}'.format( tower_residual[i].shape)) log(' projected residual out: {}'.format( tower_projected_residual[i].shape)) log(' mel out: {}'.format( self.tower_mel_outputs[i].shape)) if post_condition: log(' linear out: {}'.format( self.tower_linear_outputs[i].shape)) log(' <stop_token> out: {}'.format( self.tower_stop_token_prediction[i].shape)) # 1_000_000 is causing syntax problems for some people?! Python please :) log(' Tacotron Parameters {:.3f} Million.'.format( np.sum( [np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000))
def initialize(self, inputs, input_lengths, mel_targets=None,
               stop_token_targets=None):
    """Build the Tacotron graph and publish its tensors on ``self``.

    Training mode is inferred from ``mel_targets``: when targets are given
    the decoder is driven by ``TacoTrainingHelper``; otherwise
    ``TacoTestHelper`` is used and decoding is capped at ``hp.max_iters``.

    Args:
      - inputs: symbol ids, looked up in the ``inputs_embedding`` table.
      - input_lengths: passed to the encoder cell and used by the attention
        mechanism as ``memory_sequence_length``.
      - mel_targets: decoder targets; their presence switches on training.
      - stop_token_targets: forwarded to the training helper.

    Sets ``helper``, ``inputs``, ``input_lengths``, ``decoder_output``,
    ``alignments``, ``stop_token_prediction``, ``stop_token_targets``,
    ``mel_outputs`` and ``mel_targets``.
    """
    with tf.variable_scope('inference'):
        training = mel_targets is not None
        n_batch = tf.shape(inputs)[0]
        hparams = self._hparams

        # Embeddings ==> [batch_size, sequence_length, embedding_dim]
        symbol_table = tf.get_variable(
            'inputs_embedding',
            [len(symbols), hparams.embedding_dim],
            dtype=tf.float32)
        embedded = tf.nn.embedding_lookup(symbol_table, inputs)

        # Encoder: convolution stack followed by the recurrent layer.
        enc_convs = EncoderConvolutions(
            training, kernel_size=(5, ), channels=512,
            scope='encoder_convolutions')
        enc_rnn = EncoderRNN(
            training,
            size=hparams.encoder_lstm_units,
            zoneout=hparams.tacotron_zoneout_rate,
            scope='encoder_LSTM')
        encoder = TacotronEncoderCell(enc_convs, enc_rnn)
        memory = encoder(embedded, input_lengths)

        # Decoder building blocks.
        prenet = Prenet(training, layer_sizes=[256, 256],
                        scope='decoder_prenet')
        attention = LocationSensitiveAttention(
            hparams.attention_dim,
            memory,
            mask_encoder=hparams.mask_encoder,
            memory_sequence_length=input_lengths,
            smoothing=hparams.smoothing,
            cumulate_weights=hparams.cumulative_weights)
        lstm_stack = DecoderRNN(
            training,
            layers=hparams.decoder_layers,
            size=hparams.decoder_lstm_units,
            zoneout=hparams.tacotron_zoneout_rate,
            scope='decoder_lstm')
        # Each decoder step emits outputs_per_step frames of num_mels bins.
        to_frames = FrameProjection(
            hparams.num_mels * hparams.outputs_per_step,
            scope='linear_transform')
        to_stop = StopProjection(training, scope='stop_token_projection')

        cell = TacotronDecoderCell(
            prenet, attention, lstm_stack, to_frames, to_stop,
            mask_finished=hparams.mask_finished)

        # Feed ground truth while training, own predictions otherwise.
        if not training:
            self.helper = TacoTestHelper(
                n_batch, hparams.num_mels, hparams.outputs_per_step)
        else:
            self.helper = TacoTrainingHelper(
                n_batch, mel_targets, stop_token_targets,
                hparams.num_mels, hparams.outputs_per_step,
                hparams.tacotron_teacher_forcing_ratio)

        start_state = cell.zero_state(batch_size=n_batch, dtype=tf.float32)

        # Only synthesis needs an explicit bound on the decoding loop.
        step_limit = None if training else hparams.max_iters

        decoded, last_state, _ = dynamic_decode(
            CustomDecoder(cell, self.helper, start_state),
            impute_finished=hparams.impute_finished,
            maximum_iterations=step_limit)
        frames, stop_tokens, _ = decoded

        # Reshape outputs to be one output per entry
        decoder_output = tf.reshape(frames, [n_batch, -1, hparams.num_mels])
        stop_token_prediction = tf.reshape(stop_tokens, [n_batch, -1])

        # Postnet residual ==> [batch_size, decoder_steps * r, postnet_channels]
        postnet = Postnet(
            training,
            kernel_size=hparams.postnet_kernel_size,
            channels=hparams.postnet_channels,
            scope='postnet_convolutions')
        residual = postnet(decoder_output)

        # Project the residual back to num_mels and add it to the decoder output.
        to_mels = FrameProjection(hparams.num_mels, scope='postnet_projection')
        mel_outputs = decoder_output + to_mels(residual)

        # Alignment history is stacked over decoder steps; move that axis last.
        alignments = tf.transpose(
            last_state.alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.decoder_output = decoder_output
        self.alignments = alignments
        self.stop_token_prediction = stop_token_prediction
        self.stop_token_targets = stop_token_targets
        self.mel_outputs = mel_outputs
        self.mel_targets = mel_targets
def initialize(self, inputs, input_speaker_id, input_lengths,
               mel_targets=None, stop_token_targets=None,
               linear_targets=None, targets_lengths=None, gta=False,
               global_step=None, is_training=False, is_evaluating=False):
    """
    Builds the Tacotron graph (speaker-id variant) and stores the resulting
    tensors ("mel_outputs", "alignments", ...) on the instance.

    Args:
        - inputs: fed directly to the encoder cell (the symbol-embedding
          lookup is disabled in this variant).
        - input_speaker_id: ids looked up in the speaker embedding table.
        - input_lengths: lengths handed to the encoder and to the attention
          mechanism as memory_sequence_length.
        - mel_targets: mel spectrogram targets, given to the training helper.
        - stop_token_targets: stop token targets (stored on the instance).
        - linear_targets: linear spectrogram targets (stored when the linear
          post-processing branch is enabled).
        - targets_lengths: target lengths (stored on the instance).
        - gta: ground-truth-aligned mode flag.
        - global_step: required when teacher forcing is 'scheduled' in training.
        - is_training / is_evaluating: mutually exclusive mode flags.

    Raises:
        ValueError: on inconsistent target combinations.
        RuntimeError: on missing target lengths for masking, or when both
          training and evaluation modes are requested.
    """
    # --- Argument validation (order of checks is significant) ---
    if mel_targets is None:
        if stop_token_targets is not None:
            raise ValueError(
                'no mel targets were provided but token_targets were given')
    elif stop_token_targets is None and not gta:
        raise ValueError(
            'Mel targets are provided without corresponding token_targets')

    # The explicit "== True" comparison is kept on purpose.
    linear_missing = (not gta
                      and self._hparams.predict_linear == True  # noqa: E712
                      and linear_targets is None
                      and is_training)
    if linear_missing:
        raise ValueError(
            'Model is set to use post processing to predict linear '
            'spectrograms in training but no linear targets given!')
    if gta and linear_targets is not None:
        raise ValueError(
            'Linear spectrogram prediction is not supported in GTA mode!')
    if is_training and self._hparams.mask_decoder and targets_lengths is None:
        raise RuntimeError(
            'Model set to mask paddings but no targets lengths provided '
            'for the mask!')
    if is_training and is_evaluating:
        raise RuntimeError(
            'Model can not be in training and evaluation modes at the '
            'same time!')

    with tf.variable_scope('inference'):
        n_batch = tf.shape(inputs)[0]
        hp = self._hparams
        assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
        if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
            assert global_step is not None

        # GTA only predicts mels (for vocoder training), so the linear
        # post-processing branch is skipped in that mode.
        post_condition = hp.predict_linear and not gta

        # Speaker Embeddings ==> [batch_size, embedding_dim]
        self.speaker_id_embedding_table = tf.get_variable(
            'input_speaker_id_embedding',
            [hp.speaker_num, hp.speaker_dim],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        # The lookup result is not consumed below (the concat experiment is
        # disabled); the lookup is still built so the graph is unchanged.
        embedded_speaker_id = tf.nn.embedding_lookup(
            self.speaker_id_embedding_table, input_speaker_id)

        # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
        conv_stack = EncoderConvolutions(
            is_training, hparams=hp, scope='encoder_convolutions')
        rnn_layer = EncoderRNN(
            is_training,
            size=hp.encoder_lstm_units,
            zoneout=hp.tacotron_zoneout_rate,
            scope='encoder_LSTM')
        encoder_cell = TacotronEncoderCell(conv_stack, rnn_layer)

        print('inputs:', inputs)
        encoder_outputs = encoder_cell(inputs, input_lengths)

        # Disabled experiments: (1) tiling the speaker embedding along the
        # input axis and concatenating it to the encoder outputs, and
        # (2) a gradient-reversal speaker classifier on the encoder outputs.
        # With both off, the attention memory is the plain encoder output.
        id_encoder_outputs = encoder_outputs

        # For shape visualization purpose
        enc_conv_output_shape = encoder_cell.conv_output_shape

        # Decoder parts
        prenet = Prenet(
            is_training,
            layers_sizes=hp.prenet_layers,
            drop_rate=hp.tacotron_dropout_rate,
            scope='decoder_prenet')
        attention_mechanism = LocationSensitiveAttention(
            hp.attention_dim,
            id_encoder_outputs,
            hparams=hp,
            is_training=is_training,
            mask_encoder=hp.mask_encoder,
            memory_sequence_length=input_lengths,
            smoothing=hp.smoothing,
            cumulate_weights=hp.cumulative_weights)
        decoder_lstm = DecoderRNN(
            is_training,
            layers=hp.decoder_layers,
            size=hp.decoder_lstm_units,
            zoneout=hp.tacotron_zoneout_rate,
            scope='decoder_lstm')
        frame_projection = FrameProjection(
            hp.num_mels * hp.outputs_per_step, scope='linear_transform')

        supervised = is_training or is_evaluating
        stop_projection = StopProjection(
            supervised,
            shape=hp.outputs_per_step,
            scope='stop_token_projection')

        # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
        decoder_cell = TacotronDecoderCell(
            prenet, attention_mechanism, decoder_lstm,
            frame_projection, stop_projection)

        # Helper: target-driven for train/eval/GTA, free-running otherwise.
        if is_training or is_evaluating or gta:
            self.helper = TacoTrainingHelper(
                n_batch, mel_targets, hp, gta, is_evaluating, global_step)
        else:
            self.helper = TacoTestHelper(n_batch, hp)

        decoder_init_state = decoder_cell.zero_state(
            batch_size=n_batch, dtype=tf.float32)

        # Only use max iterations at synthesis time
        synthesis_mode = not (is_training or is_evaluating)
        max_iters = hp.max_iters if synthesis_mode else None

        decoded, final_decoder_state, _ = dynamic_decode(
            CustomDecoder(decoder_cell, self.helper, decoder_init_state),
            impute_finished=False,
            maximum_iterations=max_iters,
            swap_memory=hp.tacotron_swap_with_cpu)
        frames_prediction, stop_token_prediction, _ = decoded

        # Reshape outputs to be one output per entry
        # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
        decoder_output = tf.reshape(
            frames_prediction, [n_batch, -1, hp.num_mels])
        stop_token_prediction = tf.reshape(
            stop_token_prediction, [n_batch, -1])

        # Postnet residual ==> [batch_size, decoder_steps * r, postnet_channels]
        postnet = Postnet(is_training, hparams=hp,
                          scope='postnet_convolutions')
        residual = postnet(decoder_output)

        # Project residual to num_mels ==> [batch_size, decoder_steps * r, num_mels]
        residual_projection = FrameProjection(
            hp.num_mels, scope='postnet_projection')
        projected_residual = residual_projection(residual)

        # Compute the mel spectrogram
        mel_outputs = decoder_output + projected_residual

        if post_condition:
            # Post-processing CBHG extracts features from the mels before
            # they are projected to linear spectrograms.
            post_cbhg = CBHG(
                hp.cbhg_kernels, hp.cbhg_conv_channels, hp.cbhg_pool_size,
                [hp.cbhg_projection, hp.num_mels],
                hp.cbhg_projection_kernel_size, hp.cbhg_highwaynet_layers,
                hp.cbhg_highway_units, hp.cbhg_rnn_units,
                hp.batch_norm_position, is_training, name='CBHG_postnet')
            # [batch_size, decoder_steps(mel_frames), cbhg_channels]
            post_outputs = post_cbhg(mel_outputs, None)
            linear_specs_projection = FrameProjection(
                hp.num_freq, scope='cbhg_linear_specs_projection')
            # [batch_size, decoder_steps(linear_frames), num_freq]
            linear_outputs = linear_specs_projection(post_outputs)

        # Grab alignments from the final decoder state
        alignments = tf.transpose(
            final_decoder_state.alignment_history.stack(), [1, 2, 0])

        if is_training:
            self.ratio = self.helper._ratio
        self.inputs = inputs
        self.input_speaker_id = input_speaker_id
        self.input_lengths = input_lengths
        self.decoder_output = decoder_output
        self.alignments = alignments
        self.stop_token_prediction = stop_token_prediction
        self.stop_token_targets = stop_token_targets
        self.mel_outputs = mel_outputs
        if post_condition:
            self.linear_outputs = linear_outputs
            self.linear_targets = linear_targets
        self.mel_targets = mel_targets
        self.targets_lengths = targets_lengths

        # Summary of modes and tensor shapes, emitted in a fixed order.
        summary = [
            (' Train mode: {}', is_training),
            (' Eval mode: {}', is_evaluating),
            (' GTA mode: {}', gta),
            (' Synthesis mode: {}', synthesis_mode),
            (' embedding: {}', inputs.shape),
            (' enc conv out: {}', enc_conv_output_shape),
            (' encoder out: {}', encoder_outputs.shape),
            (' id encoder out: {}', id_encoder_outputs.shape),
            (' decoder out: {}', decoder_output.shape),
            (' residual out: {}', residual.shape),
            (' projected residual out: {}', projected_residual.shape),
            (' mel out: {}', mel_outputs.shape),
        ]
        if post_condition:
            summary.append((' linear out: {}', linear_outputs.shape))
        summary.append((' <stop_token> out: {}', stop_token_prediction.shape))

        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        for template, value in summary:
            log(template.format(value))
def initialize(self, inputs, input_lengths, mel_targets=None,
               stop_token_targets=None, split_infos=None,
               linear_targets=None, targets_lengths=None, gta=False,
               global_step=None, is_training=False, is_evaluating=False):
    """
    Initializes the multi-GPU model for inference
    sets the per-tower "tower_mel_outputs" and "tower_alignments" fields.

    The batch tensors are split into one slice per GPU with ``split_func``
    (driven by the columns of ``split_infos``), and one model replica
    ("tower") is built per device inside the shared 'inference' scope.

    Args:
        - inputs: int32 Tensor with shape [N, T_in] where N is batch size,
          T_in is number of steps in the input time series, and values are
          character IDs
        - input_lengths: int32 Tensor with shape [N] where N is batch size
          and values are the lengths of each sequence in inputs.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is
          batch size, T_out is number of steps in the output time series,
          M is num_mels, and values are entries in the mel spectrogram.
          Only needed for training.
        - stop_token_targets: stop token targets, split with column 2 of
          split_infos. Optional.
        - split_infos: split descriptors; columns 0..3 are used for inputs,
          mel, stop-token and linear targets respectively.
        - linear_targets: linear spectrogram targets. Optional.
        - targets_lengths: target lengths, split evenly across GPUs.
        - gta: ground-truth-aligned mode flag.
        - global_step: required for 'scheduled' teacher forcing in training.
        - is_training / is_evaluating: mutually exclusive mode flags.

    Raises:
        ValueError: on inconsistent target combinations.
        RuntimeError: on missing target lengths for masking, or when both
          training and evaluation modes are requested.
    """
    if mel_targets is None and stop_token_targets is not None:
        raise ValueError(
            'no multi targets were provided but token_targets were given')
    if mel_targets is not None and stop_token_targets is None and not gta:
        raise ValueError(
            'Mel targets are provided without corresponding token_targets')
    if not gta and self._hparams.predict_linear == True and linear_targets is None and is_training:
        raise ValueError(
            'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
        )
    if gta and linear_targets is not None:
        raise ValueError(
            'Linear spectrogram prediction is not supported in GTA mode!')
    if is_training and self._hparams.mask_decoder and targets_lengths is None:
        raise RuntimeError(
            'Model set to mask paddings but no targets lengths provided for the mask!'
        )
    if is_training and is_evaluating:
        raise RuntimeError(
            'Model can not be in training and evaluation modes at the same time!'
        )

    # Splitting of the batch is pinned to the CPU.
    with tf.device('/cpu:0'):
        hp = self._hparams
        lout_int = [tf.int32] * hp.num_gpus
        lout_float = [tf.float32] * hp.num_gpus

        tower_input_lengths = tf.split(
            input_lengths, num_or_size_splits=hp.num_gpus, axis=0)
        tower_targets_lengths = tf.split(
            targets_lengths, num_or_size_splits=hp.num_gpus,
            axis=0) if targets_lengths is not None else targets_lengths

        p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]],
                              lout_int)
        # Every target tensor is optional; mel targets are absent at plain
        # synthesis time, so they get the same None guard as the others.
        p_mel_targets = tf.py_func(
            split_func, [mel_targets, split_infos[:, 1]],
            lout_float) if mel_targets is not None else mel_targets
        p_stop_token_targets = tf.py_func(
            split_func, [stop_token_targets, split_infos[:, 2]], lout_float
        ) if stop_token_targets is not None else stop_token_targets
        p_linear_targets = tf.py_func(
            split_func, [linear_targets, split_infos[:, 3]],
            lout_float) if linear_targets is not None else linear_targets

        tower_inputs = []
        tower_mel_targets = []
        tower_stop_token_targets = []
        tower_linear_targets = []

        batch_size = tf.shape(inputs)[0]
        mel_channels = hp.num_mels
        linear_channels = hp.num_freq
        # py_func outputs carry no static shape; reshape restores the rank.
        for i in range(hp.num_gpus):
            tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
            if p_mel_targets is not None:
                tower_mel_targets.append(
                    tf.reshape(p_mel_targets[i],
                               [batch_size, -1, mel_channels]))
            if p_stop_token_targets is not None:
                tower_stop_token_targets.append(
                    tf.reshape(p_stop_token_targets[i], [batch_size, -1]))
            if p_linear_targets is not None:
                tower_linear_targets.append(
                    tf.reshape(p_linear_targets[i],
                               [batch_size, -1, linear_channels]))

    self.tower_decoder_output = []
    self.tower_alignments = []
    self.tower_stop_token_prediction = []
    self.tower_mel_outputs = []
    self.tower_linear_outputs = []
    self.tower_linear_targets = []

    tower_embedded_inputs = []
    tower_enc_conv_output_shape = []
    tower_encoder_outputs = []
    tower_residual = []
    tower_projected_residual = []

    # 1. Declare GPU Devices
    gpus = [
        "/gpu:{}".format(i)
        for i in range(hp.gpu_start_idx, hp.gpu_start_idx + hp.num_gpus)
    ]
    for i in range(hp.num_gpus):
        with tf.device(
                tf.train.replica_device_setter(ps_tasks=1,
                                               ps_device="/cpu:0",
                                               worker_device=gpus[i])):
            with tf.variable_scope('inference'):
                assert hp.tacotron_teacher_forcing_mode in ('constant',
                                                            'scheduled')
                if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                    assert global_step is not None

                # GTA is only used for predicting mels to train Wavenet
                # vocoder, so we omit post processing when doing GTA synthesis
                post_condition = hp.predict_linear and not gta

                # Embeddings ==> [batch_size, sequence_length, embedding_dim]
                embedding_table = tf.get_variable(
                    'inputs_embedding', [len(symbols), hp.embedding_dim],
                    dtype=tf.float32)
                embedded_inputs = tf.nn.embedding_lookup(
                    embedding_table, tower_inputs[i])

                # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                encoder_cell = TacotronEncoderCell(
                    EncoderConvolutions(is_training,
                                        hparams=hp,
                                        scope='encoder_convolutions'),
                    EncoderRNN(is_training,
                               size=hp.encoder_lstm_units,
                               zoneout=hp.tacotron_zoneout_rate,
                               scope='encoder_LSTM'))
                encoder_outputs = encoder_cell(embedded_inputs,
                                               tower_input_lengths[i])

                # For shape visualization purpose
                enc_conv_output_shape = encoder_cell.conv_output_shape

                # Decoder Parts
                # Attention Decoder Prenet
                prenet = Prenet(is_training,
                                layers_sizes=hp.prenet_layers,
                                drop_rate=hp.tacotron_dropout_rate,
                                scope='decoder_prenet')
                # Attention Mechanism
                attention_mechanism = LocationSensitiveAttention(
                    hp.attention_dim,
                    encoder_outputs,
                    hparams=hp,
                    mask_encoder=hp.mask_encoder,
                    memory_sequence_length=tower_input_lengths[i],
                    smoothing=hp.smoothing,
                    cumulate_weights=hp.cumulative_weights)
                # Decoder LSTM Cells
                decoder_lstm = DecoderRNN(is_training,
                                          layers=hp.decoder_layers,
                                          size=hp.decoder_lstm_units,
                                          zoneout=hp.tacotron_zoneout_rate,
                                          scope='decoder_lstm')
                # Frames Projection layer
                frame_projection = FrameProjection(
                    hp.num_mels * hp.outputs_per_step,
                    scope='linear_transform')
                # <stop_token> projection layer
                stop_projection = StopProjection(
                    is_training or is_evaluating,
                    shape=hp.outputs_per_step,
                    scope='stop_token_projection')

                # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                decoder_cell = TacotronDecoderCell(prenet,
                                                   attention_mechanism,
                                                   decoder_lstm,
                                                   frame_projection,
                                                   stop_projection)

                # Define the helper for our decoder
                if is_training or is_evaluating or gta:
                    # Stop-token targets may legitimately be absent in GTA
                    # mode; pass None instead of indexing an empty list.
                    tower_stop_tokens = (tower_stop_token_targets[i]
                                         if tower_stop_token_targets else None)
                    self.helper = TacoTrainingHelper(
                        batch_size, tower_mel_targets[i], tower_stop_tokens,
                        hp, gta, is_evaluating, global_step)
                else:
                    self.helper = TacoTestHelper(batch_size, hp)

                # initial decoder state
                decoder_init_state = decoder_cell.zero_state(
                    batch_size=batch_size, dtype=tf.float32)

                # Only use max iterations at synthesis time
                max_iters = hp.max_iters if not (
                    is_training or is_evaluating) else None

                # Decode
                (frames_prediction, stop_token_prediction,
                 _), final_decoder_state, _ = dynamic_decode(
                     CustomDecoder(decoder_cell, self.helper,
                                   decoder_init_state),
                     impute_finished=False,
                     maximum_iterations=max_iters,
                     swap_memory=hp.tacotron_swap_with_cpu)

                # Reshape outputs to be one output per entry
                # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                decoder_output = tf.reshape(frames_prediction,
                                            [batch_size, -1, hp.num_mels])
                stop_token_prediction = tf.reshape(stop_token_prediction,
                                                   [batch_size, -1])

                # Postnet
                postnet = Postnet(is_training,
                                  hparams=hp,
                                  scope='postnet_convolutions')

                # Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
                residual = postnet(decoder_output)

                # Project residual to same dimension as mel spectrogram
                # ==> [batch_size, decoder_steps * r, num_mels]
                residual_projection = FrameProjection(
                    hp.num_mels, scope='postnet_projection')
                projected_residual = residual_projection(residual)

                # Compute the mel spectrogram
                mel_outputs = decoder_output + projected_residual

                if post_condition:
                    # Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
                    # Post-processing Network to map mels to linear
                    # spectrograms using same architecture as the encoder
                    post_processing_cell = TacotronEncoderCell(
                        EncoderConvolutions(
                            is_training,
                            hparams=hp,
                            scope='post_processing_convolutions'),
                        EncoderRNN(is_training,
                                   size=hp.encoder_lstm_units,
                                   zoneout=hp.tacotron_zoneout_rate,
                                   scope='post_processing_LSTM'))
                    # NOTE(review): called without lengths, unlike the text
                    # encoder above -- confirm the cell accepts one argument.
                    expand_outputs = post_processing_cell(mel_outputs)
                    linear_outputs = FrameProjection(
                        hp.num_freq,
                        scope='post_processing_projection')(expand_outputs)

                # Grab alignments from the final decoder state
                alignments = tf.transpose(
                    final_decoder_state.alignment_history.stack(),
                    [1, 2, 0])

                self.tower_decoder_output.append(decoder_output)
                self.tower_alignments.append(alignments)
                self.tower_stop_token_prediction.append(
                    stop_token_prediction)
                self.tower_mel_outputs.append(mel_outputs)
                tower_embedded_inputs.append(embedded_inputs)
                tower_enc_conv_output_shape.append(enc_conv_output_shape)
                tower_encoder_outputs.append(encoder_outputs)
                tower_residual.append(residual)
                tower_projected_residual.append(projected_residual)
                if post_condition:
                    self.tower_linear_outputs.append(linear_outputs)
                    # Pair each tower's outputs with that tower's own slice
                    # of the targets (None when no targets were supplied),
                    # not with the full unsplit batch.
                    self.tower_linear_targets.append(
                        tower_linear_targets[i]
                        if tower_linear_targets else None)
        log('initialiized done {}'.format(gpus[i]))

    if is_training:
        self.ratio = self.helper._ratio
    self.tower_inputs = tower_inputs
    self.tower_input_lengths = tower_input_lengths
    self.tower_mel_targets = tower_mel_targets
    self.tower_targets_lengths = tower_targets_lengths
    self.tower_stop_token_targets = tower_stop_token_targets

    log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
    log(' Train mode: {}'.format(is_training))
    log(' Eval mode: {}'.format(is_evaluating))
    log(' GTA mode: {}'.format(gta))
    log(' Synthesis mode: {}'.format(not (
        is_training or is_evaluating)))
    log(' Input: {}'.format(inputs.shape))
    # The per-tower lists hold exactly num_gpus entries, so iterate over the
    # towers and offset only the reported device index by gpu_start_idx.
    for i in range(hp.num_gpus):
        log(' device: {}'.format(hp.gpu_start_idx + i))
        log(' embedding: {}'.format(
            tower_embedded_inputs[i].shape))
        log(' enc conv out: {}'.format(
            tower_enc_conv_output_shape[i]))
        log(' encoder out: {}'.format(
            tower_encoder_outputs[i].shape))
        log(' decoder out: {}'.format(
            self.tower_decoder_output[i].shape))
        log(' residual out: {}'.format(
            tower_residual[i].shape))
        log(' projected residual out: {}'.format(
            tower_projected_residual[i].shape))
        log(' mel out: {}'.format(
            self.tower_mel_outputs[i].shape))
        if post_condition:
            log(' linear out: {}'.format(
                self.tower_linear_outputs[i].shape))
        log(' <stop_token> out: {}'.format(
            self.tower_stop_token_prediction[i].shape))
def initialize(self, inputs, input_lengths, mel_targets=None,
               linear_targets=None, pml_targets=None, is_training=False,
               gta=False):
    """
    Initializes the model for inference.

    Sets "pml_outputs" and "alignments" fields (plus "inputs",
    "input_lengths" and "pml_targets").

    Args:
        inputs: int32 Tensor with shape [N, T_in] where N is batch size,
            T_in is number of steps in the input time series, and values
            are character IDs
        input_lengths: int32 Tensor with shape [N] where N is batch size and
            values are the lengths of each sequence in inputs.
        mel_targets: float32 Tensor with shape [N, T_out, M] where N is
            batch size, T_out is number of steps in the output time series,
            M is num_mels, and values are entries in the mel spectrogram.
            Not read by this method.
        linear_targets: float32 Tensor with shape [N, T_out, F] where N is
            batch_size, T_out is number of steps in the output time series,
            F is num_freq, and values are entries in the linear
            spectrogram. Not read by this method.
        pml_targets: float32 Tensor with shape [N, T_out, P] where N is
            batch_size, T_out is number of steps in the PML vocoder features
            trajectories, P is pml_dimension, and values are PML vocoder
            features. Only needed for training.
        is_training: boolean flag that is set to True during training
        gta: boolean flag that is set to True when ground truth alignment
            is required
    """
    with tf.variable_scope('inference'):
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), hp.embedding_dim],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(
            embedding_table, inputs)  # [N, T_in, embed_depth=256]

        # Encoder
        encoder_outputs = conv_and_lstm(  # [N, T_in, 2*encoder_gru_units=512]
            embedded_inputs,
            input_lengths,
            conv_layers=hp.encoder_conv_layers,
            conv_width=hp.encoder_conv_width,
            conv_channels=hp.encoder_conv_channels,
            lstm_units_unidirectional=hp.encoder_gru_units,
            is_training=is_training,
            scope='encoder',
        )

        # Attention
        attention_cell = AttentionWrapper(  # [N, T_in, attention_depth=256]
            DecoderPrenetWrapper(LSTMBlockCell(hp.attention_depth),
                                 is_training, hp.prenet_depths),
            LocationSensitiveAttention(hp.attention_depth, encoder_outputs),
            alignment_history=True,
            output_attention=False)

        # Concatenate attention context vector and RNN cell output into a
        # 2*attention_depth=512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(
            attention_cell)  # [N, T_in, 2*attention_depth=512]

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell(
            [
                concat_cell,
                LSTMBlockCell(hp.decoder_gru_units),
                LSTMBlockCell(hp.decoder_gru_units)
            ],
            state_is_tuple=True)  # [N, T_in, decoder_depth=1024]

        # Project onto r PML feature vectors (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(
            decoder_cell, hp.pml_dimension * hp.outputs_per_step)

        # Target-driven decoding for training and GTA, free-running otherwise.
        if is_training or gta:
            helper = TacoTrainingHelper(inputs, pml_targets,
                                        hp.pml_dimension,
                                        hp.outputs_per_step)
        else:
            helper = TacoTestHelper(batch_size, hp.pml_dimension,
                                    hp.outputs_per_step)

        decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                    dtype=tf.float32)

        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

        # Reshape outputs to be one output per entry
        pml_outputs = tf.reshape(
            decoder_outputs,
            [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

        # Grab alignments from the final decoder state; element 0 of the
        # MultiRNNCell state belongs to the attention-wrapped bottom cell.
        alignments = tf.transpose(
            final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.pml_outputs = pml_outputs
        self.alignments = alignments
        self.pml_targets = pml_targets

        log('Initialized Tacotron model. Dimensions: ')
        log(' Train mode: {}'.format(is_training))
        # Report the gta flag itself; this line used to print is_training.
        log(' GTA mode: {}'.format(gta))
        log(' Embedding: {}'.format(
            embedded_inputs.shape[-1]))
        log(' Encoder out: {}'.format(
            encoder_outputs.shape[-1]))
        log(' Attention out: {}'.format(
            attention_cell.output_size))
        log(' Concat attn & out: {}'.format(
            concat_cell.output_size))
        log(' Decoder cell out: {}'.format(
            decoder_cell.output_size))
        log(' Decoder out ({} frames): {}'.format(
            hp.outputs_per_step, decoder_outputs.shape[-1]))
        log(' PML out: {}'.format(pml_outputs.shape[-1]))
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None,
               pml_targets=None, gta=False, locked_alignments=None, logs_enabled=True):
    '''Builds the inference graph and stores its tensors on the model.

    Sets the "inputs", "input_lengths", "pml_outputs", "alignments" and
    "pml_targets" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is
        number of steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and
        values are the lengths of each sequence in inputs.
      mel_targets: accepted for interface compatibility; not read by this
        method.
      linear_targets: accepted for interface compatibility; not read by this
        method.
      pml_targets: float32 Tensor with shape [N, T_out, P] where N is
        batch_size, T_out is number of steps in the PML vocoder features
        trajectories, P is pml_dimension, and values are PML vocoder features.
        Passing a value switches the graph to training mode.
      gta: boolean flag; when True the training helper is used even if the
        graph is not in training mode.
      locked_alignments: accepted for interface compatibility; not read by
        this method.
      logs_enabled: boolean flag that defaults to True; if False no
        construction logs are output.
    '''
    with tf.variable_scope('inference'):
        # Training mode is inferred from the presence of targets.
        is_training = pml_targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), hp.embed_depth],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(
            embedding_table, inputs)  # [N, T_in, embed_depth=256]

        # Encoder
        prenet_outputs = prenet(
            embedded_inputs, is_training,
            hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
        encoder_outputs = encoder_cbhg(
            prenet_outputs, input_lengths, is_training,
            hp.encoder_depth)  # [N, T_in, encoder_depth=256]

        # Attention
        attention_cell = AttentionWrapper(
            GRUCell(hp.attention_depth),
            LocationSensitiveAttention(hp.attention_depth, encoder_outputs),
            alignment_history=True,
            output_attention=False)  # [N, T_in, attention_depth=256]

        # Apply prenet before concatenation in AttentionWrapper.
        attention_cell = DecoderPrenetWrapper(attention_cell, is_training,
                                              hp.prenet_depths)

        # Concatenate attention context vector and RNN cell output into a
        # 2*attention_depth=512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(
            attention_cell)  # [N, T_in, 2*attention_depth=512]

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell(
            [
                OutputProjectionWrapper(concat_cell, hp.decoder_depth),
                ResidualWrapper(GRUCell(hp.decoder_depth)),
                ResidualWrapper(GRUCell(hp.decoder_depth)),
            ],
            state_is_tuple=True)  # [N, T_in, decoder_depth=256]

        # Project onto r PML feature vectors (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(
            decoder_cell, hp.pml_dimension * hp.outputs_per_step)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                    dtype=tf.float32)

        if is_training or gta:
            helper = TacoTrainingHelper(inputs, pml_targets, hp.pml_dimension,
                                        hp.outputs_per_step)
        else:
            helper = TacoTestHelper(batch_size, hp.pml_dimension,
                                    hp.outputs_per_step)

        (multi_decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

        # Reshape outputs to be one output per entry
        decoder_outputs = tf.reshape(
            multi_decoder_outputs,
            [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

        # Postnet: predicts a residual
        postnet_outputs = postnet(decoder_outputs,
                                  layers=hp.postnet_conv_layers,
                                  conv_width=hp.postnet_conv_width,
                                  channels=hp.postnet_conv_channels,
                                  is_training=is_training)
        pml_outputs = decoder_outputs + postnet_outputs

        # Grab alignments from the final decoder state. The attention wrapper is
        # the first cell of the MultiRNNCell, hence state index 0.
        alignments = tf.transpose(
            final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.pml_outputs = pml_outputs
        self.alignments = alignments
        self.pml_targets = pml_targets

        # Honour the documented switch: stay silent when logs are disabled.
        if logs_enabled:
            log('Initialized Tacotron model. Dimensions: ')
            log(' embedding: %d' % embedded_inputs.shape[-1])
            log(' prenet out: %d' % prenet_outputs.shape[-1])
            log(' encoder out: %d' % encoder_outputs.shape[-1])
            log(' attention out: %d' % attention_cell.output_size)
            log(' concat attn & out: %d' % concat_cell.output_size)
            log(' decoder cell out: %d' % decoder_cell.output_size)
            log(' decoder out (%d frames): %d' %
                (hp.outputs_per_step, multi_decoder_outputs.shape[-1]))
            log(' decoder out (1 frame): %d' % pml_outputs.shape[-1])
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None,
               linear_targets=None, targets_lengths=None, gta=False, global_step=None,
               is_training=False, is_evaluating=False, split_infos=None,
               mel_references=None, speaker_id_target=None, nb_speaker=None,
               nb_sample=None, synthesize=False):
    """Builds one inference tower per GPU and stores the per-tower tensors.

    Sets the "tower_*" fields (decoder output, alignments, stop token
    prediction, mel outputs, linear outputs, inputs and targets).

    Args:
        - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in
          is number of steps in the input time series, and values are character
          IDs. With hparams.tacotron_phoneme_transcription=True the tensor is
          instead [N, T_in, T_cat], where T_cat is the number of categories
          used to describe phones/special characters.
        - input_lengths: int32 Tensor with shape [N] where N is batch size and
          values are the lengths of each sequence in inputs.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch
          size, T_out is number of steps in the output time series, M is
          num_mels, and values are entries in the mel spectrogram. Only needed
          for training.
        - stop_token_targets: stop token targets; required with mel_targets
          unless gta is set.
        - linear_targets: linear spectrogram targets; required in training when
          hparams.predict_linear is set, and rejected in GTA mode.
        - targets_lengths: target lengths; required in training when
          hparams.mask_decoder is set.
        - gta: when True the training helper is used and linear post-processing
          is skipped.
        - global_step: passed to the training helper; must not be None for
          scheduled teacher forcing during training.
        - is_training / is_evaluating: mode flags, mutually exclusive.
        - split_infos: columns 0..3 are handed to split_func to split inputs,
          mel, stop token and linear targets across GPUs.
        - mel_references: input of the speaker encoder (multi-speaker only).
        - speaker_id_target: speaker labels, split across GPUs and stored
          (multi-speaker only).
        - nb_speaker: output size of the speaker classifier.
        - nb_sample: not read by this method.
        - synthesize: when True the speaker classifier is not built.

    Raises:
        ValueError: on inconsistent combinations of targets and gta.
        RuntimeError: when masking is requested without target lengths, or when
          both training and evaluation modes are requested.
    """
    hp = self._hparams

    if mel_targets is None and stop_token_targets is not None:
        raise ValueError(
            'no multi targets were provided but token_targets were given')
    if mel_targets is not None and stop_token_targets is None and not gta:
        raise ValueError(
            'Mel targets are provided without corresponding token_targets')
    if not gta and hp.predict_linear == True and linear_targets is None and is_training:
        raise ValueError(
            'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
        )
    if gta and linear_targets is not None:
        raise ValueError(
            'Linear spectrogram prediction is not supported in GTA mode!')
    if is_training and hp.mask_decoder and targets_lengths is None:
        raise RuntimeError(
            'Model set to mask paddings but no targets lengths provided for the mask!'
        )
    if is_training and is_evaluating:
        raise RuntimeError(
            'Model can not be in training and evaluation modes at the same time!'
        )

    # Split the batch on CPU when several GPUs are used (or when forced).
    if hp.tacotron_num_gpus > 1 or hp.split_on_cpu:
        split_device = '/cpu:0'
    else:
        split_device = '/gpu:{}'.format(hp.tacotron_gpu_start_idx)

    with tf.device(split_device):
        lout_int = [tf.int32] * hp.tacotron_num_gpus
        lout_float = [tf.float32] * hp.tacotron_num_gpus

        tower_input_lengths = tf.split(
            input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0)
        tower_targets_lengths = tf.split(
            targets_lengths, num_or_size_splits=hp.tacotron_num_gpus,
            axis=0) if targets_lengths is not None else targets_lengths

        p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]], lout_int)
        p_mel_targets = tf.py_func(
            split_func, [mel_targets, split_infos[:, 1]],
            lout_float) if mel_targets is not None else mel_targets
        p_stop_token_targets = tf.py_func(
            split_func, [stop_token_targets, split_infos[:, 2]],
            lout_float) if stop_token_targets is not None else stop_token_targets
        p_linear_targets = tf.py_func(
            split_func, [linear_targets, split_infos[:, 3]],
            lout_float) if linear_targets is not None else linear_targets

        if hp.tacotron_multi_speaker:
            tower_speaker_id_target = tf.split(
                speaker_id_target,
                num_or_size_splits=hp.tacotron_num_gpus,
                axis=0) if speaker_id_target is not None else speaker_id_target

        tower_inputs = []
        tower_mel_targets = []
        tower_stop_token_targets = []
        tower_linear_targets = []

        batch_size = tf.shape(inputs)[0]
        mel_channels = hp.num_mels
        linear_channels = hp.num_freq
        for i in range(hp.tacotron_num_gpus):
            if not hp.tacotron_phoneme_transcription:
                tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
            else:
                # NOTE(review): nb_categories is not defined in this method;
                # presumably a module-level name - confirm.
                tower_inputs.append(
                    tf.reshape(p_inputs[i], [batch_size, -1, nb_categories]))
            if p_mel_targets is not None:
                tower_mel_targets.append(
                    tf.reshape(p_mel_targets[i],
                               [batch_size, -1, mel_channels]))
            if p_stop_token_targets is not None:
                tower_stop_token_targets.append(
                    tf.reshape(p_stop_token_targets[i], [batch_size, -1]))
            if p_linear_targets is not None:
                tower_linear_targets.append(
                    tf.reshape(p_linear_targets[i],
                               [batch_size, -1, linear_channels]))

    self.tower_decoder_output = []
    self.tower_alignments = []
    self.tower_stop_token_prediction = []
    self.tower_mel_outputs = []
    self.tower_linear_outputs = []

    tower_embedded_inputs = []
    tower_enc_conv_output_shape = []
    tower_encoder_outputs = []
    tower_residual = []
    tower_projected_residual = []
    if hp.tacotron_multi_speaker:
        tower_speaker_id_pred = []

    # 1. Declare GPU Devices
    gpus = [
        "/gpu:{}".format(i)
        for i in range(hp.tacotron_gpu_start_idx,
                       hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)
    ]
    for i in range(hp.tacotron_num_gpus):
        with tf.device(
                tf.train.replica_device_setter(ps_tasks=1,
                                               ps_device="/cpu:0",
                                               worker_device=gpus[i])):
            with tf.variable_scope('inference'):
                assert hp.tacotron_teacher_forcing_mode in ('constant',
                                                            'scheduled')
                if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                    assert global_step is not None

                # GTA is only used for predicting mels to train Wavenet vocoder,
                # so we omit post processing when doing GTA synthesis
                post_condition = hp.predict_linear and not gta

                if not hp.tacotron_phoneme_transcription:
                    # Embeddings ==> [batch_size, sequence_length, embedding_dim]
                    self.embedding_table = tf.get_variable(
                        'inputs_embedding',
                        [len(symbols), hp.embedding_dim],
                        dtype=tf.float32)
                    embedded_inputs = tf.nn.embedding_lookup(
                        self.embedding_table, tower_inputs[i])
                else:
                    # Embeddings ==> [batch_size, sequence_length, embedding_dim]
                    embedded_inputs = tf.layers.dense(
                        inputs=tf.cast(tower_inputs[i], dtype=tf.float32),
                        units=hp.embedding_dim)

                # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                encoder_cell = TacotronEncoderCell(
                    EncoderConvolutions(is_training,
                                        hparams=hp,
                                        scope='encoder_convolutions'),
                    EncoderRNN(is_training,
                               size=hp.encoder_lstm_units,
                               zoneout=hp.tacotron_zoneout_rate,
                               scope='encoder_LSTM'))
                encoder_outputs = encoder_cell(embedded_inputs,
                                               tower_input_lengths[i])

                # For shape visualization purpose
                enc_conv_output_shape = encoder_cell.conv_output_shape
                enc_tensor_shape = tf.shape(encoder_outputs)

                # Speaker encoder
                if hp.tacotron_multi_speaker:
                    with tf.variable_scope('speaker_encoder'):
                        conv = tf.layers.conv1d(inputs=mel_references,
                                                filters=512,
                                                kernel_size=(3, ),
                                                activation=None,
                                                padding='same')
                        conv = tf.layers.conv1d(inputs=conv,
                                                filters=512,
                                                kernel_size=(3, ),
                                                activation=None,
                                                padding='same')
                        with tf.variable_scope('biLSTM1'):
                            fw_LSTM = tf.nn.rnn_cell.BasicLSTMCell(
                                num_units=256)
                            bw_LSTM = tf.nn.rnn_cell.BasicLSTMCell(
                                num_units=256)
                            outputs_biLSTM, states_biLSTM = tf.nn.bidirectional_dynamic_rnn(
                                cell_fw=fw_LSTM,
                                cell_bw=bw_LSTM,
                                inputs=conv,
                                dtype=tf.float32)
                            outputs_biLSTM = tf.concat(outputs_biLSTM, axis=2)
                        with tf.variable_scope('biLSTM2'):
                            fw_LSTM = tf.nn.rnn_cell.BasicLSTMCell(
                                num_units=256)
                            bw_LSTM = tf.nn.rnn_cell.BasicLSTMCell(
                                num_units=256)
                            outputs_biLSTM, states_biLSTM = tf.nn.bidirectional_dynamic_rnn(
                                cell_fw=fw_LSTM,
                                cell_bw=bw_LSTM,
                                inputs=outputs_biLSTM,
                                dtype=tf.float32)
                            outputs_biLSTM = tf.concat(outputs_biLSTM, axis=2)
                        mean_over_time = tf.reduce_mean(
                            input_tensor=outputs_biLSTM, axis=1)
                        # shape (batch_size, speaker_emb_dim)
                        speaker_embedding = tf.layers.dense(
                            inputs=mean_over_time,
                            units=hp.tacotron_speaker_embedding_dim,
                            name="speaker_embedding")

                    # Repeat the speaker embedding along the text axis so it
                    # can be concatenated with every encoder step.
                    repeat = tf.tile(
                        speaker_embedding, [1, enc_tensor_shape[1]]
                    )  # shape (batch_size, speaker_emb_dim*text_len)
                    repeat = tf.reshape(
                        repeat,
                        (enc_tensor_shape[0], enc_tensor_shape[1],
                         hp.tacotron_speaker_embedding_dim)
                    )  # shape (batch_size, text_len, speaker_emb_dim)
                    self.embedding_speaker = speaker_embedding
                    attention_inputs = tf.concat([encoder_outputs, repeat],
                                                 axis=-1)
                    if not synthesize:
                        with tf.variable_scope('speaker_classifier'):
                            speaker_prediction = tf.layers.dense(
                                inputs=speaker_embedding, units=256)
                            speaker_prediction = tf.layers.dense(
                                inputs=speaker_prediction,
                                units=nb_speaker)  # Softmax inside the loss
                else:
                    attention_inputs = encoder_outputs

                # Decoder Parts
                # Attention Decoder Prenet
                prenet = Prenet(is_training,
                                layers_sizes=hp.prenet_layers,
                                drop_rate=hp.tacotron_dropout_rate,
                                scope='decoder_prenet')
                # Attention Mechanism
                attention_mechanism = LocationSensitiveAttention(
                    hp.attention_dim,
                    attention_inputs,
                    hparams=hp,
                    mask_encoder=hp.mask_encoder,
                    memory_sequence_length=tf.reshape(tower_input_lengths[i],
                                                      [-1]),
                    smoothing=hp.smoothing,
                    cumulate_weights=hp.cumulative_weights)
                # Decoder LSTM Cells
                decoder_lstm = DecoderRNN(is_training,
                                          layers=hp.decoder_layers,
                                          size=hp.decoder_lstm_units,
                                          zoneout=hp.tacotron_zoneout_rate,
                                          scope='decoder_LSTM')
                # Frames Projection layer
                frame_projection = FrameProjection(
                    hp.num_mels * hp.outputs_per_step,
                    scope='linear_transform_projection')
                # <stop_token> projection layer
                stop_projection = StopProjection(
                    is_training or is_evaluating,
                    shape=hp.outputs_per_step,
                    scope='stop_token_projection')

                # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                decoder_cell = TacotronDecoderCell(prenet, attention_mechanism,
                                                   decoder_lstm,
                                                   frame_projection,
                                                   stop_projection)

                # Define the helper for our decoder
                if is_training or is_evaluating or gta:
                    self.helper = TacoTrainingHelper(batch_size,
                                                     tower_mel_targets[i], hp,
                                                     gta, is_evaluating,
                                                     global_step)
                else:
                    self.helper = TacoTestHelper(batch_size, hp)

                # initial decoder state
                decoder_init_state = decoder_cell.zero_state(
                    batch_size=batch_size, dtype=tf.float32)

                # Only use max iterations at synthesis time
                max_iters = hp.max_iters if not (
                    is_training or is_evaluating) else None

                # Decode
                (frames_prediction, stop_token_prediction,
                 _), final_decoder_state, _ = dynamic_decode(
                     CustomDecoder(decoder_cell, self.helper,
                                   decoder_init_state),
                     impute_finished=False,
                     maximum_iterations=max_iters,
                     swap_memory=hp.tacotron_swap_with_cpu)

                # Reshape outputs to be one output per entry
                # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                decoder_output = tf.reshape(frames_prediction,
                                            [batch_size, -1, hp.num_mels])
                stop_token_prediction = tf.reshape(stop_token_prediction,
                                                   [batch_size, -1])

                # Postnet
                postnet = Postnet(is_training,
                                  hparams=hp,
                                  scope='postnet_convolutions')

                # Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
                residual = postnet(decoder_output)

                # Project residual to same dimension as mel spectrogram
                # ==> [batch_size, decoder_steps * r, num_mels]
                residual_projection = FrameProjection(
                    hp.num_mels, scope='postnet_projection')
                projected_residual = residual_projection(residual)

                # Compute the mel spectrogram
                mel_outputs = decoder_output + projected_residual

                if post_condition:
                    # Add post-processing CBHG. This does a great job at
                    # extracting features from mels before projection to
                    # Linear specs.
                    post_cbhg = CBHG(hp.cbhg_kernels,
                                     hp.cbhg_conv_channels,
                                     hp.cbhg_pool_size,
                                     [hp.cbhg_projection, hp.num_mels],
                                     hp.cbhg_projection_kernel_size,
                                     hp.cbhg_highwaynet_layers,
                                     hp.cbhg_highway_units,
                                     hp.cbhg_rnn_units,
                                     is_training,
                                     name='CBHG_postnet')

                    # [batch_size, decoder_steps(mel_frames), cbhg_channels]
                    post_outputs = post_cbhg(mel_outputs, None)

                    # Linear projection of extracted features to make linear spectrogram
                    linear_specs_projection = FrameProjection(
                        hp.num_freq, scope='cbhg_linear_specs_projection')

                    # [batch_size, decoder_steps(linear_frames), num_freq]
                    linear_outputs = linear_specs_projection(post_outputs)

                # Grab alignments from the final decoder state
                alignments = tf.transpose(
                    final_decoder_state.alignment_history.stack(), [1, 2, 0])

                self.tower_decoder_output.append(decoder_output)
                self.tower_alignments.append(alignments)
                self.tower_stop_token_prediction.append(stop_token_prediction)
                self.tower_mel_outputs.append(mel_outputs)
                tower_embedded_inputs.append(embedded_inputs)
                tower_enc_conv_output_shape.append(enc_conv_output_shape)
                tower_encoder_outputs.append(encoder_outputs)
                tower_residual.append(residual)
                tower_projected_residual.append(projected_residual)

                if not synthesize:
                    if hp.tacotron_multi_speaker:
                        tower_speaker_id_pred.append(speaker_prediction)

                if post_condition:
                    self.tower_linear_outputs.append(linear_outputs)
        log('initialisation done {}'.format(gpus[i]))

    if is_training:
        self.ratio = self.helper._ratio
    self.tower_inputs = tower_inputs
    self.tower_input_lengths = tower_input_lengths
    self.tower_mel_targets = tower_mel_targets
    self.tower_linear_targets = tower_linear_targets
    self.tower_targets_lengths = tower_targets_lengths
    self.tower_stop_token_targets = tower_stop_token_targets
    if hp.tacotron_multi_speaker:
        self.tower_speaker_id_pred = tower_speaker_id_pred
        self.tower_speaker_id_target = tower_speaker_id_target

    self.all_vars = tf.trainable_variables()

    log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
    log(' Train mode: {}'.format(is_training))
    log(' Eval mode: {}'.format(is_evaluating))
    log(' GTA mode: {}'.format(gta))
    log(' Synthesis mode: {}'.format(not (is_training or is_evaluating)))
    log(' Input: {}'.format(inputs.shape))
    # The tower lists hold exactly tacotron_num_gpus entries, so iterate over
    # that count only; adding tacotron_gpu_start_idx to the bound would index
    # past the end of the lists whenever the start index is non-zero.
    for i in range(hp.tacotron_num_gpus):
        log(' device: {}'.format(hp.tacotron_gpu_start_idx + i))
        log(' embedding: {}'.format(tower_embedded_inputs[i].shape))
        log(' enc conv out: {}'.format(tower_enc_conv_output_shape[i]))
        log(' encoder out: {}'.format(tower_encoder_outputs[i].shape))
        log(' decoder out: {}'.format(self.tower_decoder_output[i].shape))
        log(' residual out: {}'.format(tower_residual[i].shape))
        log(' projected residual out: {}'.format(
            tower_projected_residual[i].shape))
        log(' mel out: {}'.format(self.tower_mel_outputs[i].shape))
        if post_condition:
            log(' linear out: {}'.format(self.tower_linear_outputs[i].shape))
        log(' <stop_token> out: {}'.format(
            self.tower_stop_token_prediction[i].shape))

    # 1_000_000 is causing syntax problems for some people?! Python please :)
    log(' Tacotron Parameters {:.3f} Million.'.format(
        np.sum([np.prod(v.get_shape().as_list())
                for v in self.all_vars]) / 1000000))
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, gta=False):
    """Build the inference graph and expose its tensors as attributes.

    Populates "inputs", "input_lengths", "decoder_output", "alignments",
    "stop_token_prediction", "stop_token_targets", "mel_outputs" and
    "mel_targets", and keeps the decoding helper in "helper".

    Args:
        - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in
          is number of steps in the input time series, and values are character
          IDs
        - input_lengths: int32 Tensor with shape [N] where N is batch size and
          values are the lengths of each sequence in inputs.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch
          size, T_out is number of steps in the output time series, M is
          num_mels, and values are entries in the mel spectrogram. Only needed
          for training.
        - stop_token_targets: stop token targets; must accompany mel_targets
          unless gta is set.
        - gta: forwarded to the training helper; when set, the graph is not
          considered to be in training mode.

    Raises:
        ValueError: if stop token targets are given without mel targets, or mel
          targets are given without stop token targets outside GTA mode.
    """
    has_mels = mel_targets is not None
    has_stop_tokens = stop_token_targets is not None

    if has_stop_tokens and not has_mels:
        raise ValueError(
            'no mel targets were provided but token_targets were given')
    if has_mels and not has_stop_tokens and not gta:
        raise ValueError(
            'Mel targets are provided without corresponding token_targets')

    with tf.variable_scope('inference'):
        hp = self._hparams
        is_training = has_mels and not gta
        batch_size = tf.shape(inputs)[0]

        # Character embeddings ==> [batch_size, sequence_length, embedding_dim]
        symbol_embeddings = tf.get_variable(
            'inputs_embedding', [len(symbols), hp.embedding_dim],
            dtype=tf.float32)
        embedded_inputs = tf.nn.embedding_lookup(symbol_embeddings, inputs)

        # Encoder ==> [batch_size, encoder_steps, encoder_lstm_units]
        encoder_convs = EncoderConvolutions(
            is_training,
            kernel_size=hp.enc_conv_kernel_size,
            channels=hp.enc_conv_channels,
            scope='encoder_convolutions')
        encoder_rnn = EncoderRNN(
            is_training,
            size=hp.encoder_lstm_units,
            zoneout=hp.tacotron_zoneout_rate,
            scope='encoder_LSTM')
        encoder_cell = TacotronEncoderCell(encoder_convs, encoder_rnn)
        encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

        # Kept only so the shape can be reported in the summary below.
        enc_conv_output_shape = encoder_cell.conv_output_shape

        # Decoder building blocks: prenet, attention, LSTM stack, projections.
        prenet = Prenet(is_training,
                        layer_sizes=hp.prenet_layers,
                        scope='decoder_prenet')
        attention_mechanism = LocationSensitiveAttention(
            hp.attention_dim,
            encoder_outputs,
            mask_encoder=hp.mask_encoder,
            memory_sequence_length=input_lengths,
            smoothing=hp.smoothing)
        decoder_lstm = DecoderRNN(is_training,
                                  layers=hp.decoder_layers,
                                  size=hp.decoder_lstm_units,
                                  zoneout=hp.tacotron_zoneout_rate,
                                  scope='decoder_lstm')
        frame_projection = FrameProjection(
            hp.num_mels * hp.outputs_per_step, scope='linear_transform')
        stop_projection = StopProjection(is_training,
                                         scope='stop_token_projection')

        # Decoder cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
        decoder_cell = TacotronDecoderCell(prenet,
                                           attention_mechanism,
                                           decoder_lstm,
                                           frame_projection,
                                           stop_projection,
                                           mask_finished=hp.mask_finished)

        # Teacher-forced helper for training / GTA, free-running otherwise.
        use_training_helper = (is_training or gta) == True
        if use_training_helper:
            decoding_helper = TacoTrainingHelper(
                batch_size, mel_targets, stop_token_targets, hp.num_mels,
                hp.outputs_per_step, hp.tacotron_teacher_forcing_ratio, gta)
        else:
            decoding_helper = TacoTestHelper(batch_size, hp.num_mels,
                                             hp.outputs_per_step)
        self.helper = decoding_helper

        initial_state = decoder_cell.zero_state(batch_size=batch_size,
                                                dtype=tf.float32)

        # The iteration cap only applies outside training.
        max_iters = None if is_training else hp.max_iters

        decode_result = dynamic_decode(
            CustomDecoder(decoder_cell, self.helper, initial_state),
            impute_finished=hp.impute_finished,
            maximum_iterations=max_iters)
        (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = decode_result

        # One frame per entry
        # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
        decoder_output = tf.reshape(frames_prediction,
                                    [batch_size, -1, hp.num_mels])
        stop_token_prediction = tf.reshape(stop_token_prediction,
                                           [batch_size, -1])

        # Post-net residual ==> [batch_size, decoder_steps * r, postnet_channels]
        postnet = Postnet(is_training,
                          kernel_size=hp.postnet_kernel_size,
                          channels=hp.postnet_channels,
                          scope='postnet_convolutions')
        residual = postnet(decoder_output)

        # Residual projected back to mel size
        # ==> [batch_size, decoder_steps * r, num_mels]
        residual_projection = FrameProjection(hp.num_mels,
                                              scope='postnet_projection')
        projected_residual = residual_projection(residual)

        # Final mel spectrogram = decoder output + projected residual
        mel_outputs = decoder_output + projected_residual

        # Alignment history of the final decoder state, transposed with
        # permutation [1, 2, 0].
        alignments = tf.transpose(
            final_decoder_state.alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.decoder_output = decoder_output
        self.alignments = alignments
        self.stop_token_prediction = stop_token_prediction
        self.stop_token_targets = stop_token_targets
        self.mel_outputs = mel_outputs
        self.mel_targets = mel_targets

        log('Initialized Tacotron model. Dimensions: ')
        summary = (
            (' embedding: {}', embedded_inputs.shape),
            (' enc conv out: {}', enc_conv_output_shape),
            (' encoder out: {}', encoder_outputs.shape),
            (' decoder out: {}', decoder_output.shape),
            (' residual out: {}', residual.shape),
            (' projected residual out: {}', projected_residual.shape),
            (' mel out: {}', mel_outputs.shape),
            (' <stop_token> out: {}', stop_token_prediction.shape),
        )
        for template, value in summary:
            log(template.format(value))
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None,
               pml_targets=None, is_training=False, gta=False, eal=False,
               locked_alignments=None, logs_enabled=True, flag_trainAlign=False,
               flag_trainJoint=False, alignScale=1.0, flag_online_eal_eval=False):
    '''Builds the inference graph, optionally with a second "online EAL" pass.

    Sets the "inputs", "input_lengths", "pml_intermediates", "pml_outputs",
    "alignments", "pml_targets", "attention_cell" and "locked_alignments"
    fields. When the online EAL pass is built, "pml_intermediates_eal" and
    "pml_outputs_eal" are set as well.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is
        number of steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and
        values are the lengths of each sequence in inputs.
      mel_targets: accepted for interface compatibility; not read by this
        method.
      linear_targets: accepted for interface compatibility; not read by this
        method.
      pml_targets: float32 Tensor with shape [N, T_out, P] where N is
        batch_size, T_out is number of steps in the PML vocoder features
        trajectories, P is pml_dimension, and values are PML vocoder features.
        Only needed for training.
      is_training: boolean flag that is set to True during training
      gta: boolean flag that is set to True when ground truth alignment is
        required
      eal: boolean flag selecting the EAL training helper; when set and no
        locked_alignments are given, the two-pass online EAL graph is built.
      locked_alignments: when explicit attention alignment is required, the
        locked alignments are passed in this parameter and the attention
        alignments are locked to these values
      logs_enabled: boolean flag that defaults to True, if False no
        construction logs output
      flag_trainAlign: stored on the model and forwarded to the attention
        wrappers.
      flag_trainJoint: stored on the model and forwarded to the attention
        wrappers.
      alignScale: stored on the model as "alignScale".
      flag_online_eal_eval: forces the two-pass graph; the second pass then
        runs without locked alignments.

    Raises:
      ValueError: if is_training is set but none of gta, eal or
        hp.scheduled_sampling selects a training helper.
    '''
    locked_alignments_ = locked_alignments

    self.flag_trainAlign = flag_trainAlign
    self.flag_trainJoint = flag_trainJoint
    self.alignScale = alignScale
    self.flag_online_eal = (
        eal and (locked_alignments is None)) or flag_online_eal_eval

    # Fix the alignments shape to (batch_size, encoder_steps, decoder_steps) if
    # not already including the batch dimension. Left untouched for EAL
    # training.
    if (locked_alignments_ is not None and not (is_training and eal)
            and np.ndim(locked_alignments_) < 3):
        locked_alignments_ = np.expand_dims(locked_alignments_, 0)

    with tf.variable_scope('inference'):
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), hp.embed_depth],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(
            embedding_table, inputs)  # [N, T_in, embed_depth=256]

        # Encoder
        prenet_outputs = prenet(
            embedded_inputs, is_training,
            hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
        encoder_outputs = encoder_cbhg(
            prenet_outputs, input_lengths, is_training,
            hp.encoder_depth)  # [N, T_in, encoder_depth=256]

        # Attention
        attention_cell = LockableAttentionWrapper(
            GRUCell(hp.attention_depth),
            LocationSensitiveAttention(hp.attention_depth, encoder_outputs),
            alignment_history=True,
            locked_alignments=locked_alignments_,
            output_attention=False,
            name='attention_wrapper',
            flag_trainAlign=self.flag_trainAlign,
            flag_trainJoint=self.flag_trainJoint
        )  # [N, T_in, attention_depth=256]

        # Apply prenet before concatenation in AttentionWrapper.
        prenet_cell = DecoderPrenetWrapper(attention_cell, is_training,
                                           hp.prenet_depths)

        # Concatenate attention context vector and RNN cell output into a
        # 2*attention_depth=512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(
            prenet_cell)  # [N, T_in, 2*attention_depth=512]

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell(
            [
                OutputProjectionWrapper(concat_cell, hp.decoder_depth),
                ResidualWrapper(GRUCell(hp.decoder_depth)),
                ResidualWrapper(GRUCell(hp.decoder_depth)),
            ],
            state_is_tuple=True)  # [N, T_in, decoder_depth=256]

        # Project onto r PML feature vectors (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(
            decoder_cell, hp.pml_dimension * hp.outputs_per_step)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                    dtype=tf.float32)

        # Pick the decoding helper(s). The online EAL graph needs two helpers:
        # one for the first pass and one for the second pass.
        if gta:
            helper = TacoTrainingHelper(inputs, pml_targets, hp.pml_dimension,
                                        hp.outputs_per_step)
        elif eal:
            if self.flag_online_eal:
                helper_gta = TacoTrainingHelper(inputs, pml_targets,
                                                hp.pml_dimension,
                                                hp.outputs_per_step)
                helper_eal = TacoTrainingHelper_EAL(inputs, pml_targets,
                                                    hp.pml_dimension,
                                                    hp.outputs_per_step)
            else:
                helper = TacoTrainingHelper_EAL(inputs, pml_targets,
                                                hp.pml_dimension,
                                                hp.outputs_per_step)
        elif hp.scheduled_sampling:
            helper = TacoScheduledOutputTrainingHelper(
                inputs, pml_targets, hp.pml_dimension, hp.outputs_per_step,
                hp.scheduled_sampling_probability)
        else:
            if is_training:
                # No helper can be chosen here; fail with the reason instead of
                # an unbound-variable error further down.
                raise ValueError(
                    'For training, one of these should be true: gta, eal, hp.scheduled_sampling'
                )
            helper = TacoTestHelper(batch_size, hp.pml_dimension,
                                    hp.outputs_per_step)
            if flag_online_eal_eval:
                helper_gta = helper
                helper_eal = helper

        # First (or only) decoding pass.
        first_pass_helper = helper_gta if self.flag_online_eal else helper
        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, first_pass_helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

        # Reshape outputs to be one output per entry
        pml_intermediates = tf.reshape(
            decoder_outputs,
            [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

        # Add post-processing CBHG:
        post_outputs = post_cbhg(
            pml_intermediates, hp.pml_dimension, is_training,
            hp.postnet_depth)  # [N, T_out, postnet_depth=256]
        pml_outputs = tf.layers.dense(
            post_outputs, hp.pml_dimension)  # [N, T_out, P]

        # Grab alignments from the final decoder state. In online EAL mode they
        # become the locked alignments of the second pass; otherwise they are
        # the model's alignments.
        first_pass_alignments = tf.transpose(
            final_decoder_state[0].alignment_history.stack(), [1, 2, 0])
        if self.flag_online_eal:
            locked_alignments_ = first_pass_alignments
        else:
            alignments = first_pass_alignments

    with tf.variable_scope('inference_eal'):
        if self.flag_online_eal:
            # Embeddings
            embedding_table_eal = tf.get_variable(
                'embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs_eal = tf.nn.embedding_lookup(
                embedding_table_eal, inputs)  # [N, T_in, embed_depth=256]

            # Encoder
            prenet_outputs_eal = prenet(
                embedded_inputs_eal, is_training,
                hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
            encoder_outputs_eal = encoder_cbhg(
                prenet_outputs_eal, input_lengths, is_training,
                hp.encoder_depth)  # [N, T_in, encoder_depth=256]

            # Attention. In evaluation the second pass runs unlocked.
            if flag_online_eal_eval:
                locked_alignments_ = None
            attention_cell_eal = LockableAttentionWrapper(
                GRUCell(hp.attention_depth),
                LocationSensitiveAttention(hp.attention_depth,
                                           encoder_outputs_eal),
                alignment_history=True,
                locked_alignments=locked_alignments_,
                output_attention=False,
                name='attention_wrapper',
                flag_trainAlign=self.flag_trainAlign,
                flag_trainJoint=self.flag_trainJoint
            )  # [N, T_in, attention_depth=256]

            # Apply prenet before concatenation in AttentionWrapper.
            prenet_cell_eal = DecoderPrenetWrapper(attention_cell_eal,
                                                   is_training,
                                                   hp.prenet_depths)

            # Concatenate attention context vector and RNN cell output into a
            # 2*attention_depth=512D vector.
            concat_cell_eal = ConcatOutputAndAttentionWrapper(
                prenet_cell_eal)  # [N, T_in, 2*attention_depth=512]

            # Decoder (layers specified bottom to top):
            decoder_cell_eal = MultiRNNCell(
                [
                    OutputProjectionWrapper(concat_cell_eal, hp.decoder_depth),
                    ResidualWrapper(GRUCell(hp.decoder_depth)),
                    ResidualWrapper(GRUCell(hp.decoder_depth)),
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Project onto r PML feature vectors (predict r outputs at each RNN step):
            output_cell_eal = OutputProjectionWrapper(
                decoder_cell_eal, hp.pml_dimension * hp.outputs_per_step)

            # The initial state must come from the cell that is decoded below,
            # i.e. the EAL cell rather than the first-pass cell.
            decoder_init_state_eal = output_cell_eal.zero_state(
                batch_size=batch_size, dtype=tf.float32)

            (decoder_outputs_eal, _), final_decoder_state_eal, _ = tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(output_cell_eal, helper_eal,
                             decoder_init_state_eal),
                maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

            # Reshape outputs to be one output per entry
            pml_intermediates_eal = tf.reshape(
                decoder_outputs_eal,
                [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

            # Add post-processing CBHG:
            post_outputs_eal = post_cbhg(
                pml_intermediates_eal, hp.pml_dimension, is_training,
                hp.postnet_depth)  # [N, T_out, postnet_depth=256]
            pml_outputs_eal = tf.layers.dense(
                post_outputs_eal, hp.pml_dimension)  # [N, T_out, P]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state_eal[0].alignment_history.stack(),
                [1, 2, 0])

            self.pml_intermediates_eal = pml_intermediates_eal
            self.pml_outputs_eal = pml_outputs_eal

    with tf.variable_scope('inference'):
        self.inputs = inputs
        self.input_lengths = input_lengths
        self.pml_intermediates = pml_intermediates
        self.pml_outputs = pml_outputs
        self.alignments = alignments
        self.pml_targets = pml_targets
        self.attention_cell = attention_cell
        self.locked_alignments = locked_alignments_

        if logs_enabled:
            log('Initialized Tacotron model. Dimensions: ')
            log(' Train mode: {}'.format(is_training))
            log(' GTA mode: {}'.format(gta))
            log(' EAL mode: {}'.format(eal))
            log(' Embedding: {}'.format(embedded_inputs.shape[-1]))
            log(' Prenet out: {}'.format(prenet_outputs.shape[-1]))
            log(' Encoder out: {}'.format(encoder_outputs.shape[-1]))
            log(' Attention out: {}'.format(attention_cell.output_size))
            log(' Concat attn & out: {}'.format(concat_cell.output_size))
            log(' Decoder cell out: {}'.format(decoder_cell.output_size))
            log(' Decoder out ({} frames): {}'.format(
                hp.outputs_per_step, decoder_outputs.shape[-1]))
            log(' Decoder out (1 frame): {}'.format(
                pml_intermediates.shape[-1]))
            log(' Postnet out: {}'.format(post_outputs.shape[-1]))
            log(' PML out: {}'.format(pml_outputs.shape[-1]))
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None,
               targets_lengths=None, global_step=None, is_training=False, split_infos=None):
    """
    Initializes the multi-tower model for inference

    sets the "tower_mel_outputs", "tower_decoder_output", "tower_alignments" and
    "tower_stop_token_prediction" fields (one entry per tower), plus the split
    "tower_*" inputs/targets and "all_vars".

    Args:
        - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
          steps in the input time series, and values are character IDs
        - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
          of each sequence in inputs.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
          of steps in the output time series, M is num_mels, and values are entries in the mel
          spectrogram. Only needed for training.
        - stop_token_targets: must be given together with mel_targets (both or neither).
        - targets_lengths: required when is_training and hparams.mask_decoder are both set.
        - global_step: must not be None when the teacher forcing mode is 'scheduled'
          and is_training is set.
        - is_training: selects TacoTrainingHelper (True) or TacoTestHelper (False).
        - split_infos: required. Indexed as split_infos[:, k]; columns 0, 1 and 2 are handed to
          split_func together with inputs, mel_targets and stop_token_targets respectively.

    Raises:
        ValueError: if only one of mel_targets / stop_token_targets is given, or if
            split_infos is missing.
        RuntimeError: if decoder masking is requested in training without targets_lengths.
    """
    hp = self._hparams

    if mel_targets is None and stop_token_targets is not None:
        raise ValueError('no mel targets were provided but token_targets were given')
    if mel_targets is not None and stop_token_targets is None:
        raise ValueError('Mel targets are provided without corresponding token_targets')
    if is_training and hp.mask_decoder and targets_lengths is None:
        raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!')
    # split_infos is subscripted unconditionally below; fail early with a
    # readable message instead of a "NoneType is not subscriptable" TypeError.
    if split_infos is None:
        raise ValueError('split_infos must be provided to split the batch across towers')

    num_towers = hp.tacotron_num_gpus
    split_device = '/cpu:0' if num_towers > 1 or hp.split_on_cpu else '/gpu:0'
    with tf.device(split_device):
        lout_int = [tf.int32] * num_towers
        lout_float = [tf.float32] * num_towers

        tower_input_lengths = tf.split(input_lengths, num_or_size_splits=num_towers, axis=0)
        tower_targets_lengths = (
            tf.split(targets_lengths, num_or_size_splits=num_towers, axis=0)
            if targets_lengths is not None else targets_lengths)

        p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]], lout_int)
        p_mel_targets = (
            tf.py_func(split_func, [mel_targets, split_infos[:, 1]], lout_float)
            if mel_targets is not None else mel_targets)
        p_stop_token_targets = (
            tf.py_func(split_func, [stop_token_targets, split_infos[:, 2]], lout_float)
            if stop_token_targets is not None else stop_token_targets)

        tower_inputs = []
        tower_mel_targets = []
        tower_stop_token_targets = []

        batch_size = tf.shape(inputs)[0]
        mel_channels = hp.num_mels
        for i in range(num_towers):
            tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
            if p_mel_targets is not None:
                tower_mel_targets.append(
                    tf.reshape(p_mel_targets[i], [batch_size, -1, mel_channels]))
            if p_stop_token_targets is not None:
                tower_stop_token_targets.append(
                    tf.reshape(p_stop_token_targets[i], [batch_size, -1]))

    # Range used to clip decoder / mel outputs when hp.clip_outputs is set.
    T2_output_range = (
        (-hp.max_abs_value, hp.max_abs_value) if hp.symmetric_mels else (0, hp.max_abs_value))

    self.tower_decoder_output = []
    self.tower_alignments = []
    self.tower_stop_token_prediction = []
    self.tower_mel_outputs = []

    tower_embedded_inputs = []
    tower_enc_conv_output_shape = []
    tower_encoder_outputs = []
    tower_residual = []
    tower_projected_residual = []

    # 1. Declare GPU Devices
    gpus = ["/gpu:{}".format(i) for i in range(num_towers)]
    for i in range(num_towers):
        with tf.device(
                tf.train.replica_device_setter(
                    ps_tasks=1, ps_device="/cpu:0", worker_device=gpus[i])):
            with tf.variable_scope('inference'):
                assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
                if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                    assert global_step is not None

                # Embeddings ==> [batch_size, sequence_length, embedding_dim]
                self.embedding_table = tf.get_variable(
                    'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32)
                embedded_inputs = tf.nn.embedding_lookup(self.embedding_table, tower_inputs[i])

                # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                encoder_cell = TacotronEncoderCell(
                    EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'),
                    EncoderRNN(is_training, size=hp.encoder_lstm_units,
                               zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM'))

                encoder_outputs = encoder_cell(embedded_inputs, tower_input_lengths[i])

                # For shape visualization purpose
                enc_conv_output_shape = encoder_cell.conv_output_shape

                # Decoder Parts
                # Attention Decoder Prenet
                prenet = Prenet(is_training, layers_sizes=hp.prenet_layers,
                                drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet')
                # Attention Mechanism
                attention_mechanism = LocationSensitiveAttention(
                    hp.attention_dim, encoder_outputs, hparams=hp,
                    is_training=is_training, mask_encoder=hp.mask_encoder,
                    memory_sequence_length=tf.reshape(tower_input_lengths[i], [-1]),
                    smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights)
                # Decoder LSTM Cells
                decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
                                          size=hp.decoder_lstm_units,
                                          zoneout=hp.tacotron_zoneout_rate, scope='decoder_LSTM')
                # Frames Projection layer
                frame_projection = FrameProjection(
                    hp.num_mels * hp.outputs_per_step, scope='linear_transform_projection')
                # <stop_token> projection layer
                stop_projection = StopProjection(
                    is_training, shape=hp.outputs_per_step, scope='stop_token_projection')

                # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                decoder_cell = TacotronDecoderCell(
                    prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection)

                # Define the helper for our decoder
                if is_training:
                    self.helper = TacoTrainingHelper(
                        batch_size, tower_mel_targets[i], hp, global_step)
                else:
                    self.helper = TacoTestHelper(batch_size, hp)

                # initial decoder state
                decoder_init_state = decoder_cell.zero_state(
                    batch_size=batch_size, dtype=tf.float32)

                # Only use max iterations at synthesis time
                max_iters = hp.max_iters if not is_training else None

                # Decode
                (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
                    CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                    impute_finished=False,
                    maximum_iterations=max_iters,
                    swap_memory=hp.tacotron_swap_with_cpu)

                # Reshape outputs to be one output per entry
                # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
                stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])

                if hp.clip_outputs:
                    decoder_output = tf.minimum(
                        tf.maximum(decoder_output, T2_output_range[0] - hp.lower_bound_decay),
                        T2_output_range[1])

                # Postnet
                postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions')

                # Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
                residual = postnet(decoder_output)

                # Project residual to same dimension as mel spectrogram
                # ==> [batch_size, decoder_steps * r, num_mels]
                residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection')
                projected_residual = residual_projection(residual)

                # Compute the mel spectrogram
                mel_outputs = decoder_output + projected_residual

                if hp.clip_outputs:
                    mel_outputs = tf.minimum(
                        tf.maximum(mel_outputs, T2_output_range[0] - hp.lower_bound_decay),
                        T2_output_range[1])

                # Grab alignments from the final decoder state
                alignments = tf.transpose(
                    final_decoder_state.alignment_history.stack(), [1, 2, 0])

                self.tower_decoder_output.append(decoder_output)
                self.tower_alignments.append(alignments)
                self.tower_stop_token_prediction.append(stop_token_prediction)
                self.tower_mel_outputs.append(mel_outputs)
                tower_embedded_inputs.append(embedded_inputs)
                tower_enc_conv_output_shape.append(enc_conv_output_shape)
                tower_encoder_outputs.append(encoder_outputs)
                tower_residual.append(residual)
                tower_projected_residual.append(projected_residual)

        log('initialisation done {}'.format(gpus[i]))

    if is_training:
        # self.helper is the helper of the last tower built above.
        self.ratio = self.helper._ratio
    self.tower_inputs = tower_inputs
    self.tower_input_lengths = tower_input_lengths
    self.tower_mel_targets = tower_mel_targets
    self.tower_targets_lengths = tower_targets_lengths
    self.tower_stop_token_targets = tower_stop_token_targets

    self.all_vars = tf.trainable_variables()

    log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
    log(' Train mode: {}'.format(is_training))
    log(' Input: {}'.format(inputs.shape))
    for i in range(num_towers):
        log(' device: {}'.format(i))
        log(' embedding: {}'.format(tower_embedded_inputs[i].shape))
        log(' enc conv out: {}'.format(tower_enc_conv_output_shape[i]))
        log(' encoder out: {}'.format(tower_encoder_outputs[i].shape))
        log(' decoder out: {}'.format(self.tower_decoder_output[i].shape))
        log(' residual out: {}'.format(tower_residual[i].shape))
        log(' projected residual out: {}'.format(tower_projected_residual[i].shape))
        log(' mel out: {}'.format(self.tower_mel_outputs[i].shape))
        log(' <stop_token> out: {}'.format(self.tower_stop_token_prediction[i].shape))

    # 1_000_000 is causing syntax problems for some people?! Python please :)
    log(' Tacotron Parameters {:.3f} Million.'.format(
        np.sum([np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000))
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None,
               linear_targets=None, targets_lengths=None, gta=False, global_step=None,
               is_training=False, is_evaluating=False, reference_mel=None):
    """
    Initializes the model for inference

    sets "mel_outputs" and "alignments" fields (and "style_embeddings",
    "decoder_output", "stop_token_prediction", optionally "linear_outputs").

    Args:
        - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
          steps in the input time series, and values are character IDs
        - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
          of each sequence in inputs.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
          of steps in the output time series, M is num_mels, and values are entries in the mel
          spectrogram. Only needed for training.
        - stop_token_targets: required with mel_targets unless gta is set.
        - linear_targets: required in training when hparams.predict_linear is set; rejected in GTA mode.
        - targets_lengths: required in training when hparams.mask_decoder is set.
        - gta: ground-truth-aligned mode flag; disables the linear post-processing network.
        - global_step: must not be None for 'scheduled' teacher forcing in training.
        - is_training / is_evaluating: mutually exclusive mode flags.
        - reference_mel: mel fed to the reference encoder; overridden by mel_targets in training.
          When None, a random combination of the style tokens is used, which needs hparams.use_gst.

    Raises:
        ValueError: on inconsistent targets, or when no reference mel is available and
            hparams.use_gst is off (there would be no style tokens to fall back on).
        RuntimeError: on missing targets_lengths for masking, or training and evaluation
            requested together.
    """
    hp = self._hparams

    if mel_targets is None and stop_token_targets is not None:
        raise ValueError('no mel targets were provided but token_targets were given')
    if mel_targets is not None and stop_token_targets is None and not gta:
        raise ValueError('Mel targets are provided without corresponding token_targets')
    if not gta and hp.predict_linear and linear_targets is None and is_training:
        raise ValueError('Model is set to use post processing to predict linear spectrograms in training but no linear targets given!')
    if gta and linear_targets is not None:
        raise ValueError('Linear spectrogram prediction is not supported in GTA mode!')
    if is_training and hp.mask_decoder and targets_lengths is None:
        raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!')
    if is_training and is_evaluating:
        raise RuntimeError('Model can not be in training and evaluation modes at the same time!')

    # In training the style reference is always the target mel itself.
    if is_training:
        reference_mel = mel_targets
    # The random-style fallback below samples from the style tokens, which are
    # only created when use_gst is on; fail early instead of hitting an
    # unbound local halfway through graph construction.
    if reference_mel is None and not hp.use_gst:
        raise ValueError('No reference mel was provided and hparams.use_gst is disabled: '
                         'there are no style tokens to draw a random style from!')

    with tf.variable_scope('inference'):
        batch_size = tf.shape(inputs)[0]
        assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
        if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
            assert global_step is not None

        # GTA is only used for predicting mels to train Wavenet vocoder, so we omit
        # post processing when doing GTA synthesis
        post_condition = hp.predict_linear and not gta

        # Embeddings ==> [batch_size, sequence_length, embedding_dim]
        embedding_table = tf.get_variable(
            'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32)
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

        if hp.use_gst:
            # Global style tokens (GST)
            gst_tokens = tf.get_variable(
                'style_tokens', [hp.num_gst, hp.style_embed_depth // hp.num_heads],
                dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5))
            self.gst_tokens = gst_tokens

        # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
        encoder_cell = TacotronEncoderCell(
            EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'),
            EncoderRNN(is_training, size=hp.encoder_lstm_units,
                       zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM'))

        encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

        # For shape visualization purpose
        enc_conv_output_shape = encoder_cell.conv_output_shape

        if reference_mel is not None:
            # Reference encoder
            refnet_outputs = reference_encoder(
                reference_mel,
                filters=hp.reference_filters,
                kernel_size=(3, 3),
                strides=(2, 2),
                encoder_cell=GRUCell(hp.reference_depth),
                is_training=is_training)  # [N, 128]
            self.refnet_outputs = refnet_outputs

            if hp.use_gst:
                # Style attention
                style_attention = MultiheadAttention(
                    tf.expand_dims(refnet_outputs, axis=1),  # [N, 1, 128]
                    tf.tanh(tf.tile(tf.expand_dims(gst_tokens, axis=0),
                                    [batch_size, 1, 1])),  # [N, hp.num_gst, 256/hp.num_heads]
                    num_heads=hp.num_heads,
                    num_units=hp.style_att_dim,
                    attention_type=hp.style_att_type)

                style_embeddings = style_attention.multi_head_attention()
            else:
                style_embeddings = tf.expand_dims(refnet_outputs, axis=1)  # [N, 1, 128]
        else:
            print("Use random weight for GST.")
            random_weights = tf.random_uniform(
                [hp.num_heads, hp.num_gst], maxval=1.0, dtype=tf.float32)
            random_weights = tf.nn.softmax(random_weights, name="random_weights")
            style_embeddings = tf.matmul(random_weights, tf.nn.tanh(gst_tokens))
            style_embeddings = tf.reshape(
                style_embeddings,
                [1, 1] + [hp.num_heads * gst_tokens.get_shape().as_list()[1]])

        # Extend style embeddings to be compatible with encoder_outputs:
        # pad the last axis with an equally sized block of zeros so the sum below
        # preserves the effect of both style and encoder_outputs.
        zero_padding = tf.zeros_like(style_embeddings)
        style_embeddings = tf.concat([style_embeddings, zero_padding], axis=-1)

        # Add style embedding to every text encoder state
        style_embeddings = tf.tile(
            style_embeddings, [1, shape_list(encoder_outputs)[1], 1])
        encoder_outputs = tf.add(encoder_outputs, style_embeddings)

        # Decoder Parts
        # Attention Decoder Prenet
        prenet = Prenet(is_training, layers_sizes=hp.prenet_layers,
                        drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet')
        # Attention Mechanism
        attention_mechanism = LocationSensitiveAttention(
            hp.attention_dim, encoder_outputs, hparams=hp,
            mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths,
            smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights)
        # Decoder LSTM Cells
        decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
                                  size=hp.decoder_lstm_units,
                                  zoneout=hp.tacotron_zoneout_rate, scope='decoder_lstm')
        # Frames Projection layer
        frame_projection = FrameProjection(
            hp.num_mels * hp.outputs_per_step, scope='linear_transform')
        # <stop_token> projection layer
        stop_projection = StopProjection(
            is_training or is_evaluating, shape=hp.outputs_per_step,
            scope='stop_token_projection')

        # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
        decoder_cell = TacotronDecoderCell(
            prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection)

        # Define the helper for our decoder
        if is_training or is_evaluating or gta:
            self.helper = TacoTrainingHelper(
                batch_size, mel_targets, stop_token_targets, hp, gta, is_evaluating, global_step)
        else:
            self.helper = TacoTestHelper(batch_size, hp)

        # initial decoder state
        decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        # Only use max iterations at synthesis time
        max_iters = hp.max_iters if not (is_training or is_evaluating) else None

        # Decode
        (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
            CustomDecoder(decoder_cell, self.helper, decoder_init_state),
            impute_finished=False,
            maximum_iterations=max_iters,
            swap_memory=hp.tacotron_swap_with_cpu)

        # Reshape outputs to be one output per entry
        # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
        decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
        stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])

        # Postnet
        postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions')

        # Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
        residual = postnet(decoder_output)

        # Project residual to same dimension as mel spectrogram
        # ==> [batch_size, decoder_steps * r, num_mels]
        residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection')
        projected_residual = residual_projection(residual)

        # Compute the mel spectrogram
        mel_outputs = decoder_output + projected_residual

        if post_condition:
            # Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
            # Post-processing Network to map mels to linear spectrograms using same
            # architecture as the encoder
            post_processing_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training, hparams=hp,
                                    scope='post_processing_convolutions'),
                EncoderRNN(is_training, size=hp.encoder_lstm_units,
                           zoneout=hp.tacotron_zoneout_rate, scope='post_processing_LSTM'))

            expand_outputs = post_processing_cell(mel_outputs)
            linear_outputs = FrameProjection(
                hp.num_freq, scope='post_processing_projection')(expand_outputs)

        # Grab alignments from the final decoder state
        alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0])

        if is_training:
            self.ratio = self.helper._ratio
        self.inputs = inputs
        self.input_lengths = input_lengths
        self.decoder_output = decoder_output
        self.alignments = alignments
        self.style_embeddings = style_embeddings
        self.stop_token_prediction = stop_token_prediction
        self.stop_token_targets = stop_token_targets
        self.mel_outputs = mel_outputs
        if post_condition:
            self.linear_outputs = linear_outputs
            self.linear_targets = linear_targets
        self.mel_targets = mel_targets
        self.targets_lengths = targets_lengths
        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log(' Train mode: {}'.format(is_training))
        log(' Eval mode: {}'.format(is_evaluating))
        log(' GTA mode: {}'.format(gta))
        log(' Synthesis mode: {}'.format(not (is_training or is_evaluating)))
        log(' embedding: {}'.format(embedded_inputs.shape))
        log(' enc conv out: {}'.format(enc_conv_output_shape))
        log(' encoder out: {}'.format(encoder_outputs.shape))
        log(' decoder out: {}'.format(decoder_output.shape))
        log(' residual out: {}'.format(residual.shape))
        log(' projected residual out: {}'.format(projected_residual.shape))
        log(' style embedding: %d' % style_embeddings.shape[-1])
        log(' mel out: {}'.format(mel_outputs.shape))
        if post_condition:
            log(' linear out: {}'.format(linear_outputs.shape))
        log(' <stop_token> out: {}'.format(stop_token_prediction.shape))
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None,
               linear_targets=None, targets_lengths=None, gta=False, global_step=None,
               is_training=False, is_evaluating=False):
    """
    Initializes the model for inference

    sets "mel_outputs" and "alignments" fields (plus "decoder_output",
    "stop_token_prediction", "dropout_rate", "zoneout_rate" and, when linear
    prediction is enabled, "linear_outputs").

    Args:
        - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
          steps in the input time series, and values are character IDs
        - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
          of each sequence in inputs.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
          of steps in the output time series, M is num_mels, and values are entries in the mel
          spectrogram. Only needed for training.
        - stop_token_targets: required with mel_targets unless gta is set.
        - linear_targets: required in training when hparams.predict_linear is set; rejected in GTA mode.
        - targets_lengths: required in training when hparams.mask_decoder is set.
        - gta: ground-truth-aligned mode flag; disables the linear post-processing network.
        - global_step: must not be None when a curriculum dropout/zoneout rate is enabled or
          when teacher forcing is 'scheduled' in training.
        - is_training / is_evaluating: mutually exclusive mode flags.

    Raises:
        ValueError: on inconsistent targets.
        RuntimeError: on missing targets_lengths for masking, or training and evaluation
            requested together.
    """
    hp = self._hparams

    if mel_targets is None and stop_token_targets is not None:
        raise ValueError('no mel targets were provided but token_targets were given')
    if mel_targets is not None and stop_token_targets is None and not gta:
        raise ValueError('Mel targets are provided without corresponding token_targets')
    if not gta and hp.predict_linear and linear_targets is None and is_training:
        raise ValueError('Model is set to use post processing to predict linear spectrograms in training but no linear targets given!')
    if gta and linear_targets is not None:
        raise ValueError('Linear spectrogram prediction is not supported in GTA mode!')
    if is_training and hp.mask_decoder and targets_lengths is None:
        raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!')
    if is_training and is_evaluating:
        raise RuntimeError('Model can not be in training and evaluation modes at the same time!')

    with tf.variable_scope('inference'):
        batch_size = tf.shape(inputs)[0]

        # Dropout / zoneout rates are either scheduled from global_step or
        # fixed tensors built from the hparams.
        if hp.tacotron_curriculum_dropout_rate:
            assert global_step is not None
            self.dropout_rate = self._curriculum_dropout(
                hp.tacotron_dropout_rate, hp.tacotron_curriculum_dropout_gamma, global_step)
        else:
            self.dropout_rate = tf.convert_to_tensor(hp.tacotron_dropout_rate)

        if hp.tacotron_curriculum_zoneout_rate:
            assert global_step is not None
            self.zoneout_rate = self._curriculum_dropout(
                hp.tacotron_zoneout_rate, hp.tacotron_curriculum_zoneout_gamma, global_step)
        else:
            self.zoneout_rate = tf.convert_to_tensor(hp.tacotron_zoneout_rate)

        assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
        if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
            assert global_step is not None

        # GTA is only used for predicting mels to train Wavenet vocoder, so we omit
        # post processing when doing GTA synthesis
        post_condition = hp.predict_linear and not gta

        # Embeddings ==> [batch_size, sequence_length, embedding_dim]
        embedding_table = tf.get_variable(
            'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32)
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

        # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
        encoder_cell = TacotronEncoderCell(
            EncoderConvolutions(is_training, hp.enc_conv_kernel_size, hp.enc_conv_channels,
                                hp.enc_conv_num_layers, self.dropout_rate,
                                scope='encoder_convolutions'),
            EncoderRNN(is_training, size=hp.encoder_lstm_units,
                       zoneout=self.zoneout_rate, scope='encoder_LSTM'))

        encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

        # For shape visualization purpose
        enc_conv_output_shape = encoder_cell.conv_output_shape

        # Decoder Parts
        # Attention Decoder Prenet
        prenet = Prenet(is_training, layers_sizes=hp.prenet_layers,
                        drop_rate=self.dropout_rate, scope='decoder_prenet')
        # Attention Mechanism
        attention_mechanism = LocationSensitiveAttention(
            hp.attention_dim, encoder_outputs, hparams=hp,
            mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths,
            smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights)
        # Decoder LSTM Cells
        decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
                                  size=hp.decoder_lstm_units,
                                  zoneout=self.zoneout_rate, scope='decoder_lstm')
        # Frames Projection layer
        frame_projection = FrameProjection(
            hp.num_mels * hp.outputs_per_step, scope='linear_transform')
        # <stop_token> projection layer
        stop_projection = StopProjection(
            is_training or is_evaluating, shape=hp.outputs_per_step,
            scope='stop_token_projection')

        # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
        decoder_cell = TacotronDecoderCell(
            prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection)

        # Define the helper for our decoder: teacher forcing needs both target
        # tensors, otherwise fall back to free-running inference.
        wants_targets = is_training or is_evaluating or gta
        has_targets = mel_targets is not None and stop_token_targets is not None
        if wants_targets and has_targets:
            self.helper = TacoTrainingHelper(
                batch_size, mel_targets, stop_token_targets, hp, gta, is_evaluating, global_step)
        else:
            if gta:
                log('Warning: gta set to True but mel_targets or stop_token_targets '
                    'not provided, falling back to natural inference')
            self.helper = TacoTestHelper(batch_size, hp)

        # initial decoder state
        decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        # Only use max iterations at synthesis time
        max_iters = hp.max_iters if not (is_training or is_evaluating) else None

        # Decode
        (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
            CustomDecoder(decoder_cell, self.helper, decoder_init_state),
            impute_finished=False,
            maximum_iterations=max_iters,
            swap_memory=hp.tacotron_swap_with_cpu)

        # Reshape outputs to be one output per entry
        # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
        decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
        stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])

        # Postnet
        postnet = Postnet(is_training, hp.postnet_kernel_size, hp.postnet_channels,
                          hp.postnet_num_layers, self.dropout_rate,
                          scope='postnet_convolutions')

        # Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
        residual = postnet(decoder_output)

        # Project residual to same dimension as mel spectrogram
        # ==> [batch_size, decoder_steps * r, num_mels]
        residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection')
        projected_residual = residual_projection(residual)

        # Compute the mel spectrogram
        mel_outputs = tf.add(decoder_output, projected_residual, name='mel_outputs')

        if post_condition:
            # Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
            # Post-processing Network to map mels to linear spectrograms using same
            # architecture as the encoder
            post_processing_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training, hp.enc_conv_kernel_size, hp.enc_conv_channels,
                                    hp.enc_conv_num_layers, self.dropout_rate,
                                    scope='post_processing_convolutions'),
                EncoderRNN(is_training, size=hp.encoder_lstm_units,
                           zoneout=self.zoneout_rate, scope='post_processing_LSTM'))

            expand_outputs = post_processing_cell(mel_outputs)
            linear_outputs = FrameProjection(
                hp.num_freq, scope='post_processing_projection')(expand_outputs)

        # Grab alignments from the final decoder state
        alignments = tf.transpose(
            final_decoder_state.alignment_history.stack(), [1, 2, 0], name='alignments')

        self.optimize = None
        self.loss = None
        if is_training:
            self.ratio = self.helper._ratio
        self.inputs = inputs
        self.input_lengths = input_lengths
        self.decoder_output = decoder_output
        self.alignments = alignments
        self.stop_token_prediction = stop_token_prediction
        self.stop_token_targets = stop_token_targets
        self.mel_outputs = mel_outputs
        if post_condition:
            self.linear_outputs = linear_outputs
            self.linear_targets = linear_targets
        self.mel_targets = mel_targets
        self.targets_lengths = targets_lengths
        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log(' Train mode: {}'.format(is_training))
        log(' Eval mode: {}'.format(is_evaluating))
        log(' GTA mode: {}'.format(gta))
        log(' Synthesis mode: {}'.format(not (is_training or is_evaluating)))
        log(' embedding: {}'.format(embedded_inputs.shape))
        log(' enc conv out: {}'.format(enc_conv_output_shape))
        log(' encoder out: {}'.format(encoder_outputs.shape))
        log(' decoder out: {}'.format(decoder_output.shape))
        log(' residual out: {}'.format(residual.shape))
        log(' projected residual out: {}'.format(projected_residual.shape))
        log(' mel out: {}'.format(mel_outputs.shape))
        if post_condition:
            log(' linear out: {}'.format(linear_outputs.shape))
        log(' <stop_token> out: {}'.format(stop_token_prediction.shape))
def initialize(self, inputs, input_lengths, mel_targets=None, mel_lengths=None,
               stop_token_targets=None, linear_targets=None, gta=False, reference_mel=None):
    """
    Initializes the model for inference

    sets "mel_outputs" and "alignments" fields (plus "decoder_output",
    "stop_token_prediction", "reference_mel", and "mu"/"log_var" when the VAE is enabled).

    Training mode is inferred: the model trains when mel_targets is given and gta is off.

    Args:
        - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
          steps in the input time series, and values are character IDs
        - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
          of each sequence in inputs.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
          of steps in the output time series, M is num_mels, and values are entries in the mel
          spectrogram. Only needed for training.
        - mel_lengths: passed to the VAE as input_lengths when hparams.use_vae is set.
        - stop_token_targets: required with mel_targets unless gta is set.
        - linear_targets: required in training when hparams.predict_linear is set; rejected in GTA mode.
        - gta: ground-truth-aligned mode flag; disables the linear post-processing network.
        - reference_mel: mel fed to the VAE; overridden by mel_targets in training.

    Raises:
        ValueError: on inconsistent targets.
    """
    hp = self._hparams
    # Same definition the graph below relies on; computed first so the
    # training-only validation does not reject plain synthesis calls.
    is_training = mel_targets is not None and not gta

    if mel_targets is None and stop_token_targets is not None:
        raise ValueError('no mel targets were provided but token_targets were given')
    if mel_targets is not None and stop_token_targets is None and not gta:
        raise ValueError('Mel targets are provided without corresponding token_targets')
    if is_training and hp.predict_linear and linear_targets is None:
        raise ValueError(
            'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
        )
    if gta and linear_targets is not None:
        raise ValueError('Linear spectrogram prediction is not supported in GTA mode!')

    with tf.variable_scope('inference'):
        batch_size = tf.shape(inputs)[0]

        # GTA is only used for predicting mels to train Wavenet vocoder, so we omit
        # post processing when doing GTA synthesis
        post_condition = hp.predict_linear and not gta

        # Embeddings ==> [batch_size, sequence_length, embedding_dim]
        embedding_table = tf.get_variable(
            'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32)
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

        # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
        encoder_cell = TacotronEncoderCell(
            EncoderConvolutions(is_training, kernel_size=hp.enc_conv_kernel_size,
                                channels=hp.enc_conv_channels, scope='encoder_convolutions'),
            EncoderRNN(is_training, size=hp.encoder_lstm_units,
                       zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM'))

        encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

        if hp.use_vae:
            # In training the style reference is always the target mel itself.
            if is_training:
                reference_mel = mel_targets

            style_embeddings, mu, log_var = VAE(
                inputs=reference_mel,
                input_lengths=mel_lengths,
                filters=hp.filters,
                kernel_size=(3, 3),
                strides=(2, 2),
                num_units=hp.vae_dim,
                is_training=is_training,
                scope='vae')

            self.mu = mu
            self.log_var = log_var
            # Project to encoder depth, then repeat along the input time axis so the
            # style vector can be summed with every encoder state.
            style_embeddings = tf.layers.dense(style_embeddings, hp.encoder_depth)
            style_embeddings = tf.expand_dims(style_embeddings, axis=1)
            style_embeddings = tf.tile(
                style_embeddings, [1, shape_list(encoder_outputs)[1], 1])  # [N, T_in, 256]
            encoder_outputs = encoder_outputs + style_embeddings

        # For shape visualization purpose
        enc_conv_output_shape = encoder_cell.conv_output_shape

        # Decoder Parts
        # Attention Decoder Prenet
        prenet = Prenet(is_training, layer_sizes=hp.prenet_layers, scope='decoder_prenet')
        # Attention Mechanism
        attention_mechanism = LocationSensitiveAttention(
            hp.attention_dim, encoder_outputs,
            mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths,
            smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights)
        # Decoder LSTM Cells
        decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
                                  size=hp.decoder_lstm_units,
                                  zoneout=hp.tacotron_zoneout_rate, scope='decoder_lstm')
        # Frames Projection layer
        frame_projection = FrameProjection(
            hp.num_mels * hp.outputs_per_step, scope='linear_transform')
        # <stop_token> projection layer
        stop_projection = StopProjection(is_training, scope='stop_token_projection')

        # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
        decoder_cell = TacotronDecoderCell(
            prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection,
            mask_finished=hp.mask_finished)

        # Define the helper for our decoder
        if is_training or gta:
            self.helper = TacoTrainingHelper(
                batch_size, mel_targets, stop_token_targets, hp.num_mels,
                hp.outputs_per_step, hp.tacotron_teacher_forcing_ratio, gta)
        else:
            self.helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

        # initial decoder state
        decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        # Only use max iterations at synthesis time
        max_iters = hp.max_iters if not is_training else None

        # Decode
        (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
            CustomDecoder(decoder_cell, self.helper, decoder_init_state),
            impute_finished=hp.impute_finished,
            maximum_iterations=max_iters)

        # Reshape outputs to be one output per entry
        # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
        decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
        stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])

        # Postnet
        postnet = Postnet(is_training, kernel_size=hp.postnet_kernel_size,
                          channels=hp.postnet_channels, scope='postnet_convolutions')

        # Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
        residual = postnet(decoder_output)

        # Project residual to same dimension as mel spectrogram
        # ==> [batch_size, decoder_steps * r, num_mels]
        residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection')
        projected_residual = residual_projection(residual)

        # Compute the mel spectrogram
        mel_outputs = decoder_output + projected_residual

        if post_condition:
            # Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
            # Post-processing Network to map mels to linear spectrograms using same
            # architecture as the encoder
            post_processing_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training, kernel_size=hp.enc_conv_kernel_size,
                                    channels=hp.enc_conv_channels,
                                    scope='post_processing_convolutions'),
                EncoderRNN(is_training, size=hp.encoder_lstm_units,
                           zoneout=hp.tacotron_zoneout_rate, scope='post_processing_LSTM'))

            expand_outputs = post_processing_cell(mel_outputs)
            linear_outputs = FrameProjection(
                hp.num_freq, scope='post_processing_projection')(expand_outputs)

        # Grab alignments from the final decoder state
        alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.decoder_output = decoder_output
        self.alignments = alignments
        self.stop_token_prediction = stop_token_prediction
        self.stop_token_targets = stop_token_targets
        self.mel_outputs = mel_outputs
        self.reference_mel = reference_mel
        if post_condition:
            self.linear_outputs = linear_outputs
            self.linear_targets = linear_targets
        self.mel_targets = mel_targets
        self.mel_lengths = mel_lengths
        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log(' embedding: {}'.format(embedded_inputs.shape))
        log(' enc conv out: {}'.format(enc_conv_output_shape))
        log(' encoder out: {}'.format(encoder_outputs.shape))
        log(' decoder out: {}'.format(decoder_output.shape))
        log(' residual out: {}'.format(residual.shape))
        log(' projected residual out: {}'.format(projected_residual.shape))
        log(' mel out: {}'.format(mel_outputs.shape))
        if post_condition:
            log(' linear out: {}'.format(linear_outputs.shape))
        log(' <stop_token> out: {}'.format(stop_token_prediction.shape))
def initialize(self,
               inputs=None,
               input_lengths=None,
               mel_targets=None,
               stop_token_targets=None,
               linear_targets=None,
               targets_lengths=None,
               mel_references=None,
               references_lengths=None,
               vae_codes=None,
               gta=False,
               global_step=None,
               use_vae=False,
               is_training=False,
               is_evaluating=False,
               split_infos=None):
    """Build the multi-tower Tacotron graph and store its tensors on ``self``.

    The method validates the argument combination, splits the batch into
    ``hp.tacotron_num_gpus`` towers and, for every tower, builds inside the
    ``'inference'`` variable scope either

    * only the VAE (when ``use_vae`` is set and ``inputs`` is None), or
    * the embedding / encoder / attention decoder / postnet stack, plus the
      optional CBHG linear-spectrogram branch (when ``inputs`` is given).

    Per-tower results are appended to the ``self.tower_*`` lists
    (``tower_decoder_output``, ``tower_alignments``,
    ``tower_stop_token_prediction``, ``tower_mel_outputs``,
    ``tower_linear_outputs``, ``tower_vae_codes``, ``tower_mu``,
    ``tower_log_var``). The per-tower inputs/targets, ``self.all_vars`` and,
    in training, ``self.ratio`` are stored as well.

    Args:
        inputs: int32 Tensor with shape [N, T_in] where N is batch size,
            T_in is number of steps in the input time series, and values are
            character IDs. May be None to build the VAE part only.
        input_lengths: int32 Tensor with shape [N] where N is batch size and
            values are the lengths of each sequence in inputs.
        mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch
            size, T_out is number of steps in the output time series, M is
            num_mels, and values are entries in the mel spectrogram. Only
            needed for training.
        stop_token_targets: stop-token targets; must be given together with
            ``mel_targets`` unless ``gta`` is set.
        linear_targets: linear-spectrogram targets; required when training
            with ``hp.predict_linear`` outside GTA mode, rejected in GTA mode.
        targets_lengths: target lengths; required when training with
            ``hp.mask_decoder``. Also used as the fallback for
            ``references_lengths``.
        mel_references: mels fed to the VAE. Falls back to ``mel_targets``
            when ``use_vae`` is set and no ``vae_codes`` are given.
        references_lengths: lengths passed to the VAE as ``input_lengths``.
        vae_codes: precomputed VAE codes; when given they replace the code
            computed from ``mel_references``.
        gta: GTA mode flag; selects the training helper and is forwarded
            to it.
        global_step: forwarded to the training helper; asserted to be set
            when teacher forcing mode is ``'scheduled'`` during training.
        use_vae: enable the VAE conditioning path.
        is_training: build the graph in training mode.
        is_evaluating: build the graph in evaluation mode (mutually
            exclusive with ``is_training``).
        split_infos: indexed as ``split_infos[:, k]`` (k = 0..3) and handed
            to ``split_func`` together with the inputs, mel
            targets/references, stop-token targets and linear targets
            respectively. Presumably per-tower split sizes -- TODO confirm
            against ``split_func``.

    Raises:
        ValueError: on inconsistent target/reference arguments (see the
            checks at the top of the body).
        RuntimeError: when training with ``hp.mask_decoder`` but without
            ``targets_lengths``, or when both ``is_training`` and
            ``is_evaluating`` are set.
        AssertionError: for an unknown ``hp.tacotron_teacher_forcing_mode``
            or a missing ``global_step`` with scheduled teacher forcing.
    """
    self.use_vae = use_vae

    # --- Argument validation -------------------------------------------
    if mel_targets is None and stop_token_targets is not None:
        raise ValueError(
            'no multi targets were provided but token_targets were given')
    if mel_targets is not None and stop_token_targets is None and not gta:
        raise ValueError(
            'Mel targets are provided without corresponding token_targets')
    if not gta and self._hparams.predict_linear == True and linear_targets is None and is_training:
        raise ValueError(
            'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
        )
    if gta and linear_targets is not None:
        raise ValueError(
            'Linear spectrogram prediction is not supported in GTA mode!')
    if is_training and self._hparams.mask_decoder and targets_lengths is None:
        raise RuntimeError(
            'Model set to mask paddings but no targets lengths provided for the mask!'
        )
    if is_training and is_evaluating:
        raise RuntimeError(
            'Model can not be in training and evaluation modes at the same time!'
        )
    # Without explicit references or codes, the VAE falls back to the targets.
    if use_vae and mel_references is None and vae_codes is None:
        if mel_targets is None:
            raise ValueError(
                'Mel targets must be provided if neither mel references nor references codes are given!'
            )
        else:
            mel_references = mel_targets
    if use_vae and references_lengths is None and vae_codes is None:
        if targets_lengths is None:
            raise ValueError(
                'Targets lengths must be provided if neither references lengths nor references codes are given!'
            )
        else:
            references_lengths = targets_lengths

    # --- Split the batch into one slice per tower ------------------------
    # Splitting is placed on the CPU when several GPUs are used or when
    # hp.split_on_cpu is set; otherwise it runs on the first GPU.
    split_device = '/cpu:0' if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:0'
    with tf.device(split_device):
        hp = self._hparams
        # Output dtype lists for tf.py_func: one entry per tower.
        lout_int = [tf.int32] * hp.tacotron_num_gpus
        lout_float = [tf.float32] * hp.tacotron_num_gpus

        # Length vectors and VAE codes are split evenly along the batch axis;
        # arguments that were not provided stay None.
        tower_input_lengths = tf.split(
            input_lengths, num_or_size_splits=hp.tacotron_num_gpus,
            axis=0) if input_lengths is not None else input_lengths
        tower_targets_lengths = tf.split(
            targets_lengths, num_or_size_splits=hp.tacotron_num_gpus,
            axis=0) if targets_lengths is not None else targets_lengths
        tower_references_lengths = tf.split(
            references_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0
        ) if references_lengths is not None else references_lengths
        # Local list of supplied codes; distinct from self.tower_vae_codes,
        # which collects the code actually used by each tower.
        tower_vae_codes = tf.split(
            vae_codes, num_or_size_splits=hp.tacotron_num_gpus,
            axis=0) if vae_codes is not None else vae_codes

        # Padded sequence tensors are split by split_func through tf.py_func.
        # NOTE(review): split_infos defaults to None but is subscripted here
        # whenever the matching tensor is given -- callers must supply it.
        p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]],
                              lout_int) if inputs is not None else inputs
        p_mel_targets = tf.py_func(
            split_func, [mel_targets, split_infos[:, 1]],
            lout_float) if mel_targets is not None else mel_targets
        # Mel references reuse column 1 of split_infos, like the mel targets.
        p_mel_references = tf.py_func(
            split_func, [mel_references, split_infos[:, 1]],
            lout_float) if mel_references is not None else mel_references
        p_stop_token_targets = tf.py_func(
            split_func, [stop_token_targets, split_infos[:, 2]], lout_float
        ) if stop_token_targets is not None else stop_token_targets
        p_linear_targets = tf.py_func(
            split_func, [linear_targets, split_infos[:, 3]],
            lout_float) if linear_targets is not None else linear_targets

        tower_inputs = []
        tower_mel_targets = []
        tower_mel_references = []
        tower_stop_token_targets = []
        tower_linear_targets = []

        # NOTE(review): batch_size is taken from the full, unsplit batch and
        # is used as the leading dimension of every per-tower reshape below
        # (and later for the decoder helpers) -- confirm this is intended
        # when tacotron_num_gpus > 1.
        batch_size = tf.shape(
            inputs)[0] if inputs is not None else tf.shape(
                mel_references)[0]
        mel_channels = hp.num_mels
        linear_channels = hp.num_freq
        # Reshape each py_func output to [batch_size, -1(, channels)];
        # presumably needed because py_func outputs have no static shape.
        for i in range(hp.tacotron_num_gpus):
            if p_inputs is not None:
                tower_inputs.append(
                    tf.reshape(p_inputs[i], [batch_size, -1]))
            if p_mel_targets is not None:
                tower_mel_targets.append(
                    tf.reshape(p_mel_targets[i],
                               [batch_size, -1, mel_channels]))
            if p_mel_references is not None:
                tower_mel_references.append(
                    tf.reshape(p_mel_references[i],
                               [batch_size, -1, mel_channels]))
            if p_stop_token_targets is not None:
                tower_stop_token_targets.append(
                    tf.reshape(p_stop_token_targets[i], [batch_size, -1]))
            if p_linear_targets is not None:
                tower_linear_targets.append(
                    tf.reshape(p_linear_targets[i],
                               [batch_size, -1, linear_channels]))

    # (low, high) bounds used when hp.clip_outputs is enabled.
    T2_output_range = (-hp.max_abs_value,
                       hp.max_abs_value) if hp.symmetric_mels else (
                           0, hp.max_abs_value)

    # Per-tower results exposed on the model.
    self.tower_decoder_output = []
    self.tower_alignments = []
    self.tower_stop_token_prediction = []
    self.tower_mel_outputs = []
    self.tower_linear_outputs = []
    self.tower_vae_codes = []
    self.tower_mu = []
    self.tower_log_var = []

    # Per-tower intermediates kept only for the shape log at the end.
    tower_embedded_inputs = []
    tower_enc_conv_output_shape = []
    tower_encoder_outputs = []
    tower_residual = []
    tower_projected_residual = []

    # 1. Declare GPU devices
    gpus = ["/gpu:{}".format(i) for i in range(hp.tacotron_num_gpus)]
    for i in range(hp.tacotron_num_gpus):
        # Device function from replica_device_setter with "/cpu:0" as the
        # ps device and this tower's GPU as the worker device.
        with tf.device(
                tf.train.replica_device_setter(ps_tasks=1,
                                               ps_device="/cpu:0",
                                               worker_device=gpus[i])):
            with tf.variable_scope('inference') as scope:
                # VAE-only graph: no text inputs were given, so only the VAE
                # is built for this tower.
                if use_vae and inputs is None:
                    vae_code = None
                    if mel_references is not None:
                        vae_code, mu, log_var = VAE(
                            inputs=tower_mel_references[i],
                            input_lengths=tower_references_lengths[i],
                            filters=hp.vae_filters,
                            kernel_size=hp.vae_kernel,
                            stride=hp.vae_stride,
                            num_units=hp.vae_dim,
                            rnn_units=hp.vae_rnn_units,
                            bnorm=hp.batch_norm_position,
                            log_var_minimum=hp.vae_log_var_minimum,
                            is_training=is_training,
                            scope='vae')
                    # Use vae_codes first if provided
                    if vae_codes is not None:
                        vae_code = tower_vae_codes[i]
                    self.tower_vae_codes.append(vae_code)
                    if mel_references is not None:
                        self.tower_mu.append(mu)
                        self.tower_log_var.append(log_var)

                # Full synthesis graph for this tower.
                if inputs is not None:
                    assert hp.tacotron_teacher_forcing_mode in (
                        'constant', 'scheduled')
                    if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                        assert global_step is not None

                    # When predicting mels to train the Wavenet vocoder,
                    # post processing should be omitted
                    post_condition = hp.predict_linear

                    # Embeddings ==> [batch_size, sequence_length, embedding_dim]
                    self.embedding_table = tf.get_variable(
                        'inputs_embedding',
                        [len(symbols), hp.embedding_dim],
                        dtype=tf.float32)
                    embedded_inputs = tf.nn.embedding_lookup(
                        self.embedding_table, tower_inputs[i])

                    # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                    encoder_cell = TacotronEncoderCell(
                        EncoderConvolutions(is_training,
                                            hparams=hp,
                                            scope='encoder_convolutions'),
                        EncoderRNN(is_training,
                                   size=hp.encoder_lstm_units,
                                   zoneout=hp.tacotron_zoneout_rate,
                                   scope='encoder_LSTM'))

                    encoder_outputs = encoder_cell(embedded_inputs,
                                                   tower_input_lengths[i])

                    # For shape visualization purpose
                    enc_conv_output_shape = encoder_cell.conv_output_shape

                    # vae_code stays None when the VAE path is disabled; it
                    # is passed to the decoder cell either way.
                    vae_code = None
                    if use_vae:
                        if mel_references is not None:
                            vae_code, mu, log_var = VAE(
                                inputs=tower_mel_references[i],
                                input_lengths=tower_references_lengths[i],
                                filters=hp.vae_filters,
                                kernel_size=hp.vae_kernel,
                                stride=hp.vae_stride,
                                num_units=hp.vae_dim,
                                rnn_units=hp.vae_rnn_units,
                                bnorm=hp.batch_norm_position,
                                log_var_minimum=hp.vae_log_var_minimum,
                                is_training=is_training,
                                scope='vae')
                        # Use vae_codes first if provided
                        if vae_codes is not None:
                            vae_code = tower_vae_codes[i]
                        # Condition the encoder outputs on the code: the code
                        # goes through a tanh dense layer of width
                        # hp.encoder_lstm_units * 2, gets a new axis 1, is
                        # tiled to the encoder_outputs length on axis 1, and
                        # is then added or multiplied in. Any other revision
                        # type leaves encoder_outputs untouched.
                        if hp.encoder_outputs_revision_type == 'add':
                            encoder_outputs = encoder_outputs + tf.tile(
                                tf.expand_dims(tf.layers.dense(
                                    vae_code,
                                    hp.encoder_lstm_units * 2,
                                    name='vae_code_projection',
                                    activation='tanh'),
                                               axis=1),
                                [1, tf.shape(encoder_outputs)[1], 1])
                        elif hp.encoder_outputs_revision_type == 'multiply':
                            encoder_outputs = tf.math.multiply(
                                encoder_outputs,
                                tf.tile(
                                    tf.expand_dims(tf.layers.dense(
                                        vae_code,
                                        hp.encoder_lstm_units * 2,
                                        name='vae_code_projection',
                                        activation='tanh'),
                                                   axis=1),
                                    [1, tf.shape(encoder_outputs)[1], 1]))

                    # Attention Decoder Prenet
                    prenet = Prenet(is_training,
                                    layers_sizes=hp.prenet_layers,
                                    drop_rate=hp.tacotron_dropout_rate,
                                    scope='decoder_prenet')
                    # Attention Mechanism (input lengths are flattened to
                    # rank 1 before being used as memory_sequence_length)
                    attention_mechanism = LocationSensitiveAttention(
                        hp.attention_dim,
                        encoder_outputs,
                        hparams=hp,
                        is_training=is_training,
                        mask_encoder=hp.mask_encoder,
                        memory_sequence_length=tf.reshape(
                            tower_input_lengths[i], [-1]),
                        smoothing=hp.smoothing,
                        cumulate_weights=hp.cumulative_weights)
                    # Decoder LSTM Cells
                    decoder_lstm = DecoderRNN(
                        is_training,
                        layers=hp.decoder_layers,
                        size=hp.decoder_lstm_units,
                        zoneout=hp.tacotron_zoneout_rate,
                        scope='decoder_LSTM')
                    # Frames Projection layer
                    frame_projection = FrameProjection(
                        hp.num_mels * hp.outputs_per_step,
                        scope='linear_transform_projection')
                    # <stop_token> projection layer
                    stop_projection = StopProjection(
                        is_training or is_evaluating,
                        shape=hp.outputs_per_step,
                        scope='stop_token_projection')

                    # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                    decoder_cell = TacotronDecoderCell(
                        prenet, attention_mechanism, decoder_lstm,
                        frame_projection, stop_projection, vae_code, hp)

                    # Define the helper for our decoder.
                    # self.helper is overwritten on every tower, so after the
                    # loop it refers to the helper of the last tower.
                    if is_training or is_evaluating or gta:
                        self.helper = TacoTrainingHelper(
                            batch_size, tower_mel_targets[i], hp, gta,
                            is_evaluating, global_step)
                    else:
                        self.helper = TacoTestHelper(batch_size, hp)

                    # Initial decoder state
                    decoder_init_state = decoder_cell.zero_state(
                        batch_size=batch_size, dtype=tf.float32)

                    # Only use max iterations at synthesis time
                    max_iters = hp.max_iters if not (
                        is_training or is_evaluating) else None

                    # Decode
                    (frames_prediction, stop_token_prediction,
                     _), final_decoder_state, _ = dynamic_decode(
                         CustomDecoder(decoder_cell, self.helper,
                                       decoder_init_state),
                         impute_finished=False,
                         maximum_iterations=max_iters,
                         swap_memory=hp.tacotron_swap_with_cpu)

                    # Reshape outputs to be one output per entry
                    # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                    decoder_output = tf.reshape(
                        frames_prediction, [batch_size, -1, hp.num_mels])
                    stop_token_prediction = tf.reshape(
                        stop_token_prediction, [batch_size, -1])

                    # Optional clamp to
                    # [T2_output_range[0] - hp.lower_bound_decay, T2_output_range[1]].
                    if hp.clip_outputs:
                        decoder_output = tf.minimum(
                            tf.maximum(
                                decoder_output,
                                T2_output_range[0] - hp.lower_bound_decay),
                            T2_output_range[1])

                    # Postnet
                    postnet = Postnet(is_training,
                                      hparams=hp,
                                      scope='postnet_convolutions')

                    # Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
                    residual = postnet(decoder_output)

                    # Project residual to same dimension as mel spectrogram
                    # ==> [batch_size, decoder_steps * r, num_mels]
                    residual_projection = FrameProjection(
                        hp.num_mels, scope='postnet_projection')
                    projected_residual = residual_projection(residual)

                    # Compute the mel spectrogram
                    mel_outputs = decoder_output + projected_residual

                    if hp.clip_outputs:
                        mel_outputs = tf.minimum(
                            tf.maximum(
                                mel_outputs,
                                T2_output_range[0] - hp.lower_bound_decay),
                            T2_output_range[1])

                    if post_condition:
                        # Add post-processing CBHG. This does a great job at
                        # extracting features from mels before projection to
                        # linear specs.
                        post_cbhg = CBHG(hp.cbhg_kernels,
                                         hp.cbhg_conv_channels,
                                         hp.cbhg_pool_size,
                                         [hp.cbhg_projection, hp.num_mels],
                                         hp.cbhg_projection_kernel_size,
                                         hp.cbhg_highwaynet_layers,
                                         hp.cbhg_highway_units,
                                         hp.cbhg_rnn_units,
                                         hp.batch_norm_position,
                                         is_training,
                                         name='CBHG_postnet')

                        # [batch_size, decoder_steps(mel_frames), cbhg_channels]
                        post_outputs = post_cbhg(mel_outputs, None)

                        # Linear projection of extracted features to make linear spectrogram
                        linear_specs_projection = FrameProjection(
                            hp.num_freq,
                            scope='cbhg_linear_specs_projection')

                        # [batch_size, decoder_steps(linear_frames), num_freq]
                        linear_outputs = linear_specs_projection(
                            post_outputs)

                        if hp.clip_outputs:
                            linear_outputs = tf.minimum(
                                tf.maximum(
                                    linear_outputs, T2_output_range[0] -
                                    hp.lower_bound_decay),
                                T2_output_range[1])

                    # Grab alignments from the final decoder state; the
                    # stacked history is permuted with [1, 2, 0]
                    # (presumably time-major -> batch-major, TODO confirm).
                    alignments = tf.transpose(
                        final_decoder_state.alignment_history.stack(),
                        [1, 2, 0])

                    self.tower_decoder_output.append(decoder_output)
                    self.tower_alignments.append(alignments)
                    self.tower_stop_token_prediction.append(
                        stop_token_prediction)
                    self.tower_mel_outputs.append(mel_outputs)
                    tower_embedded_inputs.append(embedded_inputs)
                    tower_enc_conv_output_shape.append(
                        enc_conv_output_shape)
                    tower_encoder_outputs.append(encoder_outputs)
                    tower_residual.append(residual)
                    tower_projected_residual.append(projected_residual)

                    if post_condition:
                        self.tower_linear_outputs.append(linear_outputs)

                    if use_vae:
                        self.tower_vae_codes.append(vae_code)
                        if mel_references is not None:
                            self.tower_mu.append(mu)
                            self.tower_log_var.append(log_var)
        log('initialisation done {}'.format(gpus[i]))

    # NOTE(review): self.helper is only assigned when inputs is not None, so
    # this read relies on text inputs being present in training mode.
    if is_training:
        self.ratio = self.helper._ratio
    self.tower_inputs = tower_inputs
    self.tower_input_lengths = tower_input_lengths
    self.tower_mel_targets = tower_mel_targets
    self.tower_linear_targets = tower_linear_targets
    self.tower_targets_lengths = tower_targets_lengths
    self.tower_stop_token_targets = tower_stop_token_targets

    self.all_vars = tf.trainable_variables()

    # --- Summary log ------------------------------------------------------
    log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
    log(' Train mode: {}'.format(is_training))
    log(' Eval mode: {}'.format(is_evaluating))
    log(' GTA mode: {}'.format(gta))
    log(' Synthesis mode: {}'.format(not (
        is_training or is_evaluating)))
    # The per-device shapes (and post_condition) only exist when the full
    # graph was built, i.e. when text inputs were given.
    if inputs is not None:
        log(' Input: {}'.format(inputs.shape))
        for i in range(hp.tacotron_num_gpus):
            log(' device: {}'.format(i))
            log(' embedding: {}'.format(
                tower_embedded_inputs[i].shape))
            log(' enc conv out: {}'.format(
                tower_enc_conv_output_shape[i]))
            log(' encoder out: {}'.format(
                tower_encoder_outputs[i].shape))
            if use_vae:
                log(' vae code: {}'.format(
                    self.tower_vae_codes[i].shape))
            log(' decoder out: {}'.format(
                self.tower_decoder_output[i].shape))
            log(' residual out: {}'.format(
                tower_residual[i].shape))
            log(' projected residual out: {}'.format(
                tower_projected_residual[i].shape))
            log(' mel out: {}'.format(
                self.tower_mel_outputs[i].shape))
            if post_condition:
                log(' linear out: {}'.format(
                    self.tower_linear_outputs[i].shape))
            log(' <stop_token> out: {}'.format(
                self.tower_stop_token_prediction[i].shape))

            # 1_000_000 is causing syntax problems for some people?! Python please :)
            log(' Tacotron Parameters {:.3f} Million.'.format(
                np.sum(
                    [np.prod(v.get_shape().as_list())
                     for v in self.all_vars]) / 1000000))
def initialize(self,
               inputs,
               input_lengths,
               feature_targets=None,
               stop_token_targets=None,
               targets_lengths=None,
               gta=False,
               global_step=None,
               is_training=False,
               is_evaluating=False):
    """Build the single-device Tacotron graph that predicts vocoder features.

    Builds, inside the ``'inference'`` variable scope, the embedding,
    encoder, attention decoder and postnet, then slices the final output
    along its last axis into the mgc / lf0 / vuv / bap streams. The
    resulting tensors are stored on ``self`` (``decoder_outputs``,
    ``final_outputs``, ``mgc_outputs``, ``lf0_outputs``, ``vuv_outputs``,
    ``bap_outputs``, ``stop_token_outputs``, ``alignments``) together with
    the inputs and targets that were passed in.

    Args:
        inputs: int32 Tensor with shape [N, T_in] where N is batch size,
            T_in is number of steps in the input time series, and values are
            character IDs.
        input_lengths: int32 Tensor with shape [N] where N is batch size and
            values are the lengths of each sequence in inputs.
        feature_targets: float32 Tensor with shape [N, T_out, M] where N is
            batch size, T_out is number of steps in the output time series,
            M is num_mgc + num_lf0 + num_vuv + num_bap, and values are
            entries in the spectrogram. Only needed for training.
        stop_token_targets: stop-token targets; must be given together with
            ``feature_targets`` unless ``gta`` is set.
        targets_lengths: target lengths; required when training with
            ``hp.mask_decoder``. Only stored on ``self`` by this method.
        gta: GTA mode flag; selects the training helper and is forwarded
            to it.
        global_step: forwarded to the training helper; asserted to be set
            when teacher forcing mode is ``'scheduled'`` during training.
        is_training: build the graph in training mode.
        is_evaluating: build the graph in evaluation mode (mutually
            exclusive with ``is_training``).

    Raises:
        ValueError: if stop-token targets are given without feature targets,
            or feature targets without stop-token targets outside GTA mode.
        RuntimeError: when training with ``hp.mask_decoder`` but without
            ``targets_lengths``, or when both ``is_training`` and
            ``is_evaluating`` are set.
        AssertionError: for an unknown ``hp.tacotron_teacher_forcing_mode``
            or a missing ``global_step`` with scheduled teacher forcing.
    """
    # --- Argument validation -------------------------------------------
    if feature_targets is None and stop_token_targets is not None:
        raise ValueError('no feature targets were provided but token_targets were given')
    if feature_targets is not None and stop_token_targets is None and not gta:
        raise ValueError('Mel targets are provided without corresponding token_targets')
    if is_training and self._hparams.mask_decoder and targets_lengths is None:
        raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!')
    if is_training and is_evaluating:
        raise RuntimeError('Model can not be in training and evaluation modes at the same time!')

    with tf.variable_scope('inference') as scope:
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams
        # Width of one output frame: all four feature streams concatenated.
        target_depth = hp.num_mgc + hp.num_lf0 + hp.num_vuv + hp.num_bap
        assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
        if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
            assert global_step is not None

        # Embeddings ==> [batch_size, sequence_length, embedding_dim]
        embedding_table = tf.get_variable(
            'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32)
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

        # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
        encoder_cell = TacotronEncoderCell(
            EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'),
            EncoderRNN(is_training, size=hp.encoder_lstm_units,
                       zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM'))

        encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

        # For shape visualization purpose
        enc_conv_output_shape = encoder_cell.conv_output_shape

        # Decoder Parts
        # Attention Decoder Prenet
        prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet')
        # Attention Mechanism
        attention_mechanism = LocationSensitiveAttention(hp.attention_dim, encoder_outputs, hparams=hp,
                                                         is_training=is_training, mask_encoder=hp.mask_encoder,
                                                         memory_sequence_length=input_lengths, smoothing=hp.smoothing,
                                                         cumulate_weights=hp.cumulative_weights)
        # Decoder LSTM Cells
        decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
                                  size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_lstm')
        # Frames Projection layer: outputs_per_step frames of target_depth each
        frame_projection = FrameProjection(target_depth * hp.outputs_per_step, scope='mgc_transform')
        # <stop_token> projection layer
        stop_projection = StopProjection(is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection')

        # Decoder Cell ==> [batch_size, decoder_steps, target_depth * r] (after decoding)
        decoder_cell = TacotronDecoderCell(prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection)

        # Define the helper for our decoder: the training helper is used for
        # training, evaluation and GTA; the test helper otherwise.
        if is_training or is_evaluating or gta:
            self.helper = TacoTrainingHelper(batch_size, feature_targets, target_depth, hp, gta, is_evaluating, global_step)
        else:
            self.helper = TacoTestHelper(batch_size, target_depth, hp)

        # Initial decoder state
        decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        # Only use max iterations at synthesis time
        max_iters = hp.max_iters if not (is_training or is_evaluating) else None

        # Decode
        (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
            CustomDecoder(decoder_cell, self.helper, decoder_init_state),
            impute_finished=False,
            maximum_iterations=max_iters,
            swap_memory=hp.tacotron_swap_with_cpu)

        # Reshape outputs to be one output per entry
        # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), target_depth]
        decoder_outputs = tf.reshape(frames_prediction, [batch_size, -1, target_depth])
        stop_token_outputs = tf.reshape(stop_token_prediction, [batch_size, -1])

        # Postnet
        postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions')

        # Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
        residual = postnet(decoder_outputs)

        # Project residual to same dimension as target depth
        # ==> [batch_size, decoder_steps * r, target_depth]
        residual_projection = FrameProjection(target_depth, scope='postnet_projection')
        projected_residual = residual_projection(residual)

        # Compute the final outputs
        final_outputs = decoder_outputs + projected_residual

        # Compute each feature output: the streams are laid out along the
        # last axis in the order mgc, lf0, vuv, bap.
        mgc_idx = 0
        lf0_idx = mgc_idx + hp.num_mgc
        vuv_idx = lf0_idx + hp.num_lf0
        bap_idx = vuv_idx + hp.num_vuv

        mgc_outputs = tf.slice(final_outputs, [0, 0, mgc_idx], [-1, -1, hp.num_mgc], name='mgc_outputs')
        lf0_outputs = tf.slice(final_outputs, [0, 0, lf0_idx], [-1, -1, hp.num_lf0])
        # NOTE(review): squeezing the last axis presumably relies on
        # hp.num_lf0 == 1 -- confirm against the hparams.
        lf0_outputs = tf.squeeze(lf0_outputs, axis=-1, name='lf0_outputs')
        vuv_outputs = tf.slice(final_outputs, [0, 0, vuv_idx], [-1, -1, hp.num_vuv], name='vuv_outputs')
        bap_outputs = tf.slice(final_outputs, [0, 0, bap_idx], [-1, -1, hp.num_bap], name='bap_outputs')

        # Grab alignments from the final decoder state; the stacked history
        # is permuted with [1, 2, 0] (presumably time-major -> batch-major,
        # TODO confirm).
        alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0], name='alignments')

        if is_training:
            self.ratio = self.helper._ratio
        self.inputs = inputs
        self.input_lengths = input_lengths
        self.decoder_outputs = decoder_outputs
        self.final_outputs = final_outputs
        self.feature_targets = feature_targets
        self.alignments = alignments
        self.stop_token_outputs = stop_token_outputs
        self.stop_token_targets = stop_token_targets
        self.lf0_outputs = lf0_outputs
        self.mgc_outputs = mgc_outputs
        self.vuv_outputs = vuv_outputs
        self.bap_outputs = bap_outputs
        self.targets_lengths = targets_lengths
        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log(' Train mode: {}'.format(is_training))
        log(' Eval mode: {}'.format(is_evaluating))
        log(' GTA mode: {}'.format(gta))
        log(' Synthesis mode: {}'.format(not (is_training or is_evaluating)))
        log(' embedding: {}'.format(embedded_inputs.shape))
        log(' enc conv out: {}'.format(enc_conv_output_shape))
        log(' encoder out: {}'.format(encoder_outputs.shape))
        log(' decoder out: {}'.format(decoder_outputs.shape))
        log(' residual out: {}'.format(residual.shape))
        log(' projected residual out: {}'.format(projected_residual.shape))
        log(' final out: {}'.format(final_outputs.shape))
        # lf0 was squeezed above; expand_dims restores the trailing axis so
        # the logged shape lines up with the other streams. Note that this
        # adds an op to the graph purely for logging.
        log(' lf0 out: {}'.format(tf.expand_dims(lf0_outputs, axis=-1).shape))
        log(' mgc out: {}'.format(mgc_outputs.shape))
        log(' vuv out: {}'.format(vuv_outputs.shape))
        log(' bap out: {}'.format(bap_outputs.shape))
        log(' <stop_token> out: {}'.format(stop_token_outputs.shape))