def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None,
               linear_targets=None, targets_lengths=None, gta=False, global_step=None,
               is_training=False, is_evaluating=False):
    """
    Build the Tacotron graph inside the 'inference' variable scope and store the
    resulting tensors on the instance ("decoder_output", "mel_outputs",
    "stop_token_prediction", "alignments", and "linear_outputs" when linear
    prediction is enabled and GTA is off).

    Args:
        - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
          steps in the input time series, and values are character IDs
        - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
          lengths of each sequence in inputs.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
          number of steps in the output time series, M is num_mels, and values are entries in
          the mel spectrogram. Only needed for training.
        - stop_token_targets: stop token targets handed to the training helper; must come
          together with mel_targets unless gta is set.
        - linear_targets: linear spectrogram targets; required when training with
          hparams.predict_linear and rejected in GTA mode.
        - targets_lengths: target lengths; required when training with hparams.mask_decoder.
        - gta: ground truth aligned mode flag; disables the linear post-processing branch.
        - global_step: must be provided when training with 'scheduled' teacher forcing.
        - is_training / is_evaluating: mutually exclusive mode flags; when both are False the
          graph is built for synthesis and decoding is capped at hparams.max_iters.

    Raises:
        - ValueError: inconsistent combination of targets / gta.
        - RuntimeError: missing targets_lengths for masking, or training and evaluation
          requested simultaneously.
    """
    mel_missing = mel_targets is None
    stop_missing = stop_token_targets is None
    linear_missing = linear_targets is None

    # Argument checks. Their order decides which error is reported first.
    if mel_missing and not stop_missing:
        raise ValueError(
            'no mel targets were provided but token_targets were given')
    if not mel_missing and stop_missing and not gta:
        raise ValueError(
            'Mel targets are provided without corresponding token_targets')
    if not gta and self._hparams.predict_linear == True and linear_missing and is_training:
        raise ValueError(
            'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
        )
    if gta and not linear_missing:
        raise ValueError(
            'Linear spectrogram prediction is not supported in GTA mode!')
    if is_training and self._hparams.mask_decoder and targets_lengths is None:
        raise RuntimeError(
            'Model set to mask paddings but no targets lengths provided for the mask!'
        )
    if is_training and is_evaluating:
        raise RuntimeError(
            'Model can not be in training and evaluation modes at the same time!'
        )

    with tf.variable_scope('inference'):
        batch_size = tf.shape(inputs)[0]
        hparams = self._hparams

        forcing_mode = hparams.tacotron_teacher_forcing_mode
        assert forcing_mode in ('constant', 'scheduled')
        if forcing_mode == 'scheduled' and is_training:
            assert global_step is not None

        # GTA output only feeds vocoder training, so the linear post-processing
        # branch is skipped in that mode.
        post_condition = hparams.predict_linear and not gta

        # Character embeddings ==> [batch_size, sequence_length, embedding_dim]
        embedding_table = tf.get_variable(
            'inputs_embedding', [len(symbols), hparams.embedding_dim], dtype=tf.float32)
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

        # Encoder ==> [batch_size, encoder_steps, encoder_lstm_units]
        encoder_convs = EncoderConvolutions(
            is_training, hparams=hparams, scope='encoder_convolutions')
        encoder_rnn = EncoderRNN(
            is_training,
            size=hparams.encoder_lstm_units,
            zoneout=hparams.tacotron_zoneout_rate,
            scope='encoder_LSTM')
        encoder_cell = TacotronEncoderCell(encoder_convs, encoder_rnn)
        encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

        # Only kept for the shape report at the end.
        enc_conv_output_shape = encoder_cell.conv_output_shape

        # Decoder building blocks: prenet, attention, LSTM stack, projections.
        prenet = Prenet(
            is_training,
            layers_sizes=hparams.prenet_layers,
            drop_rate=hparams.tacotron_dropout_rate,
            scope='decoder_prenet')
        attention_mechanism = LocationSensitiveAttention(
            hparams.attention_dim,
            encoder_outputs,
            hparams=hparams,
            mask_encoder=hparams.mask_encoder,
            memory_sequence_length=input_lengths,
            smoothing=hparams.smoothing,
            cumulate_weights=hparams.cumulative_weights)
        decoder_lstm = DecoderRNN(
            is_training,
            layers=hparams.decoder_layers,
            size=hparams.decoder_lstm_units,
            zoneout=hparams.tacotron_zoneout_rate,
            scope='decoder_lstm')
        frame_projection = FrameProjection(
            hparams.num_mels * hparams.outputs_per_step, scope='linear_transform')
        stop_projection = StopProjection(
            is_training or is_evaluating,
            shape=hparams.outputs_per_step,
            scope='stop_token_projection')

        # Decoder cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
        decoder_cell = TacotronDecoderCell(
            prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection)

        # Teacher-forced helper for train / eval / GTA, free-running helper otherwise.
        if is_training or is_evaluating or gta:
            self.helper = TacoTrainingHelper(
                batch_size, mel_targets, stop_token_targets, hparams, gta,
                is_evaluating, global_step)
        else:
            self.helper = TacoTestHelper(batch_size, hparams)

        initial_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        # The iteration cap only applies at synthesis time.
        if is_training or is_evaluating:
            max_iters = None
        else:
            max_iters = hparams.max_iters

        decoder = CustomDecoder(decoder_cell, self.helper, initial_state)
        (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
            decoder,
            impute_finished=False,
            maximum_iterations=max_iters,
            swap_memory=hparams.tacotron_swap_with_cpu)

        # One frame per entry ==> [batch_size, decoder_steps * r, num_mels]
        decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hparams.num_mels])
        stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])

        # Postnet residual ==> [batch_size, decoder_steps * r, postnet_channels]
        postnet = Postnet(is_training, hparams=hparams, scope='postnet_convolutions')
        residual = postnet(decoder_output)

        # Project the residual back to num_mels and add it to the decoder output.
        residual_projection = FrameProjection(hparams.num_mels, scope='postnet_projection')
        projected_residual = residual_projection(residual)
        mel_outputs = decoder_output + projected_residual

        if post_condition:
            # Post-processing CBHG followed by a dense projection to num_freq.
            post_outputs = post_cbhg(mel_outputs, hparams.num_mels, is_training)
            linear_outputs = tf.layers.dense(post_outputs, hparams.num_freq)

        # Alignment history is stacked over decoder steps; move batch to the front.
        alignment_stack = final_decoder_state.alignment_history.stack()
        alignments = tf.transpose(alignment_stack, [1, 2, 0])

        if is_training:
            self.ratio = self.helper._ratio

        # Publish inputs, targets and outputs on the instance.
        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_targets = mel_targets
        self.stop_token_targets = stop_token_targets
        self.targets_lengths = targets_lengths
        self.decoder_output = decoder_output
        self.mel_outputs = mel_outputs
        self.stop_token_prediction = stop_token_prediction
        self.alignments = alignments
        if post_condition:
            self.linear_outputs = linear_outputs
            self.linear_targets = linear_targets

        # Shape report, emitted in a fixed order.
        report = [
            (' Train mode: {}', is_training),
            (' Eval mode: {}', is_evaluating),
            (' GTA mode: {}', gta),
            (' Synthesis mode: {}', not (is_training or is_evaluating)),
            (' embedding: {}', embedded_inputs.shape),
            (' enc conv out: {}', enc_conv_output_shape),
            (' encoder out: {}', encoder_outputs.shape),
            (' decoder out: {}', decoder_output.shape),
            (' residual out: {}', residual.shape),
            (' projected residual out: {}', projected_residual.shape),
            (' mel out: {}', mel_outputs.shape),
        ]
        if post_condition:
            report.append((' linear out: {}', linear_outputs.shape))
        report.append((' <stop_token> out: {}', stop_token_prediction.shape))

        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        for template, value in report:
            log(template.format(value))
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None,
               pml_targets=None, is_training=False, gta=False, locked_alignments=None,
               logs_enabled=True):
    """
    Initializes the model for inference.

    Sets the "pml_intermediates", "pml_outputs", and "alignments" fields (together with
    "inputs", "input_lengths" and "pml_targets").

    Args:
        inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
        input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
            lengths of each sequence in inputs.
        mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
            number of steps in the output time series, M is num_mels, and values are entries
            in the mel spectrogram. Accepted for interface compatibility; not read by this
            implementation.
        linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out
            is number of steps in the output time series, F is num_freq, and values are
            entries in the linear spectrogram. Accepted for interface compatibility; not read
            by this implementation.
        pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is
            number of steps in the PML vocoder features trajectories, P is pml_dimension, and
            values are PML vocoder features. Fed to the training helper when is_training or
            gta is set.
        is_training: boolean flag that is set to True during training
        gta: boolean flag that is set to True when ground truth alignment is required
        locked_alignments: when explicit attention alignment is required, the locked
            alignments are passed in this parameter and the attention alignments are locked to
            these values
        logs_enabled: boolean flag that defaults to True, if False no construction logs output
    """
    # Fix the alignments shape to (batch_size, encoder_steps, decoder_steps) if the batch
    # dimension is missing. The caller's object is left untouched.
    locked_alignments_ = locked_alignments
    if locked_alignments_ is not None:
        if np.ndim(locked_alignments_) < 3:
            locked_alignments_ = np.expand_dims(locked_alignments_, 0)

    with tf.variable_scope('inference'):
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), hp.embedding_dim],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(
            embedding_table, inputs)  # [N, T_in, embed_depth=256]

        # Encoder
        encoder_outputs = conv_and_gru(  # [N, T_in, 2*encoder_gru_units=512]
            embedded_inputs,
            input_lengths,
            conv_layers=hp.encoder_conv_layers,
            conv_width=hp.encoder_conv_width,
            conv_channels=hp.encoder_conv_channels,
            gru_units_unidirectional=hp.encoder_gru_units,
            is_training=is_training,
            scope='encoder',
        )

        # Attention. alignment_history=True is what makes the alignments retrievable
        # from the final decoder state below.
        attention_cell = LockableAttentionWrapper(
            GRUCell(hp.attention_depth),
            LocationSensitiveAttention(hp.attention_depth, encoder_outputs),
            alignment_history=True,
            locked_alignments=locked_alignments_,
            output_attention=False,
            name='attention_wrapper')  # [N, T_in, attention_depth=256]

        # Concatenate attention context vector and RNN cell output into a
        # 2*attention_depth=512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(
            attention_cell)  # [N, T_in, 2*attention_depth=512]

        # Decoder (layers specified bottom to top):
        cells = [
            GRUCell(hp.decoder_gru_units)
            for _ in range(hp.decoder_gru_layers)
        ]
        decoder_cell = MultiRNNCell(
            [concat_cell] + cells,
            state_is_tuple=True)  # [N, T_in, decoder_depth=1024]

        # Project onto r PML feature vectors (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(
            decoder_cell, hp.pml_dimension * hp.outputs_per_step)

        # Target-fed helpers for training / GTA, free-running helper otherwise.
        if is_training or gta:
            if hp.scheduled_sampling:
                helper = TacoScheduledOutputTrainingHelper(
                    inputs, pml_targets, hp.pml_dimension,
                    hp.outputs_per_step, hp.scheduled_sampling_probability)
            else:
                helper = TacoTrainingHelper(inputs, pml_targets,
                                            hp.pml_dimension,
                                            hp.outputs_per_step)
        else:
            helper = TacoTestHelper(batch_size, hp.pml_dimension,
                                    hp.outputs_per_step)

        decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                    dtype=tf.float32)

        # NOTE: hp.max_iters caps decoding in every mode, training included.
        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

        # Reshape outputs to be one output per entry
        pml_intermediates = tf.reshape(
            decoder_outputs,
            [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

        # Add Post-Processing Conv and GRU layer. No sequence lengths are passed here.
        expand_outputs = conv_and_gru(  # presumably [N, T_out, 2*expand_gru_units=512]
            pml_intermediates,
            None,
            conv_layers=hp.expand_conv_layers,
            conv_width=hp.expand_conv_width,
            conv_channels=hp.expand_conv_channels,
            gru_units_unidirectional=hp.expand_gru_units,
            is_training=is_training,
            scope='expand',
        )

        pml_outputs = tf.layers.dense(expand_outputs,
                                      hp.pml_dimension)  # [N, T_out, P]

        # Grab alignments from the final decoder state. Element 0 of the MultiRNNCell
        # state belongs to the attention/concat cell, which is first in the stack.
        alignments = tf.transpose(
            final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.pml_intermediates = pml_intermediates
        self.pml_outputs = pml_outputs
        self.alignments = alignments
        self.pml_targets = pml_targets

        if logs_enabled:
            log('Initialized Tacotron model. Dimensions: ')
            log(' Train mode: {}'.format(is_training))
            # Fixed: this line used to report is_training instead of gta.
            log(' GTA mode: {}'.format(gta))
            log(' Embedding: {}'.format(
                embedded_inputs.shape[-1]))
            log(' Encoder out: {}'.format(
                encoder_outputs.shape[-1]))
            log(' Attention out: {}'.format(
                attention_cell.output_size))
            log(' Concat attn & out: {}'.format(
                concat_cell.output_size))
            log(' Decoder cell out: {}'.format(
                decoder_cell.output_size))
            log(' Decoder out ({} frames): {}'.format(
                hp.outputs_per_step, decoder_outputs.shape[-1]))
            log(' Decoder out (1 frame): {}'.format(
                pml_intermediates.shape[-1]))
            log(' Expand out: {}'.format(
                expand_outputs.shape[-1]))
            log(' PML out: {}'.format(
                pml_outputs.shape[-1]))
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None,
               linear_targets=None, targets_lengths=None, gta=False, global_step=None,
               is_training=False, is_evaluating=False, split_infos=None):
    """
    Initializes the multi-tower model for inference.

    Sets the per-tower "tower_decoder_output", "tower_mel_outputs", "tower_alignments",
    "tower_stop_token_prediction" (and "tower_linear_outputs" when linear prediction is
    active) fields, one list entry per tower (hparams.tacotron_num_gpus towers).

    NOTE(review): device placement in this version is hard-coded for debugging (batch
    splitting, encoder and decoder on /cpu:0, postnet on /gpu:0). The hparams-driven
    placement is kept in comments next to each hard-coded value.

    Args:
        - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
          steps in the input time series, and values are character IDs
        - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
          lengths of each sequence in inputs.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
          number of steps in the output time series, M is num_mels, and values are entries in
          the mel spectrogram. Only needed for training.
        - stop_token_targets: stop token targets; must accompany mel_targets unless gta is
          set.
        - linear_targets: linear spectrogram targets; required when training with
          hparams.predict_linear and rejected in GTA mode.
        - targets_lengths: target lengths; required when training with hparams.mask_decoder.
        - gta: ground truth aligned mode flag; disables the linear post-processing branch.
        - global_step: must be provided when training with 'scheduled' teacher forcing.
        - is_training / is_evaluating: mutually exclusive mode flags.
        - split_infos: 2-D split information consumed column-wise by split_func (column 0 for
          inputs, 1 for mel targets, 2 for stop token targets, 3 for linear targets).
          Required despite the None default.

    Raises:
        - ValueError: inconsistent combination of targets / gta, or split_infos missing.
        - RuntimeError: missing targets_lengths for masking, or training and evaluation
          requested simultaneously.
    """
    if mel_targets is None and stop_token_targets is not None:
        raise ValueError(
            'no multi targets were provided but token_targets were given')
    if mel_targets is not None and stop_token_targets is None and not gta:
        raise ValueError(
            'Mel targets are provided without corresponding token_targets')
    if not gta and self._hparams.predict_linear == True and linear_targets is None and is_training:
        raise ValueError(
            'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
        )
    if gta and linear_targets is not None:
        raise ValueError(
            'Linear spectrogram prediction is not supported in GTA mode!')
    if is_training and self._hparams.mask_decoder and targets_lengths is None:
        raise RuntimeError(
            'Model set to mask paddings but no targets lengths provided for the mask!'
        )
    if is_training and is_evaluating:
        raise RuntimeError(
            'Model can not be in training and evaluation modes at the same time!'
        )
    # split_infos is subscripted unconditionally below, so fail early with a clear
    # message instead of a TypeError in the middle of graph construction.
    if split_infos is None:
        raise ValueError(
            'split_infos must be provided to split the batch across towers')

    # Hparams-driven form:
    # split_device = '/cpu:0' if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:{}'.format(self._hparams.tacotron_gpu_start_idx)
    split_device = '/cpu:0' if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:1'
    print("debug checkpoint gpus:", split_device)
    # Hparams-driven form: with tf.device(split_device):
    with tf.device("/cpu:0"):  # debug, gpu:0 will use about 192MB memory
        hp = self._hparams
        # Output dtype lists for tf.py_func: one tensor per tower.
        lout_int = [tf.int32] * hp.tacotron_num_gpus
        lout_float = [tf.float32] * hp.tacotron_num_gpus

        tower_input_lengths = tf.split(
            input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0)
        tower_targets_lengths = tf.split(
            targets_lengths, num_or_size_splits=hp.tacotron_num_gpus,
            axis=0) if targets_lengths is not None else targets_lengths

        # Split every provided tensor into per-tower pieces; absent targets stay None.
        p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]], lout_int)
        p_mel_targets = tf.py_func(
            split_func, [mel_targets, split_infos[:, 1]],
            lout_float) if mel_targets is not None else mel_targets
        p_stop_token_targets = tf.py_func(
            split_func, [stop_token_targets, split_infos[:, 2]],
            lout_float) if stop_token_targets is not None else stop_token_targets
        p_linear_targets = tf.py_func(
            split_func, [linear_targets, split_infos[:, 3]],
            lout_float) if linear_targets is not None else linear_targets

        tower_inputs = []
        tower_mel_targets = []
        tower_stop_token_targets = []
        tower_linear_targets = []

        batch_size = tf.shape(inputs)[0]
        mel_channels = hp.num_mels
        linear_channels = hp.num_freq
        # Reshape the py_func outputs per tower (reshape targets as in the original code).
        for i in range(hp.tacotron_num_gpus):
            tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
            if p_mel_targets is not None:
                tower_mel_targets.append(
                    tf.reshape(p_mel_targets[i], [batch_size, -1, mel_channels]))
            if p_stop_token_targets is not None:
                tower_stop_token_targets.append(
                    tf.reshape(p_stop_token_targets[i], [batch_size, -1]))
            if p_linear_targets is not None:
                tower_linear_targets.append(
                    tf.reshape(p_linear_targets[i], [batch_size, -1, linear_channels]))

    self.tower_decoder_output = []
    self.tower_alignments = []
    self.tower_stop_token_prediction = []
    self.tower_mel_outputs = []
    self.tower_linear_outputs = []

    # Per-tower tensors kept only for the shape report at the end.
    tower_embedded_inputs = []
    tower_enc_conv_output_shape = []
    tower_encoder_outputs = []
    tower_residual = []
    tower_projected_residual = []

    # 1. Declare GPU Devices
    # Hparams-driven form:
    # gpus = ["/gpu:{}".format(i) for i in range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx+hp.tacotron_num_gpus)]
    gpus = ["/gpu:1"]
    print("debug checkpoint gpus:", gpus)
    for i in range(hp.tacotron_num_gpus):
        # Hparams-driven form:
        # with tf.device(tf.train.replica_device_setter(ps_tasks=1,ps_device="/cpu:0",worker_device=gpus[i])):
        with tf.device(
                tf.train.replica_device_setter(ps_tasks=1,
                                               ps_device="/cpu:0",
                                               worker_device="/cpu:0")
        ):  # debug using gpu:0 will cause OOM, use >1640MB
            with tf.variable_scope('inference'):
                assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
                if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                    assert global_step is not None

                # GTA is only used for predicting mels to train Wavenet vocoder, so we omit
                # post processing when doing GTA synthesis
                post_condition = hp.predict_linear and not gta

                # Embeddings ==> [batch_size, sequence_length, embedding_dim]
                self.embedding_table = tf.get_variable(
                    'inputs_embedding', [len(symbols), hp.embedding_dim],
                    dtype=tf.float32)
                embedded_inputs = tf.nn.embedding_lookup(
                    self.embedding_table, tower_inputs[i])

                # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                encoder_cell = TacotronEncoderCell(
                    EncoderConvolutions(is_training,
                                        hparams=hp,
                                        scope='encoder_convolutions'),
                    EncoderRNN(is_training,
                               size=hp.encoder_lstm_units,
                               zoneout=hp.tacotron_zoneout_rate,
                               scope='encoder_LSTM'))

                encoder_outputs = encoder_cell(embedded_inputs,
                                               tower_input_lengths[i])

                # For shape visualization purpose
                enc_conv_output_shape = encoder_cell.conv_output_shape
                print("debug enc_conv_output_shape:", enc_conv_output_shape)

        with tf.device(
                tf.train.replica_device_setter(ps_tasks=1,
                                               ps_device="/cpu:0",
                                               worker_device="/cpu:0")
        ):  # debug try to use as more gpu as possible
            with tf.variable_scope('inference'):
                # Decoder Parts
                # Attention Decoder Prenet
                prenet = Prenet(is_training,
                                layers_sizes=hp.prenet_layers,
                                drop_rate=hp.tacotron_dropout_rate,
                                scope='decoder_prenet')
                # Attention Mechanism
                attention_mechanism = LocationSensitiveAttention(
                    hp.attention_dim,
                    encoder_outputs,
                    hparams=hp,
                    mask_encoder=hp.mask_encoder,
                    memory_sequence_length=tf.reshape(
                        tower_input_lengths[i], [-1]),
                    smoothing=hp.smoothing,
                    cumulate_weights=hp.cumulative_weights)
                # Decoder LSTM Cells
                decoder_lstm = DecoderRNN(is_training,
                                          layers=hp.decoder_layers,
                                          size=hp.decoder_lstm_units,
                                          zoneout=hp.tacotron_zoneout_rate,
                                          scope='decoder_LSTM')
                # Frames Projection layer
                frame_projection = FrameProjection(
                    hp.num_mels * hp.outputs_per_step,
                    scope='linear_transform_projection')
                # <stop_token> projection layer
                stop_projection = StopProjection(
                    is_training or is_evaluating,
                    shape=hp.outputs_per_step,
                    scope='stop_token_projection')

                # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                decoder_cell = TacotronDecoderCell(prenet,
                                                   attention_mechanism,
                                                   decoder_lstm,
                                                   frame_projection,
                                                   stop_projection)

                # Define the helper for our decoder. It is overwritten on every tower, so
                # self.helper refers to the last tower's helper after the loop.
                if is_training or is_evaluating or gta:
                    self.helper = TacoTrainingHelper(
                        batch_size, tower_mel_targets[i], hp, gta,
                        is_evaluating, global_step)
                else:
                    self.helper = TacoTestHelper(batch_size, hp)

                # initial decoder state
                decoder_init_state = decoder_cell.zero_state(
                    batch_size=batch_size, dtype=tf.float32)

                # Only use max iterations at synthesis time
                max_iters = hp.max_iters if not (
                    is_training or is_evaluating) else None

                # Decode
                (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
                    CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                    impute_finished=False,
                    maximum_iterations=max_iters,
                    swap_memory=hp.tacotron_swap_with_cpu)

                # Reshape outputs to be one output per entry
                # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                decoder_output = tf.reshape(frames_prediction,
                                            [batch_size, -1, hp.num_mels])
                stop_token_prediction = tf.reshape(stop_token_prediction,
                                                   [batch_size, -1])

        with tf.device(
                tf.train.replica_device_setter(ps_tasks=1,
                                               ps_device="/cpu:0",
                                               worker_device="/gpu:0")
        ):  # debug try to use as more gpu as possible
            with tf.variable_scope('inference'):
                # Postnet
                postnet = Postnet(is_training,
                                  hparams=hp,
                                  scope='postnet_convolutions')

                # Compute residual using post-net
                # ==> [batch_size, decoder_steps * r, postnet_channels]
                residual = postnet(decoder_output)

                # Project residual to same dimension as mel spectrogram
                # ==> [batch_size, decoder_steps * r, num_mels]
                residual_projection = FrameProjection(
                    hp.num_mels, scope='postnet_projection')
                projected_residual = residual_projection(residual)

                # Compute the mel spectrogram
                mel_outputs = decoder_output + projected_residual

                if post_condition:
                    # Add post-processing CBHG. This does a great job at extracting features
                    # from mels before projection to Linear specs.
                    post_cbhg = CBHG(hp.cbhg_kernels,
                                     hp.cbhg_conv_channels,
                                     hp.cbhg_pool_size,
                                     [hp.cbhg_projection, hp.num_mels],
                                     hp.cbhg_projection_kernel_size,
                                     hp.cbhg_highwaynet_layers,
                                     hp.cbhg_highway_units,
                                     hp.cbhg_rnn_units,
                                     is_training,
                                     name='CBHG_postnet')

                    # [batch_size, decoder_steps(mel_frames), cbhg_channels]
                    post_outputs = post_cbhg(mel_outputs, None)

                    # Linear projection of extracted features to make linear spectrogram
                    linear_specs_projection = FrameProjection(
                        hp.num_freq, scope='cbhg_linear_specs_projection')

                    # [batch_size, decoder_steps(linear_frames), num_freq]
                    linear_outputs = linear_specs_projection(post_outputs)

                # Grab alignments from the final decoder state
                alignments = tf.transpose(
                    final_decoder_state.alignment_history.stack(), [1, 2, 0])

                self.tower_decoder_output.append(decoder_output)
                self.tower_alignments.append(alignments)
                self.tower_stop_token_prediction.append(
                    stop_token_prediction)
                self.tower_mel_outputs.append(mel_outputs)
                tower_embedded_inputs.append(embedded_inputs)
                tower_enc_conv_output_shape.append(enc_conv_output_shape)
                tower_encoder_outputs.append(encoder_outputs)
                tower_residual.append(residual)
                tower_projected_residual.append(projected_residual)
                if post_condition:
                    self.tower_linear_outputs.append(linear_outputs)

        # `gpus` is a hard-coded single-entry debug list. Clamp the index so a
        # multi-tower run does not die with an IndexError on a log line.
        log('initialisation done {}'.format(gpus[min(i, len(gpus) - 1)]))

    if is_training:
        self.ratio = self.helper._ratio
    self.tower_inputs = tower_inputs
    self.tower_input_lengths = tower_input_lengths
    self.tower_mel_targets = tower_mel_targets
    self.tower_linear_targets = tower_linear_targets
    self.tower_targets_lengths = tower_targets_lengths
    self.tower_stop_token_targets = tower_stop_token_targets

    self.all_vars = tf.trainable_variables()

    log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
    log(' Train mode: {}'.format(is_training))
    log(' Eval mode: {}'.format(is_evaluating))
    log(' GTA mode: {}'.format(gta))
    log(' Synthesis mode: {}'.format(not (
        is_training or is_evaluating)))
    log(' Input: {}'.format(inputs.shape))
    # The tower lists hold exactly tacotron_num_gpus entries. Adding
    # tacotron_gpu_start_idx to the bound indexed past their end.
    for i in range(hp.tacotron_num_gpus):
        log(' device: {}'.format(i))
        log(' embedding: {}'.format(
            tower_embedded_inputs[i].shape))
        log(' enc conv out: {}'.format(
            tower_enc_conv_output_shape[i]))
        log(' encoder out: {}'.format(
            tower_encoder_outputs[i].shape))
        log(' decoder out: {}'.format(
            self.tower_decoder_output[i].shape))
        log(' residual out: {}'.format(
            tower_residual[i].shape))
        log(' projected residual out: {}'.format(
            tower_projected_residual[i].shape))
        log(' mel out: {}'.format(
            self.tower_mel_outputs[i].shape))
        if post_condition:
            log(' linear out: {}'.format(
                self.tower_linear_outputs[i].shape))
        log(' <stop_token> out: {}'.format(
            self.tower_stop_token_prediction[i].shape))

    # 1_000_000 is causing syntax problems for some people?! Python please :)
    log(' Tacotron Parameters {:.3f} Million.'.format(
        np.sum(
            [np.prod(v.get_shape().as_list())
             for v in self.all_vars]) / 1000000))
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None,
               linear_targets=None, targets_lengths=None, gta=False, global_step=None,
               is_training=False, is_evaluating=False):
    """
    Build the GMM-attention Tacotron graph under the 'inference' variable scope.

    Sets "encoder_outputs", "decoder_output", "mel_outputs", "post_outputs",
    "linear_outputs", "stop_token_prediction" and "alignments" on the instance, along
    with the inputs, targets and mode flags that were passed in. The CBHG linear branch
    is always built here.

    Args:
        - inputs: tensor of symbol ids; its first dimension is used as the batch size.
        - input_lengths: lengths handed to the encoder cell and the attention mechanism.
        - mel_targets: targets handed to the training helper in training / evaluation.
        - stop_token_targets, linear_targets, targets_lengths: stored on the instance only.
        - gta: accepted for interface compatibility; the value is overridden with False
          before it is used, so "self.gta" is always False.
        - global_step: must be provided when training with 'scheduled' teacher forcing.
        - is_training / is_evaluating: mode flags; when both are False decoding is capped
          at hparams.max_iters.
    """
    hp = self._hparams
    batch_size = tf.shape(inputs)[0]
    # The caller-supplied flag is deliberately discarded in this variant.
    gta = False

    # Output clipping bounds; only applied when hp.clip_outputs is set.
    if hp.symmetric_mels:
        range_low, range_high = -hp.max_abs_value, hp.max_abs_value
    else:
        range_low, range_high = 0, hp.max_abs_value

    def _clip(tensor):
        # Clamp to [range_low - lower_bound_decay, range_high].
        floored = tf.maximum(tensor, range_low - hp.lower_bound_decay)
        return tf.minimum(floored, range_high)

    with tf.variable_scope('inference'):
        forcing_mode = hp.tacotron_teacher_forcing_mode
        assert forcing_mode in ('constant', 'scheduled')
        if forcing_mode == 'scheduled' and is_training:
            assert global_step is not None

        # Embeddings ==> [batch_size, sequence_length, embedding_dim]
        self.embedding_table = tf.get_variable(
            'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32)
        embedded_inputs = tf.nn.embedding_lookup(self.embedding_table, inputs)

        # Encoder ==> [batch_size, encoder_steps, encoder_lstm_units]
        encoder_convs = EncoderConvolutions(
            is_training, hparams=hp, scope='encoder_convolutions')
        encoder_rnn = EncoderRNN(
            is_training,
            size=hp.encoder_lstm_units,
            zoneout=hp.tacotron_zoneout_rate,
            scope='encoder_LSTM')
        encoder_cell = TacotronEncoderCell(encoder_convs, encoder_rnn)
        self.encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

        # Kept for the shape report.
        self.enc_conv_output_shape = encoder_cell.conv_output_shape

        # Decoder building blocks.
        prenet = Prenet(
            is_training,
            layers_sizes=hp.prenet_layers,
            drop_rate=hp.tacotron_dropout_rate,
            scope='decoder_prenet')
        attention_mechanism = GMMAttention(self.encoder_outputs, input_lengths, is_training)
        decoder_lstm = DecoderRNN(
            is_training,
            layers=hp.decoder_layers,
            size=hp.decoder_lstm_units,
            zoneout=hp.tacotron_zoneout_rate,
            scope='decoder_LSTM')
        frame_projection = FrameProjection(
            hp.num_mels * hp.outputs_per_step, scope='linear_transform_projection')
        stop_projection = StopProjection(
            is_training or is_evaluating,
            shape=hp.outputs_per_step,
            scope='stop_token_projection')

        # Decoder cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
        decoder_cell = TacotronDecoderCell(
            prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection,
            num_attn_mixture=5)

        # Teacher-forced helper for train / eval, free-running helper otherwise.
        if is_training or is_evaluating or gta:
            self.helper = TacoTrainingHelper(
                batch_size, mel_targets, hp, gta, is_evaluating, global_step)
        else:
            self.helper = TacoTestHelper(batch_size, hp)

        initial_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        # The iteration cap only applies at synthesis time.
        if is_training or is_evaluating:
            max_iters = None
        else:
            max_iters = hp.max_iters

        decoder = CustomDecoder(decoder_cell, self.helper, initial_state)
        (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
            decoder,
            impute_finished=False,
            maximum_iterations=max_iters,
            swap_memory=hp.tacotron_swap_with_cpu)

        # One frame per entry ==> [batch_size, decoder_steps * r, num_mels]
        self.decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
        self.stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])

        if hp.clip_outputs:
            self.decoder_output = _clip(self.decoder_output)

        # Postnet residual ==> [batch_size, decoder_steps * r, postnet_channels]
        postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions')
        residual = postnet(self.decoder_output)

        # Project the residual to num_mels and add it to the decoder output.
        residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection')
        self.projected_residual = residual_projection(residual)
        self.mel_outputs = self.decoder_output + self.projected_residual

        if hp.clip_outputs:
            self.mel_outputs = _clip(self.mel_outputs)

        # Post-processing CBHG over the mel outputs, then projection to num_freq.
        post_cbhg = CBHG(
            hp.cbhg_kernels,
            hp.cbhg_conv_channels,
            hp.cbhg_pool_size,
            [hp.cbhg_projection, hp.num_mels],
            hp.cbhg_projection_kernel_size,
            hp.cbhg_highwaynet_layers,
            hp.cbhg_highway_units,
            hp.cbhg_rnn_units,
            hp.batch_norm_position,
            is_training,
            name='CBHG_postnet')

        # [batch_size, decoder_steps(mel_frames), cbhg_channels]
        self.post_outputs = post_cbhg(self.mel_outputs, None)

        linear_specs_projection = FrameProjection(
            hp.num_freq, scope='cbhg_linear_specs_projection')

        # [batch_size, decoder_steps(linear_frames), num_freq]
        self.linear_outputs = linear_specs_projection(self.post_outputs)

        if hp.clip_outputs:
            self.linear_outputs = _clip(self.linear_outputs)

        # Alignment history is stacked over decoder steps; move batch to the front.
        alignment_stack = final_decoder_state.alignment_history.stack()
        self.alignments = tf.transpose(alignment_stack, [1, 2, 0])

        log('initialisation done.')

    if is_training:
        self.ratio = self.helper._ratio

    # Keep the inputs, targets and mode flags next to the graph outputs.
    self.inputs = inputs
    self.input_lengths = input_lengths
    self.mel_targets = mel_targets
    self.linear_targets = linear_targets
    self.targets_lengths = targets_lengths
    self.stop_token_targets = stop_token_targets
    self.gta = gta
    self.all_vars = tf.trainable_variables()
    self.is_training = is_training
    self.is_evaluating = is_evaluating

    # Shape report, emitted in a fixed order.
    report = (
        (' Train mode: {}', is_training),
        (' Eval mode: {}', is_evaluating),
        (' GTA mode: {}', gta),
        (' Synthesis mode: {}', not (is_training or is_evaluating)),
        (' Input: {}', inputs.shape),
        (' embedding: {}', embedded_inputs.shape),
        (' enc conv out: {}', self.enc_conv_output_shape),
        (' encoder out: {}', self.encoder_outputs.shape),
        (' decoder out: {}', self.decoder_output.shape),
        (' residual out: {}', residual.shape),
        (' projected residual out: {}', self.projected_residual.shape),
        (' mel out: {}', self.mel_outputs.shape),
        (' linear out: {}', self.linear_outputs.shape),
        (' <stop_token> out: {}', self.stop_token_prediction.shape),
    )
    log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
    for template, value in report:
        log(template.format(value))

    # 1_000_000 is causing syntax problems for some people?! Python please :)
    per_variable_sizes = [np.prod(v.get_shape().as_list()) for v in self.all_vars]
    log(' Tacotron Parameters {:.3f} Million.'.format(
        np.sum(per_variable_sizes) / 1000000))
def initialize(self,
               inputs,
               input_lengths,
               mel_targets=None,
               stop_token_targets=None):
    """Build the Tacotron graph under the 'inference' variable scope.

    Training mode is inferred from the arguments: the model is considered
    to be training whenever ``mel_targets`` is given.

    Args:
        inputs: tensor of input IDs; its first dimension is used as the
            batch size and its values index the embedding table.
        input_lengths: per-example lengths of ``inputs``, forwarded to the
            encoder and to the attention mechanism.
        mel_targets: optional mel targets; only handed to the training
            helper.
        stop_token_targets: optional stop-token targets; only handed to
            the training helper.

    Sets ``helper``, ``inputs``, ``input_lengths``, ``decoder_output``,
    ``alignments``, ``stop_token_prediction``, ``stop_token_targets``,
    ``mel_outputs`` and ``mel_targets`` on ``self``.
    """
    with tf.variable_scope('inference'):
        is_training = mel_targets is not None
        n_batch = tf.shape(inputs)[0]
        hparams = self._hparams

        # Character embeddings ==> [batch_size, sequence_length, embedding_dim]
        char_table = tf.get_variable(
            'inputs_embedding',
            [len(symbols), hparams.embedding_dim],
            dtype=tf.float32)
        char_embeddings = tf.nn.embedding_lookup(char_table, inputs)

        # Encoder: convolution stack followed by the encoder RNN.
        enc_convolutions = EncoderConvolutions(
            is_training,
            kernel_size=(5, ),
            channels=512,
            scope='encoder_convolutions')
        enc_rnn = EncoderRNN(
            is_training,
            size=hparams.encoder_lstm_units,
            zoneout=hparams.tacotron_zoneout_rate,
            scope='encoder_LSTM')
        encoder = TacotronEncoderCell(enc_convolutions, enc_rnn)
        memory = encoder(char_embeddings, input_lengths)

        # Decoder building blocks, created in the order the decoder cell
        # consumes them.
        decoder_prenet = Prenet(
            is_training, layer_sizes=[256, 256], scope='decoder_prenet')
        attention = LocationSensitiveAttention(
            hparams.attention_dim,
            memory,
            mask_encoder=hparams.mask_encoder,
            memory_sequence_length=input_lengths,
            smoothing=hparams.smoothing,
            cumulate_weights=hparams.cumulative_weights)
        decoder_rnn = DecoderRNN(
            is_training,
            layers=hparams.decoder_layers,
            size=hparams.decoder_lstm_units,
            zoneout=hparams.tacotron_zoneout_rate,
            scope='decoder_lstm')
        frames_layer = FrameProjection(
            hparams.num_mels * hparams.outputs_per_step,
            scope='linear_transform')
        stop_layer = StopProjection(is_training, scope='stop_token_projection')
        cell = TacotronDecoderCell(
            decoder_prenet,
            attention,
            decoder_rnn,
            frames_layer,
            stop_layer,
            mask_finished=hparams.mask_finished)

        # Training helper when targets are available, test helper otherwise.
        if is_training:
            helper = TacoTrainingHelper(
                n_batch, mel_targets, stop_token_targets, hparams.num_mels,
                hparams.outputs_per_step,
                hparams.tacotron_teacher_forcing_ratio)
        else:
            helper = TacoTestHelper(n_batch, hparams.num_mels,
                                    hparams.outputs_per_step)
        self.helper = helper

        initial_state = cell.zero_state(batch_size=n_batch, dtype=tf.float32)

        # The iteration cap only applies when not training.
        iteration_cap = None if is_training else hparams.max_iters

        decode_result = dynamic_decode(
            CustomDecoder(cell, self.helper, initial_state),
            impute_finished=hparams.impute_finished,
            maximum_iterations=iteration_cap)
        (raw_frames, raw_stop_tokens, _), final_state, _ = decode_result

        # Reshape outputs to be one output per entry
        frames = tf.reshape(raw_frames, [n_batch, -1, hparams.num_mels])
        stop_tokens = tf.reshape(raw_stop_tokens, [n_batch, -1])

        # Post-net residual ==> [batch_size, decoder_steps * r, postnet_channels]
        post_net = Postnet(
            is_training,
            kernel_size=hparams.postnet_kernel_size,
            channels=hparams.postnet_channels,
            scope='postnet_convolutions')
        post_residual = post_net(frames)

        # Project residual back to num_mels and add it to the decoder output
        # ==> [batch_size, decoder_steps * r, num_mels]
        to_mels = FrameProjection(hparams.num_mels, scope='postnet_projection')
        mel_residual = to_mels(post_residual)
        refined_mels = frames + mel_residual

        # Alignments come from the final decoder state's alignment history.
        attention_alignments = tf.transpose(
            final_state.alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.decoder_output = frames
        self.alignments = attention_alignments
        self.stop_token_prediction = stop_tokens
        self.stop_token_targets = stop_token_targets
        self.mel_outputs = refined_mels
        self.mel_targets = mel_targets
def initialize(self,
               inputs,
               input_lengths,
               mel_targets=None,
               linear_targets=None,
               pml_targets=None,
               gta=False,
               locked_alignments=None,
               logs_enabled=True):
    '''Initializes the model for inference.

    Sets "pml_intermediates", "pml_outputs", and "alignments" fields (plus
    "inputs", "input_lengths" and "pml_targets").

    Training mode is inferred from the arguments: the model is considered
    to be training whenever pml_targets is given.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
        lengths of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M]. Accepted for interface
        compatibility; not read by this implementation.
      linear_targets: float32 Tensor with shape [N, T_out, F]. Accepted for interface
        compatibility; not read by this implementation.
      pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is
        number of steps in the PML vocoder features trajectories, P is pml_dimension, and
        values are PML vocoder features. Only needed for training.
      gta: boolean flag that is set to True when ground truth alignment is required; it
        selects the training helper even when not training.
      locked_alignments: accepted for interface compatibility; this implementation uses a
        plain AttentionWrapper and does not read the value.
      logs_enabled: boolean flag that defaults to True, if False no construction logs
        are output.
    '''
    with tf.variable_scope('inference'):
        is_training = pml_targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), hp.embed_depth],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(
            embedding_table, inputs)  # [N, T_in, embed_depth=256]

        # Encoder
        prenet_outputs = prenet(
            embedded_inputs, is_training,
            hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
        encoder_outputs = encoder_cbhg(
            prenet_outputs, input_lengths, is_training,
            hp.encoder_depth)  # [N, T_in, encoder_depth=256]

        # Attention
        attention_cell = AttentionWrapper(
            GRUCell(hp.attention_depth),
            BahdanauAttention(hp.attention_depth, encoder_outputs),
            alignment_history=True,
            output_attention=False)  # [N, T_in, attention_depth=256]

        # Apply prenet before concatenation in AttentionWrapper.
        attention_cell = DecoderPrenetWrapper(attention_cell, is_training,
                                              hp.prenet_depths)

        # Concatenate attention context vector and RNN cell output into a
        # 2*attention_depth=512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(
            attention_cell)  # [N, T_in, 2*attention_depth=512]

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell(
            [
                OutputProjectionWrapper(concat_cell, hp.decoder_depth),
                ResidualWrapper(GRUCell(hp.decoder_depth)),
                ResidualWrapper(GRUCell(hp.decoder_depth))
            ],
            state_is_tuple=True)  # [N, T_in, decoder_depth=256]

        # Project onto r PML feature vectors (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(
            decoder_cell, hp.pml_dimension * hp.outputs_per_step)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                    dtype=tf.float32)

        if is_training or gta:
            helper = TacoTrainingHelper(inputs, pml_targets, hp.pml_dimension,
                                        hp.outputs_per_step)
        else:
            helper = TacoTestHelper(batch_size, hp.pml_dimension,
                                    hp.outputs_per_step)

        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

        # Reshape outputs to be one output per entry
        pml_intermediates = tf.reshape(
            decoder_outputs,
            [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

        # Add post-processing CBHG:
        post_outputs = post_cbhg(
            pml_intermediates, hp.pml_dimension, is_training,
            hp.postnet_depth)  # [N, T_out, postnet_depth=256]
        pml_outputs = tf.layers.dense(post_outputs,
                                      hp.pml_dimension)  # [N, T_out, P]

        # Grab alignments from the final decoder state.  The attention
        # wrapper lives in the first layer of the MultiRNNCell, hence [0].
        alignments = tf.transpose(
            final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.pml_intermediates = pml_intermediates
        self.pml_outputs = pml_outputs
        self.alignments = alignments
        self.pml_targets = pml_targets

        # Honour logs_enabled: the construction summary is only emitted
        # when the caller has not switched logging off.
        if logs_enabled:
            log('Initialized Tacotron model. Dimensions: ')
            log(' embedding: %d' % embedded_inputs.shape[-1])
            log(' prenet out: %d' % prenet_outputs.shape[-1])
            log(' encoder out: %d' % encoder_outputs.shape[-1])
            log(' attention out: %d' % attention_cell.output_size)
            log(' concat attn & out: %d' % concat_cell.output_size)
            log(' decoder cell out: %d' % decoder_cell.output_size)
            log(' decoder out (%d frames): %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log(' decoder out (1 frame): %d' % pml_intermediates.shape[-1])
            log(' postnet out: %d' % post_outputs.shape[-1])
            log(' pml out: %d' % pml_outputs.shape[-1])
def initialize(self,
               inputs,
               input_lengths,
               mel_targets=None,
               stop_token_targets=None,
               split_infos=None,
               linear_targets=None,
               targets_lengths=None,
               gta=False,
               global_step=None,
               is_training=False,
               is_evaluating=False):
    """
    Initializes the multi-GPU model for inference.

    The batch is split into ``hp.num_gpus`` towers on the CPU, one copy of
    the network is built per GPU under the 'inference' variable scope, and
    the per-tower results are stored in the ``tower_*`` list attributes.

    Args:
        - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
          steps in the input time series, and values are character IDs
        - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
          lengths of each sequence in inputs.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
          number of steps in the output time series, M is num_mels, and values are entries in
          the mel spectrogram. Only needed for training.
        - stop_token_targets: optional stop-token targets; split per tower
          like the mel targets.
        - split_infos: required. Columns 0..3 are passed to ``split_func``
          together with inputs, mel, stop-token and linear targets
          respectively. (Presumably the per-tower split sizes; confirm
          against ``split_func``.)
        - linear_targets: optional linear spectrogram targets.
        - targets_lengths: optional target lengths; required when training
          with ``hp.mask_decoder``.
        - gta: True for ground-truth-aligned synthesis; disables the linear
          post-processing network.
        - global_step: required when training with scheduled teacher forcing.
        - is_training / is_evaluating: mutually exclusive mode flags.

    Raises:
        ValueError: on inconsistent target arguments or missing split_infos.
        RuntimeError: on a missing decoder mask length or when both mode
            flags are set.
    """
    if mel_targets is None and stop_token_targets is not None:
        raise ValueError(
            'no multi targets were provided but token_targets were given')
    if mel_targets is not None and stop_token_targets is None and not gta:
        raise ValueError(
            'Mel targets are provided without corresponding token_targets')
    if not gta and self._hparams.predict_linear == True and linear_targets is None and is_training:
        raise ValueError(
            'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
        )
    if gta and linear_targets is not None:
        raise ValueError(
            'Linear spectrogram prediction is not supported in GTA mode!')
    if is_training and self._hparams.mask_decoder and targets_lengths is None:
        raise RuntimeError(
            'Model set to mask paddings but no targets lengths provided for the mask!'
        )
    if is_training and is_evaluating:
        raise RuntimeError(
            'Model can not be in training and evaluation modes at the same time!'
        )
    # split_infos defaults to None only to keep the keyword signature; it is
    # subscripted unconditionally below, so fail early with a clear message.
    if split_infos is None:
        raise ValueError(
            'split_infos is required to split the batch across GPUs')

    with tf.device('/cpu:0'):
        hp = self._hparams
        lout_int = [tf.int32] * hp.num_gpus
        lout_float = [tf.float32] * hp.num_gpus

        tower_input_lengths = tf.split(input_lengths,
                                       num_or_size_splits=hp.num_gpus,
                                       axis=0)
        tower_targets_lengths = tf.split(
            targets_lengths, num_or_size_splits=hp.num_gpus,
            axis=0) if targets_lengths is not None else targets_lengths

        p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]],
                              lout_int)
        # Every target is optional (synthesis provides none of them), so
        # each split is guarded the same way.
        p_mel_targets = tf.py_func(
            split_func, [mel_targets, split_infos[:, 1]],
            lout_float) if mel_targets is not None else mel_targets
        p_stop_token_targets = tf.py_func(
            split_func, [stop_token_targets, split_infos[:, 2]], lout_float
        ) if stop_token_targets is not None else stop_token_targets
        p_linear_targets = tf.py_func(
            split_func, [linear_targets, split_infos[:, 3]],
            lout_float) if linear_targets is not None else linear_targets

        tower_inputs = []
        tower_mel_targets = []
        tower_stop_token_targets = []
        tower_linear_targets = []

        batch_size = tf.shape(inputs)[0]
        mel_channels = hp.num_mels
        linear_channels = hp.num_freq
        for i in range(hp.num_gpus):
            tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
            if p_mel_targets is not None:
                tower_mel_targets.append(
                    tf.reshape(p_mel_targets[i],
                               [batch_size, -1, mel_channels]))
            if p_stop_token_targets is not None:
                tower_stop_token_targets.append(
                    tf.reshape(p_stop_token_targets[i], [batch_size, -1]))
            if p_linear_targets is not None:
                tower_linear_targets.append(
                    tf.reshape(p_linear_targets[i],
                               [batch_size, -1, linear_channels]))

    self.tower_decoder_output = []
    self.tower_alignments = []
    self.tower_stop_token_prediction = []
    self.tower_mel_outputs = []
    self.tower_linear_outputs = []
    self.tower_linear_targets = []

    tower_embedded_inputs = []
    tower_enc_conv_output_shape = []
    tower_encoder_outputs = []
    tower_residual = []
    tower_projected_residual = []

    # 1. Declare GPU Devices
    gpus = [
        "/gpu:{}".format(i)
        for i in range(hp.gpu_start_idx, hp.gpu_start_idx + hp.num_gpus)
    ]

    # Tower-independent configuration checks and flags.
    assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
    if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
        assert global_step is not None

    #GTA is only used for predicting mels to train Wavenet vocoder, so we omit post processing when doing GTA synthesis
    post_condition = hp.predict_linear and not gta

    for i in range(hp.num_gpus):
        with tf.device(
                tf.train.replica_device_setter(ps_tasks=1,
                                               ps_device="/cpu:0",
                                               worker_device=gpus[i])):
            with tf.variable_scope('inference'):
                # Embeddings ==> [batch_size, sequence_length, embedding_dim]
                embedding_table = tf.get_variable(
                    'inputs_embedding',
                    [len(symbols), hp.embedding_dim],
                    dtype=tf.float32)
                embedded_inputs = tf.nn.embedding_lookup(
                    embedding_table, tower_inputs[i])

                #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                encoder_cell = TacotronEncoderCell(
                    EncoderConvolutions(is_training,
                                        hparams=hp,
                                        scope='encoder_convolutions'),
                    EncoderRNN(is_training,
                               size=hp.encoder_lstm_units,
                               zoneout=hp.tacotron_zoneout_rate,
                               scope='encoder_LSTM'))

                encoder_outputs = encoder_cell(embedded_inputs,
                                               tower_input_lengths[i])

                #For shape visualization purpose
                enc_conv_output_shape = encoder_cell.conv_output_shape

                #Decoder Parts
                #Attention Decoder Prenet
                prenet = Prenet(is_training,
                                layers_sizes=hp.prenet_layers,
                                drop_rate=hp.tacotron_dropout_rate,
                                scope='decoder_prenet')
                #Attention Mechanism
                attention_mechanism = LocationSensitiveAttention(
                    hp.attention_dim,
                    encoder_outputs,
                    hparams=hp,
                    mask_encoder=hp.mask_encoder,
                    memory_sequence_length=tower_input_lengths[i],
                    smoothing=hp.smoothing,
                    cumulate_weights=hp.cumulative_weights)
                #Decoder LSTM Cells
                decoder_lstm = DecoderRNN(is_training,
                                          layers=hp.decoder_layers,
                                          size=hp.decoder_lstm_units,
                                          zoneout=hp.tacotron_zoneout_rate,
                                          scope='decoder_lstm')
                #Frames Projection layer
                frame_projection = FrameProjection(
                    hp.num_mels * hp.outputs_per_step,
                    scope='linear_transform')
                #<stop_token> projection layer
                stop_projection = StopProjection(
                    is_training or is_evaluating,
                    shape=hp.outputs_per_step,
                    scope='stop_token_projection')

                #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                decoder_cell = TacotronDecoderCell(prenet,
                                                   attention_mechanism,
                                                   decoder_lstm,
                                                   frame_projection,
                                                   stop_projection)

                #Define the helper for our decoder
                if is_training or is_evaluating or gta:
                    # The argument validation allows GTA mode without
                    # stop-token targets, in which case the per-tower list
                    # is empty; pass None rather than indexing it.
                    # NOTE(review): confirm TacoTrainingHelper accepts None
                    # stop-token targets in GTA mode.
                    tower_stop_targets = (tower_stop_token_targets[i]
                                          if tower_stop_token_targets else
                                          None)
                    self.helper = TacoTrainingHelper(
                        batch_size, tower_mel_targets[i], tower_stop_targets,
                        hp, gta, is_evaluating, global_step)
                else:
                    self.helper = TacoTestHelper(batch_size, hp)

                #initial decoder state
                decoder_init_state = decoder_cell.zero_state(
                    batch_size=batch_size, dtype=tf.float32)

                #Only use max iterations at synthesis time
                max_iters = hp.max_iters if not (
                    is_training or is_evaluating) else None

                #Decode
                (frames_prediction, stop_token_prediction,
                 _), final_decoder_state, _ = dynamic_decode(
                     CustomDecoder(decoder_cell, self.helper,
                                   decoder_init_state),
                     impute_finished=False,
                     maximum_iterations=max_iters,
                     swap_memory=hp.tacotron_swap_with_cpu)

                # Reshape outputs to be one output per entry
                #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                decoder_output = tf.reshape(frames_prediction,
                                            [batch_size, -1, hp.num_mels])
                stop_token_prediction = tf.reshape(stop_token_prediction,
                                                   [batch_size, -1])

                #Postnet
                postnet = Postnet(is_training,
                                  hparams=hp,
                                  scope='postnet_convolutions')

                #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
                residual = postnet(decoder_output)

                #Project residual to same dimension as mel spectrogram
                #==> [batch_size, decoder_steps * r, num_mels]
                residual_projection = FrameProjection(
                    hp.num_mels, scope='postnet_projection')
                projected_residual = residual_projection(residual)

                #Compute the mel spectrogram
                mel_outputs = decoder_output + projected_residual

                if post_condition:
                    #Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
                    #Post-processing Network to map mels to linear spectrograms using same architecture as the encoder
                    post_processing_cell = TacotronEncoderCell(
                        EncoderConvolutions(
                            is_training,
                            hparams=hp,
                            scope='post_processing_convolutions'),
                        EncoderRNN(is_training,
                                   size=hp.encoder_lstm_units,
                                   zoneout=hp.tacotron_zoneout_rate,
                                   scope='post_processing_LSTM'))

                    expand_outputs = post_processing_cell(mel_outputs)
                    linear_outputs = FrameProjection(
                        hp.num_freq,
                        scope='post_processing_projection')(expand_outputs)

                #Grab alignments from the final decoder state
                alignments = tf.transpose(
                    final_decoder_state.alignment_history.stack(), [1, 2, 0])

                self.tower_decoder_output.append(decoder_output)
                self.tower_alignments.append(alignments)
                self.tower_stop_token_prediction.append(
                    stop_token_prediction)
                self.tower_mel_outputs.append(mel_outputs)
                tower_embedded_inputs.append(embedded_inputs)
                tower_enc_conv_output_shape.append(enc_conv_output_shape)
                tower_encoder_outputs.append(encoder_outputs)
                tower_residual.append(residual)
                tower_projected_residual.append(projected_residual)

                if post_condition:
                    self.tower_linear_outputs.append(linear_outputs)
                    # Store this tower's slice of the linear targets so it
                    # pairs with this tower's linear outputs (None when no
                    # linear targets were supplied).
                    self.tower_linear_targets.append(
                        tower_linear_targets[i]
                        if tower_linear_targets else None)
        log('initialiized done {}'.format(gpus[i]))

    if is_training:
        # Only the helper of the last tower built is kept on self.
        self.ratio = self.helper._ratio
    self.tower_inputs = tower_inputs
    self.tower_input_lengths = tower_input_lengths
    self.tower_mel_targets = tower_mel_targets
    self.tower_targets_lengths = tower_targets_lengths
    self.tower_stop_token_targets = tower_stop_token_targets

    log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
    log(' Train mode: {}'.format(is_training))
    log(' Eval mode: {}'.format(is_evaluating))
    log(' GTA mode: {}'.format(gta))
    log(' Synthesis mode: {}'.format(not (is_training or is_evaluating)))
    log(' Input: {}'.format(inputs.shape))
    # The tower lists hold exactly hp.num_gpus entries; the reported device
    # number is offset by gpu_start_idx to match the GPU actually used.
    for i in range(hp.num_gpus):
        log(' device: {}'.format(hp.gpu_start_idx + i))
        log(' embedding: {}'.format(tower_embedded_inputs[i].shape))
        log(' enc conv out: {}'.format(tower_enc_conv_output_shape[i]))
        log(' encoder out: {}'.format(tower_encoder_outputs[i].shape))
        log(' decoder out: {}'.format(self.tower_decoder_output[i].shape))
        log(' residual out: {}'.format(tower_residual[i].shape))
        log(' projected residual out: {}'.format(
            tower_projected_residual[i].shape))
        log(' mel out: {}'.format(self.tower_mel_outputs[i].shape))
        if post_condition:
            log(' linear out: {}'.format(
                self.tower_linear_outputs[i].shape))
        log(' <stop_token> out: {}'.format(
            self.tower_stop_token_prediction[i].shape))
def initialize(self,
               inputs,
               input_lengths,
               mel_targets=None,
               linear_targets=None,
               pml_targets=None,
               is_training=False,
               gta=False):
    """
    Initializes the model for inference.

    Sets "pml_outputs" and "alignments" fields (plus "inputs",
    "input_lengths" and "pml_targets").

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
        lengths of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M]. Accepted for interface
        compatibility; not read by this implementation.
      linear_targets: float32 Tensor with shape [N, T_out, F]. Accepted for interface
        compatibility; not read by this implementation.
      pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is
        number of steps in the PML vocoder features trajectories, P is pml_dimension, and
        values are PML vocoder features. Only needed for training.
      is_training: boolean flag that is set to True during training
      gta: boolean flag that is set to True when ground truth alignment is required; it
        selects the training helper even when not training.
    """
    with tf.variable_scope('inference'):
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), hp.embedding_dim],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(
            embedding_table, inputs)  # [N, T_in, embed_depth=256]

        # Encoder
        encoder_outputs = conv_and_lstm(  # [N, T_in, 2*encoder_gru_units=512]
            embedded_inputs,
            input_lengths,
            conv_layers=hp.encoder_conv_layers,
            conv_width=hp.encoder_conv_width,
            conv_channels=hp.encoder_conv_channels,
            lstm_units_unidirectional=hp.encoder_gru_units,
            is_training=is_training,
            scope='encoder',
        )

        # Attention
        attention_cell = AttentionWrapper(  # [N, T_in, attention_depth=256]
            DecoderPrenetWrapper(LSTMBlockCell(hp.attention_depth),
                                 is_training, hp.prenet_depths),
            LocationSensitiveAttention(hp.attention_depth, encoder_outputs),
            alignment_history=True,
            output_attention=False)

        # Concatenate attention context vector and RNN cell output into a
        # 2*attention_depth=512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(
            attention_cell)  # [N, T_in, 2*attention_depth=512]

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell(
            [
                concat_cell,
                LSTMBlockCell(hp.decoder_gru_units),
                LSTMBlockCell(hp.decoder_gru_units)
            ],
            state_is_tuple=True)  # [N, T_in, decoder_depth=1024]

        # Project onto r PML feature vectors (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(
            decoder_cell, hp.pml_dimension * hp.outputs_per_step)

        if is_training or gta:
            helper = TacoTrainingHelper(inputs, pml_targets, hp.pml_dimension,
                                        hp.outputs_per_step)
        else:
            helper = TacoTestHelper(batch_size, hp.pml_dimension,
                                    hp.outputs_per_step)

        decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                    dtype=tf.float32)
        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

        # Reshape outputs to be one output per entry
        pml_outputs = tf.reshape(
            decoder_outputs,
            [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

        # Grab alignments from the final decoder state.  The attention
        # wrapper lives in the first layer of the MultiRNNCell, hence [0].
        alignments = tf.transpose(
            final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.pml_outputs = pml_outputs
        self.alignments = alignments
        self.pml_targets = pml_targets

        log('Initialized Tacotron model. Dimensions: ')
        log(' Train mode: {}'.format(is_training))
        # Report the gta flag itself (this line used to print is_training).
        log(' GTA mode: {}'.format(gta))
        log(' Embedding: {}'.format(embedded_inputs.shape[-1]))
        log(' Encoder out: {}'.format(encoder_outputs.shape[-1]))
        log(' Attention out: {}'.format(attention_cell.output_size))
        log(' Concat attn & out: {}'.format(concat_cell.output_size))
        log(' Decoder cell out: {}'.format(decoder_cell.output_size))
        log(' Decoder out ({} frames): {}'.format(hp.outputs_per_step,
                                                  decoder_outputs.shape[-1]))
        log(' PML out: {}'.format(pml_outputs.shape[-1]))
def initialize(self,
               inputs,
               input_lengths,
               mel_targets=None,
               mel_lengths=None,
               stop_token_targets=None,
               linear_targets=None,
               gta=False,
               reference_mel=None):
    """
    Builds the Tacotron graph (optionally with a VAE style branch) under the
    'inference' variable scope.

    Training mode is inferred from the arguments: the model is training
    when mel_targets is given and gta is off.

    Args:
        - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
          steps in the input time series, and values are character IDs
        - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
          lengths of each sequence in inputs.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
          number of steps in the output time series, M is num_mels, and values are entries in
          the mel spectrogram. Only needed for training.
        - mel_lengths: lengths handed to the VAE as its input_lengths.
        - stop_token_targets: stop-token targets handed to the training helper.
        - linear_targets: linear spectrogram targets; required when
          hp.predict_linear is set and gta is off.
        - gta: ground-truth-aligned mode; disables linear post processing.
        - reference_mel: mel fed to the VAE when hp.use_vae is set; replaced
          by mel_targets while training.

    Raises:
        ValueError: when the combination of targets and flags is inconsistent.
    """
    mels_given = mel_targets is not None
    stops_given = stop_token_targets is not None

    if stops_given and not mels_given:
        raise ValueError(
            'no mel targets were provided but token_targets were given')
    if mels_given and not stops_given and not gta:
        raise ValueError(
            'Mel targets are provided without corresponding token_targets')
    if gta == False and self._hparams.predict_linear == True:
        if linear_targets is None:
            raise ValueError(
                'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
            )
    if gta and linear_targets is not None:
        raise ValueError(
            'Linear spectrogram prediction is not supported in GTA mode!')

    with tf.variable_scope('inference'):
        is_training = mel_targets is not None and not gta
        n_batch = tf.shape(inputs)[0]
        hp = self._hparams

        #GTA is only used for predicting mels to train Wavenet vocoder, so post processing is omitted when doing GTA synthesis
        post_condition = hp.predict_linear and not gta

        # Embeddings ==> [batch_size, sequence_length, embedding_dim]
        char_table = tf.get_variable('inputs_embedding',
                                     [len(symbols), hp.embedding_dim],
                                     dtype=tf.float32)
        char_embeddings = tf.nn.embedding_lookup(char_table, inputs)

        #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
        enc_convolutions = EncoderConvolutions(
            is_training,
            kernel_size=hp.enc_conv_kernel_size,
            channels=hp.enc_conv_channels,
            scope='encoder_convolutions')
        enc_rnn = EncoderRNN(is_training,
                             size=hp.encoder_lstm_units,
                             zoneout=hp.tacotron_zoneout_rate,
                             scope='encoder_LSTM')
        encoder_cell = TacotronEncoderCell(enc_convolutions, enc_rnn)
        encoder_outputs = encoder_cell(char_embeddings, input_lengths)

        if hp.use_vae:
            # While training, the style reference is the target mel itself.
            if is_training:
                reference_mel = mel_targets

            style_code, self.mu, self.log_var = VAE(
                inputs=reference_mel,
                input_lengths=mel_lengths,
                filters=hp.filters,
                kernel_size=(3, 3),
                strides=(2, 2),
                num_units=hp.vae_dim,
                is_training=is_training,
                scope='vae')

            # Project the style code, add a time axis and repeat it over
            # the encoder steps so it can be summed with the encoder output.
            style_projected = tf.layers.dense(style_code, hp.encoder_depth)
            style_expanded = tf.expand_dims(style_projected, axis=1)
            encoder_steps = shape_list(encoder_outputs)[1]
            style_tiled = tf.tile(style_expanded,
                                  [1, encoder_steps, 1])  # [N, T_in, 256]
            encoder_outputs = encoder_outputs + style_tiled

        #For shape visualization purpose
        enc_conv_shape = encoder_cell.conv_output_shape

        #Decoder Parts
        #Attention Decoder Prenet
        decoder_prenet = Prenet(is_training,
                                layer_sizes=hp.prenet_layers,
                                scope='decoder_prenet')
        #Attention Mechanism
        attention = LocationSensitiveAttention(
            hp.attention_dim,
            encoder_outputs,
            mask_encoder=hp.mask_encoder,
            memory_sequence_length=input_lengths,
            smoothing=hp.smoothing,
            cumulate_weights=hp.cumulative_weights)
        #Decoder LSTM Cells
        decoder_rnn = DecoderRNN(is_training,
                                 layers=hp.decoder_layers,
                                 size=hp.decoder_lstm_units,
                                 zoneout=hp.tacotron_zoneout_rate,
                                 scope='decoder_lstm')
        #Frames Projection layer
        frames_layer = FrameProjection(hp.num_mels * hp.outputs_per_step,
                                       scope='linear_transform')
        #<stop_token> projection layer
        stop_layer = StopProjection(is_training,
                                    scope='stop_token_projection')

        #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
        cell = TacotronDecoderCell(decoder_prenet,
                                   attention,
                                   decoder_rnn,
                                   frames_layer,
                                   stop_layer,
                                   mask_finished=hp.mask_finished)

        #Define the helper for our decoder
        use_training_helper = (is_training or gta) == True
        if use_training_helper:
            self.helper = TacoTrainingHelper(
                n_batch, mel_targets, stop_token_targets, hp.num_mels,
                hp.outputs_per_step, hp.tacotron_teacher_forcing_ratio, gta)
        else:
            self.helper = TacoTestHelper(n_batch, hp.num_mels,
                                         hp.outputs_per_step)

        #initial decoder state
        initial_state = cell.zero_state(batch_size=n_batch, dtype=tf.float32)

        #Only use max iterations at synthesis time
        iteration_cap = None if is_training else hp.max_iters

        #Decode
        decode_result = dynamic_decode(
            CustomDecoder(cell, self.helper, initial_state),
            impute_finished=hp.impute_finished,
            maximum_iterations=iteration_cap)
        (raw_frames, raw_stop_tokens, _), final_state, _ = decode_result

        # Reshape outputs to be one output per entry
        #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
        decoder_output = tf.reshape(raw_frames, [n_batch, -1, hp.num_mels])
        stop_token_prediction = tf.reshape(raw_stop_tokens, [n_batch, -1])

        #Postnet
        post_net = Postnet(is_training,
                           kernel_size=hp.postnet_kernel_size,
                           channels=hp.postnet_channels,
                           scope='postnet_convolutions')

        #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
        residual = post_net(decoder_output)

        #Project residual to same dimension as mel spectrogram
        #==> [batch_size, decoder_steps * r, num_mels]
        to_mels = FrameProjection(hp.num_mels, scope='postnet_projection')
        projected_residual = to_mels(residual)

        #Compute the mel spectrogram
        mel_outputs = decoder_output + projected_residual

        if post_condition:
            #Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
            #Post-processing Network to map mels to linear spectrograms using same architecture as the encoder
            post_convolutions = EncoderConvolutions(
                is_training,
                kernel_size=hp.enc_conv_kernel_size,
                channels=hp.enc_conv_channels,
                scope='post_processing_convolutions')
            post_rnn = EncoderRNN(is_training,
                                  size=hp.encoder_lstm_units,
                                  zoneout=hp.tacotron_zoneout_rate,
                                  scope='post_processing_LSTM')
            post_processing_cell = TacotronEncoderCell(post_convolutions,
                                                       post_rnn)

            expanded = post_processing_cell(mel_outputs)
            to_linear = FrameProjection(hp.num_freq,
                                        scope='post_processing_projection')
            linear_outputs = to_linear(expanded)

        #Grab alignments from the final decoder state
        alignments = tf.transpose(final_state.alignment_history.stack(),
                                  [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.decoder_output = decoder_output
        self.alignments = alignments
        self.stop_token_prediction = stop_token_prediction
        self.stop_token_targets = stop_token_targets
        self.mel_outputs = mel_outputs
        self.reference_mel = reference_mel
        if post_condition:
            self.linear_outputs = linear_outputs
            self.linear_targets = linear_targets
        self.mel_targets = mel_targets
        self.mel_lengths = mel_lengths

        # Construction summary, one (label, value) pair per line.
        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        shape_report = [
            (' embedding: {}', char_embeddings.shape),
            (' enc conv out: {}', enc_conv_shape),
            (' encoder out: {}', encoder_outputs.shape),
            (' decoder out: {}', decoder_output.shape),
            (' residual out: {}', residual.shape),
            (' projected residual out: {}', projected_residual.shape),
            (' mel out: {}', mel_outputs.shape),
        ]
        if post_condition:
            shape_report.append((' linear out: {}', linear_outputs.shape))
        shape_report.append(
            (' <stop_token> out: {}', stop_token_prediction.shape))
        for template, value in shape_report:
            log(template.format(value))
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None,
               pml_targets=None, is_training=False, gta=False, eal=False,
               locked_alignments=None, logs_enabled=True, flag_trainAlign=False,
               flag_trainJoint=False, alignScale=1.0, flag_online_eal_eval=False):
    '''Initializes the model for inference.

    Sets the "pml_intermediates", "pml_outputs" and "alignments" fields (plus
    "pml_intermediates_eal" / "pml_outputs_eal" when the two-pass "online EAL"
    graph is built).

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: unused by this variant; kept for interface compatibility.
      linear_targets: unused by this variant; kept for interface compatibility.
      pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is number of
        steps in the PML vocoder features trajectories, P is pml_dimension, and values are PML vocoder
        features. Only needed for training.
      is_training: boolean flag that is set to True during training
      gta: boolean flag that is set to True when ground truth alignment is required
      eal: boolean flag selecting the EAL training helper. When it is set and no
        locked_alignments are given, the two-pass online EAL graph is built.
      locked_alignments: when explicit attention alignment is required, the locked alignments are
        passed in this parameter and the attention alignments are locked to these values
      logs_enabled: boolean flag that defaults to True, if False no construction logs output
      flag_trainAlign: stored on self and forwarded to LockableAttentionWrapper.
      flag_trainJoint: stored on self and forwarded to LockableAttentionWrapper.
      alignScale: stored on self as "alignScale"; not otherwise used here.
      flag_online_eal_eval: forces the two-pass graph; both passes then share the
        same helper and the second pass runs with unlocked attention.

    Raises:
      ValueError: if is_training is set but none of gta, eal or
        hp.scheduled_sampling selects a training helper.
    '''
    hp = self._hparams

    # Fail fast: without one of these modes no decoder helper can be chosen for
    # training, so reject the configuration before any graph is built.
    if is_training and not (gta or eal or hp.scheduled_sampling):
        raise ValueError(
            'For training, one of these should be true: gta, eal, hp.scheduled_sampling')

    self.flag_trainAlign = flag_trainAlign
    self.flag_trainJoint = flag_trainJoint
    self.alignScale = alignScale
    self.flag_online_eal = (
        eal and (locked_alignments is None)) or flag_online_eal_eval

    # Fix the alignments shape to (batch_size, encoder_steps, decoder_steps) if it
    # does not already include the batch dimension. Alignments passed while
    # training in EAL mode are used untouched.
    locked_alignments_ = locked_alignments
    if (locked_alignments_ is not None
            and not (is_training and eal)
            and np.ndim(locked_alignments_) < 3):
        locked_alignments_ = np.expand_dims(locked_alignments_, 0)

    with tf.variable_scope('inference'):
        batch_size = tf.shape(inputs)[0]

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), hp.embed_depth],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(
            embedding_table, inputs)  # [N, T_in, embed_depth=256]

        # Encoder
        prenet_outputs = prenet(
            embedded_inputs, is_training,
            hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
        encoder_outputs = encoder_cbhg(
            prenet_outputs, input_lengths, is_training,
            hp.encoder_depth)  # [N, T_in, encoder_depth=256]

        # Attention
        attention_cell = LockableAttentionWrapper(
            GRUCell(hp.attention_depth),
            LocationSensitiveAttention(hp.attention_depth, encoder_outputs),
            alignment_history=True,
            locked_alignments=locked_alignments_,
            output_attention=False,
            name='attention_wrapper',
            flag_trainAlign=self.flag_trainAlign,
            flag_trainJoint=self.flag_trainJoint
        )  # [N, T_in, attention_depth=256]

        # Apply prenet before concatenation in AttentionWrapper.
        prenet_cell = DecoderPrenetWrapper(attention_cell, is_training,
                                           hp.prenet_depths)

        # Concatenate attention context vector and RNN cell output into a
        # 2*attention_depth=512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(
            prenet_cell)  # [N, T_in, 2*attention_depth=512]

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell(
            [
                OutputProjectionWrapper(concat_cell, hp.decoder_depth),
                ResidualWrapper(GRUCell(hp.decoder_depth)),
                ResidualWrapper(GRUCell(hp.decoder_depth))
            ],
            state_is_tuple=True)  # [N, T_in, decoder_depth=256]

        # Project onto r PML feature vectors (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(
            decoder_cell, hp.pml_dimension * hp.outputs_per_step)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                    dtype=tf.float32)

        # Pick the decoder helper(s) for the requested mode.
        if gta:
            helper = TacoTrainingHelper(inputs, pml_targets,
                                        hp.pml_dimension,
                                        hp.outputs_per_step)
        elif eal:
            if self.flag_online_eal:
                # Two passes: the first pass uses the plain training helper, the
                # second one uses the EAL helper.
                helper_gta = TacoTrainingHelper(inputs, pml_targets,
                                                hp.pml_dimension,
                                                hp.outputs_per_step)
                helper_eal = TacoTrainingHelper_EAL(
                    inputs, pml_targets, hp.pml_dimension,
                    hp.outputs_per_step)
            else:
                helper = TacoTrainingHelper_EAL(inputs, pml_targets,
                                                hp.pml_dimension,
                                                hp.outputs_per_step)
        elif hp.scheduled_sampling:
            helper = TacoScheduledOutputTrainingHelper(
                inputs, pml_targets, hp.pml_dimension,
                hp.outputs_per_step, hp.scheduled_sampling_probability)
        else:
            # Training without a mode was rejected above, so this is synthesis.
            helper = TacoTestHelper(batch_size, hp.pml_dimension,
                                    hp.outputs_per_step)

        if flag_online_eal_eval:
            helper_gta = helper
            helper_eal = helper

        # First (or only) decoding pass. In online EAL mode it is driven by the
        # GTA helper and only serves to produce the alignments to lock onto.
        first_pass_helper = helper_gta if self.flag_online_eal else helper
        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, first_pass_helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

        # Reshape outputs to be one output per entry
        pml_intermediates = tf.reshape(
            decoder_outputs,
            [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

        # Add post-processing CBHG:
        post_outputs = post_cbhg(
            pml_intermediates, hp.pml_dimension, is_training,
            hp.postnet_depth)  # [N, T_out, postnet_depth=256]
        pml_outputs = tf.layers.dense(
            post_outputs, hp.pml_dimension)  # [N, T_out, P]

        # Grab alignments from the final decoder state:
        first_pass_alignments = tf.transpose(
            final_decoder_state[0].alignment_history.stack(), [1, 2, 0])
        if self.flag_online_eal:
            locked_alignments_ = first_pass_alignments
        else:
            alignments = first_pass_alignments

    with tf.variable_scope('inference_eal'):
        if self.flag_online_eal:
            # Embeddings
            embedding_table_eal = tf.get_variable(
                'embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs_eal = tf.nn.embedding_lookup(
                embedding_table_eal, inputs)  # [N, T_in, embed_depth=256]

            # Encoder
            prenet_outputs_eal = prenet(
                embedded_inputs_eal, is_training,
                hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
            encoder_outputs_eal = encoder_cbhg(
                prenet_outputs_eal, input_lengths, is_training,
                hp.encoder_depth)  # [N, T_in, encoder_depth=256]

            # Attention: in online-EAL evaluation the second pass is not locked.
            if flag_online_eal_eval:
                locked_alignments_ = None
            attention_cell_eal = LockableAttentionWrapper(
                GRUCell(hp.attention_depth),
                LocationSensitiveAttention(hp.attention_depth,
                                           encoder_outputs_eal),
                alignment_history=True,
                locked_alignments=locked_alignments_,
                output_attention=False,
                name='attention_wrapper',
                flag_trainAlign=self.flag_trainAlign,
                flag_trainJoint=self.flag_trainJoint
            )  # [N, T_in, attention_depth=256]

            # Apply prenet before concatenation in AttentionWrapper.
            prenet_cell_eal = DecoderPrenetWrapper(attention_cell_eal,
                                                   is_training,
                                                   hp.prenet_depths)

            # Concatenate attention context vector and RNN cell output into a
            # 2*attention_depth=512D vector.
            concat_cell_eal = ConcatOutputAndAttentionWrapper(
                prenet_cell_eal)  # [N, T_in, 2*attention_depth=512]

            # Decoder (layers specified bottom to top):
            decoder_cell_eal = MultiRNNCell(
                [
                    OutputProjectionWrapper(concat_cell_eal,
                                            hp.decoder_depth),
                    ResidualWrapper(GRUCell(hp.decoder_depth)),
                    ResidualWrapper(GRUCell(hp.decoder_depth))
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Project onto r PML feature vectors (predict r outputs at each RNN step):
            output_cell_eal = OutputProjectionWrapper(
                decoder_cell_eal, hp.pml_dimension * hp.outputs_per_step)
            # The initial state must come from the cell that is actually decoded
            # (the original took it from the first-pass "output_cell").
            decoder_init_state_eal = output_cell_eal.zero_state(
                batch_size=batch_size, dtype=tf.float32)

            (decoder_outputs_eal, _), final_decoder_state_eal, _ = tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(output_cell_eal, helper_eal,
                             decoder_init_state_eal),
                maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

            # Reshape outputs to be one output per entry
            pml_intermediates_eal = tf.reshape(
                decoder_outputs_eal,
                [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

            # Add post-processing CBHG:
            post_outputs_eal = post_cbhg(
                pml_intermediates_eal, hp.pml_dimension, is_training,
                hp.postnet_depth)  # [N, T_out, postnet_depth=256]
            pml_outputs_eal = tf.layers.dense(
                post_outputs_eal, hp.pml_dimension)  # [N, T_out, P]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state_eal[0].alignment_history.stack(),
                [1, 2, 0])

            self.pml_intermediates_eal = pml_intermediates_eal
            self.pml_outputs_eal = pml_outputs_eal

    with tf.variable_scope('inference'):
        self.inputs = inputs
        self.input_lengths = input_lengths
        self.pml_intermediates = pml_intermediates
        self.pml_outputs = pml_outputs
        self.alignments = alignments
        self.pml_targets = pml_targets
        self.attention_cell = attention_cell
        self.locked_alignments = locked_alignments_

        if logs_enabled:
            log('Initialized Tacotron model. Dimensions: ')
            log(' Train mode: {}'.format(is_training))
            log(' GTA mode: {}'.format(gta))
            log(' EAL mode: {}'.format(eal))
            log(' Embedding: {}'.format(embedded_inputs.shape[-1]))
            log(' Prenet out: {}'.format(prenet_outputs.shape[-1]))
            log(' Encoder out: {}'.format(encoder_outputs.shape[-1]))
            log(' Attention out: {}'.format(attention_cell.output_size))
            log(' Concat attn & out: {}'.format(concat_cell.output_size))
            log(' Decoder cell out: {}'.format(decoder_cell.output_size))
            log(' Decoder out ({} frames): {}'.format(
                hp.outputs_per_step, decoder_outputs.shape[-1]))
            log(' Decoder out (1 frame): {}'.format(
                pml_intermediates.shape[-1]))
            log(' Postnet out: {}'.format(post_outputs.shape[-1]))
            log(' PML out: {}'.format(pml_outputs.shape[-1]))
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, gta=False):
    """
    Build the Tacotron inference graph and publish its tensors on the model.

    Populates, among others, the "mel_outputs" and "alignments" fields.

    Args:
        - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
          steps in the input time series, and values are character IDs
        - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
          of each sequence in inputs.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
          of steps in the output time series, M is num_mels, and values are entries in the mel
          spectrogram. Only needed for training.
        - stop_token_targets: stop-token targets handed to the training helper; required
          together with mel_targets unless gta is set.
        - gta: when set, the training helper is used even though the model is not
          considered to be training.

    Raises:
        ValueError: if stop-token targets come without mel targets, or mel targets
          come without stop-token targets outside GTA mode.
    """
    mels_given = mel_targets is not None
    stop_tokens_given = stop_token_targets is not None

    if stop_tokens_given and not mels_given:
        raise ValueError(
            'no mel targets were provided but token_targets were given')
    if mels_given and not stop_tokens_given and not gta:
        raise ValueError(
            'Mel targets are provided without corresponding token_targets')

    with tf.variable_scope('inference'):
        # Training means mel targets are available and we are not in GTA mode.
        is_training = mels_given and not gta
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Character embeddings ==> [batch_size, sequence_length, embedding_dim]
        char_embeddings = tf.get_variable(
            'inputs_embedding', [len(symbols), hp.embedding_dim],
            dtype=tf.float32)
        embedded_inputs = tf.nn.embedding_lookup(char_embeddings, inputs)

        # Encoder ==> [batch_size, encoder_steps, encoder_lstm_units]
        encoder_convs = EncoderConvolutions(
            is_training,
            kernel_size=hp.enc_conv_kernel_size,
            channels=hp.enc_conv_channels,
            scope='encoder_convolutions')
        encoder_rnn = EncoderRNN(
            is_training,
            size=hp.encoder_lstm_units,
            zoneout=hp.tacotron_zoneout_rate,
            scope='encoder_LSTM')
        encoder_cell = TacotronEncoderCell(encoder_convs, encoder_rnn)
        encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

        # Kept only so the shape can be logged below.
        enc_conv_output_shape = encoder_cell.conv_output_shape

        # Decoder building blocks: prenet, attention, LSTM stack and the two
        # projections (frames and <stop_token>).
        prenet = Prenet(is_training,
                        layer_sizes=hp.prenet_layers,
                        scope='decoder_prenet')
        attention_mechanism = LocationSensitiveAttention(
            hp.attention_dim,
            encoder_outputs,
            mask_encoder=hp.mask_encoder,
            memory_sequence_length=input_lengths,
            smoothing=hp.smoothing)
        decoder_lstm = DecoderRNN(is_training,
                                  layers=hp.decoder_layers,
                                  size=hp.decoder_lstm_units,
                                  zoneout=hp.tacotron_zoneout_rate,
                                  scope='decoder_lstm')
        frame_projection = FrameProjection(
            hp.num_mels * hp.outputs_per_step, scope='linear_transform')
        stop_projection = StopProjection(is_training,
                                         scope='stop_token_projection')

        # Decoder cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
        decoder_cell = TacotronDecoderCell(prenet,
                                           attention_mechanism,
                                           decoder_lstm,
                                           frame_projection,
                                           stop_projection,
                                           mask_finished=hp.mask_finished)

        # Training and GTA feed targets through the training helper; plain
        # synthesis uses the test helper.
        feed_targets = (is_training or gta) == True
        if feed_targets:
            self.helper = TacoTrainingHelper(
                batch_size, mel_targets, stop_token_targets, hp.num_mels,
                hp.outputs_per_step, hp.tacotron_teacher_forcing_ratio, gta)
        else:
            self.helper = TacoTestHelper(batch_size, hp.num_mels,
                                         hp.outputs_per_step)

        # Initial decoder state
        decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                     dtype=tf.float32)

        # The iteration cap only applies outside training.
        if is_training:
            max_iters = None
        else:
            max_iters = hp.max_iters

        # Decode
        decoder = CustomDecoder(decoder_cell, self.helper, decoder_init_state)
        decode_result = dynamic_decode(decoder,
                                       impute_finished=hp.impute_finished,
                                       maximum_iterations=max_iters)
        (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = decode_result

        # One output frame per entry
        # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
        decoder_output = tf.reshape(frames_prediction,
                                    [batch_size, -1, hp.num_mels])
        stop_token_prediction = tf.reshape(stop_token_prediction,
                                           [batch_size, -1])

        # Postnet residual ==> [batch_size, decoder_steps * r, postnet_channels]
        postnet = Postnet(is_training,
                          kernel_size=hp.postnet_kernel_size,
                          channels=hp.postnet_channels,
                          scope='postnet_convolutions')
        residual = postnet(decoder_output)

        # Project the residual back to mel dimension
        # ==> [batch_size, decoder_steps * r, num_mels]
        residual_projection = FrameProjection(hp.num_mels,
                                              scope='postnet_projection')
        projected_residual = residual_projection(residual)

        # Final mel spectrogram
        mel_outputs = decoder_output + projected_residual

        # Alignments come from the final decoder state.
        alignment_history = final_decoder_state.alignment_history.stack()
        alignments = tf.transpose(alignment_history, [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.decoder_output = decoder_output
        self.alignments = alignments
        self.stop_token_prediction = stop_token_prediction
        self.stop_token_targets = stop_token_targets
        self.mel_outputs = mel_outputs
        self.mel_targets = mel_targets

        log('Initialized Tacotron model. Dimensions: ')
        shape_report = [
            (' embedding: {}', embedded_inputs.shape),
            (' enc conv out: {}', enc_conv_output_shape),
            (' encoder out: {}', encoder_outputs.shape),
            (' decoder out: {}', decoder_output.shape),
            (' residual out: {}', residual.shape),
            (' projected residual out: {}', projected_residual.shape),
            (' mel out: {}', mel_outputs.shape),
            (' <stop_token> out: {}', stop_token_prediction.shape),
        ]
        for template, value in shape_report:
            log(template.format(value))
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None,
               targets_lengths=None, global_step=None, is_training=False, split_infos=None):
    """
    Initializes the model for inference, building one tower per GPU.

    Sets the "tower_mel_outputs" and "tower_alignments" fields (lists with one
    entry per tower), along with the other "tower_*" fields.

    Args:
        - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
          steps in the input time series, and values are character IDs
        - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
          of each sequence in inputs.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
          of steps in the output time series, M is num_mels, and values are entries in the mel
          spectrogram. Only needed for training.
        - stop_token_targets: stop-token targets; must be given together with mel_targets.
        - targets_lengths: lengths of the targets; required when training with
          hp.mask_decoder set. Split across towers when given.
        - global_step: forwarded to the training helper; must not be None when training
          with the 'scheduled' teacher forcing mode.
        - is_training: boolean flag that is set to True during training
        - split_infos: indexable as split_infos[:, k]; columns 0, 1 and 2 are passed to
          split_func together with inputs, mel_targets and stop_token_targets
          respectively. Required.

    Raises:
        ValueError: if only one of mel_targets / stop_token_targets is given, or if
          split_infos is missing.
        RuntimeError: if training with decoder masking but without targets_lengths.
    """
    if mel_targets is None and stop_token_targets is not None:
        raise ValueError(
            'no mel targets were provided but token_targets were given')
    if mel_targets is not None and stop_token_targets is None:
        raise ValueError(
            'Mel targets are provided without corresponding token_targets')
    if is_training and self._hparams.mask_decoder and targets_lengths is None:
        raise RuntimeError(
            'Model set to mask paddings but no targets lengths provided for the mask!'
        )
    # split_infos is subscripted unconditionally below, so the None default can
    # never work; report that clearly instead of failing with a TypeError.
    if split_infos is None:
        raise ValueError(
            'split_infos must be provided to split the batch across towers')

    split_device = '/cpu:0' if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:0'
    with tf.device(split_device):
        hp = self._hparams
        lout_int = [tf.int32] * hp.tacotron_num_gpus
        lout_float = [tf.float32] * hp.tacotron_num_gpus

        tower_input_lengths = tf.split(
            input_lengths,
            num_or_size_splits=hp.tacotron_num_gpus,
            axis=0)
        tower_targets_lengths = tf.split(
            targets_lengths,
            num_or_size_splits=hp.tacotron_num_gpus,
            axis=0) if targets_lengths is not None else targets_lengths

        # Each py_func returns one piece per tower; optional targets stay None.
        p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]],
                              lout_int)
        p_mel_targets = tf.py_func(
            split_func, [mel_targets, split_infos[:, 1]],
            lout_float) if mel_targets is not None else mel_targets
        p_stop_token_targets = tf.py_func(
            split_func, [stop_token_targets, split_infos[:, 2]],
            lout_float
        ) if stop_token_targets is not None else stop_token_targets

        tower_inputs = []
        tower_mel_targets = []
        tower_stop_token_targets = []

        batch_size = tf.shape(inputs)[0]
        mel_channels = hp.num_mels
        for i in range(hp.tacotron_num_gpus):
            tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
            if p_mel_targets is not None:
                tower_mel_targets.append(
                    tf.reshape(p_mel_targets[i],
                               [batch_size, -1, mel_channels]))
            if p_stop_token_targets is not None:
                tower_stop_token_targets.append(
                    tf.reshape(p_stop_token_targets[i], [batch_size, -1]))

    # Range used to clip outputs when hp.clip_outputs is set.
    T2_output_range = (-hp.max_abs_value,
                       hp.max_abs_value) if hp.symmetric_mels else (
                           0, hp.max_abs_value)

    self.tower_decoder_output = []
    self.tower_alignments = []
    self.tower_stop_token_prediction = []
    self.tower_mel_outputs = []

    # Per-tower tensors kept only for the shape log at the end.
    tower_embedded_inputs = []
    tower_enc_conv_output_shape = []
    tower_encoder_outputs = []
    tower_residual = []
    tower_projected_residual = []

    # 1. Declare GPU Devices
    gpus = ["/gpu:{}".format(i) for i in range(hp.tacotron_num_gpus)]
    for i in range(hp.tacotron_num_gpus):
        with tf.device(
                tf.train.replica_device_setter(ps_tasks=1,
                                               ps_device="/cpu:0",
                                               worker_device=gpus[i])):
            with tf.variable_scope('inference'):
                assert hp.tacotron_teacher_forcing_mode in ('constant',
                                                            'scheduled')
                if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                    assert global_step is not None

                # Embeddings ==> [batch_size, sequence_length, embedding_dim]
                self.embedding_table = tf.get_variable(
                    'inputs_embedding', [len(symbols), hp.embedding_dim],
                    dtype=tf.float32)
                embedded_inputs = tf.nn.embedding_lookup(
                    self.embedding_table, tower_inputs[i])

                #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                encoder_cell = TacotronEncoderCell(
                    EncoderConvolutions(is_training,
                                        hparams=hp,
                                        scope='encoder_convolutions'),
                    EncoderRNN(is_training,
                               size=hp.encoder_lstm_units,
                               zoneout=hp.tacotron_zoneout_rate,
                               scope='encoder_LSTM'))

                encoder_outputs = encoder_cell(embedded_inputs,
                                               tower_input_lengths[i])

                #For shape visualization purpose
                enc_conv_output_shape = encoder_cell.conv_output_shape

                #Decoder Parts
                #Attention Decoder Prenet
                prenet = Prenet(is_training,
                                layers_sizes=hp.prenet_layers,
                                drop_rate=hp.tacotron_dropout_rate,
                                scope='decoder_prenet')
                #Attention Mechanism
                attention_mechanism = LocationSensitiveAttention(
                    hp.attention_dim,
                    encoder_outputs,
                    hparams=hp,
                    is_training=is_training,
                    mask_encoder=hp.mask_encoder,
                    memory_sequence_length=tf.reshape(
                        tower_input_lengths[i], [-1]),
                    smoothing=hp.smoothing,
                    cumulate_weights=hp.cumulative_weights)
                #Decoder LSTM Cells
                decoder_lstm = DecoderRNN(is_training,
                                          layers=hp.decoder_layers,
                                          size=hp.decoder_lstm_units,
                                          zoneout=hp.tacotron_zoneout_rate,
                                          scope='decoder_LSTM')
                #Frames Projection layer
                frame_projection = FrameProjection(
                    hp.num_mels * hp.outputs_per_step,
                    scope='linear_transform_projection')
                #<stop_token> projection layer
                stop_projection = StopProjection(
                    is_training,
                    shape=hp.outputs_per_step,
                    scope='stop_token_projection')

                #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                decoder_cell = TacotronDecoderCell(prenet,
                                                   attention_mechanism,
                                                   decoder_lstm,
                                                   frame_projection,
                                                   stop_projection)

                #Define the helper for our decoder
                if is_training:
                    self.helper = TacoTrainingHelper(
                        batch_size, tower_mel_targets[i], hp, global_step)
                else:
                    self.helper = TacoTestHelper(batch_size, hp)

                #initial decoder state
                decoder_init_state = decoder_cell.zero_state(
                    batch_size=batch_size, dtype=tf.float32)

                #Only use max iterations at synthesis time
                max_iters = hp.max_iters if not (is_training) else None

                #Decode
                (frames_prediction, stop_token_prediction,
                 _), final_decoder_state, _ = dynamic_decode(
                     CustomDecoder(decoder_cell, self.helper,
                                   decoder_init_state),
                     impute_finished=False,
                     maximum_iterations=max_iters,
                     swap_memory=hp.tacotron_swap_with_cpu)

                # Reshape outputs to be one output per entry
                #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                decoder_output = tf.reshape(frames_prediction,
                                            [batch_size, -1, hp.num_mels])
                stop_token_prediction = tf.reshape(stop_token_prediction,
                                                   [batch_size, -1])

                if hp.clip_outputs:
                    decoder_output = tf.minimum(
                        tf.maximum(
                            decoder_output,
                            T2_output_range[0] - hp.lower_bound_decay),
                        T2_output_range[1])

                #Postnet
                postnet = Postnet(is_training,
                                  hparams=hp,
                                  scope='postnet_convolutions')

                #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
                residual = postnet(decoder_output)

                #Project residual to same dimension as mel spectrogram
                #==> [batch_size, decoder_steps * r, num_mels]
                residual_projection = FrameProjection(
                    hp.num_mels, scope='postnet_projection')
                projected_residual = residual_projection(residual)

                #Compute the mel spectrogram
                mel_outputs = decoder_output + projected_residual

                if hp.clip_outputs:
                    mel_outputs = tf.minimum(
                        tf.maximum(
                            mel_outputs,
                            T2_output_range[0] - hp.lower_bound_decay),
                        T2_output_range[1])

                #Grab alignments from the final decoder state
                alignments = tf.transpose(
                    final_decoder_state.alignment_history.stack(),
                    [1, 2, 0])

                self.tower_decoder_output.append(decoder_output)
                self.tower_alignments.append(alignments)
                self.tower_stop_token_prediction.append(
                    stop_token_prediction)
                self.tower_mel_outputs.append(mel_outputs)
                tower_embedded_inputs.append(embedded_inputs)
                tower_enc_conv_output_shape.append(enc_conv_output_shape)
                tower_encoder_outputs.append(encoder_outputs)
                tower_residual.append(residual)
                tower_projected_residual.append(projected_residual)

        log('initialisation done {}'.format(gpus[i]))

    if is_training:
        # self.helper was overwritten per tower; this reads the last tower's helper.
        self.ratio = self.helper._ratio
    self.tower_inputs = tower_inputs
    self.tower_input_lengths = tower_input_lengths
    self.tower_mel_targets = tower_mel_targets
    self.tower_targets_lengths = tower_targets_lengths
    self.tower_stop_token_targets = tower_stop_token_targets

    self.all_vars = tf.trainable_variables()

    log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
    log(' Train mode: {}'.format(is_training))
    log(' Input: {}'.format(inputs.shape))
    for i in range(hp.tacotron_num_gpus):
        log(' device: {}'.format(i))
        log(' embedding: {}'.format(tower_embedded_inputs[i].shape))
        log(' enc conv out: {}'.format(tower_enc_conv_output_shape[i]))
        log(' encoder out: {}'.format(tower_encoder_outputs[i].shape))
        log(' decoder out: {}'.format(self.tower_decoder_output[i].shape))
        log(' residual out: {}'.format(tower_residual[i].shape))
        log(' projected residual out: {}'.format(
            tower_projected_residual[i].shape))
        log(' mel out: {}'.format(self.tower_mel_outputs[i].shape))
        log(' <stop_token> out: {}'.format(
            self.tower_stop_token_prediction[i].shape))

        #1_000_000 is causing syntax problems for some people?! Python please :)
        log(' Tacotron Parameters {:.3f} Million.'.format(
            np.sum([np.prod(v.get_shape().as_list())
                    for v in self.all_vars]) / 1000000))
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None,
               linear_targets=None, targets_lengths=None, gta=False, global_step=None,
               is_training=False, is_evaluating=False, reference_mel=None):
    """
    Initializes the model for inference.

    Sets the "mel_outputs", "alignments" and "style_embeddings" fields (and
    "linear_outputs" when linear post-processing is enabled).

    Args:
        - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
          steps in the input time series, and values are character IDs
        - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
          of each sequence in inputs.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
          of steps in the output time series, M is num_mels, and values are entries in the mel
          spectrogram. Only needed for training.
        - stop_token_targets: stop-token targets; required with mel_targets unless gta is set.
        - linear_targets: linear spectrogram targets; required when training with
          hp.predict_linear outside GTA mode, rejected in GTA mode.
        - targets_lengths: required when training with hp.mask_decoder set.
        - gta: ground-truth-aligned mode; disables the linear post-processing network.
        - global_step: forwarded to the training helper; must not be None when training
          with the 'scheduled' teacher forcing mode.
        - is_training / is_evaluating: mutually exclusive mode flags.
        - reference_mel: mel spectrogram fed to the reference encoder. Ignored while
          training, where mel_targets is used as the reference instead.

    Raises:
        ValueError: on inconsistent targets, or when no reference mel is available
          while hp.use_gst is disabled (no style embedding can be built then).
        RuntimeError: on missing targets_lengths for masking, or when both
          is_training and is_evaluating are set.
    """
    if mel_targets is None and stop_token_targets is not None:
        raise ValueError('no mel targets were provided but token_targets were given')
    if mel_targets is not None and stop_token_targets is None and not gta:
        raise ValueError('Mel targets are provided without corresponding token_targets')
    if not gta and self._hparams.predict_linear == True and linear_targets is None and is_training:
        raise ValueError('Model is set to use post processing to predict linear spectrograms in training but no linear targets given!')
    if gta and linear_targets is not None:
        raise ValueError('Linear spectrogram prediction is not supported in GTA mode!')
    if is_training and self._hparams.mask_decoder and targets_lengths is None:
        raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!')
    if is_training and is_evaluating:
        raise RuntimeError('Model can not be in training and evaluation modes at the same time!')

    hp = self._hparams

    # While training, the targets themselves are the style reference.
    if is_training:
        reference_mel = mel_targets

    # Without a reference the only fallback is a random mix of style tokens, and
    # those tokens only exist when GST is enabled. Fail fast instead of hitting
    # an unbound variable halfway through graph construction.
    if reference_mel is None and not hp.use_gst:
        raise ValueError(
            'No reference mel is available and hp.use_gst is disabled: '
            'a reference_mel is required to compute the style embedding.')

    with tf.variable_scope('inference'):
        batch_size = tf.shape(inputs)[0]
        assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
        if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
            assert global_step is not None

        #GTA is only used for predicting mels to train Wavenet vocoder, so we omit post processing when doing GTA synthesis
        post_condition = hp.predict_linear and not gta

        # Embeddings ==> [batch_size, sequence_length, embedding_dim]
        embedding_table = tf.get_variable(
            'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32)
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

        if hp.use_gst:
            #Global style tokens (GST)
            gst_tokens = tf.get_variable(
                'style_tokens', [hp.num_gst, hp.style_embed_depth // hp.num_heads],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            self.gst_tokens = gst_tokens

        #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
        encoder_cell = TacotronEncoderCell(
            EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'),
            EncoderRNN(is_training, size=hp.encoder_lstm_units,
                       zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM'))

        encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

        #For shape visualization purpose
        enc_conv_output_shape = encoder_cell.conv_output_shape

        if reference_mel is not None:
            # Reference encoder
            refnet_outputs = reference_encoder(
                reference_mel,
                filters=hp.reference_filters,
                kernel_size=(3, 3),
                strides=(2, 2),
                encoder_cell=GRUCell(hp.reference_depth),
                is_training=is_training)  # [N, 128]
            self.refnet_outputs = refnet_outputs

            if hp.use_gst:
                # Style attention
                style_attention = MultiheadAttention(
                    tf.expand_dims(refnet_outputs, axis=1),  # [N, 1, 128]
                    tf.tanh(tf.tile(tf.expand_dims(gst_tokens, axis=0), [batch_size, 1, 1])),  # [N, hp.num_gst, 256/hp.num_heads]
                    num_heads=hp.num_heads,
                    num_units=hp.style_att_dim,
                    attention_type=hp.style_att_type)

                style_embeddings = style_attention.multi_head_attention()
            else:
                style_embeddings = tf.expand_dims(refnet_outputs, axis=1)  # [N, 1, 128]
        else:
            # No reference: mix the style tokens with random softmax weights.
            # (The guard above guarantees hp.use_gst, so gst_tokens exists here.)
            print("Use random weight for GST.")
            random_weights = tf.random_uniform([hp.num_heads, hp.num_gst], maxval=1.0, dtype=tf.float32)
            random_weights = tf.nn.softmax(random_weights, name="random_weights")
            style_embeddings = tf.matmul(random_weights, tf.nn.tanh(gst_tokens))
            style_embeddings = tf.reshape(
                style_embeddings,
                [1, 1] + [hp.num_heads * gst_tokens.get_shape().as_list()[1]])

        #Extend style embeddings to be compatible with encoder_outputs:
        #append an equally sized all-zero vector (x + (-x)) on the last axis so the
        #style is added to one half of the encoder features and the other half is
        #left unchanged.
        neg = tf.add(style_embeddings, tf.negative(style_embeddings))
        style_embeddings = tf.concat([style_embeddings, neg], axis=-1)

        # Add style embedding to every text encoder state
        style_embeddings = tf.tile(style_embeddings, [1, shape_list(encoder_outputs)[1], 1])  # [N, T_in, 128]
        encoder_outputs = tf.add(encoder_outputs, style_embeddings)

        #Decoder Parts
        #Attention Decoder Prenet
        prenet = Prenet(is_training, layers_sizes=hp.prenet_layers,
                        drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet')
        #Attention Mechanism
        attention_mechanism = LocationSensitiveAttention(
            hp.attention_dim, encoder_outputs, hparams=hp,
            mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths,
            smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights)
        #Decoder LSTM Cells
        decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
                                  size=hp.decoder_lstm_units,
                                  zoneout=hp.tacotron_zoneout_rate, scope='decoder_lstm')
        #Frames Projection layer
        frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, scope='linear_transform')
        #<stop_token> projection layer
        stop_projection = StopProjection(is_training or is_evaluating,
                                         shape=hp.outputs_per_step,
                                         scope='stop_token_projection')

        #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
        decoder_cell = TacotronDecoderCell(
            prenet,
            attention_mechanism,
            decoder_lstm,
            frame_projection,
            stop_projection)

        #Define the helper for our decoder
        if is_training or is_evaluating or gta:
            self.helper = TacoTrainingHelper(batch_size, mel_targets, stop_token_targets,
                                             hp, gta, is_evaluating, global_step)
        else:
            self.helper = TacoTestHelper(batch_size, hp)

        #initial decoder state
        decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        #Only use max iterations at synthesis time
        max_iters = hp.max_iters if not (is_training or is_evaluating) else None

        #Decode
        (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
            CustomDecoder(decoder_cell, self.helper, decoder_init_state),
            impute_finished=False,
            maximum_iterations=max_iters,
            swap_memory=hp.tacotron_swap_with_cpu)

        # Reshape outputs to be one output per entry
        #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
        decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
        stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])

        #Postnet
        postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions')

        #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
        residual = postnet(decoder_output)

        #Project residual to same dimension as mel spectrogram
        #==> [batch_size, decoder_steps * r, num_mels]
        residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection')
        projected_residual = residual_projection(residual)

        #Compute the mel spectrogram
        mel_outputs = decoder_output + projected_residual

        if post_condition:
            #Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
            #Post-processing Network to map mels to linear spectrograms using same architecture as the encoder
            post_processing_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training, hparams=hp, scope='post_processing_convolutions'),
                EncoderRNN(is_training, size=hp.encoder_lstm_units,
                           zoneout=hp.tacotron_zoneout_rate, scope='post_processing_LSTM'))

            expand_outputs = post_processing_cell(mel_outputs)
            linear_outputs = FrameProjection(hp.num_freq, scope='post_processing_projection')(expand_outputs)

        #Grab alignments from the final decoder state
        alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0])

        if is_training:
            self.ratio = self.helper._ratio
        self.inputs = inputs
        self.input_lengths = input_lengths
        self.decoder_output = decoder_output
        self.alignments = alignments
        self.style_embeddings = style_embeddings
        self.stop_token_prediction = stop_token_prediction
        self.stop_token_targets = stop_token_targets
        self.mel_outputs = mel_outputs
        if post_condition:
            self.linear_outputs = linear_outputs
            self.linear_targets = linear_targets
        self.mel_targets = mel_targets
        self.targets_lengths = targets_lengths
        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log(' Train mode: {}'.format(is_training))
        log(' Eval mode: {}'.format(is_evaluating))
        log(' GTA mode: {}'.format(gta))
        log(' Synthesis mode: {}'.format(not (is_training or is_evaluating)))
        log(' embedding: {}'.format(embedded_inputs.shape))
        log(' enc conv out: {}'.format(enc_conv_output_shape))
        log(' encoder out: {}'.format(encoder_outputs.shape))
        log(' decoder out: {}'.format(decoder_output.shape))
        log(' residual out: {}'.format(residual.shape))
        log(' projected residual out: {}'.format(projected_residual.shape))
        log(' style embedding: %d' % style_embeddings.shape[-1])
        log(' mel out: {}'.format(mel_outputs.shape))
        if post_condition:
            log(' linear out: {}'.format(linear_outputs.shape))
        log(' <stop_token> out: {}'.format(stop_token_prediction.shape))
def initialize(self, inputs, input_speaker_id, input_lengths, mel_targets=None, stop_token_targets=None,
               linear_targets=None, targets_lengths=None, gta=False, global_step=None,
               is_training=False, is_evaluating=False):
    """
    Initializes the model for inference
    sets "mel_outputs" and "alignments" fields.

    Args:
        - inputs: Tensor whose first axis is the batch (N). It is handed to the encoder as-is:
          the character-embedding lookup is disabled in this variant.
          NOTE(review): the upstream docstring described int32 character IDs of shape [N, T_in];
          since no embedding is applied here, inputs are presumably pre-computed float
          features [N, T_in, D] - confirm against the feeder.
        - input_speaker_id: indices looked up in the [hp.speaker_num, hp.speaker_dim] speaker
          embedding table. The looked-up embedding is currently NOT fed to the decoder
          (see the note in the body).
        - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
          of each sequence in inputs.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
          of steps in the output time series, M is num_mels, and values are entries in the mel
          spectrogram. Only needed for training.
        - stop_token_targets: stop token targets; required whenever mel_targets is given outside GTA mode.
        - linear_targets: linear spectrogram targets; required for training when hp.predict_linear
          is set, and rejected in GTA mode.
        - targets_lengths: target lengths; required for training when hp.mask_decoder is set.
        - gta: build the graph in ground-truth-aligned mode (no linear post-processing).
        - global_step: required when training with 'scheduled' teacher forcing.
        - is_training / is_evaluating: mutually exclusive mode flags; both False means synthesis.

    Raises:
        ValueError: on inconsistent target arguments.
        RuntimeError: when masking is requested without targets_lengths, or when both
            is_training and is_evaluating are set.
    """
    if mel_targets is None and stop_token_targets is not None:
        raise ValueError(
            'no mel targets were provided but token_targets were given')
    if mel_targets is not None and stop_token_targets is None and not gta:
        raise ValueError(
            'Mel targets are provided without corresponding token_targets')
    # Truthiness check, consistent with how post_condition is computed below.
    if not gta and self._hparams.predict_linear and linear_targets is None and is_training:
        raise ValueError(
            'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
        )
    if gta and linear_targets is not None:
        raise ValueError(
            'Linear spectrogram prediction is not supported in GTA mode!')
    if is_training and self._hparams.mask_decoder and targets_lengths is None:
        raise RuntimeError(
            'Model set to mask paddings but no targets lengths provided for the mask!'
        )
    if is_training and is_evaluating:
        raise RuntimeError(
            'Model can not be in training and evaluation modes at the same time!'
        )

    with tf.variable_scope('inference'):
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams
        assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
        if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
            assert global_step is not None

        #GTA is only used for predicting mels to train Wavenet vocoder, so we omit post processing when doing GTA synthesis
        post_condition = hp.predict_linear and not gta

        # NOTE: the character embedding table ('inputs_embedding') is disabled in this variant;
        # `inputs` goes straight into the encoder.

        # Speaker Embeddings ==> [batch_size, embedding_dim]
        self.speaker_id_embedding_table = tf.get_variable(
            'input_speaker_id_embedding', [hp.speaker_num, hp.speaker_dim],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        # NOTE: the lookup result is not wired into the rest of the graph at the moment (the
        # tile + concat onto the encoder outputs is disabled). It is kept so the graph and the
        # variable set stay unchanged.
        embedded_speaker_id = tf.nn.embedding_lookup(
            self.speaker_id_embedding_table, input_speaker_id)

        #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
        encoder_cell = TacotronEncoderCell(
            EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'),
            EncoderRNN(is_training, size=hp.encoder_lstm_units,
                       zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM'))

        encoder_outputs = encoder_cell(inputs, input_lengths)

        # Speaker conditioning is disabled: the attention memory is the plain encoder output.
        # Previously tried and switched off here: (1) tiling the speaker embedding over time
        # and concatenating it to the encoder outputs, (2) an adversarial speaker classifier
        # on gradient-reversed encoder outputs.
        id_encoder_outputs = encoder_outputs

        #For shape visualization purpose
        enc_conv_output_shape = encoder_cell.conv_output_shape

        #Decoder Parts
        #Attention Decoder Prenet
        prenet = Prenet(is_training, layers_sizes=hp.prenet_layers,
                        drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet')
        #Attention Mechanism
        attention_mechanism = LocationSensitiveAttention(
            hp.attention_dim, id_encoder_outputs, hparams=hp, is_training=is_training,
            mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths,
            smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights)
        #Decoder LSTM Cells
        decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
                                  size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate,
                                  scope='decoder_lstm')
        #Frames Projection layer
        frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step,
                                           scope='linear_transform')
        #<stop_token> projection layer
        stop_projection = StopProjection(is_training or is_evaluating,
                                         shape=hp.outputs_per_step,
                                         scope='stop_token_projection')

        #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
        decoder_cell = TacotronDecoderCell(prenet, attention_mechanism, decoder_lstm,
                                           frame_projection, stop_projection)

        #Define the helper for our decoder
        if is_training or is_evaluating or gta:
            self.helper = TacoTrainingHelper(batch_size, mel_targets, hp, gta,
                                             is_evaluating, global_step)
        else:
            self.helper = TacoTestHelper(batch_size, hp)

        #initial decoder state
        decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        #Only use max iterations at synthesis time
        max_iters = hp.max_iters if not (is_training or is_evaluating) else None

        #Decode
        (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
            CustomDecoder(decoder_cell, self.helper, decoder_init_state),
            impute_finished=False,
            maximum_iterations=max_iters,
            swap_memory=hp.tacotron_swap_with_cpu)

        # Reshape outputs to be one output per entry
        #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
        decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
        stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])

        #Postnet
        postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions')

        #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
        residual = postnet(decoder_output)

        #Project residual to same dimension as mel spectrogram
        #==> [batch_size, decoder_steps * r, num_mels]
        residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection')
        projected_residual = residual_projection(residual)

        #Compute the mel spectrogram
        mel_outputs = decoder_output + projected_residual

        if post_condition:
            # Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs.
            post_cbhg = CBHG(hp.cbhg_kernels, hp.cbhg_conv_channels, hp.cbhg_pool_size,
                             [hp.cbhg_projection, hp.num_mels],
                             hp.cbhg_projection_kernel_size, hp.cbhg_highwaynet_layers,
                             hp.cbhg_highway_units, hp.cbhg_rnn_units,
                             hp.batch_norm_position, is_training, name='CBHG_postnet')

            #[batch_size, decoder_steps(mel_frames), cbhg_channels]
            post_outputs = post_cbhg(mel_outputs, None)

            #Linear projection of extracted features to make linear spectrogram
            linear_specs_projection = FrameProjection(
                hp.num_freq, scope='cbhg_linear_specs_projection')

            #[batch_size, decoder_steps(linear_frames), num_freq]
            linear_outputs = linear_specs_projection(post_outputs)

        #Grab alignments from the final decoder state
        alignments = tf.transpose(
            final_decoder_state.alignment_history.stack(), [1, 2, 0])

        if is_training:
            self.ratio = self.helper._ratio
        self.inputs = inputs
        self.input_speaker_id = input_speaker_id
        self.input_lengths = input_lengths
        self.decoder_output = decoder_output
        self.alignments = alignments
        self.stop_token_prediction = stop_token_prediction
        self.stop_token_targets = stop_token_targets
        self.mel_outputs = mel_outputs
        if post_condition:
            self.linear_outputs = linear_outputs
            self.linear_targets = linear_targets
        self.mel_targets = mel_targets
        self.targets_lengths = targets_lengths

        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log(' Train mode: {}'.format(is_training))
        log(' Eval mode: {}'.format(is_evaluating))
        log(' GTA mode: {}'.format(gta))
        log(' Synthesis mode: {}'.format(not (is_training or is_evaluating)))
        # No embedding lookup in this variant, so the "embedding" row reports the raw inputs.
        log(' embedding: {}'.format(inputs.shape))
        log(' enc conv out: {}'.format(enc_conv_output_shape))
        log(' encoder out: {}'.format(encoder_outputs.shape))
        log(' id encoder out: {}'.format(id_encoder_outputs.shape))
        log(' decoder out: {}'.format(decoder_output.shape))
        log(' residual out: {}'.format(residual.shape))
        log(' projected residual out: {}'.format(projected_residual.shape))
        log(' mel out: {}'.format(mel_outputs.shape))
        if post_condition:
            log(' linear out: {}'.format(linear_outputs.shape))
        log(' <stop_token> out: {}'.format(stop_token_prediction.shape))
def initialize(self, inputs, input_lengths, feature_targets=None, stop_token_targets=None,
               targets_lengths=None, gta=False, global_step=None, is_training=False,
               is_evaluating=False):
    """
    Initializes the model for inference
    sets "feature_outputs" and "alignments" fields.

    Args:
        - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
          steps in the input time series, and values are character IDs
        - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
          of each sequence in inputs.
        - feature_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
          of steps in the output time series, M is num_mgc + num_lf0 + num_vuv + num_bap, and values are
          entries in the spectrogram. Only needed for training.
        - stop_token_targets: stop token targets; required whenever feature_targets is given
          outside GTA mode.
        - targets_lengths: target lengths; required for training when hp.mask_decoder is set.
        - gta: build the graph in ground-truth-aligned mode.
        - global_step: required when training with 'scheduled' teacher forcing.
        - is_training / is_evaluating: mutually exclusive mode flags; both False means synthesis.

    Raises:
        ValueError: on inconsistent target arguments.
        RuntimeError: when masking is requested without targets_lengths, or when both
            is_training and is_evaluating are set.
    """
    if feature_targets is None and stop_token_targets is not None:
        raise ValueError('no feature targets were provided but token_targets were given')
    if feature_targets is not None and stop_token_targets is None and not gta:
        # Message names the argument this model actually takes (it used to say "Mel targets").
        raise ValueError('Feature targets are provided without corresponding token_targets')
    if is_training and self._hparams.mask_decoder and targets_lengths is None:
        raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!')
    if is_training and is_evaluating:
        raise RuntimeError('Model can not be in training and evaluation modes at the same time!')

    with tf.variable_scope('inference'):
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams
        # Width of one output frame: all vocoder feature streams side by side.
        target_depth = hp.num_mgc + hp.num_lf0 + hp.num_vuv + hp.num_bap
        assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
        if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
            assert global_step is not None

        # Embeddings ==> [batch_size, sequence_length, embedding_dim]
        embedding_table = tf.get_variable(
            'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32)
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

        #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
        encoder_cell = TacotronEncoderCell(
            EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'),
            EncoderRNN(is_training, size=hp.encoder_lstm_units,
                       zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM'))

        encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

        #For shape visualization purpose
        enc_conv_output_shape = encoder_cell.conv_output_shape

        #Decoder Parts
        #Attention Decoder Prenet
        prenet = Prenet(is_training, layers_sizes=hp.prenet_layers,
                        drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet')
        #Attention Mechanism
        attention_mechanism = LocationSensitiveAttention(
            hp.attention_dim, encoder_outputs, hparams=hp, is_training=is_training,
            mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths,
            smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights)
        #Decoder LSTM Cells
        decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
                                  size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate,
                                  scope='decoder_lstm')
        #Frames Projection layer
        frame_projection = FrameProjection(target_depth * hp.outputs_per_step,
                                           scope='mgc_transform')
        #<stop_token> projection layer
        stop_projection = StopProjection(is_training or is_evaluating,
                                         shape=hp.outputs_per_step,
                                         scope='stop_token_projection')

        #Decoder Cell ==> [batch_size, decoder_steps, target_depth * r] (after decoding)
        decoder_cell = TacotronDecoderCell(prenet, attention_mechanism, decoder_lstm,
                                           frame_projection, stop_projection)

        #Define the helper for our decoder
        if is_training or is_evaluating or gta:
            self.helper = TacoTrainingHelper(batch_size, feature_targets, target_depth, hp,
                                             gta, is_evaluating, global_step)
        else:
            self.helper = TacoTestHelper(batch_size, target_depth, hp)

        #initial decoder state
        decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        #Only use max iterations at synthesis time
        max_iters = hp.max_iters if not (is_training or is_evaluating) else None

        #Decode
        (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
            CustomDecoder(decoder_cell, self.helper, decoder_init_state),
            impute_finished=False,
            maximum_iterations=max_iters,
            swap_memory=hp.tacotron_swap_with_cpu)

        # Reshape outputs to be one output per entry
        #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), target_depth]
        decoder_outputs = tf.reshape(frames_prediction, [batch_size, -1, target_depth])
        stop_token_outputs = tf.reshape(stop_token_prediction, [batch_size, -1])

        #Postnet
        postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions')

        #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
        residual = postnet(decoder_outputs)

        #Project residual to same dimension as target depth
        #==> [batch_size, decoder_steps * r, target_depth]
        residual_projection = FrameProjection(target_depth, scope='postnet_projection')
        projected_residual = residual_projection(residual)

        #Compute the final outputs
        final_outputs = decoder_outputs + projected_residual

        #Compute each feature outputs
        # The last axis is laid out as [mgc | lf0 | vuv | bap]; each offset is the running sum
        # of the preceding stream widths.
        mgc_idx = 0
        lf0_idx = mgc_idx + hp.num_mgc
        vuv_idx = lf0_idx + hp.num_lf0
        bap_idx = vuv_idx + hp.num_vuv

        mgc_outputs = tf.slice(final_outputs, [0, 0, mgc_idx], [-1, -1, hp.num_mgc],
                               name='mgc_outputs')
        lf0_outputs = tf.slice(final_outputs, [0, 0, lf0_idx], [-1, -1, hp.num_lf0])
        # Squeezing the last axis requires it to have size 1, i.e. hp.num_lf0 == 1.
        lf0_outputs = tf.squeeze(lf0_outputs, axis=-1, name='lf0_outputs')
        vuv_outputs = tf.slice(final_outputs, [0, 0, vuv_idx], [-1, -1, hp.num_vuv],
                               name='vuv_outputs')
        bap_outputs = tf.slice(final_outputs, [0, 0, bap_idx], [-1, -1, hp.num_bap],
                               name='bap_outputs')

        #Grab alignments from the final decoder state
        alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0],
                                  name='alignments')

        if is_training:
            self.ratio = self.helper._ratio
        self.inputs = inputs
        self.input_lengths = input_lengths
        self.decoder_outputs = decoder_outputs
        self.final_outputs = final_outputs
        self.feature_targets = feature_targets
        self.alignments = alignments
        self.stop_token_outputs = stop_token_outputs
        self.stop_token_targets = stop_token_targets
        self.lf0_outputs = lf0_outputs
        self.mgc_outputs = mgc_outputs
        self.vuv_outputs = vuv_outputs
        self.bap_outputs = bap_outputs
        self.targets_lengths = targets_lengths

        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log(' Train mode: {}'.format(is_training))
        log(' Eval mode: {}'.format(is_evaluating))
        log(' GTA mode: {}'.format(gta))
        log(' Synthesis mode: {}'.format(not (is_training or is_evaluating)))
        log(' embedding: {}'.format(embedded_inputs.shape))
        log(' enc conv out: {}'.format(enc_conv_output_shape))
        log(' encoder out: {}'.format(encoder_outputs.shape))
        log(' decoder out: {}'.format(decoder_outputs.shape))
        log(' residual out: {}'.format(residual.shape))
        log(' projected residual out: {}'.format(projected_residual.shape))
        log(' final out: {}'.format(final_outputs.shape))
        # lf0 was squeezed above; re-expand so its logged shape lines up with the other streams.
        log(' lf0 out: {}'.format(tf.expand_dims(lf0_outputs, axis=-1).shape))
        log(' mgc out: {}'.format(mgc_outputs.shape))
        log(' vuv out: {}'.format(vuv_outputs.shape))
        log(' bap out: {}'.format(bap_outputs.shape))
        log(' <stop_token> out: {}'.format(stop_token_outputs.shape))
def initialize(self, inputs, speaker, input_lengths, mel_targets=None, stop_token_targets=None,
               linear_targets=None, targets_lengths=None, gta=False, reference_mels=None,
               reference_lengths=None, global_step=None, is_training=False, is_evaluating=False,
               split_infos=None, Lf0=None):
    """
    Initializes the model for inference
    sets "mel_outputs" and "alignments" fields.

    The batch is split into hp.tacotron_num_gpus towers; one copy of the network is built per
    tower and the per-tower tensors are stored in the `self.tower_*` lists.

    Args:
        - inputs: float32 Tensor, reshaped per tower to [N, T_in, 345] and fed to the encoder
          without an embedding lookup.
          NOTE(review): the commented-out upstream code refers to `hp.PPGs_length`, so these
          are presumably PPG features - confirm against the feeder.
        - speaker: int32 tensor with shape [N]
        - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
          of each sequence in inputs.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
          of steps in the output time series, M is num_mels, and values are entries in the mel
          spectrogram. Only needed for training.
        - stop_token_targets: stop token targets; required whenever mel_targets is given outside GTA mode.
        - linear_targets: linear spectrogram targets; required for training when hp.predict_linear
          is set, and rejected in GTA mode.
        - targets_lengths: target lengths; required for training when hp.mask_decoder is set.
        - gta: build the graph in ground-truth-aligned mode (no linear post-processing).
        - reference_mels: optional reference mels for the style encoder (reshaped to
          [N, T_ref, num_mels]).
        - reference_lengths: optional lengths of reference_mels; only split and stored here.
        - global_step: required when training with 'scheduled' teacher forcing.
        - is_training / is_evaluating: mutually exclusive mode flags; both False means synthesis.
        - split_infos: required. 2-D tensor passed column-wise to `split_func`; columns are used
          for 0: inputs, 1: mel_targets, 2: stop_token_targets, 3: linear_targets,
          4: reference_mels, 5: Lf0.
        - Lf0: required. float32 Tensor reshaped per tower to [N, T_in, 2] and concatenated to
          the encoder outputs on the last axis.

    Raises:
        ValueError: on inconsistent target arguments, when split_infos or Lf0 is missing, or
            when hp.style_encoder_type is unsupported.
        RuntimeError: when masking is requested without targets_lengths, or when both
            is_training and is_evaluating are set.
    """
    if mel_targets is None and stop_token_targets is not None:
        raise ValueError(
            'no multi targets were provided but token_targets were given')
    if mel_targets is not None and stop_token_targets is None and not gta:
        raise ValueError(
            'Mel targets are provided without corresponding token_targets')
    # Truthiness check, consistent with how post_condition is computed below.
    if not gta and self._hparams.predict_linear and linear_targets is None and is_training:
        raise ValueError(
            'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
        )
    if gta and linear_targets is not None:
        raise ValueError(
            'Linear spectrogram prediction is not supported in GTA mode!')
    if is_training and self._hparams.mask_decoder and targets_lengths is None:
        raise RuntimeError(
            'Model set to mask paddings but no targets lengths provided for the mask!'
        )
    if is_training and is_evaluating:
        raise RuntimeError(
            'Model can not be in training and evaluation modes at the same time!'
        )
    # Both default to None only because they follow defaulted parameters; the splitting code
    # below uses them unconditionally, so fail early with a clear message.
    if split_infos is None or Lf0 is None:
        raise ValueError(
            'split_infos and Lf0 are required to split the batch across towers')

    split_device = '/cpu:0' if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:{}'.format(
        self._hparams.tacotron_gpu_start_idx)
    with tf.device(split_device):
        hp = self._hparams
        # Output dtypes of split_func: one float32 tensor per tower.
        lout_float = [tf.float32] * hp.tacotron_num_gpus

        tower_speaker = tf.split(speaker, num_or_size_splits=hp.tacotron_num_gpus, axis=0)
        tower_input_lengths = tf.split(
            input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0)
        tower_targets_lengths = tf.split(
            targets_lengths, num_or_size_splits=hp.tacotron_num_gpus,
            axis=0) if targets_lengths is not None else targets_lengths
        tower_reference_lengths = tf.split(
            reference_lengths, num_or_size_splits=hp.tacotron_num_gpus,
            axis=0) if reference_lengths is not None else reference_lengths

        # Each sequence tensor is split with its own column of split_infos.
        p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]], lout_float)
        p_Lf0 = tf.py_func(split_func, [Lf0, split_infos[:, 5]], lout_float)
        p_mel_targets = tf.py_func(
            split_func, [mel_targets, split_infos[:, 1]],
            lout_float) if mel_targets is not None else mel_targets
        p_stop_token_targets = tf.py_func(
            split_func, [stop_token_targets, split_infos[:, 2]],
            lout_float) if stop_token_targets is not None else stop_token_targets
        p_linear_targets = tf.py_func(
            split_func, [linear_targets, split_infos[:, 3]],
            lout_float) if linear_targets is not None else linear_targets
        p_reference_mels = tf.py_func(
            split_func, [reference_mels, split_infos[:, 4]],
            lout_float) if reference_mels is not None else reference_mels

        tower_inputs = []
        tower_Lf0 = []
        tower_mel_targets = []
        tower_stop_token_targets = []
        tower_linear_targets = []
        tower_reference_mels = []

        batch_size = tf.shape(inputs)[0]
        mel_channels = hp.num_mels
        linear_channels = hp.num_freq
        # The py_func outputs are reshaped back to explicit [batch, time, channels] layouts.
        for i in range(hp.tacotron_num_gpus):
            # NOTE: the input feature width (345) is hard-coded.
            tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1, 345]))
            # NOTE: Lf0 features have 2 channels.
            tower_Lf0.append(tf.reshape(p_Lf0[i], [batch_size, -1, 2]))
            if p_mel_targets is not None:
                tower_mel_targets.append(
                    tf.reshape(p_mel_targets[i], [batch_size, -1, mel_channels]))
            if p_stop_token_targets is not None:
                tower_stop_token_targets.append(
                    tf.reshape(p_stop_token_targets[i], [batch_size, -1]))
            if p_linear_targets is not None:
                tower_linear_targets.append(
                    tf.reshape(p_linear_targets[i], [batch_size, -1, linear_channels]))
            if p_reference_mels is not None:
                tower_reference_mels.append(
                    tf.reshape(p_reference_mels[i], [batch_size, -1, mel_channels]))

    self.tower_decoder_output = []
    self.tower_alignments = []
    self.tower_stop_token_prediction = []
    self.tower_mel_outputs = []
    self.tower_linear_outputs = []
    self.styleembedding = None

    # Kept per tower only for the shape summary logged at the end.
    tower_embedded_inputs = []
    tower_enc_conv_output_shape = []
    tower_encoder_outputs = []
    tower_residual = []

    # 1. Declare GPU Devices
    gpus = [
        "/gpu:{}".format(i)
        for i in range(hp.tacotron_gpu_start_idx,
                       hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)
    ]
    for i in range(hp.tacotron_num_gpus):
        with tf.device(
                tf.train.replica_device_setter(ps_tasks=1,
                                               ps_device="/cpu:0",
                                               worker_device=gpus[i])):
            with tf.variable_scope('inference'):
                assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
                if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                    assert global_step is not None

                #GTA is only used for predicting mels to train Wavenet vocoder, so we omit post processing when doing GTA synthesis
                post_condition = hp.predict_linear and not gta

                # No embedding lookup in this variant: the tower inputs are used directly.
                embedded_inputs = tower_inputs[i]
                Lf0s = tower_Lf0[i]

                if hp.use_multispeaker:
                    self.speaker_embedding_table = tf.get_variable(
                        'speaker_embedding', [hp.speaker_num, hp.speaker_dim],
                        dtype=tf.float32)
                    speaker_embedding = tf.nn.embedding_lookup(
                        self.speaker_embedding_table, tower_speaker[i])
                    self.speaker_embedding = speaker_embedding

                #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                encoder = TacotronEncoder(
                    EncoderConvolutions(is_training, hparams=hp,
                                        scope='encoder_convolutions'),
                    EncoderRNN(is_training, size=hp.encoder_lstm_units,
                               zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM'))

                encoder_outputs = encoder(embedded_inputs, tower_input_lengths[i])

                # Append the F0 features to every encoder step.
                encoder_outputs = tf.concat([encoder_outputs, Lf0s], axis=-1)

                self.z_embedding = None
                if hp.use_style_encoder:
                    # Reference input encoder.
                    reference_embeddings = None
                    if reference_mels is not None:
                        reference_encoder = ReferenceEncoder(
                            hp, is_training, scope='reference_encoder')
                        reference_embeddings = reference_encoder(tower_reference_mels[i])

                    if hp.style_encoder_type == 'gst':
                        style_encoder = GstEncoder(hp, is_training, scope='gst_encoder')
                        style_outputs = style_encoder(reference_embeddings)
                    elif hp.style_encoder_type == 'vae':
                        style_encoder = VaeEncoder(hp, is_training, scope='vae_encoder')
                        style_outputs = style_encoder(reference_embeddings, batch_size)
                        self.z_mu = style_outputs['z_mu']
                        self.z_log_var = style_outputs['z_log_var']
                        self.z_embedding = style_outputs['z_embedding']
                    else:
                        raise ValueError("Only supported gst and vae and cvae!")

                    style_embeddings = style_outputs['style_embedding']  #[N,1,style_embed_depth]
                    self.styleembedding = style_embeddings
                    if hp.concat_style:
                        style_embeddings = tf.tile(
                            style_embeddings, [1, tf.shape(tower_inputs[i])[1], 1])
                        encoder_outputs = tf.concat(
                            [encoder_outputs, style_embeddings], axis=-1)
                    else:
                        # Was `encoder_outputs += tf.nn.tanh(style_embeddings)`; changed to
                        # concatenation, tiled over the encoder time axis.
                        boraded_style = tf.tile(
                            style_embeddings, [1, tf.shape(encoder_outputs)[1], 1])
                        encoder_outputs = tf.concat(
                            [encoder_outputs, boraded_style], axis=-1)

                if hp.use_multispeaker:
                    # Broadcast the per-utterance speaker embedding over time and append it.
                    speaker_embedding = tf.expand_dims(speaker_embedding, axis=1)
                    speaker_embedding = tf.tile(
                        speaker_embedding, [1, tf.shape(tower_inputs[i])[1], 1])
                    encoder_outputs = tf.concat(
                        [encoder_outputs, speaker_embedding], axis=-1)

                #For shape visualization purpose
                enc_conv_output_shape = encoder.conv_output_shape

                #Decoder Parts
                #Attention Decoder Prenet
                prenet = Prenet(hp, is_training, scope='decoder_prenet')
                #Attention Mechanism
                attention_mechanism = LocationSensitiveSoftAttention(
                    hp.attention_dim, encoder_outputs,
                    tf.reshape(tower_input_lengths[i], [-1]))
                #Decoder LSTM Cells
                decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
                                          size=hp.decoder_lstm_units,
                                          zoneout=hp.tacotron_zoneout_rate,
                                          scope='decoder_LSTM')
                #Frames Projection layer
                frame_projection = FrameProjection(
                    hp.num_mels * hp.outputs_per_step,
                    scope='linear_transform_projection')
                #<stop_token> projection layer
                stop_projection = StopProjection(
                    is_training, shape=hp.outputs_per_step,
                    scope='stop_token_projection')

                #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                decoder_cell = TacotronDecoderCell(
                    prenet, attention_mechanism, decoder_lstm, frame_projection,
                    stop_projection, self.z_embedding)

                #Define the helper for our decoder
                if is_training or is_evaluating or gta:
                    self.helper = TacoTrainingHelper(
                        batch_size, tower_mel_targets[i], hp, gta, is_evaluating,
                        global_step)
                else:
                    self.helper = TacoTestHelper(batch_size, hp)

                #initial decoder state
                decoder_init_state = decoder_cell.zero_state(
                    batch_size=batch_size, dtype=tf.float32)

                #Only use max iterations at synthesis time
                max_iters = hp.max_iters if not (is_training or is_evaluating) else None

                #Decode
                (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
                    CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                    impute_finished=False,
                    maximum_iterations=max_iters,
                    swap_memory=hp.tacotron_swap_with_cpu)

                # Reshape outputs to be one output per entry
                #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
                stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])

                #Postnet
                postnet = Postnet(hparams=hp, training=is_training,
                                  output_size=hp.num_mels, scope='postnet_convolutions')

                #Compute residual using post-net ==> [batch_size, decoder_steps * r, num_mels]
                residual = postnet(decoder_output)

                #Compute the mel spectrogram
                mel_outputs = decoder_output + residual

                if post_condition:
                    # Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs.
                    post_cbhg = CBHG(hp.cbhg_kernels, hp.cbhg_conv_channels,
                                     hp.cbhg_pool_size, [hp.cbhg_projection, hp.num_mels],
                                     hp.cbhg_projection_kernel_size,
                                     hp.cbhg_highwaynet_layers, hp.cbhg_highway_units,
                                     hp.cbhg_rnn_units, is_training, name='CBHG_postnet')

                    #[batch_size, decoder_steps(mel_frames), cbhg_channels]
                    post_outputs = post_cbhg(mel_outputs, None)

                    #Linear projection of extracted features to make linear spectrogram
                    linear_specs_projection = FrameProjection(
                        hp.num_freq, scope='cbhg_linear_specs_projection')

                    #[batch_size, decoder_steps(linear_frames), num_freq]
                    linear_outputs = linear_specs_projection(post_outputs)

                #Grab alignments from the final decoder state
                alignments = tf.transpose(
                    final_decoder_state.alignment_history.stack(), [1, 2, 0])

                self.tower_decoder_output.append(decoder_output)
                self.tower_alignments.append(alignments)
                self.tower_stop_token_prediction.append(stop_token_prediction)
                self.tower_mel_outputs.append(mel_outputs)
                tower_embedded_inputs.append(embedded_inputs)
                tower_enc_conv_output_shape.append(enc_conv_output_shape)
                tower_encoder_outputs.append(encoder_outputs)
                tower_residual.append(residual)

                if post_condition:
                    self.tower_linear_outputs.append(linear_outputs)
        log('initialisation done {}'.format(gpus[i]))

    if is_training:
        # Taken from the helper of the last tower built.
        self.ratio = self.helper._ratio
    self.tower_inputs = tower_inputs
    self.tower_speaker = tower_speaker
    self.tower_input_lengths = tower_input_lengths
    self.tower_mel_targets = tower_mel_targets
    self.tower_linear_targets = tower_linear_targets
    self.tower_targets_lengths = tower_targets_lengths
    self.tower_stop_token_targets = tower_stop_token_targets
    self.tower_reference_mels = tower_reference_mels
    self.tower_reference_lengths = tower_reference_lengths

    self.all_vars = tf.trainable_variables()

    log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
    log(' Train mode: {}'.format(is_training))
    log(' Eval mode: {}'.format(is_evaluating))
    log(' GTA mode: {}'.format(gta))
    log(' Synthesis mode: {}'.format(not (is_training or is_evaluating)))
    log(' Input: {}'.format(inputs.shape))
    # The per-tower lists hold exactly tacotron_num_gpus entries, so iterate over towers and
    # add the start index only to the reported device number. Iterating up to
    # num_gpus + gpu_start_idx would index past the end when gpu_start_idx > 0.
    for i in range(hp.tacotron_num_gpus):
        log(' device: {}'.format(hp.tacotron_gpu_start_idx + i))
        log(' embedding: {}'.format(tower_embedded_inputs[i].shape))
        log(' enc conv out: {}'.format(tower_enc_conv_output_shape[i]))
        log(' encoder out: {}'.format(tower_encoder_outputs[i].shape))
        log(' decoder out: {}'.format(self.tower_decoder_output[i].shape))
        log(' residual out: {}'.format(tower_residual[i].shape))
        log(' mel out: {}'.format(self.tower_mel_outputs[i].shape))
        if post_condition:
            log(' linear out: {}'.format(self.tower_linear_outputs[i].shape))
        log(' <stop_token> out: {}'.format(self.tower_stop_token_prediction[i].shape))

    # The parameter count covers all trainable variables, so it is reported once.
    #1_000_000 is causing syntax problems for some people?! Python please :)
    log(' Tacotron Parameters {:.3f} Million.'.format(
        np.sum([np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000))