Ejemplo n.º 1
0
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   linear_targets=None,
                   targets_lengths=None,
                   gta=False,
                   global_step=None,
                   is_training=False,
                   is_evaluating=False):
        """Build the Tacotron graph inside the 'inference' variable scope.

        Validates the argument combination, then wires embedding -> encoder ->
        attention decoder -> postnet. When ``hparams.predict_linear`` is truthy
        and ``gta`` is off, a CBHG post-processing net followed by a dense
        projection to ``hparams.num_freq`` is added on top of the mel outputs.

        Attributes set on ``self``: ``helper``, ``inputs``, ``input_lengths``,
        ``decoder_output``, ``alignments``, ``stop_token_prediction``,
        ``stop_token_targets``, ``mel_outputs``, ``mel_targets`` and
        ``targets_lengths``. ``linear_outputs`` / ``linear_targets`` are set
        only when the linear post-net is built, and ``ratio`` only when
        ``is_training`` is true.

        Args:
            inputs: int32 Tensor with shape [N, T_in] where N is batch size,
                T_in is number of steps in the input time series, and values
                are character IDs.
            input_lengths: int32 Tensor with shape [N] where N is batch size
                and values are the lengths of each sequence in inputs.
            mel_targets: float32 Tensor with shape [N, T_out, M] where N is
                batch size, T_out is number of steps in the output time
                series, M is num_mels, and values are entries in the mel
                spectrogram. Only needed for training.
            stop_token_targets: stop-token targets forwarded to the training
                helper. Must accompany ``mel_targets`` unless ``gta`` is set.
            linear_targets: linear spectrogram targets. Required when training
                with ``hparams.predict_linear``; rejected in GTA mode.
            targets_lengths: target lengths; required when training with
                ``hparams.mask_decoder``. Only stored on ``self`` here.
            gta: when true, the training helper is used and the linear
                post-net is skipped.
            global_step: forwarded to the training helper. Must not be None
                when training with 'scheduled' teacher forcing.
            is_training: build the graph in training mode.
            is_evaluating: build the graph in evaluation mode. Mutually
                exclusive with ``is_training``.

        Raises:
            ValueError: on inconsistent mel / stop-token / linear targets.
            RuntimeError: if masking is requested without ``targets_lengths``,
                or if both training and evaluation modes are requested.
            AssertionError: on an unknown teacher forcing mode, or a missing
                ``global_step`` for scheduled teacher forcing in training.
        """
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no mel targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')
        # Truthiness check, matching how `post_condition` gates the linear
        # post-net further down (the previous `== True` comparison could let a
        # truthy non-bool value skip this validation yet still build the net).
        if not gta and self._hparams.predict_linear and linear_targets is None and is_training:
            raise ValueError(
                'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
            )
        if gta and linear_targets is not None:
            raise ValueError(
                'Linear spectrogram prediction is not supported in GTA mode!')
        if is_training and self._hparams.mask_decoder and targets_lengths is None:
            raise RuntimeError(
                'Model set to mask paddings but no targets lengths provided for the mask!'
            )
        if is_training and is_evaluating:
            raise RuntimeError(
                'Model can not be in training and evaluation modes at the same time!'
            )

        with tf.variable_scope('inference'):
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams
            assert hp.tacotron_teacher_forcing_mode in ('constant',
                                                        'scheduled')
            if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                assert global_step is not None

            # GTA is only used for predicting mels to train the Wavenet
            # vocoder, so post processing is omitted when doing GTA synthesis.
            post_condition = hp.predict_linear and not gta

            # Embeddings ==> [batch_size, sequence_length, embedding_dim]
            embedding_table = tf.get_variable('inputs_embedding',
                                              [len(symbols), hp.embedding_dim],
                                              dtype=tf.float32)
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

            # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
            encoder_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training,
                                    hparams=hp,
                                    scope='encoder_convolutions'),
                EncoderRNN(is_training,
                           size=hp.encoder_lstm_units,
                           zoneout=hp.tacotron_zoneout_rate,
                           scope='encoder_LSTM'))

            encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

            # Kept only so the shape can be logged at the end.
            enc_conv_output_shape = encoder_cell.conv_output_shape

            # Decoder parts
            # Attention decoder prenet
            prenet = Prenet(is_training,
                            layers_sizes=hp.prenet_layers,
                            drop_rate=hp.tacotron_dropout_rate,
                            scope='decoder_prenet')
            # Attention mechanism
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_dim,
                encoder_outputs,
                hparams=hp,
                mask_encoder=hp.mask_encoder,
                memory_sequence_length=input_lengths,
                smoothing=hp.smoothing,
                cumulate_weights=hp.cumulative_weights)
            # Decoder LSTM cells
            decoder_lstm = DecoderRNN(is_training,
                                      layers=hp.decoder_layers,
                                      size=hp.decoder_lstm_units,
                                      zoneout=hp.tacotron_zoneout_rate,
                                      scope='decoder_lstm')
            # Frames projection layer
            frame_projection = FrameProjection(hp.num_mels *
                                               hp.outputs_per_step,
                                               scope='linear_transform')
            # <stop_token> projection layer
            stop_projection = StopProjection(is_training or is_evaluating,
                                             shape=hp.outputs_per_step,
                                             scope='stop_token_projection')

            # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
            decoder_cell = TacotronDecoderCell(prenet, attention_mechanism,
                                               decoder_lstm, frame_projection,
                                               stop_projection)

            # Training, evaluation and GTA all use the training helper; only
            # plain synthesis uses the test helper.
            if is_training or is_evaluating or gta:
                self.helper = TacoTrainingHelper(batch_size, mel_targets,
                                                 stop_token_targets, hp, gta,
                                                 is_evaluating, global_step)
            else:
                self.helper = TacoTestHelper(batch_size, hp)

            # Initial decoder state
            decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                         dtype=tf.float32)

            # Only cap the number of decoder iterations outside of
            # training/evaluation.
            max_iters = hp.max_iters if not (is_training
                                             or is_evaluating) else None

            # Decode
            (frames_prediction, stop_token_prediction,
             _), final_decoder_state, _ = dynamic_decode(
                 CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                 impute_finished=False,
                 maximum_iterations=max_iters,
                 swap_memory=hp.tacotron_swap_with_cpu)

            # Reshape outputs to be one output per entry
            # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
            decoder_output = tf.reshape(frames_prediction,
                                        [batch_size, -1, hp.num_mels])
            stop_token_prediction = tf.reshape(stop_token_prediction,
                                               [batch_size, -1])

            # Postnet
            postnet = Postnet(is_training,
                              hparams=hp,
                              scope='postnet_convolutions')

            # Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
            residual = postnet(decoder_output)

            # Project residual to same dimension as mel spectrogram
            # ==> [batch_size, decoder_steps * r, num_mels]
            residual_projection = FrameProjection(hp.num_mels,
                                                  scope='postnet_projection')
            projected_residual = residual_projection(residual)

            # Compute the mel spectrogram
            mel_outputs = decoder_output + projected_residual

            if post_condition:
                # Add post-processing CBHG:
                post_outputs = post_cbhg(mel_outputs, hp.num_mels,
                                         is_training)  # [N, T_out, 256]
                linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)

            # Grab alignments from the final decoder state
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(), [1, 2, 0])

            if is_training:
                self.ratio = self.helper._ratio
            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_output = decoder_output
            self.alignments = alignments
            self.stop_token_prediction = stop_token_prediction
            self.stop_token_targets = stop_token_targets
            self.mel_outputs = mel_outputs
            if post_condition:
                self.linear_outputs = linear_outputs
                self.linear_targets = linear_targets
            self.mel_targets = mel_targets
            self.targets_lengths = targets_lengths
            log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
            log('  Train mode:               {}'.format(is_training))
            log('  Eval mode:                {}'.format(is_evaluating))
            log('  GTA mode:                 {}'.format(gta))
            log('  Synthesis mode:           {}'.format(not (
                is_training or is_evaluating)))
            log('  embedding:                {}'.format(embedded_inputs.shape))
            log('  enc conv out:             {}'.format(enc_conv_output_shape))
            log('  encoder out:              {}'.format(encoder_outputs.shape))
            log('  decoder out:              {}'.format(decoder_output.shape))
            log('  residual out:             {}'.format(residual.shape))
            log('  projected residual out:   {}'.format(
                projected_residual.shape))
            log('  mel out:                  {}'.format(mel_outputs.shape))
            if post_condition:
                log('  linear out:               {}'.format(
                    linear_outputs.shape))
            log('  <stop_token> out:         {}'.format(
                stop_token_prediction.shape))
Ejemplo n.º 2
0
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   pml_targets=None,
                   is_training=False,
                   gta=False,
                   locked_alignments=None,
                   logs_enabled=True):
        """
        Initializes the model for inference.

        Sets the "inputs", "input_lengths", "pml_intermediates", "pml_outputs",
        "alignments" and "pml_targets" fields.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: accepted for signature compatibility; not read by this method.
          linear_targets: accepted for signature compatibility; not read by this method.
          pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is number of
            steps in the PML vocoder features trajectories, P is pml_dimension, and values are PML vocoder
            features. Only needed for training.
          is_training: boolean flag that is set to True during training
          gta: boolean flag that is set to True when ground truth alignment is required
          locked_alignments: when explicit attention alignment is required, the locked alignments are passed in this
            parameter and the attention alignments are locked to these values
          logs_enabled: boolean flag that defaults to True, if False no construction logs output
        """
        # Fix the alignments shape to (batch_size, encoder_steps, decoder_steps)
        # by prepending a batch axis when fewer than 3 dimensions are given.
        locked_alignments_ = locked_alignments
        if locked_alignments_ is not None and np.ndim(locked_alignments_) < 3:
            locked_alignments_ = np.expand_dims(locked_alignments_, 0)

        with tf.variable_scope('inference'):
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embedding_dim],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            # Encoder
            encoder_outputs = conv_and_gru(  # [N, T_in, 2*encoder_gru_units=512]
                embedded_inputs,
                input_lengths,
                conv_layers=hp.encoder_conv_layers,
                conv_width=hp.encoder_conv_width,
                conv_channels=hp.encoder_conv_channels,
                gru_units_unidirectional=hp.encoder_gru_units,
                is_training=is_training,
                scope='encoder',
            )

            # Attention
            attention_cell = LockableAttentionWrapper(
                GRUCell(hp.attention_depth),
                LocationSensitiveAttention(hp.attention_depth,
                                           encoder_outputs),
                alignment_history=True,
                locked_alignments=locked_alignments_,
                output_attention=False,
                name='attention_wrapper')  # [N, T_in, attention_depth=256]

            # Concatenate attention context vector and RNN cell output into a
            # 2*attention_depth=512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 2*attention_depth=512]

            # Decoder (layers specified bottom to top):
            cells = [
                GRUCell(hp.decoder_gru_units)
                for _ in range(hp.decoder_gru_layers)
            ]
            decoder_cell = MultiRNNCell(
                [concat_cell] + cells,
                state_is_tuple=True)  # [N, T_in, decoder_depth=1024]

            # Project onto r PML feature vectors (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.pml_dimension * hp.outputs_per_step)

            # Training and GTA feed the PML targets through a training helper;
            # otherwise the test helper is used.
            if is_training or gta:
                if hp.scheduled_sampling:
                    helper = TacoScheduledOutputTrainingHelper(
                        inputs, pml_targets, hp.pml_dimension,
                        hp.outputs_per_step, hp.scheduled_sampling_probability)
                else:
                    helper = TacoTrainingHelper(inputs, pml_targets,
                                                hp.pml_dimension,
                                                hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.pml_dimension,
                                        hp.outputs_per_step)

            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

            # Reshape outputs to be one output per entry
            pml_intermediates = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

            # Add Post-Processing Conv and GRU layer:
            expand_outputs = conv_and_gru(  # [N, T_in, 2*expand_gru_units=512]
                pml_intermediates,
                None,
                conv_layers=hp.expand_conv_layers,
                conv_width=hp.expand_conv_width,
                conv_channels=hp.expand_conv_channels,
                gru_units_unidirectional=hp.expand_gru_units,
                is_training=is_training,
                scope='expand',
            )

            pml_outputs = tf.layers.dense(expand_outputs,
                                          hp.pml_dimension)  # [N, T_out, P]

            # Grab alignments from the final decoder state; element 0 is the
            # state of the attention-wrapped cell placed first in MultiRNNCell.
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.pml_intermediates = pml_intermediates
            self.pml_outputs = pml_outputs
            self.alignments = alignments
            self.pml_targets = pml_targets

            if logs_enabled:
                log('Initialized Tacotron model. Dimensions: ')
                log('  Train mode:              {}'.format(is_training))
                # Report the actual GTA flag (previously logged is_training).
                log('  GTA mode:                {}'.format(gta))
                log('  Embedding:               {}'.format(
                    embedded_inputs.shape[-1]))
                log('  Encoder out:             {}'.format(
                    encoder_outputs.shape[-1]))
                log('  Attention out:           {}'.format(
                    attention_cell.output_size))
                log('  Concat attn & out:       {}'.format(
                    concat_cell.output_size))
                log('  Decoder cell out:        {}'.format(
                    decoder_cell.output_size))
                log('  Decoder out ({} frames):  {}'.format(
                    hp.outputs_per_step, decoder_outputs.shape[-1]))
                log('  Decoder out (1 frame):   {}'.format(
                    pml_intermediates.shape[-1]))
                log('  Expand out:              {}'.format(
                    expand_outputs.shape[-1]))
                log('  PML out:                 {}'.format(
                    pml_outputs.shape[-1]))
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   linear_targets=None,
                   targets_lengths=None,
                   gta=False,
                   global_step=None,
                   is_training=False,
                   is_evaluating=False,
                   split_infos=None):
        """
		Initializes the model for inference
		sets "mel_outputs" and "alignments" fields.
		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
			  steps in the input time series, and values are character IDs
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
			of each sequence in inputs.
			- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
			of steps in the output time series, M is num_mels, and values are entries in the mel
			spectrogram. Only needed for training.
		"""
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no multi targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')
        if not gta and self._hparams.predict_linear == True and linear_targets is None and is_training:
            raise ValueError(
                'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
            )
        if gta and linear_targets is not None:
            raise ValueError(
                'Linear spectrogram prediction is not supported in GTA mode!')
        if is_training and self._hparams.mask_decoder and targets_lengths is None:
            raise RuntimeError(
                'Model set to mask paddings but no targets lengths provided for the mask!'
            )
        if is_training and is_evaluating:
            raise RuntimeError(
                'Model can not be in training and evaluation modes at the same time!'
            )

        #split_device = '/cpu:0' if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:{}'.format(self._hparams.tacotron_gpu_start_idx)
        split_device = '/cpu:0' if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:1'
        print("debug checkpoint gpus:", split_device)
        #with tf.device(split_device):
        with tf.device("/cpu:0"):  # debug, gpu:0 will use about 192MB memory
            hp = self._hparams
            lout_int = [tf.int32] * hp.tacotron_num_gpus
            lout_float = [tf.float32] * hp.tacotron_num_gpus

            tower_input_lengths = tf.split(
                input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0)
            tower_targets_lengths = tf.split(
                targets_lengths,
                num_or_size_splits=hp.tacotron_num_gpus,
                axis=0) if targets_lengths is not None else targets_lengths

            p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]],
                                  lout_int)
            p_mel_targets = tf.py_func(
                split_func, [mel_targets, split_infos[:, 1]],
                lout_float) if mel_targets is not None else mel_targets
            p_stop_token_targets = tf.py_func(
                split_func, [stop_token_targets, split_infos[:, 2]], lout_float
            ) if stop_token_targets is not None else stop_token_targets
            p_linear_targets = tf.py_func(
                split_func, [linear_targets, split_infos[:, 3]],
                lout_float) if linear_targets is not None else linear_targets

            tower_inputs = []
            tower_mel_targets = []
            tower_stop_token_targets = []
            tower_linear_targets = []

            batch_size = tf.shape(inputs)[0]
            mel_channels = hp.num_mels
            linear_channels = hp.num_freq
            for i in range(hp.tacotron_num_gpus):
                tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
                if p_mel_targets is not None:
                    tower_mel_targets.append(
                        tf.reshape(p_mel_targets[i],
                                   [batch_size, -1, mel_channels]))
                if p_stop_token_targets is not None:
                    tower_stop_token_targets.append(
                        tf.reshape(p_stop_token_targets[i], [batch_size, -1]))
                if p_linear_targets is not None:
                    tower_linear_targets.append(
                        tf.reshape(p_linear_targets[i],
                                   [batch_size, -1, linear_channels]))

        self.tower_decoder_output = []
        self.tower_alignments = []
        self.tower_stop_token_prediction = []
        self.tower_mel_outputs = []
        self.tower_linear_outputs = []

        tower_embedded_inputs = []
        tower_enc_conv_output_shape = []
        tower_encoder_outputs = []
        tower_residual = []
        tower_projected_residual = []

        # 1. Declare GPU Devices
        #gpus = ["/gpu:{}".format(i) for i in range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx+hp.tacotron_num_gpus)]
        gpus = ["/gpu:1"]
        print("debug checkpoint gpus:", gpus)
        for i in range(hp.tacotron_num_gpus):
            #	with tf.device(tf.train.replica_device_setter(ps_tasks=1,ps_device="/cpu:0",worker_device=gpus[i])):
            with tf.device(
                    tf.train.replica_device_setter(ps_tasks=1,
                                                   ps_device="/cpu:0",
                                                   worker_device="/cpu:0")
            ):  # debug using gpu:0 will cause OOM, use >1640MB
                with tf.variable_scope('inference') as scope:
                    assert hp.tacotron_teacher_forcing_mode in ('constant',
                                                                'scheduled')
                    if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                        assert global_step is not None

                    #GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis
                    post_condition = hp.predict_linear and not gta

                    # Embeddings ==> [batch_size, sequence_length, embedding_dim]
                    self.embedding_table = tf.get_variable(
                        'inputs_embedding', [len(symbols), hp.embedding_dim],
                        dtype=tf.float32)
                    embedded_inputs = tf.nn.embedding_lookup(
                        self.embedding_table, tower_inputs[i])

                    #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                    encoder_cell = TacotronEncoderCell(
                        EncoderConvolutions(is_training,
                                            hparams=hp,
                                            scope='encoder_convolutions'),
                        EncoderRNN(is_training,
                                   size=hp.encoder_lstm_units,
                                   zoneout=hp.tacotron_zoneout_rate,
                                   scope='encoder_LSTM'))

                    encoder_outputs = encoder_cell(embedded_inputs,
                                                   tower_input_lengths[i])

                    #For shape visualization purpose
                    enc_conv_output_shape = encoder_cell.conv_output_shape
                    print("debug enc_conv_output_shape:",
                          enc_conv_output_shape)

            with tf.device(
                    tf.train.replica_device_setter(ps_tasks=1,
                                                   ps_device="/cpu:0",
                                                   worker_device="/cpu:0")
            ):  # debug try to use as more gpu as possible
                with tf.variable_scope('inference') as scope:

                    #Decoder Parts
                    #Attention Decoder Prenet
                    prenet = Prenet(is_training,
                                    layers_sizes=hp.prenet_layers,
                                    drop_rate=hp.tacotron_dropout_rate,
                                    scope='decoder_prenet')
                    #Attention Mechanism
                    attention_mechanism = LocationSensitiveAttention(
                        hp.attention_dim,
                        encoder_outputs,
                        hparams=hp,
                        mask_encoder=hp.mask_encoder,
                        memory_sequence_length=tf.reshape(
                            tower_input_lengths[i], [-1]),
                        smoothing=hp.smoothing,
                        cumulate_weights=hp.cumulative_weights)
                    #Decoder LSTM Cells
                    decoder_lstm = DecoderRNN(is_training,
                                              layers=hp.decoder_layers,
                                              size=hp.decoder_lstm_units,
                                              zoneout=hp.tacotron_zoneout_rate,
                                              scope='decoder_LSTM')
                    #Frames Projection layer
                    frame_projection = FrameProjection(
                        hp.num_mels * hp.outputs_per_step,
                        scope='linear_transform_projection')
                    #<stop_token> projection layer
                    stop_projection = StopProjection(
                        is_training or is_evaluating,
                        shape=hp.outputs_per_step,
                        scope='stop_token_projection')

                    #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                    decoder_cell = TacotronDecoderCell(prenet,
                                                       attention_mechanism,
                                                       decoder_lstm,
                                                       frame_projection,
                                                       stop_projection)

                    #Define the helper for our decoder
                    if is_training or is_evaluating or gta:
                        self.helper = TacoTrainingHelper(
                            batch_size, tower_mel_targets[i], hp, gta,
                            is_evaluating, global_step)
                    else:
                        self.helper = TacoTestHelper(batch_size, hp)

                    #initial decoder state
                    decoder_init_state = decoder_cell.zero_state(
                        batch_size=batch_size, dtype=tf.float32)

                    #Only use max iterations at synthesis time
                    max_iters = hp.max_iters if not (
                        is_training or is_evaluating) else None

                    #Decode
                    (frames_prediction, stop_token_prediction,
                     _), final_decoder_state, _ = dynamic_decode(
                         CustomDecoder(decoder_cell, self.helper,
                                       decoder_init_state),
                         impute_finished=False,
                         maximum_iterations=max_iters,
                         swap_memory=hp.tacotron_swap_with_cpu)

                    # Reshape outputs to be one output per entry
                    #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                    decoder_output = tf.reshape(frames_prediction,
                                                [batch_size, -1, hp.num_mels])
                    stop_token_prediction = tf.reshape(stop_token_prediction,
                                                       [batch_size, -1])

            with tf.device(
                    tf.train.replica_device_setter(ps_tasks=1,
                                                   ps_device="/cpu:0",
                                                   worker_device="/gpu:0")
            ):  # debug try to use as more gpu as possible
                with tf.variable_scope('inference') as scope:

                    #Postnet
                    postnet = Postnet(is_training,
                                      hparams=hp,
                                      scope='postnet_convolutions')

                    #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
                    residual = postnet(decoder_output)

                    #Project residual to same dimension as mel spectrogram
                    #==> [batch_size, decoder_steps * r, num_mels]
                    residual_projection = FrameProjection(
                        hp.num_mels, scope='postnet_projection')
                    projected_residual = residual_projection(residual)

                    #Compute the mel spectrogram
                    mel_outputs = decoder_output + projected_residual

                    if post_condition:
                        # Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs.
                        post_cbhg = CBHG(hp.cbhg_kernels,
                                         hp.cbhg_conv_channels,
                                         hp.cbhg_pool_size,
                                         [hp.cbhg_projection, hp.num_mels],
                                         hp.cbhg_projection_kernel_size,
                                         hp.cbhg_highwaynet_layers,
                                         hp.cbhg_highway_units,
                                         hp.cbhg_rnn_units,
                                         is_training,
                                         name='CBHG_postnet')

                        #[batch_size, decoder_steps(mel_frames), cbhg_channels]
                        post_outputs = post_cbhg(mel_outputs, None)

                        #Linear projection of extracted features to make linear spectrogram
                        linear_specs_projection = FrameProjection(
                            hp.num_freq, scope='cbhg_linear_specs_projection')

                        #[batch_size, decoder_steps(linear_frames), num_freq]
                        linear_outputs = linear_specs_projection(post_outputs)

                    #Grab alignments from the final decoder state
                    alignments = tf.transpose(
                        final_decoder_state.alignment_history.stack(),
                        [1, 2, 0])

                    self.tower_decoder_output.append(decoder_output)
                    self.tower_alignments.append(alignments)
                    self.tower_stop_token_prediction.append(
                        stop_token_prediction)
                    self.tower_mel_outputs.append(mel_outputs)
                    tower_embedded_inputs.append(embedded_inputs)
                    tower_enc_conv_output_shape.append(enc_conv_output_shape)
                    tower_encoder_outputs.append(encoder_outputs)
                    tower_residual.append(residual)
                    tower_projected_residual.append(projected_residual)

                    if post_condition:
                        self.tower_linear_outputs.append(linear_outputs)
            log('initialisation done {}'.format(gpus[i]))

        if is_training:
            self.ratio = self.helper._ratio
        self.tower_inputs = tower_inputs
        self.tower_input_lengths = tower_input_lengths
        self.tower_mel_targets = tower_mel_targets
        self.tower_linear_targets = tower_linear_targets
        self.tower_targets_lengths = tower_targets_lengths
        self.tower_stop_token_targets = tower_stop_token_targets

        self.all_vars = tf.trainable_variables()

        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log('  Train mode:               {}'.format(is_training))
        log('  Eval mode:                {}'.format(is_evaluating))
        log('  GTA mode:                 {}'.format(gta))
        log('  Synthesis mode:           {}'.format(not (
            is_training or is_evaluating)))
        log('  Input:                    {}'.format(inputs.shape))
        for i in range(hp.tacotron_num_gpus + hp.tacotron_gpu_start_idx):
            log('  device:                   {}'.format(i))
            log('  embedding:                {}'.format(
                tower_embedded_inputs[i].shape))
            log('  enc conv out:             {}'.format(
                tower_enc_conv_output_shape[i]))
            log('  encoder out:              {}'.format(
                tower_encoder_outputs[i].shape))
            log('  decoder out:              {}'.format(
                self.tower_decoder_output[i].shape))
            log('  residual out:             {}'.format(
                tower_residual[i].shape))
            log('  projected residual out:   {}'.format(
                tower_projected_residual[i].shape))
            log('  mel out:                  {}'.format(
                self.tower_mel_outputs[i].shape))
            if post_condition:
                log('  linear out:               {}'.format(
                    self.tower_linear_outputs[i].shape))
            log('  <stop_token> out:         {}'.format(
                self.tower_stop_token_prediction[i].shape))

            #1_000_000 is causing syntax problems for some people?! Python please :)
            log('  Tacotron Parameters       {:.3f} Million.'.format(
                np.sum(
                    [np.prod(v.get_shape().as_list())
                     for v in self.all_vars]) / 1000000))
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   linear_targets=None,
                   targets_lengths=None,
                   gta=False,
                   global_step=None,
                   is_training=False,
                   is_evaluating=False):
        """
        Builds the Tacotron graph inside the 'inference' variable scope.

        Sets the "embedding_table", "encoder_outputs", "decoder_output",
        "stop_token_prediction", "projected_residual", "mel_outputs",
        "post_outputs", "linear_outputs" and "alignments" fields, stores the
        call arguments on the model and logs a summary of the tensor shapes.

        Args:
            - inputs: ids looked up in the input embedding table; the size of
              its first dimension is used as the batch size.
            - input_lengths: forwarded to the encoder cell and to the
              attention mechanism.
            - mel_targets: forwarded to TacoTrainingHelper when training or
              evaluating.
            - stop_token_targets, linear_targets, targets_lengths: only
              stored on the model by this method.
            - gta: ignored, the value is overwritten with False (see the note
              in the body).
            - global_step: forwarded to TacoTrainingHelper; asserted to be
              set when training with 'scheduled' teacher forcing.
            - is_training: selects the training helper and is forwarded to
              the sub-modules; also makes "ratio" available on the model.
            - is_evaluating: selects the training helper; when neither
              is_training nor is_evaluating is set, decoding is capped at
              hp.max_iters steps.
        """
        hparams = self._hparams
        n_batch = tf.shape(inputs)[0]
        # NOTE(review): the caller-supplied `gta` is discarded here, so GTA
        # mode is always off in this method -- confirm this is intended.
        gta = False
        train_or_eval = is_training or is_evaluating

        # Range the outputs are clipped to when hp.clip_outputs is set
        if hparams.symmetric_mels:
            output_bounds = (-hparams.max_abs_value, hparams.max_abs_value)
        else:
            output_bounds = (0, hparams.max_abs_value)

        def _clip_outputs(tensor):
            # Clamp to the output range (lower bound relaxed by
            # hp.lower_bound_decay); hand the tensor back untouched when
            # clipping is disabled.
            if not hparams.clip_outputs:
                return tensor
            floor = output_bounds[0] - hparams.lower_bound_decay
            return tf.minimum(tf.maximum(tensor, floor), output_bounds[1])

        with tf.variable_scope('inference'):
            teacher_forcing_mode = hparams.tacotron_teacher_forcing_mode
            assert teacher_forcing_mode in ('constant', 'scheduled')
            if is_training and teacher_forcing_mode == 'scheduled':
                assert global_step is not None

            # Embeddings ==> [batch_size, sequence_length, embedding_dim]
            self.embedding_table = tf.get_variable(
                'inputs_embedding',
                [len(symbols), hparams.embedding_dim],
                dtype=tf.float32)
            embedded_inputs = tf.nn.embedding_lookup(self.embedding_table,
                                                     inputs)

            # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
            encoder_convs = EncoderConvolutions(is_training,
                                                hparams=hparams,
                                                scope='encoder_convolutions')
            encoder_rnn = EncoderRNN(is_training,
                                     size=hparams.encoder_lstm_units,
                                     zoneout=hparams.tacotron_zoneout_rate,
                                     scope='encoder_LSTM')
            encoder_cell = TacotronEncoderCell(encoder_convs, encoder_rnn)
            self.encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

            # Kept only so the shape can be logged below
            self.enc_conv_output_shape = encoder_cell.conv_output_shape

            # Decoder building blocks: prenet, attention, LSTM stack and the
            # two output projections (frames and <stop_token>)
            prenet = Prenet(is_training,
                            layers_sizes=hparams.prenet_layers,
                            drop_rate=hparams.tacotron_dropout_rate,
                            scope='decoder_prenet')
            attention_mechanism = GMMAttention(self.encoder_outputs,
                                               input_lengths, is_training)
            decoder_lstm = DecoderRNN(is_training,
                                      layers=hparams.decoder_layers,
                                      size=hparams.decoder_lstm_units,
                                      zoneout=hparams.tacotron_zoneout_rate,
                                      scope='decoder_LSTM')
            frame_projection = FrameProjection(
                hparams.num_mels * hparams.outputs_per_step,
                scope='linear_transform_projection')
            stop_projection = StopProjection(train_or_eval,
                                             shape=hparams.outputs_per_step,
                                             scope='stop_token_projection')

            # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
            decoder_cell = TacotronDecoderCell(prenet,
                                               attention_mechanism,
                                               decoder_lstm,
                                               frame_projection,
                                               stop_projection,
                                               num_attn_mixture=5)

            # Training helper when targets drive the decoder, test helper
            # for free-running synthesis
            if train_or_eval or gta:
                self.helper = TacoTrainingHelper(n_batch, mel_targets,
                                                 hparams, gta, is_evaluating,
                                                 global_step)
            else:
                self.helper = TacoTestHelper(n_batch, hparams)

            initial_state = decoder_cell.zero_state(batch_size=n_batch,
                                                    dtype=tf.float32)

            # The iteration cap only applies at synthesis time
            if train_or_eval:
                iteration_cap = None
            else:
                iteration_cap = hparams.max_iters

            # Decode
            decoder = CustomDecoder(decoder_cell, self.helper, initial_state)
            step_outputs, final_decoder_state, _ = dynamic_decode(
                decoder,
                impute_finished=False,
                maximum_iterations=iteration_cap,
                swap_memory=hparams.tacotron_swap_with_cpu)
            frames_prediction, stop_token_prediction, _ = step_outputs

            # Reshape outputs to be one output per entry
            # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
            self.decoder_output = tf.reshape(
                frames_prediction, [n_batch, -1, hparams.num_mels])
            self.stop_token_prediction = tf.reshape(stop_token_prediction,
                                                    [n_batch, -1])
            self.decoder_output = _clip_outputs(self.decoder_output)

            # Postnet residual ==> [batch_size, decoder_steps * r, postnet_channels]
            postnet = Postnet(is_training,
                              hparams=hparams,
                              scope='postnet_convolutions')
            residual = postnet(self.decoder_output)

            # Project the residual back to the mel dimension
            # ==> [batch_size, decoder_steps * r, num_mels]
            residual_projection = FrameProjection(hparams.num_mels,
                                                  scope='postnet_projection')
            self.projected_residual = residual_projection(residual)

            # Mel spectrogram = decoder output + projected residual
            self.mel_outputs = self.decoder_output + self.projected_residual
            self.mel_outputs = _clip_outputs(self.mel_outputs)

            # Post-processing CBHG applied to the mels before the projection
            # to linear spectrograms
            post_cbhg = CBHG(hparams.cbhg_kernels,
                             hparams.cbhg_conv_channels,
                             hparams.cbhg_pool_size,
                             [hparams.cbhg_projection, hparams.num_mels],
                             hparams.cbhg_projection_kernel_size,
                             hparams.cbhg_highwaynet_layers,
                             hparams.cbhg_highway_units,
                             hparams.cbhg_rnn_units,
                             hparams.batch_norm_position,
                             is_training,
                             name='CBHG_postnet')

            # [batch_size, decoder_steps(mel_frames), cbhg_channels]
            self.post_outputs = post_cbhg(self.mel_outputs, None)

            # [batch_size, decoder_steps(linear_frames), num_freq]
            linear_specs_projection = FrameProjection(
                hparams.num_freq, scope='cbhg_linear_specs_projection')
            self.linear_outputs = linear_specs_projection(self.post_outputs)
            self.linear_outputs = _clip_outputs(self.linear_outputs)

            # Alignments come from the final decoder state
            alignment_stack = final_decoder_state.alignment_history.stack()
            self.alignments = tf.transpose(alignment_stack, [1, 2, 0])

            log('initialisation done.')

        if is_training:
            self.ratio = self.helper._ratio

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.targets_lengths = targets_lengths
        self.stop_token_targets = stop_token_targets
        self.gta = gta
        self.all_vars = tf.trainable_variables()
        self.is_training = is_training
        self.is_evaluating = is_evaluating

        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        summary_rows = [
            ('  Train mode:               {}', is_training),
            ('  Eval mode:                {}', is_evaluating),
            ('  GTA mode:                 {}', gta),
            ('  Synthesis mode:           {}', not train_or_eval),
            ('  Input:                    {}', inputs.shape),
            ('  embedding:                {}', embedded_inputs.shape),
            ('  enc conv out:             {}', self.enc_conv_output_shape),
            ('  encoder out:              {}', self.encoder_outputs.shape),
            ('  decoder out:              {}', self.decoder_output.shape),
            ('  residual out:             {}', residual.shape),
            ('  projected residual out:   {}', self.projected_residual.shape),
            ('  mel out:                  {}', self.mel_outputs.shape),
            ('  linear out:               {}', self.linear_outputs.shape),
            ('  <stop_token> out:         {}',
             self.stop_token_prediction.shape),
        ]
        for template, value in summary_rows:
            log(template.format(value))

        # Plain 1000000 literal: 1_000_000 is a syntax error on older Pythons
        variable_sizes = [
            np.prod(variable.get_shape().as_list())
            for variable in self.all_vars
        ]
        log('  Tacotron Parameters       {:.3f} Million.'.format(
            np.sum(variable_sizes) / 1000000))
Ejemplo n.º 5
0
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None):
        """
        Builds the Tacotron graph inside the 'inference' variable scope.

        The model is treated as training whenever mel_targets is given.
        Sets the "inputs", "input_lengths", "decoder_output", "alignments",
        "stop_token_prediction", "stop_token_targets", "mel_outputs" and
        "mel_targets" fields, plus "helper".

        Args:
            - inputs: ids looked up in the input embedding table; the size of
              its first dimension is used as the batch size.
            - input_lengths: forwarded to the encoder cell and, as
              memory_sequence_length, to the attention mechanism.
            - mel_targets: forwarded to TacoTrainingHelper; passing a value
              switches the model to training mode.
            - stop_token_targets: forwarded to TacoTrainingHelper.
        """
        hparams = self._hparams
        # Targets are the only training signal this method receives
        training = mel_targets is not None

        with tf.variable_scope('inference'):
            n_batch = tf.shape(inputs)[0]

            # Embeddings ==> [batch_size, sequence_length, embedding_dim]
            embedding_table = tf.get_variable(
                'inputs_embedding',
                [len(symbols), hparams.embedding_dim],
                dtype=tf.float32)
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

            # Encoder: convolution stack followed by the encoder RNN
            encoder_convs = EncoderConvolutions(training,
                                                kernel_size=(5, ),
                                                channels=512,
                                                scope='encoder_convolutions')
            encoder_rnn = EncoderRNN(training,
                                     size=hparams.encoder_lstm_units,
                                     zoneout=hparams.tacotron_zoneout_rate,
                                     scope='encoder_LSTM')
            encoder_cell = TacotronEncoderCell(encoder_convs, encoder_rnn)
            encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

            # Decoder building blocks: prenet, attention, LSTM stack and the
            # two output projections (frames and <stop_token>)
            prenet = Prenet(training,
                            layer_sizes=[256, 256],
                            scope='decoder_prenet')
            attention_mechanism = LocationSensitiveAttention(
                hparams.attention_dim,
                encoder_outputs,
                mask_encoder=hparams.mask_encoder,
                memory_sequence_length=input_lengths,
                smoothing=hparams.smoothing,
                cumulate_weights=hparams.cumulative_weights)
            decoder_lstm = DecoderRNN(training,
                                      layers=hparams.decoder_layers,
                                      size=hparams.decoder_lstm_units,
                                      zoneout=hparams.tacotron_zoneout_rate,
                                      scope='decoder_lstm')
            frame_projection = FrameProjection(
                hparams.num_mels * hparams.outputs_per_step,
                scope='linear_transform')
            stop_projection = StopProjection(training,
                                             scope='stop_token_projection')

            decoder_cell = TacotronDecoderCell(
                prenet,
                attention_mechanism,
                decoder_lstm,
                frame_projection,
                stop_projection,
                mask_finished=hparams.mask_finished)

            # Training helper when targets are available, test helper for
            # free-running synthesis
            if training:
                self.helper = TacoTrainingHelper(
                    n_batch, mel_targets, stop_token_targets,
                    hparams.num_mels, hparams.outputs_per_step,
                    hparams.tacotron_teacher_forcing_ratio)
            else:
                self.helper = TacoTestHelper(n_batch, hparams.num_mels,
                                             hparams.outputs_per_step)

            initial_state = decoder_cell.zero_state(batch_size=n_batch,
                                                    dtype=tf.float32)

            # The iteration cap only applies at synthesis time
            if training:
                iteration_cap = None
            else:
                iteration_cap = hparams.max_iters

            decoder = CustomDecoder(decoder_cell, self.helper, initial_state)
            step_outputs, final_decoder_state, _ = dynamic_decode(
                decoder,
                impute_finished=hparams.impute_finished,
                maximum_iterations=iteration_cap)
            frames_prediction, stop_tokens, _ = step_outputs

            # Reshape outputs to be one output per entry
            decoder_frames = tf.reshape(frames_prediction,
                                        [n_batch, -1, hparams.num_mels])
            stop_tokens = tf.reshape(stop_tokens, [n_batch, -1])

            # Postnet residual ==> [batch_size, decoder_steps * r, postnet_channels]
            postnet = Postnet(training,
                              kernel_size=hparams.postnet_kernel_size,
                              channels=hparams.postnet_channels,
                              scope='postnet_convolutions')
            residual = postnet(decoder_frames)

            # Project the residual back to the mel dimension
            # ==> [batch_size, decoder_steps * r, num_mels]
            residual_projection = FrameProjection(hparams.num_mels,
                                                  scope='postnet_projection')
            projected_residual = residual_projection(residual)

            # Mel spectrogram = decoder output + projected residual
            mel_prediction = decoder_frames + projected_residual

            # Alignments come from the final decoder state
            alignment_stack = final_decoder_state.alignment_history.stack()
            attention_alignments = tf.transpose(alignment_stack, [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_output = decoder_frames
            self.alignments = attention_alignments
            self.stop_token_prediction = stop_tokens
            self.stop_token_targets = stop_token_targets
            self.mel_outputs = mel_prediction
            self.mel_targets = mel_targets
Ejemplo n.º 6
0
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   pml_targets=None,
                   gta=False,
                   locked_alignments=None,
                   logs_enabled=True):
        '''Initializes the model for inference.

        Sets "pml_intermediates", "pml_outputs", and "alignments" fields, and
        stores "inputs", "input_lengths" and "pml_targets" on the model.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: accepted for interface compatibility; not read by this method.
          linear_targets: accepted for interface compatibility; not read by this method.
          pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is number of
            steps in the PML vocoder features trajectories, P is pml_dimension, and values are PML vocoder
            features. Only needed for training; the model is in training mode whenever it is given.
          gta: boolean flag that selects the training helper even when pml_targets is not given
          locked_alignments: accepted for interface compatibility; not read by this method
            (see the note in the body).
          logs_enabled: boolean flag that defaults to True, if False no construction logs output
        '''
        # NOTE(review): `locked_alignments` is never used below, so attention
        # is not locked to the supplied values -- confirm whether that
        # support is expected to live here.
        with tf.variable_scope('inference'):
            is_training = pml_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            # Encoder
            prenet_outputs = prenet(
                embedded_inputs, is_training,
                hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
            encoder_outputs = encoder_cbhg(
                prenet_outputs,
                input_lengths,
                is_training,
                hp.encoder_depth)  # [N, T_in, encoder_depth=256]

            # Attention
            attention_cell = AttentionWrapper(
                GRUCell(hp.attention_depth),
                BahdanauAttention(hp.attention_depth, encoder_outputs),
                alignment_history=True,
                output_attention=False)  # [N, T_in, attention_depth=256]

            # Apply prenet before concatenation in AttentionWrapper.
            attention_cell = DecoderPrenetWrapper(attention_cell, is_training,
                                                  hp.prenet_depths)

            # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 2*attention_depth=512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell(
                [
                    OutputProjectionWrapper(concat_cell, hp.decoder_depth),
                    ResidualWrapper(GRUCell(hp.decoder_depth)),
                    ResidualWrapper(GRUCell(hp.decoder_depth))
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Project onto r PML feature vectors (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.pml_dimension * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            # Training helper when targets drive the decoder (training or
            # GTA), test helper for free-running synthesis
            if is_training or gta:
                helper = TacoTrainingHelper(inputs, pml_targets,
                                            hp.pml_dimension,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.pml_dimension,
                                        hp.outputs_per_step)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

            # Reshape outputs to be one output per entry
            pml_intermediates = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(
                pml_intermediates,
                hp.pml_dimension,
                is_training,
                hp.postnet_depth)  # [N, T_out, postnet_depth=256]
            pml_outputs = tf.layers.dense(post_outputs,
                                          hp.pml_dimension)  # [N, T_out, P]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.pml_intermediates = pml_intermediates
            self.pml_outputs = pml_outputs
            self.alignments = alignments
            self.pml_targets = pml_targets

            # Honour the documented contract: stay silent when the caller
            # disabled construction logs.
            if logs_enabled:
                log('Initialized Tacotron model. Dimensions: ')
                log('  embedding:               %d' %
                    embedded_inputs.shape[-1])
                log('  prenet out:              %d' % prenet_outputs.shape[-1])
                log('  encoder out:             %d' %
                    encoder_outputs.shape[-1])
                log('  attention out:           %d' %
                    attention_cell.output_size)
                log('  concat attn & out:       %d' % concat_cell.output_size)
                log('  decoder cell out:        %d' % decoder_cell.output_size)
                log('  decoder out (%d frames):  %d' %
                    (hp.outputs_per_step, decoder_outputs.shape[-1]))
                log('  decoder out (1 frame):   %d' %
                    pml_intermediates.shape[-1])
                log('  postnet out:             %d' % post_outputs.shape[-1])
                log('  pml out:                 %d' % pml_outputs.shape[-1])
Ejemplo n.º 7
0
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   split_infos=None,
                   linear_targets=None,
                   targets_lengths=None,
                   gta=False,
                   global_step=None,
                   is_training=False,
                   is_evaluating=False):
        """
        Initializes the model for inference

        sets "mel_outputs" and "alignments" fields.

        Args:
            - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
              steps in the input time series, and values are character IDs
            - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
            - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
        """
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no multi targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')
        if not gta and self._hparams.predict_linear == True and linear_targets is None and is_training:
            raise ValueError(
                'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
            )
        if gta and linear_targets is not None:
            raise ValueError(
                'Linear spectrogram prediction is not supported in GTA mode!')
        if is_training and self._hparams.mask_decoder and targets_lengths is None:
            raise RuntimeError(
                'Model set to mask paddings but no targets lengths provided for the mask!'
            )
        if is_training and is_evaluating:
            raise RuntimeError(
                'Model can not be in training and evaluation modes at the same time!'
            )

        with tf.device('/cpu:0'):
            hp = self._hparams
            lout_int = [tf.int32] * hp.num_gpus
            lout_float = [tf.float32] * hp.num_gpus

            tower_input_lengths = tf.split(input_lengths,
                                           num_or_size_splits=hp.num_gpus,
                                           axis=0)
            tower_targets_lengths = tf.split(
                targets_lengths, num_or_size_splits=hp.num_gpus,
                axis=0) if targets_lengths is not None else targets_lengths

            p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]],
                                  lout_int)
            p_mel_targets = tf.py_func(split_func,
                                       [mel_targets, split_infos[:, 1]],
                                       lout_float)
            p_stop_token_targets = tf.py_func(
                split_func, [stop_token_targets, split_infos[:, 2]], lout_float
            ) if stop_token_targets is not None else stop_token_targets
            p_linear_targets = tf.py_func(
                split_func, [linear_targets, split_infos[:, 3]],
                lout_float) if linear_targets is not None else linear_targets

            tower_inputs = []
            tower_mel_targets = []
            tower_stop_token_targets = []
            tower_linear_targets = []

            batch_size = tf.shape(inputs)[0]
            mel_channels = hp.num_mels
            linear_channels = hp.num_freq
            for i in range(hp.num_gpus):
                tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
                tower_mel_targets.append(
                    tf.reshape(p_mel_targets[i],
                               [batch_size, -1, mel_channels]))
                if p_stop_token_targets is not None:
                    tower_stop_token_targets.append(
                        tf.reshape(p_stop_token_targets[i], [batch_size, -1]))
                if p_linear_targets is not None:
                    tower_linear_targets.append(
                        tf.reshape(p_linear_targets[i],
                                   [batch_size, -1, linear_channels]))

        self.tower_decoder_output = []
        self.tower_alignments = []
        self.tower_stop_token_prediction = []
        self.tower_mel_outputs = []
        self.tower_linear_outputs = []
        self.tower_linear_targets = []

        tower_embedded_inputs = []
        tower_enc_conv_output_shape = []
        tower_encoder_outputs = []
        tower_residual = []
        tower_projected_residual = []

        # 1. Declare GPU Devices
        gpus = [
            "/gpu:{}".format(i)
            for i in range(hp.gpu_start_idx, hp.gpu_start_idx + hp.num_gpus)
        ]
        for i in range(hp.num_gpus):
            with tf.device(
                    tf.train.replica_device_setter(ps_tasks=1,
                                                   ps_device="/cpu:0",
                                                   worker_device=gpus[i])):
                with tf.variable_scope('inference') as scope:
                    assert hp.tacotron_teacher_forcing_mode in ('constant',
                                                                'scheduled')
                    if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                        assert global_step is not None

                    #GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis
                    post_condition = hp.predict_linear and not gta

                    # Embeddings ==> [batch_size, sequence_length, embedding_dim]
                    embedding_table = tf.get_variable(
                        'inputs_embedding', [len(symbols), hp.embedding_dim],
                        dtype=tf.float32)
                    embedded_inputs = tf.nn.embedding_lookup(
                        embedding_table, tower_inputs[i])

                    #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                    encoder_cell = TacotronEncoderCell(
                        EncoderConvolutions(is_training,
                                            hparams=hp,
                                            scope='encoder_convolutions'),
                        EncoderRNN(is_training,
                                   size=hp.encoder_lstm_units,
                                   zoneout=hp.tacotron_zoneout_rate,
                                   scope='encoder_LSTM'))

                    encoder_outputs = encoder_cell(embedded_inputs,
                                                   tower_input_lengths[i])

                    #For shape visualization purpose
                    enc_conv_output_shape = encoder_cell.conv_output_shape

                    #Decoder Parts
                    #Attention Decoder Prenet
                    prenet = Prenet(is_training,
                                    layers_sizes=hp.prenet_layers,
                                    drop_rate=hp.tacotron_dropout_rate,
                                    scope='decoder_prenet')
                    #Attention Mechanism
                    attention_mechanism = LocationSensitiveAttention(
                        hp.attention_dim,
                        encoder_outputs,
                        hparams=hp,
                        mask_encoder=hp.mask_encoder,
                        memory_sequence_length=tower_input_lengths[i],
                        smoothing=hp.smoothing,
                        cumulate_weights=hp.cumulative_weights)
                    #Decoder LSTM Cells
                    decoder_lstm = DecoderRNN(is_training,
                                              layers=hp.decoder_layers,
                                              size=hp.decoder_lstm_units,
                                              zoneout=hp.tacotron_zoneout_rate,
                                              scope='decoder_lstm')
                    #Frames Projection layer
                    frame_projection = FrameProjection(
                        hp.num_mels * hp.outputs_per_step,
                        scope='linear_transform')
                    #<stop_token> projection layer
                    stop_projection = StopProjection(
                        is_training or is_evaluating,
                        shape=hp.outputs_per_step,
                        scope='stop_token_projection')

                    #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                    decoder_cell = TacotronDecoderCell(prenet,
                                                       attention_mechanism,
                                                       decoder_lstm,
                                                       frame_projection,
                                                       stop_projection)

                    #Define the helper for our decoder
                    if is_training or is_evaluating or gta:
                        self.helper = TacoTrainingHelper(
                            batch_size, tower_mel_targets[i],
                            tower_stop_token_targets[i], hp, gta,
                            is_evaluating, global_step)
                    else:
                        self.helper = TacoTestHelper(batch_size, hp)

                    #initial decoder state
                    decoder_init_state = decoder_cell.zero_state(
                        batch_size=batch_size, dtype=tf.float32)

                    #Only use max iterations at synthesis time
                    max_iters = hp.max_iters if not (
                        is_training or is_evaluating) else None

                    #Decode
                    (frames_prediction, stop_token_prediction,
                     _), final_decoder_state, _ = dynamic_decode(
                         CustomDecoder(decoder_cell, self.helper,
                                       decoder_init_state),
                         impute_finished=False,
                         maximum_iterations=max_iters,
                         swap_memory=hp.tacotron_swap_with_cpu)

                    # Reshape outputs to be one output per entry
                    #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                    decoder_output = tf.reshape(frames_prediction,
                                                [batch_size, -1, hp.num_mels])
                    stop_token_prediction = tf.reshape(stop_token_prediction,
                                                       [batch_size, -1])

                    #Postnet
                    postnet = Postnet(is_training,
                                      hparams=hp,
                                      scope='postnet_convolutions')

                    #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
                    residual = postnet(decoder_output)

                    #Project residual to same dimension as mel spectrogram
                    #==> [batch_size, decoder_steps * r, num_mels]
                    residual_projection = FrameProjection(
                        hp.num_mels, scope='postnet_projection')
                    projected_residual = residual_projection(residual)

                    #Compute the mel spectrogram
                    mel_outputs = decoder_output + projected_residual

                    if post_condition:
                        #Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
                        #Post-processing Network to map mels to linear spectrograms using same architecture as the encoder
                        post_processing_cell = TacotronEncoderCell(
                            EncoderConvolutions(
                                is_training,
                                hparams=hp,
                                scope='post_processing_convolutions'),
                            EncoderRNN(is_training,
                                       size=hp.encoder_lstm_units,
                                       zoneout=hp.tacotron_zoneout_rate,
                                       scope='post_processing_LSTM'))

                        expand_outputs = post_processing_cell(mel_outputs)
                        linear_outputs = FrameProjection(
                            hp.num_freq,
                            scope='post_processing_projection')(expand_outputs)

                    #Grab alignments from the final decoder state
                    alignments = tf.transpose(
                        final_decoder_state.alignment_history.stack(),
                        [1, 2, 0])

                    self.tower_decoder_output.append(decoder_output)
                    self.tower_alignments.append(alignments)
                    self.tower_stop_token_prediction.append(
                        stop_token_prediction)
                    self.tower_mel_outputs.append(mel_outputs)
                    tower_embedded_inputs.append(embedded_inputs)
                    tower_enc_conv_output_shape.append(enc_conv_output_shape)
                    tower_encoder_outputs.append(encoder_outputs)
                    tower_residual.append(residual)
                    tower_projected_residual.append(projected_residual)

                    if post_condition:
                        self.tower_linear_outputs.append(linear_outputs)
                        self.tower_linear_targets.append(linear_targets)
                    log('initialiized done {}'.format(gpus[i]))

        if is_training:
            self.ratio = self.helper._ratio
        self.tower_inputs = tower_inputs
        self.tower_input_lengths = tower_input_lengths
        self.tower_mel_targets = tower_mel_targets
        self.tower_targets_lengths = tower_targets_lengths
        self.tower_stop_token_targets = tower_stop_token_targets

        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log('  Train mode:               {}'.format(is_training))
        log('  Eval mode:                {}'.format(is_evaluating))
        log('  GTA mode:                 {}'.format(gta))
        log('  Synthesis mode:           {}'.format(not (
            is_training or is_evaluating)))
        log('  Input:                    {}'.format(inputs.shape))
        for i in range(hp.num_gpus + hp.gpu_start_idx):
            log('  device:                   {}'.format(i))
            log('  embedding:                {}'.format(
                tower_embedded_inputs[i].shape))
            log('  enc conv out:             {}'.format(
                tower_enc_conv_output_shape[i]))
            log('  encoder out:              {}'.format(
                tower_encoder_outputs[i].shape))
            log('  decoder out:              {}'.format(
                self.tower_decoder_output[i].shape))
            log('  residual out:             {}'.format(
                tower_residual[i].shape))
            log('  projected residual out:   {}'.format(
                tower_projected_residual[i].shape))
            log('  mel out:                  {}'.format(
                self.tower_mel_outputs[i].shape))
            if post_condition:
                log('  linear out:               {}'.format(
                    self.towerlinear_outputs[i].shape))
            log('  <stop_token> out:         {}'.format(
                self.tower_stop_token_prediction[i].shape))
Ejemplo n.º 8
0
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   pml_targets=None,
                   is_training=False,
                   gta=False):
        """
        Initializes the model for inference.

        Builds the encoder / attention / decoder graph under the 'inference'
        variable scope and sets the "inputs", "input_lengths", "pml_outputs",
        "alignments" and "pml_targets" fields.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Not read by this method; kept in the signature for callers.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Not read by this method; kept in the signature for callers.
          pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is number of
            steps in the PML vocoder features trajectories, P is pml_dimension, and values are PML vocoder
            features. Fed to the training helper when is_training or gta is set.
          is_training: boolean flag that is set to True during training
          gta: boolean flag that is set to True when ground truth alignment is required
        """
        with tf.variable_scope('inference'):
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embedding_dim],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            # Encoder
            encoder_outputs = conv_and_lstm(  # [N, T_in, 2*encoder_gru_units=512]
                embedded_inputs,
                input_lengths,
                conv_layers=hp.encoder_conv_layers,
                conv_width=hp.encoder_conv_width,
                conv_channels=hp.encoder_conv_channels,
                lstm_units_unidirectional=hp.encoder_gru_units,
                is_training=is_training,
                scope='encoder',
            )

            # Attention
            attention_cell = AttentionWrapper(  # [N, T_in, attention_depth=256]
                DecoderPrenetWrapper(LSTMBlockCell(hp.attention_depth),
                                     is_training, hp.prenet_depths),
                LocationSensitiveAttention(hp.attention_depth,
                                           encoder_outputs),
                alignment_history=True,
                output_attention=False)

            # Concatenate attention context vector and RNN cell output into a
            # 2*attention_depth=512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 2*attention_depth=512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell(
                [
                    concat_cell,
                    LSTMBlockCell(hp.decoder_gru_units),
                    LSTMBlockCell(hp.decoder_gru_units)
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=1024]

            # Project onto r PML feature vectors (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.pml_dimension * hp.outputs_per_step)

            # Training and GTA both drive the decoder with the PML targets;
            # otherwise the test helper is used.
            if is_training or gta:
                helper = TacoTrainingHelper(inputs, pml_targets,
                                            hp.pml_dimension,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.pml_dimension,
                                        hp.outputs_per_step)

            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

            # Reshape outputs to be one output per entry
            pml_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

            # Grab alignments from the final decoder state. Index 0 is the
            # state of the first MultiRNNCell layer (concat_cell), which wraps
            # the attention cell that records the alignment history.
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.pml_outputs = pml_outputs
            self.alignments = alignments
            self.pml_targets = pml_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  Train mode:              {}'.format(is_training))
            # Report the gta flag itself; this line used to print is_training.
            log('  GTA mode:                {}'.format(gta))
            log('  Embedding:               {}'.format(
                embedded_inputs.shape[-1]))
            log('  Encoder out:             {}'.format(
                encoder_outputs.shape[-1]))
            log('  Attention out:           {}'.format(
                attention_cell.output_size))
            log('  Concat attn & out:       {}'.format(
                concat_cell.output_size))
            log('  Decoder cell out:        {}'.format(
                decoder_cell.output_size))
            log('  Decoder out ({} frames):  {}'.format(
                hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  PML out:                 {}'.format(pml_outputs.shape[-1]))
Ejemplo n.º 9
0
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   mel_lengths=None,
                   stop_token_targets=None,
                   linear_targets=None,
                   gta=False,
                   reference_mel=None):
        """
        Initializes the model for inference

        Builds the graph under the 'inference' variable scope and sets the
        "mel_outputs" and "alignments" fields, along with "decoder_output",
        "stop_token_prediction", the stored inputs/targets and, when linear
        prediction is enabled outside GTA, "linear_outputs"/"linear_targets".
        When hp.use_vae is set, "mu" and "log_var" are stored as well.

        The model is considered to be training when mel_targets is given and
        gta is False.

        Args:
            - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
              steps in the input time series, and values are character IDs
            - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
              of each sequence in inputs.
            - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
              of steps in the output time series, M is num_mels, and values are entries in the mel
              spectrogram. Only needed for training.
            - mel_lengths: passed to the VAE as the lengths of its reference input; only read
              when hp.use_vae is set.
            - stop_token_targets: stop token targets handed to the training helper.
            - linear_targets: linear spectrogram targets; required when training with
              hp.predict_linear, and stored on the model when post processing is built.
            - gta: when True, the training helper is used and the linear post processing
              network is not built.
            - reference_mel: reference input for the VAE outside training; during training
              it is replaced by mel_targets.

        Raises:
            ValueError: if stop token targets are given without mel targets, if mel targets are
              given without stop token targets outside GTA, if training with hp.predict_linear
              but without linear targets, or if linear targets are given in GTA mode.
        """
        hp = self._hparams
        # Training is inferred from the arguments rather than passed in.
        is_training = mel_targets is not None and not gta

        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no mel targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')
        # Linear targets are only required while training. Without the
        # is_training guard this check also fired at synthesis time, where no
        # targets exist, making synthesis with predict_linear impossible.
        if is_training and hp.predict_linear and linear_targets is None:
            raise ValueError(
                'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
            )
        if gta and linear_targets is not None:
            raise ValueError(
                'Linear spectrogram prediction is not supported in GTA mode!')

        with tf.variable_scope('inference'):
            batch_size = tf.shape(inputs)[0]
            #GTA is only used for predicting mels to train Wavenet vocoder, so we omit post processing when doing GTA synthesis
            post_condition = hp.predict_linear and not gta

            # Embeddings ==> [batch_size, sequence_length, embedding_dim]
            embedding_table = tf.get_variable('inputs_embedding',
                                              [len(symbols), hp.embedding_dim],
                                              dtype=tf.float32)
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

            #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
            encoder_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training,
                                    kernel_size=hp.enc_conv_kernel_size,
                                    channels=hp.enc_conv_channels,
                                    scope='encoder_convolutions'),
                EncoderRNN(is_training,
                           size=hp.encoder_lstm_units,
                           zoneout=hp.tacotron_zoneout_rate,
                           scope='encoder_LSTM'))

            encoder_outputs = encoder_cell(embedded_inputs, input_lengths)
            if hp.use_vae:
                # While training, the style reference is the target mel itself.
                if is_training:
                    reference_mel = mel_targets

                style_embeddings, mu, log_var = VAE(inputs=reference_mel,
                                                    input_lengths=mel_lengths,
                                                    filters=hp.filters,
                                                    kernel_size=(3, 3),
                                                    strides=(2, 2),
                                                    num_units=hp.vae_dim,
                                                    is_training=is_training,
                                                    scope='vae')

                self.mu = mu
                self.log_var = log_var
                # Project the style embedding to hp.encoder_depth and repeat it
                # along the time axis so it can be added to every encoder step.
                style_embeddings = tf.layers.dense(style_embeddings,
                                                   hp.encoder_depth)
                style_embeddings = tf.expand_dims(style_embeddings, axis=1)
                style_embeddings = tf.tile(
                    style_embeddings,
                    [1, shape_list(encoder_outputs)[1], 1])  # [N, T_in, 256]
                encoder_outputs = encoder_outputs + style_embeddings

            #For shape visualization purpose
            enc_conv_output_shape = encoder_cell.conv_output_shape

            #Decoder Parts
            #Attention Decoder Prenet
            prenet = Prenet(is_training,
                            layer_sizes=hp.prenet_layers,
                            scope='decoder_prenet')
            #Attention Mechanism
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_dim,
                encoder_outputs,
                mask_encoder=hp.mask_encoder,
                memory_sequence_length=input_lengths,
                smoothing=hp.smoothing,
                cumulate_weights=hp.cumulative_weights)
            #Decoder LSTM Cells
            decoder_lstm = DecoderRNN(is_training,
                                      layers=hp.decoder_layers,
                                      size=hp.decoder_lstm_units,
                                      zoneout=hp.tacotron_zoneout_rate,
                                      scope='decoder_lstm')
            #Frames Projection layer
            frame_projection = FrameProjection(hp.num_mels *
                                               hp.outputs_per_step,
                                               scope='linear_transform')
            #<stop_token> projection layer
            stop_projection = StopProjection(is_training,
                                             scope='stop_token_projection')

            #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
            decoder_cell = TacotronDecoderCell(prenet,
                                               attention_mechanism,
                                               decoder_lstm,
                                               frame_projection,
                                               stop_projection,
                                               mask_finished=hp.mask_finished)

            #Define the helper for our decoder
            if is_training or gta:
                self.helper = TacoTrainingHelper(
                    batch_size, mel_targets, stop_token_targets, hp.num_mels,
                    hp.outputs_per_step, hp.tacotron_teacher_forcing_ratio,
                    gta)
            else:
                self.helper = TacoTestHelper(batch_size, hp.num_mels,
                                             hp.outputs_per_step)

            #initial decoder state
            decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                         dtype=tf.float32)

            #Only use max iterations at synthesis time
            max_iters = hp.max_iters if not is_training else None

            #Decode
            (frames_prediction, stop_token_prediction,
             _), final_decoder_state, _ = dynamic_decode(
                 CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                 impute_finished=hp.impute_finished,
                 maximum_iterations=max_iters)

            # Reshape outputs to be one output per entry
            #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
            decoder_output = tf.reshape(frames_prediction,
                                        [batch_size, -1, hp.num_mels])
            stop_token_prediction = tf.reshape(stop_token_prediction,
                                               [batch_size, -1])

            #Postnet
            postnet = Postnet(is_training,
                              kernel_size=hp.postnet_kernel_size,
                              channels=hp.postnet_channels,
                              scope='postnet_convolutions')

            #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
            residual = postnet(decoder_output)

            #Project residual to same dimension as mel spectrogram
            #==> [batch_size, decoder_steps * r, num_mels]
            residual_projection = FrameProjection(hp.num_mels,
                                                  scope='postnet_projection')
            projected_residual = residual_projection(residual)

            #Compute the mel spectrogram
            mel_outputs = decoder_output + projected_residual

            if post_condition:
                #Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
                #Post-processing Network to map mels to linear spectrograms using same architecture as the encoder
                post_processing_cell = TacotronEncoderCell(
                    EncoderConvolutions(is_training,
                                        kernel_size=hp.enc_conv_kernel_size,
                                        channels=hp.enc_conv_channels,
                                        scope='post_processing_convolutions'),
                    EncoderRNN(is_training,
                               size=hp.encoder_lstm_units,
                               zoneout=hp.tacotron_zoneout_rate,
                               scope='post_processing_LSTM'))

                expand_outputs = post_processing_cell(mel_outputs)
                linear_outputs = FrameProjection(
                    hp.num_freq,
                    scope='post_processing_projection')(expand_outputs)

            #Grab alignments from the final decoder state
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_output = decoder_output
            self.alignments = alignments
            self.stop_token_prediction = stop_token_prediction
            self.stop_token_targets = stop_token_targets
            self.mel_outputs = mel_outputs
            self.reference_mel = reference_mel
            if post_condition:
                self.linear_outputs = linear_outputs
                self.linear_targets = linear_targets
            self.mel_targets = mel_targets
            self.mel_lengths = mel_lengths
            log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
            log('  embedding:                {}'.format(embedded_inputs.shape))
            log('  enc conv out:             {}'.format(enc_conv_output_shape))
            log('  encoder out:              {}'.format(encoder_outputs.shape))
            log('  decoder out:              {}'.format(decoder_output.shape))
            log('  residual out:             {}'.format(residual.shape))
            log('  projected residual out:   {}'.format(
                projected_residual.shape))
            log('  mel out:                  {}'.format(mel_outputs.shape))
            if post_condition:
                log('  linear out:               {}'.format(
                    linear_outputs.shape))
            log('  <stop_token> out:         {}'.format(
                stop_token_prediction.shape))
Ejemplo n.º 10
0
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   pml_targets=None,
                   is_training=False,
                   gta=False,
                   eal=False,
                   locked_alignments=None,
                   logs_enabled=True,
                   flag_trainAlign=False,
                   flag_trainJoint=False,
                   alignScale=1.0,
                   flag_online_eal_eval=False):
        '''Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
          pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is number of
            steps in the PML vocoder features trajectories, P is pml_dimension, and values are PML vocoder
            features. Only needed for training.
          is_training: boolean flag that is set to True during training
          gta: boolean flag that is set to True when ground truth alignment is required
          locked_alignments: when explicit attention alignment is required, the locked alignments are passed in this
            parameter and the attention alignments are locked to these values
          logs_enabled: boolean flag that defaults to True, if False no construction logs output
        '''
        # fix the alignments shape to (batch_size, encoder_steps, decoder_steps) if not already including
        # batch dimension
        locked_alignments_ = locked_alignments
        self.flag_trainAlign = flag_trainAlign
        self.flag_trainJoint = flag_trainJoint
        self.alignScale = alignScale
        self.flag_online_eal = (
            eal and (locked_alignments is None)) or flag_online_eal_eval

        if locked_alignments_ is not None:
            if is_training and eal:
                pass
            elif np.ndim(locked_alignments_) < 3:
                locked_alignments_ = np.expand_dims(locked_alignments_, 0)

        with tf.variable_scope('inference') as scope:
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            # Encoder
            prenet_outputs = prenet(
                embedded_inputs, is_training,
                hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
            encoder_outputs = encoder_cbhg(
                prenet_outputs,
                input_lengths,
                is_training,  # [N, T_in, encoder_depth=256]
                hp.encoder_depth)

            # Attention
            attention_cell = LockableAttentionWrapper(
                GRUCell(hp.attention_depth),
                LocationSensitiveAttention(hp.attention_depth,
                                           encoder_outputs),
                alignment_history=True,
                locked_alignments=locked_alignments_,
                output_attention=False,
                name='attention_wrapper',
                flag_trainAlign=self.flag_trainAlign,
                flag_trainJoint=self.flag_trainJoint
            )  # [N, T_in, attention_depth=256]

            # Apply prenet before concatenation in AttentionWrapper.
            prenet_cell = DecoderPrenetWrapper(attention_cell, is_training,
                                               hp.prenet_depths)

            # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                prenet_cell)  # [N, T_in, 2*attention_depth=512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell(
                [
                    OutputProjectionWrapper(concat_cell, hp.decoder_depth),
                    ResidualWrapper(GRUCell(hp.decoder_depth)),
                    ResidualWrapper(GRUCell(hp.decoder_depth))
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Project onto r PML feature vectors (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.pml_dimension * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if gta:
                helper = TacoTrainingHelper(inputs, pml_targets,
                                            hp.pml_dimension,
                                            hp.outputs_per_step)
            elif eal:
                if self.flag_online_eal:
                    helper_gta = TacoTrainingHelper(inputs, pml_targets,
                                                    hp.pml_dimension,
                                                    hp.outputs_per_step)
                    helper_eal = TacoTrainingHelper_EAL(
                        inputs, pml_targets, hp.pml_dimension,
                        hp.outputs_per_step)
                else:
                    helper = TacoTrainingHelper_EAL(inputs, pml_targets,
                                                    hp.pml_dimension,
                                                    hp.outputs_per_step)
            elif hp.scheduled_sampling:
                helper = TacoScheduledOutputTrainingHelper(
                    inputs, pml_targets, hp.pml_dimension, hp.outputs_per_step,
                    hp.scheduled_sampling_probability)
            else:
                if is_training:
                    log('For training, one of these should be true: gta, eal, hp.scheduled_sampling'
                        )
                else:
                    helper = TacoTestHelper(batch_size, hp.pml_dimension,
                                            hp.outputs_per_step)
                    if flag_online_eal_eval:
                        helper_gta = helper
                        helper_eal = helper

            if not self.flag_online_eal:
                (decoder_outputs, _
                 ), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                     BasicDecoder(output_cell, helper, decoder_init_state),
                     maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

                # Reshape outputs to be one output per entry
                pml_intermediates = tf.reshape(
                    decoder_outputs,
                    [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

                # Add post-processing CBHG:
                post_outputs = post_cbhg(
                    pml_intermediates,
                    hp.pml_dimension,
                    is_training,  # [N, T_out, postnet_depth=256]
                    hp.postnet_depth)
                pml_outputs = tf.layers.dense(
                    post_outputs, hp.pml_dimension)  # [N, T_out, P]

                # Grab alignments from the final decoder state:
                alignments = tf.transpose(
                    final_decoder_state[0].alignment_history.stack(),
                    [1, 2, 0])

            else:
                (decoder_outputs, _
                 ), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                     BasicDecoder(output_cell, helper_gta, decoder_init_state),
                     maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

                # Reshape outputs to be one output per entry
                pml_intermediates = tf.reshape(
                    decoder_outputs,
                    [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

                # Add post-processing CBHG:
                post_outputs = post_cbhg(
                    pml_intermediates,
                    hp.pml_dimension,
                    is_training,  # [N, T_out, postnet_depth=256]
                    hp.postnet_depth)
                pml_outputs = tf.layers.dense(
                    post_outputs, hp.pml_dimension)  # [N, T_out, P]

                # Grab alignments from the final decoder state:
                locked_alignments_ = tf.transpose(
                    final_decoder_state[0].alignment_history.stack(),
                    [1, 2, 0])

        with tf.variable_scope('inference_eal') as scope:
            if self.flag_online_eal:
                # Embeddings
                embedding_table_eal = tf.get_variable(
                    'embedding', [len(symbols), hp.embed_depth],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
                embedded_inputs_eal = tf.nn.embedding_lookup(
                    embedding_table_eal, inputs)  # [N, T_in, embed_depth=256]

                # Encoder
                prenet_outputs_eal = prenet(
                    embedded_inputs_eal, is_training,
                    hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
                encoder_outputs_eal = encoder_cbhg(
                    prenet_outputs_eal,
                    input_lengths,
                    is_training,  # [N, T_in, encoder_depth=256]
                    hp.encoder_depth)

                #                 import pdb; pdb.set_trace()
                #                 tf.get_variable_scope().reuse_variables()
                # Attention
                #                 tmp = None if flag_online_eal_eval else locked_alignments_
                if flag_online_eal_eval: locked_alignments_ = None

                attention_cell_eal = LockableAttentionWrapper(
                    GRUCell(hp.attention_depth),
                    LocationSensitiveAttention(hp.attention_depth,
                                               encoder_outputs_eal),
                    alignment_history=True,
                    locked_alignments=locked_alignments_,
                    output_attention=False,
                    name='attention_wrapper',
                    flag_trainAlign=self.flag_trainAlign,
                    flag_trainJoint=self.flag_trainJoint
                )  # [N, T_in, attention_depth=256]

                # Apply prenet before concatenation in AttentionWrapper.
                prenet_cell_eal = DecoderPrenetWrapper(attention_cell_eal,
                                                       is_training,
                                                       hp.prenet_depths)

                # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
                concat_cell_eal = ConcatOutputAndAttentionWrapper(
                    prenet_cell_eal)  # [N, T_in, 2*attention_depth=512]

                # Decoder (layers specified bottom to top):
                decoder_cell_eal = MultiRNNCell(
                    [
                        OutputProjectionWrapper(concat_cell_eal,
                                                hp.decoder_depth),
                        ResidualWrapper(GRUCell(hp.decoder_depth)),
                        ResidualWrapper(GRUCell(hp.decoder_depth))
                    ],
                    state_is_tuple=True)  # [N, T_in, decoder_depth=256]

                # Project onto r PML feature vectors (predict r outputs at each RNN step):
                output_cell_eal = OutputProjectionWrapper(
                    decoder_cell_eal, hp.pml_dimension * hp.outputs_per_step)
                decoder_init_state_eal = output_cell.zero_state(
                    batch_size=batch_size, dtype=tf.float32)

                (
                    decoder_outputs_eal, _
                ), final_decoder_state_eal, _ = tf.contrib.seq2seq.dynamic_decode(
                    BasicDecoder(output_cell_eal, helper_eal,
                                 decoder_init_state_eal),
                    maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

                # Reshape outputs to be one output per entry
                pml_intermediates_eal = tf.reshape(
                    decoder_outputs_eal,
                    [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

                # Add post-processing CBHG:
                post_outputs_eal = post_cbhg(
                    pml_intermediates_eal,
                    hp.pml_dimension,
                    is_training,  # [N, T_out, postnet_depth=256]
                    hp.postnet_depth)
                pml_outputs_eal = tf.layers.dense(
                    post_outputs_eal, hp.pml_dimension)  # [N, T_out, P]

                # Grab alignments from the final decoder state:
                alignments = tf.transpose(
                    final_decoder_state_eal[0].alignment_history.stack(),
                    [1, 2, 0])

                self.pml_intermediates_eal = pml_intermediates_eal
                self.pml_outputs_eal = pml_outputs_eal

        with tf.variable_scope('inference') as scope:
            self.inputs = inputs
            self.input_lengths = input_lengths
            self.pml_intermediates = pml_intermediates
            self.pml_outputs = pml_outputs
            self.alignments = alignments
            self.pml_targets = pml_targets
            self.attention_cell = attention_cell
            self.locked_alignments = locked_alignments_

            if logs_enabled:
                log('Initialized Tacotron model. Dimensions: ')
                log('  Train mode:              {}'.format(is_training))
                log('  GTA mode:                {}'.format(gta))
                log('  EAL mode:                {}'.format(eal))
                log('  Embedding:               {}'.format(
                    embedded_inputs.shape[-1]))
                log('  Prenet out:              {}'.format(
                    prenet_outputs.shape[-1]))
                log('  Encoder out:             {}'.format(
                    encoder_outputs.shape[-1]))
                log('  Attention out:           {}'.format(
                    attention_cell.output_size))
                log('  Concat attn & out:       {}'.format(
                    concat_cell.output_size))
                log('  Decoder cell out:        {}'.format(
                    decoder_cell.output_size))
                log('  Decoder out ({} frames):  {}'.format(
                    hp.outputs_per_step, decoder_outputs.shape[-1]))
                log('  Decoder out (1 frame):   {}'.format(
                    pml_intermediates.shape[-1]))
                log('  Postnet out:             {}'.format(
                    post_outputs.shape[-1]))
                log('  PML out:                 {}'.format(
                    pml_outputs.shape[-1]))
Ejemplo n.º 11
0
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   gta=False):
        """
        Build the Tacotron graph inside the 'inference' variable scope.

        The graph is wired as embedding -> encoder -> attention decoder ->
        postnet, and the resulting tensors are stored on the instance:
        "inputs", "input_lengths", "decoder_output", "alignments",
        "stop_token_prediction", "stop_token_targets", "mel_outputs" and
        "mel_targets". The decoder helper is stored as "helper".

        Training mode is not passed in; it is derived from the arguments:
        the layers are built in training mode when mel_targets is given and
        gta is falsy.

        Args:
            - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
              steps in the input time series, and values are character IDs
            - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
              of each sequence in inputs.
            - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
              of steps in the output time series, M is num_mels, and values are entries in the mel
              spectrogram. Only needed for training.
            - stop_token_targets: targets forwarded to TacoTrainingHelper together with mel_targets.
              Must be given whenever mel_targets is given, unless gta is set.
            - gta: when truthy, the training helper is used to drive the decoder while the layers
              themselves are built with is_training=False.

        Raises:
            ValueError: if stop_token_targets is given without mel_targets, or if mel_targets is
              given without stop_token_targets while gta is falsy.
        """
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no mel targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')

        with tf.variable_scope('inference'):
            # Training mode requires targets and is switched off in GTA mode.
            is_training = mel_targets is not None and not gta
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings ==> [batch_size, sequence_length, embedding_dim]
            embedding_table = tf.get_variable('inputs_embedding',
                                              [len(symbols), hp.embedding_dim],
                                              dtype=tf.float32)
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

            # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
            encoder_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training,
                                    kernel_size=hp.enc_conv_kernel_size,
                                    channels=hp.enc_conv_channels,
                                    scope='encoder_convolutions'),
                EncoderRNN(is_training,
                           size=hp.encoder_lstm_units,
                           zoneout=hp.tacotron_zoneout_rate,
                           scope='encoder_LSTM'))

            encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

            # For shape visualization purpose
            enc_conv_output_shape = encoder_cell.conv_output_shape

            # Decoder Parts
            # Attention Decoder Prenet
            prenet = Prenet(is_training,
                            layer_sizes=hp.prenet_layers,
                            scope='decoder_prenet')
            # Attention Mechanism
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_dim,
                encoder_outputs,
                mask_encoder=hp.mask_encoder,
                memory_sequence_length=input_lengths,
                smoothing=hp.smoothing)
            # Decoder LSTM Cells
            decoder_lstm = DecoderRNN(is_training,
                                      layers=hp.decoder_layers,
                                      size=hp.decoder_lstm_units,
                                      zoneout=hp.tacotron_zoneout_rate,
                                      scope='decoder_lstm')
            # Frames Projection layer
            frame_projection = FrameProjection(hp.num_mels *
                                               hp.outputs_per_step,
                                               scope='linear_transform')
            # <stop_token> projection layer
            stop_projection = StopProjection(is_training,
                                             scope='stop_token_projection')

            # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
            decoder_cell = TacotronDecoderCell(prenet,
                                               attention_mechanism,
                                               decoder_lstm,
                                               frame_projection,
                                               stop_projection,
                                               mask_finished=hp.mask_finished)

            # Define the helper for our decoder. Plain truthiness is used so the
            # choice agrees with the `not gta` test that derived is_training.
            if is_training or gta:
                self.helper = TacoTrainingHelper(
                    batch_size, mel_targets, stop_token_targets, hp.num_mels,
                    hp.outputs_per_step, hp.tacotron_teacher_forcing_ratio,
                    gta)
            else:
                self.helper = TacoTestHelper(batch_size, hp.num_mels,
                                             hp.outputs_per_step)

            # Initial decoder state
            decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                         dtype=tf.float32)

            # Only use max iterations at synthesis time
            max_iters = hp.max_iters if not is_training else None

            # Decode
            (frames_prediction, stop_token_prediction,
             _), final_decoder_state, _ = dynamic_decode(
                 CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                 impute_finished=hp.impute_finished,
                 maximum_iterations=max_iters)

            # Reshape outputs to be one output per entry
            # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
            decoder_output = tf.reshape(frames_prediction,
                                        [batch_size, -1, hp.num_mels])
            stop_token_prediction = tf.reshape(stop_token_prediction,
                                               [batch_size, -1])

            # Postnet
            postnet = Postnet(is_training,
                              kernel_size=hp.postnet_kernel_size,
                              channels=hp.postnet_channels,
                              scope='postnet_convolutions')

            # Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
            residual = postnet(decoder_output)

            # Project residual to same dimension as mel spectrogram
            # ==> [batch_size, decoder_steps * r, num_mels]
            residual_projection = FrameProjection(hp.num_mels,
                                                  scope='postnet_projection')
            projected_residual = residual_projection(residual)

            # Compute the mel spectrogram
            mel_outputs = decoder_output + projected_residual

            # Grab alignments from the final decoder state
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(), [1, 2, 0])

            # Expose the graph endpoints to callers.
            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_output = decoder_output
            self.alignments = alignments
            self.stop_token_prediction = stop_token_prediction
            self.stop_token_targets = stop_token_targets
            self.mel_outputs = mel_outputs
            self.mel_targets = mel_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:                {}'.format(embedded_inputs.shape))
            log('  enc conv out:             {}'.format(enc_conv_output_shape))
            log('  encoder out:              {}'.format(encoder_outputs.shape))
            log('  decoder out:              {}'.format(decoder_output.shape))
            log('  residual out:             {}'.format(residual.shape))
            log('  projected residual out:   {}'.format(
                projected_residual.shape))
            log('  mel out:                  {}'.format(mel_outputs.shape))
            log('  <stop_token> out:         {}'.format(
                stop_token_prediction.shape))
Ejemplo n.º 12
0
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   targets_lengths=None,
                   global_step=None,
                   is_training=False,
                   split_infos=None):
        """
        Build one Tacotron tower per GPU and store the per-tower tensors.

        The batch is first split into hp.tacotron_num_gpus pieces, then for
        every GPU an embedding -> encoder -> attention decoder -> postnet
        graph is built inside tf.variable_scope('inference'). Per-tower
        results are collected in "tower_decoder_output", "tower_alignments",
        "tower_stop_token_prediction" and "tower_mel_outputs"; the split
        inputs/targets are stored as "tower_inputs", "tower_input_lengths",
        "tower_mel_targets", "tower_targets_lengths" and
        "tower_stop_token_targets". "all_vars" holds tf.trainable_variables().

        NOTE(review): every tower calls tf.get_variable with the same names
        under 'inference'; presumably the caller wraps this method in a scope
        that enables variable reuse -- confirm against the call site.

        Args:
            - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
              steps in the input time series, and values are character IDs
            - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
              of each sequence in inputs.
            - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
              of steps in the output time series, M is num_mels, and values are entries in the mel
              spectrogram. Only needed for training.
            - stop_token_targets: targets that must accompany mel_targets; split per tower and stored.
            - targets_lengths: split per tower and stored; required when training with
              hp.mask_decoder enabled.
            - global_step: forwarded to TacoTrainingHelper; must not be None when training with
              hp.tacotron_teacher_forcing_mode == 'scheduled'.
            - is_training: builds the layers in training mode and selects the training helper.
            - split_infos: required. Columns 0, 1 and 2 are passed to split_func together with
              inputs, mel_targets and stop_token_targets respectively.

        Raises:
            ValueError: if only one of mel_targets / stop_token_targets is given, or if
              split_infos is None.
            RuntimeError: if training with hp.mask_decoder set and no targets_lengths.
        """
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no mel targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')
        if is_training and self._hparams.mask_decoder and targets_lengths is None:
            raise RuntimeError(
                'Model set to mask paddings but no targets lengths provided for the mask!'
            )
        # split_infos is subscripted unconditionally below, so its None default
        # can never work; fail fast before any graph op is created.
        if split_infos is None:
            raise ValueError(
                'split_infos must be provided to split the batch across devices')

        hp = self._hparams

        split_device = '/cpu:0' if hp.tacotron_num_gpus > 1 or hp.split_on_cpu else '/gpu:0'
        with tf.device(split_device):
            # Output dtypes of the py_func splits: one entry per tower.
            lout_int = [tf.int32] * hp.tacotron_num_gpus
            lout_float = [tf.float32] * hp.tacotron_num_gpus

            tower_input_lengths = tf.split(
                input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0)
            tower_targets_lengths = tf.split(
                targets_lengths,
                num_or_size_splits=hp.tacotron_num_gpus,
                axis=0) if targets_lengths is not None else targets_lengths

            p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]],
                                  lout_int)
            p_mel_targets = tf.py_func(
                split_func, [mel_targets, split_infos[:, 1]],
                lout_float) if mel_targets is not None else mel_targets
            p_stop_token_targets = tf.py_func(
                split_func, [stop_token_targets, split_infos[:, 2]], lout_float
            ) if stop_token_targets is not None else stop_token_targets

            tower_inputs = []
            tower_mel_targets = []
            tower_stop_token_targets = []

            batch_size = tf.shape(inputs)[0]
            mel_channels = hp.num_mels
            # Reshape each py_func output explicitly.
            for i in range(hp.tacotron_num_gpus):
                tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
                if p_mel_targets is not None:
                    tower_mel_targets.append(
                        tf.reshape(p_mel_targets[i],
                                   [batch_size, -1, mel_channels]))
                if p_stop_token_targets is not None:
                    tower_stop_token_targets.append(
                        tf.reshape(p_stop_token_targets[i], [batch_size, -1]))

        # Value range used when clipping outputs (only if hp.clip_outputs).
        if hp.symmetric_mels:
            T2_output_range = (-hp.max_abs_value, hp.max_abs_value)
        else:
            T2_output_range = (0, hp.max_abs_value)

        self.tower_decoder_output = []
        self.tower_alignments = []
        self.tower_stop_token_prediction = []
        self.tower_mel_outputs = []

        # Kept locally only for the shape log at the end.
        tower_embedded_inputs = []
        tower_enc_conv_output_shape = []
        tower_encoder_outputs = []
        tower_residual = []
        tower_projected_residual = []

        # 1. Declare GPU Devices
        gpus = ["/gpu:{}".format(i) for i in range(hp.tacotron_num_gpus)]
        for i in range(hp.tacotron_num_gpus):
            with tf.device(
                    tf.train.replica_device_setter(ps_tasks=1,
                                                   ps_device="/cpu:0",
                                                   worker_device=gpus[i])):
                with tf.variable_scope('inference'):
                    assert hp.tacotron_teacher_forcing_mode in ('constant',
                                                                'scheduled')
                    if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                        assert global_step is not None

                    # Embeddings ==> [batch_size, sequence_length, embedding_dim]
                    self.embedding_table = tf.get_variable(
                        'inputs_embedding', [len(symbols), hp.embedding_dim],
                        dtype=tf.float32)
                    embedded_inputs = tf.nn.embedding_lookup(
                        self.embedding_table, tower_inputs[i])

                    # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                    encoder_cell = TacotronEncoderCell(
                        EncoderConvolutions(is_training,
                                            hparams=hp,
                                            scope='encoder_convolutions'),
                        EncoderRNN(is_training,
                                   size=hp.encoder_lstm_units,
                                   zoneout=hp.tacotron_zoneout_rate,
                                   scope='encoder_LSTM'))

                    encoder_outputs = encoder_cell(embedded_inputs,
                                                   tower_input_lengths[i])

                    # For shape visualization purpose
                    enc_conv_output_shape = encoder_cell.conv_output_shape

                    # Decoder Parts
                    # Attention Decoder Prenet
                    prenet = Prenet(is_training,
                                    layers_sizes=hp.prenet_layers,
                                    drop_rate=hp.tacotron_dropout_rate,
                                    scope='decoder_prenet')
                    # Attention Mechanism
                    attention_mechanism = LocationSensitiveAttention(
                        hp.attention_dim,
                        encoder_outputs,
                        hparams=hp,
                        is_training=is_training,
                        mask_encoder=hp.mask_encoder,
                        memory_sequence_length=tf.reshape(
                            tower_input_lengths[i], [-1]),
                        smoothing=hp.smoothing,
                        cumulate_weights=hp.cumulative_weights)
                    # Decoder LSTM Cells
                    decoder_lstm = DecoderRNN(is_training,
                                              layers=hp.decoder_layers,
                                              size=hp.decoder_lstm_units,
                                              zoneout=hp.tacotron_zoneout_rate,
                                              scope='decoder_LSTM')
                    # Frames Projection layer
                    frame_projection = FrameProjection(
                        hp.num_mels * hp.outputs_per_step,
                        scope='linear_transform_projection')
                    # <stop_token> projection layer
                    stop_projection = StopProjection(
                        is_training,
                        shape=hp.outputs_per_step,
                        scope='stop_token_projection')

                    # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                    decoder_cell = TacotronDecoderCell(prenet,
                                                       attention_mechanism,
                                                       decoder_lstm,
                                                       frame_projection,
                                                       stop_projection)

                    # Define the helper for our decoder
                    if is_training:
                        self.helper = TacoTrainingHelper(
                            batch_size, tower_mel_targets[i], hp, global_step)
                    else:
                        self.helper = TacoTestHelper(batch_size, hp)

                    # Initial decoder state
                    decoder_init_state = decoder_cell.zero_state(
                        batch_size=batch_size, dtype=tf.float32)

                    # Only use max iterations at synthesis time
                    max_iters = hp.max_iters if not is_training else None

                    # Decode
                    (frames_prediction, stop_token_prediction,
                     _), final_decoder_state, _ = dynamic_decode(
                         CustomDecoder(decoder_cell, self.helper,
                                       decoder_init_state),
                         impute_finished=False,
                         maximum_iterations=max_iters,
                         swap_memory=hp.tacotron_swap_with_cpu)

                    # Reshape outputs to be one output per entry
                    # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                    decoder_output = tf.reshape(frames_prediction,
                                                [batch_size, -1, hp.num_mels])
                    stop_token_prediction = tf.reshape(stop_token_prediction,
                                                       [batch_size, -1])

                    if hp.clip_outputs:
                        decoder_output = tf.minimum(
                            tf.maximum(
                                decoder_output,
                                T2_output_range[0] - hp.lower_bound_decay),
                            T2_output_range[1])

                    # Postnet
                    postnet = Postnet(is_training,
                                      hparams=hp,
                                      scope='postnet_convolutions')

                    # Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
                    residual = postnet(decoder_output)

                    # Project residual to same dimension as mel spectrogram
                    # ==> [batch_size, decoder_steps * r, num_mels]
                    residual_projection = FrameProjection(
                        hp.num_mels, scope='postnet_projection')
                    projected_residual = residual_projection(residual)

                    # Compute the mel spectrogram
                    mel_outputs = decoder_output + projected_residual

                    if hp.clip_outputs:
                        mel_outputs = tf.minimum(
                            tf.maximum(
                                mel_outputs,
                                T2_output_range[0] - hp.lower_bound_decay),
                            T2_output_range[1])

                    # Grab alignments from the final decoder state
                    alignments = tf.transpose(
                        final_decoder_state.alignment_history.stack(),
                        [1, 2, 0])

                    self.tower_decoder_output.append(decoder_output)
                    self.tower_alignments.append(alignments)
                    self.tower_stop_token_prediction.append(
                        stop_token_prediction)
                    self.tower_mel_outputs.append(mel_outputs)
                    tower_embedded_inputs.append(embedded_inputs)
                    tower_enc_conv_output_shape.append(enc_conv_output_shape)
                    tower_encoder_outputs.append(encoder_outputs)
                    tower_residual.append(residual)
                    tower_projected_residual.append(projected_residual)

            log('initialisation done {}'.format(gpus[i]))

        if is_training:
            # self.helper is the helper of the last tower built above.
            self.ratio = self.helper._ratio
        self.tower_inputs = tower_inputs
        self.tower_input_lengths = tower_input_lengths
        self.tower_mel_targets = tower_mel_targets
        self.tower_targets_lengths = tower_targets_lengths
        self.tower_stop_token_targets = tower_stop_token_targets

        self.all_vars = tf.trainable_variables()

        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log('  Train mode:               {}'.format(is_training))
        log('  Input:                    {}'.format(inputs.shape))
        for i in range(hp.tacotron_num_gpus):
            log('  device:                   {}'.format(i))
            log('  embedding:                {}'.format(
                tower_embedded_inputs[i].shape))
            log('  enc conv out:             {}'.format(
                tower_enc_conv_output_shape[i]))
            log('  encoder out:              {}'.format(
                tower_encoder_outputs[i].shape))
            log('  decoder out:              {}'.format(
                self.tower_decoder_output[i].shape))
            log('  residual out:             {}'.format(
                tower_residual[i].shape))
            log('  projected residual out:   {}'.format(
                tower_projected_residual[i].shape))
            log('  mel out:                  {}'.format(
                self.tower_mel_outputs[i].shape))
            log('  <stop_token> out:         {}'.format(
                self.tower_stop_token_prediction[i].shape))

        # 1_000_000 is causing syntax problems for some people?! Python please :)
        log('  Tacotron Parameters       {:.3f} Million.'.format(
            np.sum([np.prod(v.get_shape().as_list())
                    for v in self.all_vars]) / 1000000))
Ejemplo n.º 13
0
	def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False,
			global_step=None, is_training=False, is_evaluating=False, reference_mel=None):
		"""
		Builds the Tacotron graph (encoder, style branch, attention decoder, postnet) under the
		'inference' variable scope and stores the resulting tensors on the model.

		sets "mel_outputs", "alignments", "style_embeddings" and the other fields assigned at the
		end of this method ("linear_outputs" only when linear prediction is active).

		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
			  steps in the input time series, and values are character IDs
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
			of each sequence in inputs.
			- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
			of steps in the output time series, M is num_mels, and values are entries in the mel
			spectrogram. Only needed for training.
			- stop_token_targets: handed to the training helper together with mel_targets. Must be
			given whenever mel_targets is given, except in GTA mode.
			- linear_targets: stored as "linear_targets" when linear prediction is active. Required
			when training with hp.predict_linear, rejected in GTA mode.
			- targets_lengths: stored as "targets_lengths". Required when training with hp.mask_decoder.
			- gta: when True the training helper is used and the linear post-processing is skipped.
			- global_step: forwarded to the training helper. Must not be None when training with
			'scheduled' teacher forcing.
			- is_training, is_evaluating: mutually exclusive mode flags. Decoding is only capped at
			hp.max_iters when both are False.
			- reference_mel: optional input of the reference encoder. Ignored when is_training is True
			(mel_targets is used instead). Presumably shaped like mel_targets (to be confirmed against
			reference_encoder). When no reference is available the style embedding is built from
			randomly weighted style tokens, which requires hp.use_gst.

		Raises:
			- ValueError: on inconsistent targets, or when no style reference is available while
			hp.use_gst is disabled.
			- RuntimeError: when decoder masking lacks targets_lengths, or when both is_training and
			is_evaluating are set.
		"""
		if mel_targets is None and stop_token_targets is not None:
			raise ValueError('no mel targets were provided but token_targets were given')
		if mel_targets is not None and stop_token_targets is None and not gta:
			raise ValueError('Mel targets are provided without corresponding token_targets')
		if not gta and self._hparams.predict_linear==True and linear_targets is None and is_training:
			raise ValueError('Model is set to use post processing to predict linear spectrograms in training but no linear targets given!')
		if gta and linear_targets is not None:
			raise ValueError('Linear spectrogram prediction is not supported in GTA mode!')
		if is_training and self._hparams.mask_decoder and targets_lengths is None:
			raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!')
		if is_training and is_evaluating:
			raise RuntimeError('Model can not be in training and evaluation modes at the same time!')

		#The reference encoder consumes mel_targets in training and reference_mel otherwise.
		#Without any reference the style embedding is built from the GST tokens, which are only
		#created when hp.use_gst is set: fail early with a clear message instead of hitting an
		#unbound local variable in the middle of graph construction.
		style_reference = mel_targets if is_training else reference_mel
		if style_reference is None and not self._hparams.use_gst:
			raise ValueError('No reference mel is available and hparams.use_gst is disabled: can not compute style embeddings!')

		with tf.variable_scope('inference'):
			batch_size = tf.shape(inputs)[0]
			hp = self._hparams
			assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
			if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
				assert global_step is not None

			#GTA is only used for predicting mels to train Wavenet vocoder, so we omit post processing when doing GTA synthesis
			post_condition = hp.predict_linear and not gta

			# Embeddings ==> [batch_size, sequence_length, embedding_dim]
			embedding_table = tf.get_variable(
				'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32)
			embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)


			if hp.use_gst:
				#Global style tokens (GST)
				gst_tokens = tf.get_variable('style_tokens', [hp.num_gst, hp.style_embed_depth // hp.num_heads],
					dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5))
				self.gst_tokens = gst_tokens


			#Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
			encoder_cell = TacotronEncoderCell(
				EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'),
				EncoderRNN(is_training, size=hp.encoder_lstm_units,
					zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM'))

			encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

			#For shape visualization purpose
			enc_conv_output_shape = encoder_cell.conv_output_shape

			if style_reference is not None:
				# Reference encoder
				refnet_outputs = reference_encoder(
					style_reference,
					filters=hp.reference_filters,
					kernel_size=(3,3),
					strides=(2,2),
					encoder_cell=GRUCell(hp.reference_depth),
					is_training=is_training)                                                 # [N, 128]
				self.refnet_outputs = refnet_outputs

				if hp.use_gst:
					# Style attention: the reference embedding attends over the style tokens
					style_attention = MultiheadAttention(
						tf.expand_dims(refnet_outputs, axis=1),                                   # [N, 1, 128]
						tf.tanh(tf.tile(tf.expand_dims(gst_tokens, axis=0), [batch_size,1,1])),            # [N, hp.num_gst, 256/hp.num_heads]
						num_heads=hp.num_heads,
						num_units=hp.style_att_dim,
						attention_type=hp.style_att_type)

					style_embeddings = style_attention.multi_head_attention()
				else:
					#No style tokens: the reference embedding itself is the style embedding
					style_embeddings = tf.expand_dims(refnet_outputs, axis=1)                   # [N, 1, 128]
			else:
				#No reference available (hp.use_gst is guaranteed by the check above):
				#combine the style tokens with random softmax weights, one weight row per head.
				print("Use random weight for GST.")
				random_weights = tf.random_uniform([hp.num_heads, hp.num_gst], maxval=1.0, dtype=tf.float32)
				random_weights = tf.nn.softmax(random_weights, name="random_weights")
				style_embeddings = tf.matmul(random_weights, tf.nn.tanh(gst_tokens))
				style_embeddings = tf.reshape(style_embeddings, [1, 1] + [hp.num_heads * gst_tokens.get_shape().as_list()[1]])


			#Extend style embeddings to be compatible with encoder_outputs.
			#Doubles the last dimension by concatenating an all-zero tensor of the same shape; the
			#tf.add below needs the doubled depth to match the last dimension of encoder_outputs.
			#Preserves effect of both style and encoder_outputs.
			zero_padding = tf.zeros_like(style_embeddings)
			style_embeddings = tf.concat([style_embeddings, zero_padding], axis=-1)


			# Add style embedding to every text encoder state
			style_embeddings = tf.tile(style_embeddings, [1, shape_list(encoder_outputs)[1], 1]) # [N, T_in, 128]
			encoder_outputs = tf.add(encoder_outputs, style_embeddings)

			#Decoder Parts
			#Attention Decoder Prenet
			prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet')
			#Attention Mechanism
			attention_mechanism = LocationSensitiveAttention(hp.attention_dim, encoder_outputs, hparams=hp,
				mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths, smoothing=hp.smoothing,
				cumulate_weights=hp.cumulative_weights)
			#Decoder LSTM Cells

			decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
				size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_lstm')
			#Frames Projection layer
			frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, scope='linear_transform')
			#<stop_token> projection layer
			stop_projection = StopProjection(is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection')


			#Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
			decoder_cell = TacotronDecoderCell(
				prenet,
				attention_mechanism,
				decoder_lstm,
				frame_projection,
				stop_projection)
			#Define the helper for our decoder
			if is_training or is_evaluating or gta:
				self.helper = TacoTrainingHelper(batch_size, mel_targets, stop_token_targets, hp, gta, is_evaluating, global_step)
			else:
				self.helper = TacoTestHelper(batch_size, hp)


			#initial decoder state
			decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

			#Only use max iterations at synthesis time
			max_iters = hp.max_iters if not (is_training or is_evaluating) else None

			#Decode
			(frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
				CustomDecoder(decoder_cell, self.helper, decoder_init_state),
				impute_finished=False,
				maximum_iterations=max_iters,
				swap_memory=hp.tacotron_swap_with_cpu)


			# Reshape outputs to be one output per entry
			#==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
			decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
			stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])


			#Postnet
			postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions')

			#Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
			residual = postnet(decoder_output)

			#Project residual to same dimension as mel spectrogram
			#==> [batch_size, decoder_steps * r, num_mels]
			residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection')
			projected_residual = residual_projection(residual)


			#Compute the mel spectrogram
			mel_outputs = decoder_output + projected_residual


			if post_condition:
				#Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
				#Post-processing Network to map mels to linear spectrograms using same architecture as the encoder
				post_processing_cell = TacotronEncoderCell(
					EncoderConvolutions(is_training, hparams=hp, scope='post_processing_convolutions'),
					EncoderRNN(is_training, size=hp.encoder_lstm_units,
						zoneout=hp.tacotron_zoneout_rate, scope='post_processing_LSTM'))

				expand_outputs = post_processing_cell(mel_outputs)
				linear_outputs = FrameProjection(hp.num_freq, scope='post_processing_projection')(expand_outputs)

			#Grab alignments from the final decoder state
			alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0])

			#Expose inputs, targets and predictions on the model
			if is_training:
				self.ratio = self.helper._ratio
			self.inputs = inputs
			self.input_lengths = input_lengths
			self.decoder_output = decoder_output
			self.alignments = alignments
			self.style_embeddings = style_embeddings
			self.stop_token_prediction = stop_token_prediction
			self.stop_token_targets = stop_token_targets
			self.mel_outputs = mel_outputs
			if post_condition:
				self.linear_outputs = linear_outputs
				self.linear_targets = linear_targets
			self.mel_targets = mel_targets
			self.targets_lengths = targets_lengths
			log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
			log('  Train mode:               {}'.format(is_training))
			log('  Eval mode:                {}'.format(is_evaluating))
			log('  GTA mode:                 {}'.format(gta))
			log('  Synthesis mode:           {}'.format(not (is_training or is_evaluating)))
			log('  embedding:                {}'.format(embedded_inputs.shape))
			log('  enc conv out:             {}'.format(enc_conv_output_shape))
			log('  encoder out:              {}'.format(encoder_outputs.shape))
			log('  decoder out:              {}'.format(decoder_output.shape))
			log('  residual out:             {}'.format(residual.shape))
			log('  projected residual out:   {}'.format(projected_residual.shape))
			log('  style embedding:         %d' % style_embeddings.shape[-1])
			log('  mel out:                  {}'.format(mel_outputs.shape))
			if post_condition:
				log('  linear out:               {}'.format(linear_outputs.shape))
			log('  <stop_token> out:         {}'.format(stop_token_prediction.shape))
    def initialize(self,
                   inputs,
                   input_speaker_id,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   linear_targets=None,
                   targets_lengths=None,
                   gta=False,
                   global_step=None,
                   is_training=False,
                   is_evaluating=False):
        """
		Initializes the model for inference

		sets "mel_outputs" and "alignments" fields.

		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
			  steps in the input time series, and values are character IDs
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
			of each sequence in inputs.
			- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
			of steps in the output time series, M is num_mels, and values are entries in the mel
			spectrogram. Only needed for training.
		"""
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no mel targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')
        if not gta and self._hparams.predict_linear == True and linear_targets is None and is_training:
            raise ValueError(
                'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
            )
        if gta and linear_targets is not None:
            raise ValueError(
                'Linear spectrogram prediction is not supported in GTA mode!')
        if is_training and self._hparams.mask_decoder and targets_lengths is None:
            raise RuntimeError(
                'Model set to mask paddings but no targets lengths provided for the mask!'
            )
        if is_training and is_evaluating:
            raise RuntimeError(
                'Model can not be in training and evaluation modes at the same time!'
            )

        with tf.variable_scope('inference') as scope:
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams
            assert hp.tacotron_teacher_forcing_mode in ('constant',
                                                        'scheduled')
            if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                assert global_step is not None

            #GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis
            post_condition = hp.predict_linear and not gta

            # Embeddings ==> [batch_size, sequence_length, embedding_dim]
            # embedding_table = tf.get_variable(
            # 	'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32)
            # embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

            # Speaker Embeddings ==> [batch_size, embedding_dim]
            self.speaker_id_embedding_table = tf.get_variable(
                'input_speaker_id_embedding', [hp.speaker_num, hp.speaker_dim],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_speaker_id = tf.nn.embedding_lookup(
                self.speaker_id_embedding_table, input_speaker_id)

            #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
            encoder_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training,
                                    hparams=hp,
                                    scope='encoder_convolutions'),
                EncoderRNN(is_training,
                           size=hp.encoder_lstm_units,
                           zoneout=hp.tacotron_zoneout_rate,
                           scope='encoder_LSTM'))
            print('inputs:', inputs)
            # inputs = tf.Print(inputs, [inputs], "inputs: ",summarize=9)
            encoder_outputs = encoder_cell(inputs, input_lengths)

            #first change encoder_outputs to concated version.

            #second add. need same dims
            #encoder_outputs = encoder_outputs + embedded_speaker_id
            '''
			#first concat.
			input_seq_len = tf.shape(encoder_outputs)[1]
			print('!!!!!!!!!!before tile')
			embedded_speaker_id = tf.expand_dims(embedded_speaker_id, 1)
			embedded_speaker_id = tf.tile(embedded_speaker_id, multiples=[1, input_seq_len, 1])
			print('!!!!!!!!!!after tile')
			id_encoder_outputs = tf.concat([encoder_outputs, embedded_speaker_id], axis=-1)
			'''
            id_encoder_outputs = encoder_outputs
            #still use encoder_outputs

            #use keras version, but not run.
            '''
			print('hhhhhhhhhhhhhhhhhhhhhhhhhhhh')
			hp_lambda = 1.0
			Flip = GradientReversal(hp_lambda)
			Flip_encoder_outputs = Flip(encoder_outputs)
			'''
            '''
			#use tensorflow version, but star's is only 5 and i don't understand.
			Flip_encoder_outputs = flip_gradient(encoder_outputs, l=1.0)
			print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!', Flip_encoder_outputs, type(Flip_encoder_outputs))
			densed_256_encoder_outputs = tf.layers.dense(Flip_encoder_outputs, 256, tf.nn.relu)
			softmax_encoder_outputs = tf.layers.dense(densed_256_encoder_outputs, hp.speaker_num, tf.nn.softmax)
			
			long_speaker_id = tf.reshape(input_speaker_id, shape = [tf.shape(inputs)[0], 1])
			tiled_speaker_id = tf.tile(long_speaker_id, multiples=[1, tf.shape(softmax_encoder_outputs)[1]])
			print('tiled_speaker_id', tiled_speaker_id)
			one_hot_speaker_id = tf.one_hot(tiled_speaker_id, depth=hp.speaker_num)
			print('one_hot_speaker_id', one_hot_speaker_id)
			#self.one_hot_speaker_id and self.softmax_encoder_outputs is at below
			#long_speaker_id = tf.expand_dims(long_speaker_id, axis=2)
			#dann_out = Dense(2)(dann_in)
			#Flip_encoder_outputs = 
			'''
            #For shape visualization purpose
            enc_conv_output_shape = encoder_cell.conv_output_shape

            #Decoder Parts
            #Attention Decoder Prenet
            prenet = Prenet(is_training,
                            layers_sizes=hp.prenet_layers,
                            drop_rate=hp.tacotron_dropout_rate,
                            scope='decoder_prenet')
            #Attention Mechanism
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_dim,
                id_encoder_outputs,
                hparams=hp,
                is_training=is_training,
                mask_encoder=hp.mask_encoder,
                memory_sequence_length=input_lengths,
                smoothing=hp.smoothing,
                cumulate_weights=hp.cumulative_weights)
            #Decoder LSTM Cells
            decoder_lstm = DecoderRNN(is_training,
                                      layers=hp.decoder_layers,
                                      size=hp.decoder_lstm_units,
                                      zoneout=hp.tacotron_zoneout_rate,
                                      scope='decoder_lstm')
            #Frames Projection layer
            frame_projection = FrameProjection(hp.num_mels *
                                               hp.outputs_per_step,
                                               scope='linear_transform')
            #<stop_token> projection layer
            stop_projection = StopProjection(is_training or is_evaluating,
                                             shape=hp.outputs_per_step,
                                             scope='stop_token_projection')

            #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
            decoder_cell = TacotronDecoderCell(prenet, attention_mechanism,
                                               decoder_lstm, frame_projection,
                                               stop_projection)

            #Define the helper for our decoder
            if is_training or is_evaluating or gta:
                self.helper = TacoTrainingHelper(batch_size, mel_targets, hp,
                                                 gta, is_evaluating,
                                                 global_step)
            else:
                self.helper = TacoTestHelper(batch_size, hp)

            #initial decoder state
            decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                         dtype=tf.float32)

            #Only use max iterations at synthesis time
            max_iters = hp.max_iters if not (is_training
                                             or is_evaluating) else None

            #Decode
            (frames_prediction, stop_token_prediction,
             _), final_decoder_state, _ = dynamic_decode(
                 CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                 impute_finished=False,
                 maximum_iterations=max_iters,
                 swap_memory=hp.tacotron_swap_with_cpu)

            # Reshape outputs to be one output per entry
            #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
            decoder_output = tf.reshape(frames_prediction,
                                        [batch_size, -1, hp.num_mels])
            stop_token_prediction = tf.reshape(stop_token_prediction,
                                               [batch_size, -1])

            #Postnet
            postnet = Postnet(is_training,
                              hparams=hp,
                              scope='postnet_convolutions')

            #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
            residual = postnet(decoder_output)

            #Project residual to same dimension as mel spectrogram
            #==> [batch_size, decoder_steps * r, num_mels]
            residual_projection = FrameProjection(hp.num_mels,
                                                  scope='postnet_projection')
            projected_residual = residual_projection(residual)

            #Compute the mel spectrogram
            mel_outputs = decoder_output + projected_residual

            if post_condition:
                # Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs.
                post_cbhg = CBHG(hp.cbhg_kernels,
                                 hp.cbhg_conv_channels,
                                 hp.cbhg_pool_size,
                                 [hp.cbhg_projection, hp.num_mels],
                                 hp.cbhg_projection_kernel_size,
                                 hp.cbhg_highwaynet_layers,
                                 hp.cbhg_highway_units,
                                 hp.cbhg_rnn_units,
                                 hp.batch_norm_position,
                                 is_training,
                                 name='CBHG_postnet')

                #[batch_size, decoder_steps(mel_frames), cbhg_channels]
                post_outputs = post_cbhg(mel_outputs, None)

                #Linear projection of extracted features to make linear spectrogram
                linear_specs_projection = FrameProjection(
                    hp.num_freq, scope='cbhg_linear_specs_projection')

                #[batch_size, decoder_steps(linear_frames), num_freq]
                linear_outputs = linear_specs_projection(post_outputs)

            #Grab alignments from the final decoder state
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(), [1, 2, 0])

            if is_training:
                self.ratio = self.helper._ratio
            self.inputs = inputs
            self.input_speaker_id = input_speaker_id
            #self.one_hot_speaker_id and self.softmax_encoder_outputs
            #self.softmax_encoder_outputs = softmax_encoder_outputs
            #self.one_hot_speaker_id = one_hot_speaker_id
            self.input_lengths = input_lengths
            self.decoder_output = decoder_output
            self.alignments = alignments
            self.stop_token_prediction = stop_token_prediction
            self.stop_token_targets = stop_token_targets
            self.mel_outputs = mel_outputs
            if post_condition:
                self.linear_outputs = linear_outputs
                self.linear_targets = linear_targets
            self.mel_targets = mel_targets
            self.targets_lengths = targets_lengths
            log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
            log('  Train mode:               {}'.format(is_training))
            log('  Eval mode:                {}'.format(is_evaluating))
            log('  GTA mode:                 {}'.format(gta))
            log('  Synthesis mode:           {}'.format(not (
                is_training or is_evaluating)))
            log('  embedding:                {}'.format(inputs.shape))
            log('  enc conv out:             {}'.format(enc_conv_output_shape))
            log('  encoder out:              {}'.format(encoder_outputs.shape))
            log('  id encoder out:              {}'.format(
                id_encoder_outputs.shape))
            log('  decoder out:              {}'.format(decoder_output.shape))
            log('  residual out:             {}'.format(residual.shape))
            log('  projected residual out:   {}'.format(
                projected_residual.shape))
            log('  mel out:                  {}'.format(mel_outputs.shape))
            if post_condition:
                log('  linear out:               {}'.format(
                    linear_outputs.shape))
            log('  <stop_token> out:         {}'.format(
                stop_token_prediction.shape))
Ejemplo n.º 15
0
	def initialize(self, inputs, input_lengths, feature_targets=None, stop_token_targets=None, targets_lengths=None, gta=False,
			global_step=None, is_training=False, is_evaluating=False):
		"""
		Builds the Tacotron graph predicting vocoder features (mgc, lf0, vuv, bap) under the
		'inference' variable scope and stores the resulting tensors on the model.

		sets "final_outputs", the per-feature outputs and "alignments" fields.

		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
			  steps in the input time series, and values are character IDs
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
			of each sequence in inputs.
			- feature_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
			of steps in the output time series, M is num_mgc + num_lf0 + num_vuv + num_bap, and values are
			entries in the spectrogram. Only needed for training.
			- stop_token_targets: stored as "stop_token_targets". Must be given whenever
			feature_targets is given, except in GTA mode.
			- targets_lengths: stored as "targets_lengths". Required when training with hp.mask_decoder.
			- gta: when True the training helper is used even outside training / evaluation.
			- global_step: forwarded to the training helper. Must not be None when training with
			'scheduled' teacher forcing.
			- is_training, is_evaluating: mutually exclusive mode flags. Decoding is only capped at
			hp.max_iters when both are False.

		Raises:
			- ValueError: when feature_targets and stop_token_targets are inconsistent.
			- RuntimeError: when decoder masking lacks targets_lengths, or when both is_training and
			is_evaluating are set.
		"""
		if feature_targets is None and stop_token_targets is not None:
			raise ValueError('no feature targets were provided but token_targets were given')
		if feature_targets is not None and stop_token_targets is None and not gta:
			#This model has no mel targets: name the argument that was actually given
			raise ValueError('Feature targets are provided without corresponding token_targets')
		if is_training and self._hparams.mask_decoder and targets_lengths is None:
			raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!')
		if is_training and is_evaluating:
			raise RuntimeError('Model can not be in training and evaluation modes at the same time!')

		with tf.variable_scope('inference'):
			batch_size = tf.shape(inputs)[0]
			hp = self._hparams
			#Depth of one predicted frame: all vocoder feature streams side by side
			target_depth = hp.num_mgc + hp.num_lf0 + hp.num_vuv + hp.num_bap
			assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
			if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
				assert global_step is not None

			# Embeddings ==> [batch_size, sequence_length, embedding_dim]
			embedding_table = tf.get_variable(
				'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32)
			embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)


			#Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
			encoder_cell = TacotronEncoderCell(
				EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'),
				EncoderRNN(is_training, size=hp.encoder_lstm_units,
					zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM'))

			encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

			#For shape visualization purpose
			enc_conv_output_shape = encoder_cell.conv_output_shape


			#Decoder Parts
			#Attention Decoder Prenet
			prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet')
			#Attention Mechanism
			attention_mechanism = LocationSensitiveAttention(hp.attention_dim, encoder_outputs, hparams=hp,
				is_training=is_training, mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths,
				smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights)
			#Decoder LSTM Cells
			decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
				size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_lstm')
			#Frames Projection layer
			frame_projection = FrameProjection(target_depth * hp.outputs_per_step, scope='mgc_transform')
			#<stop_token> projection layer
			stop_projection = StopProjection(is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection')


			#Decoder Cell ==> [batch_size, decoder_steps, target_depth * r] (after decoding)
			decoder_cell = TacotronDecoderCell(prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection)


			#Define the helper for our decoder
			if is_training or is_evaluating or gta:
				self.helper = TacoTrainingHelper(batch_size, feature_targets, target_depth, hp, gta, is_evaluating, global_step)
			else:
				self.helper = TacoTestHelper(batch_size, target_depth, hp)


			#initial decoder state
			decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

			#Only use max iterations at synthesis time
			max_iters = hp.max_iters if not (is_training or is_evaluating) else None

			#Decode
			(frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
				CustomDecoder(decoder_cell, self.helper, decoder_init_state),
				impute_finished=False,
				maximum_iterations=max_iters,
				swap_memory=hp.tacotron_swap_with_cpu)


			# Reshape outputs to be one output per entry
			#==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), target_depth]
			decoder_outputs = tf.reshape(frames_prediction, [batch_size, -1, target_depth])
			stop_token_outputs = tf.reshape(stop_token_prediction, [batch_size, -1])


			#Postnet
			postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions')

			#Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
			residual = postnet(decoder_outputs)

			#Project residual to same dimension as target depth
			#==> [batch_size, decoder_steps * r, target_depth]
			residual_projection = FrameProjection(target_depth, scope='postnet_projection')
			projected_residual = residual_projection(residual)


			#Compute the final outputs
			final_outputs = decoder_outputs + projected_residual

			#Compute each feature outputs
			#The streams are laid out along the last axis in the order mgc, lf0, vuv, bap
			mgc_idx = 0
			lf0_idx = mgc_idx + hp.num_mgc
			vuv_idx = lf0_idx + hp.num_lf0
			bap_idx = vuv_idx + hp.num_vuv
			mgc_outputs = tf.slice(final_outputs, [0, 0, mgc_idx], [-1, -1, hp.num_mgc], name='mgc_outputs')
			lf0_outputs = tf.slice(final_outputs, [0, 0, lf0_idx], [-1, -1, hp.num_lf0])
			#Squeezing the last axis only works when that axis has size 1, i.e. hp.num_lf0 == 1
			lf0_outputs = tf.squeeze(lf0_outputs, axis=-1, name='lf0_outputs')
			vuv_outputs = tf.slice(final_outputs, [0, 0, vuv_idx], [-1, -1, hp.num_vuv], name='vuv_outputs')
			bap_outputs = tf.slice(final_outputs, [0, 0, bap_idx], [-1, -1, hp.num_bap], name='bap_outputs')


			#Grab alignments from the final decoder state
			alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0], name='alignments')

			#Expose inputs, targets and predictions on the model
			if is_training:
				self.ratio = self.helper._ratio
			self.inputs = inputs
			self.input_lengths = input_lengths
			self.decoder_outputs = decoder_outputs
			self.final_outputs = final_outputs
			self.feature_targets = feature_targets
			self.alignments = alignments
			self.stop_token_outputs = stop_token_outputs
			self.stop_token_targets = stop_token_targets
			self.lf0_outputs = lf0_outputs
			self.mgc_outputs = mgc_outputs
			self.vuv_outputs = vuv_outputs
			self.bap_outputs = bap_outputs
			self.targets_lengths = targets_lengths
			log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
			log('  Train mode:               {}'.format(is_training))
			log('  Eval mode:                {}'.format(is_evaluating))
			log('  GTA mode:                 {}'.format(gta))
			log('  Synthesis mode:           {}'.format(not (is_training or is_evaluating)))
			log('  embedding:                {}'.format(embedded_inputs.shape))
			log('  enc conv out:             {}'.format(enc_conv_output_shape))
			log('  encoder out:              {}'.format(encoder_outputs.shape))
			log('  decoder out:              {}'.format(decoder_outputs.shape))
			log('  residual out:             {}'.format(residual.shape))
			log('  projected residual out:   {}'.format(projected_residual.shape))
			log('  final out:                {}'.format(final_outputs.shape))
			#lf0 was squeezed above; the last axis is restored here for display only
			log('  lf0 out:                  {}'.format(tf.expand_dims(lf0_outputs, axis=-1).shape))
			log('  mgc out:                  {}'.format(mgc_outputs.shape))
			log('  vuv out:                  {}'.format(vuv_outputs.shape))
			log('  bap out:                  {}'.format(bap_outputs.shape))
			log('  <stop_token> out:         {}'.format(stop_token_outputs.shape))
    def initialize(self,
                   inputs,
                   speaker,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   linear_targets=None,
                   targets_lengths=None,
                   gta=False,
                   reference_mels=None,
                   reference_lengths=None,
                   global_step=None,
                   is_training=False,
                   is_evaluating=False,
                   split_infos=None,
                   Lf0=None):
        """Build the multi-tower Tacotron graph and store its outputs on self.

        The batch is split into ``hp.tacotron_num_gpus`` towers, one model
        replica is built per tower under the ``inference`` variable scope, and
        the per-tower results are stored as lists on ``self``
        (``tower_decoder_output``, ``tower_alignments``,
        ``tower_stop_token_prediction``, ``tower_mel_outputs`` and, when linear
        post-processing is enabled, ``tower_linear_outputs``).

        Args:
            inputs: Input features. Each tower slice is reshaped to
                [N, T_in, 345] and fed directly to the encoder (no embedding
                lookup is applied).
                NOTE(review): presumably PPG features of width 345 -- confirm
                against the feeder.
            speaker: Speaker ids, split along axis 0 across towers. Used for
                the speaker embedding lookup when ``hp.use_multispeaker``.
            input_lengths: Lengths of each sequence in ``inputs``; split along
                axis 0 across towers.
            mel_targets: Optional mel targets, reshaped per tower to
                [N, T_out, hp.num_mels]. Consumed by the training helper, i.e.
                when ``is_training``, ``is_evaluating`` or ``gta`` is set.
            stop_token_targets: Optional stop-token targets, reshaped per
                tower to [N, T_out].
            linear_targets: Optional linear-spectrogram targets, reshaped per
                tower to [N, T_out, hp.num_freq]. Not allowed together with
                ``gta``.
            targets_lengths: Optional target lengths; required when training
                with ``hp.mask_decoder``.
            gta: Ground-truth-aligned synthesis mode. Disables the linear
                post-processing network.
            reference_mels: Optional reference mels for the style encoder,
                reshaped per tower to [N, T_ref, hp.num_mels].
            reference_lengths: Optional lengths of ``reference_mels``; only
                split and stored, not otherwise used here.
            global_step: Training step; must be given when training with
                ``hp.tacotron_teacher_forcing_mode == 'scheduled'``.
            is_training: Build the graph in training mode.
            is_evaluating: Build the graph in evaluation mode. Mutually
                exclusive with ``is_training``.
            split_infos: 2-D tensor of split information handed to
                ``split_func``. Column 0 is used for ``inputs``, 1 for
                ``mel_targets``, 2 for ``stop_token_targets``, 3 for
                ``linear_targets``, 4 for ``reference_mels`` and 5 for
                ``Lf0``. Required.
            Lf0: F0 features, reshaped per tower to [N, T_in, 2] and
                concatenated to the encoder outputs on the last axis.
                Required.

        Raises:
            ValueError: On inconsistent target arguments, on a missing
                ``split_infos`` or ``Lf0``, or on an unsupported
                ``hp.style_encoder_type``.
            RuntimeError: If decoder masking is requested without
                ``targets_lengths``, or if both training and evaluation modes
                are requested.
            AssertionError: On an unknown teacher forcing mode, or a missing
                ``global_step`` in scheduled mode while training.
        """
        hp = self._hparams

        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no multi targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')
        if not gta and hp.predict_linear and linear_targets is None and is_training:
            raise ValueError(
                'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
            )
        if gta and linear_targets is not None:
            raise ValueError(
                'Linear spectrogram prediction is not supported in GTA mode!')
        if is_training and hp.mask_decoder and targets_lengths is None:
            raise RuntimeError(
                'Model set to mask paddings but no targets lengths provided for the mask!'
            )
        if is_training and is_evaluating:
            raise RuntimeError(
                'Model can not be in training and evaluation modes at the same time!'
            )
        # Both arguments default to None for signature compatibility but are
        # used unconditionally below, so fail early with a readable message.
        if split_infos is None:
            raise ValueError(
                'split_infos must be provided to split the batch across towers!')
        if Lf0 is None:
            raise ValueError(
                'Lf0 must be provided: it is concatenated to the encoder outputs!')

        if hp.tacotron_num_gpus > 1 or hp.split_on_cpu:
            split_device = '/cpu:0'
        else:
            split_device = '/gpu:{}'.format(hp.tacotron_gpu_start_idx)

        with tf.device(split_device):
            lout_float = [tf.float32] * hp.tacotron_num_gpus

            def split_across_towers(tensor, column):
                # Split one optional batch tensor into per-tower pieces using
                # the given split_infos column; None is passed through.
                if tensor is None:
                    return None
                return tf.py_func(split_func,
                                  [tensor, split_infos[:, column]],
                                  lout_float)

            tower_speaker = tf.split(speaker,
                                     num_or_size_splits=hp.tacotron_num_gpus,
                                     axis=0)
            tower_input_lengths = tf.split(
                input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0)
            tower_targets_lengths = tf.split(
                targets_lengths,
                num_or_size_splits=hp.tacotron_num_gpus,
                axis=0) if targets_lengths is not None else targets_lengths
            tower_reference_lengths = tf.split(
                reference_lengths,
                num_or_size_splits=hp.tacotron_num_gpus,
                axis=0) if reference_lengths is not None else reference_lengths

            p_inputs = split_across_towers(inputs, 0)
            p_Lf0 = split_across_towers(Lf0, 5)
            p_mel_targets = split_across_towers(mel_targets, 1)
            p_stop_token_targets = split_across_towers(stop_token_targets, 2)
            p_linear_targets = split_across_towers(linear_targets, 3)
            p_reference_mels = split_across_towers(reference_mels, 4)

            tower_inputs = []
            tower_Lf0 = []
            tower_mel_targets = []
            tower_stop_token_targets = []
            tower_linear_targets = []
            tower_reference_mels = []

            batch_size = tf.shape(inputs)[0]
            mel_channels = hp.num_mels
            linear_channels = hp.num_freq
            # py_func outputs carry no static shape, so restore it per tower.
            for i in range(hp.tacotron_num_gpus):
                tower_inputs.append(
                    tf.reshape(p_inputs[i], [batch_size, -1, 345]))
                # NOTE: the Lf0 features are 2-dimensional here.
                tower_Lf0.append(tf.reshape(p_Lf0[i], [batch_size, -1, 2]))
                if p_mel_targets is not None:
                    tower_mel_targets.append(
                        tf.reshape(p_mel_targets[i],
                                   [batch_size, -1, mel_channels]))
                if p_stop_token_targets is not None:
                    tower_stop_token_targets.append(
                        tf.reshape(p_stop_token_targets[i], [batch_size, -1]))
                if p_linear_targets is not None:
                    tower_linear_targets.append(
                        tf.reshape(p_linear_targets[i],
                                   [batch_size, -1, linear_channels]))
                if p_reference_mels is not None:
                    tower_reference_mels.append(
                        tf.reshape(p_reference_mels[i],
                                   [batch_size, -1, mel_channels]))

        self.tower_decoder_output = []
        self.tower_alignments = []
        self.tower_stop_token_prediction = []
        self.tower_mel_outputs = []
        self.tower_linear_outputs = []
        self.styleembedding = None
        tower_embedded_inputs = []
        tower_enc_conv_output_shape = []
        tower_encoder_outputs = []
        tower_residual = []

        # GTA is only used for predicting mels to train the Wavenet vocoder,
        # so post processing is omitted when doing GTA synthesis.
        post_condition = hp.predict_linear and not gta

        # 1. Declare GPU devices
        gpus = [
            "/gpu:{}".format(i)
            for i in range(hp.tacotron_gpu_start_idx,
                           hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)
        ]
        for i in range(hp.tacotron_num_gpus):
            with tf.device(
                    tf.train.replica_device_setter(ps_tasks=1,
                                                   ps_device="/cpu:0",
                                                   worker_device=gpus[i])):
                with tf.variable_scope('inference'):
                    assert hp.tacotron_teacher_forcing_mode in ('constant',
                                                                'scheduled')
                    if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                        assert global_step is not None

                    # The input features are used as-is: no embedding table
                    # or dense embedding layer is applied in this variant.
                    embedded_inputs = tower_inputs[i]
                    Lf0s = tower_Lf0[i]
                    if hp.use_multispeaker:
                        self.speaker_embedding_table = tf.get_variable(
                            'speaker_embedding',
                            [hp.speaker_num, hp.speaker_dim],
                            dtype=tf.float32)
                        speaker_embedding = tf.nn.embedding_lookup(
                            self.speaker_embedding_table, tower_speaker[i])
                        self.speaker_embedding = speaker_embedding

                    # Encoder cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                    encoder = TacotronEncoder(
                        EncoderConvolutions(is_training,
                                            hparams=hp,
                                            scope='encoder_convolutions'),
                        EncoderRNN(is_training,
                                   size=hp.encoder_lstm_units,
                                   zoneout=hp.tacotron_zoneout_rate,
                                   scope='encoder_LSTM'))

                    encoder_outputs = encoder(embedded_inputs,
                                              tower_input_lengths[i])

                    # Append the F0 features to every encoder step.
                    encoder_outputs = tf.concat([encoder_outputs, Lf0s],
                                                axis=-1)
                    self.z_embedding = None

                    if hp.use_style_encoder:
                        # Reference input encoder.
                        reference_embeddings = None
                        if reference_mels is not None:
                            reference_encoder = ReferenceEncoder(
                                hp, is_training, scope='reference_encoder')
                            reference_embeddings = reference_encoder(
                                tower_reference_mels[i])

                        if hp.style_encoder_type == 'gst':
                            style_encoder = GstEncoder(hp,
                                                       is_training,
                                                       scope='gst_encoder')
                            style_outputs = style_encoder(reference_embeddings)
                        elif hp.style_encoder_type == 'vae':
                            style_encoder = VaeEncoder(hp,
                                                       is_training,
                                                       scope='vae_encoder')
                            style_outputs = style_encoder(
                                reference_embeddings, batch_size)
                            self.z_mu = style_outputs['z_mu']
                            self.z_log_var = style_outputs['z_log_var']
                            self.z_embedding = style_outputs['z_embedding']
                        else:
                            raise ValueError(
                                "Only supported gst and vae and cvae!")

                        # [N, 1, style_embed_depth]
                        style_embeddings = style_outputs['style_embedding']
                        self.styleembedding = style_embeddings
                        if hp.concat_style:
                            style_embeddings = tf.tile(
                                style_embeddings,
                                [1, tf.shape(tower_inputs[i])[1], 1])
                            encoder_outputs = tf.concat(
                                [encoder_outputs, style_embeddings], axis=-1)
                        else:
                            # Previously added as tanh(style_embeddings);
                            # changed to a concat of the tiled embedding.
                            broadcast_style = tf.tile(
                                style_embeddings,
                                [1, tf.shape(encoder_outputs)[1], 1])

                            encoder_outputs = tf.concat(
                                [encoder_outputs, broadcast_style], axis=-1)
                    if hp.use_multispeaker:
                        # Tile the speaker embedding over the input time axis
                        # and append it to every encoder step.
                        speaker_embedding = tf.expand_dims(speaker_embedding,
                                                           axis=1)
                        speaker_embedding = tf.tile(
                            speaker_embedding,
                            [1, tf.shape(tower_inputs[i])[1], 1])
                        encoder_outputs = tf.concat(
                            [encoder_outputs, speaker_embedding], axis=-1)

                    # For shape visualization purposes
                    enc_conv_output_shape = encoder.conv_output_shape

                    # Decoder parts
                    # Attention decoder prenet
                    prenet = Prenet(hp, is_training, scope='decoder_prenet')
                    # Attention mechanism
                    attention_mechanism = LocationSensitiveSoftAttention(
                        hp.attention_dim, encoder_outputs,
                        tf.reshape(tower_input_lengths[i], [-1]))

                    # Decoder LSTM cells
                    decoder_lstm = DecoderRNN(is_training,
                                              layers=hp.decoder_layers,
                                              size=hp.decoder_lstm_units,
                                              zoneout=hp.tacotron_zoneout_rate,
                                              scope='decoder_LSTM')
                    # Frames projection layer
                    frame_projection = FrameProjection(
                        hp.num_mels * hp.outputs_per_step,
                        scope='linear_transform_projection')
                    # <stop_token> projection layer
                    stop_projection = StopProjection(
                        is_training,
                        shape=hp.outputs_per_step,
                        scope='stop_token_projection')

                    # Decoder cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                    decoder_cell = TacotronDecoderCell(
                        prenet, attention_mechanism, decoder_lstm,
                        frame_projection, stop_projection, self.z_embedding)

                    # Define the helper for our decoder
                    if is_training or is_evaluating or gta:
                        self.helper = TacoTrainingHelper(
                            batch_size, tower_mel_targets[i], hp, gta,
                            is_evaluating, global_step)
                    else:
                        self.helper = TacoTestHelper(batch_size, hp)

                    # Initial decoder state
                    decoder_init_state = decoder_cell.zero_state(
                        batch_size=batch_size, dtype=tf.float32)

                    # Only use max iterations at synthesis time
                    max_iters = hp.max_iters if not (
                        is_training or is_evaluating) else None

                    # Decode
                    (frames_prediction, stop_token_prediction,
                     _), final_decoder_state, _ = dynamic_decode(
                         CustomDecoder(decoder_cell, self.helper,
                                       decoder_init_state),
                         impute_finished=False,
                         maximum_iterations=max_iters,
                         swap_memory=hp.tacotron_swap_with_cpu)

                    # Reshape outputs to be one output per entry
                    # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                    decoder_output = tf.reshape(frames_prediction,
                                                [batch_size, -1, hp.num_mels])
                    stop_token_prediction = tf.reshape(stop_token_prediction,
                                                       [batch_size, -1])

                    # Postnet
                    postnet = Postnet(hparams=hp,
                                      training=is_training,
                                      output_size=hp.num_mels,
                                      scope='postnet_convolutions')

                    # Compute residual using post-net ==> [batch_size, decoder_steps * r, num_mels]
                    residual = postnet(decoder_output)
                    # Compute the mel spectrogram
                    mel_outputs = decoder_output + residual

                    if post_condition:
                        # Post-processing CBHG: extracts features from the
                        # mels before projection to linear spectrograms.
                        post_cbhg = CBHG(hp.cbhg_kernels,
                                         hp.cbhg_conv_channels,
                                         hp.cbhg_pool_size,
                                         [hp.cbhg_projection, hp.num_mels],
                                         hp.cbhg_projection_kernel_size,
                                         hp.cbhg_highwaynet_layers,
                                         hp.cbhg_highway_units,
                                         hp.cbhg_rnn_units,
                                         is_training,
                                         name='CBHG_postnet')

                        # [batch_size, decoder_steps(mel_frames), cbhg_channels]
                        post_outputs = post_cbhg(mel_outputs, None)

                        # Linear projection of extracted features to make linear spectrogram
                        linear_specs_projection = FrameProjection(
                            hp.num_freq, scope='cbhg_linear_specs_projection')

                        # [batch_size, decoder_steps(linear_frames), num_freq]
                        linear_outputs = linear_specs_projection(post_outputs)

                    # Grab alignments from the final decoder state
                    alignments = tf.transpose(
                        final_decoder_state.alignment_history.stack(),
                        [1, 2, 0])

                    self.tower_decoder_output.append(decoder_output)
                    self.tower_alignments.append(alignments)
                    self.tower_stop_token_prediction.append(
                        stop_token_prediction)
                    self.tower_mel_outputs.append(mel_outputs)
                    tower_embedded_inputs.append(embedded_inputs)
                    tower_enc_conv_output_shape.append(enc_conv_output_shape)
                    tower_encoder_outputs.append(encoder_outputs)
                    tower_residual.append(residual)

                    if post_condition:
                        self.tower_linear_outputs.append(linear_outputs)
            log('initialisation done {}'.format(gpus[i]))

        if is_training:
            self.ratio = self.helper._ratio
        self.tower_inputs = tower_inputs
        self.tower_speaker = tower_speaker
        self.tower_input_lengths = tower_input_lengths
        self.tower_mel_targets = tower_mel_targets
        self.tower_linear_targets = tower_linear_targets
        self.tower_targets_lengths = tower_targets_lengths
        self.tower_stop_token_targets = tower_stop_token_targets
        self.tower_reference_mels = tower_reference_mels
        self.tower_reference_lengths = tower_reference_lengths

        self.all_vars = tf.trainable_variables()

        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log('  Train mode:               {}'.format(is_training))
        log('  Eval mode:                {}'.format(is_evaluating))
        log('  GTA mode:                 {}'.format(gta))
        log('  Synthesis mode:           {}'.format(not (
            is_training or is_evaluating)))
        log('  Input:                    {}'.format(inputs.shape))
        # The per-tower lists hold exactly tacotron_num_gpus entries, so the
        # loop runs over tower indices; the GPU start offset is only added to
        # the reported device number.
        for i in range(hp.tacotron_num_gpus):
            log('  device:                   {}'.format(
                hp.tacotron_gpu_start_idx + i))
            log('  embedding:                {}'.format(
                tower_embedded_inputs[i].shape))
            log('  enc conv out:             {}'.format(
                tower_enc_conv_output_shape[i]))
            log('  encoder out:              {}'.format(
                tower_encoder_outputs[i].shape))
            log('  decoder out:              {}'.format(
                self.tower_decoder_output[i].shape))
            log('  residual out:             {}'.format(
                tower_residual[i].shape))
            log('  mel out:                  {}'.format(
                self.tower_mel_outputs[i].shape))
            if post_condition:
                log('  linear out:               {}'.format(
                    self.tower_linear_outputs[i].shape))
            log('  <stop_token> out:         {}'.format(
                self.tower_stop_token_prediction[i].shape))

            # A plain 1000000 literal is used because 1_000_000 is a syntax
            # error on older Python versions.
            log('  Tacotron Parameters       {:.3f} Million.'.format(
                np.sum(
                    [np.prod(v.get_shape().as_list())
                     for v in self.all_vars]) / 1000000))