Python TacoTrainingHelper Examples

Programming Language: Python

Namespace/Package Name: tacotron.models.helpers

Examples at hotexamples.com: 16

Python TacoTrainingHelper - 16 examples found. These are the top-rated real-world Python examples of tacotron.models.helpers.TacoTrainingHelper extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

TacoTrainingHelper(20)

Frequently Used Methods

TacoTrainingHelper (20)

Example #1

Show file

File: tacotron.py Project: silifor/taco2swe

    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   linear_targets=None,
                   targets_lengths=None,
                   gta=False,
                   global_step=None,
                   is_training=False,
                   is_evaluating=False):
        """
        Builds the Tacotron graph under the 'inference' variable scope.

        Validates the combination of arguments, then creates the embedding,
        encoder, attention decoder and postnet ops (plus the linear spectrogram
        ops when hparams.predict_linear is set and gta is False) and stores the
        resulting tensors on self.

        Sets the fields "helper", "inputs", "input_lengths", "decoder_output",
        "alignments", "stop_token_prediction", "stop_token_targets",
        "mel_outputs", "mel_targets" and "targets_lengths". Also sets "ratio"
        when is_training is True, and "linear_outputs" / "linear_targets" when
        linear prediction is active.

        Args:
            - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
              steps in the input time series, and values are character IDs
            - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
              lengths of each sequence in inputs.
            - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
              number of steps in the output time series, M is num_mels, and values are entries in
              the mel spectrogram. Only needed for training.
            - stop_token_targets: targets handed to TacoTrainingHelper together with mel_targets.
              Must accompany mel_targets unless gta is True.
              (presumably shaped [N, T_out] -- TODO confirm against the feeder)
            - linear_targets: linear spectrogram targets. Required when training with
              hparams.predict_linear outside GTA mode; must be None in GTA mode.
            - targets_lengths: lengths of the target sequences. Required when training with
              hparams.mask_decoder.
            - gta: when True the training helper is used and the linear post-processing branch
              is skipped.
            - global_step: passed to TacoTrainingHelper; must not be None when training with
              hparams.tacotron_teacher_forcing_mode == 'scheduled'.
            - is_training: boolean flag forwarded to the sub-modules; selects the training helper.
            - is_evaluating: boolean flag; selects the training helper and is forwarded to it.
              Mutually exclusive with is_training.

        Raises:
            ValueError: if the mel / stop token / linear targets are inconsistent with each other
              or with the gta flag.
            RuntimeError: if decoder masking is requested in training without targets_lengths, or
              if is_training and is_evaluating are both True.
            AssertionError: if hparams.tacotron_teacher_forcing_mode is neither 'constant' nor
              'scheduled', or if it is 'scheduled' in training and global_step is None.
        """
        # Argument validation: fail early, before any op is added to the graph.
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no mel targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')
        if not gta and self._hparams.predict_linear == True and linear_targets is None and is_training:
            raise ValueError(
                'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
            )
        if gta and linear_targets is not None:
            raise ValueError(
                'Linear spectrogram prediction is not supported in GTA mode!')
        if is_training and self._hparams.mask_decoder and targets_lengths is None:
            raise RuntimeError(
                'Model set to mask paddings but no targets lengths provided for the mask!'
            )
        if is_training and is_evaluating:
            raise RuntimeError(
                'Model can not be in training and evaluation modes at the same time!'
            )

        with tf.variable_scope('inference') as scope:
            # Dynamic batch size, read from the input tensor at run time
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams
            assert hp.tacotron_teacher_forcing_mode in ('constant',
                                                        'scheduled')
            if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                assert global_step is not None

            #GTA is only used for predicting mels to train Wavenet vocoder, so we omit post processing when doing GTA synthesis
            post_condition = hp.predict_linear and not gta

            # Embeddings ==> [batch_size, sequence_length, embedding_dim]
            embedding_table = tf.get_variable('inputs_embedding',
                                              [len(symbols), hp.embedding_dim],
                                              dtype=tf.float32)
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

            #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
            encoder_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training,
                                    hparams=hp,
                                    scope='encoder_convolutions'),
                EncoderRNN(is_training,
                           size=hp.encoder_lstm_units,
                           zoneout=hp.tacotron_zoneout_rate,
                           scope='encoder_LSTM'))

            encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

            #For shape visualization purpose
            enc_conv_output_shape = encoder_cell.conv_output_shape

            #Decoder Parts
            #Attention Decoder Prenet
            prenet = Prenet(is_training,
                            layers_sizes=hp.prenet_layers,
                            drop_rate=hp.tacotron_dropout_rate,
                            scope='decoder_prenet')
            #Attention Mechanism
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_dim,
                encoder_outputs,
                hparams=hp,
                mask_encoder=hp.mask_encoder,
                memory_sequence_length=input_lengths,
                smoothing=hp.smoothing,
                cumulate_weights=hp.cumulative_weights)
            #Decoder LSTM Cells
            decoder_lstm = DecoderRNN(is_training,
                                      layers=hp.decoder_layers,
                                      size=hp.decoder_lstm_units,
                                      zoneout=hp.tacotron_zoneout_rate,
                                      scope='decoder_lstm')
            #Frames Projection layer
            frame_projection = FrameProjection(hp.num_mels *
                                               hp.outputs_per_step,
                                               scope='linear_transform')
            #<stop_token> projection layer
            stop_projection = StopProjection(is_training or is_evaluating,
                                             shape=hp.outputs_per_step,
                                             scope='stop_token_projection')

            #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
            decoder_cell = TacotronDecoderCell(prenet, attention_mechanism,
                                               decoder_lstm, frame_projection,
                                               stop_projection)

            #Define the helper for our decoder
            #Training, evaluation and GTA all use the training helper with the targets; plain synthesis uses the test helper
            if is_training or is_evaluating or gta:
                self.helper = TacoTrainingHelper(batch_size, mel_targets,
                                                 stop_token_targets, hp, gta,
                                                 is_evaluating, global_step)
            else:
                self.helper = TacoTestHelper(batch_size, hp)

            #initial decoder state
            decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                         dtype=tf.float32)

            #Only use max iterations at synthesis time
            max_iters = hp.max_iters if not (is_training
                                             or is_evaluating) else None

            #Decode
            (frames_prediction, stop_token_prediction,
             _), final_decoder_state, _ = dynamic_decode(
                 CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                 impute_finished=False,
                 maximum_iterations=max_iters,
                 swap_memory=hp.tacotron_swap_with_cpu)

            # Reshape outputs to be one output per entry
            #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
            decoder_output = tf.reshape(frames_prediction,
                                        [batch_size, -1, hp.num_mels])
            stop_token_prediction = tf.reshape(stop_token_prediction,
                                               [batch_size, -1])

            #Postnet
            postnet = Postnet(is_training,
                              hparams=hp,
                              scope='postnet_convolutions')

            #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
            residual = postnet(decoder_output)

            #Project residual to same dimension as mel spectrogram
            #==> [batch_size, decoder_steps * r, num_mels]
            residual_projection = FrameProjection(hp.num_mels,
                                                  scope='postnet_projection')
            projected_residual = residual_projection(residual)

            #Compute the mel spectrogram
            mel_outputs = decoder_output + projected_residual

            #linear_outputs is only defined when post_condition holds; every later use is guarded by the same flag
            if post_condition:
                # Add post-processing CBHG:
                post_outputs = post_cbhg(mel_outputs, hp.num_mels,
                                         is_training)  # [N, T_out, 256]
                linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)
            #Grab alignments from the final decoder state
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(), [1, 2, 0])

            #Expose the helper's private _ratio (presumably the teacher forcing ratio -- confirm in TacoTrainingHelper)
            if is_training:
                self.ratio = self.helper._ratio
            #Publish inputs, predictions and targets on the model instance
            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_output = decoder_output
            self.alignments = alignments
            self.stop_token_prediction = stop_token_prediction
            self.stop_token_targets = stop_token_targets
            self.mel_outputs = mel_outputs
            if post_condition:
                self.linear_outputs = linear_outputs
                self.linear_targets = linear_targets
            self.mel_targets = mel_targets
            self.targets_lengths = targets_lengths
            #Construction summary; "Synthesis mode" is reported as neither training nor evaluating (gta is not considered)
            log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
            log('  Train mode:               {}'.format(is_training))
            log('  Eval mode:                {}'.format(is_evaluating))
            log('  GTA mode:                 {}'.format(gta))
            log('  Synthesis mode:           {}'.format(not (
                is_training or is_evaluating)))
            log('  embedding:                {}'.format(embedded_inputs.shape))
            log('  enc conv out:             {}'.format(enc_conv_output_shape))
            log('  encoder out:              {}'.format(encoder_outputs.shape))
            log('  decoder out:              {}'.format(decoder_output.shape))
            log('  residual out:             {}'.format(residual.shape))
            log('  projected residual out:   {}'.format(
                projected_residual.shape))
            log('  mel out:                  {}'.format(mel_outputs.shape))
            if post_condition:
                log('  linear out:               {}'.format(
                    linear_outputs.shape))
            log('  <stop_token> out:         {}'.format(
                stop_token_prediction.shape))

Example #2

Show file

File: tacotron_simpl.py Project: qingyundou/tacotron_qdou

    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   pml_targets=None,
                   is_training=False,
                   gta=False,
                   locked_alignments=None,
                   logs_enabled=True):
        """
        Builds the PML-feature Tacotron graph under the 'inference' variable scope.

        Sets the "inputs", "input_lengths", "pml_intermediates", "pml_outputs",
        "alignments" and "pml_targets" fields.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: accepted for interface compatibility; not read by this implementation.
          linear_targets: accepted for interface compatibility; not read by this implementation.
          pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is number of
            steps in the PML vocoder features trajectories, P is pml_dimension, and values are PML vocoder
            features. Only needed for training.
          is_training: boolean flag that is set to True during training
          gta: boolean flag that is set to True when ground truth alignment is required
          locked_alignments: when explicit attention alignment is required, the locked alignments are passed in this
            parameter and the attention alignments are locked to these values
          logs_enabled: boolean flag that defaults to True, if False no construction logs output
        """
        # Fix the alignments shape to (batch_size, encoder_steps, decoder_steps) by
        # prepending a batch dimension when it is missing.
        locked_alignments_ = locked_alignments
        if locked_alignments_ is not None and np.ndim(locked_alignments_) < 3:
            locked_alignments_ = np.expand_dims(locked_alignments_, 0)

        with tf.variable_scope('inference'):
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embedding_dim],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            # Encoder
            encoder_outputs = conv_and_gru(  # [N, T_in, 2*encoder_gru_units=512]
                embedded_inputs,
                input_lengths,
                conv_layers=hp.encoder_conv_layers,
                conv_width=hp.encoder_conv_width,
                conv_channels=hp.encoder_conv_channels,
                gru_units_unidirectional=hp.encoder_gru_units,
                is_training=is_training,
                scope='encoder',
            )

            # Attention
            attention_cell = LockableAttentionWrapper(
                GRUCell(hp.attention_depth),
                LocationSensitiveAttention(hp.attention_depth,
                                           encoder_outputs),
                alignment_history=True,
                locked_alignments=locked_alignments_,
                output_attention=False,
                name='attention_wrapper')  # [N, T_in, attention_depth=256]

            # Concatenate attention context vector and RNN cell output into a
            # 2*attention_depth=512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 2*attention_depth=512]

            # Decoder (layers specified bottom to top):
            cells = [
                GRUCell(hp.decoder_gru_units)
                for _ in range(hp.decoder_gru_layers)
            ]
            decoder_cell = MultiRNNCell(
                [concat_cell] + cells,
                state_is_tuple=True)  # [N, T_in, decoder_depth=1024]

            # Project onto r PML feature vectors (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.pml_dimension * hp.outputs_per_step)

            # Training and GTA feed the PML targets through a training helper
            # (optionally with scheduled sampling); otherwise the test helper is used.
            if is_training or gta:
                if hp.scheduled_sampling:
                    helper = TacoScheduledOutputTrainingHelper(
                        inputs, pml_targets, hp.pml_dimension,
                        hp.outputs_per_step, hp.scheduled_sampling_probability)
                else:
                    helper = TacoTrainingHelper(inputs, pml_targets,
                                                hp.pml_dimension,
                                                hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.pml_dimension,
                                        hp.outputs_per_step)

            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

            # Reshape outputs to be one output per entry
            pml_intermediates = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

            # Add Post-Processing Conv and GRU layer:
            expand_outputs = conv_and_gru(  # [N, T_in, 2*expand_gru_units=512]
                pml_intermediates,
                None,
                conv_layers=hp.expand_conv_layers,
                conv_width=hp.expand_conv_width,
                conv_channels=hp.expand_conv_channels,
                gru_units_unidirectional=hp.expand_gru_units,
                is_training=is_training,
                scope='expand',
            )

            pml_outputs = tf.layers.dense(expand_outputs,
                                          hp.pml_dimension)  # [N, T_out, P]

            # Grab alignments from the final decoder state.
            # The attention wrapper is the first cell of the MultiRNNCell, hence index 0.
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.pml_intermediates = pml_intermediates
            self.pml_outputs = pml_outputs
            self.alignments = alignments
            self.pml_targets = pml_targets

            if logs_enabled:
                log('Initialized Tacotron model. Dimensions: ')
                log('  Train mode:              {}'.format(is_training))
                # Report the gta flag itself (previously is_training was logged here).
                log('  GTA mode:                {}'.format(gta))
                log('  Embedding:               {}'.format(
                    embedded_inputs.shape[-1]))
                log('  Encoder out:             {}'.format(
                    encoder_outputs.shape[-1]))
                log('  Attention out:           {}'.format(
                    attention_cell.output_size))
                log('  Concat attn & out:       {}'.format(
                    concat_cell.output_size))
                log('  Decoder cell out:        {}'.format(
                    decoder_cell.output_size))
                log('  Decoder out ({} frames):  {}'.format(
                    hp.outputs_per_step, decoder_outputs.shape[-1]))
                log('  Decoder out (1 frame):   {}'.format(
                    pml_intermediates.shape[-1]))
                log('  Expand out:              {}'.format(
                    expand_outputs.shape[-1]))
                log('  PML out:                 {}'.format(
                    pml_outputs.shape[-1]))

Example #3

Show file

File: tacotron - Copy (2).py Project: JackCChen2017/CS5242_project

    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   linear_targets=None,
                   targets_lengths=None,
                   gta=False,
                   global_step=None,
                   is_training=False,
                   is_evaluating=False,
                   split_infos=None):
        """
		Initializes the model for inference
		sets "mel_outputs" and "alignments" fields.
		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
			  steps in the input time series, and values are character IDs
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
			of each sequence in inputs.
			- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
			of steps in the output time series, M is num_mels, and values are entries in the mel
			spectrogram. Only needed for training.
		"""
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no multi targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')
        if not gta and self._hparams.predict_linear == True and linear_targets is None and is_training:
            raise ValueError(
                'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
            )
        if gta and linear_targets is not None:
            raise ValueError(
                'Linear spectrogram prediction is not supported in GTA mode!')
        if is_training and self._hparams.mask_decoder and targets_lengths is None:
            raise RuntimeError(
                'Model set to mask paddings but no targets lengths provided for the mask!'
            )
        if is_training and is_evaluating:
            raise RuntimeError(
                'Model can not be in training and evaluation modes at the same time!'
            )

        #split_device = '/cpu:0' if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:{}'.format(self._hparams.tacotron_gpu_start_idx)
        split_device = '/cpu:0' if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:1'
        print("debug checkpoint gpus:", split_device)
        #with tf.device(split_device):
        with tf.device("/cpu:0"):  # debug, gpu:0 will use about 192MB memory
            hp = self._hparams
            lout_int = [tf.int32] * hp.tacotron_num_gpus
            lout_float = [tf.float32] * hp.tacotron_num_gpus

            tower_input_lengths = tf.split(
                input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0)
            tower_targets_lengths = tf.split(
                targets_lengths,
                num_or_size_splits=hp.tacotron_num_gpus,
                axis=0) if targets_lengths is not None else targets_lengths

            p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]],
                                  lout_int)
            p_mel_targets = tf.py_func(
                split_func, [mel_targets, split_infos[:, 1]],
                lout_float) if mel_targets is not None else mel_targets
            p_stop_token_targets = tf.py_func(
                split_func, [stop_token_targets, split_infos[:, 2]], lout_float
            ) if stop_token_targets is not None else stop_token_targets
            p_linear_targets = tf.py_func(
                split_func, [linear_targets, split_infos[:, 3]],
                lout_float) if linear_targets is not None else linear_targets

            tower_inputs = []
            tower_mel_targets = []
            tower_stop_token_targets = []
            tower_linear_targets = []

            batch_size = tf.shape(inputs)[0]
            mel_channels = hp.num_mels
            linear_channels = hp.num_freq
            for i in range(hp.tacotron_num_gpus):
                tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
                if p_mel_targets is not None:
                    tower_mel_targets.append(
                        tf.reshape(p_mel_targets[i],
                                   [batch_size, -1, mel_channels]))
                if p_stop_token_targets is not None:
                    tower_stop_token_targets.append(
                        tf.reshape(p_stop_token_targets[i], [batch_size, -1]))
                if p_linear_targets is not None:
                    tower_linear_targets.append(
                        tf.reshape(p_linear_targets[i],
                                   [batch_size, -1, linear_channels]))

        self.tower_decoder_output = []
        self.tower_alignments = []
        self.tower_stop_token_prediction = []
        self.tower_mel_outputs = []
        self.tower_linear_outputs = []

        tower_embedded_inputs = []
        tower_enc_conv_output_shape = []
        tower_encoder_outputs = []
        tower_residual = []
        tower_projected_residual = []

        # 1. Declare GPU Devices
        #gpus = ["/gpu:{}".format(i) for i in range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx+hp.tacotron_num_gpus)]
        gpus = ["/gpu:1"]
        print("debug checkpoint gpus:", gpus)
        for i in range(hp.tacotron_num_gpus):
            #	with tf.device(tf.train.replica_device_setter(ps_tasks=1,ps_device="/cpu:0",worker_device=gpus[i])):
            with tf.device(
                    tf.train.replica_device_setter(ps_tasks=1,
                                                   ps_device="/cpu:0",
                                                   worker_device="/cpu:0")
            ):  # debug using gpu:0 will cause OOM, use >1640MB
                with tf.variable_scope('inference') as scope:
                    assert hp.tacotron_teacher_forcing_mode in ('constant',
                                                                'scheduled')
                    if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                        assert global_step is not None

                    #GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis
                    post_condition = hp.predict_linear and not gta

                    # Embeddings ==> [batch_size, sequence_length, embedding_dim]
                    self.embedding_table = tf.get_variable(
                        'inputs_embedding', [len(symbols), hp.embedding_dim],
                        dtype=tf.float32)
                    embedded_inputs = tf.nn.embedding_lookup(
                        self.embedding_table, tower_inputs[i])

                    #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                    encoder_cell = TacotronEncoderCell(
                        EncoderConvolutions(is_training,
                                            hparams=hp,
                                            scope='encoder_convolutions'),
                        EncoderRNN(is_training,
                                   size=hp.encoder_lstm_units,
                                   zoneout=hp.tacotron_zoneout_rate,
                                   scope='encoder_LSTM'))

                    encoder_outputs = encoder_cell(embedded_inputs,
                                                   tower_input_lengths[i])

                    #For shape visualization purpose
                    enc_conv_output_shape = encoder_cell.conv_output_shape
                    print("debug enc_conv_output_shape:",
                          enc_conv_output_shape)

            with tf.device(
                    tf.train.replica_device_setter(ps_tasks=1,
                                                   ps_device="/cpu:0",
                                                   worker_device="/cpu:0")
            ):  # debug try to use as more gpu as possible
                with tf.variable_scope('inference') as scope:

                    #Decoder Parts
                    #Attention Decoder Prenet
                    prenet = Prenet(is_training,
                                    layers_sizes=hp.prenet_layers,
                                    drop_rate=hp.tacotron_dropout_rate,
                                    scope='decoder_prenet')
                    #Attention Mechanism
                    attention_mechanism = LocationSensitiveAttention(
                        hp.attention_dim,
                        encoder_outputs,
                        hparams=hp,
                        mask_encoder=hp.mask_encoder,
                        memory_sequence_length=tf.reshape(
                            tower_input_lengths[i], [-1]),
                        smoothing=hp.smoothing,
                        cumulate_weights=hp.cumulative_weights)
                    #Decoder LSTM Cells
                    decoder_lstm = DecoderRNN(is_training,
                                              layers=hp.decoder_layers,
                                              size=hp.decoder_lstm_units,
                                              zoneout=hp.tacotron_zoneout_rate,
                                              scope='decoder_LSTM')
                    #Frames Projection layer
                    frame_projection = FrameProjection(
                        hp.num_mels * hp.outputs_per_step,
                        scope='linear_transform_projection')
                    #<stop_token> projection layer
                    stop_projection = StopProjection(
                        is_training or is_evaluating,
                        shape=hp.outputs_per_step,
                        scope='stop_token_projection')

                    #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                    decoder_cell = TacotronDecoderCell(prenet,
                                                       attention_mechanism,
                                                       decoder_lstm,
                                                       frame_projection,
                                                       stop_projection)

                    #Define the helper for our decoder
                    if is_training or is_evaluating or gta:
                        self.helper = TacoTrainingHelper(
                            batch_size, tower_mel_targets[i], hp, gta,
                            is_evaluating, global_step)
                    else:
                        self.helper = TacoTestHelper(batch_size, hp)

                    #initial decoder state
                    decoder_init_state = decoder_cell.zero_state(
                        batch_size=batch_size, dtype=tf.float32)

                    #Only use max iterations at synthesis time
                    max_iters = hp.max_iters if not (
                        is_training or is_evaluating) else None

                    #Decode
                    (frames_prediction, stop_token_prediction,
                     _), final_decoder_state, _ = dynamic_decode(
                         CustomDecoder(decoder_cell, self.helper,
                                       decoder_init_state),
                         impute_finished=False,
                         maximum_iterations=max_iters,
                         swap_memory=hp.tacotron_swap_with_cpu)

                    # Reshape outputs to be one output per entry
                    #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                    decoder_output = tf.reshape(frames_prediction,
                                                [batch_size, -1, hp.num_mels])
                    stop_token_prediction = tf.reshape(stop_token_prediction,
                                                       [batch_size, -1])

            with tf.device(
                    tf.train.replica_device_setter(ps_tasks=1,
                                                   ps_device="/cpu:0",
                                                   worker_device="/gpu:0")
            ):  # debug try to use as more gpu as possible
                with tf.variable_scope('inference') as scope:

                    #Postnet
                    postnet = Postnet(is_training,
                                      hparams=hp,
                                      scope='postnet_convolutions')

                    #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
                    residual = postnet(decoder_output)

                    #Project residual to same dimension as mel spectrogram
                    #==> [batch_size, decoder_steps * r, num_mels]
                    residual_projection = FrameProjection(
                        hp.num_mels, scope='postnet_projection')
                    projected_residual = residual_projection(residual)

                    #Compute the mel spectrogram
                    mel_outputs = decoder_output + projected_residual

                    if post_condition:
                        # Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs.
                        post_cbhg = CBHG(hp.cbhg_kernels,
                                         hp.cbhg_conv_channels,
                                         hp.cbhg_pool_size,
                                         [hp.cbhg_projection, hp.num_mels],
                                         hp.cbhg_projection_kernel_size,
                                         hp.cbhg_highwaynet_layers,
                                         hp.cbhg_highway_units,
                                         hp.cbhg_rnn_units,
                                         is_training,
                                         name='CBHG_postnet')

                        #[batch_size, decoder_steps(mel_frames), cbhg_channels]
                        post_outputs = post_cbhg(mel_outputs, None)

                        #Linear projection of extracted features to make linear spectrogram
                        linear_specs_projection = FrameProjection(
                            hp.num_freq, scope='cbhg_linear_specs_projection')

                        #[batch_size, decoder_steps(linear_frames), num_freq]
                        linear_outputs = linear_specs_projection(post_outputs)

                    #Grab alignments from the final decoder state
                    alignments = tf.transpose(
                        final_decoder_state.alignment_history.stack(),
                        [1, 2, 0])

                    self.tower_decoder_output.append(decoder_output)
                    self.tower_alignments.append(alignments)
                    self.tower_stop_token_prediction.append(
                        stop_token_prediction)
                    self.tower_mel_outputs.append(mel_outputs)
                    tower_embedded_inputs.append(embedded_inputs)
                    tower_enc_conv_output_shape.append(enc_conv_output_shape)
                    tower_encoder_outputs.append(encoder_outputs)
                    tower_residual.append(residual)
                    tower_projected_residual.append(projected_residual)

                    if post_condition:
                        self.tower_linear_outputs.append(linear_outputs)
            log('initialisation done {}'.format(gpus[i]))

        if is_training:
            self.ratio = self.helper._ratio
        self.tower_inputs = tower_inputs
        self.tower_input_lengths = tower_input_lengths
        self.tower_mel_targets = tower_mel_targets
        self.tower_linear_targets = tower_linear_targets
        self.tower_targets_lengths = tower_targets_lengths
        self.tower_stop_token_targets = tower_stop_token_targets

        self.all_vars = tf.trainable_variables()

        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log('  Train mode:               {}'.format(is_training))
        log('  Eval mode:                {}'.format(is_evaluating))
        log('  GTA mode:                 {}'.format(gta))
        log('  Synthesis mode:           {}'.format(not (
            is_training or is_evaluating)))
        log('  Input:                    {}'.format(inputs.shape))
        for i in range(hp.tacotron_num_gpus + hp.tacotron_gpu_start_idx):
            log('  device:                   {}'.format(i))
            log('  embedding:                {}'.format(
                tower_embedded_inputs[i].shape))
            log('  enc conv out:             {}'.format(
                tower_enc_conv_output_shape[i]))
            log('  encoder out:              {}'.format(
                tower_encoder_outputs[i].shape))
            log('  decoder out:              {}'.format(
                self.tower_decoder_output[i].shape))
            log('  residual out:             {}'.format(
                tower_residual[i].shape))
            log('  projected residual out:   {}'.format(
                tower_projected_residual[i].shape))
            log('  mel out:                  {}'.format(
                self.tower_mel_outputs[i].shape))
            if post_condition:
                log('  linear out:               {}'.format(
                    self.tower_linear_outputs[i].shape))
            log('  <stop_token> out:         {}'.format(
                self.tower_stop_token_prediction[i].shape))

            #1_000_000 is causing syntax problems for some people?! Python please :)
            log('  Tacotron Parameters       {:.3f} Million.'.format(
                np.sum(
                    [np.prod(v.get_shape().as_list())
                     for v in self.all_vars]) / 1000000))

Example #4

Show file

File: tacotron_gmm.py Project: zzyuanhua/tacotronv2_wavernn_chinese

    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   linear_targets=None,
                   targets_lengths=None,
                   gta=False,
                   global_step=None,
                   is_training=False,
                   is_evaluating=False):
        """Build the GMM-attention Tacotron graph and publish its tensors on ``self``.

        The embedding, encoder, attention decoder, postnet and the CBHG +
        linear projection are all created inside the 'inference' variable
        scope. Outputs (decoder output, mel/linear outputs, stop token
        prediction, alignments) and the call arguments are stored as
        attributes, then a shape summary is written through ``log``.

        Args:
            inputs: tensor fed to the embedding lookup; its first dimension
                is taken as the batch size.
            input_lengths: forwarded to the encoder cell and to the attention
                mechanism.
            mel_targets: forwarded to TacoTrainingHelper when that helper is
                selected; otherwise only stored on ``self``.
            stop_token_targets: only stored on ``self`` by this method.
            linear_targets: only stored on ``self`` by this method.
            targets_lengths: only stored on ``self`` by this method.
            gta: accepted for interface compatibility, but the value passed
                in is overwritten with False before it is ever read.
            global_step: must not be None when the teacher forcing mode is
                'scheduled' and is_training is set (checked with assert).
            is_training: training-mode flag given to the sub-modules.
            is_evaluating: evaluation-mode flag; together with is_training it
                selects the decoder helper and disables the max_iters cap.
        """
        hparams = self._hparams
        batch_size = tf.shape(inputs)[0]
        # The caller-supplied flag is overridden: from here on gta is always False.
        gta = False

        # Allowed value range of the predicted spectrograms.
        if hparams.symmetric_mels:
            range_low = -hparams.max_abs_value
        else:
            range_low = 0
        range_high = hparams.max_abs_value

        def clip_if_enabled(tensor):
            # Clamp to [range_low - lower_bound_decay, range_high] when hparams ask for it.
            if not hparams.clip_outputs:
                return tensor
            floor = range_low - hparams.lower_bound_decay
            return tf.minimum(tf.maximum(tensor, floor), range_high)

        with tf.variable_scope('inference'):
            forcing_mode = hparams.tacotron_teacher_forcing_mode
            assert forcing_mode in ('constant', 'scheduled')
            if forcing_mode == 'scheduled' and is_training:
                assert global_step is not None

            # Embeddings ==> [batch_size, sequence_length, embedding_dim]
            self.embedding_table = tf.get_variable(
                'inputs_embedding',
                [len(symbols), hparams.embedding_dim],
                dtype=tf.float32)
            embedded_inputs = tf.nn.embedding_lookup(
                self.embedding_table, inputs)

            # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
            encoder_convolutions = EncoderConvolutions(
                is_training, hparams=hparams, scope='encoder_convolutions')
            encoder_rnn = EncoderRNN(is_training,
                                     size=hparams.encoder_lstm_units,
                                     zoneout=hparams.tacotron_zoneout_rate,
                                     scope='encoder_LSTM')
            encoder_cell = TacotronEncoderCell(encoder_convolutions,
                                               encoder_rnn)
            self.encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

            # Kept only so the shape can be reported in the summary below.
            self.enc_conv_output_shape = encoder_cell.conv_output_shape

            # ---- Decoder building blocks ----
            # Attention decoder prenet
            prenet = Prenet(is_training,
                            layers_sizes=hparams.prenet_layers,
                            drop_rate=hparams.tacotron_dropout_rate,
                            scope='decoder_prenet')
            # Attention mechanism
            attention_mechanism = GMMAttention(self.encoder_outputs,
                                               input_lengths, is_training)
            # Decoder LSTM cells
            decoder_lstm = DecoderRNN(is_training,
                                      layers=hparams.decoder_layers,
                                      size=hparams.decoder_lstm_units,
                                      zoneout=hparams.tacotron_zoneout_rate,
                                      scope='decoder_LSTM')
            # Frames projection layer
            frame_projection = FrameProjection(
                hparams.num_mels * hparams.outputs_per_step,
                scope='linear_transform_projection')
            # <stop_token> projection layer
            stop_projection = StopProjection(is_training or is_evaluating,
                                             shape=hparams.outputs_per_step,
                                             scope='stop_token_projection')

            # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
            decoder_cell = TacotronDecoderCell(prenet,
                                               attention_mechanism,
                                               decoder_lstm,
                                               frame_projection,
                                               stop_projection,
                                               num_attn_mixture=5)

            # Pick the helper that feeds the decoder at every step.
            if is_training or is_evaluating or gta:
                self.helper = TacoTrainingHelper(batch_size, mel_targets,
                                                 hparams, gta, is_evaluating,
                                                 global_step)
            else:
                self.helper = TacoTestHelper(batch_size, hparams)

            # Initial decoder state
            decoder_init_state = decoder_cell.zero_state(
                batch_size=batch_size, dtype=tf.float32)

            # The iteration cap only applies at synthesis time.
            if is_training or is_evaluating:
                max_iters = None
            else:
                max_iters = hparams.max_iters

            # Decode
            decoder = CustomDecoder(decoder_cell, self.helper,
                                    decoder_init_state)
            decoder_outputs, final_decoder_state, _ = dynamic_decode(
                decoder,
                impute_finished=False,
                maximum_iterations=max_iters,
                swap_memory=hparams.tacotron_swap_with_cpu)
            frames_prediction, stop_token_prediction, _ = decoder_outputs

            # Reshape outputs to be one output per entry
            # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
            decoder_output = tf.reshape(frames_prediction,
                                        [batch_size, -1, hparams.num_mels])
            self.stop_token_prediction = tf.reshape(stop_token_prediction,
                                                    [batch_size, -1])
            self.decoder_output = clip_if_enabled(decoder_output)

            # Postnet
            postnet = Postnet(is_training,
                              hparams=hparams,
                              scope='postnet_convolutions')

            # Residual from the post-net ==> [batch_size, decoder_steps * r, postnet_channels]
            postnet_residual = postnet(self.decoder_output)

            # Project residual to same dimension as mel spectrogram
            # ==> [batch_size, decoder_steps * r, num_mels]
            residual_projection = FrameProjection(hparams.num_mels,
                                                  scope='postnet_projection')
            self.projected_residual = residual_projection(postnet_residual)

            # Mel spectrogram = decoder output + projected residual
            mel_sum = self.decoder_output + self.projected_residual
            self.mel_outputs = clip_if_enabled(mel_sum)

            # Post-processing CBHG applied on the mels before the linear projection.
            post_cbhg = CBHG(hparams.cbhg_kernels,
                             hparams.cbhg_conv_channels,
                             hparams.cbhg_pool_size,
                             [hparams.cbhg_projection, hparams.num_mels],
                             hparams.cbhg_projection_kernel_size,
                             hparams.cbhg_highwaynet_layers,
                             hparams.cbhg_highway_units,
                             hparams.cbhg_rnn_units,
                             hparams.batch_norm_position,
                             is_training,
                             name='CBHG_postnet')

            # [batch_size, decoder_steps(mel_frames), cbhg_channels]
            self.post_outputs = post_cbhg(self.mel_outputs, None)

            # Linear projection of the extracted features to a linear spectrogram
            linear_specs_projection = FrameProjection(
                hparams.num_freq, scope='cbhg_linear_specs_projection')

            # [batch_size, decoder_steps(linear_frames), num_freq]
            linear_projected = linear_specs_projection(self.post_outputs)
            self.linear_outputs = clip_if_enabled(linear_projected)

            # Alignments come from the attention history of the final decoder state.
            alignment_stack = final_decoder_state.alignment_history.stack()
            self.alignments = tf.transpose(alignment_stack, [1, 2, 0])

            log('initialisation done.')

        if is_training:
            self.ratio = self.helper._ratio

        # Keep the call arguments around for later use by the owner of the model.
        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.targets_lengths = targets_lengths
        self.stop_token_targets = stop_token_targets
        self.gta = gta
        self.all_vars = tf.trainable_variables()
        self.is_training = is_training
        self.is_evaluating = is_evaluating

        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        synthesis_mode = not (is_training or is_evaluating)
        summary_rows = (
            ('  Train mode:               {}', is_training),
            ('  Eval mode:                {}', is_evaluating),
            ('  GTA mode:                 {}', gta),
            ('  Synthesis mode:           {}', synthesis_mode),
            ('  Input:                    {}', inputs.shape),
            ('  embedding:                {}', embedded_inputs.shape),
            ('  enc conv out:             {}', self.enc_conv_output_shape),
            ('  encoder out:              {}', self.encoder_outputs.shape),
            ('  decoder out:              {}', self.decoder_output.shape),
            ('  residual out:             {}', postnet_residual.shape),
            ('  projected residual out:   {}', self.projected_residual.shape),
            ('  mel out:                  {}', self.mel_outputs.shape),
            ('  linear out:               {}', self.linear_outputs.shape),
            ('  <stop_token> out:         {}',
             self.stop_token_prediction.shape),
        )
        for template, value in summary_rows:
            log(template.format(value))

        # Plain 1000000 literal: the 1_000_000 spelling is a syntax error on older Pythons.
        per_variable_sizes = [
            np.prod(variable.get_shape().as_list())
            for variable in self.all_vars
        ]
        millions_of_parameters = np.sum(per_variable_sizes) / 1000000
        log('  Tacotron Parameters       {:.3f} Million.'.format(
            millions_of_parameters))

Example #5

Show file

File: tacotron.py Project: agublazer/final_ia

    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None):
        """Build the Tacotron graph under the 'inference' scope and store its tensors.

        Training mode is not passed in: the graph is built in training mode
        exactly when ``mel_targets`` is not None.

        Args:
            inputs: tensor fed to the embedding lookup; its first dimension
                is taken as the batch size.
            input_lengths: forwarded to the encoder cell and used as the
                attention memory sequence length.
            mel_targets: forwarded to TacoTrainingHelper in training mode.
            stop_token_targets: forwarded to TacoTrainingHelper in training
                mode and stored on ``self``.

        Sets ``inputs``, ``input_lengths``, ``decoder_output``, ``alignments``,
        ``stop_token_prediction``, ``stop_token_targets``, ``mel_outputs`` and
        ``mel_targets`` on ``self`` (plus ``helper``).
        """
        with tf.variable_scope('inference'):
            training = mel_targets is not None
            batch_size = tf.shape(inputs)[0]
            hparams = self._hparams

            # Embeddings ==> [batch_size, sequence_length, embedding_dim]
            embedding_table = tf.get_variable(
                'inputs_embedding',
                [len(symbols), hparams.embedding_dim],
                dtype=tf.float32)
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

            # Encoder: convolution stack followed by the encoder LSTM.
            encoder_convolutions = EncoderConvolutions(
                training,
                kernel_size=(5, ),
                channels=512,
                scope='encoder_convolutions')
            encoder_rnn = EncoderRNN(training,
                                     size=hparams.encoder_lstm_units,
                                     zoneout=hparams.tacotron_zoneout_rate,
                                     scope='encoder_LSTM')
            encoder_cell = TacotronEncoderCell(encoder_convolutions,
                                               encoder_rnn)
            encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

            # ---- Decoder building blocks ----
            prenet = Prenet(training,
                            layer_sizes=[256, 256],
                            scope='decoder_prenet')
            # Attention mechanism
            attention_mechanism = LocationSensitiveAttention(
                hparams.attention_dim,
                encoder_outputs,
                mask_encoder=hparams.mask_encoder,
                memory_sequence_length=input_lengths,
                smoothing=hparams.smoothing,
                cumulate_weights=hparams.cumulative_weights)
            # Decoder LSTM cells
            decoder_lstm = DecoderRNN(training,
                                      layers=hparams.decoder_layers,
                                      size=hparams.decoder_lstm_units,
                                      zoneout=hparams.tacotron_zoneout_rate,
                                      scope='decoder_lstm')
            # Frames projection layer
            frame_projection = FrameProjection(
                hparams.num_mels * hparams.outputs_per_step,
                scope='linear_transform')
            # <stop_token> projection layer
            stop_projection = StopProjection(training,
                                             scope='stop_token_projection')

            decoder_cell = TacotronDecoderCell(
                prenet,
                attention_mechanism,
                decoder_lstm,
                frame_projection,
                stop_projection,
                mask_finished=hparams.mask_finished)

            # Training uses the target-fed helper, synthesis the test helper.
            if training:
                self.helper = TacoTrainingHelper(
                    batch_size, mel_targets, stop_token_targets,
                    hparams.num_mels, hparams.outputs_per_step,
                    hparams.tacotron_teacher_forcing_ratio)
            else:
                self.helper = TacoTestHelper(batch_size, hparams.num_mels,
                                             hparams.outputs_per_step)

            decoder_init_state = decoder_cell.zero_state(
                batch_size=batch_size, dtype=tf.float32)

            # Decoding is only capped by max_iters outside of training.
            if training:
                max_iters = None
            else:
                max_iters = hparams.max_iters

            decoder = CustomDecoder(decoder_cell, self.helper,
                                    decoder_init_state)
            decoder_outputs, final_decoder_state, _ = dynamic_decode(
                decoder,
                impute_finished=hparams.impute_finished,
                maximum_iterations=max_iters)
            frames_prediction, stop_token_prediction, _ = decoder_outputs

            # Reshape outputs to be one output per entry
            decoder_output = tf.reshape(frames_prediction,
                                        [batch_size, -1, hparams.num_mels])
            stop_token_prediction = tf.reshape(stop_token_prediction,
                                               [batch_size, -1])

            # Postnet
            postnet = Postnet(training,
                              kernel_size=hparams.postnet_kernel_size,
                              channels=hparams.postnet_channels,
                              scope='postnet_convolutions')

            # Residual from the post-net ==> [batch_size, decoder_steps * r, postnet_channels]
            postnet_residual = postnet(decoder_output)

            # Project residual to same dimension as mel spectrogram
            # ==> [batch_size, decoder_steps * r, num_mels]
            residual_projection = FrameProjection(hparams.num_mels,
                                                  scope='postnet_projection')
            projected_residual = residual_projection(postnet_residual)

            # Mel spectrogram = decoder output + projected residual
            mel_outputs = decoder_output + projected_residual

            # Alignments come from the attention history of the final decoder state.
            alignment_stack = final_decoder_state.alignment_history.stack()
            alignments = tf.transpose(alignment_stack, [1, 2, 0])

            # Publish graph inputs, predictions and targets on the model.
            self.inputs = inputs
            self.input_lengths = input_lengths
            self.mel_targets = mel_targets
            self.stop_token_targets = stop_token_targets
            self.decoder_output = decoder_output
            self.mel_outputs = mel_outputs
            self.stop_token_prediction = stop_token_prediction
            self.alignments = alignments

Example #6

Show file

File: tacotron_pml_x.py Project: qingyundou/tacotron_qdou

    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   pml_targets=None,
                   gta=False,
                   locked_alignments=None,
                   logs_enabled=True):
        '''Initializes the model for inference.

        Sets "pml_intermediates", "pml_outputs", and "alignments" fields
        (together with "inputs", "input_lengths" and "pml_targets").

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Accepted for interface compatibility; not referenced by this method.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Accepted for interface compatibility; not referenced by this method.
          pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is number of
            steps in the PML vocoder features trajectories, P is pml_dimension, and values are PML vocoder
            features. Only needed for training; the graph is built in training mode exactly when
            this argument is not None.
          gta: boolean flag that is set to True when ground truth alignment is required; selects
            the training helper even when pml_targets is None.
          locked_alignments: when explicit attention alignment is required, the locked alignments are
            passed in this parameter. NOTE: not referenced by this method.
          logs_enabled: boolean flag that defaults to True, if False no construction logs output
        '''
        with tf.variable_scope('inference'):
            is_training = pml_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            # Encoder
            prenet_outputs = prenet(
                embedded_inputs, is_training,
                hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
            encoder_outputs = encoder_cbhg(
                prenet_outputs,
                input_lengths,
                is_training,  # [N, T_in, encoder_depth=256]
                hp.encoder_depth)

            # Attention
            attention_cell = AttentionWrapper(
                GRUCell(hp.attention_depth),
                BahdanauAttention(hp.attention_depth, encoder_outputs),
                alignment_history=True,
                output_attention=False)  # [N, T_in, attention_depth=256]

            # Apply prenet before concatenation in AttentionWrapper.
            attention_cell = DecoderPrenetWrapper(attention_cell, is_training,
                                                  hp.prenet_depths)

            # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 2*attention_depth=512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell(
                [
                    OutputProjectionWrapper(concat_cell, hp.decoder_depth),
                    ResidualWrapper(GRUCell(hp.decoder_depth)),
                    ResidualWrapper(GRUCell(hp.decoder_depth))
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Project onto r PML feature vectors (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.pml_dimension * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            # Target-fed helper for training / GTA, free-running helper otherwise.
            if is_training or gta:
                helper = TacoTrainingHelper(inputs, pml_targets,
                                            hp.pml_dimension,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.pml_dimension,
                                        hp.outputs_per_step)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

            # Reshape outputs to be one output per entry
            pml_intermediates = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(
                pml_intermediates,
                hp.pml_dimension,
                is_training,  # [N, T_out, postnet_depth=256]
                hp.postnet_depth)
            pml_outputs = tf.layers.dense(post_outputs,
                                          hp.pml_dimension)  # [N, T_out, P]

            # Grab alignments from the final decoder state
            # (index 0 is the state of the first, attention-wrapping, cell of the MultiRNNCell):
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.pml_intermediates = pml_intermediates
            self.pml_outputs = pml_outputs
            self.alignments = alignments
            self.pml_targets = pml_targets

            # Honour the documented contract: stay silent when logs are disabled.
            if logs_enabled:
                log('Initialized Tacotron model. Dimensions: ')
                log('  embedding:               %d' %
                    embedded_inputs.shape[-1])
                log('  prenet out:              %d' %
                    prenet_outputs.shape[-1])
                log('  encoder out:             %d' %
                    encoder_outputs.shape[-1])
                log('  attention out:           %d' %
                    attention_cell.output_size)
                log('  concat attn & out:       %d' % concat_cell.output_size)
                log('  decoder cell out:        %d' %
                    decoder_cell.output_size)
                log('  decoder out (%d frames):  %d' %
                    (hp.outputs_per_step, decoder_outputs.shape[-1]))
                log('  decoder out (1 frame):   %d' %
                    pml_intermediates.shape[-1])
                log('  postnet out:             %d' % post_outputs.shape[-1])
                log('  pml out:                 %d' % pml_outputs.shape[-1])

Example #7

Show file

    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   split_infos=None,
                   linear_targets=None,
                   targets_lengths=None,
                   gta=False,
                   global_step=None,
                   is_training=False,
                   is_evaluating=False):
        """
        Initializes the model for inference

        sets "mel_outputs" and "alignments" fields.

        Args:
            - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
              steps in the input time series, and values are character IDs
            - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
            - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
        """
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no multi targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')
        if not gta and self._hparams.predict_linear == True and linear_targets is None and is_training:
            raise ValueError(
                'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
            )
        if gta and linear_targets is not None:
            raise ValueError(
                'Linear spectrogram prediction is not supported in GTA mode!')
        if is_training and self._hparams.mask_decoder and targets_lengths is None:
            raise RuntimeError(
                'Model set to mask paddings but no targets lengths provided for the mask!'
            )
        if is_training and is_evaluating:
            raise RuntimeError(
                'Model can not be in training and evaluation modes at the same time!'
            )

        with tf.device('/cpu:0'):
            hp = self._hparams
            lout_int = [tf.int32] * hp.num_gpus
            lout_float = [tf.float32] * hp.num_gpus

            tower_input_lengths = tf.split(input_lengths,
                                           num_or_size_splits=hp.num_gpus,
                                           axis=0)
            tower_targets_lengths = tf.split(
                targets_lengths, num_or_size_splits=hp.num_gpus,
                axis=0) if targets_lengths is not None else targets_lengths

            p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]],
                                  lout_int)
            p_mel_targets = tf.py_func(split_func,
                                       [mel_targets, split_infos[:, 1]],
                                       lout_float)
            p_stop_token_targets = tf.py_func(
                split_func, [stop_token_targets, split_infos[:, 2]], lout_float
            ) if stop_token_targets is not None else stop_token_targets
            p_linear_targets = tf.py_func(
                split_func, [linear_targets, split_infos[:, 3]],
                lout_float) if linear_targets is not None else linear_targets

            tower_inputs = []
            tower_mel_targets = []
            tower_stop_token_targets = []
            tower_linear_targets = []

            batch_size = tf.shape(inputs)[0]
            mel_channels = hp.num_mels
            linear_channels = hp.num_freq
            for i in range(hp.num_gpus):
                tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
                tower_mel_targets.append(
                    tf.reshape(p_mel_targets[i],
                               [batch_size, -1, mel_channels]))
                if p_stop_token_targets is not None:
                    tower_stop_token_targets.append(
                        tf.reshape(p_stop_token_targets[i], [batch_size, -1]))
                if p_linear_targets is not None:
                    tower_linear_targets.append(
                        tf.reshape(p_linear_targets[i],
                                   [batch_size, -1, linear_channels]))

        self.tower_decoder_output = []
        self.tower_alignments = []
        self.tower_stop_token_prediction = []
        self.tower_mel_outputs = []
        self.tower_linear_outputs = []
        self.tower_linear_targets = []

        tower_embedded_inputs = []
        tower_enc_conv_output_shape = []
        tower_encoder_outputs = []
        tower_residual = []
        tower_projected_residual = []

        # 1. Declare GPU Devices
        gpus = [
            "/gpu:{}".format(i)
            for i in range(hp.gpu_start_idx, hp.gpu_start_idx + hp.num_gpus)
        ]
        for i in range(hp.num_gpus):
            with tf.device(
                    tf.train.replica_device_setter(ps_tasks=1,
                                                   ps_device="/cpu:0",
                                                   worker_device=gpus[i])):
                with tf.variable_scope('inference') as scope:
                    assert hp.tacotron_teacher_forcing_mode in ('constant',
                                                                'scheduled')
                    if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                        assert global_step is not None

                    #GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis
                    post_condition = hp.predict_linear and not gta

                    # Embeddings ==> [batch_size, sequence_length, embedding_dim]
                    embedding_table = tf.get_variable(
                        'inputs_embedding', [len(symbols), hp.embedding_dim],
                        dtype=tf.float32)
                    embedded_inputs = tf.nn.embedding_lookup(
                        embedding_table, tower_inputs[i])

                    #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                    encoder_cell = TacotronEncoderCell(
                        EncoderConvolutions(is_training,
                                            hparams=hp,
                                            scope='encoder_convolutions'),
                        EncoderRNN(is_training,
                                   size=hp.encoder_lstm_units,
                                   zoneout=hp.tacotron_zoneout_rate,
                                   scope='encoder_LSTM'))

                    encoder_outputs = encoder_cell(embedded_inputs,
                                                   tower_input_lengths[i])

                    #For shape visualization purpose
                    enc_conv_output_shape = encoder_cell.conv_output_shape

                    #Decoder Parts
                    #Attention Decoder Prenet
                    prenet = Prenet(is_training,
                                    layers_sizes=hp.prenet_layers,
                                    drop_rate=hp.tacotron_dropout_rate,
                                    scope='decoder_prenet')
                    #Attention Mechanism
                    attention_mechanism = LocationSensitiveAttention(
                        hp.attention_dim,
                        encoder_outputs,
                        hparams=hp,
                        mask_encoder=hp.mask_encoder,
                        memory_sequence_length=tower_input_lengths[i],
                        smoothing=hp.smoothing,
                        cumulate_weights=hp.cumulative_weights)
                    #Decoder LSTM Cells
                    decoder_lstm = DecoderRNN(is_training,
                                              layers=hp.decoder_layers,
                                              size=hp.decoder_lstm_units,
                                              zoneout=hp.tacotron_zoneout_rate,
                                              scope='decoder_lstm')
                    #Frames Projection layer
                    frame_projection = FrameProjection(
                        hp.num_mels * hp.outputs_per_step,
                        scope='linear_transform')
                    #<stop_token> projection layer
                    stop_projection = StopProjection(
                        is_training or is_evaluating,
                        shape=hp.outputs_per_step,
                        scope='stop_token_projection')

                    #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                    decoder_cell = TacotronDecoderCell(prenet,
                                                       attention_mechanism,
                                                       decoder_lstm,
                                                       frame_projection,
                                                       stop_projection)

                    #Define the helper for our decoder
                    if is_training or is_evaluating or gta:
                        self.helper = TacoTrainingHelper(
                            batch_size, tower_mel_targets[i],
                            tower_stop_token_targets[i], hp, gta,
                            is_evaluating, global_step)
                    else:
                        self.helper = TacoTestHelper(batch_size, hp)

                    #initial decoder state
                    decoder_init_state = decoder_cell.zero_state(
                        batch_size=batch_size, dtype=tf.float32)

                    #Only use max iterations at synthesis time
                    max_iters = hp.max_iters if not (
                        is_training or is_evaluating) else None

                    #Decode
                    (frames_prediction, stop_token_prediction,
                     _), final_decoder_state, _ = dynamic_decode(
                         CustomDecoder(decoder_cell, self.helper,
                                       decoder_init_state),
                         impute_finished=False,
                         maximum_iterations=max_iters,
                         swap_memory=hp.tacotron_swap_with_cpu)

                    # Reshape outputs to be one output per entry
                    #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                    decoder_output = tf.reshape(frames_prediction,
                                                [batch_size, -1, hp.num_mels])
                    stop_token_prediction = tf.reshape(stop_token_prediction,
                                                       [batch_size, -1])

                    #Postnet
                    postnet = Postnet(is_training,
                                      hparams=hp,
                                      scope='postnet_convolutions')

                    #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
                    residual = postnet(decoder_output)

                    #Project residual to same dimension as mel spectrogram
                    #==> [batch_size, decoder_steps * r, num_mels]
                    residual_projection = FrameProjection(
                        hp.num_mels, scope='postnet_projection')
                    projected_residual = residual_projection(residual)

                    #Compute the mel spectrogram
                    mel_outputs = decoder_output + projected_residual

                    if post_condition:
                        #Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
                        #Post-processing Network to map mels to linear spectrograms using same architecture as the encoder
                        post_processing_cell = TacotronEncoderCell(
                            EncoderConvolutions(
                                is_training,
                                hparams=hp,
                                scope='post_processing_convolutions'),
                            EncoderRNN(is_training,
                                       size=hp.encoder_lstm_units,
                                       zoneout=hp.tacotron_zoneout_rate,
                                       scope='post_processing_LSTM'))

                        expand_outputs = post_processing_cell(mel_outputs)
                        linear_outputs = FrameProjection(
                            hp.num_freq,
                            scope='post_processing_projection')(expand_outputs)

                    #Grab alignments from the final decoder state
                    alignments = tf.transpose(
                        final_decoder_state.alignment_history.stack(),
                        [1, 2, 0])

                    self.tower_decoder_output.append(decoder_output)
                    self.tower_alignments.append(alignments)
                    self.tower_stop_token_prediction.append(
                        stop_token_prediction)
                    self.tower_mel_outputs.append(mel_outputs)
                    tower_embedded_inputs.append(embedded_inputs)
                    tower_enc_conv_output_shape.append(enc_conv_output_shape)
                    tower_encoder_outputs.append(encoder_outputs)
                    tower_residual.append(residual)
                    tower_projected_residual.append(projected_residual)

                    if post_condition:
                        self.tower_linear_outputs.append(linear_outputs)
                        self.tower_linear_targets.append(linear_targets)
                    log('initialiized done {}'.format(gpus[i]))

        if is_training:
            self.ratio = self.helper._ratio
        self.tower_inputs = tower_inputs
        self.tower_input_lengths = tower_input_lengths
        self.tower_mel_targets = tower_mel_targets
        self.tower_targets_lengths = tower_targets_lengths
        self.tower_stop_token_targets = tower_stop_token_targets

        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log('  Train mode:               {}'.format(is_training))
        log('  Eval mode:                {}'.format(is_evaluating))
        log('  GTA mode:                 {}'.format(gta))
        log('  Synthesis mode:           {}'.format(not (
            is_training or is_evaluating)))
        log('  Input:                    {}'.format(inputs.shape))
        for i in range(hp.num_gpus + hp.gpu_start_idx):
            log('  device:                   {}'.format(i))
            log('  embedding:                {}'.format(
                tower_embedded_inputs[i].shape))
            log('  enc conv out:             {}'.format(
                tower_enc_conv_output_shape[i]))
            log('  encoder out:              {}'.format(
                tower_encoder_outputs[i].shape))
            log('  decoder out:              {}'.format(
                self.tower_decoder_output[i].shape))
            log('  residual out:             {}'.format(
                tower_residual[i].shape))
            log('  projected residual out:   {}'.format(
                tower_projected_residual[i].shape))
            log('  mel out:                  {}'.format(
                self.tower_mel_outputs[i].shape))
            if post_condition:
                log('  linear out:               {}'.format(
                    self.towerlinear_outputs[i].shape))
            log('  <stop_token> out:         {}'.format(
                self.tower_stop_token_prediction[i].shape))

Example #8

Show file

    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   pml_targets=None,
                   is_training=False,
                   gta=False):
        """
        Builds the Tacotron graph that predicts PML vocoder features.

        Sets the "inputs", "input_lengths", "pml_outputs", "alignments" and
        "pml_targets" fields on the model.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Accepted for interface compatibility; not read by this method.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Accepted for interface compatibility; not read by this method.
          pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is number of
            steps in the PML vocoder features trajectories, P is pml_dimension, and values are PML vocoder
            features. Handed to the training helper when is_training or gta is set.
          is_training: boolean flag that is set to True during training
          gta: boolean flag that is set to True when ground truth alignment is required
        """
        with tf.variable_scope('inference'):
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embedding_dim],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            # Encoder
            encoder_outputs = conv_and_lstm(  # [N, T_in, 2*encoder_gru_units=512]
                embedded_inputs,
                input_lengths,
                conv_layers=hp.encoder_conv_layers,
                conv_width=hp.encoder_conv_width,
                conv_channels=hp.encoder_conv_channels,
                lstm_units_unidirectional=hp.encoder_gru_units,
                is_training=is_training,
                scope='encoder',
            )

            # Attention
            attention_cell = AttentionWrapper(  # [N, T_in, attention_depth=256]
                DecoderPrenetWrapper(LSTMBlockCell(hp.attention_depth),
                                     is_training, hp.prenet_depths),
                LocationSensitiveAttention(hp.attention_depth,
                                           encoder_outputs),
                alignment_history=True,
                output_attention=False)

            # Concatenate attention context vector and RNN cell output into a
            # 2*attention_depth=512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 2*attention_depth=512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell(
                [
                    concat_cell,
                    LSTMBlockCell(hp.decoder_gru_units),
                    LSTMBlockCell(hp.decoder_gru_units)
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=1024]

            # Project onto r PML feature vectors (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.pml_dimension * hp.outputs_per_step)

            # Training and GTA decoding both hand the PML targets to the
            # training helper; plain synthesis uses the test helper instead.
            if is_training or gta:
                helper = TacoTrainingHelper(inputs, pml_targets,
                                            hp.pml_dimension,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.pml_dimension,
                                        hp.outputs_per_step)

            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

            # Reshape outputs to be one output per entry
            pml_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

            # Grab alignments from the final decoder state. Index 0 is the
            # state of the first cell in the MultiRNNCell stack (concat_cell,
            # which wraps the attention cell).
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.pml_outputs = pml_outputs
            self.alignments = alignments
            self.pml_targets = pml_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  Train mode:              {}'.format(is_training))
            # Report the gta flag itself (this line used to print is_training).
            log('  GTA mode:                {}'.format(gta))
            log('  Embedding:               {}'.format(
                embedded_inputs.shape[-1]))
            log('  Encoder out:             {}'.format(
                encoder_outputs.shape[-1]))
            log('  Attention out:           {}'.format(
                attention_cell.output_size))
            log('  Concat attn & out:       {}'.format(
                concat_cell.output_size))
            log('  Decoder cell out:        {}'.format(
                decoder_cell.output_size))
            log('  Decoder out ({} frames):  {}'.format(
                hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  PML out:                 {}'.format(pml_outputs.shape[-1]))

Example #9

Show file

File: tacotron.py Project: gilmoore/VAE_Tacotron2

    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   mel_lengths=None,
                   stop_token_targets=None,
                   linear_targets=None,
                   gta=False,
                   reference_mel=None):
        """
        Builds the Tacotron graph (encoder, optional VAE style branch,
        attention decoder, postnet and optional linear post-processing net).

        Sets the "inputs", "input_lengths", "decoder_output", "alignments",
        "stop_token_prediction", "stop_token_targets", "mel_outputs",
        "reference_mel", "mel_targets" and "mel_lengths" fields, plus
        "mu"/"log_var" when hp.use_vae is set and "linear_outputs"/
        "linear_targets" when linear prediction is enabled outside GTA mode.

        Args:
            - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
              steps in the input time series, and values are character IDs
            - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
              of each sequence in inputs.
            - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
              of steps in the output time series, M is num_mels, and values are entries in the mel
              spectrogram. Only needed for training.
            - mel_lengths: passed to the VAE as its input_lengths (presumably the per-sequence
              lengths of the reference mel -- confirm against the VAE implementation).
            - stop_token_targets: stop token targets handed to the training helper; must be
              given together with mel_targets unless gta is set.
            - linear_targets: linear spectrogram targets; required when hp.predict_linear is set
              and gta is not, and rejected in GTA mode.
            - gta: boolean flag that is set to True for ground truth aligned synthesis.
            - reference_mel: mel spectrogram fed to the VAE; replaced by mel_targets when the
              model is in training mode.

        Raises:
            ValueError: if the combination of targets and flags is inconsistent
            (see the checks at the top of the method).
        """
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no mel targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')
        # Same truthiness test as post_condition below, so the validation and
        # the branch that actually builds the linear network cannot disagree.
        if not gta and self._hparams.predict_linear and linear_targets is None:
            raise ValueError(
                'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
            )
        if gta and linear_targets is not None:
            raise ValueError(
                'Linear spectrogram prediction is not supported in GTA mode!')

        with tf.variable_scope('inference'):
            # Training mode is inferred from the arguments: mel targets are
            # present and this is not a GTA synthesis pass.
            is_training = mel_targets is not None and not gta
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams
            #GTA is only used for predicting mels to train Wavenet vocoder, so we omit post processing when doing GTA synthesis
            post_condition = hp.predict_linear and not gta

            # Embeddings ==> [batch_size, sequence_length, embedding_dim]
            embedding_table = tf.get_variable('inputs_embedding',
                                              [len(symbols), hp.embedding_dim],
                                              dtype=tf.float32)
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

            #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
            encoder_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training,
                                    kernel_size=hp.enc_conv_kernel_size,
                                    channels=hp.enc_conv_channels,
                                    scope='encoder_convolutions'),
                EncoderRNN(is_training,
                           size=hp.encoder_lstm_units,
                           zoneout=hp.tacotron_zoneout_rate,
                           scope='encoder_LSTM'))

            encoder_outputs = encoder_cell(embedded_inputs, input_lengths)
            if hp.use_vae:
                # During training the style reference is the target itself.
                if is_training:
                    reference_mel = mel_targets

                style_embeddings, mu, log_var = VAE(inputs=reference_mel,
                                                    input_lengths=mel_lengths,
                                                    filters=hp.filters,
                                                    kernel_size=(3, 3),
                                                    strides=(2, 2),
                                                    num_units=hp.vae_dim,
                                                    is_training=is_training,
                                                    scope='vae')

                self.mu = mu
                self.log_var = log_var
                # Project the style embedding to hp.encoder_depth, then repeat
                # it along the encoder time axis so it can be added to every
                # encoder step.
                style_embeddings = tf.layers.dense(style_embeddings,
                                                   hp.encoder_depth)
                style_embeddings = tf.expand_dims(style_embeddings, axis=1)
                style_embeddings = tf.tile(
                    style_embeddings,
                    [1, shape_list(encoder_outputs)[1], 1])  # [N, T_in, 256]
                encoder_outputs = encoder_outputs + style_embeddings

            #For shape visualization purpose
            enc_conv_output_shape = encoder_cell.conv_output_shape

            #Decoder Parts
            #Attention Decoder Prenet
            prenet = Prenet(is_training,
                            layer_sizes=hp.prenet_layers,
                            scope='decoder_prenet')
            #Attention Mechanism
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_dim,
                encoder_outputs,
                mask_encoder=hp.mask_encoder,
                memory_sequence_length=input_lengths,
                smoothing=hp.smoothing,
                cumulate_weights=hp.cumulative_weights)
            #Decoder LSTM Cells
            decoder_lstm = DecoderRNN(is_training,
                                      layers=hp.decoder_layers,
                                      size=hp.decoder_lstm_units,
                                      zoneout=hp.tacotron_zoneout_rate,
                                      scope='decoder_lstm')
            #Frames Projection layer
            frame_projection = FrameProjection(hp.num_mels *
                                               hp.outputs_per_step,
                                               scope='linear_transform')
            #<stop_token> projection layer
            stop_projection = StopProjection(is_training,
                                             scope='stop_token_projection')

            #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
            decoder_cell = TacotronDecoderCell(prenet,
                                               attention_mechanism,
                                               decoder_lstm,
                                               frame_projection,
                                               stop_projection,
                                               mask_finished=hp.mask_finished)

            #Define the helper for our decoder
            if is_training or gta:
                self.helper = TacoTrainingHelper(
                    batch_size, mel_targets, stop_token_targets, hp.num_mels,
                    hp.outputs_per_step, hp.tacotron_teacher_forcing_ratio,
                    gta)
            else:
                self.helper = TacoTestHelper(batch_size, hp.num_mels,
                                             hp.outputs_per_step)

            #initial decoder state
            decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                         dtype=tf.float32)

            #Only use max iterations at synthesis time
            max_iters = hp.max_iters if not is_training else None

            #Decode
            (frames_prediction, stop_token_prediction,
             _), final_decoder_state, _ = dynamic_decode(
                 CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                 impute_finished=hp.impute_finished,
                 maximum_iterations=max_iters)

            # Reshape outputs to be one output per entry
            #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
            decoder_output = tf.reshape(frames_prediction,
                                        [batch_size, -1, hp.num_mels])
            stop_token_prediction = tf.reshape(stop_token_prediction,
                                               [batch_size, -1])

            #Postnet
            postnet = Postnet(is_training,
                              kernel_size=hp.postnet_kernel_size,
                              channels=hp.postnet_channels,
                              scope='postnet_convolutions')

            #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
            residual = postnet(decoder_output)

            #Project residual to same dimension as mel spectrogram
            #==> [batch_size, decoder_steps * r, num_mels]
            residual_projection = FrameProjection(hp.num_mels,
                                                  scope='postnet_projection')
            projected_residual = residual_projection(residual)

            #Compute the mel spectrogram
            mel_outputs = decoder_output + projected_residual

            if post_condition:
                #Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
                #Post-processing Network to map mels to linear spectrograms using same architecture as the encoder
                post_processing_cell = TacotronEncoderCell(
                    EncoderConvolutions(is_training,
                                        kernel_size=hp.enc_conv_kernel_size,
                                        channels=hp.enc_conv_channels,
                                        scope='post_processing_convolutions'),
                    EncoderRNN(is_training,
                               size=hp.encoder_lstm_units,
                               zoneout=hp.tacotron_zoneout_rate,
                               scope='post_processing_LSTM'))

                expand_outputs = post_processing_cell(mel_outputs)
                linear_outputs = FrameProjection(
                    hp.num_freq,
                    scope='post_processing_projection')(expand_outputs)

            #Grab alignments from the final decoder state
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_output = decoder_output
            self.alignments = alignments
            self.stop_token_prediction = stop_token_prediction
            self.stop_token_targets = stop_token_targets
            self.mel_outputs = mel_outputs
            self.reference_mel = reference_mel
            if post_condition:
                self.linear_outputs = linear_outputs
                self.linear_targets = linear_targets
            self.mel_targets = mel_targets
            self.mel_lengths = mel_lengths
            log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
            log('  embedding:                {}'.format(embedded_inputs.shape))
            log('  enc conv out:             {}'.format(enc_conv_output_shape))
            log('  encoder out:              {}'.format(encoder_outputs.shape))
            log('  decoder out:              {}'.format(decoder_output.shape))
            log('  residual out:             {}'.format(residual.shape))
            log('  projected residual out:   {}'.format(
                projected_residual.shape))
            log('  mel out:                  {}'.format(mel_outputs.shape))
            if post_condition:
                log('  linear out:               {}'.format(
                    linear_outputs.shape))
            log('  <stop_token> out:         {}'.format(
                stop_token_prediction.shape))

Example #10

Show file

File: tacotron_online.py Project: qingyundou/tacotron_qdou

    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   pml_targets=None,
                   is_training=False,
                   gta=False,
                   eal=False,
                   locked_alignments=None,
                   logs_enabled=True,
                   flag_trainAlign=False,
                   flag_trainJoint=False,
                   alignScale=1.0,
                   flag_online_eal_eval=False):
        '''Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
          pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is number of
            steps in the PML vocoder features trajectories, P is pml_dimension, and values are PML vocoder
            features. Only needed for training.
          is_training: boolean flag that is set to True during training
          gta: boolean flag that is set to True when ground truth alignment is required
          locked_alignments: when explicit attention alignment is required, the locked alignments are passed in this
            parameter and the attention alignments are locked to these values
          logs_enabled: boolean flag that defaults to True, if False no construction logs output
        '''
        # fix the alignments shape to (batch_size, encoder_steps, decoder_steps) if not already including
        # batch dimension
        locked_alignments_ = locked_alignments
        self.flag_trainAlign = flag_trainAlign
        self.flag_trainJoint = flag_trainJoint
        self.alignScale = alignScale
        self.flag_online_eal = (
            eal and (locked_alignments is None)) or flag_online_eal_eval

        if locked_alignments_ is not None:
            if is_training and eal:
                pass
            elif np.ndim(locked_alignments_) < 3:
                locked_alignments_ = np.expand_dims(locked_alignments_, 0)

        with tf.variable_scope('inference') as scope:
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            # Encoder
            prenet_outputs = prenet(
                embedded_inputs, is_training,
                hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
            encoder_outputs = encoder_cbhg(
                prenet_outputs,
                input_lengths,
                is_training,  # [N, T_in, encoder_depth=256]
                hp.encoder_depth)

            # Attention
            attention_cell = LockableAttentionWrapper(
                GRUCell(hp.attention_depth),
                LocationSensitiveAttention(hp.attention_depth,
                                           encoder_outputs),
                alignment_history=True,
                locked_alignments=locked_alignments_,
                output_attention=False,
                name='attention_wrapper',
                flag_trainAlign=self.flag_trainAlign,
                flag_trainJoint=self.flag_trainJoint
            )  # [N, T_in, attention_depth=256]

            # Apply prenet before concatenation in AttentionWrapper.
            prenet_cell = DecoderPrenetWrapper(attention_cell, is_training,
                                               hp.prenet_depths)

            # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                prenet_cell)  # [N, T_in, 2*attention_depth=512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell(
                [
                    OutputProjectionWrapper(concat_cell, hp.decoder_depth),
                    ResidualWrapper(GRUCell(hp.decoder_depth)),
                    ResidualWrapper(GRUCell(hp.decoder_depth))
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Project onto r PML feature vectors (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.pml_dimension * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if gta:
                helper = TacoTrainingHelper(inputs, pml_targets,
                                            hp.pml_dimension,
                                            hp.outputs_per_step)
            elif eal:
                if self.flag_online_eal:
                    helper_gta = TacoTrainingHelper(inputs, pml_targets,
                                                    hp.pml_dimension,
                                                    hp.outputs_per_step)
                    helper_eal = TacoTrainingHelper_EAL(
                        inputs, pml_targets, hp.pml_dimension,
                        hp.outputs_per_step)
                else:
                    helper = TacoTrainingHelper_EAL(inputs, pml_targets,
                                                    hp.pml_dimension,
                                                    hp.outputs_per_step)
            elif hp.scheduled_sampling:
                helper = TacoScheduledOutputTrainingHelper(
                    inputs, pml_targets, hp.pml_dimension, hp.outputs_per_step,
                    hp.scheduled_sampling_probability)
            else:
                if is_training:
                    log('For training, one of these should be true: gta, eal, hp.scheduled_sampling'
                        )
                else:
                    helper = TacoTestHelper(batch_size, hp.pml_dimension,
                                            hp.outputs_per_step)
                    if flag_online_eal_eval:
                        helper_gta = helper
                        helper_eal = helper

            if not self.flag_online_eal:
                (decoder_outputs, _
                 ), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                     BasicDecoder(output_cell, helper, decoder_init_state),
                     maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

                # Reshape outputs to be one output per entry
                pml_intermediates = tf.reshape(
                    decoder_outputs,
                    [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

                # Add post-processing CBHG:
                post_outputs = post_cbhg(
                    pml_intermediates,
                    hp.pml_dimension,
                    is_training,  # [N, T_out, postnet_depth=256]
                    hp.postnet_depth)
                pml_outputs = tf.layers.dense(
                    post_outputs, hp.pml_dimension)  # [N, T_out, P]

                # Grab alignments from the final decoder state:
                alignments = tf.transpose(
                    final_decoder_state[0].alignment_history.stack(),
                    [1, 2, 0])

            else:
                (decoder_outputs, _
                 ), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                     BasicDecoder(output_cell, helper_gta, decoder_init_state),
                     maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

                # Reshape outputs to be one output per entry
                pml_intermediates = tf.reshape(
                    decoder_outputs,
                    [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

                # Add post-processing CBHG:
                post_outputs = post_cbhg(
                    pml_intermediates,
                    hp.pml_dimension,
                    is_training,  # [N, T_out, postnet_depth=256]
                    hp.postnet_depth)
                pml_outputs = tf.layers.dense(
                    post_outputs, hp.pml_dimension)  # [N, T_out, P]

                # Grab alignments from the final decoder state:
                locked_alignments_ = tf.transpose(
                    final_decoder_state[0].alignment_history.stack(),
                    [1, 2, 0])

        with tf.variable_scope('inference_eal') as scope:
            if self.flag_online_eal:
                # Embeddings
                embedding_table_eal = tf.get_variable(
                    'embedding', [len(symbols), hp.embed_depth],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
                embedded_inputs_eal = tf.nn.embedding_lookup(
                    embedding_table_eal, inputs)  # [N, T_in, embed_depth=256]

                # Encoder
                prenet_outputs_eal = prenet(
                    embedded_inputs_eal, is_training,
                    hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
                encoder_outputs_eal = encoder_cbhg(
                    prenet_outputs_eal,
                    input_lengths,
                    is_training,  # [N, T_in, encoder_depth=256]
                    hp.encoder_depth)

                #                 import pdb; pdb.set_trace()
                #                 tf.get_variable_scope().reuse_variables()
                # Attention
                #                 tmp = None if flag_online_eal_eval else locked_alignments_
                if flag_online_eal_eval: locked_alignments_ = None

                attention_cell_eal = LockableAttentionWrapper(
                    GRUCell(hp.attention_depth),
                    LocationSensitiveAttention(hp.attention_depth,
                                               encoder_outputs_eal),
                    alignment_history=True,
                    locked_alignments=locked_alignments_,
                    output_attention=False,
                    name='attention_wrapper',
                    flag_trainAlign=self.flag_trainAlign,
                    flag_trainJoint=self.flag_trainJoint
                )  # [N, T_in, attention_depth=256]

                # Apply prenet before concatenation in AttentionWrapper.
                prenet_cell_eal = DecoderPrenetWrapper(attention_cell_eal,
                                                       is_training,
                                                       hp.prenet_depths)

                # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
                concat_cell_eal = ConcatOutputAndAttentionWrapper(
                    prenet_cell_eal)  # [N, T_in, 2*attention_depth=512]

                # Decoder (layers specified bottom to top):
                decoder_cell_eal = MultiRNNCell(
                    [
                        OutputProjectionWrapper(concat_cell_eal,
                                                hp.decoder_depth),
                        ResidualWrapper(GRUCell(hp.decoder_depth)),
                        ResidualWrapper(GRUCell(hp.decoder_depth))
                    ],
                    state_is_tuple=True)  # [N, T_in, decoder_depth=256]

                # Project onto r PML feature vectors (predict r outputs at each RNN step):
                output_cell_eal = OutputProjectionWrapper(
                    decoder_cell_eal, hp.pml_dimension * hp.outputs_per_step)
                decoder_init_state_eal = output_cell.zero_state(
                    batch_size=batch_size, dtype=tf.float32)

                (
                    decoder_outputs_eal, _
                ), final_decoder_state_eal, _ = tf.contrib.seq2seq.dynamic_decode(
                    BasicDecoder(output_cell_eal, helper_eal,
                                 decoder_init_state_eal),
                    maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

                # Reshape outputs to be one output per entry
                pml_intermediates_eal = tf.reshape(
                    decoder_outputs_eal,
                    [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

                # Add post-processing CBHG:
                post_outputs_eal = post_cbhg(
                    pml_intermediates_eal,
                    hp.pml_dimension,
                    is_training,  # [N, T_out, postnet_depth=256]
                    hp.postnet_depth)
                pml_outputs_eal = tf.layers.dense(
                    post_outputs_eal, hp.pml_dimension)  # [N, T_out, P]

                # Grab alignments from the final decoder state:
                alignments = tf.transpose(
                    final_decoder_state_eal[0].alignment_history.stack(),
                    [1, 2, 0])

                self.pml_intermediates_eal = pml_intermediates_eal
                self.pml_outputs_eal = pml_outputs_eal

        with tf.variable_scope('inference') as scope:
            self.inputs = inputs
            self.input_lengths = input_lengths
            self.pml_intermediates = pml_intermediates
            self.pml_outputs = pml_outputs
            self.alignments = alignments
            self.pml_targets = pml_targets
            self.attention_cell = attention_cell
            self.locked_alignments = locked_alignments_

            if logs_enabled:
                log('Initialized Tacotron model. Dimensions: ')
                log('  Train mode:              {}'.format(is_training))
                log('  GTA mode:                {}'.format(gta))
                log('  EAL mode:                {}'.format(eal))
                log('  Embedding:               {}'.format(
                    embedded_inputs.shape[-1]))
                log('  Prenet out:              {}'.format(
                    prenet_outputs.shape[-1]))
                log('  Encoder out:             {}'.format(
                    encoder_outputs.shape[-1]))
                log('  Attention out:           {}'.format(
                    attention_cell.output_size))
                log('  Concat attn & out:       {}'.format(
                    concat_cell.output_size))
                log('  Decoder cell out:        {}'.format(
                    decoder_cell.output_size))
                log('  Decoder out ({} frames):  {}'.format(
                    hp.outputs_per_step, decoder_outputs.shape[-1]))
                log('  Decoder out (1 frame):   {}'.format(
                    pml_intermediates.shape[-1]))
                log('  Postnet out:             {}'.format(
                    post_outputs.shape[-1]))
                log('  PML out:                 {}'.format(
                    pml_outputs.shape[-1]))

Example #11

Show file

File: tacotron.py Project: jinguang-dong/Tacotron-2

    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   gta=False):
        """Build the Tacotron graph under the 'inference' variable scope.

        Sets the "inputs", "input_lengths", "decoder_output", "alignments",
        "stop_token_prediction", "stop_token_targets", "mel_outputs",
        "mel_targets" and "helper" fields on the instance.

        The graph is built in training mode when mel targets are given and
        `gta` is falsy. When `gta` is truthy the layers are built in
        non-training mode but decoding is still driven by the training helper.

        Args:
            inputs: int32 Tensor with shape [N, T_in] where N is batch size,
                T_in is number of steps in the input time series, and values
                are character IDs.
            input_lengths: int32 Tensor with shape [N] where N is batch size
                and values are the lengths of each sequence in inputs.
            mel_targets: float32 Tensor with shape [N, T_out, M] where N is
                batch size, T_out is number of steps in the output time
                series, M is num_mels, and values are entries in the mel
                spectrogram. Needed for training and for GTA decoding.
            stop_token_targets: stop token targets matching `mel_targets`;
                required whenever `mel_targets` is given and `gta` is falsy.
                (Presumably shaped [N, T_out] -- confirm against the helper.)
            gta: flag forwarded to TacoTrainingHelper; when truthy the
                training helper is used with `is_training` set to False.

        Raises:
            ValueError: if stop token targets are given without mel targets,
                if mel targets are given without stop token targets outside
                GTA mode, or if GTA mode is requested without mel targets.
        """
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no mel targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')
        # The training helper is selected whenever gta is set, and it cannot
        # work without targets: fail early with a clear message.
        if gta and mel_targets is None:
            raise ValueError(
                'GTA mode requires mel targets but none were provided')

        with tf.variable_scope('inference'):
            is_training = mel_targets is not None and not gta
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings ==> [batch_size, sequence_length, embedding_dim]
            embedding_table = tf.get_variable('inputs_embedding',
                                              [len(symbols), hp.embedding_dim],
                                              dtype=tf.float32)
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

            # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
            encoder_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training,
                                    kernel_size=hp.enc_conv_kernel_size,
                                    channels=hp.enc_conv_channels,
                                    scope='encoder_convolutions'),
                EncoderRNN(is_training,
                           size=hp.encoder_lstm_units,
                           zoneout=hp.tacotron_zoneout_rate,
                           scope='encoder_LSTM'))

            encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

            # Kept only so the shape can be logged below.
            enc_conv_output_shape = encoder_cell.conv_output_shape

            # Decoder parts
            # Attention decoder prenet
            prenet = Prenet(is_training,
                            layer_sizes=hp.prenet_layers,
                            scope='decoder_prenet')
            # Attention mechanism
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_dim,
                encoder_outputs,
                mask_encoder=hp.mask_encoder,
                memory_sequence_length=input_lengths,
                smoothing=hp.smoothing)
            # Decoder LSTM cells
            decoder_lstm = DecoderRNN(is_training,
                                      layers=hp.decoder_layers,
                                      size=hp.decoder_lstm_units,
                                      zoneout=hp.tacotron_zoneout_rate,
                                      scope='decoder_lstm')
            # Frames projection layer
            frame_projection = FrameProjection(hp.num_mels *
                                               hp.outputs_per_step,
                                               scope='linear_transform')
            # <stop_token> projection layer
            stop_projection = StopProjection(is_training,
                                             scope='stop_token_projection')

            # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
            decoder_cell = TacotronDecoderCell(prenet,
                                               attention_mechanism,
                                               decoder_lstm,
                                               frame_projection,
                                               stop_projection,
                                               mask_finished=hp.mask_finished)

            # Define the helper for our decoder. Truthiness is used here (not
            # `== True`) so the choice agrees with how is_training treats gta.
            if is_training or gta:
                self.helper = TacoTrainingHelper(
                    batch_size, mel_targets, stop_token_targets, hp.num_mels,
                    hp.outputs_per_step, hp.tacotron_teacher_forcing_ratio,
                    gta)
            else:
                self.helper = TacoTestHelper(batch_size, hp.num_mels,
                                             hp.outputs_per_step)

            # Initial decoder state
            decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                         dtype=tf.float32)

            # No iteration cap while training; hp.max_iters applies otherwise
            # (which includes GTA decoding, since is_training is False there).
            max_iters = hp.max_iters if not is_training else None

            # Decode
            (frames_prediction, stop_token_prediction,
             _), final_decoder_state, _ = dynamic_decode(
                 CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                 impute_finished=hp.impute_finished,
                 maximum_iterations=max_iters)

            # Reshape outputs to be one output per entry
            # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
            decoder_output = tf.reshape(frames_prediction,
                                        [batch_size, -1, hp.num_mels])
            stop_token_prediction = tf.reshape(stop_token_prediction,
                                               [batch_size, -1])

            # Postnet
            postnet = Postnet(is_training,
                              kernel_size=hp.postnet_kernel_size,
                              channels=hp.postnet_channels,
                              scope='postnet_convolutions')

            # Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
            residual = postnet(decoder_output)

            # Project residual to same dimension as mel spectrogram
            # ==> [batch_size, decoder_steps * r, num_mels]
            residual_projection = FrameProjection(hp.num_mels,
                                                  scope='postnet_projection')
            projected_residual = residual_projection(residual)

            # Compute the mel spectrogram
            mel_outputs = decoder_output + projected_residual

            # Grab alignments from the final decoder state
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_output = decoder_output
            self.alignments = alignments
            self.stop_token_prediction = stop_token_prediction
            self.stop_token_targets = stop_token_targets
            self.mel_outputs = mel_outputs
            self.mel_targets = mel_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:                {}'.format(embedded_inputs.shape))
            log('  enc conv out:             {}'.format(enc_conv_output_shape))
            log('  encoder out:              {}'.format(encoder_outputs.shape))
            log('  decoder out:              {}'.format(decoder_output.shape))
            log('  residual out:             {}'.format(residual.shape))
            log('  projected residual out:   {}'.format(
                projected_residual.shape))
            log('  mel out:                  {}'.format(mel_outputs.shape))
            log('  <stop_token> out:         {}'.format(
                stop_token_prediction.shape))

Example #12

Show file

File: tacotron.py Project: GengWu-JLU/AIBigdata

    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   targets_lengths=None,
                   global_step=None,
                   is_training=False,
                   split_infos=None):
        """
		Initializes the model for inference
		sets "mel_outputs" and "alignments" fields.
		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
			  steps in the input time series, and values are character IDs
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
			of each sequence in inputs.
			- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
			of steps in the output time series, M is num_mels, and values are entries in the mel
			spectrogram. Only needed for training.
		"""
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no multi targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')
        if is_training and self._hparams.mask_decoder and targets_lengths is None:
            raise RuntimeError(
                'Model set to mask paddings but no targets lengths provided for the mask!'
            )

        split_device = '/cpu:0' if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:0'
        with tf.device(split_device):
            hp = self._hparams
            lout_int = [tf.int32] * hp.tacotron_num_gpus
            lout_float = [tf.float32] * hp.tacotron_num_gpus

            tower_input_lengths = tf.split(
                input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0)
            tower_targets_lengths = tf.split(
                targets_lengths,
                num_or_size_splits=hp.tacotron_num_gpus,
                axis=0) if targets_lengths is not None else targets_lengths

            p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]],
                                  lout_int)
            p_mel_targets = tf.py_func(
                split_func, [mel_targets, split_infos[:, 1]],
                lout_float) if mel_targets is not None else mel_targets
            p_stop_token_targets = tf.py_func(
                split_func, [stop_token_targets, split_infos[:, 2]], lout_float
            ) if stop_token_targets is not None else stop_token_targets

            tower_inputs = []
            tower_mel_targets = []
            tower_stop_token_targets = []

            batch_size = tf.shape(inputs)[0]
            mel_channels = hp.num_mels
            for i in range(hp.tacotron_num_gpus):
                tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
                if p_mel_targets is not None:
                    tower_mel_targets.append(
                        tf.reshape(p_mel_targets[i],
                                   [batch_size, -1, mel_channels]))
                if p_stop_token_targets is not None:
                    tower_stop_token_targets.append(
                        tf.reshape(p_stop_token_targets[i], [batch_size, -1]))

        T2_output_range = (-hp.max_abs_value,
                           hp.max_abs_value) if hp.symmetric_mels else (
                               0, hp.max_abs_value)

        self.tower_decoder_output = []
        self.tower_alignments = []
        self.tower_stop_token_prediction = []
        self.tower_mel_outputs = []

        tower_embedded_inputs = []
        tower_enc_conv_output_shape = []
        tower_encoder_outputs = []
        tower_residual = []
        tower_projected_residual = []

        # 1. Declare GPU Devices
        gpus = ["/gpu:{}".format(i) for i in range(hp.tacotron_num_gpus)]
        for i in range(hp.tacotron_num_gpus):
            with tf.device(
                    tf.train.replica_device_setter(ps_tasks=1,
                                                   ps_device="/cpu:0",
                                                   worker_device=gpus[i])):
                with tf.variable_scope('inference') as scope:
                    assert hp.tacotron_teacher_forcing_mode in ('constant',
                                                                'scheduled')
                    if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                        assert global_step is not None

                    # Embeddings ==> [batch_size, sequence_length, embedding_dim]
                    self.embedding_table = tf.get_variable(
                        'inputs_embedding', [len(symbols), hp.embedding_dim],
                        dtype=tf.float32)
                    embedded_inputs = tf.nn.embedding_lookup(
                        self.embedding_table, tower_inputs[i])

                    #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                    encoder_cell = TacotronEncoderCell(
                        EncoderConvolutions(is_training,
                                            hparams=hp,
                                            scope='encoder_convolutions'),
                        EncoderRNN(is_training,
                                   size=hp.encoder_lstm_units,
                                   zoneout=hp.tacotron_zoneout_rate,
                                   scope='encoder_LSTM'))

                    encoder_outputs = encoder_cell(embedded_inputs,
                                                   tower_input_lengths[i])

                    #For shape visualization purpose
                    enc_conv_output_shape = encoder_cell.conv_output_shape

                    #Decoder Parts
                    #Attention Decoder Prenet
                    prenet = Prenet(is_training,
                                    layers_sizes=hp.prenet_layers,
                                    drop_rate=hp.tacotron_dropout_rate,
                                    scope='decoder_prenet')
                    #Attention Mechanism
                    attention_mechanism = LocationSensitiveAttention(
                        hp.attention_dim,
                        encoder_outputs,
                        hparams=hp,
                        is_training=is_training,
                        mask_encoder=hp.mask_encoder,
                        memory_sequence_length=tf.reshape(
                            tower_input_lengths[i], [-1]),
                        smoothing=hp.smoothing,
                        cumulate_weights=hp.cumulative_weights)
                    #Decoder LSTM Cells
                    decoder_lstm = DecoderRNN(is_training,
                                              layers=hp.decoder_layers,
                                              size=hp.decoder_lstm_units,
                                              zoneout=hp.tacotron_zoneout_rate,
                                              scope='decoder_LSTM')
                    #Frames Projection layer
                    frame_projection = FrameProjection(
                        hp.num_mels * hp.outputs_per_step,
                        scope='linear_transform_projection')
                    #<stop_token> projection layer
                    stop_projection = StopProjection(
                        is_training,
                        shape=hp.outputs_per_step,
                        scope='stop_token_projection')

                    #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                    decoder_cell = TacotronDecoderCell(prenet,
                                                       attention_mechanism,
                                                       decoder_lstm,
                                                       frame_projection,
                                                       stop_projection)

                    #Define the helper for our decoder
                    if is_training:
                        self.helper = TacoTrainingHelper(
                            batch_size, tower_mel_targets[i], hp, global_step)
                    else:
                        self.helper = TacoTestHelper(batch_size, hp)

                    #initial decoder state
                    decoder_init_state = decoder_cell.zero_state(
                        batch_size=batch_size, dtype=tf.float32)

                    #Only use max iterations at synthesis time
                    max_iters = hp.max_iters if not (is_training) else None

                    #Decode
                    (frames_prediction, stop_token_prediction,
                     _), final_decoder_state, _ = dynamic_decode(
                         CustomDecoder(decoder_cell, self.helper,
                                       decoder_init_state),
                         impute_finished=False,
                         maximum_iterations=max_iters,
                         swap_memory=hp.tacotron_swap_with_cpu)

                    # Reshape outputs to be one output per entry
                    #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                    decoder_output = tf.reshape(frames_prediction,
                                                [batch_size, -1, hp.num_mels])
                    stop_token_prediction = tf.reshape(stop_token_prediction,
                                                       [batch_size, -1])

                    if hp.clip_outputs:
                        decoder_output = tf.minimum(
                            tf.maximum(
                                decoder_output,
                                T2_output_range[0] - hp.lower_bound_decay),
                            T2_output_range[1])

                    #Postnet
                    postnet = Postnet(is_training,
                                      hparams=hp,
                                      scope='postnet_convolutions')

                    #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
                    residual = postnet(decoder_output)

                    #Project residual to same dimension as mel spectrogram
                    #==> [batch_size, decoder_steps * r, num_mels]
                    residual_projection = FrameProjection(
                        hp.num_mels, scope='postnet_projection')
                    projected_residual = residual_projection(residual)

                    #Compute the mel spectrogram
                    mel_outputs = decoder_output + projected_residual

                    if hp.clip_outputs:
                        mel_outputs = tf.minimum(
                            tf.maximum(
                                mel_outputs,
                                T2_output_range[0] - hp.lower_bound_decay),
                            T2_output_range[1])

                    #Grab alignments from the final decoder state
                    alignments = tf.transpose(
                        final_decoder_state.alignment_history.stack(),
                        [1, 2, 0])

                    self.tower_decoder_output.append(decoder_output)
                    self.tower_alignments.append(alignments)
                    self.tower_stop_token_prediction.append(
                        stop_token_prediction)
                    self.tower_mel_outputs.append(mel_outputs)
                    tower_embedded_inputs.append(embedded_inputs)
                    tower_enc_conv_output_shape.append(enc_conv_output_shape)
                    tower_encoder_outputs.append(encoder_outputs)
                    tower_residual.append(residual)
                    tower_projected_residual.append(projected_residual)

            log('initialisation done {}'.format(gpus[i]))

        if is_training:
            self.ratio = self.helper._ratio
        self.tower_inputs = tower_inputs
        self.tower_input_lengths = tower_input_lengths
        self.tower_mel_targets = tower_mel_targets
        self.tower_targets_lengths = tower_targets_lengths
        self.tower_stop_token_targets = tower_stop_token_targets

        self.all_vars = tf.trainable_variables()

        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log('  Train mode:               {}'.format(is_training))
        log('  Input:                    {}'.format(inputs.shape))
        for i in range(hp.tacotron_num_gpus):
            log('  device:                   {}'.format(i))
            log('  embedding:                {}'.format(
                tower_embedded_inputs[i].shape))
            log('  enc conv out:             {}'.format(
                tower_enc_conv_output_shape[i]))
            log('  encoder out:              {}'.format(
                tower_encoder_outputs[i].shape))
            log('  decoder out:              {}'.format(
                self.tower_decoder_output[i].shape))
            log('  residual out:             {}'.format(
                tower_residual[i].shape))
            log('  projected residual out:   {}'.format(
                tower_projected_residual[i].shape))
            log('  mel out:                  {}'.format(
                self.tower_mel_outputs[i].shape))
            log('  <stop_token> out:         {}'.format(
                self.tower_stop_token_prediction[i].shape))

        #1_000_000 is causing syntax problems for some people?! Python please :)
        log('  Tacotron Parameters       {:.3f} Million.'.format(
            np.sum([np.prod(v.get_shape().as_list())
                    for v in self.all_vars]) / 1000000))

Example #13

Show file

File: tacotron.py Project: ruclion/gst_tacotron2_wavenet

	def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False,
			global_step=None, is_training=False, is_evaluating=False, reference_mel=None):
		"""
		Builds the Tacotron graph conditioned on a style embedding.

		sets "mel_outputs", "alignments" and "style_embeddings" fields (among others).

		The style embedding comes from one of two places:
			- a reference encoder run on a reference mel (followed by multi-head style attention
			  over the style tokens when hp.use_gst is set), or
			- when no reference mel is available, a random softmax-weighted mix of the style tokens
			  (this path needs hp.use_gst, since it reads the style tokens).

		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
			  steps in the input time series, and values are character IDs
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
			  of each sequence in inputs.
			- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
			  of steps in the output time series, M is num_mels, and values are entries in the mel
			  spectrogram. Only needed for training.
			- stop_token_targets: stop token targets, forwarded to the training helper. Must accompany
			  mel_targets unless gta is set.
			- linear_targets: linear spectrogram targets. Required in training when hp.predict_linear
			  is set; rejected in GTA mode.
			- targets_lengths: lengths of the targets. Required in training when hp.mask_decoder is set.
			- gta: ground truth aligned mode flag; disables the linear post-processing network.
			- global_step: must be given when training with 'scheduled' teacher forcing.
			- is_training / is_evaluating: mode flags, mutually exclusive. With both unset the model is
			  in synthesis mode (decoding bounded by hp.max_iters).
			- reference_mel: mel spectrogram fed to the reference encoder. Ignored in training, where
			  mel_targets is used as the reference instead.

		Raises:
			ValueError: inconsistent targets, or no reference mel while hp.use_gst is disabled.
			RuntimeError: missing targets_lengths for masking, or training and evaluation both set.
		"""
		if mel_targets is None and stop_token_targets is not None:
			raise ValueError('no mel targets were provided but token_targets were given')
		if mel_targets is not None and stop_token_targets is None and not gta:
			raise ValueError('Mel targets are provided without corresponding token_targets')
		#Truthiness (rather than ==True) keeps this check in line with post_condition below
		if not gta and self._hparams.predict_linear and linear_targets is None and is_training:
			raise ValueError('Model is set to use post processing to predict linear spectrograms in training but no linear targets given!')
		if gta and linear_targets is not None:
			raise ValueError('Linear spectrogram prediction is not supported in GTA mode!')
		if is_training and self._hparams.mask_decoder and targets_lengths is None:
			raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!')
		if is_training and is_evaluating:
			raise RuntimeError('Model can not be in training and evaluation modes at the same time!')

		#In training the reference encoder always consumes the mel targets
		if is_training:
			reference_mel = mel_targets

		#Without a reference mel the style embedding is sampled from the style tokens, which only
		#exist when use_gst is enabled. Fail early with a clear message instead of crashing on an
		#unbound "gst_tokens" half way through graph construction.
		if reference_mel is None and not self._hparams.use_gst:
			raise ValueError('No reference mel is available and hparams.use_gst is disabled: '
				'there are no style tokens to build a style embedding from!')

		with tf.variable_scope('inference'):
			batch_size = tf.shape(inputs)[0]
			hp = self._hparams
			assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
			if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
				assert global_step is not None

			#GTA is only used for predicting mels to train Wavenet vocoder, so we omit post processing when doing GTA synthesis
			post_condition = hp.predict_linear and not gta

			# Embeddings ==> [batch_size, sequence_length, embedding_dim]
			embedding_table = tf.get_variable(
				'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32)
			embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)


			if hp.use_gst:
				#Global style tokens (GST) ==> [num_gst, style_embed_depth // num_heads]
				gst_tokens = tf.get_variable('style_tokens', [hp.num_gst, hp.style_embed_depth // hp.num_heads],
					dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5))
				self.gst_tokens = gst_tokens


			#Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
			encoder_cell = TacotronEncoderCell(
				EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'),
				EncoderRNN(is_training, size=hp.encoder_lstm_units,
					zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM'))

			encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

			#For shape visualization purpose
			enc_conv_output_shape = encoder_cell.conv_output_shape

			if reference_mel is not None:
				# Reference encoder
				refnet_outputs = reference_encoder(
					reference_mel,
					filters=hp.reference_filters,
					kernel_size=(3,3),
					strides=(2,2),
					encoder_cell=GRUCell(hp.reference_depth),
					is_training=is_training)                                                 # [N, 128]
				self.refnet_outputs = refnet_outputs

				if hp.use_gst:
					# Style attention: reference embedding is the query, tanh(style tokens) the memory
					style_attention = MultiheadAttention(
						tf.expand_dims(refnet_outputs, axis=1),                                   # [N, 1, 128]
						tf.tanh(tf.tile(tf.expand_dims(gst_tokens, axis=0), [batch_size,1,1])),            # [N, hp.num_gst, 256/hp.num_heads]
						num_heads=hp.num_heads,
						num_units=hp.style_att_dim,
						attention_type=hp.style_att_type)

					style_embeddings = style_attention.multi_head_attention()
				else:
					style_embeddings = tf.expand_dims(refnet_outputs, axis=1)                   # [N, 1, 128]
			else:
				#No reference available: mix the style tokens with random softmax weights.
				#use_gst is guaranteed here by the check at the top of the method.
				print("Use random weight for GST.")
				random_weights = tf.random_uniform([hp.num_heads, hp.num_gst], maxval=1.0, dtype=tf.float32)
				random_weights = tf.nn.softmax(random_weights, name="random_weights")
				style_embeddings = tf.matmul(random_weights, tf.nn.tanh(gst_tokens))
				#Flatten the per-head mixes into a single [1, 1, num_heads * token_depth] embedding
				style_embeddings = tf.reshape(style_embeddings, [1, 1] + [hp.num_heads * gst_tokens.get_shape().as_list()[1]])


			#Extend style embeddings to be compatible with encoder_outputs.
			#Make encoder_output's dimensions by concatenating style embeddings with a vector of all zeroes.
			#Preserves effect of both style and encoder_outputs.
			#(x + (-x) yields an all-zero tensor with the same shape as the style embedding)
			neg = tf.add(style_embeddings, tf.negative(style_embeddings))
			style_embeddings = tf.concat([style_embeddings, neg], axis=-1)


			# Add style embedding to every text encoder state (tiled along the time axis)
			#NOTE(review): the addition assumes the zero-padded style depth matches the encoder
			#output depth -- confirm against the hparams in use.
			style_embeddings = tf.tile(style_embeddings, [1, shape_list(encoder_outputs)[1], 1])
			encoder_outputs = tf.add(encoder_outputs, style_embeddings)

			#Decoder Parts
			#Attention Decoder Prenet
			prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet')
			#Attention Mechanism
			attention_mechanism = LocationSensitiveAttention(hp.attention_dim, encoder_outputs, hparams=hp,
				mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths, smoothing=hp.smoothing,
				cumulate_weights=hp.cumulative_weights)
			#Decoder LSTM Cells

			decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
				size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_lstm')
			#Frames Projection layer
			frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, scope='linear_transform')
			#<stop_token> projection layer
			stop_projection = StopProjection(is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection')


			#Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
			decoder_cell = TacotronDecoderCell(
				prenet,
				attention_mechanism,
				decoder_lstm,
				frame_projection,
				stop_projection)
			#Define the helper for our decoder
			if is_training or is_evaluating or gta:
				self.helper = TacoTrainingHelper(batch_size, mel_targets, stop_token_targets, hp, gta, is_evaluating, global_step)
			else:
				self.helper = TacoTestHelper(batch_size, hp)


			#initial decoder state
			decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

			#Only use max iterations at synthesis time
			max_iters = hp.max_iters if not (is_training or is_evaluating) else None

			#Decode
			(frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
				CustomDecoder(decoder_cell, self.helper, decoder_init_state),
				impute_finished=False,
				maximum_iterations=max_iters,
				swap_memory=hp.tacotron_swap_with_cpu)


			# Reshape outputs to be one output per entry
			#==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
			decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
			stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])


			#Postnet
			postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions')

			#Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
			residual = postnet(decoder_output)

			#Project residual to same dimension as mel spectrogram
			#==> [batch_size, decoder_steps * r, num_mels]
			residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection')
			projected_residual = residual_projection(residual)


			#Compute the mel spectrogram
			mel_outputs = decoder_output + projected_residual


			if post_condition:
				#Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
				#Post-processing Network to map mels to linear spectrograms using same architecture as the encoder
				post_processing_cell = TacotronEncoderCell(
					EncoderConvolutions(is_training, hparams=hp, scope='post_processing_convolutions'),
					EncoderRNN(is_training, size=hp.encoder_lstm_units,
						zoneout=hp.tacotron_zoneout_rate, scope='post_processing_LSTM'))

				#NOTE(review): unlike the text encoder call above, no lengths are passed here --
				#confirm TacotronEncoderCell accepts a single argument.
				expand_outputs = post_processing_cell(mel_outputs)
				linear_outputs = FrameProjection(hp.num_freq, scope='post_processing_projection')(expand_outputs)

			#Grab alignments from the final decoder state
			alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0])

			if is_training:
				#Teacher forcing ratio, exposed by the training helper
				self.ratio = self.helper._ratio
			self.inputs = inputs
			self.input_lengths = input_lengths
			self.decoder_output = decoder_output
			self.alignments = alignments
			self.style_embeddings = style_embeddings
			self.stop_token_prediction = stop_token_prediction
			self.stop_token_targets = stop_token_targets
			self.mel_outputs = mel_outputs
			if post_condition:
				self.linear_outputs = linear_outputs
				self.linear_targets = linear_targets
			self.mel_targets = mel_targets
			self.targets_lengths = targets_lengths
			log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
			log('  Train mode:               {}'.format(is_training))
			log('  Eval mode:                {}'.format(is_evaluating))
			log('  GTA mode:                 {}'.format(gta))
			log('  Synthesis mode:           {}'.format(not (is_training or is_evaluating)))
			log('  embedding:                {}'.format(embedded_inputs.shape))
			log('  enc conv out:             {}'.format(enc_conv_output_shape))
			log('  encoder out:              {}'.format(encoder_outputs.shape))
			log('  decoder out:              {}'.format(decoder_output.shape))
			log('  residual out:             {}'.format(residual.shape))
			log('  projected residual out:   {}'.format(projected_residual.shape))
			log('  style embedding:         %d' % style_embeddings.shape[-1])
			log('  mel out:                  {}'.format(mel_outputs.shape))
			if post_condition:
				log('  linear out:               {}'.format(linear_outputs.shape))
			log('  <stop_token> out:         {}'.format(stop_token_prediction.shape))

Example #14

Show file

File: tacotron.py Project: silifor/linears_decoder_tacotron-2-joee

    def initialize(self, inputs, input_speaker_id, input_lengths,
                   mel_targets=None, stop_token_targets=None,
                   linear_targets=None, targets_lengths=None, gta=False,
                   global_step=None, is_training=False, is_evaluating=False):
        """Build the Tacotron graph for this speaker-id variant of the model.

        Stores "mel_outputs", "alignments" and related tensors on the instance.

        Args:
            inputs: encoder input tensor; it is handed to the encoder cell
                as-is (no character embedding lookup is done in this variant).
            input_speaker_id: ids looked up in the speaker embedding table.
            input_lengths: lengths of each sequence in inputs; also used as the
                attention memory sequence length.
            mel_targets: mel spectrogram targets, only needed for training.
            stop_token_targets: must accompany mel_targets unless gta is set.
            linear_targets: required in training when hp.predict_linear is set;
                rejected in GTA mode.
            targets_lengths: required in training when hp.mask_decoder is set.
            gta: ground truth aligned mode; disables the linear post-net.
            global_step: must be given when training with 'scheduled' teacher
                forcing.
            is_training: training mode flag.
            is_evaluating: evaluation mode flag (exclusive with is_training).

        Raises:
            ValueError: when the provided targets are inconsistent.
            RuntimeError: when lengths for masking are missing, or when both
                training and evaluation modes are requested.
        """
        # ---- Argument validation -------------------------------------------
        if mel_targets is None:
            if stop_token_targets is not None:
                raise ValueError(
                    'no mel targets were provided but token_targets were given')
        elif stop_token_targets is None and not gta:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')

        if not gta and self._hparams.predict_linear == True:
            if linear_targets is None and is_training:
                raise ValueError(
                    'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
                )
        if gta and linear_targets is not None:
            raise ValueError(
                'Linear spectrogram prediction is not supported in GTA mode!')

        if is_training:
            if self._hparams.mask_decoder and targets_lengths is None:
                raise RuntimeError(
                    'Model set to mask paddings but no targets lengths provided for the mask!'
                )
            if is_evaluating:
                raise RuntimeError(
                    'Model can not be in training and evaluation modes at the same time!'
                )

        # ---- Graph construction --------------------------------------------
        with tf.variable_scope('inference'):
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            forcing_mode = hp.tacotron_teacher_forcing_mode
            assert forcing_mode in ('constant', 'scheduled')
            if forcing_mode == 'scheduled' and is_training:
                assert global_step is not None

            # GTA only produces mels (to train the Wavenet vocoder), so the
            # linear post-processing stage is skipped in that mode.
            post_condition = hp.predict_linear and not gta

            # Speaker embeddings ==> [batch_size, embedding_dim].
            # The lookup result is not consumed further down in this method;
            # the table itself stays reachable through the instance attribute.
            self.speaker_id_embedding_table = tf.get_variable(
                'input_speaker_id_embedding',
                [hp.speaker_num, hp.speaker_dim],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_speaker_id = tf.nn.embedding_lookup(
                self.speaker_id_embedding_table, input_speaker_id)

            # Encoder ==> [batch_size, encoder_steps, encoder_lstm_units]
            encoder_convolutions = EncoderConvolutions(
                is_training, hparams=hp, scope='encoder_convolutions')
            encoder_rnn = EncoderRNN(
                is_training,
                size=hp.encoder_lstm_units,
                zoneout=hp.tacotron_zoneout_rate,
                scope='encoder_LSTM')
            encoder_cell = TacotronEncoderCell(encoder_convolutions,
                                               encoder_rnn)
            print('inputs:', inputs)
            encoder_outputs = encoder_cell(inputs, input_lengths)

            # The attention memory is currently the plain encoder output.
            id_encoder_outputs = encoder_outputs

            # Kept only for the shape summary logged at the end.
            enc_conv_output_shape = encoder_cell.conv_output_shape

            # ---- Decoder parts ---------------------------------------------
            # Attention decoder prenet
            prenet = Prenet(
                is_training,
                layers_sizes=hp.prenet_layers,
                drop_rate=hp.tacotron_dropout_rate,
                scope='decoder_prenet')
            # Attention mechanism
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_dim,
                id_encoder_outputs,
                hparams=hp,
                is_training=is_training,
                mask_encoder=hp.mask_encoder,
                memory_sequence_length=input_lengths,
                smoothing=hp.smoothing,
                cumulate_weights=hp.cumulative_weights)
            # Decoder LSTM cells
            decoder_lstm = DecoderRNN(
                is_training,
                layers=hp.decoder_layers,
                size=hp.decoder_lstm_units,
                zoneout=hp.tacotron_zoneout_rate,
                scope='decoder_lstm')
            # Frames projection layer
            frame_projection = FrameProjection(
                hp.num_mels * hp.outputs_per_step, scope='linear_transform')
            # <stop_token> projection layer
            stop_projection = StopProjection(
                is_training or is_evaluating,
                shape=hp.outputs_per_step,
                scope='stop_token_projection')

            # Decoder cell ==> [batch_size, decoder_steps, num_mels * r]
            # (after decoding)
            decoder_cell = TacotronDecoderCell(
                prenet,
                attention_mechanism,
                decoder_lstm,
                frame_projection,
                stop_projection)

            # Decoder helper: training-style helper for train/eval/GTA,
            # free-running helper for synthesis.
            if is_training or is_evaluating or gta:
                self.helper = TacoTrainingHelper(
                    batch_size, mel_targets, hp, gta, is_evaluating,
                    global_step)
            else:
                self.helper = TacoTestHelper(batch_size, hp)

            # Initial decoder state
            decoder_init_state = decoder_cell.zero_state(
                batch_size=batch_size, dtype=tf.float32)

            # The iteration cap only applies at synthesis time.
            if is_training or is_evaluating:
                max_iters = None
            else:
                max_iters = hp.max_iters

            # Decode
            decoder = CustomDecoder(decoder_cell, self.helper,
                                    decoder_init_state)
            step_outputs, final_decoder_state, _ = dynamic_decode(
                decoder,
                impute_finished=False,
                maximum_iterations=max_iters,
                swap_memory=hp.tacotron_swap_with_cpu)
            frames_prediction, stop_token_prediction, _ = step_outputs

            # Un-group the r frames emitted per decoder step
            # ==> [batch_size, decoder_steps * r, num_mels]
            decoder_output = tf.reshape(
                frames_prediction, [batch_size, -1, hp.num_mels])
            stop_token_prediction = tf.reshape(
                stop_token_prediction, [batch_size, -1])

            # Postnet residual
            # ==> [batch_size, decoder_steps * r, postnet_channels]
            postnet = Postnet(
                is_training, hparams=hp, scope='postnet_convolutions')
            residual = postnet(decoder_output)

            # Project the residual back to the mel dimension
            # ==> [batch_size, decoder_steps * r, num_mels]
            residual_projection = FrameProjection(
                hp.num_mels, scope='postnet_projection')
            projected_residual = residual_projection(residual)

            # Final mel spectrogram
            mel_outputs = decoder_output + projected_residual

            if post_condition:
                # Post-processing CBHG extracting features from the mels
                # before they are projected to linear spectrograms.
                post_cbhg = CBHG(
                    hp.cbhg_kernels,
                    hp.cbhg_conv_channels,
                    hp.cbhg_pool_size,
                    [hp.cbhg_projection, hp.num_mels],
                    hp.cbhg_projection_kernel_size,
                    hp.cbhg_highwaynet_layers,
                    hp.cbhg_highway_units,
                    hp.cbhg_rnn_units,
                    hp.batch_norm_position,
                    is_training,
                    name='CBHG_postnet')

                # [batch_size, decoder_steps(mel_frames), cbhg_channels]
                post_outputs = post_cbhg(mel_outputs, None)

                # Linear projection of the extracted features
                # ==> [batch_size, decoder_steps(linear_frames), num_freq]
                linear_specs_projection = FrameProjection(
                    hp.num_freq, scope='cbhg_linear_specs_projection')
                linear_outputs = linear_specs_projection(post_outputs)

            # Alignments come from the final decoder state
            alignment_stack = final_decoder_state.alignment_history.stack()
            alignments = tf.transpose(alignment_stack, [1, 2, 0])

            # ---- Expose tensors on the instance ----------------------------
            if is_training:
                self.ratio = self.helper._ratio
            self.inputs = inputs
            self.input_speaker_id = input_speaker_id
            self.input_lengths = input_lengths
            self.decoder_output = decoder_output
            self.alignments = alignments
            self.stop_token_prediction = stop_token_prediction
            self.stop_token_targets = stop_token_targets
            self.mel_outputs = mel_outputs
            if post_condition:
                self.linear_outputs = linear_outputs
                self.linear_targets = linear_targets
            self.mel_targets = mel_targets
            self.targets_lengths = targets_lengths

            # ---- Shape summary ---------------------------------------------
            log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
            summary = [
                ('  Train mode:               {}', is_training),
                ('  Eval mode:                {}', is_evaluating),
                ('  GTA mode:                 {}', gta),
                ('  Synthesis mode:           {}',
                 not (is_training or is_evaluating)),
                ('  embedding:                {}', inputs.shape),
                ('  enc conv out:             {}', enc_conv_output_shape),
                ('  encoder out:              {}', encoder_outputs.shape),
                ('  id encoder out:              {}',
                 id_encoder_outputs.shape),
                ('  decoder out:              {}', decoder_output.shape),
                ('  residual out:             {}', residual.shape),
                ('  projected residual out:   {}', projected_residual.shape),
                ('  mel out:                  {}', mel_outputs.shape),
            ]
            if post_condition:
                summary.append(
                    ('  linear out:               {}', linear_outputs.shape))
            summary.append(
                ('  <stop_token> out:         {}',
                 stop_token_prediction.shape))
            for template, value in summary:
                log(template.format(value))

Example #15

Show file

	def initialize(self, inputs, input_lengths, feature_targets=None, stop_token_targets=None, targets_lengths=None, gta=False,
			global_step=None, is_training=False, is_evaluating=False):
		"""
		Initializes the model for inference

		sets "feature_outputs" and "alignments" fields.

		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
			  steps in the input time series, and values are character IDs
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
			of each sequence in inputs.
			- feature_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
			of steps in the output time series, M is num_mgc + num_lf0 + num_vuv + num_bap, and values are
			entries in the spectrogram. Only needed for training.
		"""
		if feature_targets is None and stop_token_targets is not None:
			raise ValueError('no feature targets were provided but token_targets were given')
		if feature_targets is not None and stop_token_targets is None and not gta:
			raise ValueError('Mel targets are provided without corresponding token_targets')
		if is_training and self._hparams.mask_decoder and targets_lengths is None:
			raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!')
		if is_training and is_evaluating:
			raise RuntimeError('Model can not be in training and evaluation modes at the same time!')

		with tf.variable_scope('inference') as scope:
			batch_size = tf.shape(inputs)[0]
			hp = self._hparams
			target_depth = hp.num_mgc + hp.num_lf0 + hp.num_vuv + hp.num_bap
			assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
			if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
				assert global_step is not None

			# Embeddings ==> [batch_size, sequence_length, embedding_dim]
			embedding_table = tf.get_variable(
				'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32)
			embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)


			#Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
			encoder_cell = TacotronEncoderCell(
				EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'),
				EncoderRNN(is_training, size=hp.encoder_lstm_units,
					zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM'))

			encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

			#For shape visualization purpose
			enc_conv_output_shape = encoder_cell.conv_output_shape


			#Decoder Parts
			#Attention Decoder Prenet
			prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet')
			#Attention Mechanism
			attention_mechanism = LocationSensitiveAttention(hp.attention_dim, encoder_outputs, hparams=hp,
				is_training=is_training, mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths,
				smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights)
			#Decoder LSTM Cells
			decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
				size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_lstm')
			#Frames Projection layer
			frame_projection = FrameProjection(target_depth * hp.outputs_per_step, scope='mgc_transform')
			#<stop_token> projection layer
			stop_projection = StopProjection(is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection')


			#Decoder Cell ==> [batch_size, decoder_steps, target_depth * r] (after decoding)
			decoder_cell = TacotronDecoderCell(prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection)


			#Define the helper for our decoder
			if is_training or is_evaluating or gta:
				self.helper = TacoTrainingHelper(batch_size, feature_targets, target_depth, hp, gta, is_evaluating, global_step)
			else:
				self.helper = TacoTestHelper(batch_size, target_depth, hp)


			#initial decoder state
			decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

			#Only use max iterations at synthesis time
			max_iters = hp.max_iters if not (is_training or is_evaluating) else None

			#Decode
			(frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
				CustomDecoder(decoder_cell, self.helper, decoder_init_state),
				impute_finished=False,
				maximum_iterations=max_iters,
				swap_memory=hp.tacotron_swap_with_cpu)


			# Reshape outputs to be one output per entry
			#==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), target_depth]
			decoder_outputs = tf.reshape(frames_prediction, [batch_size, -1, target_depth])
			stop_token_outputs = tf.reshape(stop_token_prediction, [batch_size, -1])


			#Postnet
			postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions')

			#Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
			residual = postnet(decoder_outputs)

			#Project residual to same dimension as target depth
			#==> [batch_size, decoder_steps * r, target_depth]
			residual_projection = FrameProjection(target_depth, scope='postnet_projection')
			projected_residual = residual_projection(residual)


			#Compute the final outputs
			final_outputs = decoder_outputs + projected_residual

			#Compute each feature outputs
			mgc_idx = 0
			lf0_idx = mgc_idx + hp.num_mgc
			vuv_idx = lf0_idx + hp.num_lf0
			bap_idx = vuv_idx + hp.num_vuv
			mgc_outputs = tf.slice(final_outputs, [0, 0, mgc_idx], [-1, -1, hp.num_mgc], name='mgc_outputs')
			lf0_outputs = tf.slice(final_outputs, [0, 0, lf0_idx], [-1, -1, hp.num_lf0])
			lf0_outputs = tf.squeeze(lf0_outputs, axis=-1, name='lf0_outputs')
			vuv_outputs = tf.slice(final_outputs, [0, 0, vuv_idx], [-1, -1, hp.num_vuv], name='vuv_outputs')
			bap_outputs = tf.slice(final_outputs, [0, 0, bap_idx], [-1, -1, hp.num_bap], name='bap_outputs')


			#Grab alignments from the final decoder state
			alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0], name='alignments')

			if is_training:
				self.ratio = self.helper._ratio
			self.inputs = inputs
			self.input_lengths = input_lengths
			self.decoder_outputs = decoder_outputs
			self.final_outputs = final_outputs
			self.feature_targets = feature_targets
			self.alignments = alignments
			self.stop_token_outputs = stop_token_outputs
			self.stop_token_targets = stop_token_targets
			self.lf0_outputs = lf0_outputs
			self.mgc_outputs = mgc_outputs
			self.vuv_outputs = vuv_outputs
			self.bap_outputs = bap_outputs
			self.targets_lengths = targets_lengths
			log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
			log('  Train mode:               {}'.format(is_training))
			log('  Eval mode:                {}'.format(is_evaluating))
			log('  GTA mode:                 {}'.format(gta))
			log('  Synthesis mode:           {}'.format(not (is_training or is_evaluating)))
			log('  embedding:                {}'.format(embedded_inputs.shape))
			log('  enc conv out:             {}'.format(enc_conv_output_shape))
			log('  encoder out:              {}'.format(encoder_outputs.shape))
			log('  decoder out:              {}'.format(decoder_outputs.shape))
			log('  residual out:             {}'.format(residual.shape))
			log('  projected residual out:   {}'.format(projected_residual.shape))
			log('  final out:                {}'.format(final_outputs.shape))
			log('  lf0 out:                  {}'.format(tf.expand_dims(lf0_outputs, axis=-1).shape))
			log('  mgc out:                  {}'.format(mgc_outputs.shape))
			log('  vuv out:                  {}'.format(vuv_outputs.shape))
			log('  bap out:                  {}'.format(bap_outputs.shape))
			log('  <stop_token> out:         {}'.format(stop_token_outputs.shape))

Example #16

Show file

File: tacotron.py Project: ruclion/linears_decoder_tacotron-2-zhaoxt-tacoLinear

    def initialize(self,
                   inputs,
                   speaker,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   linear_targets=None,
                   targets_lengths=None,
                   gta=False,
                   reference_mels=None,
                   reference_lengths=None,
                   global_step=None,
                   is_training=False,
                   is_evaluating=False,
                   split_infos=None,
                   Lf0=None):
        """Build the multi-GPU Tacotron graph and store its outputs on `self`.

        The batch is split into one slice per GPU tower. For every tower the
        encoder, optional style / speaker conditioning, attention decoder and
        postnet (plus the optional CBHG linear post-processing) are built, and
        the results are appended to the `self.tower_*` lists.

        Args:
            inputs: Input features. Split per tower with `split_func` (as
                float32) and reshaped to [batch_size, -1, 345]; they are fed
                to the encoder directly, without an embedding lookup.
            speaker: Tensor split along axis 0 per tower; used as ids for the
                speaker embedding lookup when `hp.use_multispeaker` is set.
            input_lengths: Tensor split along axis 0 per tower; passed to the
                encoder and to the attention mechanism.
            mel_targets: Optional targets, reshaped per tower to
                [batch_size, -1, hp.num_mels] and fed to `TacoTrainingHelper`
                in training, evaluation and GTA modes.
            stop_token_targets: Optional targets, reshaped per tower to
                [batch_size, -1].
            linear_targets: Optional targets, reshaped per tower to
                [batch_size, -1, hp.num_freq].
            targets_lengths: Optional tensor split along axis 0 per tower.
                Required when training with `hp.mask_decoder`.
            gta: Whether to run in ground-truth-aligned mode; disables the
                linear post-processing branch.
            reference_mels: Optional reference features, reshaped per tower to
                [batch_size, -1, hp.num_mels] and fed to the reference encoder
                when `hp.use_style_encoder` is set.
            reference_lengths: Optional tensor split along axis 0 per tower.
            global_step: Forwarded to `TacoTrainingHelper`; must be given when
                training with scheduled teacher forcing.
            is_training: Build the graph in training mode.
            is_evaluating: Build the graph in evaluation mode.
            split_infos: Required 2-D tensor. Columns 0..5 are handed to
                `split_func` for inputs, mel targets, stop-token targets,
                linear targets, reference mels and Lf0 respectively.
            Lf0: Required tensor, reshaped per tower to [batch_size, -1, 2]
                and concatenated to the encoder outputs on the last axis.

        Raises:
            ValueError: On inconsistent target arguments, on missing
                `split_infos` / `Lf0`, or on an unknown
                `hp.style_encoder_type`.
            RuntimeError: If decoder masking is requested without
                `targets_lengths`, or if both training and evaluation modes
                are requested.
        """
        hp = self._hparams

        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no multi targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')
        if not gta and hp.predict_linear == True and linear_targets is None and is_training:
            raise ValueError(
                'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
            )
        if gta and linear_targets is not None:
            raise ValueError(
                'Linear spectrogram prediction is not supported in GTA mode!')
        if is_training and hp.mask_decoder and targets_lengths is None:
            raise RuntimeError(
                'Model set to mask paddings but no targets lengths provided for the mask!'
            )
        if is_training and is_evaluating:
            raise RuntimeError(
                'Model can not be in training and evaluation modes at the same time!'
            )
        # Both are used unconditionally below; fail early with a clear message
        # instead of an opaque error from indexing / converting None.
        if split_infos is None or Lf0 is None:
            raise ValueError(
                'split_infos and Lf0 are required to build the model towers')

        num_gpus = hp.tacotron_num_gpus

        if num_gpus > 1 or hp.split_on_cpu:
            split_device = '/cpu:0'
        else:
            split_device = '/gpu:{}'.format(hp.tacotron_gpu_start_idx)

        with tf.device(split_device):
            lout_float = [tf.float32] * num_gpus

            def _split_per_tower(tensor, info_column):
                """Split `tensor` into one float32 piece per tower, or pass None through."""
                if tensor is None:
                    return None
                return tf.py_func(split_func,
                                  [tensor, split_infos[:, info_column]],
                                  lout_float)

            tower_speaker = tf.split(speaker,
                                     num_or_size_splits=num_gpus,
                                     axis=0)
            tower_input_lengths = tf.split(input_lengths,
                                           num_or_size_splits=num_gpus,
                                           axis=0)
            tower_targets_lengths = targets_lengths
            if targets_lengths is not None:
                tower_targets_lengths = tf.split(targets_lengths,
                                                 num_or_size_splits=num_gpus,
                                                 axis=0)
            tower_reference_lengths = reference_lengths
            if reference_lengths is not None:
                tower_reference_lengths = tf.split(
                    reference_lengths, num_or_size_splits=num_gpus, axis=0)

            p_inputs = _split_per_tower(inputs, 0)
            p_Lf0 = _split_per_tower(Lf0, 5)
            p_mel_targets = _split_per_tower(mel_targets, 1)
            p_stop_token_targets = _split_per_tower(stop_token_targets, 2)
            p_linear_targets = _split_per_tower(linear_targets, 3)
            p_reference_mels = _split_per_tower(reference_mels, 4)

            tower_inputs = []
            tower_Lf0 = []
            tower_mel_targets = []
            tower_stop_token_targets = []
            tower_linear_targets = []
            tower_reference_mels = []

            batch_size = tf.shape(inputs)[0]
            mel_channels = hp.num_mels
            linear_channels = hp.num_freq
            for i in range(num_gpus):
                # NOTE(review): 345 is a hard-coded input feature depth,
                # presumably the PPG dimension -- confirm against the feeder.
                tower_inputs.append(
                    tf.reshape(p_inputs[i], [batch_size, -1, 345]))
                # Lf0 is reshaped to a 2-dimensional feature per step.
                tower_Lf0.append(tf.reshape(p_Lf0[i], [batch_size, -1, 2]))
                if p_mel_targets is not None:
                    tower_mel_targets.append(
                        tf.reshape(p_mel_targets[i],
                                   [batch_size, -1, mel_channels]))
                if p_stop_token_targets is not None:
                    tower_stop_token_targets.append(
                        tf.reshape(p_stop_token_targets[i], [batch_size, -1]))
                if p_linear_targets is not None:
                    tower_linear_targets.append(
                        tf.reshape(p_linear_targets[i],
                                   [batch_size, -1, linear_channels]))
                if p_reference_mels is not None:
                    tower_reference_mels.append(
                        tf.reshape(p_reference_mels[i],
                                   [batch_size, -1, mel_channels]))

        self.tower_decoder_output = []
        self.tower_alignments = []
        self.tower_stop_token_prediction = []
        self.tower_mel_outputs = []
        self.tower_linear_outputs = []
        self.styleembedding = None
        tower_embedded_inputs = []
        tower_enc_conv_output_shape = []
        tower_encoder_outputs = []
        tower_residual = []

        # 1. Declare GPU Devices
        gpus = [
            "/gpu:{}".format(idx)
            for idx in range(hp.tacotron_gpu_start_idx,
                             hp.tacotron_gpu_start_idx + num_gpus)
        ]

        # These checks and flags do not depend on the tower, so they are
        # evaluated once instead of once per GPU.
        assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
        if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
            assert global_step is not None

        # Linear post-processing is skipped in GTA mode.
        post_condition = hp.predict_linear and not gta

        for i in range(num_gpus):
            with tf.device(
                    tf.train.replica_device_setter(ps_tasks=1,
                                                   ps_device="/cpu:0",
                                                   worker_device=gpus[i])):
                with tf.variable_scope('inference'):
                    # The tower inputs are used directly as encoder features;
                    # no embedding table or dense projection is applied.
                    embedded_inputs = tower_inputs[i]
                    Lf0s = tower_Lf0[i]
                    if hp.use_multispeaker:
                        self.speaker_embedding_table = tf.get_variable(
                            'speaker_embedding',
                            [hp.speaker_num, hp.speaker_dim],
                            dtype=tf.float32)
                        speaker_embedding = tf.nn.embedding_lookup(
                            self.speaker_embedding_table, tower_speaker[i])
                        self.speaker_embedding = speaker_embedding

                    # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                    encoder = TacotronEncoder(
                        EncoderConvolutions(is_training,
                                            hparams=hp,
                                            scope='encoder_convolutions'),
                        EncoderRNN(is_training,
                                   size=hp.encoder_lstm_units,
                                   zoneout=hp.tacotron_zoneout_rate,
                                   scope='encoder_LSTM'))

                    encoder_outputs = encoder(embedded_inputs,
                                              tower_input_lengths[i])

                    # Append the Lf0 features to every encoder step.
                    encoder_outputs = tf.concat([encoder_outputs, Lf0s],
                                                axis=-1)
                    self.z_embedding = None

                    if hp.use_style_encoder:
                        # Reference input encoder.
                        reference_embeddings = None
                        if reference_mels is not None:
                            reference_encoder = ReferenceEncoder(
                                hp, is_training, scope='reference_encoder')
                            reference_embeddings = reference_encoder(
                                tower_reference_mels[i])

                        if hp.style_encoder_type == 'gst':
                            style_encoder = GstEncoder(hp,
                                                       is_training,
                                                       scope='gst_encoder')
                            style_outputs = style_encoder(reference_embeddings)
                        elif hp.style_encoder_type == 'vae':
                            style_encoder = VaeEncoder(hp,
                                                       is_training,
                                                       scope='vae_encoder')
                            style_outputs = style_encoder(
                                reference_embeddings, batch_size)
                            self.z_mu = style_outputs['z_mu']
                            self.z_log_var = style_outputs['z_log_var']
                            self.z_embedding = style_outputs['z_embedding']
                        else:
                            raise ValueError(
                                "Only supported gst and vae and cvae!")

                        # [N, 1, style_embed_depth]
                        style_embeddings = style_outputs['style_embedding']
                        self.styleembedding = style_embeddings
                        if hp.concat_style:
                            style_embeddings = tf.tile(
                                style_embeddings,
                                [1, tf.shape(tower_inputs[i])[1], 1])
                            encoder_outputs = tf.concat(
                                [encoder_outputs, style_embeddings], axis=-1)
                        else:
                            # Previously an additive tanh(style) term; changed
                            # to a concatenation as well.
                            broadcast_style = tf.tile(
                                style_embeddings,
                                [1, tf.shape(encoder_outputs)[1], 1])
                            encoder_outputs = tf.concat(
                                [encoder_outputs, broadcast_style], axis=-1)

                    if hp.use_multispeaker:
                        # Repeat the speaker embedding over every input step
                        # and append it to the encoder outputs.
                        speaker_embedding = tf.expand_dims(speaker_embedding,
                                                           axis=1)
                        speaker_embedding = tf.tile(
                            speaker_embedding,
                            [1, tf.shape(tower_inputs[i])[1], 1])
                        encoder_outputs = tf.concat(
                            [encoder_outputs, speaker_embedding], axis=-1)

                    # For shape visualization purpose
                    enc_conv_output_shape = encoder.conv_output_shape

                    # Decoder Parts
                    # Attention Decoder Prenet
                    prenet = Prenet(hp, is_training, scope='decoder_prenet')
                    # Attention Mechanism
                    attention_mechanism = LocationSensitiveSoftAttention(
                        hp.attention_dim, encoder_outputs,
                        tf.reshape(tower_input_lengths[i], [-1]))

                    # Decoder LSTM Cells
                    decoder_lstm = DecoderRNN(is_training,
                                              layers=hp.decoder_layers,
                                              size=hp.decoder_lstm_units,
                                              zoneout=hp.tacotron_zoneout_rate,
                                              scope='decoder_LSTM')
                    # Frames Projection layer
                    frame_projection = FrameProjection(
                        hp.num_mels * hp.outputs_per_step,
                        scope='linear_transform_projection')
                    # <stop_token> projection layer
                    # NOTE(review): other Tacotron-2 variants pass
                    # `is_training or is_evaluating` here -- confirm whether
                    # evaluation expects the same output form as training.
                    stop_projection = StopProjection(
                        is_training,
                        shape=hp.outputs_per_step,
                        scope='stop_token_projection')

                    # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                    decoder_cell = TacotronDecoderCell(
                        prenet, attention_mechanism, decoder_lstm,
                        frame_projection, stop_projection, self.z_embedding)

                    # Define the helper for our decoder
                    if is_training or is_evaluating or gta:
                        self.helper = TacoTrainingHelper(
                            batch_size, tower_mel_targets[i], hp, gta,
                            is_evaluating, global_step)
                    else:
                        self.helper = TacoTestHelper(batch_size, hp)

                    # Initial decoder state
                    decoder_init_state = decoder_cell.zero_state(
                        batch_size=batch_size, dtype=tf.float32)

                    # Only use max iterations at synthesis time
                    if is_training or is_evaluating:
                        max_iters = None
                    else:
                        max_iters = hp.max_iters

                    # Decode
                    (frames_prediction, stop_token_prediction,
                     _), final_decoder_state, _ = dynamic_decode(
                         CustomDecoder(decoder_cell, self.helper,
                                       decoder_init_state),
                         impute_finished=False,
                         maximum_iterations=max_iters,
                         swap_memory=hp.tacotron_swap_with_cpu)

                    # Reshape outputs to be one output per entry
                    # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                    decoder_output = tf.reshape(frames_prediction,
                                                [batch_size, -1, hp.num_mels])
                    stop_token_prediction = tf.reshape(stop_token_prediction,
                                                       [batch_size, -1])

                    # Postnet
                    postnet = Postnet(hparams=hp,
                                      training=is_training,
                                      output_size=hp.num_mels,
                                      scope='postnet_convolutions')

                    # Compute residual using post-net ==> [batch_size, decoder_steps * r, num_mels]
                    residual = postnet(decoder_output)
                    # Compute the mel spectrogram
                    mel_outputs = decoder_output + residual

                    if post_condition:
                        # Post-processing CBHG applied to the mel outputs
                        # before projecting to linear spectrograms.
                        post_cbhg = CBHG(hp.cbhg_kernels,
                                         hp.cbhg_conv_channels,
                                         hp.cbhg_pool_size,
                                         [hp.cbhg_projection, hp.num_mels],
                                         hp.cbhg_projection_kernel_size,
                                         hp.cbhg_highwaynet_layers,
                                         hp.cbhg_highway_units,
                                         hp.cbhg_rnn_units,
                                         is_training,
                                         name='CBHG_postnet')

                        # [batch_size, decoder_steps(mel_frames), cbhg_channels]
                        post_outputs = post_cbhg(mel_outputs, None)

                        # Linear projection of extracted features to make linear spectrogram
                        linear_specs_projection = FrameProjection(
                            hp.num_freq, scope='cbhg_linear_specs_projection')

                        # [batch_size, decoder_steps(linear_frames), num_freq]
                        linear_outputs = linear_specs_projection(post_outputs)

                    # Grab alignments from the final decoder state
                    alignments = tf.transpose(
                        final_decoder_state.alignment_history.stack(),
                        [1, 2, 0])

                    self.tower_decoder_output.append(decoder_output)
                    self.tower_alignments.append(alignments)
                    self.tower_stop_token_prediction.append(
                        stop_token_prediction)
                    self.tower_mel_outputs.append(mel_outputs)
                    tower_embedded_inputs.append(embedded_inputs)
                    tower_enc_conv_output_shape.append(enc_conv_output_shape)
                    tower_encoder_outputs.append(encoder_outputs)
                    tower_residual.append(residual)

                    if post_condition:
                        self.tower_linear_outputs.append(linear_outputs)
            log('initialisation done {}'.format(gpus[i]))

        if is_training:
            self.ratio = self.helper._ratio
        self.tower_inputs = tower_inputs
        self.tower_speaker = tower_speaker
        self.tower_input_lengths = tower_input_lengths
        self.tower_mel_targets = tower_mel_targets
        self.tower_linear_targets = tower_linear_targets
        self.tower_targets_lengths = tower_targets_lengths
        self.tower_stop_token_targets = tower_stop_token_targets
        self.tower_reference_mels = tower_reference_mels
        self.tower_reference_lengths = tower_reference_lengths

        self.all_vars = tf.trainable_variables()

        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log('  Train mode:               {}'.format(is_training))
        log('  Eval mode:                {}'.format(is_evaluating))
        log('  GTA mode:                 {}'.format(gta))
        log('  Synthesis mode:           {}'.format(not (
            is_training or is_evaluating)))
        log('  Input:                    {}'.format(inputs.shape))
        # The per-tower lists hold exactly `num_gpus` entries, so iterate over
        # the towers only; the GPU start offset is used just for the label.
        for i in range(num_gpus):
            log('  device:                   {}'.format(
                hp.tacotron_gpu_start_idx + i))
            log('  embedding:                {}'.format(
                tower_embedded_inputs[i].shape))
            log('  enc conv out:             {}'.format(
                tower_enc_conv_output_shape[i]))
            log('  encoder out:              {}'.format(
                tower_encoder_outputs[i].shape))
            log('  decoder out:              {}'.format(
                self.tower_decoder_output[i].shape))
            log('  residual out:             {}'.format(
                tower_residual[i].shape))
            log('  mel out:                  {}'.format(
                self.tower_mel_outputs[i].shape))
            if post_condition:
                log('  linear out:               {}'.format(
                    self.tower_linear_outputs[i].shape))
            log('  <stop_token> out:         {}'.format(
                self.tower_stop_token_prediction[i].shape))

        # The parameter count covers all trainable variables, so it is
        # reported once rather than once per device.
        # 1_000_000 is avoided because it is a syntax error on older Pythons.
        log('  Tacotron Parameters       {:.3f} Million.'.format(
            np.sum(
                [np.prod(v.get_shape().as_list())
                 for v in self.all_vars]) / 1000000))