Exemple #1
0
    def initialize(self, inputs, input_lengths, mel_targets=None,
                   stop_token_targets=None, linear_targets=None,
                   targets_lengths=None, gta=False, global_step=None,
                   is_training=False, is_evaluating=False):
        """
        Initializes the model for inference

        sets "mel_outputs" and "alignments" fields.

        Args:
            - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
              steps in the input time series, and values are character IDs
            - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
            - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
        """
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError('no mel targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError('Mel targets are provided without corresponding token_targets')
        if not gta and self._hparams.predict_linear==True and linear_targets is None and is_training:
            raise ValueError('Model is set to use post processing to predict linear spectrograms in training but no linear targets given!')
        if gta and linear_targets is not None:
            raise ValueError('Linear spectrogram prediction is not supported in GTA mode!')
        if is_training and self._hparams.mask_decoder and targets_lengths is None:
            raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!')
        if is_training and is_evaluating:
            raise RuntimeError('Model can not be in training and evaluation modes at the same time!')

        with tf.variable_scope('inference') as scope:
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            if hp.tacotron_curriculum_dropout_rate:
                assert global_step is not None
                self.dropout_rate = self._curriculum_dropout(
                    hp.tacotron_dropout_rate,
                    hp.tacotron_curriculum_dropout_gamma,
                    global_step)
            else:
                self.dropout_rate = tf.convert_to_tensor(
                    hp.tacotron_dropout_rate)

            if hp.tacotron_curriculum_zoneout_rate:
                assert global_step is not None
                self.zoneout_rate = self._curriculum_dropout(
                    hp.tacotron_zoneout_rate,
                    hp.tacotron_curriculum_zoneout_gamma,
                    global_step)
            else:
                self.zoneout_rate = tf.convert_to_tensor(
                    hp.tacotron_zoneout_rate)

            assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
            if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                assert global_step is not None

            #GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis
            post_condition = hp.predict_linear and not gta

            # Embeddings ==> [batch_size, sequence_length, embedding_dim]
            embedding_table = tf.get_variable(
                'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32)
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)


            #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
            encoder_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training,
                                    hp.enc_conv_kernel_size,
                                    hp.enc_conv_channels,
                                    hp.enc_conv_num_layers,
                                    self.dropout_rate,
                                    scope='encoder_convolutions'),
                EncoderRNN(is_training, size=hp.encoder_lstm_units,
                    zoneout=self.zoneout_rate, scope='encoder_LSTM'))

            encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

            #For shape visualization purpose
            enc_conv_output_shape = encoder_cell.conv_output_shape


            #Decoder Parts
            #Attention Decoder Prenet
            prenet = Prenet(is_training,
                            layers_sizes=hp.prenet_layers,
                            drop_rate=self.dropout_rate,
                            scope='decoder_prenet')
            #Attention Mechanism
            attention_mechanism = LocationSensitiveAttention(hp.attention_dim, encoder_outputs, hparams=hp,
                mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths, smoothing=hp.smoothing,
                cumulate_weights=hp.cumulative_weights)
            #Decoder LSTM Cells
            decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
                size=hp.decoder_lstm_units, zoneout=self.zoneout_rate, scope='decoder_lstm')
            #Frames Projection layer
            frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, scope='linear_transform')
            #<stop_token> projection layer
            stop_projection = StopProjection(is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection')


            #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
            decoder_cell = TacotronDecoderCell(
                prenet,
                attention_mechanism,
                decoder_lstm,
                frame_projection,
                stop_projection)


            #Define the helper for our decoder
            if is_training or is_evaluating or gta:
                if mel_targets is not None and stop_token_targets is not None:
                    self.helper = TacoTrainingHelper(
                        batch_size, mel_targets, stop_token_targets, hp, gta,
                        is_evaluating, global_step)
                else:
                    if gta:
                        log('Warning: gta set to True but mel_targets or '
                            + 'mel_targets or stop_token_targets not provided'
                            + ', falling back to natural inference')
                    self.helper = TacoTestHelper(batch_size, hp)
            else:
                self.helper = TacoTestHelper(batch_size, hp)


            #initial decoder state
            decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

            #Only use max iterations at synthesis time
            max_iters = hp.max_iters if not (is_training or is_evaluating) else None

            #Decode
            (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
                CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                impute_finished=False,
                maximum_iterations=max_iters,
                swap_memory=hp.tacotron_swap_with_cpu)


            # Reshape outputs to be one output per entry
            #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
            decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
            stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])


            #Postnet
            postnet = Postnet(is_training,
                              hp.postnet_kernel_size,
                              hp.postnet_channels,
                              hp.postnet_num_layers,
                              self.dropout_rate,
                              scope='postnet_convolutions')

            #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
            residual = postnet(decoder_output)

            #Project residual to same dimension as mel spectrogram
            #==> [batch_size, decoder_steps * r, num_mels]
            residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection')
            projected_residual = residual_projection(residual)


            #Compute the mel spectrogram
            mel_outputs = tf.add(decoder_output, projected_residual,
                                 name='mel_outputs')


            if post_condition:
                #Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
                #Post-processing Network to map mels to linear spectrograms using same architecture as the encoder
                post_processing_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training,
                                    hp.enc_conv_kernel_size,
                                    hp.enc_conv_channels,
                                    hp.enc_conv_num_layers,
                                    self.dropout_rate,
                                    scope='post_processing_convolutions'),
                EncoderRNN(is_training, size=hp.encoder_lstm_units,
                    zoneout=self.zoneout_rate, scope='post_processing_LSTM'))

                expand_outputs = post_processing_cell(mel_outputs)
                linear_outputs = FrameProjection(hp.num_freq, scope='post_processing_projection')(expand_outputs)

            #Grab alignments from the final decoder state
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(),
                [1, 2, 0],
                name='alignments')

            self.optimize = None
            self.loss = None
            if is_training:
                self.ratio = self.helper._ratio
            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_output = decoder_output
            self.alignments = alignments
            self.stop_token_prediction = stop_token_prediction
            self.stop_token_targets = stop_token_targets
            self.mel_outputs = mel_outputs
            if post_condition:
                self.linear_outputs = linear_outputs
                self.linear_targets = linear_targets
            self.mel_targets = mel_targets
            self.targets_lengths = targets_lengths
            log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
            log('  Train mode:               {}'.format(is_training))
            log('  Eval mode:                {}'.format(is_evaluating))
            log('  GTA mode:                 {}'.format(gta))
            log('  Synthesis mode:           {}'.format(not (is_training or is_evaluating)))
            log('  embedding:                {}'.format(embedded_inputs.shape))
            log('  enc conv out:             {}'.format(enc_conv_output_shape))
            log('  encoder out:              {}'.format(encoder_outputs.shape))
            log('  decoder out:              {}'.format(decoder_output.shape))
            log('  residual out:             {}'.format(residual.shape))
            log('  projected residual out:   {}'.format(projected_residual.shape))
            log('  mel out:                  {}'.format(mel_outputs.shape))
            if post_condition:
                log('  linear out:               {}'.format(linear_outputs.shape))
            log('  <stop_token> out:         {}'.format(stop_token_prediction.shape))
Exemple #2
0
    def initialize(self,
                   inputs=None,
                   input_lengths=None,
                   mel_targets=None,
                   stop_token_targets=None,
                   linear_targets=None,
                   targets_lengths=None,
                   mel_references=None,
                   references_lengths=None,
                   vae_codes=None,
                   gta=False,
                   global_step=None,
                   use_vae=False,
                   is_training=False,
                   is_evaluating=False,
                   split_infos=None):
        """
		Initializes the model for inference
		sets "mel_outputs" and "alignments" fields.
		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
			  steps in the input time series, and values are character IDs
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
			of each sequence in inputs.
			- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
			of steps in the output time series, M is num_mels, and values are entries in the mel
			spectrogram. Only needed for training.
		"""
        self.use_vae = use_vae
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no multi targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')
        if not gta and self._hparams.predict_linear == True and linear_targets is None and is_training:
            raise ValueError(
                'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
            )
        if gta and linear_targets is not None:
            raise ValueError(
                'Linear spectrogram prediction is not supported in GTA mode!')
        if is_training and self._hparams.mask_decoder and targets_lengths is None:
            raise RuntimeError(
                'Model set to mask paddings but no targets lengths provided for the mask!'
            )
        if is_training and is_evaluating:
            raise RuntimeError(
                'Model can not be in training and evaluation modes at the same time!'
            )
        if use_vae and mel_references is None and vae_codes is None:
            if mel_targets is None:
                raise ValueError(
                    'Mel targets must be provided if neither mel references nor references codes are given!'
                )
            else:
                mel_references = mel_targets
        if use_vae and references_lengths is None and vae_codes is None:
            if targets_lengths is None:
                raise ValueError(
                    'Targets lengths must be provided if neither references lengths nor references codes are given!'
                )
            else:
                references_lengths = targets_lengths

        split_device = '/cpu:0' if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:0'
        with tf.device(split_device):
            hp = self._hparams
            lout_int = [tf.int32] * hp.tacotron_num_gpus
            lout_float = [tf.float32] * hp.tacotron_num_gpus

            tower_input_lengths = tf.split(
                input_lengths, num_or_size_splits=hp.tacotron_num_gpus,
                axis=0) if input_lengths is not None else input_lengths
            tower_targets_lengths = tf.split(
                targets_lengths,
                num_or_size_splits=hp.tacotron_num_gpus,
                axis=0) if targets_lengths is not None else targets_lengths
            tower_references_lengths = tf.split(
                references_lengths,
                num_or_size_splits=hp.tacotron_num_gpus,
                axis=0
            ) if references_lengths is not None else references_lengths
            tower_vae_codes = tf.split(
                vae_codes, num_or_size_splits=hp.tacotron_num_gpus,
                axis=0) if vae_codes is not None else vae_codes

            p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]],
                                  lout_int) if inputs is not None else inputs
            p_mel_targets = tf.py_func(
                split_func, [mel_targets, split_infos[:, 1]],
                lout_float) if mel_targets is not None else mel_targets
            p_mel_references = tf.py_func(
                split_func, [mel_references, split_infos[:, 1]],
                lout_float) if mel_references is not None else mel_references
            p_stop_token_targets = tf.py_func(
                split_func, [stop_token_targets, split_infos[:, 2]], lout_float
            ) if stop_token_targets is not None else stop_token_targets
            p_linear_targets = tf.py_func(
                split_func, [linear_targets, split_infos[:, 3]],
                lout_float) if linear_targets is not None else linear_targets

            tower_inputs = []
            tower_mel_targets = []
            tower_mel_references = []
            tower_stop_token_targets = []
            tower_linear_targets = []

            batch_size = tf.shape(
                inputs)[0] if inputs is not None else tf.shape(
                    mel_references)[0]
            mel_channels = hp.num_mels
            linear_channels = hp.num_freq
            for i in range(hp.tacotron_num_gpus):
                if p_inputs is not None:
                    tower_inputs.append(
                        tf.reshape(p_inputs[i], [batch_size, -1]))
                if p_mel_targets is not None:
                    tower_mel_targets.append(
                        tf.reshape(p_mel_targets[i],
                                   [batch_size, -1, mel_channels]))
                if p_mel_references is not None:
                    tower_mel_references.append(
                        tf.reshape(p_mel_references[i],
                                   [batch_size, -1, mel_channels]))
                if p_stop_token_targets is not None:
                    tower_stop_token_targets.append(
                        tf.reshape(p_stop_token_targets[i], [batch_size, -1]))
                if p_linear_targets is not None:
                    tower_linear_targets.append(
                        tf.reshape(p_linear_targets[i],
                                   [batch_size, -1, linear_channels]))

        T2_output_range = (-hp.max_abs_value,
                           hp.max_abs_value) if hp.symmetric_mels else (
                               0, hp.max_abs_value)

        self.tower_decoder_output = []
        self.tower_alignments = []
        self.tower_stop_token_prediction = []
        self.tower_mel_outputs = []
        self.tower_linear_outputs = []
        self.tower_vae_codes = []
        self.tower_mu = []
        self.tower_log_var = []

        tower_embedded_inputs = []
        tower_enc_conv_output_shape = []
        tower_encoder_outputs = []
        tower_residual = []
        tower_projected_residual = []

        # 1. Declare GPU Devices
        gpus = ["/gpu:{}".format(i) for i in range(hp.tacotron_num_gpus)]
        for i in range(hp.tacotron_num_gpus):
            with tf.device(
                    tf.train.replica_device_setter(ps_tasks=1,
                                                   ps_device="/cpu:0",
                                                   worker_device=gpus[i])):
                with tf.variable_scope('inference') as scope:
                    #VAE Parts
                    if use_vae and inputs is None:
                        vae_code = None
                        if mel_references is not None:
                            vae_code, mu, log_var = VAE(
                                inputs=tower_mel_references[i],
                                input_lengths=tower_references_lengths[i],
                                filters=hp.vae_filters,
                                kernel_size=hp.vae_kernel,
                                stride=hp.vae_stride,
                                num_units=hp.vae_dim,
                                rnn_units=hp.vae_rnn_units,
                                bnorm=hp.batch_norm_position,
                                log_var_minimum=hp.vae_log_var_minimum,
                                is_training=is_training,
                                scope='vae')
#Use vae_codes first if provided
                        if vae_codes is not None:
                            vae_code = tower_vae_codes[i]

                        self.tower_vae_codes.append(vae_code)
                        if mel_references is not None:
                            self.tower_mu.append(mu)
                            self.tower_log_var.append(log_var)

                    if inputs is not None:
                        assert hp.tacotron_teacher_forcing_mode in (
                            'constant', 'scheduled')
                        if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                            assert global_step is not None

                        #When predicting mels to train Wavenet vocoder, post processing should be ommited
                        post_condition = hp.predict_linear

                        # Embeddings ==> [batch_size, sequence_length, embedding_dim]
                        self.embedding_table = tf.get_variable(
                            'inputs_embedding',
                            [len(symbols), hp.embedding_dim],
                            dtype=tf.float32)
                        embedded_inputs = tf.nn.embedding_lookup(
                            self.embedding_table, tower_inputs[i])

                        #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                        encoder_cell = TacotronEncoderCell(
                            EncoderConvolutions(is_training,
                                                hparams=hp,
                                                scope='encoder_convolutions'),
                            EncoderRNN(is_training,
                                       size=hp.encoder_lstm_units,
                                       zoneout=hp.tacotron_zoneout_rate,
                                       scope='encoder_LSTM'))

                        encoder_outputs = encoder_cell(embedded_inputs,
                                                       tower_input_lengths[i])

                        #For shape visualization purpose
                        enc_conv_output_shape = encoder_cell.conv_output_shape

                        vae_code = None
                        if use_vae:
                            if mel_references is not None:
                                vae_code, mu, log_var = VAE(
                                    inputs=tower_mel_references[i],
                                    input_lengths=tower_references_lengths[i],
                                    filters=hp.vae_filters,
                                    kernel_size=hp.vae_kernel,
                                    stride=hp.vae_stride,
                                    num_units=hp.vae_dim,
                                    rnn_units=hp.vae_rnn_units,
                                    bnorm=hp.batch_norm_position,
                                    log_var_minimum=hp.vae_log_var_minimum,
                                    is_training=is_training,
                                    scope='vae')
#Use vae_codes first if provided
                            if vae_codes is not None:
                                vae_code = tower_vae_codes[i]

                        if hp.encoder_outputs_revision_type == 'add':
                            encoder_outputs = encoder_outputs + tf.tile(
                                tf.expand_dims(tf.layers.dense(
                                    vae_code,
                                    hp.encoder_lstm_units * 2,
                                    name='vae_code_projection',
                                    activation='tanh'),
                                               axis=1),
                                [1, tf.shape(encoder_outputs)[1], 1])
                        elif hp.encoder_outputs_revision_type == 'multiply':
                            encoder_outputs = tf.math.multiply(
                                encoder_outputs,
                                tf.tile(
                                    tf.expand_dims(tf.layers.dense(
                                        vae_code,
                                        hp.encoder_lstm_units * 2,
                                        name='vae_code_projection',
                                        activation='tanh'),
                                                   axis=1),
                                    [1, tf.shape(encoder_outputs)[1], 1]))
                        #Attention Decoder Prenet
                        prenet = Prenet(is_training,
                                        layers_sizes=hp.prenet_layers,
                                        drop_rate=hp.tacotron_dropout_rate,
                                        scope='decoder_prenet')
                        #Attention Mechanism
                        attention_mechanism = LocationSensitiveAttention(
                            hp.attention_dim,
                            encoder_outputs,
                            hparams=hp,
                            is_training=is_training,
                            mask_encoder=hp.mask_encoder,
                            memory_sequence_length=tf.reshape(
                                tower_input_lengths[i], [-1]),
                            smoothing=hp.smoothing,
                            cumulate_weights=hp.cumulative_weights)
                        #Decoder LSTM Cells
                        decoder_lstm = DecoderRNN(
                            is_training,
                            layers=hp.decoder_layers,
                            size=hp.decoder_lstm_units,
                            zoneout=hp.tacotron_zoneout_rate,
                            scope='decoder_LSTM')
                        #Frames Projection layer
                        frame_projection = FrameProjection(
                            hp.num_mels * hp.outputs_per_step,
                            scope='linear_transform_projection')
                        #<stop_token> projection layer
                        stop_projection = StopProjection(
                            is_training or is_evaluating,
                            shape=hp.outputs_per_step,
                            scope='stop_token_projection')

                        #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                        decoder_cell = TacotronDecoderCell(
                            prenet, attention_mechanism, decoder_lstm,
                            frame_projection, stop_projection, vae_code, hp)

                        #Define the helper for our decoder
                        if is_training or is_evaluating or gta:
                            self.helper = TacoTrainingHelper(
                                batch_size, tower_mel_targets[i], hp, gta,
                                is_evaluating, global_step)
                        else:
                            self.helper = TacoTestHelper(batch_size, hp)

                        #initial decoder state
                        decoder_init_state = decoder_cell.zero_state(
                            batch_size=batch_size, dtype=tf.float32)

                        #Only use max iterations at synthesis time
                        max_iters = hp.max_iters if not (
                            is_training or is_evaluating) else None

                        #Decode
                        (frames_prediction, stop_token_prediction,
                         _), final_decoder_state, _ = dynamic_decode(
                             CustomDecoder(decoder_cell, self.helper,
                                           decoder_init_state),
                             impute_finished=False,
                             maximum_iterations=max_iters,
                             swap_memory=hp.tacotron_swap_with_cpu)

                        # Reshape outputs to be one output per entry
                        #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                        decoder_output = tf.reshape(
                            frames_prediction, [batch_size, -1, hp.num_mels])
                        stop_token_prediction = tf.reshape(
                            stop_token_prediction, [batch_size, -1])

                        if hp.clip_outputs:
                            decoder_output = tf.minimum(
                                tf.maximum(
                                    decoder_output,
                                    T2_output_range[0] - hp.lower_bound_decay),
                                T2_output_range[1])

                        #Postnet
                        postnet = Postnet(is_training,
                                          hparams=hp,
                                          scope='postnet_convolutions')

                        #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
                        residual = postnet(decoder_output)

                        #Project residual to same dimension as mel spectrogram
                        #==> [batch_size, decoder_steps * r, num_mels]
                        residual_projection = FrameProjection(
                            hp.num_mels, scope='postnet_projection')
                        projected_residual = residual_projection(residual)

                        #Compute the mel spectrogram
                        mel_outputs = decoder_output + projected_residual

                        if hp.clip_outputs:
                            mel_outputs = tf.minimum(
                                tf.maximum(
                                    mel_outputs,
                                    T2_output_range[0] - hp.lower_bound_decay),
                                T2_output_range[1])

                        if post_condition:
                            # Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs.
                            post_cbhg = CBHG(hp.cbhg_kernels,
                                             hp.cbhg_conv_channels,
                                             hp.cbhg_pool_size,
                                             [hp.cbhg_projection, hp.num_mels],
                                             hp.cbhg_projection_kernel_size,
                                             hp.cbhg_highwaynet_layers,
                                             hp.cbhg_highway_units,
                                             hp.cbhg_rnn_units,
                                             hp.batch_norm_position,
                                             is_training,
                                             name='CBHG_postnet')

                            #[batch_size, decoder_steps(mel_frames), cbhg_channels]
                            post_outputs = post_cbhg(mel_outputs, None)

                            #Linear projection of extracted features to make linear spectrogram
                            linear_specs_projection = FrameProjection(
                                hp.num_freq,
                                scope='cbhg_linear_specs_projection')

                            #[batch_size, decoder_steps(linear_frames), num_freq]
                            linear_outputs = linear_specs_projection(
                                post_outputs)

                            if hp.clip_outputs:
                                linear_outputs = tf.minimum(
                                    tf.maximum(
                                        linear_outputs, T2_output_range[0] -
                                        hp.lower_bound_decay),
                                    T2_output_range[1])

                        #Grab alignments from the final decoder state
                        alignments = tf.transpose(
                            final_decoder_state.alignment_history.stack(),
                            [1, 2, 0])

                        self.tower_decoder_output.append(decoder_output)
                        self.tower_alignments.append(alignments)
                        self.tower_stop_token_prediction.append(
                            stop_token_prediction)
                        self.tower_mel_outputs.append(mel_outputs)
                        tower_embedded_inputs.append(embedded_inputs)
                        tower_enc_conv_output_shape.append(
                            enc_conv_output_shape)
                        tower_encoder_outputs.append(encoder_outputs)
                        tower_residual.append(residual)
                        tower_projected_residual.append(projected_residual)

                        if post_condition:
                            self.tower_linear_outputs.append(linear_outputs)
                        if use_vae:
                            self.tower_vae_codes.append(vae_code)
                        if mel_references is not None:
                            self.tower_mu.append(mu)
                            self.tower_log_var.append(log_var)

                log('initialisation done {}'.format(gpus[i]))

        if is_training:
            self.ratio = self.helper._ratio
        self.tower_inputs = tower_inputs
        self.tower_input_lengths = tower_input_lengths
        self.tower_mel_targets = tower_mel_targets
        self.tower_linear_targets = tower_linear_targets
        self.tower_targets_lengths = tower_targets_lengths
        self.tower_stop_token_targets = tower_stop_token_targets

        self.all_vars = tf.trainable_variables()

        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log('  Train mode:               {}'.format(is_training))
        log('  Eval mode:                {}'.format(is_evaluating))
        log('  GTA mode:                 {}'.format(gta))
        log('  Synthesis mode:           {}'.format(not (
            is_training or is_evaluating)))
        if inputs is not None:
            log('  Input:                    {}'.format(inputs.shape))
            for i in range(hp.tacotron_num_gpus):
                log('  device:                   {}'.format(i))
                log('  embedding:                {}'.format(
                    tower_embedded_inputs[i].shape))
                log('  enc conv out:             {}'.format(
                    tower_enc_conv_output_shape[i]))
                log('  encoder out:              {}'.format(
                    tower_encoder_outputs[i].shape))
                if use_vae:
                    log('  vae code:                 {}'.format(
                        self.tower_vae_codes[i].shape))
                log('  decoder out:              {}'.format(
                    self.tower_decoder_output[i].shape))
                log('  residual out:             {}'.format(
                    tower_residual[i].shape))
                log('  projected residual out:   {}'.format(
                    tower_projected_residual[i].shape))
                log('  mel out:                  {}'.format(
                    self.tower_mel_outputs[i].shape))
                if post_condition:
                    log('  linear out:               {}'.format(
                        self.tower_linear_outputs[i].shape))
                log('  <stop_token> out:         {}'.format(
                    self.tower_stop_token_prediction[i].shape))

            #1_000_000 is causing syntax problems for some people?! Python please :)
            log('  Tacotron Parameters       {:.3f} Million.'.format(
                np.sum(
                    [np.prod(v.get_shape().as_list())
                     for v in self.all_vars]) / 1000000))
Exemple #3
0
    def initialize(self,
                   inputs,
                   inputs_tone_stress,
                   speaker_labels,
                   language_labels,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   linear_targets=None,
                   targets_lengths=None,
                   gta=False,
                   global_step=None,
                   is_training=False,
                   is_evaluating=False,
                   split_infos=None):
        """
		Initializes the model for inference
		sets "mel_outputs" and "alignments" fields.
		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
			  steps in the input time series, and values are character IDs
			  speaker_labels: note the speaker id
			  language_labels:note the language id
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
			of each sequence in inputs.
			- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
			of steps in the output time series, M is num_mels, and values are entries in the mel
			spectrogram. Only needed for training.
		"""
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no multi targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')
        if not gta and self._hparams.predict_linear == True and linear_targets is None and is_training:
            raise ValueError(
                'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
            )
        if gta and linear_targets is not None:
            raise ValueError(
                'Linear spectrogram prediction is not supported in GTA mode!')
        if is_training and self._hparams.mask_decoder and targets_lengths is None:
            raise RuntimeError(
                'Model set to mask paddings but no targets lengths provided for the mask!'
            )
        if is_training and is_evaluating:
            raise RuntimeError(
                'Model can not be in training and evaluation modes at the same time!'
            )

        # self.inputs_printout = inputs
        # self.inputs_tone_stress_printout = inputs_tone_stress

        split_device = '/cpu:0' if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:{}'.format(
            self._hparams.tacotron_gpu_start_idx)
        with tf.device(split_device):
            hp = self._hparams
            lout_int = [tf.int32] * hp.tacotron_num_gpus
            lout_float = [tf.float32] * hp.tacotron_num_gpus

            tower_input_lengths = tf.split(
                input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0)
            tower_targets_lengths = tf.split(
                targets_lengths,
                num_or_size_splits=hp.tacotron_num_gpus,
                axis=0) if targets_lengths is not None else targets_lengths
            tower_speaker_labels = tf.split(
                speaker_labels,
                num_or_size_splits=hp.tacotron_num_gpus,
                axis=0)
            # tower_language_labels = tf.split(language_labels, num_or_size_splits=hp.tacotron_num_gpus, axis=0)

            p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]],
                                  lout_int)
            p_inputs_tone_stress = tf.py_func(
                split_func, [inputs_tone_stress, split_infos[:, 0]], lout_int)
            p_language_labels = tf.py_func(
                split_func, [language_labels, split_infos[:, 0]], lout_int)
            p_mel_targets = tf.py_func(
                split_func, [mel_targets, split_infos[:, 1]],
                lout_float) if mel_targets is not None else mel_targets
            p_stop_token_targets = tf.py_func(
                split_func, [stop_token_targets, split_infos[:, 2]], lout_float
            ) if stop_token_targets is not None else stop_token_targets
            p_linear_targets = tf.py_func(
                split_func, [linear_targets, split_infos[:, 3]],
                lout_float) if linear_targets is not None else linear_targets

            tower_inputs = []
            tower_inputs_tone_stress = []
            tower_language_labels = []
            tower_mel_targets = []
            tower_stop_token_targets = []
            tower_linear_targets = []

            batch_size = tf.shape(inputs)[0]
            mel_channels = hp.num_mels
            linear_channels = hp.num_freq
            for i in range(hp.tacotron_num_gpus):
                tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
                tower_inputs_tone_stress.append(
                    tf.reshape(p_inputs_tone_stress[i], [batch_size, -1]))
                tower_language_labels.append(
                    tf.reshape(p_language_labels[i], [batch_size, -1]))
                if p_mel_targets is not None:
                    tower_mel_targets.append(
                        tf.reshape(p_mel_targets[i],
                                   [batch_size, -1, mel_channels]))
                if p_stop_token_targets is not None:
                    tower_stop_token_targets.append(
                        tf.reshape(p_stop_token_targets[i], [batch_size, -1]))
                if p_linear_targets is not None:
                    tower_linear_targets.append(
                        tf.reshape(p_linear_targets[i],
                                   [batch_size, -1, linear_channels]))

        self.tower_decoder_output = []
        self.tower_alignments = []
        self.tower_stop_token_prediction = []
        self.tower_mel_outputs = []
        self.tower_linear_outputs = []
        self.tower_predict_speaker_labels = []

        # 添加分别的phoneme embedding和 声调重读embedding 和 concat的inputs embedding
        tower_embedded_inputs_phoneme = []
        tower_embedded_inputs_tone_stress = []
        tower_embedded_inputs_concat = []
        tower_enc_conv_output_shape = []
        tower_encoder_outputs = []
        tower_residual = []
        tower_projected_residual = []

        # 1. Declare GPU Devices
        gpus = [
            "/gpu:{}".format(i)
            for i in range(hp.tacotron_gpu_start_idx,
                           hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)
        ]
        for i in range(hp.tacotron_num_gpus):
            with tf.device(
                    tf.train.replica_device_setter(ps_tasks=1,
                                                   ps_device="/cpu:0",
                                                   worker_device=gpus[i])):
                with tf.variable_scope('inference') as scope:
                    assert hp.tacotron_teacher_forcing_mode in ('constant',
                                                                'scheduled')
                    if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                        assert global_step is not None

                    #GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis
                    post_condition = hp.predict_linear and not gta

                    # tf.print(tower_inputs[i])
                    # tf.print(tower_inputs[i])

                    # phoneme Embeddings ==> [batch_size, sequence_length, embedding_dim], 512
                    self.phoneme_embedding_table = tf.get_variable(
                        'inputs_phoneme_embedding',
                        [len(symbols), hp.phoneme_embedding_dim],
                        dtype=tf.float32)
                    embedded_inputs_phoneme = tf.nn.embedding_lookup(
                        self.phoneme_embedding_table, tower_inputs[i])

                    # tone and stress Embeddings ==> [batch_size, sequence_length, embedding_dim], 16
                    self.tone_stress_embedding_table = tf.get_variable(
                        'inputs_tone_stress_embedding', [
                            tone_stress_symbols_max_no,
                            hp.tone_stress_embedding_dim
                        ],
                        dtype=tf.float32)
                    embedded_inputs_tone_stress = tf.nn.embedding_lookup(
                        self.tone_stress_embedding_table,
                        tower_inputs_tone_stress[i])

                    # 拼接, 512 + 16
                    embedded_inputs_concat = tf.concat(
                        [embedded_inputs_phoneme, embedded_inputs_tone_stress],
                        axis=-1)

                    self.speaker_embedding_table = tf.get_variable(
                        'speaker_embedding', [hp.speaker_num, hp.speaker_dim],
                        dtype=tf.float32)
                    embedded_speaker_label = tf.nn.embedding_lookup(
                        self.speaker_embedding_table, tower_speaker_labels[i])

                    self.language_embedding_table = tf.get_variable(
                        'language_embedding',
                        [hp.language_num, hp.language_dim],
                        dtype=tf.float32)
                    embedded_language_label = tf.nn.embedding_lookup(
                        self.language_embedding_table,
                        tower_language_labels[i])

                    #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                    encoder_cell = TacotronEncoderCell(
                        EncoderConvolutions(is_training,
                                            hparams=hp,
                                            scope='encoder_convolutions'),
                        EncoderRNN(is_training,
                                   size=hp.encoder_lstm_units,
                                   zoneout=hp.tacotron_zoneout_rate,
                                   scope='encoder_LSTM'))

                    encoder_outputs = encoder_cell(embedded_inputs_concat,
                                                   tower_input_lengths[i])

                    #For shape visualization purpose
                    enc_conv_output_shape = encoder_cell.conv_output_shape

                    #language concat
                    # input_seq_len = tf.shape(encoder_outputs)[1]
                    # _embedded_language_label = tf.expand_dims(embedded_language_label, axis=1)
                    # _embedded_language_label = tf.tile(_embedded_language_label, multiples=[1, input_seq_len, 1])
                    self.input_seq_len = tf.shape(encoder_outputs)[1]
                    self.language_len = tf.shape(embedded_language_label)[1]
                    self.language_id_print = tower_language_labels[i]
                    self.tone_stress_print = tower_inputs_tone_stress[i]

                    LID_encoder_outputs = tf.concat(
                        [encoder_outputs, embedded_language_label], axis=-1)

                    # Adversarial Speaker-Classifiers,	input:encoder_output,output:predicted speaker_label
                    speaker_classify = Speaker_Classifier(
                        is_training,
                        layer_size=hp.softmax_hidden_layer,
                        speaker_size=hp.speaker_num)
                    predict_speaker_labels = speaker_classify(
                        LID_encoder_outputs, hp.grad_rev_scale)

                    # Variational AutoEncoder
                    if is_training:
                        VAE_cell = VAECell(
                            VAEConvolutions(is_training,
                                            hparams=hp,
                                            scope='VAE_convolutions'),
                            VAERNN(is_training,
                                   layers=hp.VAE_lstm_num_layers,
                                   size=hp.VAE_lstm_layer_size,
                                   zoneout=hp.tacotron_zoneout_rate,
                                   scope='VAE_LSTM'), hp.VAE_pool_size,
                            hp.VAE_D_size)
                        residual_encoding, self.kl_div, self.D_mean, self.D_var = VAE_cell(
                            tower_mel_targets[i], hp.tacotron_batch_size)

                    elif is_evaluating:
                        residual_encoding, self.kl_div = tf.zeros(
                            [hp.tacotron_batch_size, hp.VAE_D_size],
                            dtype=tf.float32), 0
                    else:
                        residual_encoding = tf.zeros(
                            [hp.tacotron_synthesis_batch_size, hp.VAE_D_size],
                            dtype=tf.float32)
                    self.residual_encoding = residual_encoding
                    #Decoder Parts
                    #Attention Decoder Prenet
                    prenet = Prenet(is_training,
                                    layers_sizes=hp.prenet_layers,
                                    drop_rate=hp.tacotron_dropout_rate,
                                    scope='decoder_prenet')
                    #Attention Mechanism
                    attention_mechanism = LocationSensitiveAttention(
                        hp.attention_dim,
                        LID_encoder_outputs,
                        hparams=hp,
                        mask_encoder=hp.mask_encoder,
                        memory_sequence_length=tf.reshape(
                            tower_input_lengths[i], [-1]),
                        smoothing=hp.smoothing,
                        cumulate_weights=hp.cumulative_weights)
                    #Decoder LSTM Cells
                    decoder_lstm = DecoderRNN(is_training,
                                              layers=hp.decoder_layers,
                                              size=hp.decoder_lstm_units,
                                              zoneout=hp.tacotron_zoneout_rate,
                                              scope='decoder_LSTM')
                    #Frames Projection layer
                    frame_projection = FrameProjection(
                        hp.num_mels * hp.outputs_per_step,
                        scope='linear_transform_projection')
                    #<stop_token> projection layer
                    stop_projection = StopProjection(
                        is_training or is_evaluating,
                        shape=hp.outputs_per_step,
                        scope='stop_token_projection')

                    #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                    decoder_cell = TacotronDecoderCell(
                        prenet,
                        attention_mechanism,
                        decoder_lstm,
                        embedded_speaker_label,
                        # embedded_language_label,
                        residual_encoding,
                        frame_projection,
                        stop_projection)

                    #Define the helper for our decoder
                    if is_training or is_evaluating or gta:
                        self.helper = TacoTrainingHelper(
                            batch_size, tower_mel_targets[i], hp, gta,
                            is_evaluating, global_step)
                    else:
                        self.helper = TacoTestHelper(batch_size, hp)

                    #initial decoder state
                    decoder_init_state = decoder_cell.zero_state(
                        batch_size=batch_size, dtype=tf.float32)

                    #Only use max iterations at synthesis time
                    max_iters = hp.max_iters if not (
                        is_training or is_evaluating) else None

                    #Decode
                    (frames_prediction, stop_token_prediction,
                     _), final_decoder_state, _ = dynamic_decode(
                         CustomDecoder(decoder_cell, self.helper,
                                       decoder_init_state),
                         impute_finished=False,
                         maximum_iterations=max_iters,
                         swap_memory=hp.tacotron_swap_with_cpu)

                    # Reshape outputs to be one output per entry
                    #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                    decoder_output = tf.reshape(frames_prediction,
                                                [batch_size, -1, hp.num_mels])
                    stop_token_prediction = tf.reshape(stop_token_prediction,
                                                       [batch_size, -1])

                    #Postnet
                    postnet = Postnet(is_training,
                                      hparams=hp,
                                      scope='postnet_convolutions')

                    #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
                    residual = postnet(decoder_output)

                    #Project residual to same dimension as mel spectrogram
                    #==> [batch_size, decoder_steps * r, num_mels]
                    residual_projection = FrameProjection(
                        hp.num_mels, scope='postnet_projection')
                    projected_residual = residual_projection(residual)

                    #Compute the mel spectrogram
                    mel_outputs = decoder_output + projected_residual

                    if post_condition:
                        # Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs.
                        post_cbhg = CBHG(hp.cbhg_kernels,
                                         hp.cbhg_conv_channels,
                                         hp.cbhg_pool_size,
                                         [hp.cbhg_projection, hp.num_mels],
                                         hp.cbhg_projection_kernel_size,
                                         hp.cbhg_highwaynet_layers,
                                         hp.cbhg_highway_units,
                                         hp.cbhg_rnn_units,
                                         is_training,
                                         name='CBHG_postnet')

                        #[batch_size, decoder_steps(mel_frames), cbhg_channels]
                        post_outputs = post_cbhg(mel_outputs, None)

                        #Linear projection of extracted features to make linear spectrogram
                        linear_specs_projection = FrameProjection(
                            hp.num_freq, scope='cbhg_linear_specs_projection')

                        #[batch_size, decoder_steps(linear_frames), num_freq]
                        linear_outputs = linear_specs_projection(post_outputs)

                    #Grab alignments from the final decoder state
                    alignments = tf.transpose(
                        final_decoder_state.alignment_history.stack(),
                        [1, 2, 0])

                    self.tower_decoder_output.append(decoder_output)
                    self.tower_alignments.append(alignments)
                    self.tower_stop_token_prediction.append(
                        stop_token_prediction)
                    self.tower_mel_outputs.append(mel_outputs)
                    self.tower_predict_speaker_labels.append(
                        predict_speaker_labels)
                    tower_embedded_inputs_phoneme.append(
                        embedded_inputs_phoneme)
                    tower_embedded_inputs_tone_stress.append(
                        embedded_inputs_tone_stress)
                    tower_embedded_inputs_concat.append(embedded_inputs_concat)
                    tower_enc_conv_output_shape.append(enc_conv_output_shape)
                    tower_encoder_outputs.append(LID_encoder_outputs)
                    tower_residual.append(residual)
                    tower_projected_residual.append(projected_residual)

                    if post_condition:
                        self.tower_linear_outputs.append(linear_outputs)
            log('initialisation done {}'.format(gpus[i]))

        if is_training:
            self.ratio = self.helper._ratio
        self.tower_inputs = tower_inputs
        self.tower_inputs_tone_stress = tower_inputs_tone_stress
        self.tower_input_lengths = tower_input_lengths
        self.tower_mel_targets = tower_mel_targets
        self.tower_linear_targets = tower_linear_targets
        self.tower_targets_lengths = tower_targets_lengths
        self.tower_stop_token_targets = tower_stop_token_targets
        self.tower_speaker_labels = tower_speaker_labels
        self.tower_language_labels = tower_language_labels
        self.all_vars = tf.trainable_variables()

        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log('  Train mode:               {}'.format(is_training))
        log('  Eval mode:                {}'.format(is_evaluating))
        log('  GTA mode:                 {}'.format(gta))
        log('  Synthesis mode:           {}'.format(not (
            is_training or is_evaluating)))
        log('  Input:                    {}'.format(inputs.shape))
        for i in range(hp.tacotron_num_gpus + hp.tacotron_gpu_start_idx):
            log('  device:                   {}'.format(i))
            log('  phoneme embedding:        {}'.format(
                tower_embedded_inputs_phoneme[i].shape))
            log('  tone stress embedding:    {}'.format(
                tower_embedded_inputs_tone_stress[i].shape))
            log('  concat embedding:         {}'.format(
                tower_embedded_inputs_concat[i].shape))
            log('  enc conv out:             {}'.format(
                tower_enc_conv_output_shape[i]))
            log('  encoder out:              {}'.format(
                tower_encoder_outputs[i].shape))
            log('  decoder out:              {}'.format(
                self.tower_decoder_output[i].shape))
            log('  residual out:             {}'.format(
                tower_residual[i].shape))
            log('  projected residual out:   {}'.format(
                tower_projected_residual[i].shape))
            log('  mel out:                  {}'.format(
                self.tower_mel_outputs[i].shape))
            if post_condition:
                log('  linear out:               {}'.format(
                    self.tower_linear_outputs[i].shape))
            log('  <stop_token> out:         {}'.format(
                self.tower_stop_token_prediction[i].shape))

            #1_000_000 is causing syntax problems for some people?! Python please :)
            log('  Tacotron Parameters       {:.3f} Million.'.format(
                np.sum(
                    [np.prod(v.get_shape().as_list())
                     for v in self.all_vars]) / 1000000))
Exemple #4
0
	def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None):

		with tf.variable_scope('inference') as scope:
			is_training = False  # No entrenando
			batch_size = tf.shape(inputs)[0]
			hp = self._hparams

			# Tabla de embeddings de speaker
			embedding_table = tf.get_variable('inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32)

			#
			embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

			# Encoder: Capas convolucionales y LSTM
			encoder_cell = TacotronEncoderCell(
				EncoderConvolutions(
					is_training,
					kernel_size=(5, ),
					channels=512,
					scope='encoder_convolutions'
				),
				EncoderRNN(
					is_training,
					size=hp.encoder_lstm_units,
					zoneout=hp.tacotron_zoneout_rate,
					scope='encoder_LSTM')
			)

			# Salida Encoder
			encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

			# Decoder ### Definir elementos para el Decoder

			# Prenet: Dos capas de 256 unidades ReLU
			prenet = Prenet(is_training, layer_sizes=[256, 256], scope='decoder_prenet')

			# Red Atencion
			print("PARAMS ATTENTION", hp.attention_dim, encoder_outputs)
			attention_mechanism = LocationSensitiveAttention(
				hp.attention_dim, encoder_outputs
			)

			# Decoder LSTM
			decoder_lstm = DecoderRNN(
				is_training,
				layers=hp.decoder_layers,
				size=hp.decoder_lstm_units,
				zoneout=hp.tacotron_zoneout_rate,
				scope='decoder_lstm'
			)

			# Frame projection: proyectar resultados a 80 mels
			frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, scope='linear_transform')

			# Stop projection: Projectar salida usando token para separar las palabras individuales
			stop_projection = StopProjection(is_training, scope='stop_token_projection')

			# Unir todo
			decoder_cell = TacotronDecoderCell(
				prenet,
				attention_mechanism,
				decoder_lstm,
				frame_projection,
				stop_projection,
				mask_finished=hp.mask_finished)

			# self.helper = TacoTrainingHelper(batch_size, mel_targets, stop_token_targets, hp.num_mels, hp.outputs_per_step, hp.tacotron_teacher_forcing_ratio)
			# Modo de sintesis
			self.helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)
			# Poner Decoder en estado inicial - zero state
			decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)
			max_iters = hp.max_iters if not is_training else None
			(frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
				CustomDecoder(decoder_cell, self.helper, decoder_init_state),
				impute_finished=hp.impute_finished,
				maximum_iterations=max_iters)

			# Reshape outputs: 1 output por entrada
			decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
			stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])

			# Postnet: cinco capas convolucionales
			postnet = Postnet(
				is_training,
				kernel_size=hp.postnet_kernel_size,
				channels=hp.postnet_channels,
				scope='postnet_convolutions'
			)

			# Resultados
			results = postnet(decoder_output)

			# Proyectar resultados a 80 mels
			results_projection = FrameProjection(hp.num_mels, scope='postnet_projection')
			projected_results = results_projection(results)

			# Calcular espectrograma mel
			mel_outputs = decoder_output + projected_results

			# Tomar alineacion del ultimo estado del decoder
			alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0])

			self.inputs = inputs
			self.input_lengths = input_lengths
			self.decoder_output = decoder_output
			self.alignments = alignments
			self.stop_token_prediction = stop_token_prediction
			self.stop_token_targets = stop_token_targets
			self.mel_outputs = mel_outputs
			self.mel_targets = mel_targets
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   linear_targets=None,
                   targets_lengths=None,
                   gta=False,
                   style_transfer=True,
                   global_step=None,
                   is_training=False,
                   is_evaluating=False,
                   split_infos=None):
        """
        Initializes the model for inference
        sets "mel_outputs" and "alignments" fields.
        Args:
            - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
              steps in the input time series, and values are character IDs
            - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
            - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
        """
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no multi targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not gta and not style_transfer:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')
        if not gta and self._hparams.predict_linear == True and linear_targets is None and is_training:
            raise ValueError(
                'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
            )
        if gta and linear_targets is not None:
            raise ValueError(
                'Linear spectrogram prediction is not supported in GTA mode!')
        if is_training and self._hparams.mask_decoder and targets_lengths is None:
            raise RuntimeError(
                'Model set to mask paddings but no targets lengths provided for the mask!'
            )
        if is_training and is_evaluating:
            raise RuntimeError(
                'Model can not be in training and evaluation modes at the same time!'
            )
        if style_transfer and (not is_training) and (not is_evaluating):
            assert self._hparams.tacotron_style_reference_audio is not None or \
                   (self._hparams.tacotron_style_alignment is not None and
                    len(self._hparams.tacotron_style_alignment) == self._hparams.tacotron_n_style_token)

        split_device = '/cpu:0' if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:{}'.format(
            self._hparams.tacotron_gpu_start_idx)
        with tf.device(split_device):
            hp = self._hparams
            lout_int = [tf.int32] * hp.tacotron_num_gpus
            lout_float = [tf.float32] * hp.tacotron_num_gpus

            tower_input_lengths = tf.split(
                input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0)
            tower_targets_lengths = tf.split(
                targets_lengths,
                num_or_size_splits=hp.tacotron_num_gpus,
                axis=0) if targets_lengths is not None else targets_lengths

            p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]],
                                  lout_int)
            p_mel_targets = tf.py_func(
                split_func, [mel_targets, split_infos[:, 1]],
                lout_float) if mel_targets is not None else mel_targets
            p_stop_token_targets = tf.py_func(
                split_func, [stop_token_targets, split_infos[:, 2]], lout_float
            ) if stop_token_targets is not None else stop_token_targets
            p_linear_targets = tf.py_func(
                split_func, [linear_targets, split_infos[:, 3]],
                lout_float) if linear_targets is not None else linear_targets

            tower_inputs = []
            tower_mel_targets = []
            tower_stop_token_targets = []
            tower_linear_targets = []

            batch_size = tf.shape(inputs)[0]
            mel_channels = hp.num_mels
            linear_channels = hp.num_freq
            for i in range(hp.tacotron_num_gpus):
                tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
                if p_mel_targets is not None:
                    tower_mel_targets.append(
                        tf.reshape(p_mel_targets[i],
                                   [batch_size, -1, mel_channels]))
                if p_stop_token_targets is not None:
                    tower_stop_token_targets.append(
                        tf.reshape(p_stop_token_targets[i], [batch_size, -1]))
                if p_linear_targets is not None:
                    tower_linear_targets.append(
                        tf.reshape(p_linear_targets[i],
                                   [batch_size, -1, linear_channels]))

        self.tower_decoder_output = []
        self.tower_alignments = []
        self.tower_stop_token_prediction = []
        self.tower_mel_outputs = []
        self.tower_linear_outputs = []

        tower_embedded_inputs = []
        tower_enc_conv_output_shape = []
        tower_encoder_outputs = []
        tower_residual = []
        tower_projected_residual = []

        # 1. Declare GPU Devices
        gpus = [
            "/gpu:{}".format(i)
            for i in range(hp.tacotron_gpu_start_idx,
                           hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)
        ]
        for i in range(hp.tacotron_num_gpus):
            with tf.device(
                    tf.train.replica_device_setter(ps_tasks=1,
                                                   ps_device="/cpu:0",
                                                   worker_device=gpus[i])):
                with tf.variable_scope('inference') as scope:
                    assert hp.tacotron_teacher_forcing_mode in ('constant',
                                                                'scheduled')
                    if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                        assert global_step is not None

                    # GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis
                    post_condition = hp.predict_linear and not gta

                    # Embeddings ==> [batch_size, sequence_length, embedding_dim]
                    self.embedding_table = tf.get_variable(
                        'inputs_embedding', [len(symbols), hp.embedding_dim],
                        dtype=tf.float32)
                    embedded_inputs = tf.nn.embedding_lookup(
                        self.embedding_table, tower_inputs[i])

                    # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                    encoder_cell = TacotronEncoderCell(
                        EncoderConvolutions(is_training,
                                            hparams=hp,
                                            scope='encoder_convolutions'),
                        EncoderRNN(is_training,
                                   size=hp.encoder_lstm_units,
                                   zoneout=hp.tacotron_zoneout_rate,
                                   scope='encoder_LSTM'))

                    encoder_outputs = encoder_cell(embedded_inputs,
                                                   tower_input_lengths[i])

                    # For shape visualization purpose
                    enc_conv_output_shape = encoder_cell.conv_output_shape

                    # style token layers
                    self.style_embedding_table = tf.get_variable(
                        'style_token_embedding',
                        [hp.tacotron_n_style_token, hp.embedding_dim],
                        dtype=tf.float32,
                        initializer=tf.truncated_normal_initializer(
                            stddev=0.5))

                    # in order to synthese audio in random weights, style_encoder_outputs[-1]==style_embedding_table[-1]
                    if (is_training or is_evaluating) and style_transfer:
                        style_encoder_cell = TacotronReferenceEncoderCell(
                            ReferenceEncoder(
                                hp,
                                layer_sizes=hp.tacotron_reference_layer_size,
                                is_training=is_training,
                                activation=tf.nn.relu),
                            tf.nn.rnn_cell.GRUCell(
                                num_units=hp.tacotron_reference_gru_hidden_size
                            ),
                            StyleTokenLayer(
                                output_size=hp.
                                tacotron_style_encoder_outputs_size,
                                is_training=is_training),
                            hparams=hp)
                        # style_encoder_outputs: [batch_size,1,hp.encoder_lstm_units*2]
                        style_encoder_outputs = style_encoder_cell(
                            tower_mel_targets[i],
                            input_lengths=tower_targets_lengths[i],
                            style_token_embedding=self.style_embedding_table)
                        # [batch_size,seq_len,hp.encoder_lstm_units*2]
                        # encoder_outputs = encoder_outputs + style_encoder_outputs  # element-wise add
                        seq_len = tf.shape(encoder_outputs)[1]
                        style_encoder_outputs = tf.tile(
                            style_encoder_outputs, multiples=[1, seq_len, 1])
                        encoder_outputs = tf.concat(
                            [encoder_outputs, style_encoder_outputs],
                            axis=-1)  # concat
                    elif (not is_training) and (
                            not is_evaluating
                    ) and style_transfer:  # synthesis with style transfer
                        if hp.tacotron_style_alignment is not None and \
                                len(hp.tacotron_style_alignment) == hp.tacotron_n_style_token:
                            # random weights
                            # [n_style_tokens,]
                            style_alignment = tf.convert_to_tensor(
                                hp.tacotron_style_alignment, dtype=tf.float32)
                            # [n_style_tokens,1]*[n_style_tokens, embed_size] -> [n_style_tokens,embed_size] -> [embed_size,]
                            style_context = tf.reduce_sum(
                                tf.expand_dims(style_alignment, axis=-1) *
                                self.style_embedding_table,
                                axis=0)
                            # [1,embed_size]
                            style_context = tf.expand_dims(style_context,
                                                           axis=0)
                            # [batch_size,1,embed_size], all synthesis audio should be one style
                            style_context = tf.tile(
                                tf.expand_dims(style_context, axis=0),
                                multiples=[batch_size, 1, 1])
                            # encoder_outputs = encoder_outputs + style_context  # element-wise add
                            seq_len = tf.shape(encoder_outputs)[1]
                            style_context = tf.tile(style_context,
                                                    multiples=[1, seq_len, 1])

                            # dense?
                            # style_context = tf.layers.dense(style_context, units=hp.tacotron_style_encoder_outputs_size,
                            #                                 activation=tf.nn.tanh)
                            encoder_outputs = tf.concat(
                                [encoder_outputs, style_context],
                                axis=-1)  # concat
                        elif len(tower_mel_targets
                                 ) > 0 and tower_mel_targets[i] is not None:
                            # reference audio
                            style_encoder_cell = TacotronReferenceEncoderCell(
                                ReferenceEncoder(hp,
                                                 layer_sizes=hp.
                                                 tacotron_reference_layer_size,
                                                 is_training=is_training,
                                                 activation=tf.nn.relu),
                                tf.nn.rnn_cell.GRUCell(
                                    num_units=hp.
                                    tacotron_reference_gru_hidden_size),
                                StyleTokenLayer(
                                    output_size=hp.
                                    tacotron_style_encoder_outputs_size,
                                    is_training=is_training),
                                hparams=hp)
                            # style_encoder_outputs: [batch_size,1,hp.encoder_lstm_units*2(hp.tacotron_style_encoder_outputs_size)]
                            style_encoder_outputs = style_encoder_cell(
                                tower_mel_targets[i],
                                # audio style reference audio
                                input_lengths=tower_targets_lengths[i],
                                style_token_embedding=self.
                                style_embedding_table)
                            # [batch_size,seq_len,hp.encoder_lstm_units*2]
                            # encoder_outputs = encoder_outputs + style_encoder_outputs  # element-wise add
                            seq_len = tf.shape(encoder_outputs)[1]
                            style_encoder_outputs = tf.tile(
                                style_encoder_outputs,
                                multiples=[1, seq_len, 1])
                            encoder_outputs = tf.concat(
                                [encoder_outputs, style_encoder_outputs],
                                axis=-1)  # concat

                    # Decoder Parts
                    # Attention Decoder Prenet
                    prenet = Prenet(is_training,
                                    layers_sizes=hp.prenet_layers,
                                    drop_rate=hp.tacotron_dropout_rate,
                                    scope='decoder_prenet')
                    # Attention Mechanism
                    attention_mechanism = LocationSensitiveAttention(
                        hp.attention_dim,
                        encoder_outputs,
                        hparams=hp,
                        mask_encoder=hp.mask_encoder,
                        memory_sequence_length=tf.reshape(
                            tower_input_lengths[i], [-1]),
                        smoothing=hp.smoothing,
                        cumulate_weights=hp.cumulative_weights)
                    # Decoder LSTM Cells
                    decoder_lstm = DecoderRNN(is_training,
                                              layers=hp.decoder_layers,
                                              size=hp.decoder_lstm_units,
                                              zoneout=hp.tacotron_zoneout_rate,
                                              scope='decoder_LSTM')
                    # Frames Projection layer
                    frame_projection = FrameProjection(
                        hp.num_mels * hp.outputs_per_step,
                        scope='linear_transform_projection')
                    # <stop_token> projection layer
                    stop_projection = StopProjection(
                        is_training or is_evaluating,
                        shape=hp.outputs_per_step,
                        scope='stop_token_projection')

                    # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                    # only generate r frames(num_mels*r) in one decoder step, [batch_size,1,num_mels*r]
                    # then put this decoder_cell with helper into dynamic_decode to generate [batch_size,decoder_steps,num_mels*r]
                    decoder_cell = TacotronDecoderCell(prenet,
                                                       attention_mechanism,
                                                       decoder_lstm,
                                                       frame_projection,
                                                       stop_projection)

                    # Define the helper for our decoder
                    if is_training or is_evaluating or gta:
                        self.helper = TacoTrainingHelper(
                            batch_size, tower_mel_targets[i], hp, gta,
                            is_evaluating, global_step)
                    else:
                        self.helper = TacoTestHelper(batch_size, hp)

                    # initial decoder state
                    decoder_init_state = decoder_cell.zero_state(
                        batch_size=batch_size, dtype=tf.float32)

                    # Only use max iterations at synthesis time
                    max_iters = hp.max_iters if not (
                        is_training or is_evaluating) else None

                    # Decode
                    (frames_prediction, stop_token_prediction,
                     _), final_decoder_state, _ = dynamic_decode(
                         CustomDecoder(decoder_cell, self.helper,
                                       decoder_init_state),
                         impute_finished=False,
                         maximum_iterations=max_iters,
                         swap_memory=hp.tacotron_swap_with_cpu)

                    # Reshape outputs to be one output per entry
                    # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                    decoder_output = tf.reshape(frames_prediction,
                                                [batch_size, -1, hp.num_mels])
                    stop_token_prediction = tf.reshape(stop_token_prediction,
                                                       [batch_size, -1])

                    # Postnet
                    postnet = Postnet(is_training,
                                      hparams=hp,
                                      scope='postnet_convolutions')

                    # Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
                    residual = postnet(decoder_output)

                    # Project residual to same dimension as mel spectrogram
                    # ==> [batch_size, decoder_steps * r, num_mels]
                    residual_projection = FrameProjection(
                        hp.num_mels, scope='postnet_projection')
                    projected_residual = residual_projection(residual)

                    # Compute the mel spectrogram
                    mel_outputs = decoder_output + projected_residual

                    if post_condition:
                        # Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs.
                        post_cbhg = CBHG(hp.cbhg_kernels,
                                         hp.cbhg_conv_channels,
                                         hp.cbhg_pool_size,
                                         [hp.cbhg_projection, hp.num_mels],
                                         hp.cbhg_projection_kernel_size,
                                         hp.cbhg_highwaynet_layers,
                                         hp.cbhg_highway_units,
                                         hp.cbhg_rnn_units,
                                         is_training,
                                         name='CBHG_postnet')

                        # [batch_size, decoder_steps(mel_frames), cbhg_channels]
                        post_outputs = post_cbhg(mel_outputs, None)

                        # Linear projection of extracted features to make linear spectrogram
                        linear_specs_projection = FrameProjection(
                            hp.num_freq, scope='cbhg_linear_specs_projection')

                        # [batch_size, decoder_steps(linear_frames), num_freq]
                        linear_outputs = linear_specs_projection(post_outputs)

                    # Grab alignments from the final decoder state
                    alignments = tf.transpose(
                        final_decoder_state.alignment_history.stack(),
                        [1, 2, 0])

                    self.tower_decoder_output.append(decoder_output)
                    self.tower_alignments.append(alignments)
                    self.tower_stop_token_prediction.append(
                        stop_token_prediction)
                    self.tower_mel_outputs.append(mel_outputs)
                    tower_embedded_inputs.append(embedded_inputs)
                    tower_enc_conv_output_shape.append(enc_conv_output_shape)
                    tower_encoder_outputs.append(encoder_outputs)
                    tower_residual.append(residual)
                    tower_projected_residual.append(projected_residual)

                    if post_condition:
                        self.tower_linear_outputs.append(linear_outputs)
            log('initialisation done {}'.format(gpus[i]))

        if is_training:
            self.ratio = self.helper._ratio
        self.tower_inputs = tower_inputs
        self.tower_input_lengths = tower_input_lengths
        self.tower_mel_targets = tower_mel_targets
        self.tower_linear_targets = tower_linear_targets
        self.tower_targets_lengths = tower_targets_lengths
        self.tower_stop_token_targets = tower_stop_token_targets

        self.all_vars = tf.trainable_variables()

        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log('  Train mode:               {}'.format(is_training))
        log('  Eval mode:                {}'.format(is_evaluating))
        log('  GTA mode:                 {}'.format(gta))
        log('  Synthesis mode:           {}'.format(not (
            is_training or is_evaluating)))
        log('  Input:                    {}'.format(inputs.shape))
        for i in range(hp.tacotron_num_gpus + hp.tacotron_gpu_start_idx):
            log('  device:                   {}'.format(i))
            log('  embedding:                {}'.format(
                tower_embedded_inputs[i].shape))
            log('  enc conv out:             {}'.format(
                tower_enc_conv_output_shape[i]))
            log('  encoder out:              {}'.format(
                tower_encoder_outputs[i].shape))
            log('  decoder out:              {}'.format(
                self.tower_decoder_output[i].shape))
            log('  residual out:             {}'.format(
                tower_residual[i].shape))
            log('  projected residual out:   {}'.format(
                tower_projected_residual[i].shape))
            log('  mel out:                  {}'.format(
                self.tower_mel_outputs[i].shape))
            if post_condition:
                log('  linear out:               {}'.format(
                    self.tower_linear_outputs[i].shape))
            log('  <stop_token> out:         {}'.format(
                self.tower_stop_token_prediction[i].shape))

            # 1_000_000 is causing syntax problems for some people?! Python please :)
            log('  Tacotron Parameters       {:.3f} Million.'.format(
                np.sum(
                    [np.prod(v.get_shape().as_list())
                     for v in self.all_vars]) / 1000000))
Exemple #6
0
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None):

        with tf.variable_scope('inference') as scope:
            is_training = mel_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings ==> [batch_size, sequence_length, embedding_dim]
            embedding_table = tf.get_variable('inputs_embedding',
                                              [len(symbols), hp.embedding_dim],
                                              dtype=tf.float32)
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

            encoder_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training,
                                    kernel_size=(5, ),
                                    channels=512,
                                    scope='encoder_convolutions'),
                EncoderRNN(is_training,
                           size=hp.encoder_lstm_units,
                           zoneout=hp.tacotron_zoneout_rate,
                           scope='encoder_LSTM'))

            encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

            # Define elements for decoder
            prenet = Prenet(is_training,
                            layer_sizes=[256, 256],
                            scope='decoder_prenet')
            # Attention Mechanism
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_dim,
                encoder_outputs,
                mask_encoder=hp.mask_encoder,
                memory_sequence_length=input_lengths,
                smoothing=hp.smoothing,
                cumulate_weights=hp.cumulative_weights)
            # Decoder LSTM Cells
            decoder_lstm = DecoderRNN(is_training,
                                      layers=hp.decoder_layers,
                                      size=hp.decoder_lstm_units,
                                      zoneout=hp.tacotron_zoneout_rate,
                                      scope='decoder_lstm')
            # Frames Projection layer
            frame_projection = FrameProjection(hp.num_mels *
                                               hp.outputs_per_step,
                                               scope='linear_transform')
            # <stop_token> projection layer
            stop_projection = StopProjection(is_training,
                                             scope='stop_token_projection')

            decoder_cell = TacotronDecoderCell(prenet,
                                               attention_mechanism,
                                               decoder_lstm,
                                               frame_projection,
                                               stop_projection,
                                               mask_finished=hp.mask_finished)

            if is_training is True:
                self.helper = TacoTrainingHelper(
                    batch_size, mel_targets, stop_token_targets, hp.num_mels,
                    hp.outputs_per_step, hp.tacotron_teacher_forcing_ratio)
            else:
                self.helper = TacoTestHelper(batch_size, hp.num_mels,
                                             hp.outputs_per_step)

            decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                         dtype=tf.float32)

            max_iters = hp.max_iters if not is_training else None

            (frames_prediction, stop_token_prediction,
             _), final_decoder_state, _ = dynamic_decode(
                 CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                 impute_finished=hp.impute_finished,
                 maximum_iterations=max_iters)

            # Reshape outputs to be one output per entry
            decoder_output = tf.reshape(frames_prediction,
                                        [batch_size, -1, hp.num_mels])
            stop_token_prediction = tf.reshape(stop_token_prediction,
                                               [batch_size, -1])

            # Postnet
            postnet = Postnet(is_training,
                              kernel_size=hp.postnet_kernel_size,
                              channels=hp.postnet_channels,
                              scope='postnet_convolutions')

            # Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
            residual = postnet(decoder_output)

            # Project residual to same dimension as mel spectrogram
            #==> [batch_size, decoder_steps * r, num_mels]
            residual_projection = FrameProjection(hp.num_mels,
                                                  scope='postnet_projection')
            projected_residual = residual_projection(residual)

            # Compute the mel spectrogram
            mel_outputs = decoder_output + projected_residual

            # Grab alignments from the final decoder state
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_output = decoder_output
            self.alignments = alignments
            self.stop_token_prediction = stop_token_prediction
            self.stop_token_targets = stop_token_targets
            self.mel_outputs = mel_outputs
            self.mel_targets = mel_targets
Exemple #7
0
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   split_infos=None,
                   linear_targets=None,
                   targets_lengths=None,
                   gta=False,
                   global_step=None,
                   is_training=False,
                   is_evaluating=False):
        """
        Initializes the model for inference

        sets "mel_outputs" and "alignments" fields.

        Args:
            - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
              steps in the input time series, and values are character IDs
            - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
            - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
        """
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no multi targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')
        if not gta and self._hparams.predict_linear == True and linear_targets is None and is_training:
            raise ValueError(
                'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
            )
        if gta and linear_targets is not None:
            raise ValueError(
                'Linear spectrogram prediction is not supported in GTA mode!')
        if is_training and self._hparams.mask_decoder and targets_lengths is None:
            raise RuntimeError(
                'Model set to mask paddings but no targets lengths provided for the mask!'
            )
        if is_training and is_evaluating:
            raise RuntimeError(
                'Model can not be in training and evaluation modes at the same time!'
            )

        with tf.device('/cpu:0'):
            hp = self._hparams
            lout_int = [tf.int32] * hp.num_gpus
            lout_float = [tf.float32] * hp.num_gpus

            tower_input_lengths = tf.split(input_lengths,
                                           num_or_size_splits=hp.num_gpus,
                                           axis=0)
            tower_targets_lengths = tf.split(
                targets_lengths, num_or_size_splits=hp.num_gpus,
                axis=0) if targets_lengths is not None else targets_lengths

            p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]],
                                  lout_int)
            p_mel_targets = tf.py_func(split_func,
                                       [mel_targets, split_infos[:, 1]],
                                       lout_float)
            p_stop_token_targets = tf.py_func(
                split_func, [stop_token_targets, split_infos[:, 2]], lout_float
            ) if stop_token_targets is not None else stop_token_targets
            p_linear_targets = tf.py_func(
                split_func, [linear_targets, split_infos[:, 3]],
                lout_float) if linear_targets is not None else linear_targets

            tower_inputs = []
            tower_mel_targets = []
            tower_stop_token_targets = []
            tower_linear_targets = []

            batch_size = tf.shape(inputs)[0]
            mel_channels = hp.num_mels
            linear_channels = hp.num_freq
            for i in range(hp.num_gpus):
                tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
                tower_mel_targets.append(
                    tf.reshape(p_mel_targets[i],
                               [batch_size, -1, mel_channels]))
                if p_stop_token_targets is not None:
                    tower_stop_token_targets.append(
                        tf.reshape(p_stop_token_targets[i], [batch_size, -1]))
                if p_linear_targets is not None:
                    tower_linear_targets.append(
                        tf.reshape(p_linear_targets[i],
                                   [batch_size, -1, linear_channels]))

        self.tower_decoder_output = []
        self.tower_alignments = []
        self.tower_stop_token_prediction = []
        self.tower_mel_outputs = []
        self.tower_linear_outputs = []
        self.tower_linear_targets = []

        tower_embedded_inputs = []
        tower_enc_conv_output_shape = []
        tower_encoder_outputs = []
        tower_residual = []
        tower_projected_residual = []

        # 1. Declare GPU Devices
        gpus = [
            "/gpu:{}".format(i)
            for i in range(hp.gpu_start_idx, hp.gpu_start_idx + hp.num_gpus)
        ]
        for i in range(hp.num_gpus):
            with tf.device(
                    tf.train.replica_device_setter(ps_tasks=1,
                                                   ps_device="/cpu:0",
                                                   worker_device=gpus[i])):
                with tf.variable_scope('inference') as scope:
                    assert hp.tacotron_teacher_forcing_mode in ('constant',
                                                                'scheduled')
                    if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                        assert global_step is not None

                    #GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis
                    post_condition = hp.predict_linear and not gta

                    # Embeddings ==> [batch_size, sequence_length, embedding_dim]
                    embedding_table = tf.get_variable(
                        'inputs_embedding', [len(symbols), hp.embedding_dim],
                        dtype=tf.float32)
                    embedded_inputs = tf.nn.embedding_lookup(
                        embedding_table, tower_inputs[i])

                    #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                    encoder_cell = TacotronEncoderCell(
                        EncoderConvolutions(is_training,
                                            hparams=hp,
                                            scope='encoder_convolutions'),
                        EncoderRNN(is_training,
                                   size=hp.encoder_lstm_units,
                                   zoneout=hp.tacotron_zoneout_rate,
                                   scope='encoder_LSTM'))

                    encoder_outputs = encoder_cell(embedded_inputs,
                                                   tower_input_lengths[i])

                    #For shape visualization purpose
                    enc_conv_output_shape = encoder_cell.conv_output_shape

                    #Decoder Parts
                    #Attention Decoder Prenet
                    prenet = Prenet(is_training,
                                    layers_sizes=hp.prenet_layers,
                                    drop_rate=hp.tacotron_dropout_rate,
                                    scope='decoder_prenet')
                    #Attention Mechanism
                    attention_mechanism = LocationSensitiveAttention(
                        hp.attention_dim,
                        encoder_outputs,
                        hparams=hp,
                        mask_encoder=hp.mask_encoder,
                        memory_sequence_length=tower_input_lengths[i],
                        smoothing=hp.smoothing,
                        cumulate_weights=hp.cumulative_weights)
                    #Decoder LSTM Cells
                    decoder_lstm = DecoderRNN(is_training,
                                              layers=hp.decoder_layers,
                                              size=hp.decoder_lstm_units,
                                              zoneout=hp.tacotron_zoneout_rate,
                                              scope='decoder_lstm')
                    #Frames Projection layer
                    frame_projection = FrameProjection(
                        hp.num_mels * hp.outputs_per_step,
                        scope='linear_transform')
                    #<stop_token> projection layer
                    stop_projection = StopProjection(
                        is_training or is_evaluating,
                        shape=hp.outputs_per_step,
                        scope='stop_token_projection')

                    #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                    decoder_cell = TacotronDecoderCell(prenet,
                                                       attention_mechanism,
                                                       decoder_lstm,
                                                       frame_projection,
                                                       stop_projection)

                    #Define the helper for our decoder
                    if is_training or is_evaluating or gta:
                        self.helper = TacoTrainingHelper(
                            batch_size, tower_mel_targets[i],
                            tower_stop_token_targets[i], hp, gta,
                            is_evaluating, global_step)
                    else:
                        self.helper = TacoTestHelper(batch_size, hp)

                    #initial decoder state
                    decoder_init_state = decoder_cell.zero_state(
                        batch_size=batch_size, dtype=tf.float32)

                    #Only use max iterations at synthesis time
                    max_iters = hp.max_iters if not (
                        is_training or is_evaluating) else None

                    #Decode
                    (frames_prediction, stop_token_prediction,
                     _), final_decoder_state, _ = dynamic_decode(
                         CustomDecoder(decoder_cell, self.helper,
                                       decoder_init_state),
                         impute_finished=False,
                         maximum_iterations=max_iters,
                         swap_memory=hp.tacotron_swap_with_cpu)

                    # Reshape outputs to be one output per entry
                    #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                    decoder_output = tf.reshape(frames_prediction,
                                                [batch_size, -1, hp.num_mels])
                    stop_token_prediction = tf.reshape(stop_token_prediction,
                                                       [batch_size, -1])

                    #Postnet
                    postnet = Postnet(is_training,
                                      hparams=hp,
                                      scope='postnet_convolutions')

                    #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
                    residual = postnet(decoder_output)

                    #Project residual to same dimension as mel spectrogram
                    #==> [batch_size, decoder_steps * r, num_mels]
                    residual_projection = FrameProjection(
                        hp.num_mels, scope='postnet_projection')
                    projected_residual = residual_projection(residual)

                    #Compute the mel spectrogram
                    mel_outputs = decoder_output + projected_residual

                    if post_condition:
                        #Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
                        #Post-processing Network to map mels to linear spectrograms using same architecture as the encoder
                        post_processing_cell = TacotronEncoderCell(
                            EncoderConvolutions(
                                is_training,
                                hparams=hp,
                                scope='post_processing_convolutions'),
                            EncoderRNN(is_training,
                                       size=hp.encoder_lstm_units,
                                       zoneout=hp.tacotron_zoneout_rate,
                                       scope='post_processing_LSTM'))

                        expand_outputs = post_processing_cell(mel_outputs)
                        linear_outputs = FrameProjection(
                            hp.num_freq,
                            scope='post_processing_projection')(expand_outputs)

                    #Grab alignments from the final decoder state
                    alignments = tf.transpose(
                        final_decoder_state.alignment_history.stack(),
                        [1, 2, 0])

                    self.tower_decoder_output.append(decoder_output)
                    self.tower_alignments.append(alignments)
                    self.tower_stop_token_prediction.append(
                        stop_token_prediction)
                    self.tower_mel_outputs.append(mel_outputs)
                    tower_embedded_inputs.append(embedded_inputs)
                    tower_enc_conv_output_shape.append(enc_conv_output_shape)
                    tower_encoder_outputs.append(encoder_outputs)
                    tower_residual.append(residual)
                    tower_projected_residual.append(projected_residual)

                    if post_condition:
                        self.tower_linear_outputs.append(linear_outputs)
                        self.tower_linear_targets.append(linear_targets)
                    log('initialiized done {}'.format(gpus[i]))

        if is_training:
            self.ratio = self.helper._ratio
        self.tower_inputs = tower_inputs
        self.tower_input_lengths = tower_input_lengths
        self.tower_mel_targets = tower_mel_targets
        self.tower_targets_lengths = tower_targets_lengths
        self.tower_stop_token_targets = tower_stop_token_targets

        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log('  Train mode:               {}'.format(is_training))
        log('  Eval mode:                {}'.format(is_evaluating))
        log('  GTA mode:                 {}'.format(gta))
        log('  Synthesis mode:           {}'.format(not (
            is_training or is_evaluating)))
        log('  Input:                    {}'.format(inputs.shape))
        for i in range(hp.num_gpus + hp.gpu_start_idx):
            log('  device:                   {}'.format(i))
            log('  embedding:                {}'.format(
                tower_embedded_inputs[i].shape))
            log('  enc conv out:             {}'.format(
                tower_enc_conv_output_shape[i]))
            log('  encoder out:              {}'.format(
                tower_encoder_outputs[i].shape))
            log('  decoder out:              {}'.format(
                self.tower_decoder_output[i].shape))
            log('  residual out:             {}'.format(
                tower_residual[i].shape))
            log('  projected residual out:   {}'.format(
                tower_projected_residual[i].shape))
            log('  mel out:                  {}'.format(
                self.tower_mel_outputs[i].shape))
            if post_condition:
                log('  linear out:               {}'.format(
                    self.towerlinear_outputs[i].shape))
            log('  <stop_token> out:         {}'.format(
                self.tower_stop_token_prediction[i].shape))
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   linear_targets=None,
                   targets_lengths=None,
                   gta=False,
                   global_step=None,
                   is_training=False,
                   is_evaluating=False):

        hp = self._hparams
        batch_size = tf.shape(inputs)[0]
        gta = False

        T2_output_range = (-hp.max_abs_value,
                           hp.max_abs_value) if hp.symmetric_mels else (
                               0, hp.max_abs_value)

        with tf.variable_scope('inference') as scope:
            assert hp.tacotron_teacher_forcing_mode in ('constant',
                                                        'scheduled')
            if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                assert global_step is not None

            # Embeddings ==> [batch_size, sequence_length, embedding_dim]
            self.embedding_table = tf.get_variable(
                'inputs_embedding', [len(symbols), hp.embedding_dim],
                dtype=tf.float32)

            embedded_inputs = tf.nn.embedding_lookup(self.embedding_table,
                                                     inputs)

            #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
            encoder_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training,
                                    hparams=hp,
                                    scope='encoder_convolutions'),
                EncoderRNN(is_training,
                           size=hp.encoder_lstm_units,
                           zoneout=hp.tacotron_zoneout_rate,
                           scope='encoder_LSTM'))

            self.encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

            #For shape visualization purpose
            self.enc_conv_output_shape = encoder_cell.conv_output_shape

            #Decoder Parts
            #Attention Decoder Prenet
            prenet = Prenet(is_training,
                            layers_sizes=hp.prenet_layers,
                            drop_rate=hp.tacotron_dropout_rate,
                            scope='decoder_prenet')
            #Attention Mechanism
            attention_mechanism = GMMAttention(self.encoder_outputs,
                                               input_lengths, is_training)

            #Decoder LSTM Cells
            decoder_lstm = DecoderRNN(is_training,
                                      layers=hp.decoder_layers,
                                      size=hp.decoder_lstm_units,
                                      zoneout=hp.tacotron_zoneout_rate,
                                      scope='decoder_LSTM')
            #Frames Projection layer
            frame_projection = FrameProjection(
                hp.num_mels * hp.outputs_per_step,
                scope='linear_transform_projection')
            #<stop_token> projection layer
            stop_projection = StopProjection(is_training or is_evaluating,
                                             shape=hp.outputs_per_step,
                                             scope='stop_token_projection')

            #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
            decoder_cell = TacotronDecoderCell(prenet,
                                               attention_mechanism,
                                               decoder_lstm,
                                               frame_projection,
                                               stop_projection,
                                               num_attn_mixture=5)

            #Define the helper for our decoder
            if is_training or is_evaluating or gta:
                self.helper = TacoTrainingHelper(batch_size, mel_targets, hp,
                                                 gta, is_evaluating,
                                                 global_step)
            else:
                self.helper = TacoTestHelper(batch_size, hp)

            #initial decoder state
            decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                         dtype=tf.float32)

            #Only use max iterations at synthesis time
            max_iters = hp.max_iters if not (is_training
                                             or is_evaluating) else None

            #Decode
            (frames_prediction, stop_token_prediction,
             _), final_decoder_state, _ = dynamic_decode(
                 CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                 impute_finished=False,
                 maximum_iterations=max_iters,
                 swap_memory=hp.tacotron_swap_with_cpu)

            # Reshape outputs to be one output per entry
            #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
            self.decoder_output = tf.reshape(frames_prediction,
                                             [batch_size, -1, hp.num_mels])
            self.stop_token_prediction = tf.reshape(stop_token_prediction,
                                                    [batch_size, -1])

            if hp.clip_outputs:
                self.decoder_output = tf.minimum(
                    tf.maximum(self.decoder_output,
                               T2_output_range[0] - hp.lower_bound_decay),
                    T2_output_range[1])

            #Postnet
            postnet = Postnet(is_training,
                              hparams=hp,
                              scope='postnet_convolutions')

            #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
            residual = postnet(self.decoder_output)

            #Project residual to same dimension as mel spectrogram
            #==> [batch_size, decoder_steps * r, num_mels]
            residual_projection = FrameProjection(hp.num_mels,
                                                  scope='postnet_projection')
            self.projected_residual = residual_projection(residual)

            #Compute the mel spectrogram
            self.mel_outputs = self.decoder_output + self.projected_residual

            if hp.clip_outputs:
                self.mel_outputs = tf.minimum(
                    tf.maximum(self.mel_outputs,
                               T2_output_range[0] - hp.lower_bound_decay),
                    T2_output_range[1])

            # Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs.
            post_cbhg = CBHG(hp.cbhg_kernels,
                             hp.cbhg_conv_channels,
                             hp.cbhg_pool_size,
                             [hp.cbhg_projection, hp.num_mels],
                             hp.cbhg_projection_kernel_size,
                             hp.cbhg_highwaynet_layers,
                             hp.cbhg_highway_units,
                             hp.cbhg_rnn_units,
                             hp.batch_norm_position,
                             is_training,
                             name='CBHG_postnet')

            #[batch_size, decoder_steps(mel_frames), cbhg_channels]
            self.post_outputs = post_cbhg(self.mel_outputs, None)

            #Linear projection of extracted features to make linear spectrogram
            linear_specs_projection = FrameProjection(
                hp.num_freq, scope='cbhg_linear_specs_projection')

            #[batch_size, decoder_steps(linear_frames), num_freq]
            self.linear_outputs = linear_specs_projection(self.post_outputs)

            if hp.clip_outputs:
                self.linear_outputs = tf.minimum(
                    tf.maximum(self.linear_outputs,
                               T2_output_range[0] - hp.lower_bound_decay),
                    T2_output_range[1])

            #Grab alignments from the final decoder state
            self.alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(), [1, 2, 0])

            log('initialisation done.')

        if is_training:
            self.ratio = self.helper._ratio

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.targets_lengths = targets_lengths
        self.stop_token_targets = stop_token_targets
        self.gta = gta
        self.all_vars = tf.trainable_variables()
        self.is_training = is_training
        self.is_evaluating = is_evaluating

        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log('  Train mode:               {}'.format(is_training))
        log('  Eval mode:                {}'.format(is_evaluating))
        log('  GTA mode:                 {}'.format(gta))
        log('  Synthesis mode:           {}'.format(not (
            is_training or is_evaluating)))
        log('  Input:                    {}'.format(inputs.shape))
        log('  embedding:                {}'.format(embedded_inputs.shape))
        log('  enc conv out:             {}'.format(
            self.enc_conv_output_shape))
        log('  encoder out:              {}'.format(
            self.encoder_outputs.shape))
        log('  decoder out:              {}'.format(self.decoder_output.shape))
        log('  residual out:             {}'.format(residual.shape))
        log('  projected residual out:   {}'.format(
            self.projected_residual.shape))
        log('  mel out:                  {}'.format(self.mel_outputs.shape))
        log('  linear out:               {}'.format(self.linear_outputs.shape))
        log('  <stop_token> out:         {}'.format(
            self.stop_token_prediction.shape))

        #1_000_000 is causing syntax problems for some people?! Python please :)
        log('  Tacotron Parameters       {:.3f} Million.'.format(
            np.sum([np.prod(v.get_shape().as_list())
                    for v in self.all_vars]) / 1000000))
Exemple #9
0
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   gta=False):
        """
		Initializes the model for inference

		sets "mel_outputs" and "alignments" fields.

		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
			  steps in the input time series, and values are character IDs
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
			of each sequence in inputs.
			- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
			of steps in the output time series, M is num_mels, and values are entries in the mel
			spectrogram. Only needed for training.
		"""
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no mel targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')

        with tf.variable_scope('inference') as scope:
            is_training = mel_targets is not None and not gta
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings ==> [batch_size, sequence_length, embedding_dim]
            embedding_table = tf.get_variable('inputs_embedding',
                                              [len(symbols), hp.embedding_dim],
                                              dtype=tf.float32)
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

            #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
            encoder_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training,
                                    kernel_size=hp.enc_conv_kernel_size,
                                    channels=hp.enc_conv_channels,
                                    scope='encoder_convolutions'),
                EncoderRNN(is_training,
                           size=hp.encoder_lstm_units,
                           zoneout=hp.tacotron_zoneout_rate,
                           scope='encoder_LSTM'))

            encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

            #For shape visualization purpose
            enc_conv_output_shape = encoder_cell.conv_output_shape

            #Decoder Parts
            #Attention Decoder Prenet
            prenet = Prenet(is_training,
                            layer_sizes=hp.prenet_layers,
                            scope='decoder_prenet')
            #Attention Mechanism
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_dim,
                encoder_outputs,
                mask_encoder=hp.mask_encoder,
                memory_sequence_length=input_lengths,
                smoothing=hp.smoothing)
            #Decoder LSTM Cells
            decoder_lstm = DecoderRNN(is_training,
                                      layers=hp.decoder_layers,
                                      size=hp.decoder_lstm_units,
                                      zoneout=hp.tacotron_zoneout_rate,
                                      scope='decoder_lstm')
            #Frames Projection layer
            frame_projection = FrameProjection(hp.num_mels *
                                               hp.outputs_per_step,
                                               scope='linear_transform')
            #<stop_token> projection layer
            stop_projection = StopProjection(is_training,
                                             scope='stop_token_projection')

            #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
            decoder_cell = TacotronDecoderCell(prenet,
                                               attention_mechanism,
                                               decoder_lstm,
                                               frame_projection,
                                               stop_projection,
                                               mask_finished=hp.mask_finished)

            #Define the helper for our decoder
            if (is_training or gta) == True:
                self.helper = TacoTrainingHelper(
                    batch_size, mel_targets, stop_token_targets, hp.num_mels,
                    hp.outputs_per_step, hp.tacotron_teacher_forcing_ratio,
                    gta)
            else:
                self.helper = TacoTestHelper(batch_size, hp.num_mels,
                                             hp.outputs_per_step)

            #initial decoder state
            decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                         dtype=tf.float32)

            #Only use max iterations at synthesis time
            max_iters = hp.max_iters if not is_training else None

            #Decode
            (frames_prediction, stop_token_prediction,
             _), final_decoder_state, _ = dynamic_decode(
                 CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                 impute_finished=hp.impute_finished,
                 maximum_iterations=max_iters)

            # Reshape outputs to be one output per entry
            #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
            decoder_output = tf.reshape(frames_prediction,
                                        [batch_size, -1, hp.num_mels])
            stop_token_prediction = tf.reshape(stop_token_prediction,
                                               [batch_size, -1])

            #Postnet
            postnet = Postnet(is_training,
                              kernel_size=hp.postnet_kernel_size,
                              channels=hp.postnet_channels,
                              scope='postnet_convolutions')

            #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
            residual = postnet(decoder_output)

            #Project residual to same dimension as mel spectrogram
            #==> [batch_size, decoder_steps * r, num_mels]
            residual_projection = FrameProjection(hp.num_mels,
                                                  scope='postnet_projection')
            projected_residual = residual_projection(residual)

            #Compute the mel spectrogram
            mel_outputs = decoder_output + projected_residual

            #Grab alignments from the final decoder state
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_output = decoder_output
            self.alignments = alignments
            self.stop_token_prediction = stop_token_prediction
            self.stop_token_targets = stop_token_targets
            self.mel_outputs = mel_outputs
            self.mel_targets = mel_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:                {}'.format(embedded_inputs.shape))
            log('  enc conv out:             {}'.format(enc_conv_output_shape))
            log('  encoder out:              {}'.format(encoder_outputs.shape))
            log('  decoder out:              {}'.format(decoder_output.shape))
            log('  residual out:             {}'.format(residual.shape))
            log('  projected residual out:   {}'.format(
                projected_residual.shape))
            log('  mel out:                  {}'.format(mel_outputs.shape))
            log('  <stop_token> out:         {}'.format(
                stop_token_prediction.shape))
Exemple #10
0
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   mel_lengths=None,
                   stop_token_targets=None,
                   linear_targets=None,
                   gta=False,
                   reference_mel=None):
        """
		Initializes the model for inference

		sets "mel_outputs" and "alignments" fields.

		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
			  steps in the input time series, and values are character IDs
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
			of each sequence in inputs.
			- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
			of steps in the output time series, M is num_mels, and values are entries in the mel
			spectrogram. Only needed for training.
		"""
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no mel targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')
        if gta == False and self._hparams.predict_linear == True and linear_targets is None:
            raise ValueError(
                'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
            )
        if gta and linear_targets is not None:
            raise ValueError(
                'Linear spectrogram prediction is not supported in GTA mode!')

        with tf.variable_scope('inference') as scope:
            is_training = mel_targets is not None and not gta
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams
            #GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis
            post_condition = hp.predict_linear and not gta

            # Embeddings ==> [batch_size, sequence_length, embedding_dim]
            embedding_table = tf.get_variable('inputs_embedding',
                                              [len(symbols), hp.embedding_dim],
                                              dtype=tf.float32)
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

            #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
            encoder_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training,
                                    kernel_size=hp.enc_conv_kernel_size,
                                    channels=hp.enc_conv_channels,
                                    scope='encoder_convolutions'),
                EncoderRNN(is_training,
                           size=hp.encoder_lstm_units,
                           zoneout=hp.tacotron_zoneout_rate,
                           scope='encoder_LSTM'))

            encoder_outputs = encoder_cell(embedded_inputs, input_lengths)
            if hp.use_vae:
                if is_training:
                    reference_mel = mel_targets

                style_embeddings, mu, log_var = VAE(inputs=reference_mel,
                                                    input_lengths=mel_lengths,
                                                    filters=hp.filters,
                                                    kernel_size=(3, 3),
                                                    strides=(2, 2),
                                                    num_units=hp.vae_dim,
                                                    is_training=is_training,
                                                    scope='vae')

                self.mu = mu
                self.log_var = log_var
                style_embeddings = tf.layers.dense(style_embeddings,
                                                   hp.encoder_depth)
                style_embeddings = tf.expand_dims(style_embeddings, axis=1)
                style_embeddings = tf.tile(
                    style_embeddings,
                    [1, shape_list(encoder_outputs)[1], 1])  # [N, T_in, 256]
                encoder_outputs = encoder_outputs + style_embeddings

            #For shape visualization purpose
            enc_conv_output_shape = encoder_cell.conv_output_shape

            #Decoder Parts
            #Attention Decoder Prenet
            prenet = Prenet(is_training,
                            layer_sizes=hp.prenet_layers,
                            scope='decoder_prenet')
            #Attention Mechanism
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_dim,
                encoder_outputs,
                mask_encoder=hp.mask_encoder,
                memory_sequence_length=input_lengths,
                smoothing=hp.smoothing,
                cumulate_weights=hp.cumulative_weights)
            #Decoder LSTM Cells
            decoder_lstm = DecoderRNN(is_training,
                                      layers=hp.decoder_layers,
                                      size=hp.decoder_lstm_units,
                                      zoneout=hp.tacotron_zoneout_rate,
                                      scope='decoder_lstm')
            #Frames Projection layer
            frame_projection = FrameProjection(hp.num_mels *
                                               hp.outputs_per_step,
                                               scope='linear_transform')
            #<stop_token> projection layer
            stop_projection = StopProjection(is_training,
                                             scope='stop_token_projection')

            #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
            decoder_cell = TacotronDecoderCell(prenet,
                                               attention_mechanism,
                                               decoder_lstm,
                                               frame_projection,
                                               stop_projection,
                                               mask_finished=hp.mask_finished)

            #Define the helper for our decoder
            if (is_training or gta) == True:
                self.helper = TacoTrainingHelper(
                    batch_size, mel_targets, stop_token_targets, hp.num_mels,
                    hp.outputs_per_step, hp.tacotron_teacher_forcing_ratio,
                    gta)
            else:
                self.helper = TacoTestHelper(batch_size, hp.num_mels,
                                             hp.outputs_per_step)

            #initial decoder state
            decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                         dtype=tf.float32)

            #Only use max iterations at synthesis time
            max_iters = hp.max_iters if not is_training else None

            #Decode
            (frames_prediction, stop_token_prediction,
             _), final_decoder_state, _ = dynamic_decode(
                 CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                 impute_finished=hp.impute_finished,
                 maximum_iterations=max_iters)

            # Reshape outputs to be one output per entry
            #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
            decoder_output = tf.reshape(frames_prediction,
                                        [batch_size, -1, hp.num_mels])
            stop_token_prediction = tf.reshape(stop_token_prediction,
                                               [batch_size, -1])

            #Postnet
            postnet = Postnet(is_training,
                              kernel_size=hp.postnet_kernel_size,
                              channels=hp.postnet_channels,
                              scope='postnet_convolutions')

            #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
            residual = postnet(decoder_output)

            #Project residual to same dimension as mel spectrogram
            #==> [batch_size, decoder_steps * r, num_mels]
            residual_projection = FrameProjection(hp.num_mels,
                                                  scope='postnet_projection')
            projected_residual = residual_projection(residual)

            #Compute the mel spectrogram
            mel_outputs = decoder_output + projected_residual

            if post_condition:
                #Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
                #Post-processing Network to map mels to linear spectrograms using same architecture as the encoder
                post_processing_cell = TacotronEncoderCell(
                    EncoderConvolutions(is_training,
                                        kernel_size=hp.enc_conv_kernel_size,
                                        channels=hp.enc_conv_channels,
                                        scope='post_processing_convolutions'),
                    EncoderRNN(is_training,
                               size=hp.encoder_lstm_units,
                               zoneout=hp.tacotron_zoneout_rate,
                               scope='post_processing_LSTM'))

                expand_outputs = post_processing_cell(mel_outputs)
                linear_outputs = FrameProjection(
                    hp.num_freq,
                    scope='post_processing_projection')(expand_outputs)

            #Grab alignments from the final decoder state
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_output = decoder_output
            self.alignments = alignments
            self.stop_token_prediction = stop_token_prediction
            self.stop_token_targets = stop_token_targets
            self.mel_outputs = mel_outputs
            self.reference_mel = reference_mel
            if post_condition:
                self.linear_outputs = linear_outputs
                self.linear_targets = linear_targets
            self.mel_targets = mel_targets
            self.mel_lengths = mel_lengths
            log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
            log('  embedding:                {}'.format(embedded_inputs.shape))
            log('  enc conv out:             {}'.format(enc_conv_output_shape))
            log('  encoder out:              {}'.format(encoder_outputs.shape))
            log('  decoder out:              {}'.format(decoder_output.shape))
            log('  residual out:             {}'.format(residual.shape))
            log('  projected residual out:   {}'.format(
                projected_residual.shape))
            log('  mel out:                  {}'.format(mel_outputs.shape))
            if post_condition:
                log('  linear out:               {}'.format(
                    linear_outputs.shape))
            log('  <stop_token> out:         {}'.format(
                stop_token_prediction.shape))
Exemple #11
0
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   linear_targets=None,
                   targets_lengths=None,
                   gta=False,
                   global_step=None,
                   is_training=False,
                   is_evaluating=False,
                   split_infos=None,
                   mel_references=None,
                   speaker_id_target=None,
                   nb_speaker=None,
                   nb_sample=None,
                   synthesize=False):
        """
		Initializes the model for inference
		sets "mel_outputs" and "alignments" fields.
		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
			  steps in the input time series, and values are character IDs
			  ### With tacotron_phoneme_transcription=True :
			  ### inputs: int32 Tensor with shape [N, T_in, T_cat] where N is batch size, T_in is number of
			  ### steps in the input time series, T_cat is the number of categories used to describe phones/special
			  ### characters and values are phones/special character encoding
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
			of each sequence in inputs.
			- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
			of steps in the output time series, M is num_mels, and values are entries in the mel
			spectrogram. Only needed for training.
		"""
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no multi targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')
        if not gta and self._hparams.predict_linear == True and linear_targets is None and is_training:
            raise ValueError(
                'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
            )
        if gta and linear_targets is not None:
            raise ValueError(
                'Linear spectrogram prediction is not supported in GTA mode!')
        if is_training and self._hparams.mask_decoder and targets_lengths is None:
            raise RuntimeError(
                'Model set to mask paddings but no targets lengths provided for the mask!'
            )
        if is_training and is_evaluating:
            raise RuntimeError(
                'Model can not be in training and evaluation modes at the same time!'
            )

        split_device = '/cpu:0' if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:{}'.format(
            self._hparams.tacotron_gpu_start_idx)
        with tf.device(split_device):
            hp = self._hparams
            lout_int = [tf.int32] * hp.tacotron_num_gpus
            lout_float = [tf.float32] * hp.tacotron_num_gpus

            tower_input_lengths = tf.split(
                input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0)
            tower_targets_lengths = tf.split(
                targets_lengths,
                num_or_size_splits=hp.tacotron_num_gpus,
                axis=0) if targets_lengths is not None else targets_lengths

            p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]],
                                  lout_int)
            p_mel_targets = tf.py_func(
                split_func, [mel_targets, split_infos[:, 1]],
                lout_float) if mel_targets is not None else mel_targets
            p_stop_token_targets = tf.py_func(
                split_func, [stop_token_targets, split_infos[:, 2]], lout_float
            ) if stop_token_targets is not None else stop_token_targets
            p_linear_targets = tf.py_func(
                split_func, [linear_targets, split_infos[:, 3]],
                lout_float) if linear_targets is not None else linear_targets

            if self._hparams.tacotron_multi_speaker:
                tower_speaker_id_target = tf.split(
                    speaker_id_target,
                    num_or_size_splits=hp.tacotron_num_gpus,
                    axis=0
                ) if speaker_id_target is not None else speaker_id_target

            tower_inputs = []
            tower_mel_targets = []
            tower_stop_token_targets = []
            tower_linear_targets = []

            batch_size = tf.shape(inputs)[0]
            mel_channels = hp.num_mels
            linear_channels = hp.num_freq
            for i in range(hp.tacotron_num_gpus):
                if not self._hparams.tacotron_phoneme_transcription:
                    tower_inputs.append(
                        tf.reshape(p_inputs[i], [batch_size, -1]))
                else:
                    tower_inputs.append(
                        tf.reshape(p_inputs[i],
                                   [batch_size, -1, nb_categories]))
                if p_mel_targets is not None:
                    tower_mel_targets.append(
                        tf.reshape(p_mel_targets[i],
                                   [batch_size, -1, mel_channels]))
                if p_stop_token_targets is not None:
                    tower_stop_token_targets.append(
                        tf.reshape(p_stop_token_targets[i], [batch_size, -1]))
                if p_linear_targets is not None:
                    tower_linear_targets.append(
                        tf.reshape(p_linear_targets[i],
                                   [batch_size, -1, linear_channels]))

        self.tower_decoder_output = []
        self.tower_alignments = []
        self.tower_stop_token_prediction = []
        self.tower_mel_outputs = []
        self.tower_linear_outputs = []

        tower_embedded_inputs = []
        tower_enc_conv_output_shape = []
        tower_encoder_outputs = []
        tower_residual = []
        tower_projected_residual = []

        if self._hparams.tacotron_multi_speaker:
            tower_speaker_id_pred = []

        # 1. Declare GPU Devices
        gpus = [
            "/gpu:{}".format(i)
            for i in range(hp.tacotron_gpu_start_idx,
                           hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)
        ]
        for i in range(hp.tacotron_num_gpus):
            with tf.device(
                    tf.train.replica_device_setter(ps_tasks=1,
                                                   ps_device="/cpu:0",
                                                   worker_device=gpus[i])):
                with tf.variable_scope('inference') as scope:
                    assert hp.tacotron_teacher_forcing_mode in ('constant',
                                                                'scheduled')
                    if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                        assert global_step is not None

                    # GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis
                    post_condition = hp.predict_linear and not gta

                    if not self._hparams.tacotron_phoneme_transcription:
                        # Embeddings ==> [batch_size, sequence_length, embedding_dim]
                        self.embedding_table = tf.get_variable(
                            'inputs_embedding',
                            [len(symbols), hp.embedding_dim],
                            dtype=tf.float32)
                        embedded_inputs = tf.nn.embedding_lookup(
                            self.embedding_table, tower_inputs[i])
                    else:
                        # Embeddings ==> [batch_size, sequence_length, embedding_dim]
                        # print("inputs : {}".format(inputs))
                        # print("p_inputs : {}".format(p_inputs))
                        # print("tower_inputs : {}".format(tower_inputs))
                        embedded_inputs = tf.layers.dense(
                            inputs=tf.cast(tower_inputs[i], dtype=tf.float32),
                            units=hp.embedding_dim)

                    # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                    encoder_cell = TacotronEncoderCell(
                        EncoderConvolutions(is_training,
                                            hparams=hp,
                                            scope='encoder_convolutions'),
                        EncoderRNN(is_training,
                                   size=hp.encoder_lstm_units,
                                   zoneout=hp.tacotron_zoneout_rate,
                                   scope='encoder_LSTM'))

                    encoder_outputs = encoder_cell(embedded_inputs,
                                                   tower_input_lengths[i])

                    # For shape visualization purpose
                    enc_conv_output_shape = encoder_cell.conv_output_shape

                    enc_tensor_shape = tf.shape(encoder_outputs)
                    # Speaker encoder
                    if self._hparams.tacotron_multi_speaker:
                        with tf.variable_scope('speaker_encoder'):
                            conv = tf.layers.conv1d(inputs=mel_references,
                                                    filters=512,
                                                    kernel_size=(3, ),
                                                    activation=None,
                                                    padding='same')
                            conv = tf.layers.conv1d(inputs=conv,
                                                    filters=512,
                                                    kernel_size=(3, ),
                                                    activation=None,
                                                    padding='same')
                            with tf.variable_scope('biLSTM1'):
                                fw_LSTM = tf.nn.rnn_cell.BasicLSTMCell(
                                    num_units=256)
                                bw_LSTM = tf.nn.rnn_cell.BasicLSTMCell(
                                    num_units=256)
                                outputs_biLSTM, states_biLSTM = tf.nn.bidirectional_dynamic_rnn(
                                    cell_fw=fw_LSTM,
                                    cell_bw=bw_LSTM,
                                    inputs=conv,
                                    dtype=tf.float32)
                                outputs_biLSTM = tf.concat(outputs_biLSTM,
                                                           axis=2)
                            with tf.variable_scope('biLSTM2'):
                                fw_LSTM = tf.nn.rnn_cell.BasicLSTMCell(
                                    num_units=256)
                                bw_LSTM = tf.nn.rnn_cell.BasicLSTMCell(
                                    num_units=256)
                                outputs_biLSTM, states_biLSTM = tf.nn.bidirectional_dynamic_rnn(
                                    cell_fw=fw_LSTM,
                                    cell_bw=bw_LSTM,
                                    inputs=outputs_biLSTM,
                                    dtype=tf.float32)
                                outputs_biLSTM = tf.concat(outputs_biLSTM,
                                                           axis=2)
                            mean_over_time = tf.reduce_mean(
                                input_tensor=outputs_biLSTM,
                                axis=1)  # shape (batch_size, speaker_emb_dim)
                            speaker_embedding = tf.layers.dense(
                                inputs=mean_over_time,
                                units=self._hparams.
                                tacotron_speaker_embedding_dim,
                                name="speaker_embedding")
                            repeat = tf.tile(speaker_embedding, [
                                1, enc_tensor_shape[1]
                            ])  # shape (batch_size, speaker_emb_dim*text_len)
                            repeat = tf.reshape(
                                repeat,
                                (enc_tensor_shape[0], enc_tensor_shape[1],
                                 self._hparams.tacotron_speaker_embedding_dim)
                            )  # shape (batch_size, text_len, speaker_emb_dim)
                            # self.embedding_speaker = tf.get_variable('speaker_embedding', [nb_sample, self._hparams.tacotron_speaker_embedding_dim], dtype=tf.float32)
                            self.embedding_speaker = speaker_embedding

                        attention_inputs = tf.concat([encoder_outputs, repeat],
                                                     axis=-1)

                        if not synthesize:
                            with tf.variable_scope('speaker_classifier'):
                                speaker_prediction = tf.layers.dense(
                                    inputs=speaker_embedding, units=256)
                                speaker_prediction = tf.layers.dense(
                                    inputs=speaker_prediction,
                                    units=nb_speaker
                                )  # Softmax inside the loss
                    else:
                        attention_inputs = encoder_outputs

                    # Decoder Parts
                    # Attention Decoder Prenet
                    prenet = Prenet(is_training,
                                    layers_sizes=hp.prenet_layers,
                                    drop_rate=hp.tacotron_dropout_rate,
                                    scope='decoder_prenet')
                    # Attention Mechanism
                    attention_mechanism = LocationSensitiveAttention(
                        hp.attention_dim,
                        attention_inputs,
                        hparams=hp,
                        mask_encoder=hp.mask_encoder,
                        memory_sequence_length=tf.reshape(
                            tower_input_lengths[i], [-1]),
                        smoothing=hp.smoothing,
                        cumulate_weights=hp.cumulative_weights)
                    # Decoder LSTM Cells
                    decoder_lstm = DecoderRNN(is_training,
                                              layers=hp.decoder_layers,
                                              size=hp.decoder_lstm_units,
                                              zoneout=hp.tacotron_zoneout_rate,
                                              scope='decoder_LSTM')
                    # Frames Projection layer
                    frame_projection = FrameProjection(
                        hp.num_mels * hp.outputs_per_step,
                        scope='linear_transform_projection')
                    # <stop_token> projection layer
                    stop_projection = StopProjection(
                        is_training or is_evaluating,
                        shape=hp.outputs_per_step,
                        scope='stop_token_projection')

                    # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                    decoder_cell = TacotronDecoderCell(prenet,
                                                       attention_mechanism,
                                                       decoder_lstm,
                                                       frame_projection,
                                                       stop_projection)

                    # Define the helper for our decoder
                    if is_training or is_evaluating or gta:
                        self.helper = TacoTrainingHelper(
                            batch_size, tower_mel_targets[i], hp, gta,
                            is_evaluating, global_step)
                    else:
                        self.helper = TacoTestHelper(batch_size, hp)

                    # initial decoder state
                    decoder_init_state = decoder_cell.zero_state(
                        batch_size=batch_size, dtype=tf.float32)

                    # Only use max iterations at synthesis time
                    max_iters = hp.max_iters if not (
                        is_training or is_evaluating) else None

                    # Decode
                    (frames_prediction, stop_token_prediction,
                     _), final_decoder_state, _ = dynamic_decode(
                         CustomDecoder(decoder_cell, self.helper,
                                       decoder_init_state),
                         impute_finished=False,
                         maximum_iterations=max_iters,
                         swap_memory=hp.tacotron_swap_with_cpu)

                    # Reshape outputs to be one output per entry
                    # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                    decoder_output = tf.reshape(frames_prediction,
                                                [batch_size, -1, hp.num_mels])
                    stop_token_prediction = tf.reshape(stop_token_prediction,
                                                       [batch_size, -1])

                    # Postnet
                    postnet = Postnet(is_training,
                                      hparams=hp,
                                      scope='postnet_convolutions')

                    # Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
                    residual = postnet(decoder_output)

                    # Project residual to same dimension as mel spectrogram
                    # ==> [batch_size, decoder_steps * r, num_mels]
                    residual_projection = FrameProjection(
                        hp.num_mels, scope='postnet_projection')
                    projected_residual = residual_projection(residual)

                    # Compute the mel spectrogram
                    mel_outputs = decoder_output + projected_residual

                    if post_condition:
                        # Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs.
                        post_cbhg = CBHG(hp.cbhg_kernels,
                                         hp.cbhg_conv_channels,
                                         hp.cbhg_pool_size,
                                         [hp.cbhg_projection, hp.num_mels],
                                         hp.cbhg_projection_kernel_size,
                                         hp.cbhg_highwaynet_layers,
                                         hp.cbhg_highway_units,
                                         hp.cbhg_rnn_units,
                                         is_training,
                                         name='CBHG_postnet')

                        # [batch_size, decoder_steps(mel_frames), cbhg_channels]
                        post_outputs = post_cbhg(mel_outputs, None)

                        # Linear projection of extracted features to make linear spectrogram
                        linear_specs_projection = FrameProjection(
                            hp.num_freq, scope='cbhg_linear_specs_projection')

                        # [batch_size, decoder_steps(linear_frames), num_freq]
                        linear_outputs = linear_specs_projection(post_outputs)

                    # Grab alignments from the final decoder state
                    alignments = tf.transpose(
                        final_decoder_state.alignment_history.stack(),
                        [1, 2, 0])

                    self.tower_decoder_output.append(decoder_output)
                    self.tower_alignments.append(alignments)
                    self.tower_stop_token_prediction.append(
                        stop_token_prediction)
                    self.tower_mel_outputs.append(mel_outputs)
                    tower_embedded_inputs.append(embedded_inputs)
                    tower_enc_conv_output_shape.append(enc_conv_output_shape)
                    tower_encoder_outputs.append(encoder_outputs)
                    tower_residual.append(residual)
                    tower_projected_residual.append(projected_residual)

                    if not synthesize:
                        if self._hparams.tacotron_multi_speaker:
                            tower_speaker_id_pred.append(speaker_prediction)

                    if post_condition:
                        self.tower_linear_outputs.append(linear_outputs)
            log('initialisation done {}'.format(gpus[i]))

        if is_training:
            self.ratio = self.helper._ratio
        self.tower_inputs = tower_inputs
        self.tower_input_lengths = tower_input_lengths
        self.tower_mel_targets = tower_mel_targets
        self.tower_linear_targets = tower_linear_targets
        self.tower_targets_lengths = tower_targets_lengths
        self.tower_stop_token_targets = tower_stop_token_targets

        if self._hparams.tacotron_multi_speaker:
            self.tower_speaker_id_pred = tower_speaker_id_pred
            self.tower_speaker_id_target = tower_speaker_id_target

        self.all_vars = tf.trainable_variables()

        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log('  Train mode:               {}'.format(is_training))
        log('  Eval mode:                {}'.format(is_evaluating))
        log('  GTA mode:                 {}'.format(gta))
        log('  Synthesis mode:           {}'.format(not (
            is_training or is_evaluating)))
        log('  Input:                    {}'.format(inputs.shape))
        for i in range(hp.tacotron_num_gpus + hp.tacotron_gpu_start_idx):
            log('  device:                   {}'.format(i))
            log('  embedding:                {}'.format(
                tower_embedded_inputs[i].shape))
            log('  enc conv out:             {}'.format(
                tower_enc_conv_output_shape[i]))
            log('  encoder out:              {}'.format(
                tower_encoder_outputs[i].shape))
            log('  decoder out:              {}'.format(
                self.tower_decoder_output[i].shape))
            log('  residual out:             {}'.format(
                tower_residual[i].shape))
            log('  projected residual out:   {}'.format(
                tower_projected_residual[i].shape))
            log('  mel out:                  {}'.format(
                self.tower_mel_outputs[i].shape))
            if post_condition:
                log('  linear out:               {}'.format(
                    self.tower_linear_outputs[i].shape))
            log('  <stop_token> out:         {}'.format(
                self.tower_stop_token_prediction[i].shape))

            # 1_000_000 is causing syntax problems for some people?! Python please :)
            log('  Tacotron Parameters       {:.3f} Million.'.format(
                np.sum(
                    [np.prod(v.get_shape().as_list())
                     for v in self.all_vars]) / 1000000))
Exemple #12
0
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   targets_lengths=None,
                   global_step=None,
                   is_training=False,
                   split_infos=None):
        """
		Initializes the model for inference
		sets "mel_outputs" and "alignments" fields.
		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
			  steps in the input time series, and values are character IDs
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
			of each sequence in inputs.
			- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
			of steps in the output time series, M is num_mels, and values are entries in the mel
			spectrogram. Only needed for training.
		"""
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no multi targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')
        if is_training and self._hparams.mask_decoder and targets_lengths is None:
            raise RuntimeError(
                'Model set to mask paddings but no targets lengths provided for the mask!'
            )

        split_device = '/cpu:0' if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:0'
        with tf.device(split_device):
            hp = self._hparams
            lout_int = [tf.int32] * hp.tacotron_num_gpus
            lout_float = [tf.float32] * hp.tacotron_num_gpus

            tower_input_lengths = tf.split(
                input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0)
            tower_targets_lengths = tf.split(
                targets_lengths,
                num_or_size_splits=hp.tacotron_num_gpus,
                axis=0) if targets_lengths is not None else targets_lengths

            p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]],
                                  lout_int)
            p_mel_targets = tf.py_func(
                split_func, [mel_targets, split_infos[:, 1]],
                lout_float) if mel_targets is not None else mel_targets
            p_stop_token_targets = tf.py_func(
                split_func, [stop_token_targets, split_infos[:, 2]], lout_float
            ) if stop_token_targets is not None else stop_token_targets

            tower_inputs = []
            tower_mel_targets = []
            tower_stop_token_targets = []

            batch_size = tf.shape(inputs)[0]
            mel_channels = hp.num_mels
            for i in range(hp.tacotron_num_gpus):
                tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
                if p_mel_targets is not None:
                    tower_mel_targets.append(
                        tf.reshape(p_mel_targets[i],
                                   [batch_size, -1, mel_channels]))
                if p_stop_token_targets is not None:
                    tower_stop_token_targets.append(
                        tf.reshape(p_stop_token_targets[i], [batch_size, -1]))

        T2_output_range = (-hp.max_abs_value,
                           hp.max_abs_value) if hp.symmetric_mels else (
                               0, hp.max_abs_value)

        self.tower_decoder_output = []
        self.tower_alignments = []
        self.tower_stop_token_prediction = []
        self.tower_mel_outputs = []

        tower_embedded_inputs = []
        tower_enc_conv_output_shape = []
        tower_encoder_outputs = []
        tower_residual = []
        tower_projected_residual = []

        # 1. Declare GPU Devices
        gpus = ["/gpu:{}".format(i) for i in range(hp.tacotron_num_gpus)]
        for i in range(hp.tacotron_num_gpus):
            with tf.device(
                    tf.train.replica_device_setter(ps_tasks=1,
                                                   ps_device="/cpu:0",
                                                   worker_device=gpus[i])):
                with tf.variable_scope('inference') as scope:
                    assert hp.tacotron_teacher_forcing_mode in ('constant',
                                                                'scheduled')
                    if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                        assert global_step is not None

                    # Embeddings ==> [batch_size, sequence_length, embedding_dim]
                    self.embedding_table = tf.get_variable(
                        'inputs_embedding', [len(symbols), hp.embedding_dim],
                        dtype=tf.float32)
                    embedded_inputs = tf.nn.embedding_lookup(
                        self.embedding_table, tower_inputs[i])

                    #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                    encoder_cell = TacotronEncoderCell(
                        EncoderConvolutions(is_training,
                                            hparams=hp,
                                            scope='encoder_convolutions'),
                        EncoderRNN(is_training,
                                   size=hp.encoder_lstm_units,
                                   zoneout=hp.tacotron_zoneout_rate,
                                   scope='encoder_LSTM'))

                    encoder_outputs = encoder_cell(embedded_inputs,
                                                   tower_input_lengths[i])

                    #For shape visualization purpose
                    enc_conv_output_shape = encoder_cell.conv_output_shape

                    #Decoder Parts
                    #Attention Decoder Prenet
                    prenet = Prenet(is_training,
                                    layers_sizes=hp.prenet_layers,
                                    drop_rate=hp.tacotron_dropout_rate,
                                    scope='decoder_prenet')
                    #Attention Mechanism
                    attention_mechanism = LocationSensitiveAttention(
                        hp.attention_dim,
                        encoder_outputs,
                        hparams=hp,
                        is_training=is_training,
                        mask_encoder=hp.mask_encoder,
                        memory_sequence_length=tf.reshape(
                            tower_input_lengths[i], [-1]),
                        smoothing=hp.smoothing,
                        cumulate_weights=hp.cumulative_weights)
                    #Decoder LSTM Cells
                    decoder_lstm = DecoderRNN(is_training,
                                              layers=hp.decoder_layers,
                                              size=hp.decoder_lstm_units,
                                              zoneout=hp.tacotron_zoneout_rate,
                                              scope='decoder_LSTM')
                    #Frames Projection layer
                    frame_projection = FrameProjection(
                        hp.num_mels * hp.outputs_per_step,
                        scope='linear_transform_projection')
                    #<stop_token> projection layer
                    stop_projection = StopProjection(
                        is_training,
                        shape=hp.outputs_per_step,
                        scope='stop_token_projection')

                    #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                    decoder_cell = TacotronDecoderCell(prenet,
                                                       attention_mechanism,
                                                       decoder_lstm,
                                                       frame_projection,
                                                       stop_projection)

                    #Define the helper for our decoder
                    if is_training:
                        self.helper = TacoTrainingHelper(
                            batch_size, tower_mel_targets[i], hp, global_step)
                    else:
                        self.helper = TacoTestHelper(batch_size, hp)

                    #initial decoder state
                    decoder_init_state = decoder_cell.zero_state(
                        batch_size=batch_size, dtype=tf.float32)

                    #Only use max iterations at synthesis time
                    max_iters = hp.max_iters if not (is_training) else None

                    #Decode
                    (frames_prediction, stop_token_prediction,
                     _), final_decoder_state, _ = dynamic_decode(
                         CustomDecoder(decoder_cell, self.helper,
                                       decoder_init_state),
                         impute_finished=False,
                         maximum_iterations=max_iters,
                         swap_memory=hp.tacotron_swap_with_cpu)

                    # Reshape outputs to be one output per entry
                    #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                    decoder_output = tf.reshape(frames_prediction,
                                                [batch_size, -1, hp.num_mels])
                    stop_token_prediction = tf.reshape(stop_token_prediction,
                                                       [batch_size, -1])

                    if hp.clip_outputs:
                        decoder_output = tf.minimum(
                            tf.maximum(
                                decoder_output,
                                T2_output_range[0] - hp.lower_bound_decay),
                            T2_output_range[1])

                    #Postnet
                    postnet = Postnet(is_training,
                                      hparams=hp,
                                      scope='postnet_convolutions')

                    #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
                    residual = postnet(decoder_output)

                    #Project residual to same dimension as mel spectrogram
                    #==> [batch_size, decoder_steps * r, num_mels]
                    residual_projection = FrameProjection(
                        hp.num_mels, scope='postnet_projection')
                    projected_residual = residual_projection(residual)

                    #Compute the mel spectrogram
                    mel_outputs = decoder_output + projected_residual

                    if hp.clip_outputs:
                        mel_outputs = tf.minimum(
                            tf.maximum(
                                mel_outputs,
                                T2_output_range[0] - hp.lower_bound_decay),
                            T2_output_range[1])

                    #Grab alignments from the final decoder state
                    alignments = tf.transpose(
                        final_decoder_state.alignment_history.stack(),
                        [1, 2, 0])

                    self.tower_decoder_output.append(decoder_output)
                    self.tower_alignments.append(alignments)
                    self.tower_stop_token_prediction.append(
                        stop_token_prediction)
                    self.tower_mel_outputs.append(mel_outputs)
                    tower_embedded_inputs.append(embedded_inputs)
                    tower_enc_conv_output_shape.append(enc_conv_output_shape)
                    tower_encoder_outputs.append(encoder_outputs)
                    tower_residual.append(residual)
                    tower_projected_residual.append(projected_residual)

            log('initialisation done {}'.format(gpus[i]))

        if is_training:
            self.ratio = self.helper._ratio
        self.tower_inputs = tower_inputs
        self.tower_input_lengths = tower_input_lengths
        self.tower_mel_targets = tower_mel_targets
        self.tower_targets_lengths = tower_targets_lengths
        self.tower_stop_token_targets = tower_stop_token_targets

        self.all_vars = tf.trainable_variables()

        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log('  Train mode:               {}'.format(is_training))
        log('  Input:                    {}'.format(inputs.shape))
        for i in range(hp.tacotron_num_gpus):
            log('  device:                   {}'.format(i))
            log('  embedding:                {}'.format(
                tower_embedded_inputs[i].shape))
            log('  enc conv out:             {}'.format(
                tower_enc_conv_output_shape[i]))
            log('  encoder out:              {}'.format(
                tower_encoder_outputs[i].shape))
            log('  decoder out:              {}'.format(
                self.tower_decoder_output[i].shape))
            log('  residual out:             {}'.format(
                tower_residual[i].shape))
            log('  projected residual out:   {}'.format(
                tower_projected_residual[i].shape))
            log('  mel out:                  {}'.format(
                self.tower_mel_outputs[i].shape))
            log('  <stop_token> out:         {}'.format(
                self.tower_stop_token_prediction[i].shape))

        #1_000_000 is causing syntax problems for some people?! Python please :)
        log('  Tacotron Parameters       {:.3f} Million.'.format(
            np.sum([np.prod(v.get_shape().as_list())
                    for v in self.all_vars]) / 1000000))
	def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False,
			global_step=None, is_training=False, is_evaluating=False, reference_mel=None):
		"""
		Initializes the model for inference

		sets "mel_outputs" and "alignments" fields.

		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
			  steps in the input time series, and values are character IDs
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
			of each sequence in inputs.
			- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
			of steps in the output time series, M is num_mels, and values are entries in the mel
			spectrogram. Only needed for training.
		"""
		if mel_targets is None and stop_token_targets is not None:
			raise ValueError('no mel targets were provided but token_targets were given')
		if mel_targets is not None and stop_token_targets is None and not gta:
			raise ValueError('Mel targets are provided without corresponding token_targets')
		if not gta and self._hparams.predict_linear==True and linear_targets is None and is_training:
			raise ValueError('Model is set to use post processing to predict linear spectrograms in training but no linear targets given!')
		if gta and linear_targets is not None:
			raise ValueError('Linear spectrogram prediction is not supported in GTA mode!')
		if is_training and self._hparams.mask_decoder and targets_lengths is None:
			raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!')
		if is_training and is_evaluating:
			raise RuntimeError('Model can not be in training and evaluation modes at the same time!')

		with tf.variable_scope('inference') as scope:
			batch_size = tf.shape(inputs)[0]
			hp = self._hparams
			assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
			if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
				assert global_step is not None

			#GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis
			post_condition = hp.predict_linear and not gta

			# Embeddings ==> [batch_size, sequence_length, embedding_dim]
			embedding_table = tf.get_variable(
				'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32)
			embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)


			if hp.use_gst:
				#Global style tokens (GST)
				gst_tokens = tf.get_variable('style_tokens', [hp.num_gst, hp.style_embed_depth // hp.num_heads], 
					dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5))
				self.gst_tokens = gst_tokens


			#Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
			encoder_cell = TacotronEncoderCell(
				EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'),
				EncoderRNN(is_training, size=hp.encoder_lstm_units,
					zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM'))
			
			encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

			#For shape visualization purpose
			enc_conv_output_shape = encoder_cell.conv_output_shape

			if is_training:
				reference_mel = mel_targets

			if reference_mel is not None:
				# Reference encoder
				refnet_outputs = reference_encoder(
				  reference_mel, 
				  filters=hp.reference_filters, 
				  kernel_size=(3,3),
				  strides=(2,2),
				  encoder_cell=GRUCell(hp.reference_depth),
				  is_training=is_training)                                                 # [N, 128]
				self.refnet_outputs = refnet_outputs

				if hp.use_gst:
				  # Style attention
				  style_attention = MultiheadAttention(
					tf.expand_dims(refnet_outputs, axis=1),                                   # [N, 1, 128]
					tf.tanh(tf.tile(tf.expand_dims(gst_tokens, axis=0), [batch_size,1,1])),            # [N, hp.num_gst, 256/hp.num_heads]   
					num_heads=hp.num_heads,
					num_units=hp.style_att_dim,
					attention_type=hp.style_att_type)

				  style_embeddings = style_attention.multi_head_attention() 
				else:
				  style_embeddings = tf.expand_dims(refnet_outputs, axis=1)                   # [N, 1, 128]
			else:
				print("Use random weight for GST.")
				random_weights = tf.random_uniform([hp.num_heads, hp.num_gst], maxval=1.0, dtype=tf.float32)
				random_weights = tf.nn.softmax(random_weights, name="random_weights")
				style_embeddings = tf.matmul(random_weights, tf.nn.tanh(gst_tokens))
				style_embeddings = tf.reshape(style_embeddings, [1, 1] + [hp.num_heads * gst_tokens.get_shape().as_list()[1]])
			

			#Extend style embeddings to be compatible with encoder_outputs. 
			#Make encoder_output's dimensions by concatenating style embeddings with a vector of all zeroes.
			#Preserves effect of both style and encoder_outputs.
			neg = tf.add(style_embeddings, tf.negative(style_embeddings))
			style_embeddings = tf.concat([style_embeddings, neg], axis=-1)


			# Add style embedding to every text encoder state
			style_embeddings = tf.tile(style_embeddings, [1, shape_list(encoder_outputs)[1], 1]) # [N, T_in, 128]
			encoder_outputs = tf.add(encoder_outputs, style_embeddings)   

			#Decoder Parts
			#Attention Decoder Prenet
			prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet')
			#Attention Mechanism
			attention_mechanism = LocationSensitiveAttention(hp.attention_dim, encoder_outputs, hparams=hp,
				mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths, smoothing=hp.smoothing, 
				cumulate_weights=hp.cumulative_weights)
			#Decoder LSTM Cells

			decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
				size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_lstm')
			#Frames Projection layer
			frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, scope='linear_transform')
			#<stop_token> projection layer
			stop_projection = StopProjection(is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection')


			#Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
			decoder_cell = TacotronDecoderCell(
				prenet,
				attention_mechanism,
				decoder_lstm,
				frame_projection,
				stop_projection)
			#Define the helper for our decoder
			if is_training or is_evaluating or gta:
				self.helper = TacoTrainingHelper(batch_size, mel_targets, stop_token_targets, hp, gta, is_evaluating, global_step)
			else:
				self.helper = TacoTestHelper(batch_size, hp)


			#initial decoder state
			decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

			#Only use max iterations at synthesis time
			max_iters = hp.max_iters if not (is_training or is_evaluating) else None

			#Decode
			(frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
				CustomDecoder(decoder_cell, self.helper, decoder_init_state),
				impute_finished=False,
				maximum_iterations=max_iters,
				swap_memory=hp.tacotron_swap_with_cpu)


			# Reshape outputs to be one output per entry 
			#==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
			decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
			stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])

		
			#Postnet
			postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions')

			#Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
			residual = postnet(decoder_output)

			#Project residual to same dimension as mel spectrogram 
			#==> [batch_size, decoder_steps * r, num_mels]
			residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection')
			projected_residual = residual_projection(residual)


			#Compute the mel spectrogram
			mel_outputs = decoder_output + projected_residual


			if post_condition:
				#Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
				#Post-processing Network to map mels to linear spectrograms using same architecture as the encoder
				post_processing_cell = TacotronEncoderCell(
				EncoderConvolutions(is_training, hparams=hp, scope='post_processing_convolutions'),
				EncoderRNN(is_training, size=hp.encoder_lstm_units,
					zoneout=hp.tacotron_zoneout_rate, scope='post_processing_LSTM'))

				expand_outputs = post_processing_cell(mel_outputs)
				linear_outputs = FrameProjection(hp.num_freq, scope='post_processing_projection')(expand_outputs)

			#Grab alignments from the final decoder state
			alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0])

			if is_training:
				self.ratio = self.helper._ratio
			self.inputs = inputs
			self.input_lengths = input_lengths
			self.decoder_output = decoder_output
			self.alignments = alignments
			self.style_embeddings = style_embeddings
			self.stop_token_prediction = stop_token_prediction
			self.stop_token_targets = stop_token_targets
			self.mel_outputs = mel_outputs
			if post_condition:
				self.linear_outputs = linear_outputs
				self.linear_targets = linear_targets
			self.mel_targets = mel_targets
			self.targets_lengths = targets_lengths
			log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
			log('  Train mode:               {}'.format(is_training))
			log('  Eval mode:                {}'.format(is_evaluating))
			log('  GTA mode:                 {}'.format(gta))
			log('  Synthesis mode:           {}'.format(not (is_training or is_evaluating)))
			log('  embedding:                {}'.format(embedded_inputs.shape))
			log('  enc conv out:             {}'.format(enc_conv_output_shape))
			log('  encoder out:              {}'.format(encoder_outputs.shape))
			log('  decoder out:              {}'.format(decoder_output.shape))
			log('  residual out:             {}'.format(residual.shape))
			log('  projected residual out:   {}'.format(projected_residual.shape))
			log('  style embedding:         %d' % style_embeddings.shape[-1])
			log('  mel out:                  {}'.format(mel_outputs.shape))
			if post_condition:
				log('  linear out:               {}'.format(linear_outputs.shape))
			log('  <stop_token> out:         {}'.format(stop_token_prediction.shape))
    def initialize(self,
                   inputs,
                   input_speaker_id,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   linear_targets=None,
                   targets_lengths=None,
                   gta=False,
                   global_step=None,
                   is_training=False,
                   is_evaluating=False):
        """
		Initializes the model for inference

		sets "mel_outputs" and "alignments" fields.

		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
			  steps in the input time series, and values are character IDs
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
			of each sequence in inputs.
			- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
			of steps in the output time series, M is num_mels, and values are entries in the mel
			spectrogram. Only needed for training.
		"""
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no mel targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')
        if not gta and self._hparams.predict_linear == True and linear_targets is None and is_training:
            raise ValueError(
                'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
            )
        if gta and linear_targets is not None:
            raise ValueError(
                'Linear spectrogram prediction is not supported in GTA mode!')
        if is_training and self._hparams.mask_decoder and targets_lengths is None:
            raise RuntimeError(
                'Model set to mask paddings but no targets lengths provided for the mask!'
            )
        if is_training and is_evaluating:
            raise RuntimeError(
                'Model can not be in training and evaluation modes at the same time!'
            )

        with tf.variable_scope('inference') as scope:
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams
            assert hp.tacotron_teacher_forcing_mode in ('constant',
                                                        'scheduled')
            if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                assert global_step is not None

            #GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis
            post_condition = hp.predict_linear and not gta

            # Embeddings ==> [batch_size, sequence_length, embedding_dim]
            # embedding_table = tf.get_variable(
            # 	'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32)
            # embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

            # Speaker Embeddings ==> [batch_size, embedding_dim]
            self.speaker_id_embedding_table = tf.get_variable(
                'input_speaker_id_embedding', [hp.speaker_num, hp.speaker_dim],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_speaker_id = tf.nn.embedding_lookup(
                self.speaker_id_embedding_table, input_speaker_id)

            #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
            encoder_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training,
                                    hparams=hp,
                                    scope='encoder_convolutions'),
                EncoderRNN(is_training,
                           size=hp.encoder_lstm_units,
                           zoneout=hp.tacotron_zoneout_rate,
                           scope='encoder_LSTM'))
            print('inputs:', inputs)
            # inputs = tf.Print(inputs, [inputs], "inputs: ",summarize=9)
            encoder_outputs = encoder_cell(inputs, input_lengths)

            #first change encoder_outputs to concated version.

            #second add. need same dims
            #encoder_outputs = encoder_outputs + embedded_speaker_id
            '''
			#first concat.
			input_seq_len = tf.shape(encoder_outputs)[1]
			print('!!!!!!!!!!before tile')
			embedded_speaker_id = tf.expand_dims(embedded_speaker_id, 1)
			embedded_speaker_id = tf.tile(embedded_speaker_id, multiples=[1, input_seq_len, 1])
			print('!!!!!!!!!!after tile')
			id_encoder_outputs = tf.concat([encoder_outputs, embedded_speaker_id], axis=-1)
			'''
            id_encoder_outputs = encoder_outputs
            #still use encoder_outputs

            #use keras version, but not run.
            '''
			print('hhhhhhhhhhhhhhhhhhhhhhhhhhhh')
			hp_lambda = 1.0
			Flip = GradientReversal(hp_lambda)
			Flip_encoder_outputs = Flip(encoder_outputs)
			'''
            '''
			#use tensorflow version, but star's is only 5 and i don't understand.
			Flip_encoder_outputs = flip_gradient(encoder_outputs, l=1.0)
			print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!', Flip_encoder_outputs, type(Flip_encoder_outputs))
			densed_256_encoder_outputs = tf.layers.dense(Flip_encoder_outputs, 256, tf.nn.relu)
			softmax_encoder_outputs = tf.layers.dense(densed_256_encoder_outputs, hp.speaker_num, tf.nn.softmax)
			
			long_speaker_id = tf.reshape(input_speaker_id, shape = [tf.shape(inputs)[0], 1])
			tiled_speaker_id = tf.tile(long_speaker_id, multiples=[1, tf.shape(softmax_encoder_outputs)[1]])
			print('tiled_speaker_id', tiled_speaker_id)
			one_hot_speaker_id = tf.one_hot(tiled_speaker_id, depth=hp.speaker_num)
			print('one_hot_speaker_id', one_hot_speaker_id)
			#self.one_hot_speaker_id and self.softmax_encoder_outputs is at below
			#long_speaker_id = tf.expand_dims(long_speaker_id, axis=2)
			#dann_out = Dense(2)(dann_in)
			#Flip_encoder_outputs = 
			'''
            #For shape visualization purpose
            enc_conv_output_shape = encoder_cell.conv_output_shape

            #Decoder Parts
            #Attention Decoder Prenet
            prenet = Prenet(is_training,
                            layers_sizes=hp.prenet_layers,
                            drop_rate=hp.tacotron_dropout_rate,
                            scope='decoder_prenet')
            #Attention Mechanism
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_dim,
                id_encoder_outputs,
                hparams=hp,
                is_training=is_training,
                mask_encoder=hp.mask_encoder,
                memory_sequence_length=input_lengths,
                smoothing=hp.smoothing,
                cumulate_weights=hp.cumulative_weights)
            #Decoder LSTM Cells
            decoder_lstm = DecoderRNN(is_training,
                                      layers=hp.decoder_layers,
                                      size=hp.decoder_lstm_units,
                                      zoneout=hp.tacotron_zoneout_rate,
                                      scope='decoder_lstm')
            #Frames Projection layer
            frame_projection = FrameProjection(hp.num_mels *
                                               hp.outputs_per_step,
                                               scope='linear_transform')
            #<stop_token> projection layer
            stop_projection = StopProjection(is_training or is_evaluating,
                                             shape=hp.outputs_per_step,
                                             scope='stop_token_projection')

            #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
            decoder_cell = TacotronDecoderCell(prenet, attention_mechanism,
                                               decoder_lstm, frame_projection,
                                               stop_projection)

            #Define the helper for our decoder
            if is_training or is_evaluating or gta:
                self.helper = TacoTrainingHelper(batch_size, mel_targets, hp,
                                                 gta, is_evaluating,
                                                 global_step)
            else:
                self.helper = TacoTestHelper(batch_size, hp)

            #initial decoder state
            decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                         dtype=tf.float32)

            #Only use max iterations at synthesis time
            max_iters = hp.max_iters if not (is_training
                                             or is_evaluating) else None

            #Decode
            (frames_prediction, stop_token_prediction,
             _), final_decoder_state, _ = dynamic_decode(
                 CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                 impute_finished=False,
                 maximum_iterations=max_iters,
                 swap_memory=hp.tacotron_swap_with_cpu)

            # Reshape outputs to be one output per entry
            #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
            decoder_output = tf.reshape(frames_prediction,
                                        [batch_size, -1, hp.num_mels])
            stop_token_prediction = tf.reshape(stop_token_prediction,
                                               [batch_size, -1])

            #Postnet
            postnet = Postnet(is_training,
                              hparams=hp,
                              scope='postnet_convolutions')

            #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
            residual = postnet(decoder_output)

            #Project residual to same dimension as mel spectrogram
            #==> [batch_size, decoder_steps * r, num_mels]
            residual_projection = FrameProjection(hp.num_mels,
                                                  scope='postnet_projection')
            projected_residual = residual_projection(residual)

            #Compute the mel spectrogram
            mel_outputs = decoder_output + projected_residual

            if post_condition:
                # Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs.
                post_cbhg = CBHG(hp.cbhg_kernels,
                                 hp.cbhg_conv_channels,
                                 hp.cbhg_pool_size,
                                 [hp.cbhg_projection, hp.num_mels],
                                 hp.cbhg_projection_kernel_size,
                                 hp.cbhg_highwaynet_layers,
                                 hp.cbhg_highway_units,
                                 hp.cbhg_rnn_units,
                                 hp.batch_norm_position,
                                 is_training,
                                 name='CBHG_postnet')

                #[batch_size, decoder_steps(mel_frames), cbhg_channels]
                post_outputs = post_cbhg(mel_outputs, None)

                #Linear projection of extracted features to make linear spectrogram
                linear_specs_projection = FrameProjection(
                    hp.num_freq, scope='cbhg_linear_specs_projection')

                #[batch_size, decoder_steps(linear_frames), num_freq]
                linear_outputs = linear_specs_projection(post_outputs)

            #Grab alignments from the final decoder state
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(), [1, 2, 0])

            if is_training:
                self.ratio = self.helper._ratio
            self.inputs = inputs
            self.input_speaker_id = input_speaker_id
            #self.one_hot_speaker_id and self.softmax_encoder_outputs
            #self.softmax_encoder_outputs = softmax_encoder_outputs
            #self.one_hot_speaker_id = one_hot_speaker_id
            self.input_lengths = input_lengths
            self.decoder_output = decoder_output
            self.alignments = alignments
            self.stop_token_prediction = stop_token_prediction
            self.stop_token_targets = stop_token_targets
            self.mel_outputs = mel_outputs
            if post_condition:
                self.linear_outputs = linear_outputs
                self.linear_targets = linear_targets
            self.mel_targets = mel_targets
            self.targets_lengths = targets_lengths
            log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
            log('  Train mode:               {}'.format(is_training))
            log('  Eval mode:                {}'.format(is_evaluating))
            log('  GTA mode:                 {}'.format(gta))
            log('  Synthesis mode:           {}'.format(not (
                is_training or is_evaluating)))
            log('  embedding:                {}'.format(inputs.shape))
            log('  enc conv out:             {}'.format(enc_conv_output_shape))
            log('  encoder out:              {}'.format(encoder_outputs.shape))
            log('  id encoder out:              {}'.format(
                id_encoder_outputs.shape))
            log('  decoder out:              {}'.format(decoder_output.shape))
            log('  residual out:             {}'.format(residual.shape))
            log('  projected residual out:   {}'.format(
                projected_residual.shape))
            log('  mel out:                  {}'.format(mel_outputs.shape))
            if post_condition:
                log('  linear out:               {}'.format(
                    linear_outputs.shape))
            log('  <stop_token> out:         {}'.format(
                stop_token_prediction.shape))
Exemple #15
0
	def initialize(self, inputs, input_lengths, feature_targets=None, stop_token_targets=None, targets_lengths=None, gta=False,
			global_step=None, is_training=False, is_evaluating=False):
		"""
		Initializes the model for inference

		sets "feature_outputs" and "alignments" fields.

		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
			  steps in the input time series, and values are character IDs
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
			of each sequence in inputs.
			- feature_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
			of steps in the output time series, M is num_mgc + num_lf0 + num_vuv + num_bap, and values are
			entries in the spectrogram. Only needed for training.
		"""
		if feature_targets is None and stop_token_targets is not None:
			raise ValueError('no feature targets were provided but token_targets were given')
		if feature_targets is not None and stop_token_targets is None and not gta:
			raise ValueError('Mel targets are provided without corresponding token_targets')
		if is_training and self._hparams.mask_decoder and targets_lengths is None:
			raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!')
		if is_training and is_evaluating:
			raise RuntimeError('Model can not be in training and evaluation modes at the same time!')

		with tf.variable_scope('inference') as scope:
			batch_size = tf.shape(inputs)[0]
			hp = self._hparams
			target_depth = hp.num_mgc + hp.num_lf0 + hp.num_vuv + hp.num_bap
			assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
			if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
				assert global_step is not None

			# Embeddings ==> [batch_size, sequence_length, embedding_dim]
			embedding_table = tf.get_variable(
				'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32)
			embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)


			#Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
			encoder_cell = TacotronEncoderCell(
				EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'),
				EncoderRNN(is_training, size=hp.encoder_lstm_units,
					zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM'))

			encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

			#For shape visualization purpose
			enc_conv_output_shape = encoder_cell.conv_output_shape


			#Decoder Parts
			#Attention Decoder Prenet
			prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet')
			#Attention Mechanism
			attention_mechanism = LocationSensitiveAttention(hp.attention_dim, encoder_outputs, hparams=hp,
				is_training=is_training, mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths,
				smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights)
			#Decoder LSTM Cells
			decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
				size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_lstm')
			#Frames Projection layer
			frame_projection = FrameProjection(target_depth * hp.outputs_per_step, scope='mgc_transform')
			#<stop_token> projection layer
			stop_projection = StopProjection(is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection')


			#Decoder Cell ==> [batch_size, decoder_steps, target_depth * r] (after decoding)
			decoder_cell = TacotronDecoderCell(prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection)


			#Define the helper for our decoder
			if is_training or is_evaluating or gta:
				self.helper = TacoTrainingHelper(batch_size, feature_targets, target_depth, hp, gta, is_evaluating, global_step)
			else:
				self.helper = TacoTestHelper(batch_size, target_depth, hp)


			#initial decoder state
			decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

			#Only use max iterations at synthesis time
			max_iters = hp.max_iters if not (is_training or is_evaluating) else None

			#Decode
			(frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
				CustomDecoder(decoder_cell, self.helper, decoder_init_state),
				impute_finished=False,
				maximum_iterations=max_iters,
				swap_memory=hp.tacotron_swap_with_cpu)


			# Reshape outputs to be one output per entry
			#==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), target_depth]
			decoder_outputs = tf.reshape(frames_prediction, [batch_size, -1, target_depth])
			stop_token_outputs = tf.reshape(stop_token_prediction, [batch_size, -1])


			#Postnet
			postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions')

			#Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
			residual = postnet(decoder_outputs)

			#Project residual to same dimension as target depth
			#==> [batch_size, decoder_steps * r, target_depth]
			residual_projection = FrameProjection(target_depth, scope='postnet_projection')
			projected_residual = residual_projection(residual)


			#Compute the final outputs
			final_outputs = decoder_outputs + projected_residual

			#Compute each feature outputs
			mgc_idx = 0
			lf0_idx = mgc_idx + hp.num_mgc
			vuv_idx = lf0_idx + hp.num_lf0
			bap_idx = vuv_idx + hp.num_vuv
			mgc_outputs = tf.slice(final_outputs, [0, 0, mgc_idx], [-1, -1, hp.num_mgc], name='mgc_outputs')
			lf0_outputs = tf.slice(final_outputs, [0, 0, lf0_idx], [-1, -1, hp.num_lf0])
			lf0_outputs = tf.squeeze(lf0_outputs, axis=-1, name='lf0_outputs')
			vuv_outputs = tf.slice(final_outputs, [0, 0, vuv_idx], [-1, -1, hp.num_vuv], name='vuv_outputs')
			bap_outputs = tf.slice(final_outputs, [0, 0, bap_idx], [-1, -1, hp.num_bap], name='bap_outputs')


			#Grab alignments from the final decoder state
			alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0], name='alignments')

			if is_training:
				self.ratio = self.helper._ratio
			self.inputs = inputs
			self.input_lengths = input_lengths
			self.decoder_outputs = decoder_outputs
			self.final_outputs = final_outputs
			self.feature_targets = feature_targets
			self.alignments = alignments
			self.stop_token_outputs = stop_token_outputs
			self.stop_token_targets = stop_token_targets
			self.lf0_outputs = lf0_outputs
			self.mgc_outputs = mgc_outputs
			self.vuv_outputs = vuv_outputs
			self.bap_outputs = bap_outputs
			self.targets_lengths = targets_lengths
			log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
			log('  Train mode:               {}'.format(is_training))
			log('  Eval mode:                {}'.format(is_evaluating))
			log('  GTA mode:                 {}'.format(gta))
			log('  Synthesis mode:           {}'.format(not (is_training or is_evaluating)))
			log('  embedding:                {}'.format(embedded_inputs.shape))
			log('  enc conv out:             {}'.format(enc_conv_output_shape))
			log('  encoder out:              {}'.format(encoder_outputs.shape))
			log('  decoder out:              {}'.format(decoder_outputs.shape))
			log('  residual out:             {}'.format(residual.shape))
			log('  projected residual out:   {}'.format(projected_residual.shape))
			log('  final out:                {}'.format(final_outputs.shape))
			log('  lf0 out:                  {}'.format(tf.expand_dims(lf0_outputs, axis=-1).shape))
			log('  mgc out:                  {}'.format(mgc_outputs.shape))
			log('  vuv out:                  {}'.format(vuv_outputs.shape))
			log('  bap out:                  {}'.format(bap_outputs.shape))
			log('  <stop_token> out:         {}'.format(stop_token_outputs.shape))