def inference_decode(enc_outputs, seq_len, embeddings, out_dim):
    tiled_enc_outputs = tf.contrib.seq2seq.tile_batch(enc_outputs,
                                                      hp.beam_width)
    tiled_seq_len = tf.contrib.seq2seq.tile_batch(seq_len, hp.beam_width)

    beam_batch_size = tf.shape(tiled_enc_outputs)[0]
    # start tokens, end token
    start_tokens = tf.tile([hp.START_TOKEN],
                           [beam_batch_size // hp.beam_width])
    end_token = hp.END_TOKEN

    dec_prenet_outputs = DecoderPrenetWrapper(GRUCell(hp.embed_size),
                                              is_training=False,
                                              prenet_sizes=hp.embed_size,
                                              dropout_prob=hp.dropout)
    attention_mechanism = BahdanauAttention(
        hp.embed_size,
        tiled_enc_outputs,
        normalize=True,
        memory_sequence_length=tiled_seq_len,
        probability_fn=tf.nn.softmax)
    attn_cell = AttentionWrapper(dec_prenet_outputs,
                                 attention_mechanism,
                                 alignment_history=True,
                                 output_attention=False)
    concat_cell = ConcatOutputAndAttentionWrapper(attn_cell)
    decoder_cell = MultiRNNCell([
        OutputProjectionWrapper(concat_cell, hp.embed_size),
        ResidualWrapper(GRUCell(hp.embed_size)),
        ResidualWrapper(GRUCell(hp.embed_size))
    ],
                                state_is_tuple=True)

    output_cell = OutputProjectionWrapper(decoder_cell, out_dim)
    initial_state = output_cell.zero_state(batch_size=beam_batch_size,
                                           dtype=tf.float32)

    decoder = BeamSearchDecoder(cell=output_cell,
                                embedding=embeddings,
                                start_tokens=start_tokens,
                                end_token=end_token,
                                initial_state=initial_state,
                                beam_width=hp.beam_width)
    outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder, maximum_iterations=hp.max_len)
    return outputs
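For reference, a minimal sketch (not part of the original snippet) of consuming the beam-search result returned above; enc_outputs, seq_len, embeddings and out_dim are assumed to come from the surrounding graph. With tf.contrib.seq2seq, dynamic_decode on a BeamSearchDecoder yields a FinalBeamSearchDecoderOutput whose predicted_ids tensor has shape [batch_size, max_time, beam_width], and the first beam is conventionally taken as the best hypothesis:

beam_outputs = inference_decode(enc_outputs, seq_len, embeddings, out_dim)
best_token_ids = beam_outputs.predicted_ids[:, :, 0]  # [batch_size, max_time], top beam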
def training_decode(enc_outputs, seq_len, helper, out_dim):
    dec_prenet_outputs = DecoderPrenetWrapper(GRUCell(hp.embed_size),
                                              is_training=True,
                                              prenet_sizes=hp.embed_size,
                                              dropout_prob=hp.dropout)
    attention_mechanism = BahdanauAttention(hp.embed_size,
                                            enc_outputs,
                                            normalize=True,
                                            memory_sequence_length=seq_len,
                                            probability_fn=tf.nn.softmax)
    attn_cell = AttentionWrapper(dec_prenet_outputs,
                                 attention_mechanism,
                                 alignment_history=True,
                                 output_attention=False)
    concat_cell = ConcatOutputAndAttentionWrapper(attn_cell)
    decoder_cell = MultiRNNCell([
        OutputProjectionWrapper(concat_cell, hp.embed_size),
        ResidualWrapper(GRUCell(hp.embed_size)),
        ResidualWrapper(GRUCell(hp.embed_size))
    ],
                                state_is_tuple=True)

    output_cell = OutputProjectionWrapper(decoder_cell, out_dim)
    initial_state = output_cell.zero_state(batch_size=tf.shape(enc_outputs)[0],
                                           dtype=tf.float32)

    decoder = BasicDecoder(cell=output_cell,
                           helper=helper,
                           initial_state=initial_state)

    (outputs, _), last_state, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder=decoder, maximum_iterations=hp.max_len)
    # for attention plot
    alignments = tf.transpose(last_state[0].alignment_history.stack(),
                              [1, 2, 0])
    return outputs, alignments
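The alignments returned above have shape [batch, encoder_steps, decoder_steps] after the transpose and are only useful once evaluated and plotted. A minimal plotting sketch, assuming matplotlib is available and a single alignment matrix has already been pulled out of a session run (not part of the original code):

import matplotlib.pyplot as plt

def plot_alignment(alignment, path):
    # alignment: numpy array of shape [encoder_steps, decoder_steps] for one utterance
    fig, ax = plt.subplots()
    im = ax.imshow(alignment, aspect='auto', origin='lower', interpolation='none')
    fig.colorbar(im, ax=ax)
    ax.set_xlabel('Decoder timestep')
    ax.set_ylabel('Encoder timestep')
    fig.savefig(path, format='png')
    plt.close(fig)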
Example #3
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets_pos=None,
                   linear_targets_pos=None,
                   mel_targets_neg=None,
                   linear_targets_neg=None,
                   labels_pos=None,
                   labels_neg=None,
                   reference_mel_pos=None,
                   reference_mel_neg=None):

        is_training = linear_targets_pos is not None
        is_teacher_force_generating = mel_targets_pos is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        ## Text Encoding scope
        with tf.variable_scope('text_encoder', reuse=tf.AUTO_REUSE) as scope:
            # Initialize Text Embeddings
            embedding_table = tf.get_variable(
                'text_embedding', [len(symbols), 256],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(embedding_table,
                                                     inputs)  # [N, T_in, 256]

            # Text Encoder
            prenet_outputs = prenet(embedded_inputs,
                                    is_training)  # [N, T_in, 128]
            encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths,
                                           is_training)  # [N, T_in, 256]

            content_inputs = encoder_outputs

        ## Reference Encoding Scope
        with tf.variable_scope('audio_encoder', reuse=tf.AUTO_REUSE) as scope:

            if hp.use_gst:
                #Global style tokens (GST)
                gst_tokens = tf.get_variable(
                    'style_tokens', [hp.num_gst, 256 // hp.num_heads],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
                self.gst_tokens = gst_tokens

            if is_training:

                reference_mel_pos = mel_targets_pos
                reference_mel_neg = mel_targets_neg

            if reference_mel_pos is not None:
                # Reference encoder
                refnet_outputs_pos = reference_encoder(
                    reference_mel_pos,
                    filters=[32, 32, 64, 64, 128, 128],
                    kernel_size=(3, 3),
                    strides=(2, 2),
                    encoder_cell=GRUCell(128),
                    is_training=is_training)  # [n, 128]
                self.refnet_outputs_pos = refnet_outputs_pos

                refnet_outputs_neg = reference_encoder(
                    reference_mel_neg,
                    filters=[32, 32, 64, 64, 128, 128],
                    kernel_size=(3, 3),
                    strides=(2, 2),
                    encoder_cell=GRUCell(128),
                    is_training=is_training)  # [n, 128]
                self.refnet_outputs_neg = refnet_outputs_neg
                # Extract style features
                ref_style = style_encoder(reference_mel_neg,
                                          filters=[32, 32, 64, 64],
                                          kernel_size=(3, 3),
                                          strides=(2, 2),
                                          is_training=False)
                self.ref_style = ref_style

                if hp.use_gst:
                    # Multi-head attention
                    style_attention_pos = MultiheadAttention(
                        tf.tanh(tf.expand_dims(refnet_outputs_pos,
                                               axis=1)),  # [N, 1, 128]
                        tf.tile(tf.expand_dims(gst_tokens, axis=0),
                                [batch_size, 1, 1
                                 ]),  # [N, hp.num_gst, 256/hp.num_heads]   
                        num_heads=hp.num_heads,
                        num_units=128,
                        attention_type=hp.style_att_type)

                    style_attention_neg = MultiheadAttention(
                        tf.tanh(tf.expand_dims(refnet_outputs_neg,
                                               axis=1)),  # [N, 1, 128]
                        tf.tile(tf.expand_dims(gst_tokens, axis=0),
                                [batch_size, 1, 1
                                 ]),  # [N, hp.num_gst, 256/hp.num_heads]   
                        num_heads=hp.num_heads,
                        num_units=128,
                        attention_type=hp.style_att_type)

                    # Apply tanh to compress both encoder state and style embedding to the same scale.

                    style_embeddings_pos = style_attention_pos.multi_head_attention(
                    )  # [N, 1, 256]
                    style_embeddings_neg = style_attention_neg.multi_head_attention(
                    )  # [N, 1, 256]

                else:
                    style_embeddings_pos = tf.expand_dims(
                        refnet_outputs_pos, axis=1)  # [N, 1, 128]
                    style_embeddings_neg = tf.expand_dims(refnet_outputs_neg,
                                                          axis=1)
            else:
                print("Use random weight for GST.")

            # Add the style embedding to every text encoder state:
            ## tile the style embeddings so they match the text sequence shape;
            ## naming format: _content_style
            style_embeddings_pos = tf.tile(
                style_embeddings_pos,
                [1, shape_list(encoder_outputs)[1], 1])  # [N, T_in, 128]
            style_embeddings_neg = tf.tile(
                style_embeddings_neg,
                [1, shape_list(encoder_outputs)[1], 1])  # [N, T_in, 128]
            ## Permute the four encoder outputs, e.g. pos2pos is positive content with positive style,
            ## pos2neg is positive content with negative style.
            encoder_outputs_pos = tf.concat(
                [encoder_outputs, style_embeddings_pos], axis=-1)
            encoder_outputs_neg = tf.concat(
                [encoder_outputs, style_embeddings_neg], axis=-1)

        # Decoding scope
        with tf.variable_scope('generator', reuse=tf.AUTO_REUSE) as scope:
            # RNN Attention
            attention_cell_pos = AttentionWrapper(
                DecoderPrenetWrapper(GRUCell(256), is_training),
                BahdanauAttention(256,
                                  encoder_outputs_pos,
                                  memory_sequence_length=input_lengths),
                alignment_history=True,
                output_attention=False)  # [N, T_in, 256]

            attention_cell_neg = AttentionWrapper(
                DecoderPrenetWrapper(GRUCell(256), is_training),
                BahdanauAttention(256,
                                  encoder_outputs_neg,
                                  memory_sequence_length=input_lengths),
                alignment_history=True,
                output_attention=False)  # [N, T_in, 256]

            # Concatenate attention context vector and RNN cell output.
            concat_cell_pos = ConcatOutputAndAttentionWrapper(
                attention_cell_pos)
            concat_cell_neg = ConcatOutputAndAttentionWrapper(
                attention_cell_neg)

            # Decoder (layers specified bottom to top):
            decoder_cell_pos = MultiRNNCell(
                [
                    OutputProjectionWrapper(concat_cell_pos, 256),
                    ResidualWrapper(ZoneoutWrapper(LSTMCell(256), 0.1)),
                    ResidualWrapper(ZoneoutWrapper(LSTMCell(256), 0.1))
                ],
                state_is_tuple=True)  # [N, T_in, 256]

            decoder_cell_neg = MultiRNNCell(
                [
                    OutputProjectionWrapper(concat_cell_neg, 256),
                    ResidualWrapper(ZoneoutWrapper(LSTMCell(256), 0.1)),
                    ResidualWrapper(ZoneoutWrapper(LSTMCell(256), 0.1))
                ],
                state_is_tuple=True)  # [N, T_in, 256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell_pos = OutputProjectionWrapper(
                decoder_cell_pos, hp.num_mels * hp.outputs_per_step)
            decoder_init_state_pos = output_cell_pos.zero_state(
                batch_size=batch_size, dtype=tf.float32)

            output_cell_neg = OutputProjectionWrapper(
                decoder_cell_neg, hp.num_mels * hp.outputs_per_step)
            decoder_init_state_neg = output_cell_neg.zero_state(
                batch_size=batch_size, dtype=tf.float32)

            if is_training or is_teacher_force_generating:
                helper_pos = TacoTrainingHelper(inputs, mel_targets_pos,
                                                hp.num_mels,
                                                hp.outputs_per_step)
                helper_neg = TacoTrainingHelper(inputs, mel_targets_neg,
                                                hp.num_mels,
                                                hp.outputs_per_step)

            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.outputs_per_step)

            (decoder_outputs_pos, _
             ), final_decoder_state_pos, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell_pos, helper_pos,
                              decoder_init_state_pos),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            (decoder_outputs_neg, _
             ), final_decoder_state_neg, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell_neg, helper_neg,
                              decoder_init_state_neg),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry

            mel_outputs_pos = tf.reshape(
                decoder_outputs_pos,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]
            mel_outputs_neg = tf.reshape(
                decoder_outputs_neg,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs_pos = post_cbhg(mel_outputs_pos, hp.num_mels,
                                         is_training)  # [N, T_out, 256]
            linear_outputs_pos = tf.layers.dense(post_outputs_pos,
                                                 hp.num_freq)  # [N, T_out, F]

            post_outputs_neg = post_cbhg(mel_outputs_neg, hp.num_mels,
                                         is_training)  # [N, T_out, 256]
            linear_outputs_neg = tf.layers.dense(post_outputs_neg,
                                                 hp.num_freq)  # [N, T_out, F]

            ## Grab alignments from the final decoder state:
            alignments_pos = tf.transpose(
                final_decoder_state_pos[0].alignment_history.stack(),
                [1, 2, 0])
            alignments_neg = tf.transpose(
                final_decoder_state_neg[0].alignment_history.stack(),
                [1, 2, 0])

            # Extract style features for fake sample
            rec_style = style_encoder(mel_outputs_neg,
                                      filters=[32, 32, 64, 64],
                                      kernel_size=(3, 3),
                                      strides=(2, 2),
                                      is_training=False)
            self.rec_style = rec_style

        # Discriminator scope
        with tf.variable_scope('discriminator', reuse=tf.AUTO_REUSE) as scope:
            self.real_logit = discriminator(content_inputs,
                                            reference_mel_pos,
                                            is_training=is_training)
            self.fake_logit_pos = discriminator(content_inputs,
                                                mel_outputs_pos,
                                                is_training=is_training)
            self.fake_logit_neg = discriminator(content_inputs,
                                                mel_outputs_neg,
                                                is_training=is_training)

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_outputs_pos = mel_outputs_pos
        self.mel_outputs_neg = mel_outputs_neg

        self.encoder_outputs = encoder_outputs

        self.style_embeddings_pos = style_embeddings_pos
        self.style_embeddings_neg = style_embeddings_neg

        self.linear_outputs_pos = linear_outputs_pos
        self.linear_outputs_neg = linear_outputs_neg

        self.alignments_pos = alignments_pos
        self.alignments_neg = alignments_neg
        self.mel_targets_pos = mel_targets_pos
        self.mel_targets_neg = mel_targets_neg
        self.linear_targets_pos = linear_targets_pos
        self.linear_targets_neg = linear_targets_neg
        self.reference_mel_pos = reference_mel_pos
        self.reference_mel_neg = reference_mel_neg
        log('Initialized Tacotron model. Dimensions: ')
        log('text embedding:          %d' % embedded_inputs.shape[-1])
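Several of these examples call a shape_list helper that is not shown in the excerpt. A typical implementation (an assumption; the repository's own version may differ) mixes static and dynamic shapes so tf.tile can use a time dimension that is only known at run time:

import tensorflow as tf

def shape_list(x):
    # Return the shape of x as a list, using static dimensions where known and
    # falling back to tf.shape(x) for the ones that are None.
    static = x.get_shape().as_list()
    dynamic = tf.shape(x)
    return [dynamic[i] if dim is None else dim for i, dim in enumerate(static)]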
Example #4
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   reference_mels=None):
        """
        Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields.

        Args:
            inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
            input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
            mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
            linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
            reference_mels: the reference encoder inputs
        """
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings for character inputs: [N, T_in]
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), 256],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(embedding_table,
                                                     inputs)  # [N, T_in, 256]

            # Encoder
            prenet_outputs = prenet(embedded_inputs,
                                    is_training)  # [N, T_in, 128]
            encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths,
                                           is_training)  # [N, T_in, 256]

            # Whether use Global Style Token
            if is_training:
                reference_mels = mel_targets

            if hp.use_gst:
                gst_tokens = tf.get_variable(
                    'style_tokens', [hp.num_tokens, 256 // hp.num_heads],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
                self.gst_tokens = gst_tokens

                # Reference Encoder
                _, reference_encoder_outputs = reference_encoder(
                    inputs=reference_mels,
                    filters=[32, 32, 64, 64, 128, 128],
                    kernel_size=(3, 3),
                    strides=(2, 2),
                    is_training=is_training)  # [N, 128]
                # Style Token Layer Using Multi-Head Attention
                style_attention = MultiHeadAttention(
                    num_heads=hp.num_heads,
                    num_units=128,
                    attention_type=hp.attention_type)
                style_embedding = tf.nn.tanh(
                    style_attention.multi_head_attention(
                        query=tf.expand_dims(reference_encoder_outputs,
                                             axis=1),  # [N, 1, 128]
                        value=tf.tile(tf.expand_dims(gst_tokens, axis=0),
                                      [batch_size, 1, 1
                                       ])  # [N, num_tokens, 256/num_heads]
                    ))  # [N, 1, 128]

                # add style embedding to encoder outputs
                T_in = shape_list(encoder_outputs)[1]
                style_embedding = tf.tile(style_embedding, [1, T_in, 1])
                encoder_outputs = tf.concat([encoder_outputs, style_embedding],
                                            axis=-1)

            # Attention
            attention_cell = AttentionWrapper(
                DecoderPrenetWrapper(GRUCell(256), is_training),
                BahdanauAttention(256, encoder_outputs),
                alignment_history=True,
                output_attention=False)  # [N, T_in, 256]

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 512]

            # Decoder (layers specified bottom to top),
            # fix decoder cell from gru to lstm and add zoneout
            decoder_cell = MultiRNNCell([
                OutputProjectionWrapper(concat_cell, 256),
                ResidualWrapper(ZoneoutWrapper(LSTMCell(256), 0.1,
                                               is_training)),
                ResidualWrapper(ZoneoutWrapper(LSTMCell(256), 0.1,
                                               is_training))
            ],
                                        state_is_tuple=True)  # [N, T_in, 256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.outputs_per_step)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(mel_outputs, hp.num_mels,
                                     is_training)  # [N, T_out, 256]
            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
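A hedged usage sketch for the initialize() above; the class name Tacotron, the hparams object and the placeholders are assumptions, not taken from the excerpt. For pure inference only inputs and input_lengths (plus reference_mels when hp.use_gst is set) are needed, while passing linear_targets flips is_training to True:

import tensorflow as tf

inputs = tf.placeholder(tf.int32, [1, None], name='inputs')
input_lengths = tf.placeholder(tf.int32, [1], name='input_lengths')
reference_mels = tf.placeholder(tf.float32, [1, None, hparams.num_mels], name='reference_mels')

with tf.variable_scope('model'):
    model = Tacotron(hparams)  # hypothetical wrapper class holding this initialize()
    model.initialize(inputs, input_lengths, reference_mels=reference_mels)
    linear_spectrogram = model.linear_outputs[0]  # [T_out, num_freq]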
Example #5
	def initialize(self, inputs, input_lengths, mel_targets=None, gta=False):
		"""
		Initializes the model for inference

		sets "mel_outputs" and "alignments" fields.

		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
			  steps in the input time series, and values are character IDs
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
			of each sequence in inputs.
			- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
			of steps in the output time series, M is num_mels, and values are entries in the mel
			spectrogram. Only needed for training.
		"""
		with tf.variable_scope('inference') as scope:
			is_training = mel_targets is not None and not gta
			batch_size = tf.shape(inputs)[0]
			hp = self._hparams

			# Embeddings
			embedding_table = tf.get_variable(
				'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32,
				initializer=tf.contrib.layers.xavier_initializer())
			embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

			#Encoder
			enc_conv_outputs = enc_conv_layers(embedded_inputs, is_training,
				kernel_size=hp.enc_conv_kernel_size, channels=hp.enc_conv_channels)    
			#Paper doesn't specify what to do with final encoder state
			#So we will simply drop it
			encoder_outputs, encoder_states = bidirectional_LSTM(enc_conv_outputs, input_lengths,
				'encoder_LSTM', is_training=is_training, size=hp.encoder_lstm_units,
				zoneout=hp.zoneout_rate)     

			#Attention
			attention_cell = AttentionWrapper(
				DecoderPrenetWrapper(ZoneoutLSTMCell(hp.attention_dim, is_training, #Separate LSTM for attention mechanism
					zoneout_factor_cell=hp.zoneout_rate,							#based on original tacotron architecture
					zoneout_factor_output=hp.zoneout_rate), is_training),
				LocationSensitiveAttention(hp.attention_dim, encoder_outputs),
				alignment_history=True,
				output_attention=False,
				name='attention_cell')

			#Concat Prenet output with context vector
			concat_cell = ConcatPrenetAndAttentionWrapper(attention_cell)

			#Decoder layers (attention pre-net + 2 unidirectional LSTM Cells)
			decoder_cell = unidirectional_LSTM(concat_cell, is_training,
				layers=hp.decoder_layers, size=hp.decoder_lstm_units,
				zoneout=hp.zoneout_rate)

			#Concat LSTM output with context vector
			concat_decoder_cell = ConcatLSTMOutputAndAttentionWrapper(decoder_cell)

			#Projection to mel-spectrogram dimension (times number of outputs per step) (linear transformation)
			output_cell = OutputProjectionWrapper(concat_decoder_cell, hp.num_mels * hp.outputs_per_step)

			#Define the helper for our decoder
			if is_training or gta:
				self.helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
			else:
				self.helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

			#We'll only limit decoder time steps during inference (consult hparams.py to modify the value)
			max_iterations = None if is_training else hp.max_iters

			#initial decoder state
			decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

			#Decode
			(decoder_output, _), final_decoder_state, self.stop_token_loss = dynamic_decode(
				CustomDecoder(output_cell, self.helper, decoder_init_state),
				impute_finished=True, #Cut out padded parts (enabled)
				maximum_iterations=max_iterations)

			# Reshape outputs to be one output per entry 
			decoder_output = tf.reshape(decoder_output, [batch_size, -1, hp.num_mels])

			#Compute residual using post-net
			residual = postnet(decoder_output, is_training,
				kernel_size=hp.postnet_kernel_size, channels=hp.postnet_channels)

			#Project residual to same dimension as mel spectrogram
			projected_residual = projection(residual,
				shape=hp.num_mels,
				scope='residual_projection')

			#Compute the mel spectrogram
			mel_outputs = decoder_output + projected_residual

			#Grab alignments from the final decoder state
			alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

			self.inputs = inputs
			self.input_lengths = input_lengths
			self.decoder_output = decoder_output
			self.alignments = alignments
			self.mel_outputs = mel_outputs
			self.mel_targets = mel_targets
			log('Initialized Tacotron model. Dimensions: ')
			log('  embedding:                {}'.format(embedded_inputs.shape))
			log('  enc conv out:             {}'.format(enc_conv_outputs.shape))
			log('  encoder out:              {}'.format(encoder_outputs.shape))
			log('  decoder out:              {}'.format(decoder_output.shape))
			log('  residual out:             {}'.format(residual.shape))
			log('  projected residual out:   {}'.format(projected_residual.shape))
			log('  mel out:                  {}'.format(mel_outputs.shape))
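The gta flag above enables ground-truth-aligned synthesis: mel_targets are teacher-forced through TacoTrainingHelper while is_training stays False, so zoneout and dropout are disabled and the decoder output stays frame-aligned with the targets. A minimal sketch of that mode (the class name Tacotron2 and the input tensors are assumptions):

with tf.variable_scope('Tacotron_model'):
    model = Tacotron2(hparams)  # hypothetical wrapper class holding this initialize()
    model.initialize(inputs, input_lengths, mel_targets=mel_targets, gta=True)
    gta_mel_outputs = model.mel_outputs  # frame-aligned with mel_targets, usable as vocoder features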
Example #6
  def initialize(self, inputs, input_lengths, inputs_jp=None, mel_targets=None, linear_targets=None ):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
      is_training = linear_targets is not None
      is_teacher_force_generating = mel_targets is not None
      batch_size = tf.shape(inputs)[0]
      hp = self._hparams

      # Embeddings
      # embedding_table = tf.get_variable(
      #   'text_embedding', [len(symbols), hp.embed_depth], dtype=tf.float32,
      #   initializer=tf.truncated_normal_initializer(stddev=0.5))
      # embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)           # [N, T_in, 256]
      
      if hp.use_gst:
        #Global style tokens (GST)
        gst_tokens = tf.get_variable(
          'style_tokens', [hp.num_gst, hp.style_embed_depth // hp.num_heads], dtype=tf.float32,
          initializer=tf.truncated_normal_initializer(stddev=0.5))
        self.gst_tokens = gst_tokens
 
      # Encoder
      # prenet_outputs = prenet(embedded_inputs, is_training)
      prenet_outputs = prenet(inputs, is_training)
      # [N, T_in, 128]
      encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training)  # [N, T_in, 256]
      


      if inputs_jp  is not None:
        # Reference encoder
        refnet_outputs = reference_encoder(
          inputs_jp,
          filters=hp.reference_filters, 
          kernel_size=(3,3),
          strides=(2,2),
          encoder_cell=GRUCell(hp.reference_depth),
          is_training=is_training)                                                 # [N, 128]
        self.refnet_outputs = refnet_outputs                                       

        if hp.use_gst:
          # Style attention
          style_attention = MultiheadAttention(
            tf.expand_dims(refnet_outputs, axis=1),                                   # [N, 1, 128]
            tf.tanh(tf.tile(tf.expand_dims(gst_tokens, axis=0), [batch_size,1,1])),            # [N, hp.num_gst, 256/hp.num_heads]   
            num_heads=hp.num_heads,
            num_units=hp.style_att_dim,
            attention_type=hp.style_att_type)

          style_embeddings = style_attention.multi_head_attention()                   # [N, 1, 256]
        else:
          style_embeddings = tf.expand_dims(refnet_outputs, axis=1)                   # [N, 1, 128]
      else:
        print("Use random weight for GST.")
        random_weights = tf.random_uniform([hp.num_heads, hp.num_gst], maxval=1.0, dtype=tf.float32)
        random_weights = tf.nn.softmax(random_weights, name="random_weights")
        style_embeddings = tf.matmul(random_weights, tf.nn.tanh(gst_tokens))
        style_embeddings = tf.reshape(style_embeddings, [1, 1] + [hp.num_heads * gst_tokens.get_shape().as_list()[1]])

      # Add style embedding to every text encoder state
      style_embeddings = tf.tile(style_embeddings, [1, shape_list(encoder_outputs)[1], 1]) # [N, T_in, 128]
      encoder_outputs = tf.concat([encoder_outputs, style_embeddings], axis=-1)

      # Attention
      attention_cell = AttentionWrapper(
        GRUCell(hp.attention_depth),
        BahdanauAttention(hp.attention_depth, encoder_outputs, memory_sequence_length=input_lengths),
        alignment_history=True,
        output_attention=False)                                                  # [N, T_in, 256]

      # Concatenate attention context vector and RNN cell output.
      concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)              

      # Decoder (layers specified bottom to top):
      decoder_cell = MultiRNNCell([
          OutputProjectionWrapper(concat_cell, hp.rnn_depth),
          ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1)),
          ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1))
        ], state_is_tuple=True)                                                  # [N, T_in, 256]

      # Project onto r mel spectrograms (predict r outputs at each RNN step):
      output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
      decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

      if is_training or is_teacher_force_generating:
        helper = TacoTrainingHelper(inputs, mel_targets, hp)
      else:
        helper = TacoTestHelper(batch_size, hp)

      (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
        BasicDecoder(output_cell, helper, decoder_init_state),
        maximum_iterations=hp.max_iters)                                        # [N, T_out/r, M*r]

      # Reshape outputs to be one output per entry
      mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels]) # [N, T_out, M]

      # Add post-processing CBHG:
      post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)           # [N, T_out, 256]
      linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)               # [N, T_out, F]

      # Grab alignments from the final decoder state:
      alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

      self.inputs = inputs
      self.input_lengths = input_lengths
      self.mel_outputs = mel_outputs
      self.encoder_outputs = encoder_outputs
      self.style_embeddings = style_embeddings
      self.linear_outputs = linear_outputs
      self.alignments = alignments
      self.mel_targets = mel_targets
      self.linear_targets = linear_targets
      self.inputs_jp = inputs_jp
      log('Initialized Tacotron model. Dimensions: ')
      log('  style embedding:         %d' % style_embeddings.shape[-1])
      log('  prenet out:              %d' % prenet_outputs.shape[-1])
      log('  encoder out:             %d' % encoder_outputs.shape[-1])
      log('  attention out:           %d' % attention_cell.output_size)
      log('  concat attn & out:       %d' % concat_cell.output_size)
      log('  decoder cell out:        %d' % decoder_cell.output_size)
      log('  decoder out (%d frames):  %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
      log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
      log('  postnet out:             %d' % post_outputs.shape[-1])
      log('  linear out:              %d' % linear_outputs.shape[-1])
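To make the shape bookkeeping of the style-conditioning step explicit, here is a self-contained toy sketch (illustration only, not repository code): a [N, 1, D_style] embedding is tiled along the text time axis and concatenated onto the [N, T_in, D_enc] encoder outputs, which is what the tf.tile / tf.concat pair above does.

import tensorflow as tf

N, T_in, D_enc, D_style = 2, 7, 256, 128
encoder_outputs = tf.random_normal([N, T_in, D_enc])
style_embeddings = tf.random_normal([N, 1, D_style])

# Repeat the single style vector once per encoder timestep, then condition on it.
style_tiled = tf.tile(style_embeddings, [1, tf.shape(encoder_outputs)[1], 1])
conditioned = tf.concat([encoder_outputs, style_tiled], axis=-1)  # [N, T_in, D_enc + D_style]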
Example #7
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   pml_targets=None,
                   gta=False,
                   locked_alignments=None,
                   logs_enabled=True):
        '''Initializes the model for inference.

        Sets "pml_outputs", and "alignments" fields.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
          pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is number of
            steps in the PML vocoder features trajectories, P is pml_dimension, and values are PML vocoder
            features. Only needed for training.
          gta: boolean flag that is set to True when ground truth alignment is required
          locked_alignments: when explicit attention alignment is required, the locked alignments are passed in this
            parameter and the attention alignments are locked to these values
          logs_enabled: boolean flag that defaults to True, if False no construction logs output
        '''
        with tf.variable_scope('inference') as scope:
            is_training = pml_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            # Encoder
            prenet_outputs = prenet(
                embedded_inputs, is_training,
                hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
            encoder_outputs = encoder_cbhg(
                prenet_outputs,
                input_lengths,
                is_training,  # [N, T_in, encoder_depth=256]
                hp.encoder_depth)

            # Attention
            attention_cell = AttentionWrapper(
                GRUCell(hp.attention_depth),
                BahdanauAttention(hp.attention_depth, encoder_outputs),
                alignment_history=True,
                output_attention=False)  # [N, T_in, attention_depth=256]

            # Apply prenet before concatenation in AttentionWrapper.
            attention_cell = DecoderPrenetWrapper(attention_cell, is_training,
                                                  hp.prenet_depths)

            # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 2*attention_depth=512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell(
                [
                    OutputProjectionWrapper(concat_cell, hp.decoder_depth),
                    ResidualWrapper(GRUCell(hp.decoder_depth)),
                    ResidualWrapper(GRUCell(hp.decoder_depth))
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Project onto r PML feature vectors (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.pml_dimension * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if is_training or gta:
                helper = TacoTrainingHelper(inputs, pml_targets,
                                            hp.pml_dimension,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.pml_dimension,
                                        hp.outputs_per_step)

            (multi_decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

            # Reshape outputs to be one output per entry
            decoder_outputs = tf.reshape(
                multi_decoder_outputs,
                [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

            # Postnet: predicts a residual
            postnet_outputs = postnet(decoder_outputs,
                                      layers=hp.postnet_conv_layers,
                                      conv_width=hp.postnet_conv_width,
                                      channels=hp.postnet_conv_channels,
                                      is_training=is_training)

            pml_outputs = decoder_outputs + postnet_outputs

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.pml_outputs = pml_outputs
            self.alignments = alignments
            self.pml_targets = pml_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.outputs_per_step, multi_decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % pml_outputs.shape[-1])
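The repository presumably pairs this graph with a separate loss method; a hedged sketch of what such a companion could look like (the method name add_loss and the plain L1 objective are assumptions, and tf is imported as in the rest of the file):

    def add_loss(self):
        # Hypothetical companion to initialize(): L1 distance between target and
        # predicted PML vocoder features, training decoder and postnet jointly.
        with tf.variable_scope('loss'):
            self.loss = tf.reduce_mean(tf.abs(self.pml_targets - self.pml_outputs))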
Example #8
    def initialize(self,
                   inputs,
                   input_lengths,
                   num_speakers,
                   speaker_id,
                   mel_targets=None,
                   linear_targets=None,
                   loss_coeff=None,
                   rnn_decoder_test_mode=False,
                   is_randomly_initialized=False):

        is_training = linear_targets is not None
        self.is_randomly_initialized = is_randomly_initialized

        with tf.variable_scope('inference') as scope:
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            self.num_speakers = num_speakers
            if self.num_speakers > 1:
                if hp.speaker_embedding_size != 1:
                    speaker_embed_table = tf.get_variable(
                        'speaker_embedding',
                        [self.num_speakers, hp.speaker_embedding_size],
                        dtype=tf.float32,
                        initializer=tf.truncated_normal_initializer(
                            stddev=0.5))
                    # [N, T_in, speaker_embedding_size]
                    speaker_embed = tf.nn.embedding_lookup(
                        speaker_embed_table, speaker_id)

                if hp.model_type == 'deepvoice':
                    if hp.speaker_embedding_size == 1:
                        before_highway = get_embed(speaker_id,
                                                   self.num_speakers,
                                                   hp.enc_prenet_sizes[-1],
                                                   "before_highway")
                        encoder_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers, hp.enc_rnn_size * 2,
                            "encoder_rnn_init_state")

                        attention_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers,
                            hp.attention_state_size,
                            "attention_rnn_init_state")
                        decoder_rnn_init_states = [get_embed(
                                speaker_id, self.num_speakers,
                                hp.dec_rnn_size, "decoder_rnn_init_states{}".format(idx + 1)) \
                                        for idx in range(hp.dec_layer_num)]
                    else:
                        deep_dense = lambda x, dim: \
                                tf.layers.dense(x, dim, activation=tf.nn.softsign)

                        before_highway = deep_dense(speaker_embed,
                                                    hp.enc_prenet_sizes[-1])
                        encoder_rnn_init_state = deep_dense(
                            speaker_embed, hp.enc_rnn_size * 2)

                        attention_rnn_init_state = deep_dense(
                            speaker_embed, hp.attention_state_size)
                        decoder_rnn_init_states = [
                            deep_dense(speaker_embed, hp.dec_rnn_size)
                            for _ in range(hp.dec_layer_num)
                        ]

                    speaker_embed = None  # deepvoice does not use speaker_embed directly

                elif hp.model_type == 'simple':
                    before_highway = None
                    encoder_rnn_init_state = None
                    attention_rnn_init_state = None
                    decoder_rnn_init_states = None

                else:
                    raise Exception(
                        " [!] Unknown multi-speaker model type: {}".format(
                            hp.model_type))

            else:
                speaker_embed = None
                before_highway = None
                encoder_rnn_init_state = None
                attention_rnn_init_state = None
                decoder_rnn_init_states = None

            # Encoder
            prenet_outputs = prenet(
                embedded_inputs,
                is_training,
                hp.enc_prenet_sizes,
                hp.dropout_prob,
                scope='prenet')  # [N, T_in, prenet_depths[-1]=128]
            encoder_outputs = cbhg(
                prenet_outputs,
                input_lengths,
                is_training,  # [N, T_in, encoder_depth=256]
                hp.enc_bank_size,
                hp.enc_bank_channel_size,
                hp.enc_maxpool_width,
                hp.enc_highway_depth,
                hp.enc_rnn_size,
                hp.enc_proj_sizes,
                hp.enc_proj_width,
                scope="encoder_cbhg",
                before_highway=before_highway,
                encoder_rnn_init_state=encoder_rnn_init_state)

            # Attention
            # For manual control of attention
            self.is_manual_attention = tf.placeholder(
                tf.bool,
                shape=(),
                name='is_manual_attention',
            )
            self.manual_alignments = tf.placeholder(
                tf.float32,
                shape=[None, None, None],
                name="manual_alignments",
            )

            dec_prenet_outputs = DecoderPrenetWrapper(
                GRUCell(hp.attention_state_size), speaker_embed, is_training,
                hp.dec_prenet_sizes, hp.dropout_prob)

            if hp.attention_type == 'bah_mon':
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size, encoder_outputs)
            elif hp.attention_type == 'bah_norm':
                attention_mechanism = BahdanauAttention(hp.attention_size,
                                                        encoder_outputs,
                                                        normalize=True)
            elif hp.attention_type == 'luong_scaled':
                attention_mechanism = LuongAttention(hp.attention_size,
                                                     encoder_outputs,
                                                     scale=True)
            elif hp.attention_type == 'luong':
                attention_mechanism = LuongAttention(hp.attention_size,
                                                     encoder_outputs)
            elif hp.attention_type == 'bah':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size, encoder_outputs)
            elif hp.attention_type.startswith('ntm2'):
                shift_width = int(hp.attention_type.split('-')[-1])
                attention_mechanism = NTMAttention2(hp.attention_size,
                                                    encoder_outputs,
                                                    shift_width=shift_width)
            else:
                raise Exception(" [!] Unkown attention type: {}".format(
                    hp.attention_type))

            attention_cell = AttentionWrapper(
                dec_prenet_outputs,
                attention_mechanism,
                self.is_manual_attention,
                self.manual_alignments,
                initial_cell_state=attention_rnn_init_state,
                alignment_history=True,
                output_attention=False)

            # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
            # [N, T_in, attention_size+attention_state_size]
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell, embed_to_concat=speaker_embed)

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell(
                [
                    OutputProjectionWrapper(concat_cell, hp.dec_rnn_size),
                    ResidualWrapper(GRUCell(hp.dec_rnn_size)),
                    ResidualWrapper(GRUCell(hp.dec_rnn_size)),
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.reduction_factor)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if hp.model_type == "deepvoice":
                # decoder_init_state[0] : AttentionWrapperState
                # = cell_state + attention + time + alignments + alignment_history
                # decoder_init_state[0][0] = attention_rnn_init_state (already applied)
                decoder_init_state = list(decoder_init_state)

                for idx, cell in enumerate(decoder_rnn_init_states):
                    shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                    shape2 = cell.get_shape().as_list()
                    if shape1 != shape2:
                        raise Exception(" [!] Shape {} and {} should be equal". \
                                format(shape1, shape2))
                    decoder_init_state[idx + 1] = cell

                decoder_init_state = tuple(decoder_init_state)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.reduction_factor,
                                            rnn_decoder_test_mode)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.reduction_factor)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            # [N, T_out, postnet_depth=256]
            post_outputs = cbhg(mel_outputs,
                                None,
                                is_training,
                                hp.post_bank_size,
                                hp.post_bank_channel_size,
                                hp.post_maxpool_width,
                                hp.post_highway_depth,
                                hp.post_rnn_size,
                                hp.post_proj_sizes,
                                hp.post_proj_width,
                                scope='post_cbhg')

            if speaker_embed is not None and hp.model_type == 'simple':
                expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
                tiled_speaker_embedding = tf.tile(
                    expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])

                # [N, T_out, 256 + alpha]
                post_outputs = tf.concat(
                    [tiled_speaker_embedding, post_outputs], axis=-1)

            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.speaker_id = speaker_id
            self.input_lengths = input_lengths
            self.loss_coeff = loss_coeff
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.final_decoder_state = final_decoder_state

            log('=' * 40)
            log(' model_type: %s' % hp.model_type)
            log('=' * 40)

            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            if speaker_embed is not None:
                log('    speaker embedding:        %d' %
                    speaker_embed.shape[-1])
            else:
                log('    speaker embedding:        None')
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])

    def initialize(
        self,
        inputs,
        input_lengths,
        num_speakers,
        speaker_id,
        mel_targets=None,
        linear_targets=None,
        loss_coeff=None,
        rnn_decoder_test_mode=False,
        is_randomly_initialized=False,
    ):
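        '''Initializes the (optionally multi-speaker) Tacotron model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields. Argument
        shapes follow the other examples in this file: inputs is an int32
        [N, T_in] tensor of character IDs, input_lengths is int32 [N],
        speaker_id is int32 [N], and mel_targets / linear_targets / loss_coeff
        are only needed for training.
        '''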

        is_training2 = linear_targets is not None  # this also becomes True at test time (whenever linear_targets are fed) -- is that intended???
        is_training = not rnn_decoder_test_mode

        self.is_randomly_initialized = is_randomly_initialized

        with tf.variable_scope('inference') as scope:
            hp = self._hparams
            batch_size = tf.shape(inputs)[0]

            # Embeddings(256)
            char_embed_table = tf.get_variable(
                'embedding', [len(symbols), hp.embedding_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))

            zero_pad = True
            if zero_pad:  # logic borrowed from a transformer implementation
                # The <PAD> token (id 0) gets an embedding fixed to zero that is never trained,
                # i.e. the first row (<PAD>) of the variable created by get_variable above is never used.
                char_embed_table = tf.concat(
                    (tf.zeros(shape=[1, hp.embedding_size]),
                     char_embed_table[1:, :]), 0)

            # [N, T_in, embedding_size]
            char_embedded_inputs = tf.nn.embedding_lookup(
                char_embed_table, inputs)

            self.num_speakers = num_speakers
            if self.num_speakers > 1:
                if hp.speaker_embedding_size != 1:  # speaker_embedding_size = f(16)
                    speaker_embed_table = tf.get_variable(
                        'speaker_embedding',
                        [self.num_speakers, hp.speaker_embedding_size],
                        dtype=tf.float32,
                        initializer=tf.truncated_normal_initializer(
                            stddev=0.5))
                    # [N, T_in, speaker_embedding_size]
                    speaker_embed = tf.nn.embedding_lookup(
                        speaker_embed_table, speaker_id)

                if hp.model_type == 'deepvoice':
                    if hp.speaker_embedding_size == 1:
                        before_highway = get_embed(
                            speaker_id, self.num_speakers,
                            hp.enc_prenet_sizes[-1], "before_highway"
                        )  # 'enc_prenet_sizes': [f(256), f(128)]
                        encoder_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers, hp.enc_rnn_size * 2,
                            "encoder_rnn_init_state")

                        attention_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers,
                            hp.attention_state_size,
                            "attention_rnn_init_state")
                        decoder_rnn_init_states = [
                            get_embed(
                                speaker_id, self.num_speakers, hp.dec_rnn_size,
                                "decoder_rnn_init_states{}".format(idx + 1))
                            for idx in range(hp.dec_layer_num)
                        ]
                    else:
                        deep_dense = lambda x, dim: tf.layers.dense(
                            x, dim, activation=tf.nn.softsign
                        )  # softsign: x / (abs(x) + 1)

                        before_highway = deep_dense(speaker_embed,
                                                    hp.enc_prenet_sizes[-1])
                        encoder_rnn_init_state = deep_dense(
                            speaker_embed, hp.enc_rnn_size * 2)

                        attention_rnn_init_state = deep_dense(
                            speaker_embed, hp.attention_state_size)
                        decoder_rnn_init_states = [
                            deep_dense(speaker_embed, hp.dec_rnn_size)
                            for _ in range(hp.dec_layer_num)
                        ]

                    speaker_embed = None  # deepvoice does not use speaker_embed directly
                elif hp.model_type == 'simple':
                    # The 'simple' model feeds speaker_embed into DecoderPrenetWrapper and ConcatOutputAndAttentionWrapper and concatenates it there.
                    before_highway = None
                    encoder_rnn_init_state = None
                    attention_rnn_init_state = None
                    decoder_rnn_init_states = None
                else:
                    raise Exception(
                        " [!] Unkown multi-speaker model type: {}".format(
                            hp.model_type))
            else:
                # case: self.num_speakers == 1
                speaker_embed = None
                before_highway = None
                encoder_rnn_init_state = None  # init state of the bidirectional GRU
                attention_rnn_init_state = None
                decoder_rnn_init_states = None

            ##############
            # Encoder
            ##############

            # [N, T_in, enc_prenet_sizes[-1]]
            prenet_outputs = prenet(
                char_embedded_inputs,
                is_training,
                hp.enc_prenet_sizes,
                hp.dropout_prob,
                scope='prenet'
            )  # 'enc_prenet_sizes': [f(256), f(128)],  dropout_prob = 0.5
            # ==> (N, T_in, 128)

            # enc_rnn_size = 128
            encoder_outputs = cbhg(
                prenet_outputs,
                input_lengths,
                is_training,
                hp.enc_bank_size,
                hp.enc_bank_channel_size,
                hp.enc_maxpool_width,
                hp.enc_highway_depth,
                hp.enc_rnn_size,
                hp.enc_proj_sizes,
                hp.enc_proj_width,
                scope="encoder_cbhg",
                before_highway=before_highway,
                encoder_rnn_init_state=encoder_rnn_init_state)

            ##############
            # Attention
            ##############

            # For manual control of attention
            self.is_manual_attention = tf.placeholder(
                tf.bool,
                shape=(),
                name='is_manual_attention',
            )
            self.manual_alignments = tf.placeholder(
                tf.float32,
                shape=[None, None, None],
                name="manual_alignments",
            )

            # single: attention_size = 128
            if hp.attention_type == 'bah_mon':
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=False)
            elif hp.attention_type == 'bah_mon_norm':  # added by hccho
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'loc_sen':  # Location Sensitive Attention
                attention_mechanism = LocationSensitiveAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'gmm':  # GMM Attention
                attention_mechanism = GmmAttention(
                    hp.attention_size,
                    memory=encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'bah_mon_norm_hccho':
                attention_mechanism = BahdanauMonotonicAttention_hccho(
                    hp.attention_size, encoder_outputs, normalize=True)
            elif hp.attention_type == 'bah_norm':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'luong_scaled':
                attention_mechanism = LuongAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    scale=True)
            elif hp.attention_type == 'luong':
                attention_mechanism = LuongAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'bah':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            else:
                raise Exception(" [!] Unkown attention type: {}".format(
                    hp.attention_type))

            # Build an AttentionWrapper that combines the DecoderPrenetWrapper and attention_mechanism.
            # carpedm20 re-implemented AttentionWrapper based on the TensorFlow source, whereas Keith Ito just used TensorFlow's AttentionWrapper.
            attention_cell = AttentionWrapper(
                GRUCell(hp.attention_state_size),
                attention_mechanism,
                self.is_manual_attention,
                self.manual_alignments,
                initial_cell_state=attention_rnn_init_state,
                alignment_history=True,
                output_attention=False
            )  # note output_attention=False; attention_layer_size was not set, so attention == the context vector.
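            # With output_attention=False the wrapper's output is the GRU cell output; the context
            # vector travels inside the AttentionWrapperState and is concatenated to the output
            # further below by ConcatOutputAndAttentionWrapper.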

            # attention_state_size = 256
            dec_prenet_outputs = DecoderPrenetWrapper(
                attention_cell, speaker_embed, is_training,
                hp.dec_prenet_sizes,
                hp.dropout_prob)  # dec_prenet_sizes =  [f(256), f(128)]

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            # [N, T_in, attention_size+attention_state_size]

            # From the AttentionWrapperState that dec_prenet_outputs passes to the next cell (attention, cell_state, ...),
            # attention and output are concatenated and emitted as the new output.
            # Since the output here equals the cell_state, the concat is [ output(=cell_state) | attention ].
            concat_cell = ConcatOutputAndAttentionWrapper(
                dec_prenet_outputs, embed_to_concat=speaker_embed
            )  # builds a new output as concat(output, attention, speaker_embed).

            # Decoder (layers specified bottom to top):  dec_rnn_size= 256
            cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)
                     ]  # OutputProjectionWrapper does not seem to be mentioned in the paper...
            for _ in range(hp.dec_layer_num):  # hp.dec_layer_num = 2
                cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size)))

            # [N, T_in, 256]
            decoder_cell = MultiRNNCell(cells, state_is_tuple=True)

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.reduction_factor
            )  # could this be modified to also emit a stop token, i.e. (hp.num_mels+1) * hp.reduction_factor???
            decoder_init_state = output_cell.zero_state(
                batch_size=batch_size, dtype=tf.float32
            )  # calling zero_state here also includes the initial state already passed to the AttentionWrapper above.

            if hp.model_type == "deepvoice":
                # decoder_init_state[0] : AttentionWrapperState
                # = cell_state + attention + time + alignments + alignment_history
                # decoder_init_state[0][0] = attention_rnn_init_state (already applied via the AttentionWrapper's initial_cell_state)
                decoder_init_state = list(decoder_init_state)

                for idx, cell in enumerate(decoder_rnn_init_states):
                    shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                    shape2 = cell.get_shape().as_list()
                    if shape1 != shape2:
                        raise Exception(
                            " [!] Shape {} and {} should be equal".format(
                                shape1, shape2))
                    decoder_init_state[idx + 1] = cell

                decoder_init_state = tuple(decoder_init_state)
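                # i.e. the zero states of the residual GRU layers (positions 1..dec_layer_num of the
                # MultiRNNCell state tuple) are replaced by the speaker-conditioned vectors computed above.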

            if is_training2:
                # rnn_decoder_test_mode = True in test mode, False in train mode
                helper = TacoTrainingHelper(
                    inputs, mel_targets, hp.num_mels, hp.reduction_factor,
                    rnn_decoder_test_mode)  # inputs is only used to compute batch_size
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.reduction_factor)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r], max_iters=200

            # [N, T_out, M]
            mel_outputs = tf.reshape(decoder_outputs,
                                     [batch_size, -1, hp.num_mels])
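            # e.g. (illustrative numbers, not from this config) with reduction_factor=5 and num_mels=80
            # the decoder emits 5*80=400 values per step ([N, T_out/5, 400]); the reshape regroups them
            # into one 80-dim mel frame per output step ([N, T_out, 80]).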

            # Add post-processing CBHG:
            # [N, T_out, 256]
            #post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
            post_outputs = cbhg(mel_outputs,
                                None,
                                is_training,
                                hp.post_bank_size,
                                hp.post_bank_channel_size,
                                hp.post_maxpool_width,
                                hp.post_highway_depth,
                                hp.post_rnn_size,
                                hp.post_proj_sizes,
                                hp.post_proj_width,
                                scope='post_cbhg')

            if speaker_embed is not None and hp.model_type == 'simple':
                expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
                tiled_speaker_embedding = tf.tile(
                    expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])

                # [N, T_out, 256 + alpha]
                post_outputs = tf.concat(
                    [tiled_speaker_embedding, post_outputs], axis=-1)

            linear_outputs = tf.layers.dense(
                post_outputs, hp.num_freq)  # [N, T_out, F(1025)]

            # Grab alignments from the final decoder state:
            # Because the MultiRNNCell has 3 layers, final_decoder_state is a length-3 tuple ==> use final_decoder_state[0]
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(),
                [1, 2, 0])  # [batch_size, text length (encoder), target length (decoder)]

            self.inputs = inputs
            self.speaker_id = speaker_id
            self.input_lengths = input_lengths
            self.loss_coeff = loss_coeff
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.final_decoder_state = final_decoder_state

            log('=' * 40)
            log(' model_type: %s' % hp.model_type)
            log('=' * 40)

            log('Initialized Tacotron model. Dimensions: ')
            log('    embedding:                %d' %
                char_embedded_inputs.shape[-1])
            if speaker_embed is not None:
                log('    speaker embedding:        %d' %
                    speaker_embed.shape[-1])
            else:
                log('    speaker embedding:        None')
            log('    prenet out:               %d' % prenet_outputs.shape[-1])
            log('    encoder out:              %d' % encoder_outputs.shape[-1])
            log('    attention out:            %d' %
                attention_cell.output_size)
            log('    concat attn & out:        %d' % concat_cell.output_size)
            log('    decoder cell out:         %d' % decoder_cell.output_size)
            log('    decoder out (%d frames):  %d' %
                (hp.reduction_factor, decoder_outputs.shape[-1]))
            log('    decoder out (1 frame):    %d' % mel_outputs.shape[-1])
            log('    postnet out:              %d' % post_outputs.shape[-1])
            log('    linear out:               %d' % linear_outputs.shape[-1])
Example #10
def decoder(inputs, encoder_outputs, is_training, batch_size, mel_targets):
    """ Decoder
  
  Prenet -> Attention RNN
  Postprocessing CBHG

  @param    inputs              int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
                                steps in the input time series, and values are character IDs
  @param    encoder_outputs     outputs from the encoder with shape [N, T_in, prenet_depth=256]
  @param    is_training         flag for training or eval
  @param    batch_size          number of samples per batch
  @param    mel_targets         float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
                                of steps in the output time series, M is num_mels, and values are entries in the mel
                                spectrogram

  @return                       linear_outputs, mel_outputs and alignments
  """

    if (is_training):
        helper = TacoTrainingHelper(inputs, mel_targets, hparams.num_mels,
                                    hparams.outputs_per_step)
    else:
        helper = TacoTestHelper(batch_size, hparams.num_mels,
                                hparams.outputs_per_step)

    # Attention
    attention_cell = AttentionWrapper(
        GRUCell(hparams.attention_depth),
        BahdanauAttention(hparams.attention_depth, encoder_outputs),
        alignment_history=True,
        output_attention=False)  # [N, T_in, attention_depth=256]

    # Apply prenet before concatenation in AttentionWrapper.
    attention_cell = DecoderPrenetWrapper(attention_cell, is_training,
                                          hparams.prenet_depths)

    # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
    concat_cell = ConcatOutputAndAttentionWrapper(
        attention_cell)  # [N, T_in, 2*attention_depth=512]

    # Decoder (layers specified bottom to top):
    decoder_cell = MultiRNNCell(
        [
            OutputProjectionWrapper(concat_cell, hparams.decoder_depth),
            ResidualWrapper(GRUCell(hparams.decoder_depth)),
            ResidualWrapper(GRUCell(hparams.decoder_depth))
        ],
        state_is_tuple=True)  # [N, T_in, decoder_depth=256]

    # Project onto r mel spectrograms (predict r outputs at each RNN step):
    output_cell = OutputProjectionWrapper(
        decoder_cell, hparams.num_mels * hparams.outputs_per_step)

    decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                dtype=tf.float32)

    (decoder_outputs,
     _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
         BasicDecoder(output_cell, helper, decoder_init_state),
         maximum_iterations=hparams.max_iters)  # [N, T_out/r, M*r]

    # Reshape outputs to be one output per entry
    mel_outputs = tf.reshape(
        decoder_outputs, [batch_size, -1, hparams.num_mels])  # [N, T_out, M]

    # Add post-processing CBHG:
    post_outputs = post_cbhg(
        mel_outputs,
        hparams.num_mels,
        is_training,  # [N, T_out, postnet_depth=256]
        hparams.postnet_depth)
    linear_outputs = tf.layers.dense(post_outputs,
                                     hparams.num_freq)  # [N, T_out, F]

    # Grab alignments from the final decoder state:
    alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(),
                              [1, 2, 0])

    log('Decoder Network ...')
    log('  attention out:             %d' % attention_cell.output_size)
    log('  concat attn & out:         %d' % concat_cell.output_size)
    log('  decoder cell out:          %d' % decoder_cell.output_size)
    log('  decoder out (%d frames):   %d' %
        (hparams.outputs_per_step, decoder_outputs.shape[-1]))
    log('  decoder out (1 frame):     %d' % mel_outputs.shape[-1])
    log('  postnet out:               %d' % post_outputs.shape[-1])
    log('  linear out:                %d' % linear_outputs.shape[-1])

    return linear_outputs, mel_outputs, alignments
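
# A minimal usage sketch for decoder() above (the placeholder tensors and shapes here are
# illustrative assumptions, not part of the original source):
#
#   char_inputs = tf.placeholder(tf.int32, [None, None])                       # [N, T_in]
#   mel_targets = tf.placeholder(tf.float32, [None, None, hparams.num_mels])   # [N, T_out, M]
#   encoder_outputs = ...  # [N, T_in, 256], produced by the encoder CBHG
#   linear_out, mel_out, alignments = decoder(char_inputs, encoder_outputs,
#                                             is_training=True,
#                                             batch_size=tf.shape(char_inputs)[0],
#                                             mel_targets=mel_targets)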
Example #11
    def initialize(self, text_inputs, input_lengths, speaker_ids, mel_targets=None, linear_targets=None):
        '''Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields.

        Args:
          text_inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          speaker_ids: int32 Tensor containing ids of specific speakers
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
        '''
        with tf.variable_scope('inference'):
            is_training = linear_targets is not None
            batch_size = tf.shape(text_inputs)[0]
            hp = self._hparams
            vocab_size = len(symbols)
            embedded_inputs = embedding(text_inputs, vocab_size, hp.embedding_dim)  # [N, T_in, embd_size]

            # extract speaker embedding if multi-speaker
            with tf.variable_scope('speaker'):
                if hp.num_speakers > 1:
                    speaker_embedding = tf.get_variable('speaker_embed',
                                                        shape=(hp.num_speakers, hp.speaker_embed_dim),
                                                        dtype=tf.float32)
                    # TODO: what about special initializer=tf.truncated_normal_initializer(stddev=0.5)?
                    speaker_embd = tf.nn.embedding_lookup(speaker_embedding, speaker_ids)
                else:
                    speaker_embd = None
            # Encoder
            prenet_outputs = prenet(inputs=embedded_inputs,
                                    drop_rate=hp.drop_rate if is_training else 0.0,
                                    is_training=is_training,
                                    layer_sizes=hp.encoder_prenet,
                                    scope="prenet")  # [N, T_in, 128]
            encoder_outputs = cbhg(prenet_outputs, input_lengths,
                                   speaker_embd=speaker_embd,
                                   is_training=is_training,
                                   K=hp.encoder_cbhg_banks,
                                   c=hp.encoder_cbhg_bank_sizes,  # [N, T_in, 256]
                                   scope='encoder_cbhg')

            # Attention Mechanism
            attention_cell = attention_decoder(encoder_outputs, hp.attention_dim, input_lengths, is_training,
                                               speaker_embd=speaker_embd, attention_type=hp.attention_type)

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                OutputProjectionWrapper(attention_cell, hp.decoder_dim),  # 256
                ResidualWrapper(GRUCell(hp.decoder_dim)),  # 256
                ResidualWrapper(GRUCell(hp.decoder_dim))  # 256
            ], state_is_tuple=True)  # [N, T_in, 256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

            if is_training:
                helper = TacoTrainingHelper(text_inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

            (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                    BasicDecoder(output_cell, helper, decoder_init_state),
                    maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing
            post_outputs = cbhg(mel_outputs, None,
                                speaker_embd=None,
                                is_training=is_training,
                                K=hp.post_cbhg_banks,
                                c=hp.post_cbhg_bank_sizes + [hp.num_mels],
                                scope='post_cbhg')  # [N, T_out, 256]
            linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = text_inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.audio = audio.inv_spectrogram_tensorflow(linear_outputs)
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            # TODO: later work around for getting info back?
            # log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % attention_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
Example #12
    def decode(self, encoder_outputs, batch_size):
        # Attention
        attention_cell = AttentionWrapper(
            DecoderPrenetWrapper(GRUCell(self._hparams.get('attention_depth')),
                                 self._is_training,
                                 self._hparams.get('prenet_depths')),
            BahdanauAttention(self._hparams.get('attention_depth'),
                              encoder_outputs),
            alignment_history=True,
            output_attention=False)  # [N, T_in, attention_depth=256]

        # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(
            attention_cell)  # [N, T_in, 2*attention_depth=512]

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell(
            [
                OutputProjectionWrapper(concat_cell,
                                        self._hparams.get('decoder_depth')),
                ResidualWrapper(GRUCell(self._hparams.get('decoder_depth'))),
                ResidualWrapper(GRUCell(self._hparams.get('decoder_depth')))
            ],
            state_is_tuple=True)  # [N, T_in, decoder_depth=256]

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(
            decoder_cell,
            self._hparams.get('num_mels') *
            self._hparams.get('outputs_per_step'))
        decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                    dtype=tf.float32)

        (decoder_outputs, _), final_decoder_state, _ = dynamic_decode(
            BasicDecoder(output_cell, self._helper, decoder_init_state),
            maximum_iterations=self._hparams.get(
                'max_iters'))  # [N, T_out/r, M*r]

        mel_outputs = tf.reshape(
            decoder_outputs,
            [batch_size, -1, self._hparams.get('num_mels')])

        # Post processing CHBG
        kwargs = {
            'K': self._hparams.get('decoder_K'),
            'bank_num_filters': self._hparams.get('decoder_bank_num_filters'),
            'pooling_stride': self._hparams.get('decoder_pooling_stride'),
            'pooling_width': self._hparams.get('decoder_pooling_width'),
            'proj_num_filters': self._hparams.get('decoder_proj_num_filters'),
            'proj_filter_width':
            self._hparams.get('decoder_proj_filter_width'),
            'num_highway_layers':
            self._hparams.get('decoder_num_highway_layers'),
            'highway_depth': self._hparams.get('decoder_highway_depth'),
            'gru_num_cells': self._hparams.get('decoder_gru_num_cells')
        }
        post_out = cbhg(mel_outputs, None, self._is_training, 'post_cbhg',
                        **kwargs)
        lin_outputs = tf.layers.dense(post_out, self._hparams.get('num_freq'))

        return mel_outputs, lin_outputs, final_decoder_state
Example #13
def attention_decoder(inputs,
                      memory,
                      num_units=None,
                      is_training=True,
                      alignment_history=True,
                      scope="attention_decoder",
                      reuse=None):
    '''Applies a GRU to `inputs` while attending to `memory`.
    Args:
      inputs: A 3d tensor with shape of [N, T', C']. Decoder inputs.
      num_units: An int. Attention size.
      memory: A 3d tensor with shape of [N, T, C]. Outputs of encoder network.
      scope: Optional scope for `variable_scope`.  
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.
    
    Returns:
      Tuple:
      A 3d tensor with shape of [N, T, num_units]. 
      AttentionWrapper final state. 
    '''
    with tf.variable_scope(scope, reuse=reuse):
        if num_units is None:
            num_units = inputs.get_shape().as_list()[-1]
        batch_size = inputs.get_shape().as_list()[0]

        attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
            num_units, memory)
        decoder_cell = DecoderPrenetWrapper(GRUCell(num_units), is_training)
        cell_with_attention = tf.contrib.seq2seq.AttentionWrapper(
            decoder_cell,
            attention_mechanism,
            num_units,
            alignment_history=alignment_history,
            output_attention=False)
        # Concatenate attention context vector and RNN cell output into a 512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(cell_with_attention)
        decoder_cell = MultiRNNCell([
            OutputProjectionWrapper(concat_cell, num_units),
            ResidualWrapper(GRUCell(num_units)),
            ResidualWrapper(GRUCell(num_units))
        ],
                                    state_is_tuple=True)
        # Outputs => (N, T', hp.n_mels*hp.r)
        out_dim = inputs.get_shape().as_list()[-1]
        output_cell = OutputProjectionWrapper(decoder_cell, out_dim)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                    dtype=tf.float32)

        if is_training:
            helper = TacoTrainingHelper(memory, inputs, hp.n_mels * hp.r, hp.r)
        else:
            helper = TacoTestHelper(batch_size, hp.n_mels * hp.r, hp.r)
        (decoder_outputs,
         _), final_state, _ = tf.contrib.seq2seq.dynamic_decode(
             BasicDecoder(output_cell, helper, decoder_init_state),
             maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry
        # mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.n_mels])  # [N, T_out, M]
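        # Note: since the reshape above is commented out, decoder_outputs keeps the grouped shape
        # [N, T_out/r, n_mels*r]; final_state[0] is the AttentionWrapperState, whose alignment_history
        # can be stacked to recover alignments, as the other examples in this file do.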
    return decoder_outputs, final_state[0]
Example #14
class Tacotron():
    def __init__(self, hparams):
        self._hparams = hparams

    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None):
        '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            self.batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), 256],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(embedding_table,
                                                     inputs)  # [N, T_in, 256]

            # Encoder
            prenet_outputs = prenet(embedded_inputs,
                                    is_training)  # [N, T_in, 128]
            encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths,
                                           is_training)  # [N, T_in, 256]

            # Attention
            attention_cell = AttentionWrapper(
                DecoderPrenetWrapper(GRUCell(256), is_training),
                BahdanauAttention(256, encoder_outputs),
                alignment_history=True,
                output_attention=False)  # [N, T_in, 256]

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                OutputProjectionWrapper(concat_cell, 256),
                ResidualWrapper(GRUCell(256)),
                ResidualWrapper(GRUCell(256))
            ],
                                        state_is_tuple=True)  # [N, T_in, 256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            self.output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.outputs_per_step)
            self.decoder_init_state = self.output_cell.zero_state(
                batch_size=self.batch_size, dtype=tf.float32)

            if is_training:
                self.helper = TacoTrainingHelper(inputs, mel_targets,
                                                 hp.num_mels,
                                                 hp.outputs_per_step)
            else:
                self.helper = TacoTestHelper(self.batch_size, hp.num_mels,
                                             hp.outputs_per_step)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(self.output_cell, self.helper,
                              self.decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [self.batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(mel_outputs, hp.num_mels,
                                     is_training)  # [N, T_out, 256]
            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])

    def update(self, hparams):
        with tf.variable_scope('inference') as scope:
            self._hparams = hparams
            hp = self._hparams
            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(self.output_cell, self.helper,
                              self.decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [self.batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(mel_outputs,
                                     hp.num_mels,
                                     is_training=False,
                                     is_updating=True)  # [N, T_out, 256]
            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq,
                                             reuse=True)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            log('Updated Tacotron model.')

    def add_loss(self):
        '''Adds loss to the model. Sets "loss" field. initialize must have been called.'''
        with tf.variable_scope('loss') as scope:
            hp = self._hparams
            self.mel_loss = tf.reduce_mean(
                tf.abs(self.mel_targets - self.mel_outputs))
            l1 = tf.abs(self.linear_targets - self.linear_outputs)
            # Prioritize loss for frequencies under 3000 Hz.
            n_priority_freq = int(3000 / (hp.sample_rate * 0.5) * hp.num_freq)
            self.linear_loss = 0.5 * tf.reduce_mean(l1) + 0.5 * tf.reduce_mean(
                l1[:, :, 0:n_priority_freq])
            self.loss = self.mel_loss + self.linear_loss
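            # e.g. (illustrative numbers, not from this config) with sample_rate=20000 and num_freq=1025:
            # n_priority_freq = int(3000 / 10000 * 1025) = 307, so half of the linear loss is averaged
            # over all bins and the other half only over the first 307 bins (roughly 0-3000 Hz).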

    def add_optimizer(self, global_step):
        '''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called.

    Args:
      global_step: int32 scalar Tensor representing current global step in training
    '''
        with tf.variable_scope('optimizer') as scope:
            hp = self._hparams
            if hp.decay_learning_rate:
                self.learning_rate = _learning_rate_decay(
                    hp.initial_learning_rate, global_step)
            else:
                self.learning_rate = tf.convert_to_tensor(
                    hp.initial_learning_rate)
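            # _learning_rate_decay is defined elsewhere and not shown in this excerpt. A hedged sketch of
            # one plausible schedule (an assumption, not necessarily what this codebase uses) is a
            # Noam-style warmup:
            #   step = tf.cast(global_step + 1, tf.float32)
            #   lr = init_lr * warmup_steps**0.5 * tf.minimum(step * warmup_steps**-1.5, step**-0.5)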
            optimizer = tf.train.AdamOptimizer(self.learning_rate,
                                               hp.adam_beta1, hp.adam_beta2)
            gradients, variables = zip(*optimizer.compute_gradients(self.loss))
            self.gradients = gradients
            clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0)

            # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See:
            # https://github.com/tensorflow/tensorflow/issues/1122
            with tf.control_dependencies(
                    tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                self.optimize = optimizer.apply_gradients(
                    zip(clipped_gradients, variables), global_step=global_step)
Example #15
    def __init__(self,
                 hparams,
                 is_training=False,
                 with_target=True,
                 reuse=False):
        self.with_target = with_target
        self.hparams = hparams
        self.is_training = is_training
        self.inputs = tf.placeholder(tf.int32, (None, None),
                                     name='graphemes_ph')
        self.input_lengths = tf.placeholder(tf.int32, [None],
                                            name='grapeheme_seq_len_ph')
        if with_target:
            self.targets = tf.placeholder(tf.int32, (None, None),
                                          name='phonemes_ph')
            self.target_lengths = tf.placeholder(tf.int32, [None],
                                                 name='phoneme_seq_len_ph')

        with tf.variable_scope('g2p', reuse=reuse):
            embedding_table = tf.get_variable(
                'embedding', [hparams.graphemes_num, hparams.embedding_dim],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            outputs = tf.nn.embedding_lookup(embedding_table, self.inputs)

            if hparams.with_conv:
                for i in range(hparams.conv_num):
                    outputs = conv1d(outputs, hparams.conv_width,
                                     hparams.conv_channels, tf.nn.relu,
                                     is_training, hparams.dropout_rate,
                                     'conv_%d' % i)

            forward_cell = rnn_cell(hparams.encoder_lstm_units // 2, hparams,
                                    is_training)
            backward_cell = rnn_cell(hparams.encoder_lstm_units // 2, hparams,
                                     is_training)
            outputs, encoder_state = tf.nn.bidirectional_dynamic_rnn(
                forward_cell,
                backward_cell,
                outputs,
                sequence_length=self.input_lengths,
                dtype=tf.float32,
                scope='bilstm')

            # Concatentate forward and backwards:
            encoder_outputs = tf.concat(outputs, axis=2)
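            # Each direction has encoder_lstm_units // 2 units, so the concat is [N, T, encoder_lstm_units].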

            decoder_cell = MultiRNNCell([
                rnn_cell(hparams.decoder_lstm_units, hparams, is_training),
                rnn_cell(hparams.decoder_lstm_units, hparams, is_training)
            ],
                                        state_is_tuple=True)
            decoder_embeddings = tf.get_variable(
                name='decoder_embeddings',
                shape=[hparams.phonemes_num, hparams.decoder_embedding_dim],
                dtype=tf.float32)

            if is_training:
                batch_size = tf.shape(self.inputs)[0]
                attention_cell = self.create_attention_cell(
                    hparams.attention_depth,
                    encoder_outputs,
                    self.input_lengths,
                    decoder_cell,
                    alignment_history=False)
                attention_cell = OutputProjectionWrapper(
                    attention_cell, hparams.phonemes_num)
                targets_shifted = self.targets[:, :-1]
                targets_emb = tf.nn.embedding_lookup(decoder_embeddings,
                                                     targets_shifted)
                helper = tf.contrib.seq2seq.TrainingHelper(
                    inputs=targets_emb, sequence_length=self.target_lengths)
                #decoder_initial_state = attention_cell.zero_state(batch_size, tf.float32).clone(cell_state=encoder_state)
                decoder_initial_state = attention_cell.zero_state(
                    batch_size, tf.float32)
                decoder = tf.contrib.seq2seq.BasicDecoder(
                    attention_cell, helper, decoder_initial_state)
                outputs, final_state, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder)
                self.decoded_best = tf.identity(outputs.sample_id,
                                                name='predicted_1best')
                self.logits = outputs.rnn_output
                self.probs = tf.nn.softmax(self.logits, name='probs')
            else:
                if self.hparams.beam_width == 1:
                    batch_size = tf.shape(self.inputs)[0]
                    attention_cell = self.create_attention_cell(
                        hparams.attention_depth,
                        encoder_outputs,
                        self.input_lengths,
                        decoder_cell,
                        alignment_history=False)
                    attention_cell = OutputProjectionWrapper(
                        attention_cell, hparams.phonemes_num)
                    helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                        embedding=decoder_embeddings,
                        start_tokens=tf.fill([batch_size],
                                             hparams.phonemes_num - 2),
                        end_token=hparams.phonemes_num - 1)
                    decoder_initial_state = attention_cell.zero_state(
                        batch_size, tf.float32)
                    decoder = tf.contrib.seq2seq.BasicDecoder(
                        attention_cell, helper, decoder_initial_state)
                    outputs, final_state, _ = tf.contrib.seq2seq.dynamic_decode(
                        decoder,
                        maximum_iterations=self.hparams.max_phoneme_seq_len)
                    self.decoded_best = tf.identity(outputs.sample_id,
                                                    name='predicted_1best')
                    self.logits = outputs.rnn_output
                    self.probs = tf.nn.softmax(self.logits, name='probs')
                else:
                    batch_size = tf.shape(self.inputs)[0]
                    start_tokens = tf.fill([batch_size],
                                           hparams.phonemes_num - 2)
                    batch_size = batch_size * hparams.beam_width
                    encoder_outputs = tf.contrib.seq2seq.tile_batch(
                        encoder_outputs, multiplier=hparams.beam_width)
                    input_lengths_tile = tf.contrib.seq2seq.tile_batch(
                        self.input_lengths, multiplier=hparams.beam_width)
                    encoder_state = tf.contrib.seq2seq.tile_batch(
                        encoder_state, multiplier=hparams.beam_width)

                    attention_cell = self.create_attention_cell(
                        hparams.attention_depth,
                        encoder_outputs,
                        input_lengths_tile,
                        decoder_cell,
                        alignment_history=False)
                    attention_cell = OutputProjectionWrapper(
                        attention_cell, hparams.phonemes_num)
                    decoder_initial_state = attention_cell.zero_state(
                        batch_size, tf.float32)
                    decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                        cell=attention_cell,
                        embedding=decoder_embeddings,
                        start_tokens=start_tokens,
                        end_token=hparams.phonemes_num - 1,
                        initial_state=decoder_initial_state,
                        beam_width=hparams.beam_width,
                        output_layer=None,
                        length_penalty_weight=hparams.length_penalty)
                    outputs, final_state, _ = tf.contrib.seq2seq.dynamic_decode(
                        decoder, maximum_iterations=hparams.max_iters)
                    self.logits = tf.no_op()
                    print(
                        '**Warning! You will not be able to build a lattice with beam_width > 1'
                    )
                    self.probs = tf.no_op()
                    # best beam
                    self.decoded_best = tf.identity(outputs.predicted_ids[:, :,
                                                                          0],
                                                    name='predicted_1best')
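                    # outputs.predicted_ids has shape [batch_size, max_time, beam_width], so taking
                    # index 0 on the last axis keeps the highest-scoring beam.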
Example #16
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   pml_targets=None,
                   is_training=False,
                   gta=False,
                   eal=False,
                   locked_alignments=None,
                   logs_enabled=True,
                   flag_trainAlign=False,
                   flag_trainJoint=False,
                   alignScale=1.0):
        '''Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
          pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is number of
            steps in the PML vocoder features trajectories, P is pml_dimension, and values are PML vocoder
            features. Only needed for training.
          is_training: boolean flag that is set to True during training
          gta: boolean flag that is set to True when ground truth alignment is required
          locked_alignments: when explicit attention alignment is required, the locked alignments are passed in this
            parameter and the attention alignments are locked to these values
          logs_enabled: boolean flag that defaults to True, if False no construction logs output
        '''
        # fix the alignments shape to (batch_size, encoder_steps, decoder_steps) if not already including
        # batch dimension
        locked_alignments_ = locked_alignments
        self.flag_trainAlign = flag_trainAlign
        self.flag_trainJoint = flag_trainJoint
        self.alignScale = alignScale

        if locked_alignments_ is not None:
            if is_training and eal:
                pass
            elif np.ndim(locked_alignments_) < 3:
                locked_alignments_ = np.expand_dims(locked_alignments_, 0)

        with tf.variable_scope('inference') as scope:
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            # Encoder
            prenet_outputs = prenet(
                embedded_inputs, is_training,
                hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
            encoder_outputs = encoder_cbhg(
                prenet_outputs,
                input_lengths,
                is_training,  # [N, T_in, encoder_depth=256]
                hp.encoder_depth)

            # Attention
            attention_cell = LockableAttentionWrapper(
                GRUCell(hp.attention_depth),
                LocationSensitiveAttention(hp.attention_depth,
                                           encoder_outputs),
                alignment_history=True,
                locked_alignments=locked_alignments_,
                output_attention=False,
                name='attention_wrapper',
                flag_trainAlign=self.flag_trainAlign,
                flag_trainJoint=self.flag_trainJoint
            )  # [N, T_in, attention_depth=256]

            # Apply prenet before concatenation in AttentionWrapper.
            prenet_cell = DecoderPrenetWrapper(attention_cell, is_training,
                                               hp.prenet_depths)

            # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                prenet_cell)  # [N, T_in, 2*attention_depth=512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell(
                [
                    OutputProjectionWrapper(concat_cell, hp.decoder_depth),
                    ResidualWrapper(GRUCell(hp.decoder_depth)),
                    ResidualWrapper(GRUCell(hp.decoder_depth))
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Project onto r PML feature vectors (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.pml_dimension * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if is_training:
                if gta:
                    helper = TacoTrainingHelper(inputs, pml_targets,
                                                hp.pml_dimension,
                                                hp.outputs_per_step)
                elif eal:
                    helper = TacoTrainingHelper_EAL(inputs, pml_targets,
                                                    hp.pml_dimension,
                                                    hp.outputs_per_step)
                elif hp.scheduled_sampling:
                    helper = TacoScheduledOutputTrainingHelper(
                        inputs, pml_targets, hp.pml_dimension,
                        hp.outputs_per_step, hp.scheduled_sampling_probability)
                else:
                    log('For training, one of these should be true: gta, eal, hp.scheduled_sampling'
                        )
            else:
                if gta:
                    helper = TacoTrainingHelper(inputs, pml_targets,
                                                hp.pml_dimension,
                                                hp.outputs_per_step)
                elif eal:
                    helper = TacoTrainingHelper_EAL(inputs, pml_targets,
                                                    hp.pml_dimension,
                                                    hp.outputs_per_step)
                else:
                    helper = TacoTestHelper(batch_size, hp.pml_dimension,
                                            hp.outputs_per_step)
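            # Mode summary (descriptive note derived from the helper selection above):
            #   is_training and gta                   -> TacoTrainingHelper
            #   is_training and eal                   -> TacoTrainingHelper_EAL
            #   is_training and hp.scheduled_sampling -> TacoScheduledOutputTrainingHelper
            #   not is_training, with gta or eal set  -> teacher-forced synthesis helpers
            #   otherwise                             -> TacoTestHelper (free-running inference)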

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

            # Reshape outputs to be one output per entry
            pml_intermediates = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(
                pml_intermediates,
                hp.pml_dimension,
                is_training,  # [N, T_out, postnet_depth=256]
                hp.postnet_depth)
            pml_outputs = tf.layers.dense(post_outputs,
                                          hp.pml_dimension)  # [N, T_out, P]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.pml_intermediates = pml_intermediates
            self.pml_outputs = pml_outputs
            self.alignments = alignments
            self.pml_targets = pml_targets
            self.attention_cell = attention_cell
            self.locked_alignments = locked_alignments_

            if logs_enabled:
                log('Initialized Tacotron model. Dimensions: ')
                log('  Train mode:              {}'.format(is_training))
                log('  GTA mode:                {}'.format(gta))
                log('  EAL mode:                {}'.format(eal))
                log('  Embedding:               {}'.format(
                    embedded_inputs.shape[-1]))
                log('  Prenet out:              {}'.format(
                    prenet_outputs.shape[-1]))
                log('  Encoder out:             {}'.format(
                    encoder_outputs.shape[-1]))
                log('  Attention out:           {}'.format(
                    attention_cell.output_size))
                log('  Concat attn & out:       {}'.format(
                    concat_cell.output_size))
                log('  Decoder cell out:        {}'.format(
                    decoder_cell.output_size))
                log('  Decoder out ({} frames):  {}'.format(
                    hp.outputs_per_step, decoder_outputs.shape[-1]))
                log('  Decoder out (1 frame):   {}'.format(
                    pml_intermediates.shape[-1]))
                log('  Postnet out:             {}'.format(
                    post_outputs.shape[-1]))
                log('  PML out:                 {}'.format(
                    pml_outputs.shape[-1]))
Example #17
    def initialize(self, inputs, vgg19_model_path, mel_targets=None, linear_targets=None):
        """Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          vgg19_model_path: File path to the npy file containing pretrained weights of the VGG19 model
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
        """
        with tf.variable_scope('inference') as _:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # VGG19
            self.vgg19_pretrained = Vgg19(vgg19_model_path)
            vgg_output = tf.map_fn(self.__preprocess_before_vgg19, inputs)

            last_fc_output_size = tf.shape(vgg_output)[1]
            input_lengths = tf.tile([last_fc_output_size], [batch_size])

            # Encoder
            prenet_outputs = prenet(vgg_output, is_training, hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
            encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training,  # [N, T_in, encoder_depth=256]
                                           hp.encoder_depth)

            # Attention
            attention_cell = AttentionWrapper(
                GRUCell(hp.attention_depth),
                BahdanauAttention(hp.attention_depth, encoder_outputs),
                alignment_history=True,
                output_attention=False)  # [N, T_in, attention_depth=256]

            # Apply prenet before concatenation in AttentionWrapper.
            attention_cell = DecoderPrenetWrapper(attention_cell, is_training, hp.prenet_depths)

            # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)  # [N, T_in, 2*attention_depth=512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                OutputProjectionWrapper(concat_cell, hp.decoder_depth),
                ResidualWrapper(GRUCell(hp.decoder_depth)),
                ResidualWrapper(GRUCell(hp.decoder_depth))
            ], state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

            (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(output_cell, helper, decoder_init_state),
                maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training,  # [N, T_out, postnet_depth=256]
                                     hp.postnet_depth)
            linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
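
# Usage sketch (illustrative, not part of the original example; `model`, `mel`
# and `lin` are hypothetical names). Training mode is selected simply by
# supplying linear_targets, because is_training = linear_targets is not None:
#
#   model.initialize(inputs, vgg19_model_path='/path/to/vgg19.npy',
#                    mel_targets=mel, linear_targets=lin)            # training graph
#   model.initialize(inputs, vgg19_model_path='/path/to/vgg19.npy')  # inference graph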
Example #18
    def __init__(self,
                 voice,
                 is_training,
                 eval_batch_size=1,
                 write_debug_files=False,
                 voice_path=VOICE_PATH,
                 tf_device=DEFAULT_DEVICE):

        self.voice = voice
        self.voice_path = voice_path % voice

        self.write_debug_files = write_debug_files

        self.hpfn = '%s/hparams.json' % self.voice_path
        with codecs.open(self.hpfn, 'r', 'utf8') as hpf:
            self.hp = json.loads(hpf.read())
        self.batch_size = self.hp[
            'batch_size'] if is_training else eval_batch_size

        max_num_frames = self.hp['max_iters'] * self.hp[
            'outputs_per_step'] * self.hp['frame_shift_ms'] * self.hp[
                'sample_rate'] / 1000
        n_fft, hop_length, win_length = audio.stft_parameters(self.hp)
        self.max_mfc_frames = 1 + int((max_num_frames - n_fft) / hop_length)
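        # Worked example of the frame-count arithmetic above (values are
        # illustrative assumptions, not taken from a shipped hparams.json):
        # with max_iters=200, outputs_per_step=5, frame_shift_ms=12.5 and
        # sample_rate=16000, max_num_frames = 200*5*12.5*16000/1000 = 200000
        # samples; with n_fft=2048 and hop_length=200,
        # max_mfc_frames = 1 + int((200000 - 2048) / 200) = 990.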

        with tf.device(tf_device):

            # self.inputs        = tf.placeholder(dtype = tf.int32, shape = [None, self.hp['max_inp_len']])
            # self.input_lengths = tf.placeholder(dtype = tf.int32, shape = [None])
            self.inputs = tf.placeholder(
                dtype=tf.int32,
                shape=[self.batch_size, self.hp['max_inp_len']])
            self.input_lengths = tf.placeholder(dtype=tf.int32,
                                                shape=[self.batch_size])
            logging.debug('inputs: %s' % self.inputs)
            logging.debug('input_lengths: %s' % self.input_lengths)

            # self.mel_targets    = tf.placeholder(tf.float32, [None, self.max_mfc_frames, self.hp['num_mels']], 'mel_targets')
            # self.linear_targets = tf.placeholder(tf.float32, [None, self.max_mfc_frames, self.hp['num_freq']], 'linear_targets')
            # self.target_lengths = tf.placeholder(tf.int32,   [None],                                           'target_lengths')
            self.mel_targets = tf.placeholder(
                tf.float32,
                [self.batch_size, self.max_mfc_frames, self.hp['num_mels']],
                'mel_targets')
            self.linear_targets = tf.placeholder(
                tf.float32,
                [self.batch_size, self.max_mfc_frames, self.hp['num_freq']],
                'linear_targets')
            self.target_lengths = tf.placeholder(tf.int32, [self.batch_size],
                                                 'target_lengths')
            logging.debug('mel_targets: %s' % self.mel_targets)
            logging.debug('linear_targets: %s' % self.linear_targets)
            logging.debug('target_lengths: %s' % self.target_lengths)

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(self.hp['alphabet']), self.hp['embed_depth']],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        logging.debug('embedding_table: %s' % embedding_table)
        embedded_inputs = tf.nn.embedding_lookup(
            embedding_table, self.inputs)  # [N, max_inp_len, 256]

        logging.debug('embedded_inputs: %s' % embedded_inputs)

        # Encoder
        prenet_outputs = _create_prenet(
            embedded_inputs, is_training,
            self.hp['prenet_depths'])  # [N, max_inp_len, 128]
        logging.debug('prenet_outputs: %s' % prenet_outputs)

        encoder_outputs = _create_encoder_cbhg(
            prenet_outputs,
            self.input_lengths,
            is_training,  # [N, max_inp_len, 256]
            self.hp['encoder_depth'])
        logging.debug('encoder_outputs: %s' % encoder_outputs)

        # Attention

        attention_cell = AttentionWrapper(
            GRUCell(self.hp['attention_depth']),
            BahdanauAttention(self.hp['attention_depth'], encoder_outputs),
            alignment_history=True,
            output_attention=False)  # [N, T_in, attention_depth=256]
        logging.debug('attention_cell: %s' % attention_cell)

        # Apply prenet before concatenation in AttentionWrapper.
        attention_cell = DecoderPrenetWrapper(attention_cell, is_training,
                                              self.hp['prenet_depths'])
        logging.debug('attention_cell: %s' % attention_cell)

        # Concatenate attention context vector and RNN cell output into a 512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(
            attention_cell)  # [N, max_inp_len, 512]
        logging.debug('concat_cell: %s' % concat_cell)

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell(
            [
                OutputProjectionWrapper(concat_cell, 256),
                ResidualWrapper(GRUCell(256)),
                ResidualWrapper(GRUCell(256))
            ],
            state_is_tuple=True)  # [N, max_inp_len, 256]
        logging.debug('decoder_cell: %s' % decoder_cell)

        # Notation mapping (reference name -> name used here):
        #   T_in        -> max_inp_len
        #   M           -> hp.num_mels
        #   r           -> hp.outputs_per_step
        #   mel_targets -> frame_targets
        #   max_iters   -> max_iters

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(
            decoder_cell, self.hp['num_mels'] * self.hp['outputs_per_step'])
        logging.debug('output_cell: %s' % output_cell)

        decoder_init_state = output_cell.zero_state(batch_size=self.batch_size,
                                                    dtype=tf.float32)
        logging.debug('decoder_init_state: %s' % repr(decoder_init_state))

        if is_training:
            helper = TacoTrainingHelper(self.inputs, self.mel_targets,
                                        self.hp['num_mels'],
                                        self.hp['outputs_per_step'],
                                        self.target_lengths)
        else:
            helper = TacoTestHelper(self.batch_size, self.hp['num_mels'],
                                    self.hp['outputs_per_step'])
        logging.debug('helper: %s' % helper)

        (decoder_outputs,
         _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
             BasicDecoder(output_cell, helper, decoder_init_state),
             maximum_iterations=self.hp['max_iters'])  # [N, T_out/r, M*r]
        logging.debug('decoder_outputs: %s' % decoder_outputs)
        logging.debug('final_decoder_state: %s' % repr(final_decoder_state))

        # Reshape outputs to be one output per entry
        self.mel_outputs = tf.reshape(
            decoder_outputs,
            [self.batch_size, -1, self.hp['num_mels']])  # [N, T_out, M]
        logging.debug('mel_outputs: %s' % self.mel_outputs)

        # Add post-processing CBHG:
        post_outputs = _create_post_cbhg(
            self.mel_outputs,  # [N, T_out, postnet_depth=256]
            self.hp['num_mels'],
            is_training,
            self.hp['postnet_depth'])
        logging.debug('post_outputs: %s' % post_outputs)
        self.linear_outputs = tf.layers.dense(
            post_outputs, self.hp['num_freq'])  # [N, T_out, F]
        logging.debug('linear_outputs: %s' % self.linear_outputs)

        # Grab alignments from the final decoder state:
        self.alignments = tf.transpose(
            final_decoder_state[0].alignment_history.stack(), [1, 2, 0])
        logging.debug('alignments: %s' % self.alignments)

        if is_training:

            self.global_step = tf.Variable(0,
                                           name='global_step',
                                           trainable=False)

            with tf.variable_scope('loss') as scope:
                mel_loss = tf.reduce_mean(
                    tf.abs(self.mel_targets - self.mel_outputs))
                l1 = tf.abs(self.linear_targets - self.linear_outputs)
                # Prioritize loss for frequencies under 3000 Hz.
                n_priority_freq = int(3000 / (self.hp['sample_rate'] * 0.5) *
                                      self.hp['num_freq'])
                linear_loss = 0.5 * tf.reduce_mean(l1) + 0.5 * tf.reduce_mean(
                    l1[:, :, 0:n_priority_freq])
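                # Worked example (assumed values, not read from any shipped
                # hparams.json): with sample_rate=16000 and num_freq=1025,
                # n_priority_freq = int(3000 / 8000 * 1025) = 384, so bins
                # below ~3 kHz contribute to both loss terms above.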
                self.loss = mel_loss + linear_loss

            with tf.variable_scope('optimizer') as scope:
                learning_rate = tf.train.exponential_decay(
                    self.hp['initial_learning_rate'], self.global_step,
                    self.hp['learning_rate_decay_halflife'], 0.5)
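                # With a decay rate of 0.5, tf.train.exponential_decay halves
                # the learning rate every `learning_rate_decay_halflife` steps,
                # e.g. an assumed initial_learning_rate=0.002 with a halflife
                # of 100000 steps gives 0.001 at step 100k and 0.0005 at 200k.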
                optimizer = tf.train.AdamOptimizer(learning_rate,
                                                   self.hp['adam_beta1'],
                                                   self.hp['adam_beta2'])
                gradients, variables = zip(
                    *optimizer.compute_gradients(self.loss))
                clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0)
                # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See:
                # https://github.com/tensorflow/tensorflow/issues/1122
                with tf.control_dependencies(
                        tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                    self.optimize = optimizer.apply_gradients(
                        zip(clipped_gradients, variables),
                        global_step=self.global_step)

        self.saver = tf.train.Saver(max_to_keep=5,
                                    keep_checkpoint_every_n_hours=2)

        self.sess = tf.Session()

        self.cpfn = '%s/cp' % self.voice_path

        # find latest checkpoint

        latest_cp = tf.train.latest_checkpoint(self.cpfn)
        self.epoch_start = 0
        if latest_cp:
            logging.debug('restoring variables from %s ...' % latest_cp)
            self.saver.restore(self.sess, latest_cp)

            # extract epoch number from filename
            self.epoch_start = int(
                os.path.basename(latest_cp).split('-')[0][2:]) + 1

        else:
            self.cpfn = '%s/model' % self.voice_path
            if os.path.exists('%s.index' % self.cpfn):
                logging.debug('restoring variables from %s ...' % self.cpfn)
                self.saver.restore(self.sess, self.cpfn)
            else:
                if is_training:
                    logging.debug(
                        'couldn\'t restore variables from %s -> initializing fresh training run.'
                        % self.cpfn)
                    self.sess.run(tf.global_variables_initializer())
                else:
                    raise Exception("couldn't load model from %s" % self.cpfn)
Example #19
    def initialize(self, inputs, input_lengths, num_speakers, speaker_id,
            mel_targets=None, linear_targets=None, loss_coeff=None,
            rnn_decoder_test_mode=False, is_randomly_initialized=False):
        '''Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
        '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            self.batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, 512]

            # Encoder
            encoder_outputs = conv_and_lstm(
                embedded_inputs,
                input_lengths,
                conv_layers=hp.encoder_conv_layers,
                conv_width=hp.encoder_conv_width,
                conv_channels=hp.encoder_conv_channels,
                lstm_units=hp.encoder_lstm_units,
                is_training=is_training,
                scope='encoder')  # [N, T_in, 512]

            # Attention
            # For manual control of attention
            self.is_manual_attention = tf.placeholder(
                tf.bool, shape=(), name='is_manual_attention',
            )
            self.manual_alignments = tf.placeholder(
                tf.float32, shape=[None, None, None], name="manual_alignments",
            )
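            # Feed-dict sketch for the manual-attention placeholders above
            # (illustrative assumption about how they are meant to be fed; the
            # exact alignment tensor layout is not specified here):
            #   feed_dict = {model.is_manual_attention: True,
            #                model.manual_alignments: precomputed_alignments}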

            attention_cell = AttentionWrapper(
                DecoderPrenetWrapper(LSTMBlockCell(hp.attention_depth), is_training),
                LocationSensitiveAttention(hp.attention_depth, encoder_outputs),
                alignment_history=True,
                output_attention=False)  # [N, T_in, 128]

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)  # [N, T_in, 512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                concat_cell,
                LSTMBlockCell(hp.decoder_lstm_units),
                LSTMBlockCell(hp.decoder_lstm_units)
            ], state_is_tuple=True)  # [N, T_in, 1024]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
            else:
                helper = TacoTestHelper(self.batch_size, hp.num_mels, hp.outputs_per_step)

            decoder_init_state = output_cell.zero_state(batch_size=self.batch_size, dtype=tf.float32)
            (multi_decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(output_cell, helper, decoder_init_state),
                maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry                                [N, T_out, M]
            decoder_outputs = tf.reshape(multi_decoder_outputs, [self.batch_size, -1, hp.num_mels])

            # Postnet: predicts a residual
            postnet_outputs = postnet(
                decoder_outputs,
                layers=hp.postnet_conv_layers,
                conv_width=hp.postnet_conv_width,
                channels=hp.postnet_conv_channels,
                is_training=is_training)
            mel_outputs = decoder_outputs + postnet_outputs

            # Convert to linear using a similar architecture as the encoder:
            expand_outputs = conv_and_lstm(
                mel_outputs,
                None,
                conv_layers=hp.expand_conv_layers,
                conv_width=hp.expand_conv_width,
                conv_channels=hp.expand_conv_channels,
                lstm_units=hp.expand_lstm_units,
                is_training=is_training,
                scope='expand')  # [N, T_in, 512]
            linear_outputs = tf.layers.dense(expand_outputs, hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_outputs = decoder_outputs
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  expand out:              %d' % expand_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
  def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
      is_training = linear_targets is not None
      batch_size = tf.shape(inputs)[0]
      hp = self._hparams

      # Embeddings
      embedding_table = tf.get_variable(
        'embedding', [len(symbols), 256], dtype=tf.float32,
        initializer=tf.truncated_normal_initializer(stddev=0.5))
      embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)           # [N, T_in, 256]

      # Global style tokens (GST). When using h attention heads, we set the
      # token embedding size to 256/h and concatenate the attention outputs
      # of each head.
      gst_tokens = tf.get_variable(
        'style_tokens', [hp.num_gst, 256 // hp.num_heads], dtype=tf.float32,
        initializer=tf.truncated_normal_initializer(stddev=0.5))
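      # Sizing sketch (the hp.num_heads and hp.num_gst values are assumptions):
      # with num_heads=4 and num_gst=10, gst_tokens is [10, 64]; per the note
      # above, the 4 heads' attention outputs are concatenated back to 256-D.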

      # Encoder
      prenet_outputs = prenet(embedded_inputs, is_training)                       # [N, T_in, 128]
      encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training)  # [N, T_in, 256]

      if is_training:
        # Reference encoder
        reference_embedding = reference_encoder(
          mel_targets,
          filters=[32, 32, 64, 64, 128, 128],
          kernel_size=(3, 3),
          strides=(2, 2),
          is_training=is_training)

        # Style token layer
        style_embedding = multi_head_attention(
          num_heads=hp.num_heads,
          queries=tf.expand_dims(reference_embedding, axis=1),                    # [N, 1, 128]
          memory=tf.tile(tf.expand_dims(gst_tokens, axis=0), [batch_size, 1, 1]), # [N, hp.num_gst, 256 // hp.num_heads]
          num_units=128)
      else:
        # TODO Add support for reference mode and more effective style control during inference.
        # Randomly select style embedding from gst_tokens for simplicity.
        random_index = tf.random_uniform([batch_size], maxval=hp.num_gst, dtype=tf.int32)
        style_embedding = tf.nn.embedding_lookup(gst_tokens, random_index)

      # Add style embedding to every text encoder state, applying tanh to
      # compress both encoder state and style embedding to the same scale.
      encoder_outputs += tf.nn.tanh(style_embedding)

      # Attention
      attention_cell = AttentionWrapper(
        DecoderPrenetWrapper(GRUCell(256), is_training),
        BahdanauAttention(256, encoder_outputs, memory_sequence_length=input_lengths),
        alignment_history=True,
        output_attention=False)                                                  # [N, T_in, 256]

      # Concatenate attention context vector and RNN cell output into a 512D vector.
      concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)              # [N, T_in, 512]

      # Decoder (layers specified bottom to top):
      decoder_cell = MultiRNNCell([
        OutputProjectionWrapper(concat_cell, 256),
        ResidualWrapper(ZoneoutWrapper(LSTMBlockCell(256), (0.1, 0.1), is_training)),
        ResidualWrapper(ZoneoutWrapper(LSTMBlockCell(256), (0.1, 0.1), is_training)),
      ], state_is_tuple=True)                                                  # [N, T_in, 256]

      # Project onto r mel spectrograms (predict r outputs at each RNN step):
      output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
      decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

      if is_training:
        helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
      else:
        helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

      if is_training:
        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
          BasicDecoder(output_cell, helper, decoder_init_state))
      else:
        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
          BasicDecoder(output_cell, helper, decoder_init_state),
          maximum_iterations=hp.max_iters)                                      # [N, T_out/r, M*r]

      # Reshape outputs to be one output per entry
      mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

      # Add post-processing CBHG:
      post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)           # [N, T_out, 256]
      linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)               # [N, T_out, F]

      # Grab alignments from the final decoder state:
      alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

      self.inputs = inputs
      self.input_lengths = input_lengths
      self.mel_outputs = mel_outputs
      self.linear_outputs = linear_outputs
      self.alignments = alignments
      self.mel_targets = mel_targets
      self.linear_targets = linear_targets
      tf.logging.info('Initialized Tacotron model. Dimensions: ')
      tf.logging.info('  embedding:               %d' % embedded_inputs.shape[-1])
      tf.logging.info('  prenet out:              %d' % prenet_outputs.shape[-1])
      tf.logging.info('  encoder out:             %d' % encoder_outputs.shape[-1])
      tf.logging.info('  attention out:           %d' % attention_cell.output_size)
      tf.logging.info('  concat attn & out:       %d' % concat_cell.output_size)
      tf.logging.info('  decoder cell out:        %d' % decoder_cell.output_size)
      tf.logging.info('  decoder out (%d frames):  %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
      tf.logging.info('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
      tf.logging.info('  postnet out:             %d' % post_outputs.shape[-1])
      tf.logging.info('  linear out:              %d' % linear_outputs.shape[-1])
    def initialize(self,
                   inputs,
                   input_lengths,
                   num_speakers,
                   speaker_id=None,
                   mel_targets=None,
                   linear_targets=None,
                   is_training=False,
                   loss_coeff=None,
                   stop_token_targets=None):

        with tf.variable_scope('Eembedding') as scope:
            hp = self._hparams
            batch_size = tf.shape(inputs)[0]

            # Embeddings(256)
            char_embed_table = tf.get_variable(
                'inputs_embedding', [len(symbols), hp.embedding_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))

            zero_pad = True
            if zero_pad:  # logic adapted from a Transformer implementation
                # The <PAD> token (index 0) gets a fixed all-zero embedding that is never
                # updated during training, i.e. the first row of the variable created above
                # (the one corresponding to <PAD>) is effectively unused.
                char_embed_table = tf.concat(
                    (tf.zeros(shape=[1, hp.embedding_size]),
                     char_embed_table[1:, :]), 0)

            # [N, T_in, embedding_size]
            char_embedded_inputs = tf.nn.embedding_lookup(
                char_embed_table, inputs)

            self.num_speakers = num_speakers
            if self.num_speakers > 1:
                speaker_embed_table = tf.get_variable(
                    'speaker_embedding',
                    [self.num_speakers, hp.speaker_embedding_size],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
                # [N, T_in, speaker_embedding_size]
                speaker_embed = tf.nn.embedding_lookup(speaker_embed_table,
                                                       speaker_id)

                deep_dense = lambda x, dim, name: tf.layers.dense(
                    x, dim, activation=tf.nn.softsign, name=name
                )  # softsign: x / (abs(x) + 1)

                encoder_rnn_init_state = deep_dense(
                    speaker_embed, hp.encoder_lstm_units * 4,
                    'encoder_init_dense')  # hp.encoder_lstm_units = 256

                decoder_rnn_init_states = [
                    deep_dense(speaker_embed, hp.decoder_lstm_units * 2,
                               'decoder_init_dense_{}'.format(i))
                    for i in range(hp.decoder_layers)
                ]  # hp.decoder_lstm_units = 1024

                speaker_embed = None
            else:
                # single-speaker case (self.num_speakers == 1)
                speaker_embed = None
                encoder_rnn_init_state = None  # initial state of the bidirectional encoder RNN
                attention_rnn_init_state = None
                decoder_rnn_init_states = None

        with tf.variable_scope('Encoder') as scope:
            ##############
            # Encoder
            ##############
            x = char_embedded_inputs
            for i in range(hp.enc_conv_num_layers):
                x = tf.layers.conv1d(x,
                                     filters=hp.enc_conv_channels,
                                     kernel_size=hp.enc_conv_kernel_size,
                                     padding='same',
                                     activation=tf.nn.relu,
                                     name='Encoder_{}'.format(i))
                x = tf.layers.batch_normalization(x, training=is_training)
                x = tf.layers.dropout(x,
                                      rate=hp.dropout_prob,
                                      training=is_training,
                                      name='dropout_{}'.format(i))

            if encoder_rnn_init_state is not None:
                initial_state_fw_c, initial_state_fw_h, initial_state_bw_c, initial_state_bw_h = tf.split(
                    encoder_rnn_init_state, 4, 1)
                initial_state_fw = LSTMStateTuple(initial_state_fw_c,
                                                  initial_state_fw_h)
                initial_state_bw = LSTMStateTuple(initial_state_bw_c,
                                                  initial_state_bw_h)
            else:  # single mode
                initial_state_fw, initial_state_bw = None, None

            cell_fw = ZoneoutLSTMCell(
                hp.encoder_lstm_units,
                is_training,
                zoneout_factor_cell=hp.tacotron_zoneout_rate,
                zoneout_factor_output=hp.tacotron_zoneout_rate,
                name='encoder_fw_LSTM')
            cell_bw = ZoneoutLSTMCell(
                hp.encoder_lstm_units,
                is_training,
                zoneout_factor_cell=hp.tacotron_zoneout_rate,
                zoneout_factor_output=hp.tacotron_zoneout_rate,
                name='encoder_bw_LSTM')
            encoder_conv_output = x
            outputs, states = tf.nn.bidirectional_dynamic_rnn(
                cell_fw,
                cell_bw,
                encoder_conv_output,
                sequence_length=input_lengths,
                initial_state_fw=initial_state_fw,
                initial_state_bw=initial_state_bw,
                dtype=tf.float32)

            # encoder_outputs = [N,T,2*encoder_lstm_units] = [N,T,512]
            encoder_outputs = tf.concat(
                outputs,
                axis=2)  # Concat and return forward + backward outputs

        with tf.variable_scope('Decoder') as scope:

            ##############
            # Attention
            ##############
            if hp.attention_type == 'bah_mon':
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=False)
            elif hp.attention_type == 'bah_mon_norm':  # added by hccho
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'loc_sen':  # Location Sensitivity Attention
                attention_mechanism = LocationSensitiveAttention(
                    hp.attention_size,
                    encoder_outputs,
                    hparams=hp,
                    is_training=is_training,
                    mask_encoder=hp.mask_encoder,
                    memory_sequence_length=input_lengths,
                    smoothing=hp.smoothing,
                    cumulate_weights=hp.cumulative_weights)
            elif hp.attention_type == 'gmm':  # GMM Attention
                attention_mechanism = GmmAttention(
                    hp.attention_size,
                    memory=encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'bah_norm':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'luong_scaled':
                attention_mechanism = LuongAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    scale=True)
            elif hp.attention_type == 'luong':
                attention_mechanism = LuongAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'bah':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            else:
                raise Exception(" [!] Unkown attention type: {}".format(
                    hp.attention_type))

            decoder_lstm = [
                ZoneoutLSTMCell(hp.decoder_lstm_units,
                                is_training,
                                zoneout_factor_cell=hp.tacotron_zoneout_rate,
                                zoneout_factor_output=hp.tacotron_zoneout_rate,
                                name='decoder_LSTM_{}'.format(i + 1))
                for i in range(hp.decoder_layers)
            ]

            decoder_lstm = tf.contrib.rnn.MultiRNNCell(decoder_lstm,
                                                       state_is_tuple=True)
            decoder_init_state = decoder_lstm.zero_state(
                batch_size=batch_size, dtype=tf.float32
            )  # calling zero_state here also picks up the state already injected via the AttentionWrapper

            if hp.model_type == "multi-speaker":

                decoder_init_state = list(decoder_init_state)

                for idx, cell in enumerate(decoder_rnn_init_states):
                    shape1 = decoder_init_state[idx][0].get_shape().as_list()
                    shape2 = cell.get_shape().as_list()
                    if shape1[1] * 2 != shape2[1]:
                        raise Exception(
                            " [!] Shape {} and {} should be equal".format(
                                shape1, shape2))
                    c, h = tf.split(cell, 2, 1)
                    decoder_init_state[idx] = LSTMStateTuple(c, h)

                decoder_init_state = tuple(decoder_init_state)

            attention_cell = AttentionWrapper(
                decoder_lstm,
                attention_mechanism,
                initial_cell_state=decoder_init_state,
                alignment_history=True,
                output_attention=False
            )  # note output_attention=False and that attention_layer_size is not set, so the attention output is the raw context vector

            # attention_state_size = 256
            # Decoder input -> prenet -> decoder_lstm -> concat[output, attention]
            dec_prenet_outputs = DecoderWrapper(attention_cell, is_training,
                                                hp.dec_prenet_sizes,
                                                hp.dropout_prob,
                                                hp.inference_prenet_dropout)

            dec_outputs_cell = OutputProjectionWrapper(
                dec_prenet_outputs, (hp.num_mels + 1) * hp.reduction_factor)

            if is_training:
                helper = TacoTrainingHelper(
                    mel_targets, hp.num_mels,
                    hp.reduction_factor)  # inputs is only used to compute the batch size
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.reduction_factor)

            decoder_init_state = dec_outputs_cell.zero_state(
                batch_size=batch_size, dtype=tf.float32)
            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(dec_outputs_cell, helper, decoder_init_state),
                 maximum_iterations=int(hp.max_n_frame /
                                        hp.reduction_factor))  # max_iters=200

            decoder_mel_outputs = tf.reshape(
                decoder_outputs[:, :, :hp.num_mels * hp.reduction_factor],
                [batch_size, -1, hp.num_mels
                 ])  # [N,iters,400] -> [N,5*iters,80]
            stop_token_outputs = tf.reshape(
                decoder_outputs[:, :, hp.num_mels * hp.reduction_factor:],
                [batch_size, -1])  # [N,iters]
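            # Shape sketch, using the example values from the inline comments
            # above (num_mels=80 and reduction_factor=5 are assumptions): the
            # decoder emits (80 + 1) * 5 = 405 channels per step; the first
            # 400 unfold into 5 mel frames of 80 dims, and the last 5 are the
            # per-frame stop-token logits.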

            # Postnet
            x = decoder_mel_outputs
            for i in range(hp.postnet_num_layers):
                activation = tf.nn.tanh if i != (hp.postnet_num_layers -
                                                 1) else None
                x = tf.layers.conv1d(x,
                                     filters=hp.postnet_channels,
                                     kernel_size=hp.postnet_kernel_size,
                                     padding='same',
                                     activation=activation,
                                     name='Postnet_{}'.format(i))
                x = tf.layers.batch_normalization(x, training=is_training)
                x = tf.layers.dropout(x,
                                      rate=hp.dropout_prob,
                                      training=is_training,
                                      name='Postnet_dropout_{}'.format(i))

            residual = tf.layers.dense(x,
                                       hp.num_mels,
                                       name='residual_projection')
            mel_outputs = decoder_mel_outputs + residual

            # Add post-processing CBHG:
            # mel_outputs: (N,T,num_mels)
            post_outputs = cbhg(mel_outputs,
                                None,
                                is_training,
                                hp.post_bank_size,
                                hp.post_bank_channel_size,
                                hp.post_maxpool_width,
                                hp.post_highway_depth,
                                hp.post_rnn_size,
                                hp.post_proj_sizes,
                                hp.post_proj_width,
                                scope='post_cbhg')

            linear_outputs = tf.layers.dense(
                post_outputs, hp.num_freq,
                name='linear_spectogram_projection')  # [N, T_out, F(1025)]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(),
                [1, 2, 0
                 ])  # batch_size, text length(encoder), target length(decoder)

            self.inputs = inputs
            self.speaker_id = speaker_id
            self.input_lengths = input_lengths
            self.loss_coeff = loss_coeff
            self.decoder_mel_outputs = decoder_mel_outputs
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.final_decoder_state = final_decoder_state
            self.stop_token_targets = stop_token_targets
            self.stop_token_outputs = stop_token_outputs
            self.all_vars = tf.trainable_variables()
            log('=' * 40)
            log(' model_type: %s' % hp.model_type)
            log('=' * 40)

            log('Initialized Tacotron model. Dimensions: ')
            log('    embedding:                %d' %
                char_embedded_inputs.shape[-1])
            log('    encoder conv out:               %d' %
                encoder_conv_output.shape[-1])
            log('    encoder out:              %d' % encoder_outputs.shape[-1])
            log('    attention out:            %d' %
                attention_cell.output_size)
            log('    decoder prenet lstm concat out :        %d' %
                dec_prenet_outputs.output_size)
            log('    decoder cell out:         %d' %
                dec_outputs_cell.output_size)
            log('    decoder out (%d frames):  %d' %
                (hp.reduction_factor, decoder_outputs.shape[-1]))
            log('    decoder mel out:    %d' % decoder_mel_outputs.shape[-1])
            log('    mel out:    %d' % mel_outputs.shape[-1])
            log('    postnet out:              %d' % post_outputs.shape[-1])
            log('    linear out:               %d' % linear_outputs.shape[-1])
            log('  Tacotron Parameters       {:.3f} Million.'.format(
                np.sum(
                    [np.prod(v.get_shape().as_list())
                     for v in self.all_vars]) / 1000000))
Example #22
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None):
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embedding
            embedding_table = tf.get_variable(
                'embedding', [hp.len_symbols, hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            # Encoder
            prenet_outputs = prenet(
                embedded_inputs, is_training,
                hp.prenet_depths)  # [N, T_in, prenet_depths=128]
            encoder_outputs = encoder_cbhg(
                prenet_outputs,
                input_lengths,
                is_training,  # [N, T_in, encoder_depth=256]
                hp.encoder_depth)

            # Attention
            attention_cell = AttentionWrapper(
                GRUCell(hp.attention_depth),
                BahdanauAttention(hp.attention_depth, encoder_outputs),
                alignment_history=True,
                output_attention=False)  # [N, T_in, attention_depth=256]

            # Apply prenet before concatenation in AttentionWrapper.
            attention_cell = DecoderPrenetWrapper(attention_cell, is_training,
                                                  hp.prenet_depths)

            # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 2*attention_depth=512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell(
                [
                    OutputProjectionWrapper(concat_cell, hp.decoder_depth),
                    ResidualWrapper(GRUCell(hp.decoder_depth)),
                    ResidualWrapper(GRUCell(hp.decoder_depth))
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.outputs_per_step)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(
                mel_outputs,
                hp.num_mels,
                is_training,  # [N, T_out, postnet_depth=256]
                hp.postnet_depth)
            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
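    # Usage sketch (illustrative; `model` is a hypothetical instance of the
    # enclosing class): supplying linear_targets builds the training graph
    # with TacoTrainingHelper, omitting them builds the inference graph with
    # TacoTestHelper.
    #   model.initialize(inputs, input_lengths, mel_targets, linear_targets)  # training
    #   model.initialize(inputs, input_lengths)                               # inference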
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   pml_targets=None,
                   is_training=False,
                   gta=False,
                   locked_alignments=None,
                   logs_enabled=True):
        """
        Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
          pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is number of
            steps in the PML vocoder features trajectories, P is pml_dimension, and values are PML vocoder
            features. Only needed for training.
          is_training: boolean flag that is set to True during training
          gta: boolean flag that is set to True when ground truth alignment is required
          locked_alignments: when explicit attention alignment is required, the locked alignments are passed in this
            parameter and the attention alignments are locked to these values
          logs_enabled: boolean flag that defaults to True, if False no construction logs output
        """
        # fix the alignments shape to (batch_size, encoder_steps, decoder_steps) if not already including
        # batch dimension
        locked_alignments_ = locked_alignments

        if locked_alignments_ is not None:
            if np.ndim(locked_alignments_) < 3:
                locked_alignments_ = np.expand_dims(locked_alignments_, 0)

        with tf.variable_scope('inference') as scope:
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embedding_dim],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            # Encoder
            encoder_outputs = conv_and_gru(  # [N, T_in, 2*encoder_gru_units=512]
                embedded_inputs,
                input_lengths,
                conv_layers=hp.encoder_conv_layers,
                conv_width=hp.encoder_conv_width,
                conv_channels=hp.encoder_conv_channels,
                gru_units_unidirectional=hp.encoder_gru_units,
                is_training=is_training,
                scope='encoder',
            )

            # Attention
            attention_cell = LockableAttentionWrapper(
                GRUCell(hp.attention_depth),
                LocationSensitiveAttention(hp.attention_depth,
                                           encoder_outputs),
                alignment_history=True,
                locked_alignments=locked_alignments_,
                output_attention=False,
                name='attention_wrapper')  # [N, T_in, attention_depth=256]

            # Concatenate attention context vector and RNN cell output into a
            # 2*attention_depth=512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 2*attention_depth=512]

            # Decoder (layers specified bottom to top):
            cells = [
                GRUCell(hp.decoder_gru_units)
                for _ in range(hp.decoder_gru_layers)
            ]
            decoder_cell = MultiRNNCell(
                [concat_cell] + cells,
                state_is_tuple=True)  # [N, T_in, decoder_depth=1024]

            # Project onto r PML feature vectors (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.pml_dimension * hp.outputs_per_step)

            if is_training or gta:
                if hp.scheduled_sampling:
                    helper = TacoScheduledOutputTrainingHelper(
                        inputs, pml_targets, hp.pml_dimension,
                        hp.outputs_per_step, hp.scheduled_sampling_probability)
                else:
                    helper = TacoTrainingHelper(inputs, pml_targets,
                                                hp.pml_dimension,
                                                hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.pml_dimension,
                                        hp.outputs_per_step)

            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

            # Reshape outputs to be one output per entry
            pml_intermediates = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

            # Add Post-Processing Conv and GRU layer:
            expand_outputs = conv_and_gru(  # [N, T_out, 2*expand_gru_units=512]
                pml_intermediates,
                None,
                conv_layers=hp.expand_conv_layers,
                conv_width=hp.expand_conv_width,
                conv_channels=hp.expand_conv_channels,
                gru_units_unidirectional=hp.expand_gru_units,
                is_training=is_training,
                scope='expand',
            )

            pml_outputs = tf.layers.dense(expand_outputs,
                                          hp.pml_dimension)  # [N, T_out, P]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.pml_intermediates = pml_intermediates
            self.pml_outputs = pml_outputs
            self.alignments = alignments
            self.pml_targets = pml_targets

            if logs_enabled:
                log('Initialized Tacotron model. Dimensions: ')
                log('  Train mode:              {}'.format(is_training))
                log('  GTA mode:                {}'.format(gta))
                log('  Embedding:               {}'.format(
                    embedded_inputs.shape[-1]))
                log('  Encoder out:             {}'.format(
                    encoder_outputs.shape[-1]))
                log('  Attention out:           {}'.format(
                    attention_cell.output_size))
                log('  Concat attn & out:       {}'.format(
                    concat_cell.output_size))
                log('  Decoder cell out:        {}'.format(
                    decoder_cell.output_size))
                log('  Decoder out ({} frames):  {}'.format(
                    hp.outputs_per_step, decoder_outputs.shape[-1]))
                log('  Decoder out (1 frame):   {}'.format(
                    pml_intermediates.shape[-1]))
                log('  Expand out:              {}'.format(
                    expand_outputs.shape[-1]))
                log('  PML out:                 {}'.format(
                    pml_outputs.shape[-1]))
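
The method above accepts a locked_alignments matrix and adds the batch dimension itself when a 2-D
array is supplied. A minimal NumPy sketch of preparing such a matrix; the Tacotron wrapper class and
the pml_feats name in the commented call are assumptions, not part of the snippet above.

import numpy as np

# Fake alignment for one utterance: rows index encoder steps, columns index
# decoder steps, and each column sums to 1.
T_in, T_out = 60, 200
single_alignment = np.random.dirichlet(np.ones(T_in), size=T_out).T  # (T_in, T_out)

# initialize() expands the array to (1, T_in, T_out) when ndim < 3, so either
# form can be passed as locked_alignments:
locked = single_alignment
locked_batched = np.expand_dims(single_alignment, 0)

# model = Tacotron(hparams)  # hypothetical wrapper owning _hparams
# model.initialize(inputs, input_lengths, pml_targets=pml_feats,
#                  is_training=False, gta=True, locked_alignments=locked)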
Example #24
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   reference_mel=None):
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            is_teacher_force_generating = mel_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'text_embedding', [len(symbols), 256],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(embedding_table,
                                                     inputs)  # [N, T_in, 256]

            if hp.use_gst:
                #Global style tokens (GST)
                gst_tokens = tf.get_variable(
                    'style_tokens', [hp.num_gst, 256 // hp.num_heads],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
                self.gst_tokens = gst_tokens

            # Encoder
            prenet_outputs = prenet(embedded_inputs,
                                    is_training)  # [N, T_in, 128]
            encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths,
                                           is_training)  # [N, T_in, 256]

            if is_training:
                reference_mel = mel_targets

            if reference_mel is not None:
                # Reference encoder
                refnet_outputs = reference_encoder(
                    reference_mel,
                    filters=[32, 32, 64, 64, 128, 128],
                    kernel_size=(3, 3),
                    strides=(2, 2),
                    encoder_cell=GRUCell(128),
                    is_training=is_training)  # [N, 128]
                self.refnet_outputs = refnet_outputs

                if hp.use_gst:
                    # Style attention
                    style_attention = MultiheadAttention(
                        tf.tanh(tf.expand_dims(refnet_outputs,
                                               axis=1)),  # [N, 1, 128]
                        tf.tile(tf.expand_dims(gst_tokens, axis=0),
                                [batch_size, 1, 1
                                 ]),  # [N, hp.num_gst, 256/hp.num_heads]   
                        num_heads=hp.num_heads,
                        num_units=128,
                        attention_type=hp.style_att_type)

                    # Apply tanh to compress both encoder state and style embedding to the same scale.
                    style_embeddings = style_attention.multi_head_attention(
                    )  # [N, 1, 256]
                else:
                    style_embeddings = tf.expand_dims(refnet_outputs,
                                                      axis=1)  # [N, 1, 128]
            else:
                # NOTE: this branch reads gst_tokens, which is only defined when hp.use_gst is True.
                print("Using random weights for GST.")
                random_weights = tf.random_uniform([hp.num_heads, hp.num_gst],
                                                   maxval=1.0,
                                                   dtype=tf.float32)
                random_weights = tf.nn.softmax(random_weights,
                                               name="random_weights")
                style_embeddings = tf.matmul(random_weights,
                                             tf.nn.tanh(gst_tokens))
                style_embeddings = tf.reshape(
                    style_embeddings, [1, 1] +
                    [hp.num_heads * gst_tokens.get_shape().as_list()[1]])

            # Add style embedding to every text encoder state
            style_embeddings = tf.tile(
                style_embeddings,
                [1, shape_list(encoder_outputs)[1], 1])  # [N, T_in, 128]
            encoder_outputs = tf.concat([encoder_outputs, style_embeddings],
                                        axis=-1)

            # Attention
            attention_cell = AttentionWrapper(
                DecoderPrenetWrapper(GRUCell(256), is_training),
                BahdanauAttention(256,
                                  encoder_outputs,
                                  memory_sequence_length=input_lengths),
                alignment_history=True,
                output_attention=False)  # [N, T_in, 256]

            # Concatenate attention context vector and RNN cell output.
            concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                OutputProjectionWrapper(concat_cell, 256),
                ResidualWrapper(ZoneoutWrapper(LSTMCell(256), 0.1)),
                ResidualWrapper(ZoneoutWrapper(LSTMCell(256), 0.1))
            ],
                                        state_is_tuple=True)  # [N, T_in, 256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if is_training or is_teacher_force_generating:
                helper = TrainingHelper(inputs, mel_targets, hp.num_mels,
                                        hp.outputs_per_step)
            else:
                helper = TestHelper(batch_size, hp.num_mels,
                                    hp.outputs_per_step)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(mel_outputs, hp.num_mels,
                                     is_training)  # [N, T_out, 256]
            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # # Grab alignments from the final decoder state:
            # alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.encoder_outputs = encoder_outputs
            self.style_embeddings = style_embeddings
            self.linear_outputs = linear_outputs
            # self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.reference_mel = reference_mel
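
When no reference mel is provided, the branch above mixes the style tokens with random softmax
weights. A NumPy sketch of that mixing step under assumed sizes (num_heads, num_gst and token_dim
are placeholders, not values taken from this snippet):

import numpy as np

num_heads, num_gst, token_dim = 4, 10, 256 // 4
gst_tokens = np.random.randn(num_gst, token_dim).astype(np.float32)

# Random per-head weights over the style tokens, softmax-normalised as in the
# tf.nn.softmax(random_weights) call above.
w = np.random.uniform(size=(num_heads, num_gst)).astype(np.float32)
w = np.exp(w) / np.exp(w).sum(axis=-1, keepdims=True)

style = w @ np.tanh(gst_tokens)                     # (num_heads, token_dim)
style = style.reshape(1, 1, num_heads * token_dim)  # broadcastable over [N, T_in, ...]
print(style.shape)                                  # (1, 1, 256)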
Example #25
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   reference_mel=None):
        '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'text_embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(embedding_table,
                                                     inputs)  # [N, T_in, 256]

            #Global style tokens (GST)
            gst_tokens = tf.get_variable(
                'style_tokens',
                [hp.num_gst, hp.style_embed_depth // hp.num_heads],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            self.gst_tokens = gst_tokens

            # Encoder

            encoder_outputs = encoder(embedded_inputs, input_lengths,
                                      is_training, 512, 5,
                                      256)  # [N, T_in, 256]

            if is_training:
                reference_mel = mel_targets

            if reference_mel is not None:
                # Reference encoder
                refnet_outputs = reference_encoder(
                    reference_mel,
                    filters=hp.ref_filters,
                    kernel_size=(3, 3),
                    strides=(2, 2),
                    encoder_cell=GRUCell(hp.ref_depth),
                    is_training=is_training)  # [N, 128]
                self.refnet_outputs = refnet_outputs

                # Style attention
                style_attention = MultiheadAttention(
                    tf.expand_dims(refnet_outputs, axis=1),  # [N, 1, 128]
                    tf.tanh(
                        tf.tile(tf.expand_dims(gst_tokens, axis=0),
                                [batch_size, 1, 1
                                 ])),  # [N, hp.num_gst, 256/hp.num_heads]   
                    num_heads=hp.num_heads,
                    num_units=hp.style_att_dim,
                    attention_type=hp.style_att_type)

                embedded_tokens = style_attention.multi_head_attention(
                )  # [N, 1, 256]

            else:
                random_weights = tf.constant(
                    hp.num_heads * [[0] * (hp.gst_index - 1) + [1] + [0] *
                                    (hp.num_gst - hp.gst_index)],
                    dtype=tf.float32)
                random_weights = tf.nn.softmax(random_weights,
                                               name="random_weights")
                # gst_tokens = tf.tile(gst_tokens, [1, hp.num_heads])
                embedded_tokens = tf.matmul(random_weights,
                                            tf.nn.tanh(gst_tokens))
                embedded_tokens = hp.gst_scale * embedded_tokens
                embedded_tokens = tf.reshape(
                    embedded_tokens, [1, 1] +
                    [hp.num_heads * gst_tokens.get_shape().as_list()[1]])

            # Add style embedding to every text encoder state
            style_embeddings = tf.tile(
                embedded_tokens,
                [1, shape_list(encoder_outputs)[1], 1])  # [N, T_in, 128]
            encoder_outputs = tf.concat([encoder_outputs, style_embeddings],
                                        axis=-1)

            # Attention
            attention_mechanism = LocationSensitiveAttention(
                128,
                encoder_outputs,
                hparams=hp,
                is_training=is_training,
                mask_encoder=True,
                memory_sequence_length=input_lengths,
                smoothing=False,
                cumulate_weights=True)
            decoder_lstm = [
                ZoneoutLSTMCell(1024,
                                is_training,
                                zoneout_factor_cell=0.1,
                                zoneout_factor_output=0.1,
                                name='decoder_LSTM_{}'.format(i + 1))
                for i in range(2)
            ]

            decoder_lstm = MultiRNNCell(decoder_lstm, state_is_tuple=True)
            decoder_init_state = decoder_lstm.zero_state(
                batch_size=batch_size, dtype=tf.float32)  # not present in TensorFlow 1

            attention_cell = AttentionWrapper(
                decoder_lstm,
                attention_mechanism,
                initial_cell_state=decoder_init_state,
                alignment_history=True,
                output_attention=False)

            # attention_state_size = 256
            # Decoder input -> prenet -> decoder_lstm -> concat[output, attention]
            # dec_outputs = DecoderPrenetWrapper(attention_cell, is_training, hp.prenet_depths)
            dec_outputs_cell = OutputProjectionWrapper(
                attention_cell, (hp.num_mels) * hp.outputs_per_step)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp)
            else:
                helper = TacoTestHelper(batch_size, hp)

            decoder_init_state = dec_outputs_cell.zero_state(
                batch_size=batch_size, dtype=tf.float32)
            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(dec_outputs_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            decoder_mel_outputs = tf.reshape(
                decoder_outputs[:, :, :hp.num_mels * hp.outputs_per_step],
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            x = decoder_mel_outputs
            for i in range(5):
                activation = tf.nn.tanh if i != (4) else None
                x = tf.layers.conv1d(x,
                                     filters=512,
                                     kernel_size=5,
                                     padding='same',
                                     activation=activation,
                                     name='Postnet_{}'.format(i))
                x = tf.layers.batch_normalization(x, training=is_training)
                x = tf.layers.dropout(x,
                                      rate=0.5,
                                      training=is_training,
                                      name='Postnet_dropout_{}'.format(i))

            residual = tf.layers.dense(x,
                                       hp.num_mels,
                                       name='residual_projection')
            mel_outputs = decoder_mel_outputs + residual

            # Add post-processing CBHG:
            # mel_outputs: (N,T,num_mels)
            post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
            linear_outputs = tf.layers.dense(
                post_outputs,
                hp.num_freq)  # [N, T_out, F(1025)]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_mel_outputs = decoder_mel_outputs
            self.mel_outputs = mel_outputs
            self.encoder_outputs = encoder_outputs
            self.style_embeddings = style_embeddings
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.reference_mel = reference_mel
            self.all_vars = tf.trainable_variables()
            log('Initialized Tacotron model. Dimensions: ')
            log('  text embedding:          %d' % embedded_inputs.shape[-1])
            log('  style embedding:         %d' % style_embeddings.shape[-1])
            # log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            # log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % dec_outputs_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
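
At inference this variant selects a single style token through a one-hot weight row per head
(hp.gst_index) scaled by hp.gst_scale. A NumPy sketch of the weight construction under assumed
sizes; note that the subsequent softmax softens the one-hot selection rather than keeping it exact.

import numpy as np

num_heads, num_gst, gst_index = 4, 10, 3                # assumed values
one_hot = [0] * (gst_index - 1) + [1] + [0] * (num_gst - gst_index)
w = np.array(num_heads * [one_hot], dtype=np.float32)   # (num_heads, num_gst)

# softmax gives the chosen token e / (e + num_gst - 1) of the mass; the rest share the remainder.
w = np.exp(w) / np.exp(w).sum(axis=-1, keepdims=True)
print(w[0].round(3))   # ~0.232 at position gst_index - 1, ~0.085 elsewhere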
Example #26
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   mel_lengths=None,
                   linear_targets=None):
        '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            # Encoder
            prenet_outputs = prenet(
                embedded_inputs, is_training,
                hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
            encoder_outputs = encoder_cbhg(
                prenet_outputs,
                input_lengths,
                is_training,  # [N, T_in, encoder_depth=256]
                hp.encoder_depth)

            if hp.use_vae:
                style_embeddings, mu, log_var = VAE(inputs=mel_targets,
                                                    input_lengths=mel_lengths,
                                                    filters=hp.filters,
                                                    kernel_size=(3, 3),
                                                    strides=(2, 2),
                                                    num_units=hp.vae_dim,
                                                    is_training=is_training,
                                                    scope='vae')

                self.mu = mu
                self.log_var = log_var
                style_embeddings = tf.layers.dense(style_embeddings,
                                                   hp.encoder_depth)
                style_embeddings = tf.expand_dims(style_embeddings, axis=1)
                style_embeddings = tf.tile(
                    style_embeddings,
                    [1, shape_list(encoder_outputs)[1], 1])  # [N, T_in, 256]
                encoder_outputs = encoder_outputs + style_embeddings

            # Attention
            attention_cell = AttentionWrapper(
                GRUCell(hp.attention_depth),
                BahdanauAttention(hp.attention_depth, encoder_outputs),
                alignment_history=True,
                output_attention=False)  # [N, T_in, attention_depth=256]

            # Apply prenet before concatenation in AttentionWrapper.
            attention_cell = DecoderPrenetWrapper(attention_cell, is_training,
                                                  hp.prenet_depths)

            # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 2*attention_depth=512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell(
                [
                    OutputProjectionWrapper(concat_cell, hp.decoder_depth),
                    ResidualWrapper(GRUCell(hp.decoder_depth)),
                    ResidualWrapper(GRUCell(hp.decoder_depth))
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.outputs_per_step)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(
                mel_outputs,
                hp.num_mels,
                is_training,  # [N, T_out, postnet_depth=256]
                hp.postnet_depth)
            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.mel_lengths = mel_lengths
            self.linear_targets = linear_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
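
The VAE branch stores mu and log_var on the model, but the snippet does not show how they enter the
loss. A common choice (an assumption here, not taken from this code) is a KL-divergence term against
a standard Gaussian prior, sketched below; kl_weight and the loss composition are likewise assumed.

import tensorflow as tf

def kl_divergence(mu, log_var):
    # KL(q(z|x) || N(0, I)) for a diagonal Gaussian posterior, one value per example.
    return -0.5 * tf.reduce_sum(1.0 + log_var - tf.square(mu) - tf.exp(log_var), axis=-1)

# kl_loss = tf.reduce_mean(kl_divergence(model.mu, model.log_var))
# loss = reconstruction_loss + kl_weight * kl_loss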
Example #27
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None):
        '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: float32 Tensor with shape [N, T_in, D] where N is batch size, T_in is number of
        steps in the input time series, and D is the source feature dimension (this variant feeds
        spectrogram frames directly instead of character IDs)
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            # embedding_table = tf.get_variable(
            #   'embedding', [len(symbols), 256], dtype=tf.float32,
            #   initializer=tf.truncated_normal_initializer(stddev=0.5))
            # embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)           # [N, T_in, 256]
            # embedded_inputs = inputs

            # Encoder
            # n_fft = (self._hparams.num_src_freq - 1) * 2
            # in_layer_size = n_fft
            in_layer_size = self._hparams.num_src_freq
            prenet_outputs = prenet(inputs,
                                    is_training,
                                    layer_sizes=[in_layer_size,
                                                 128])  # [N, T_in, 128]
            encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths,
                                           is_training)  # [N, T_in, 256]

            # Attention
            attention_cell = AttentionWrapper(
                DecoderPrenetWrapper(GRUCell(256), is_training),
                BahdanauAttention(256, encoder_outputs),
                alignment_history=True,
                output_attention=False)  # [N, T_in, 256]

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                OutputProjectionWrapper(concat_cell, 256),
                ResidualWrapper(GRUCell(256)),
                ResidualWrapper(GRUCell(256))
            ],
                                        state_is_tuple=True)  # [N, T_in, 256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.outputs_per_step)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(mel_outputs, hp.num_mels,
                                     is_training)  # [N, T_out, 256]
            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  input:                   %d' % inputs.shape[-1])
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
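
Because this variant bypasses the character embedding, its inputs are acoustic feature frames rather
than symbol IDs. A sketch of placeholders that would fit; the sizes and the commented model call are
assumptions.

import tensorflow as tf

num_src_freq, num_mels = 513, 80   # assumed values

inputs = tf.placeholder(tf.float32, [None, None, num_src_freq], name='inputs')
input_lengths = tf.placeholder(tf.int32, [None], name='input_lengths')
mel_targets = tf.placeholder(tf.float32, [None, None, num_mels], name='mel_targets')

# model.initialize(inputs, input_lengths, mel_targets=mel_targets,
#                  linear_targets=linear_targets)   # hypothetical model instance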
Example #28
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   stop_token_targets=None):
        '''Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
        '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))

            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

        with tf.variable_scope('Encoder') as scope:

            x = embedded_inputs

            #3 Conv Layers
            for i in range(3):
                x = tf.layers.conv1d(x,
                                     filters=512,
                                     kernel_size=5,
                                     padding='same',
                                     activation=tf.nn.relu,
                                     name='Encoder_{}'.format(i))
                x = tf.layers.batch_normalization(x, training=is_training)
                x = tf.layers.dropout(x,
                                      rate=0.5,
                                      training=is_training,
                                      name='dropout_{}'.format(i))
            encoder_conv_output = x

            #bi-directional LSTM
            cell_fw = ZoneoutLSTMCell(256,
                                      is_training,
                                      zoneout_factor_cell=0.1,
                                      zoneout_factor_output=0.1,
                                      name='encoder_fw_LSTM')
            cell_bw = ZoneoutLSTMCell(256,
                                      is_training,
                                      zoneout_factor_cell=0.1,
                                      zoneout_factor_output=0.1,
                                      name='encoder_bw_LSTM')

            outputs, states = tf.nn.bidirectional_dynamic_rnn(
                cell_fw,
                cell_bw,
                encoder_conv_output,
                sequence_length=input_lengths,
                dtype=tf.float32)

            # encoder_outputs = [N, T, 2*encoder_lstm_units] = [N, T, 512]
            encoder_outputs = tf.concat(
                outputs,
                axis=2)  # Concat and return forward + backward outputs

        with tf.variable_scope('Decoder') as scope:

            if hp.attention_type == 'loc_sen':  # Location Sensitivity Attention
                attention_mechanism = LocationSensitiveAttention(
                    128,
                    encoder_outputs,
                    hparams=hp,
                    is_training=is_training,
                    mask_encoder=True,
                    memory_sequence_length=input_lengths,
                    smoothing=False,
                    cumulate_weights=True)
            elif hp.attention_type == 'gmm':  # GMM Attention
                attention_mechanism = GmmAttention(
                    128,
                    memory=encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'step_bah':
                attention_mechanism = BahdanauStepwiseMonotonicAttention(
                    128,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    mode="parallel")
            elif hp.attention_type == 'mon_bah':
                attention_mechanism = BahdanauMonotonicAttention(
                    128,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'loung':
                attention_mechanism = LuongAttention(
                    128, encoder_outputs, memory_sequence_length=input_lengths)

            # attention_mechanism = LocationSensitiveAttention(128, encoder_outputs, hparams=hp, is_training=is_training, mask_encoder=True, memory_sequence_length = input_lengths, smoothing=False, cumulate_weights=True)
            #mask_encoder: whether to mask encoder padding while computing location sensitive attention. Set to True for better prosody but slower convergence.
            #cumulate_weights: Whether to cumulate (sum) all previous attention weights or simply feed previous weights (Recommended: True)

            decoder_lstm = [
                ZoneoutLSTMCell(1024,
                                is_training,
                                zoneout_factor_cell=0.1,
                                zoneout_factor_output=0.1,
                                name='decoder_LSTM_{}'.format(i + 1))
                for i in range(2)
            ]

            decoder_lstm = tf.contrib.rnn.MultiRNNCell(decoder_lstm,
                                                       state_is_tuple=True)
            # decoder_init_state = decoder_lstm.zero_state(batch_size=batch_size, dtype=tf.float32)  # not present in TensorFlow 1

            attention_cell = AttentionWrapper(decoder_lstm,
                                              attention_mechanism,
                                              alignment_history=True,
                                              output_attention=False)

            # attention_state_size = 256
            # Decoder input -> prenet -> decoder_lstm -> concat[output, attention]
            dec_outputs = DecoderPrenetWrapper(attention_cell, is_training,
                                               hp.prenet_depths)
            dec_outputs_cell = OutputProjectionWrapper(
                dec_outputs, (hp.num_mels) * hp.outputs_per_step)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.outputs_per_step)

            decoder_init_state = dec_outputs_cell.zero_state(
                batch_size=batch_size, dtype=tf.float32)
            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(dec_outputs_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            decoder_mel_outputs = tf.reshape(
                decoder_outputs[:, :, :hp.num_mels * hp.outputs_per_step],
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]
            #stop_token_outputs = tf.reshape(decoder_outputs[:,:,hp.num_mels * hp.outputs_per_step:], [batch_size, -1]) # [N,iters]

            # Postnet
            x = decoder_mel_outputs
            for i in range(5):
                activation = tf.nn.tanh if i != (4) else None
                x = tf.layers.conv1d(x,
                                     filters=512,
                                     kernel_size=5,
                                     padding='same',
                                     activation=activation,
                                     name='Postnet_{}'.format(i))
                x = tf.layers.batch_normalization(x, training=is_training)
                x = tf.layers.dropout(x,
                                      rate=0.5,
                                      training=is_training,
                                      name='Postnet_dropout_{}'.format(i))

            residual = tf.layers.dense(x,
                                       hp.num_mels,
                                       name='residual_projection')
            mel_outputs = decoder_mel_outputs + residual

            # Add post-processing CBHG:
            # mel_outputs: (N,T,num_mels)
            post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training,
                                     hp.postnet_depth)
            linear_outputs = tf.layers.dense(
                post_outputs, hp.num_freq)  # [N, T_out, F(1025)]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(),
                [1, 2, 0
                 ])  # batch_size, text length(encoder), target length(decoder)

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_mel_outputs = decoder_mel_outputs
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            #self.stop_token_targets = stop_token_targets
            #self.stop_token_outputs = stop_token_outputs
            self.all_vars = tf.trainable_variables()
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            # log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            #log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % dec_outputs_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
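
The Decoder scope above switches attention mechanisms on hp.attention_type. Below is a minimal
stand-in for the hparams fields this method touches; the values are assumptions, and the real
project presumably uses tf.contrib.training.HParams rather than a plain namespace.

from types import SimpleNamespace

hp = SimpleNamespace(
    attention_type='loc_sen',   # 'loc_sen', 'gmm', 'step_bah', 'mon_bah' or 'loung'
    embed_depth=256,
    prenet_depths=[256, 128],
    num_mels=80,
    outputs_per_step=2,
    postnet_depth=256,
    num_freq=1025,
    max_iters=1000,
)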
Example #29
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None):
        '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings

            symbols_length = 149  # BASED ON PREVIOUS LENGTH OF LIST

            embedding_table = tf.get_variable(
                'embedding', [symbols_length, hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            # Encoder
            prenet_outputs = prenet(
                embedded_inputs, is_training,
                hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
            encoder_outputs = encoder_cbhg(
                prenet_outputs,
                input_lengths,
                is_training,  # [N, T_in, encoder_depth=256]
                hp.encoder_depth)

            # Attention
            attention_cell = AttentionWrapper(
                GRUCell(hp.attention_depth),
                BahdanauAttention(hp.attention_depth, encoder_outputs),
                alignment_history=True,
                output_attention=False)  # [N, T_in, attention_depth=256]

            # Apply prenet before concatenation in AttentionWrapper.
            attention_cell = DecoderPrenetWrapper(attention_cell, is_training,
                                                  hp.prenet_depths)

            # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 2*attention_depth=512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell(
                [
                    OutputProjectionWrapper(concat_cell, hp.decoder_depth),
                    ResidualWrapper(GRUCell(hp.decoder_depth)),
                    ResidualWrapper(GRUCell(hp.decoder_depth))
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            helper = TacoTestHelper(batch_size, hp.num_mels,
                                    hp.outputs_per_step)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(
                mel_outputs,
                hp.num_mels,
                is_training,  # [N, T_out, postnet_depth=256]
                hp.postnet_depth)
            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
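
This variant hard-codes the symbol count and always builds TacoTestHelper, so it only supports
synthesis from a restored checkpoint. A sketch of such inference-only usage; the Tacotron wrapper,
placeholder shapes and checkpoint path are assumptions.

import tensorflow as tf

inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
input_lengths = tf.placeholder(tf.int32, [None], name='input_lengths')

# model = Tacotron(hparams)
# model.initialize(inputs, input_lengths)
# saver = tf.train.Saver()
# with tf.Session() as sess:
#     saver.restore(sess, 'logs/model.ckpt')
#     linear = sess.run(model.linear_outputs,
#                       feed_dict={inputs: id_batch, input_lengths: length_batch})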
Example #30
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   pml_targets=None,
                   is_training=False,
                   gta=False):
        """
        Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
          pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is number of
            steps in the PML vocoder features trajectories, P is pml_dimension, and values are PML vocoder
            features. Only needed for training.
          is_training: boolean flag that is set to True during training
          gta: boolean flag that is set to True when ground truth alignment is required
        """
        with tf.variable_scope('inference') as scope:
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embedding_dim],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            # Encoder
            encoder_outputs = conv_and_lstm(  # [N, T_in, 2*encoder_gru_units=512]
                embedded_inputs,
                input_lengths,
                conv_layers=hp.encoder_conv_layers,
                conv_width=hp.encoder_conv_width,
                conv_channels=hp.encoder_conv_channels,
                lstm_units_unidirectional=hp.encoder_gru_units,
                is_training=is_training,
                scope='encoder',
            )

            # Attention
            attention_cell = AttentionWrapper(  # [N, T_in, attention_depth=256]
                DecoderPrenetWrapper(LSTMBlockCell(hp.attention_depth),
                                     is_training, hp.prenet_depths),
                LocationSensitiveAttention(hp.attention_depth,
                                           encoder_outputs),
                alignment_history=True,
                output_attention=False)

            # Concatenate attention context vector and RNN cell output into a
            # 2*attention_depth=512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 2*attention_depth=512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell(
                [
                    concat_cell,
                    LSTMBlockCell(hp.decoder_gru_units),
                    LSTMBlockCell(hp.decoder_gru_units)
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=1024]

            # Project onto r PML feature vectors (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.pml_dimension * hp.outputs_per_step)

            if is_training or gta:
                helper = TacoTrainingHelper(inputs, pml_targets,
                                            hp.pml_dimension,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.pml_dimension,
                                        hp.outputs_per_step)

            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

            # Reshape outputs to be one output per entry
            pml_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.pml_outputs = pml_outputs
            self.alignments = alignments
            self.pml_targets = pml_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  Train mode:              {}'.format(is_training))
            log('  GTA mode:                {}'.format(gta))
            log('  Embedding:               {}'.format(
                embedded_inputs.shape[-1]))
            log('  Encoder out:             {}'.format(
                encoder_outputs.shape[-1]))
            log('  Attention out:           {}'.format(
                attention_cell.output_size))
            log('  Concat attn & out:       {}'.format(
                concat_cell.output_size))
            log('  Decoder cell out:        {}'.format(
                decoder_cell.output_size))
            log('  Decoder out ({} frames):  {}'.format(
                hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  PML out:                 {}'.format(pml_outputs.shape[-1]))
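
The decoder emits r = outputs_per_step frames per step, and the tf.reshape above unpacks them into
one frame per row. A NumPy equivalent of that reshape; the sizes (including P = pml_dimension) are
assumptions.

import numpy as np

N, steps, P, r = 2, 5, 86, 2
decoder_outputs = np.random.randn(N, steps, P * r).astype(np.float32)
pml_outputs = decoder_outputs.reshape(N, steps * r, P)    # [N, T_out, P]
print(pml_outputs.shape)                                  # (2, 10, 86)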