Example #1
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None):
        '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), 256],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(embedding_table,
                                                     inputs)  # [N, T_in, 256]

            # Global style tokens (GST)
            gst_tokens = tf.get_variable(
                'style_tokens', [hp.num_gst, 256],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))

            # Encoder
            prenet_outputs = prenet(embedded_inputs,
                                    is_training)  # [N, T_in, 128]
            encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths,
                                           is_training)  # [N, T_in, 256]

            if is_training:
                # Reference encoder
                reference_embedding = reference_encoder(
                    mel_targets,
                    filters=[32, 32, 64, 64, 128, 128],
                    kernel_size=(3, 3),
                    strides=(2, 2),
                    is_training=is_training)

                # Style token layer
                style_embedding = multi_head_attention(
                    num_heads=hp.num_heads,
                    queries=tf.expand_dims(reference_embedding,
                                           axis=1),  # [N, 1, 128]
                    memory=tf.tile(tf.expand_dims(gst_tokens, axis=0),
                                   [batch_size, 1, 1]),  # [N, hp.num_gst, 256]
                    num_units=128)
            else:
                # TODO Add support for reference mode and more effective style control during inference.
                # Randomly select style embedding from gst_tokens for simplicity.
                random_index = tf.random_uniform([batch_size],
                                                 maxval=hp.num_gst,
                                                 dtype=tf.int32)
                style_embedding = tf.nn.embedding_lookup(
                    gst_tokens, random_index)

            # Add style embedding to every text encoder state, applying tanh to
            # compress both encoder state and style embedding to the same scale.
            encoder_outputs += tf.nn.tanh(style_embedding)

            # Attention
            attention_cell = AttentionWrapper(
                DecoderPrenetWrapper(GRUCell(256), is_training),
                BahdanauAttention(256,
                                  encoder_outputs,
                                  memory_sequence_length=input_lengths),
                alignment_history=True,
                output_attention=False)  # [N, T_in, 256]

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                OutputProjectionWrapper(concat_cell, 256),
                ResidualWrapper(
                    ZoneoutWrapper(LSTMBlockCell(256),
                                   (0.1, 0.1), is_training)),
                ResidualWrapper(
                    ZoneoutWrapper(LSTMBlockCell(256),
                                   (0.1, 0.1), is_training)),
            ],
                                        state_is_tuple=True)  # [N, T_in, 256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.outputs_per_step)

            if is_training:
                (decoder_outputs, _
                 ), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                     BasicDecoder(output_cell, helper, decoder_init_state))
            else:
                (decoder_outputs, _
                 ), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                     BasicDecoder(output_cell, helper, decoder_init_state),
                     maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(mel_outputs, hp.num_mels,
                                     is_training)  # [N, T_out, 256]
            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            tf.logging.info('Initialized Tacotron model. Dimensions: ')
            tf.logging.info('  embedding:               %d' %
                            embedded_inputs.shape[-1])
            tf.logging.info('  prenet out:              %d' %
                            prenet_outputs.shape[-1])
            tf.logging.info('  encoder out:             %d' %
                            encoder_outputs.shape[-1])
            tf.logging.info('  attention out:           %d' %
                            attention_cell.output_size)
            tf.logging.info('  concat attn & out:       %d' %
                            concat_cell.output_size)
            tf.logging.info('  decoder cell out:        %d' %
                            decoder_cell.output_size)
            tf.logging.info('  decoder out (%d frames):  %d' %
                            (hp.outputs_per_step, decoder_outputs.shape[-1]))
            tf.logging.info('  decoder out (1 frame):   %d' %
                            mel_outputs.shape[-1])
            tf.logging.info('  postnet out:             %d' %
                            post_outputs.shape[-1])
            tf.logging.info('  linear out:              %d' %
                            linear_outputs.shape[-1])
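A note on the reshape near the end of this example: the decoder emits r = outputs_per_step frames per step as a single [N, T_out/r, M*r] tensor, and tf.reshape folds them back into individual mel frames. A minimal NumPy sketch of that shape bookkeeping, with toy sizes assumed here rather than taken from the hparams above:

import numpy as np

N, r, num_mels, decoder_steps = 2, 5, 80, 7                        # toy sizes (assumed)
decoder_outputs = np.random.rand(N, decoder_steps, num_mels * r)   # [N, T_out/r, M*r]
mel_outputs = decoder_outputs.reshape(N, -1, num_mels)             # [N, T_out, M]
assert mel_outputs.shape == (N, decoder_steps * r, num_mels)       # each step yields r frames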
Example #2
    def build_decoder_cell(self):
        """构建解码器cell"""

        encoder_outputs = self.encoder_outputs
        encoder_last_state = self.encoder_last_state
        encoder_inputs_length = self.encoder_inputs_length

        if self.time_major:
            encoder_outputs = tf.transpose(encoder_outputs, (1, 0, 2))

        # To use BeamSearchDecoder
        # encoder_outputs, encoder_last_state, encoder_inputs_length
        # needs to be tiled so that:
        # [batch_size, .., ..] -> [batch_size x beam_width, .., ..]
        if self.use_beamsearch_decode:

            encoder_outputs = seq2seq.tile_batch(encoder_outputs,
                                                 multiplier=self.beam_width)
            encoder_last_state = nest.map_structure(
                lambda s: seq2seq.tile_batch(s, self.beam_width),
                self.encoder_last_state)
            encoder_inputs_length = seq2seq.tile_batch(
                self.encoder_inputs_length, multiplier=self.beam_width)

        # Compute the number of decoder hidden units; if the encoder is
        # bidirectional, the decoder hidden units should be doubled.
        num_units = self.hidden_units
        if self.bidirectional:
            num_units *= 2

        # Building attention mechanism: Default Bahdanau
        # 'Bahdanau' style attention: https://arxiv.org/abs/1409.0473
        self.attention_mechanism = BahdanauAttention(
            num_units=num_units,
            memory=encoder_outputs,
            memory_sequence_length=encoder_inputs_length)
        # 'Luong' style attention: https://arxiv.org/abs/1508.04025
        if self.attention_type.lower() == 'luong':
            self.attention_mechanism = LuongAttention(
                num_units=num_units,
                memory=encoder_outputs,
                memory_sequence_length=encoder_inputs_length)

        # Building decoder_cell
        self.decoder_cell_list = [
            self.build_single_cell(num_units, use_residual=True)
            for i in range(self.depth)
        ]

        decoder_initial_state = encoder_last_state

        def attn_decoder_input_fn(inputs, attention):
            """根据attn_input_feeding属性来判断是否在attention计算前进行一次投影计算
            """
            if not self.attn_input_feeding:
                return inputs

            # Essential when use_residual=True
            hidden_units = self.hidden_units
            if self.bidirectional:
                hidden_units *= 2
            input_layer = layers.Dense(hidden_units,
                                       dtype=tf.float32,
                                       use_bias=False,
                                       name='attn_input_feeding')
            return input_layer(array_ops.concat([inputs, attention], -1))

        # AttentionWrapper wraps RNNCell with the attention_mechanism
        # Note: We implement Attention mechanism only on the top decoder layer
        self.decoder_cell_list[-1] = AttentionWrapper(
            cell=self.decoder_cell_list[-1],
            attention_mechanism=self.attention_mechanism,
            # attention_layer_size=self.hidden_units,
            attention_layer_size=num_units,
            cell_input_fn=attn_decoder_input_fn,
            initial_cell_state=encoder_last_state[-1],
            alignment_history=self.alignment_history,
            name='Attention_Wrapper')

        # To be compatible with AttentionWrapper, the encoder last state
        # of the top layer should be converted
        # into the AttentionWrapperState form
        # We can easily do this by calling AttentionWrapper.zero_state

        # Also, if beam search decoding is used,
        # the batch_size argument in .zero_state
        # should be beam_width times the original batch_size.
        batch_size = self.batch_size if not self.use_beamsearch_decode \
                     else self.batch_size * self.beam_width
        initial_state = [state for state in encoder_last_state]

        initial_state[-1] = self.decoder_cell_list[-1].zero_state(
            batch_size=batch_size, dtype=tf.float32)
        decoder_initial_state = tuple(initial_state)

        return MultiRNNCell(self.decoder_cell_list), decoder_initial_state
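The tile_batch calls above repeat every batch entry beam_width times so that all beam hypotheses of one source sequence attend over the same encoder memory. A minimal NumPy sketch of that repetition under assumed toy shapes (seq2seq.tile_batch repeats entries consecutively, like np.repeat on axis 0):

import numpy as np

batch_size, time_steps, units, beam_width = 2, 3, 4, 5     # toy sizes (assumed)
encoder_outputs = np.random.rand(batch_size, time_steps, units)

# Rough equivalent of seq2seq.tile_batch(encoder_outputs, multiplier=beam_width)
tiled = np.repeat(encoder_outputs, beam_width, axis=0)
assert tiled.shape == (batch_size * beam_width, time_steps, units)
assert (tiled[0] == tiled[beam_width - 1]).all()           # copies of batch entry 0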
Example #3
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   reference_mel=None,
                   reference_weight=None):
        '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
      reference_mel: float32 Tensor with shape [N, T, M] containing a mel spectrogram used as the
        style reference for the reference encoder. Defaults to mel_targets during training.
      reference_weight: float32 Tensor with shape [num_gst] giving explicit style token weights to
        use at inference instead of a reference mel.
    '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            is_teacher_force_generating = mel_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'text_embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(embedding_table,
                                                     inputs)  # [N, T_in, 256]

            if hp.use_gst:
                #Global style tokens (GST)
                gst_tokens = tf.get_variable(
                    'style_tokens',
                    [hp.num_gst, hp.style_embed_depth // hp.num_heads],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
                self.gst_tokens = gst_tokens

            # Encoder
            prenet_outputs = prenet(embedded_inputs,
                                    is_training)  # [N, T_in, 128]
            encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths,
                                           is_training)  # [N, T_in, 256]

            if is_training:
                reference_mel = mel_targets

            if reference_mel is not None:
                # Reference encoder
                refnet_outputs = reference_encoder(
                    reference_mel,
                    filters=hp.reference_filters,
                    kernel_size=(3, 3),
                    strides=(2, 2),
                    encoder_cell=GRUCell(hp.reference_depth),
                    is_training=is_training)  # [N, 128]
                self.refnet_outputs = refnet_outputs

                if hp.use_gst:
                    # Style attention
                    style_attention = MultiheadAttention(
                        tf.expand_dims(refnet_outputs, axis=1),  # [N, 1, 128]
                        tf.tanh(
                            tf.tile(
                                tf.expand_dims(gst_tokens, axis=0),
                                [batch_size, 1, 1
                                 ])),  # [N, hp.num_gst, 256/hp.num_heads]   
                        num_heads=hp.num_heads,
                        num_units=hp.style_att_dim,
                        attention_type=hp.style_att_type)

                    style_weights, style_embeddings = style_attention.multi_head_attention(
                    )  # [N, 1, 256]
                else:
                    style_embeddings = tf.expand_dims(refnet_outputs,
                                                      axis=1)  # [N, 1, 128]
            elif reference_weight is not None:
                print("Use specific weight for GST.")
                specific_weights = tf.expand_dims(reference_weight, axis=0)
                specific_weights = tf.tile(specific_weights, [hp.num_heads, 1],
                                           name="specific_weights")
                # specific_weights = tf.tile(specific_weights, [hp.num_heads, 1])
                # specific_weights = tf.nn.softmax(specific_weights, axis=-1, name="specific_weights")
                style_embeddings = tf.matmul(specific_weights,
                                             tf.nn.tanh(gst_tokens))
                style_embeddings = tf.expand_dims(style_embeddings, axis=0)
                style_embeddings = tf.tile(style_embeddings,
                                           [batch_size, 1, 1])
                style_embeddings = tf.reshape(
                    style_embeddings,
                    shape=[batch_size, 1, hp.style_embed_depth])
                style_weights = tf.expand_dims(specific_weights, axis=0)
            else:
                print("Use random weight for GST.")
                random_weights = tf.random_uniform([hp.num_heads, hp.num_gst],
                                                   maxval=1.0,
                                                   dtype=tf.float32)
                random_weights = tf.nn.softmax(random_weights,
                                               axis=-1,
                                               name="random_weights")
                style_embeddings = tf.matmul(random_weights,
                                             tf.nn.tanh(gst_tokens))
                style_embeddings = tf.expand_dims(style_embeddings, axis=0)
                style_embeddings = tf.tile(style_embeddings,
                                           [batch_size, 1, 1])
                style_embeddings = tf.reshape(
                    style_embeddings,
                    shape=[batch_size, 1, hp.style_embed_depth])
                style_weights = tf.expand_dims(random_weights, axis=0)

            # Add style embedding to every text encoder state
            style_embeddings = tf.tile(
                style_embeddings,
                [1, shape_list(encoder_outputs)[1], 1])  # [N, T_in, 128]
            # encoder_outputs = tf.concat([encoder_outputs, style_embeddings], axis=-1)
            encoder_outputs = encoder_outputs + style_embeddings

            # Attention
            attention_cell = AttentionWrapper(
                GRUCell(hp.attention_depth),
                BahdanauAttention(hp.attention_depth,
                                  encoder_outputs,
                                  memory_sequence_length=input_lengths),
                alignment_history=True,
                output_attention=False)  # [N, T_in, 256]

            # Concatenate attention context vector and RNN cell output.
            concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                OutputProjectionWrapper(concat_cell, hp.rnn_depth),
                ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1)),
                ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1))
            ],
                                        state_is_tuple=True)  # [N, T_in, 256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if is_training or is_teacher_force_generating:
                helper = TacoTrainingHelper(inputs, mel_targets, hp)
            else:
                helper = TacoTestHelper(batch_size, hp)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(mel_outputs, hp.num_mels,
                                     is_training)  # [N, T_out, 256]
            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.encoder_outputs = encoder_outputs
            self.style_weights = style_weights
            self.style_embeddings = style_embeddings
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.reference_mel = reference_mel
            self.reference_weight = reference_weight
            log('Initialized Tacotron model. Dimensions: ')
            log('  text embedding:          %d' % embedded_inputs.shape[-1])
            log('  style embedding:         %d' % style_embeddings.shape[-1])
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
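In the fixed-weight and random-weight branches above, a [num_heads, num_gst] weight matrix mixes the tanh-squashed style tokens, and the per-head results are folded into a single style vector per batch entry. A NumPy sketch of that shape arithmetic, with toy sizes assumed rather than read from hparams:

import numpy as np

num_heads, num_gst, style_embed_depth, batch_size = 4, 10, 256, 2   # toy sizes (assumed)
gst_tokens = np.random.randn(num_gst, style_embed_depth // num_heads)
weights = np.random.rand(num_heads, num_gst)
weights /= weights.sum(axis=-1, keepdims=True)                      # rows sum to 1, like softmax

per_head = weights @ np.tanh(gst_tokens)                 # [num_heads, depth // num_heads]
style = np.tile(per_head[None], (batch_size, 1, 1))      # [batch, num_heads, depth // num_heads]
style = style.reshape(batch_size, 1, style_embed_depth)  # [batch, 1, style_embed_depth]
assert style.shape == (batch_size, 1, style_embed_depth)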
Example #4
encoder_final_state_c = tf.concat(
    (encoder_fw_final_state.c, encoder_bw_final_state.c), 1)

encoder_final_state_h = tf.concat(
    (encoder_fw_final_state.h, encoder_bw_final_state.h), 1)

encoder_final_state = LSTMStateTuple(c=encoder_final_state_c,
                                     h=encoder_final_state_h)

#Shape: (batch_size, time_step, hidden_units)
encoder_outputs = tf.transpose(
    tf.concat((encoder_fw_outputs, encoder_bw_outputs), 2), [1, 0, 2])

decoder_cell = LSTMCell(hidden_units * 2)

attention_mechanism = BahdanauAttention(attention_units, encoder_outputs)
attention_cell = AttentionWrapper(decoder_cell, attention_mechanism)

copynet_cell = CopyNetWrapper(attention_cell, encoder_outputs, input_ids,
                              vocab_size, gen_vocab_size)

decoder_initial_state = copynet_cell.zero_state(
    batch_size, tf.float32).clone(cell_state=attention_cell.zero_state(
        batch_size=batch_size, dtype=tf.float32))

helper = tf.contrib.seq2seq.TrainingHelper(targets_embedded,
                                           targets_lengths,
                                           time_major=True)
#helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embeddings, tf.ones([batch_size], dtype=tf.int32), 0)

decoder = tf.contrib.seq2seq.BasicDecoder(copynet_cell,
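The top of this example merges the forward and backward encoder final states into one LSTMStateTuple whose c and h are twice as wide, matching the 2 * hidden_units decoder cell. A toy NumPy sketch of that concatenation (shapes assumed):

import numpy as np

batch_size, hidden_units = 2, 3                               # toy sizes (assumed)
fw_c, bw_c = np.random.randn(batch_size, hidden_units), np.random.randn(batch_size, hidden_units)
fw_h, bw_h = np.random.randn(batch_size, hidden_units), np.random.randn(batch_size, hidden_units)

final_c = np.concatenate((fw_c, bw_c), axis=1)                # [batch, 2 * hidden_units]
final_h = np.concatenate((fw_h, bw_h), axis=1)                # [batch, 2 * hidden_units]
assert final_c.shape == final_h.shape == (batch_size, 2 * hidden_units)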
Example #5
    def initialize(self,
                   inputs,
                   inputs_jp,
                   input_lengths,
                   input_jp_lengths,
                   mel_targets=None,
                   linear_targets=None):
        '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
      inputs_jp: the Japanese-side counterpart of inputs, consumed by a separate encoder.
      input_jp_lengths: int32 Tensor with shape [N] giving the lengths of each sequence in inputs_jp.
    '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings

            # [N, T_in, embed_depth=256]

            # Encoder
            #prenet_outputs = prenet(inputs, is_training, hp.prenet_depths)    # [N, T_in, prenet_depths[-1]=128]
            encoder_outputs = encoder_cbhg(
                inputs,
                input_lengths,
                is_training,  # [N, T_in, encoder_depth=256]
                hp.encoder_depth)
            # print(inputs_jp.eval)
            # print(inputs.eval)
            # print(input_jp_lengths.eval)
            # print(input_lengths.eval)
            encoder_outputs_jp = encoder_cbhg_jp(
                inputs_jp,
                input_lengths,
                is_training,  # [N, T_in, encoder_depth=256]
                hp.encoder_depth)
            # Attention
            attention_cell = AttentionWrapper(
                DecoderPrenetWrapper(GRUCell(hp.attention_depth), is_training,
                                     hp.prenet_depths),
                BahdanauAttention(hp.attention_depth, encoder_outputs),
                alignment_history=True,
                output_attention=False)  # [N, T_in, attention_depth=256]

            # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 2*attention_depth=512]

            # Attention JP
            attention_cell_jp = AttentionWrapper(
                DecoderPrenetWrapper(GRUCell(hp.attention_depth), is_training,
                                     hp.prenet_depths),
                BahdanauAttention(hp.attention_depth, encoder_outputs_jp),
                alignment_history=True,
                output_attention=False)  # [N, T_in, attention_depth=256]

            # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
            concat_cell_jp = ConcatOutputAndAttentionWrapper(
                attention_cell_jp)  # [N, T_in, 2*attention_depth=512]
            # The block above is duplicated and adapted for the Japanese feature input; the new
            # concat_cell is named concat_cell_jp, and one extra line joins the two outputs
            # (see the sketch after this example).

            # # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
            print(type(concat_cell))
            print(concat_cell_jp.output_size)
            encoder_out = tf.concat([concat_cell, concat_cell_jp], axis=-1)

            #connect chinese_outputs and japanese_outputs

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell(
                [
                    OutputProjectionWrapper(encoder_out, hp.decoder_depth),
                    ResidualWrapper(GRUCell(hp.decoder_depth)),
                    ResidualWrapper(GRUCell(hp.decoder_depth))
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.outputs_per_step)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(
                mel_outputs,
                hp.num_mels,
                is_training,  # [N, T_out, postnet_depth=256]
                hp.postnet_depth)
            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.inputs_jp = inputs_jp
            self.input_lengths = input_lengths
            self.input_jp_lengths = input_jp_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            log('Initialized Tacotron model. Dimensions: ')

            #log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  encoder out jp:             %d' %
                encoder_outputs_jp.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  attention out jp:           %d' %
                attention_cell_jp.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  concat attn & out jp:       %d' %
                concat_cell_jp.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
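The tf.concat in this example is applied to the two wrapper cells themselves; the comments suggest the intent is to join the Chinese and Japanese attention-augmented outputs along the feature axis. A NumPy sketch of that intended join, with assumed toy shapes:

import numpy as np

N, T_in, depth = 2, 6, 512                          # toy sizes (assumed); depth = 2 * attention_depth
chinese_outputs = np.random.randn(N, T_in, depth)
japanese_outputs = np.random.randn(N, T_in, depth)

# Feature-wise concatenation of the two streams.
encoder_out = np.concatenate([chinese_outputs, japanese_outputs], axis=-1)
assert encoder_out.shape == (N, T_in, 2 * depth)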
Example #6
    def initialize(
        self,
        inputs,
        input_lengths,
        num_speakers,
        speaker_id,
        mel_targets=None,
        linear_targets=None,
        loss_coeff=None,
        rnn_decoder_test_mode=False,
        is_randomly_initialized=False,
    ):
        is_training = linear_targets is not None  # False if linear_targets is left at its default (None)
        self.is_randomly_initialized = is_randomly_initialized  # defaults to False

        with tf.variable_scope('inference') as scope:  # group everything under the 'inference' scope
            hp = self._hparams
            batch_size = tf.shape(inputs)[0]  # first dimension: number of samples; second: number of input features (here we take the sample count)

            # Embeddings
            char_embed_table = tf.get_variable(
                'embedding',  # creates a shared variable named 'embedding'
                [len(symbols), hp.embedding_size],  # shape: [number of symbols, embedding size]
                dtype=tf.float32,  # dtype: type of the returned tensor
                initializer=tf.truncated_normal_initializer(stddev=0.5)
            )  # initializer: the initial weights
            # [N, T_in, embedding_size]
            char_embedded_inputs = \
                    tf.nn.embedding_lookup(char_embed_table, inputs)  # returns rows of char_embed_table indexed by inputs

            self.num_speakers = num_speakers
            if self.num_speakers > 1:  # multi-speaker case
                if hp.speaker_embedding_size != 1:  # when hparams.speaker_embedding_size is not 1
                    speaker_embed_table = tf.get_variable(  # create a shared variable
                        'speaker_embedding',  # named 'speaker_embedding'
                        [self.num_speakers, hp.speaker_embedding_size],  # shape: [num_speakers, speaker_embedding_size]
                        dtype=tf.float32,
                        initializer=tf.truncated_normal_initializer(
                            stddev=0.5))  # initial weights
                    # [N, T_in, speaker_embedding_size]
                    speaker_embed = tf.nn.embedding_lookup(
                        speaker_embed_table, speaker_id
                    )  # returns rows of speaker_embed_table indexed by speaker_id (a Tensor)
############################################################## needs further explanation
                if hp.model_type == 'deepvoice':  # deepvoice model
                    if hp.speaker_embedding_size == 1:  # when hparams.speaker_embedding_size is 1
                        before_highway = get_embed(  # def get_embed(inputs, num_inputs, embed_size, name):
                            speaker_id,
                            self.num_speakers,  # returns rows of the embed table indexed by speaker_id
                            hp.enc_prenet_sizes[-1],
                            "before_highway")
                        encoder_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers, hp.enc_rnn_size * 2,
                            "encoder_rnn_init_state")

                        attention_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers,
                            hp.attention_state_size,
                            "attention_rnn_init_state")
                        decoder_rnn_init_states = [get_embed(
                                speaker_id, self.num_speakers,
                                hp.dec_rnn_size, "decoder_rnn_init_states{}".format(idx + 1)) \
                                        for idx in range(hp.dec_layer_num)]
##############################################################
                    else:  # when hparams.speaker_embedding_size is not 1
                        deep_dense = lambda x, dim: \
                                tf.layers.dense(x, dim, activation=tf.nn.softsign)
                        # input: x, units: dim, softsign as the activation function
                        # lambda example: (lambda x, y: x + y)(10, 20) => 30
                        # tf.layers.dense(inputs, units, activation)
                        # inputs defines the preceding layer
                        # units defines the size of this layer
                        # activation is an activation function such as sigmoid or ReLU
                        # dense implements a fully connected (hidden) layer
                        # https://bcho.tistory.com/1196

                        before_highway = deep_dense(
                            speaker_embed, hp.enc_prenet_sizes[-1]
                        )  # input: speaker_embed, units: hp.enc_prenet_sizes[-1] (default 128)
                        encoder_rnn_init_state = deep_dense(
                            speaker_embed, hp.enc_rnn_size * 2
                        )  # input: speaker_embed, units: hp.enc_rnn_size * 2 (default 128 * 2)

                        attention_rnn_init_state = deep_dense(
                            speaker_embed, hp.attention_state_size
                        )  # input: speaker_embed, units: hp.attention_state_size (default 256)
                        decoder_rnn_init_states = [
                            deep_dense(speaker_embed, hp.dec_rnn_size)
                            for _ in range(hp.dec_layer_num)
                        ]  # a list of hp.dec_layer_num layers (default 2)

                    speaker_embed = None  # deepvoice does not use speaker_embed directly
                elif hp.model_type == 'simple':  # model_type is 'simple' rather than 'deepvoice'
                    before_highway = None
                    encoder_rnn_init_state = None
                    attention_rnn_init_state = None
                    decoder_rnn_init_states = None  # no extra layers
                else:
                    raise Exception(
                        " [!] Unknown multi-speaker model type: {}".format(
                            hp.model_type)
                    )  # error message for an unknown multi-speaker model type
            else:  # single speaker
                speaker_embed = None
                before_highway = None
                encoder_rnn_init_state = None
                attention_rnn_init_state = None
                decoder_rnn_init_states = None  # no extra layers
            ##############
            # Encoder (maps special characters and Hangul jamo text to numbers)
            ##############

            # [N, T_in, enc_prenet_sizes[-1]]
            prenet_outputs = prenet(
                char_embedded_inputs,
                is_training,  #
                hp.enc_prenet_sizes,
                hp.dropout_prob,
                scope='prenet')

            encoder_outputs = cbhg(
                prenet_outputs,
                input_lengths,
                is_training,
                hp.enc_bank_size,
                hp.enc_bank_channel_size,
                hp.enc_maxpool_width,
                hp.enc_highway_depth,
                hp.enc_rnn_size,
                hp.enc_proj_sizes,
                hp.enc_proj_width,
                scope="encoder_cbhg",
                before_highway=before_highway,
                encoder_rnn_init_state=encoder_rnn_init_state)

            ##############
            # Attention (important!)
            ##############

            # For manual control of attention
            self.is_manual_attention = tf.placeholder(
                tf.bool,
                shape=(),
                name='is_manual_attention',
            )
            self.manual_alignments = tf.placeholder(
                tf.float32,
                shape=[None, None, None],
                name="manual_alignments",
            )

            dec_prenet_outputs = DecoderPrenetWrapper(
                GRUCell(hp.attention_state_size), speaker_embed, is_training,
                hp.dec_prenet_sizes, hp.dropout_prob)

            if hp.attention_type == 'bah_mon':
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size, encoder_outputs)
            elif hp.attention_type == 'bah_norm':
                attention_mechanism = BahdanauAttention(hp.attention_size,
                                                        encoder_outputs,
                                                        normalize=True)
            elif hp.attention_type == 'luong_scaled':
                attention_mechanism = LuongAttention(hp.attention_size,
                                                     encoder_outputs,
                                                     scale=True)
            elif hp.attention_type == 'luong':
                attention_mechanism = LuongAttention(hp.attention_size,
                                                     encoder_outputs)
            elif hp.attention_type == 'bah':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size, encoder_outputs)
            elif hp.attention_type.startswith('ntm2'):
                shift_width = int(hp.attention_type.split('-')[-1])
                attention_mechanism = NTMAttention2(hp.attention_size,
                                                    encoder_outputs,
                                                    shift_width=shift_width)
            else:
                raise Exception(" [!] Unkown attention type: {}".format(
                    hp.attention_type))

            attention_cell = AttentionWrapper(
                dec_prenet_outputs,
                attention_mechanism,
                self.is_manual_attention,
                self.manual_alignments,
                initial_cell_state=attention_rnn_init_state,
                alignment_history=True,
                output_attention=False)

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            # [N, T_in, attention_size+attention_state_size]
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell, embed_to_concat=speaker_embed)

            # Decoder (layers specified bottom to top):
            cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)]
            for _ in range(hp.dec_layer_num):
                cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size)))

            # [N, T_in, 256]
            decoder_cell = MultiRNNCell(cells, state_is_tuple=True)

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.reduction_factor)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if hp.model_type == "deepvoice":
                # decoder_init_state[0] : AttentionWrapperState
                # = cell_state + attention + time + alignments + alignment_history
                # decoder_init_state[0][0] = attention_rnn_init_state (already applied)
                decoder_init_state = list(decoder_init_state)

                for idx, cell in enumerate(decoder_rnn_init_states):
                    shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                    shape2 = cell.get_shape().as_list()
                    if shape1 != shape2:
                        raise Exception(" [!] Shape {} and {} should be equal". \
                                format(shape1, shape2))
                    decoder_init_state[idx + 1] = cell

                decoder_init_state = tuple(decoder_init_state)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.reduction_factor,
                                            rnn_decoder_test_mode)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.reduction_factor)

            (decoder_outputs, _), final_decoder_state, _ = \
                    tf.contrib.seq2seq.dynamic_decode(
                            BasicDecoder(output_cell, helper, decoder_init_state),
                            maximum_iterations=hp.max_iters)

            # [N, T_out, M]
            mel_outputs = tf.reshape(decoder_outputs,
                                     [batch_size, -1, hp.num_mels])

            # Add post-processing CBHG:
            # [N, T_out, 256]
            #post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
            post_outputs = cbhg(mel_outputs,
                                None,
                                is_training,
                                hp.post_bank_size,
                                hp.post_bank_channel_size,
                                hp.post_maxpool_width,
                                hp.post_highway_depth,
                                hp.post_rnn_size,
                                hp.post_proj_sizes,
                                hp.post_proj_width,
                                scope='post_cbhg')

            if speaker_embed is not None and hp.model_type == 'simple':
                expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
                tiled_speaker_embedding = tf.tile(
                    expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])

                # [N, T_out, 256 + alpha]
                post_outputs = \
                        tf.concat([tiled_speaker_embedding, post_outputs], axis=-1)

            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.speaker_id = speaker_id
            self.input_lengths = input_lengths
            self.loss_coeff = loss_coeff
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.final_decoder_state = final_decoder_state

            log('=' * 40)
            log(' model_type: %s' % hp.model_type)
            log('=' * 40)

            log('Initialized Tacotron model. Dimensions: ')
            log('    embedding:                %d' %
                char_embedded_inputs.shape[-1])
            if speaker_embed is not None:
                log('    speaker embedding:        %d' %
                    speaker_embed.shape[-1])
            else:
                log('    speaker embedding:        None')
            log('    prenet out:               %d' % prenet_outputs.shape[-1])
            log('    encoder out:              %d' % encoder_outputs.shape[-1])
            log('    attention out:            %d' %
                attention_cell.output_size)
            log('    concat attn & out:        %d' % concat_cell.output_size)
            log('    decoder cell out:         %d' % decoder_cell.output_size)
            log('    decoder out (%d frames):  %d' %
                (hp.reduction_factor, decoder_outputs.shape[-1]))
            log('    decoder out (1 frame):    %d' % mel_outputs.shape[-1])
            log('    postnet out:              %d' % post_outputs.shape[-1])
            log('    linear out:               %d' % linear_outputs.shape[-1])
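The deep_dense lambda in this example projects a single speaker embedding into every initial RNN state through a softsign activation. A standalone NumPy sketch of the same projection, using toy sizes and a plain random weight matrix as a stand-in for tf.layers.dense:

import numpy as np

def softsign(x):                                     # x / (1 + |x|), as in tf.nn.softsign
    return x / (1.0 + np.abs(x))

batch_size, speaker_embedding_size = 2, 16           # toy sizes (assumed)
dec_rnn_size, dec_layer_num = 256, 2
speaker_embed = np.random.randn(batch_size, speaker_embedding_size)

def deep_dense(x, dim):                              # stand-in for tf.layers.dense(x, dim, softsign)
    w = 0.1 * np.random.randn(x.shape[-1], dim)
    return softsign(x @ w)

decoder_rnn_init_states = [deep_dense(speaker_embed, dec_rnn_size)
                           for _ in range(dec_layer_num)]
assert all(s.shape == (batch_size, dec_rnn_size) for s in decoder_rnn_init_states)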
Example #7
    def _init(self):
        sequence = tf.placeholder(tf.int32, [None, None], name='sequence')
        targets = tf.placeholder(tf.int32, [None, None], name='targets')
        authors = tf.placeholder(tf.int32, [None, None], name='authors')

        batch_size = tf.shape(sequence)[0]

        sequence_lengths = tf.cast(tf.count_nonzero(sequence, axis=1),
                                   tf.int32)
        embedding = tf.Variable(
            tf.random_normal((self._vocab_size, self._embed_size)))
        context = tf.Variable(
            tf.random_normal((self._author_size, self._ctx_size)))

        embedded_sequence = tf.nn.embedding_lookup(embedding, sequence)
        embedded_authors = tf.nn.embedding_lookup(context, authors)
        one_hot_targets = tf.one_hot(targets, self._vocab_size)

        gpu = lambda x: str(x % self._num_gpu)

        if self._attn:
            mech = BahdanauAttention(self._attn_depth, embedded_sequence,
                                     sequence_lengths)
            attn_cell = lambda x: DeviceWrapper(
                AttentionWrapper(x, mech, self._attn_size), "/gpu:" + gpu(1))
        else:
            attn_cell = lambda x: x

        if self._training:
            dropout = lambda x: DropoutWrapper(x, 1.0, 1.0 - self._dropout)
        else:
            dropout = lambda x: x

        if self._cell == 'lstm':
            base_cell = lambda x: dropout(BasicLSTMCell(x))
        elif self._cell == 'gru':
            base_cell = lambda x: dropout(GRUCell(x))

        context_cell = ContextWrapper(
            base_cell(self._cell_size),
            embedded_authors,
        )
        #context_cell = base_cell(self._cell_size)
        bottom_cell = DeviceWrapper(attn_cell(context_cell), "/gpu:0")
        top_cells = [
            DeviceWrapper(base_cell(self._cell_size), "/gpu:" + gpu(i))
            for i in range(1, self._cell_num)
        ]
        cell = MultiRNNCell([bottom_cell] + top_cells)

        init_state = cell.zero_state(batch_size, tf.float32)

        if self._training:
            helper = TrainingHelper(embedded_sequence, sequence_lengths)
        else:
            helper = SampleEmbeddingHelper(embedding, sequence[:, 0], 1)

        dense = Dense(self._vocab_size, self._activation)
        decoder = BasicDecoder(cell, helper, init_state, dense)
        output, state, _ = dynamic_decode(decoder, swap_memory=True)
        logits = output.rnn_output

        loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                       labels=one_hot_targets)
        loss = tf.reduce_mean(loss)

        out = tf.nn.softmax(logits)

        return sequence, authors, targets, loss, out
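The gpu lambda in this example spreads the stacked RNN layers across devices round-robin: layer i is pinned to "/gpu:" + str(i % num_gpu). A tiny standalone sketch of that index arithmetic (the device strings are purely illustrative):

num_gpu = 2                                          # toy value (assumed)
gpu = lambda x: str(x % num_gpu)

# top_cells in the example use "/gpu:" + gpu(i) for i = 1 .. cell_num - 1
devices = ["/gpu:" + gpu(i) for i in range(1, 5)]
assert devices == ["/gpu:1", "/gpu:0", "/gpu:1", "/gpu:0"]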
Example #8
    def initialize(
        self,
        inputs,
        input_lengths,
        num_speakers,
        speaker_id,
        mel_targets=None,
        linear_targets=None,
        loss_coeff=None,
        rnn_decoder_test_mode=False,
        is_randomly_initialized=False,
    ):

        is_training2 = linear_targets is not None  # this ends up True at test time; is that intended???
        is_training = not rnn_decoder_test_mode

        self.is_randomly_initialized = is_randomly_initialized

        with tf.variable_scope('inference') as scope:
            hp = self._hparams
            batch_size = tf.shape(inputs)[0]

            # Embeddings(256)
            char_embed_table = tf.get_variable(
                'embedding', [len(symbols), hp.embedding_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))

            zero_pad = True
            if zero_pad:  # logic borrowed from a transformer implementation
                # The <PAD> symbol (index 0) gets an embedding fixed to zeros that is not updated
                # during training, i.e. the first row (<PAD>) of the variable created by
                # get_variable above is never used.
                char_embed_table = tf.concat(
                    (tf.zeros(shape=[1, hp.embedding_size]),
                     char_embed_table[1:, :]), 0)

            # [N, T_in, embedding_size]
            char_embedded_inputs = tf.nn.embedding_lookup(
                char_embed_table, inputs)

            self.num_speakers = num_speakers
            if self.num_speakers > 1:
                if hp.speaker_embedding_size != 1:  # speaker_embedding_size = f(16)
                    speaker_embed_table = tf.get_variable(
                        'speaker_embedding',
                        [self.num_speakers, hp.speaker_embedding_size],
                        dtype=tf.float32,
                        initializer=tf.truncated_normal_initializer(
                            stddev=0.5))
                    # [N, T_in, speaker_embedding_size]
                    speaker_embed = tf.nn.embedding_lookup(
                        speaker_embed_table, speaker_id)

                if hp.model_type == 'deepvoice':
                    if hp.speaker_embedding_size == 1:
                        before_highway = get_embed(
                            speaker_id, self.num_speakers,
                            hp.enc_prenet_sizes[-1], "before_highway"
                        )  # 'enc_prenet_sizes': [f(256), f(128)]
                        encoder_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers, hp.enc_rnn_size * 2,
                            "encoder_rnn_init_state")

                        attention_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers,
                            hp.attention_state_size,
                            "attention_rnn_init_state")
                        decoder_rnn_init_states = [
                            get_embed(
                                speaker_id, self.num_speakers, hp.dec_rnn_size,
                                "decoder_rnn_init_states{}".format(idx + 1))
                            for idx in range(hp.dec_layer_num)
                        ]
                    else:
                        deep_dense = lambda x, dim: tf.layers.dense(
                            x, dim, activation=tf.nn.softsign
                        )  # softsign: x / (abs(x) + 1)

                        before_highway = deep_dense(speaker_embed,
                                                    hp.enc_prenet_sizes[-1])
                        encoder_rnn_init_state = deep_dense(
                            speaker_embed, hp.enc_rnn_size * 2)

                        attention_rnn_init_state = deep_dense(
                            speaker_embed, hp.attention_state_size)
                        decoder_rnn_init_states = [
                            deep_dense(speaker_embed, hp.dec_rnn_size)
                            for _ in range(hp.dec_layer_num)
                        ]

                    speaker_embed = None  # deepvoice does not use speaker_embed directly
                elif hp.model_type == 'simple':
                    # The simple model feeds speaker_embed into both DecoderPrenetWrapper and
                    # ConcatOutputAndAttentionWrapper and concatenates it there.
                    before_highway = None
                    encoder_rnn_init_state = None
                    attention_rnn_init_state = None
                    decoder_rnn_init_states = None
                else:
                    raise Exception(
                        " [!] Unkown multi-speaker model type: {}".format(
                            hp.model_type))
            else:
                # self.num_speakers =1인 경우
                speaker_embed = None
                before_highway = None
                encoder_rnn_init_state = None  # bidirectional GRU의 init state
                attention_rnn_init_state = None
                decoder_rnn_init_states = None

            ##############
            # Encoder
            ##############

            # [N, T_in, enc_prenet_sizes[-1]]
            prenet_outputs = prenet(
                char_embedded_inputs,
                is_training,
                hp.enc_prenet_sizes,
                hp.dropout_prob,
                scope='prenet'
            )  # 'enc_prenet_sizes': [f(256), f(128)],  dropout_prob = 0.5
            # ==> (N, T_in, 128)

            # enc_rnn_size = 128
            encoder_outputs = cbhg(
                prenet_outputs,
                input_lengths,
                is_training,
                hp.enc_bank_size,
                hp.enc_bank_channel_size,
                hp.enc_maxpool_width,
                hp.enc_highway_depth,
                hp.enc_rnn_size,
                hp.enc_proj_sizes,
                hp.enc_proj_width,
                scope="encoder_cbhg",
                before_highway=before_highway,
                encoder_rnn_init_state=encoder_rnn_init_state)

            ##############
            # Attention
            ##############

            # For manual control of attention
            self.is_manual_attention = tf.placeholder(
                tf.bool,
                shape=(),
                name='is_manual_attention',
            )
            self.manual_alignments = tf.placeholder(
                tf.float32,
                shape=[None, None, None],
                name="manual_alignments",
            )

            # single: attention_size = 128
            if hp.attention_type == 'bah_mon':
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=False)
            elif hp.attention_type == 'bah_mon_norm':  # added by hccho
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'loc_sen':  # Location Sensitive Attention
                attention_mechanism = LocationSensitiveAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'gmm':  # GMM Attention
                attention_mechanism = GmmAttention(
                    hp.attention_size,
                    memory=encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'bah_mon_norm_hccho':
                attention_mechanism = BahdanauMonotonicAttention_hccho(
                    hp.attention_size, encoder_outputs, normalize=True)
            elif hp.attention_type == 'bah_norm':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'luong_scaled':
                attention_mechanism = LuongAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    scale=True)
            elif hp.attention_type == 'luong':
                attention_mechanism = LuongAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'bah':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            else:
                raise Exception(" [!] Unkown attention type: {}".format(
                    hp.attention_type))

            # Combine DecoderPrenetWrapper and attention_mechanism into an AttentionWrapper.
            # carpedm20 re-implemented AttentionWrapper from the TensorFlow source code,
            # while Keith Ito simply used the stock TensorFlow AttentionWrapper.
            attention_cell = AttentionWrapper(
                GRUCell(hp.attention_state_size),
                attention_mechanism,
                self.is_manual_attention,
                self.manual_alignments,
                initial_cell_state=attention_rnn_init_state,
                alignment_history=True,
                output_attention=False
            )  # Note output_attention=False; attention_layer_size is not set, so attention equals the context vector.

            # attention_state_size = 256
            dec_prenet_outputs = DecoderPrenetWrapper(
                attention_cell, speaker_embed, is_training,
                hp.dec_prenet_sizes,
                hp.dropout_prob)  # dec_prenet_sizes =  [f(256), f(128)]

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            # [N, T_in, attention_size+attention_state_size]

            # dec_prenet_outputs passes an AttentionWrapperState (attention, cell_state, ...) to the next cell;
            # its output equals the cell_state, so the concat is [output(=cell_state) | attention].
            concat_cell = ConcatOutputAndAttentionWrapper(
                dec_prenet_outputs, embed_to_concat=speaker_embed
            )  # the new output is concat(output, attention, speaker_embed)

            # Decoder (layers specified bottom to top):  dec_rnn_size= 256
            cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)
                     ]  # OutputProjectionWrapper does not seem to be mentioned in the paper...
            for _ in range(hp.dec_layer_num):  # hp.dec_layer_num = 2
                cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size)))

            # [N, T_in, 256]
            decoder_cell = MultiRNNCell(cells, state_is_tuple=True)

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.reduction_factor
            )  # could be modified to also emit a stop token: (hp.num_mels + 1) * hp.reduction_factor
            decoder_init_state = output_cell.zero_state(
                batch_size=batch_size, dtype=tf.float32
            )  # calling zero_state here also includes the initial state already supplied to the AttentionWrapper above

            if hp.model_type == "deepvoice":
                # decoder_init_state[0] : AttentionWrapperState
                # = cell_state + attention + time + alignments + alignment_history
                # decoder_init_state[0][0] = attention_rnn_init_state (already applied via the AttentionWrapper's initial_cell_state)
                decoder_init_state = list(decoder_init_state)

                for idx, cell in enumerate(decoder_rnn_init_states):
                    shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                    shape2 = cell.get_shape().as_list()
                    if shape1 != shape2:
                        raise Exception(
                            " [!] Shape {} and {} should be equal".format(
                                shape1, shape2))
                    decoder_init_state[idx + 1] = cell

                decoder_init_state = tuple(decoder_init_state)

            if is_training2:
                # rnn_decoder_test_mode = True in test mode, False in train mode
                helper = TacoTrainingHelper(
                    inputs, mel_targets, hp.num_mels, hp.reduction_factor,
                    rnn_decoder_test_mode)  # inputs is only used to compute batch_size
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.reduction_factor)

            (decoder_outputs, _), final_decoder_state, _ = \
                tf.contrib.seq2seq.dynamic_decode(
                    BasicDecoder(output_cell, helper, decoder_init_state),
                    maximum_iterations=hp.max_iters)  # max_iters = 200

            # [N, T_out, M]
            mel_outputs = tf.reshape(decoder_outputs,
                                     [batch_size, -1, hp.num_mels])

            # Add post-processing CBHG:
            # [N, T_out, 256]
            #post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
            post_outputs = cbhg(mel_outputs,
                                None,
                                is_training,
                                hp.post_bank_size,
                                hp.post_bank_channel_size,
                                hp.post_maxpool_width,
                                hp.post_highway_depth,
                                hp.post_rnn_size,
                                hp.post_proj_sizes,
                                hp.post_proj_width,
                                scope='post_cbhg')

            if speaker_embed is not None and hp.model_type == 'simple':
                expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
                tiled_speaker_embedding = tf.tile(
                    expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])

                # [N, T_out, 256 + alpha]
                post_outputs = tf.concat(
                    [tiled_speaker_embedding, post_outputs], axis=-1)

            linear_outputs = tf.layers.dense(
                post_outputs, hp.num_freq)  # [N, T_out, F(1025)]

            # Grab alignments from the final decoder state:
            # The MultiRNNCell has 3 layers, so final_decoder_state is a length-3 tuple ==> final_decoder_state[0]
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(),
                [1, 2, 0
                 ])  # batch_size, text length(encoder), target length(decoder)

            self.inputs = inputs
            self.speaker_id = speaker_id
            self.input_lengths = input_lengths
            self.loss_coeff = loss_coeff
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.final_decoder_state = final_decoder_state

            log('=' * 40)
            log(' model_type: %s' % hp.model_type)
            log('=' * 40)

            log('Initialized Tacotron model. Dimensions: ')
            log('    embedding:                %d' %
                char_embedded_inputs.shape[-1])
            if speaker_embed is not None:
                log('    speaker embedding:        %d' %
                    speaker_embed.shape[-1])
            else:
                log('    speaker embedding:        None')
            log('    prenet out:               %d' % prenet_outputs.shape[-1])
            log('    encoder out:              %d' % encoder_outputs.shape[-1])
            log('    attention out:            %d' %
                attention_cell.output_size)
            log('    concat attn & out:        %d' % concat_cell.output_size)
            log('    decoder cell out:         %d' % decoder_cell.output_size)
            log('    decoder out (%d frames):  %d' %
                (hp.reduction_factor, decoder_outputs.shape[-1]))
            log('    decoder out (1 frame):    %d' % mel_outputs.shape[-1])
            log('    postnet out:              %d' % post_outputs.shape[-1])
            log('    linear out:               %d' % linear_outputs.shape[-1])
    def add_prediction_op(self):
        encoder_embed_seq = embed_sequence(
            self.inputs,
            vocab_size=self.config.vocab_size + 2,
            embed_dim=self.config.embedding_size,
            scope='embed')

        decoder_input_embed_seq = embed_sequence(
            self.labels[:, :-1],
            vocab_size=self.config.vocab_size + 2,
            embed_dim=self.config.embedding_size,
            scope='embed',
            reuse=True)

        with tf.variable_scope('embed', reuse=True):
            embeddings = tf.get_variable('embeddings')

        encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
            BasicLSTMCell(self.config.num_units, name="encoder"),
            encoder_embed_seq,
            dtype=tf.float32,
            sequence_length=self.lengths,
        )

        if self.config.train:
            tiled_encoder_outputs = encoder_outputs
            tiled_encoder_final_state = encoder_final_state
            tiled_sequence_length = self.lengths
        else:
            tiled_encoder_outputs = tile_batch(
                encoder_outputs, multiplier=self.config.beam_width)
            tiled_encoder_final_state = tile_batch(
                encoder_final_state, multiplier=self.config.beam_width)
            tiled_sequence_length = tile_batch(
                self.lengths, multiplier=self.config.beam_width)

        attention_mechanism = BahdanauAttention(
            num_units=self.config.num_units,
            memory=tiled_encoder_outputs,
            memory_sequence_length=tiled_sequence_length)

        attn_cell = AttentionWrapper(
            BasicLSTMCell(self.config.num_units, name="decoder"),
            attention_mechanism,
            attention_layer_size=self.config.num_units // 2)

        if self.config.train:
            batch_size = self.config.batch_size
        else:
            batch_size = self.config.batch_size * self.config.beam_width

        decoder_initial_state = attn_cell.zero_state(dtype=tf.float32,
                                                     batch_size=batch_size)
        decoder_initial_state = decoder_initial_state.clone(
            cell_state=tiled_encoder_final_state)

        output_layer = tf.layers.Dense(self.config.vocab_size + 2,
                                       use_bias=True,
                                       name='output_projection')

        if self.config.train:
            training_helper = TrainingHelper(inputs=decoder_input_embed_seq,
                                             sequence_length=self.lengths,
                                             name='training_helper')

            decoder = BasicDecoder(cell=attn_cell,
                                   helper=training_helper,
                                   initial_state=decoder_initial_state,
                                   output_layer=output_layer)
        else:

            def embed_and_input_proj(inputs):
                return tf.nn.embedding_lookup(embeddings, inputs)

            start_tokens = tf.ones([
                self.config.batch_size,
            ], tf.int32) * (self.config.vocab_size + 1)
            decoder = BeamSearchDecoder(
                cell=attn_cell,
                embedding=embed_and_input_proj,
                start_tokens=start_tokens,
                end_token=self.config.vocab_size,
                initial_state=decoder_initial_state,
                beam_width=self.config.beam_width,
                output_layer=output_layer,
            )

        if self.config.train:
            decoder_outputs, _, _ = dynamic_decode(
                decoder=decoder,
                impute_finished=True,
                maximum_iterations=self.config.max_sequence_length + 1)
            pred_logits = tf.identity(decoder_outputs.rnn_output,
                                      name="prediction")
        else:
            decoder_outputs, _, _ = dynamic_decode(
                decoder=decoder,
                impute_finished=False,
                maximum_iterations=self.config.max_sequence_length + 1)
            pred_logits = tf.identity(decoder_outputs.predicted_ids,
                                      name="prediction")
        return pred_logits
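
A note on the vocabulary layout used above: the embedding and the output projection have vocab_size + 2 entries, with id vocab_size acting as the end token and vocab_size + 1 as the start token (see start_tokens and end_token in the beam search branch). A small sketch, using a hypothetical id2word mapping that is not part of the model, of stripping those special ids from a predicted sequence:

def ids_to_tokens(ids, id2word, vocab_size):
    """Drop the GO/EOS ids and map the remaining ids back to words.

    `id2word` is an assumed dict {id: token}; it does not exist in the code above.
    """
    end_token = vocab_size        # matches end_token=self.config.vocab_size above
    start_token = vocab_size + 1  # matches the start_tokens construction above
    tokens = []
    for i in ids:
        if i == start_token:
            continue
        if i == end_token:
            break
        tokens.append(id2word.get(int(i), '<unk>'))
    return tokens

# Example: vocab_size=5, so 5 is EOS and 6 is GO.
# ids_to_tokens([6, 2, 3, 5, 0], {2: 'hello', 3: 'world'}, 5) -> ['hello', 'world']
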
Ejemplo n.º 10
    def initialize(self,
                   inputs,
                   input_lengths,
                   target_lengths,
                   prefixes=None,
                   speaker_ids=None,
                   mel_targets=None,
                   linear_targets=None):
        '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # inputs                                                                    # [N, T_in, D_input]
            speaker_embedding_table = tf.get_variable(
                'speaker_embedding_table',
                [hp.num_speakers, hp.speaker_embedding_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            speaker_embedding = tf.nn.embedding_lookup(
                speaker_embedding_table,
                speaker_ids)  # [N, T_in, hp.speaker_embedding_size]
            deep_dense = lambda x, dim: tf.layers.dense(
                x, dim, activation=tf.nn.softsign)
            before_highway = deep_dense(speaker_embedding, 128)
            encoder_rnn_init_state = deep_dense(speaker_embedding, 128 * 2)
            attention_rnn_init_state = deep_dense(speaker_embedding, 256)
            decoder_rnn_init_states = [
                deep_dense(speaker_embedding, 256) for _ in range(2)
            ]
            # Encoder
            prenet_outputs = prenet(inputs, is_training)  # [N, T_in, 128]
            encoder_outputs = encoder_cbhg(
                prenet_outputs,
                input_lengths,
                is_training,
                before_highway=before_highway,
                encoder_rnn_init_state=encoder_rnn_init_state
            )  # [N, T_in, 256]

            # Attention
            attention_cell = AttentionWrapper(
                DecoderPrenetWrapper(GRUCell(256), is_training),
                BahdanauAttention(256, encoder_outputs),
                initial_cell_state=attention_rnn_init_state,
                alignment_history=True,
                output_attention=False)  # [N, T_in, 256]

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                OutputProjectionWrapper(concat_cell, 256),
                ResidualWrapper(GRUCell(256)),
                ResidualWrapper(GRUCell(256))
            ],
                                        state_is_tuple=True)  # [N, T_in, 256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)
            # initially, decoder_init_state is a tuple, so we firstly convert it into a list,
            # decoder_init_state[0] is the projection wrapper, its initial state should be zero state
            # finally, convert list state into tuple
            decoder_init_state = list(decoder_init_state)
            for idx, cell in enumerate(decoder_rnn_init_states):
                decoder_init_state[idx + 1] = cell
            decoder_init_state = tuple(decoder_init_state)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.outputs_per_step)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(mel_outputs, hp.num_mels,
                                     is_training)  # [N, T_out, 256]
            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.speaker_ids = speaker_ids
            self.inputs = inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.target_lengths = target_lengths
            self.prefixes = prefixes
            log('Initialized Tacotron model. Dimensions: ')
            log('  inputs:                  %d' % inputs.shape[-1])
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames): %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
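
The list/tuple juggling above, which injects the speaker-conditioned tensors into every decoder layer except the attention wrapper at index 0, can be read as a small helper. A sketch of just that step, assuming the state-tuple layout shown in the code:

def override_decoder_init_state(zero_state, rnn_init_states):
    """Replace entries 1..len(rnn_init_states) of a MultiRNNCell state tuple.

    `zero_state` stands for the tuple returned by output_cell.zero_state(...);
    `rnn_init_states` are the speaker-conditioned tensors, one per decoder GRU layer.
    Entry 0 (the AttentionWrapperState) keeps its original value.
    """
    state = list(zero_state)
    for idx, init in enumerate(rnn_init_states):
        state[idx + 1] = init
    return tuple(state)

# decoder_init_state = override_decoder_init_state(
#     output_cell.zero_state(batch_size, tf.float32), decoder_rnn_init_states)
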
Ejemplo n.º 11
    def build_decoder_cell(self, encoder_outputs, encoder_states):
        '''
        Build the decoder cell; returns the decoder cell and its initial state.
        :param encoder_outputs:
        :param encoder_states:
        :return:
        '''
        encoder_input_length = self.encoder_inputs_length
        batch_size = self.batch_size

        if self.bidirectional:
            encoder_states = encoder_states[-self.depth:]

        if self.time_major:
            encoder_outputs = tf.transpose(encoder_outputs, (1, 0, 2))

        assert encoder_input_length is not None, 'encoder_input_length must not be None'
        assert isinstance(batch_size, int), 'batch_size must be an int'
        assert encoder_outputs is not None, 'encoder_outputs must not be None'
        assert encoder_states is not None, 'encoder_states must not be None'
        ######################### beam search case #####################################################
        if self.use_beamsearch_decode:
            '''tile_batch replicates the tensor self.beam_width times, so the
            batch effectively becomes self.beam_width times its original size
            '''
            encoder_outputs = seq2seq.tile_batch(encoder_outputs,
                                                 multiplier=self.beam_width)
            encoder_states = seq2seq.tile_batch(encoder_states,
                                                multiplier=self.beam_width)
            encoder_input_length = seq2seq.tile_batch(
                self.encoder_inputs_length, multiplier=self.beam_width)
            # when beam search is used, the effective batch size is batch_size multiplied by beam_width
            batch_size *= self.beam_width
        ######################### beam search case #####################################################

        ######################### attention mechanism ###########################################################
        if self.attention_type.lower() == 'luong':
            self.attention_mechanism = LuongAttention(
                num_units=self.hidden_units,
                memory=encoder_outputs,
                memory_sequence_length=encoder_input_length)
        else:
            self.attention_mechanism = BahdanauAttention(
                num_units=self.hidden_units,
                memory=encoder_outputs,
                memory_sequence_length=encoder_input_length
            )  # for a bidirectional LSTM, encoder_outputs are its hidden states
        ######################### attention mechanism ###########################################################

        cell = MultiRNNCell([
            self.build_single_cell(self.hidden_units,
                                   use_residual=self.use_residual)
            for _ in range(self.depth)
        ])
        # this cell is multi-layer

        alignment_history = (self.mode != 'train'
                             and not self.use_beamsearch_decode)

        # alignment_history is used only when not training and beam search is off

        def cell_input_fn(inputs, attention):
            '''
            Decides, based on the attn_input_feeding setting, whether to apply a
            projection before the attention computation.
            Only used when the attention mechanism is enabled.
            :param inputs:
            :param attention:
            :return:
            '''

            if not self.use_residual:
                print(inputs.get_shape(), 'inputs shape')
                print(attention.get_shape(), 'attention shape')
                print(array_ops.concat([inputs, attention], -1),
                      'inputs concatenated with attention')
                return array_ops.concat([inputs, attention], -1)

            attn_projection = layers.Dense(self.hidden_units,
                                           dtype=tf.float32,
                                           use_bias=False,
                                           name='attention_cell_input_fn')
            '''
            My understanding is that attn_projection(array_ops.concat([inputs, attention], -1))
            is equivalent to
            layers.Dense(self.hidden_units,
                         dtype=tf.float32,
                         use_bias=False,
                         name='attention_cell_input_fn')(array_ops.concat([inputs, attention], -1))
            Dense ultimately inherits from Layer, which defines call and __call__; Dense overrides
            call, __call__ invokes call, and call acts as a fully connected layer. __call__
            runs pre-process, call, post-process.
            '''
            return attn_projection(array_ops.concat([inputs, attention], -1))

        cell = AttentionWrapper(
            cell=cell,
            attention_mechanism=self.attention_mechanism,
            attention_layer_size=self.hidden_units,
            alignment_history=alignment_history,  # keep the attention alignment history
            cell_input_fn=cell_input_fn,  # concatenates the attention with the input
            name='Attention_Wrapper')  # AttentionWrapper wraps the cell with the attention mechanism

        decoder_initial_state = cell.zero_state(
            batch_size, tf.float32)  # initialize decoder_initial_state here

        # pass along the encoder state
        decoder_initial_state = decoder_initial_state.clone(
            cell_state=encoder_states)

        return cell, decoder_initial_state
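
To make the residual branch of cell_input_fn concrete: when use_residual is enabled, the concatenated [inputs | attention] vector is projected back to hidden_units so its shape matches the cell output that the residual connection adds to it. A numpy sketch of the shape arithmetic only; the real projection is a trained Dense layer, not a random matrix:

import numpy as np

hidden_units, attn_size, batch = 128, 128, 4
inputs = np.random.randn(batch, hidden_units).astype(np.float32)
attention = np.random.randn(batch, attn_size).astype(np.float32)

concat = np.concatenate([inputs, attention], axis=-1)   # [batch, hidden + attn]
W = np.random.randn(hidden_units + attn_size, hidden_units).astype(np.float32)
projected = concat @ W                                   # [batch, hidden_units]

assert projected.shape == inputs.shape  # the residual addition is now well defined
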
Ejemplo n.º 12
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None):
        '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            # Encoder, prenet_size=[256, 128]
            prenet_outputs = prenet(
                embedded_inputs, is_training,
                hp.prenet_size)  # [N, T_in, prenet_size[-1]=128]
            encoder_outputs = encoder_cbhg(
                prenet_outputs,
                input_lengths,
                is_training,  # [N, T_in, encoder_output_size=256]
                output_size=hp.encoder_output_size)

            # Attention RNN: computes attention between the target and the encoder output
            attention_cell = AttentionWrapper(
                # input_size = 128, output_size = 256
                cell=GRUCell(num_units=hp.attention_depth
                             ),  # output size = attention_depth = 256
                # input_size = output_size = 256
                attention_mechanism=BahdanauAttention(
                    num_units=hp.attention_depth, memory=encoder_outputs),
                alignment_history=True,
                output_attention=False)  # [N, T_in, attention_depth=256]

            # add a prenet before the attention RNN; prenet_size=[256, 128], prenet_output_size=128
            attention_cell = DecoderPrenetWrapper(attention_cell, is_training,
                                                  hp.prenet_size)

            # concatenate the attention context vector and the RNN cell output
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 2*attention_depth=512]

            # Decoder RNN: two residual layers (layers specified bottom to top):
            decoder_cell = MultiRNNCell(
                [
                    OutputProjectionWrapper(
                        cell=concat_cell,
                        output_size=hp.decoder_depth),  # 512 -> 256
                    ResidualWrapper(cell=GRUCell(hp.decoder_depth)),
                    ResidualWrapper(cell=GRUCell(hp.decoder_depth))
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Project onto r mel spectrograms (predict outputs_per_step frames):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            # the helper determines the initial input and the input at each next step
            if is_training:
                helper = TacoTrainingHelper(inputs=inputs,
                                            targets=mel_targets,
                                            output_dim=hp.num_mels,
                                            r=hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size=batch_size,
                                        output_dim=hp.num_mels,
                                        r=hp.outputs_per_step)
            # Decode: predict non-overlapping frames, e.g. r->(r+1..2r), 2r->(2r+1..3r), ...
            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper,
                              decoder_init_state),  # packaged into a decoder
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, num_mels*r ]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M=80]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(
                mel_outputs,
                hp.num_mels,
                is_training,  # [N, T_out, postnet_output_size=256]
                output_size=hp.postnet_output_size)
            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F=1025]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
Ejemplo n.º 13
def attention_decoder(inputs,
                      memory,
                      num_units=None,
                      batch_size=1,
                      inputs_length=None,
                      n_mels=80,
                      reduction=1,
                      default_max_iters=200,
                      is_training=True,
                      scope='attention_decoder',
                      reuse=None):
    """
    Applies a GRU to 'inputs', while attending 'memory'.
    :param inputs: A 3d tensor with shape of [N, T', C']. Decoder inputs.
    :param memory: A 3d tensor with shape of [N, T, C]. Outputs of encoder network.
    :param num_units: An int. Attention size.
    :param batch_size: An int. Batch size.
    :param inputs_length: An int32 Tensor. Lengths of the decoder input sequences.
    :param n_mels: An int. Number of Mel banks to generate.
    :param reduction: An int. Reduction factor. Paper => 2, 3, 5.
    :param default_max_iters: Default max iteration of decoding.
    :param is_training: running mode.
    :param scope: Optional scope for `variable_scope`.
    :param reuse: Boolean, whether to reuse the weights of a previous layer by the same name.
    :return: Decoder outputs with shape [N, T_out/r, n_mels*reduction] and the final decoder state.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # params setting
        if is_training:
            max_iters = None
        else:
            max_iters = default_max_iters
        # max_iters = default_max_iters
        if num_units is None:
            num_units = inputs.get_shape().as_list()[-1]

        # Decoder cell
        decoder_cell = tf.nn.rnn_cell.GRUCell(num_units)

        # Attention
        # [N, T_in, attention_depth]
        attention_cell = AttentionWrapper(decoder_cell,
                                          BahdanauAttention(num_units, memory),
                                          alignment_history=True)

        # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
        # [N, T_in, 2*attention_depth]
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)

        # Decoder (layers specified bottom to top):
        # [N, T_in, decoder_depth]
        decoder_cell = MultiRNNCell([
            OutputProjectionWrapper(concat_cell, num_units),
            ResidualWrapper(GRUCell(num_units)),
            ResidualWrapper(GRUCell(num_units))
        ],
                                    state_is_tuple=True)

        # Project onto r mel spectrogram (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(decoder_cell, n_mels * reduction)

        decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                    dtype=tf.float32)

        if is_training:
            # helper = TacotronTrainingHelper(batch_size, n_mels, reduction, inputs)
            helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=inputs, sequence_length=inputs_length, time_major=False)
        else:
            helper = TacotronInferenceHelper(batch_size, n_mels, reduction)

        decoder = BasicDecoder(output_cell, helper, decoder_init_state)
        # [N, T_out/r, M*r]
        (decoder_outputs, _), final_decoder_state, _ = dynamic_decode(
            decoder, maximum_iterations=max_iters)

    return decoder_outputs, final_decoder_state
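
The choice between TrainingHelper and TacotronInferenceHelper above comes down to what each decoder step is fed: the ground-truth frame during training (teacher forcing) versus the previously predicted frame at inference. A toy, framework-free sketch of that control flow; `step` is a hypothetical stand-in for one decoder step and is not part of the code above:

import numpy as np

def toy_decode(step, n_steps, n_mels, targets=None):
    """Minimal teacher-forcing vs free-running decode loop (illustration only)."""
    prev = np.zeros(n_mels, dtype=np.float32)  # <GO> frame
    outputs = []
    for t in range(n_steps):
        pred = step(prev)
        outputs.append(pred)
        # teacher forcing when targets are given, otherwise feed back the prediction
        prev = targets[t] if targets is not None else pred
    return np.stack(outputs)

# toy_decode(lambda prev: prev + 1.0, n_steps=4, n_mels=80)  # free-running example
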
Ejemplo n.º 14
    def build_decoder_cell(self, encoder_outputs, encoder_state):
        """构建解码器cell"""

        encoder_inputs_length = self.encoder_inputs_length
        batch_size = self.batch_size

        if self.bidirectional:
            encoder_state = encoder_state[-self.depth:]

        if self.time_major:
            encoder_outputs = tf.transpose(encoder_outputs, (1, 0, 2))

        # When using BeamSearchDecoder, some tensors must be tiled by beam_width:
        # encoder_outputs, encoder_state, encoder_inputs_length
        # needs to be tiled so that:
        # [batch_size, .., ..] -> [batch_size x beam_width, .., ..]
        if self.use_beamsearch_decode:
            encoder_outputs = seq2seq.tile_batch(
                encoder_outputs, multiplier=self.beam_width)
            encoder_state = seq2seq.tile_batch(
                encoder_state, multiplier=self.beam_width)
            encoder_inputs_length = seq2seq.tile_batch(
                self.encoder_inputs_length, multiplier=self.beam_width)
            # with beam search the effective batch is beam_width times batch_size
            batch_size *= self.beam_width

        # two different attention mechanisms below
        if self.attention_type.lower() == 'luong':
            # 'Luong' style attention: https://arxiv.org/abs/1508.04025
            self.attention_mechanism = LuongAttention(
                num_units=self.hidden_units,
                memory=encoder_outputs,
                memory_sequence_length=encoder_inputs_length
            )
        else: # Default Bahdanau
            # 'Bahdanau' style attention: https://arxiv.org/abs/1409.0473
            self.attention_mechanism = BahdanauAttention(
                num_units=self.hidden_units,
                memory=encoder_outputs,
                memory_sequence_length=encoder_inputs_length
            )

        # Building decoder_cell
        cell = MultiRNNCell([
            self.build_single_cell(
                self.hidden_units,
                use_residual=self.use_residual
            )
            for _ in range(self.depth)
        ])

        # In prediction (non-training) mode, with beam search off, keep the attention alignment history
        alignment_history = (
            self.mode != 'train' and not self.use_beamsearch_decode
        )

        def cell_input_fn(inputs, attention):
            """根据attn_input_feeding属性来判断是否在attention计算前进行一次投影计算
            """
            if not self.use_residual:
                return array_ops.concat([inputs, attention], -1)

            attn_projection = layers.Dense(self.hidden_units,
                                           dtype=tf.float32,
                                           use_bias=False,
                                           name='attention_cell_input_fn')
            return attn_projection(array_ops.concat([inputs, attention], -1))

        cell = AttentionWrapper(
            cell=cell,
            attention_mechanism=self.attention_mechanism,
            attention_layer_size=self.hidden_units,
            alignment_history=alignment_history,
            cell_input_fn=cell_input_fn,
            name='Attention_Wrapper')

        # zero state
        decoder_initial_state = cell.zero_state(
            batch_size, tf.float32)

        # pass along the encoder state
        decoder_initial_state = decoder_initial_state.clone(
            cell_state=encoder_state)

        # if self.use_beamsearch_decode:
        #     decoder_initial_state = seq2seq.tile_batch(
        #         decoder_initial_state, multiplier=self.beam_width)

        return cell, decoder_initial_state
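
The tile_batch calls above replicate every batch entry beam_width times along the batch axis, so each beam can be decoded as if it were its own batch element. A numpy sketch of the resulting layout, assuming entries are repeated consecutively (which is how seq2seq.tile_batch lays them out):

import numpy as np

encoder_outputs = np.array([[[1.0], [2.0]],      # batch item 0, T=2, C=1
                            [[3.0], [4.0]]])     # batch item 1
beam_width = 3

# Each batch entry repeated beam_width times in a row -> [batch * beam_width, T, C].
tiled = np.repeat(encoder_outputs, beam_width, axis=0)

assert tiled.shape == (2 * beam_width, 2, 1)
assert np.allclose(tiled[0], tiled[2])            # copies of batch item 0
assert np.allclose(tiled[3], encoder_outputs[1])  # first copy of batch item 1
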
Ejemplo n.º 15
    def _create_decoder_cell(self):
        enc_outputs, enc_states, enc_seq_len = self.enc_outputs, self.enc_states, self.enc_seq_len
        if self.use_beam_search:
            enc_outputs = tile_batch(enc_outputs,
                                     multiplier=self.cfg.beam_size)
            enc_states = nest.map_structure(
                lambda s: tile_batch(s, self.cfg.beam_size), enc_states)
            enc_seq_len = tile_batch(self.enc_seq_len,
                                     multiplier=self.cfg.beam_size)
        batch_size = self.batch_size * self.cfg.beam_size if self.use_beam_search else self.batch_size
        with tf.variable_scope("attention"):
            if self.cfg.attention == "luong":  # Luong attention mechanism
                attention_mechanism = LuongAttention(
                    num_units=self.cfg.num_units,
                    memory=enc_outputs,
                    memory_sequence_length=enc_seq_len)
            else:  # default using Bahdanau attention mechanism
                attention_mechanism = BahdanauAttention(
                    num_units=self.cfg.num_units,
                    memory=enc_outputs,
                    memory_sequence_length=enc_seq_len)

        def cell_input_fn(
            inputs, attention
        ):  # define cell input function to keep input/output dimension same
            # reference: https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/AttentionWrapper
            if not self.cfg.use_attention_input_feeding:
                return inputs
            input_project = tf.layers.Dense(self.cfg.num_units,
                                            dtype=tf.float32,
                                            name='attn_input_feeding')
            return input_project(tf.concat([inputs, attention], axis=-1))

        if self.cfg.top_attention:  # apply attention mechanism only on the top decoder layer
            cells = [
                self._create_rnn_cell() for _ in range(self.cfg.num_layers)
            ]
            cells[-1] = AttentionWrapper(
                cells[-1],
                attention_mechanism=attention_mechanism,
                name="Attention_Wrapper",
                attention_layer_size=self.cfg.num_units,
                initial_cell_state=enc_states[-1],
                cell_input_fn=cell_input_fn)
            initial_state = [state for state in enc_states]
            initial_state[-1] = cells[-1].zero_state(batch_size=batch_size,
                                                     dtype=tf.float32)
            dec_init_states = tuple(initial_state)
            cells = MultiRNNCell(cells)
        else:
            cells = MultiRNNCell(
                [self._create_rnn_cell() for _ in range(self.cfg.num_layers)])
            cells = AttentionWrapper(cells,
                                     attention_mechanism=attention_mechanism,
                                     name="Attention_Wrapper",
                                     attention_layer_size=self.cfg.num_units,
                                     initial_cell_state=enc_states,
                                     cell_input_fn=cell_input_fn)
            dec_init_states = cells.zero_state(
                batch_size=batch_size,
                dtype=tf.float32).clone(cell_state=enc_states)
        return cells, dec_init_states
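
In the top_attention branch above only the last decoder layer is wrapped with attention, so the initial MultiRNNCell state mixes the raw encoder states for the lower layers with the attention wrapper's zero state on top. A sketch of just that state assembly, with plain placeholders standing in for the real state objects:

def build_top_attention_init_state(enc_states, top_zero_state):
    """Lower layers start from the encoder states; the attention-wrapped top
    layer starts from its own zero state (which already carries enc_states[-1]
    via initial_cell_state). Arguments here are placeholders for illustration."""
    initial_state = list(enc_states)
    initial_state[-1] = top_zero_state
    return tuple(initial_state)

# e.g. with the cells built above:
# dec_init_states = build_top_attention_init_state(
#     enc_states, cells[-1].zero_state(batch_size, tf.float32))
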
Ejemplo n.º 16
    def build_decoder_cell(self, encoder_outputs, encoder_state):
        """ 构建解码器cell """
        encoder_inputs_length = self.encoder_inputs_length
        batch_size = self.batch_size

        if self.bidirectional:
            encoder_state = encoder_state[-self.depth:]

        if self.time_major:
            encoder_outputs = tf.transpose(encoder_outputs, (1, 0, 2))

        if self.use_beamsearch_decode:
            encoder_outputs = seq2seq.tile_batch(encoder_outputs,
                                                 multiplier=self.beam_width)
            encoder_state = seq2seq.tile_batch(encoder_state,
                                               multiplier=self.beam_width)
            encoder_inputs_length = seq2seq.tile_batch(
                self.encoder_inputs_length, multiplier=self.beam_width)
            # with beam search, the effective batch is batch_size times beam_width
            batch_size *= self.beam_width

        if self.attention_type.lower() == 'luong':
            self.attention_mechanism = LuongAttention(
                num_units=self.hidden_units,
                memory=encoder_outputs,
                memory_sequence_length=encoder_inputs_length)
        else:
            # BahdanauAttention is initialized with num_units and the encoder outputs;
            # at call time it takes a query and returns the alignment weights.
            self.attention_mechanism = BahdanauAttention(
                num_units=self.hidden_units,
                memory=encoder_outputs,
                memory_sequence_length=encoder_inputs_length)

        cell = MultiRNNCell([
            self.build_signle_cell(self.hidden_units,
                                   use_residual=self.use_residual)
            for _ in range(self.depth)
        ])
        # In prediction (non-training) mode, with beam search off, keep the attention alignment history
        alignment_history = (self.mode != 'train'
                             and not self.use_beamsearch_decode)

        def cell_input_fn(inputs, attention):
            """ 根据attn_input_feeding属性来判断是否在attention计算前进行一次投影的计算"""
            if not self.use_residual:
                return array_ops.concat([inputs, attention], -1)

            attn_projection = layers.Dense(self.hidden_units,
                                           dtype=tf.float32,
                                           use_bias=False,
                                           name='attention_cell_input_fn')
            return attn_projection(array_ops.concat([inputs, attention], -1))

        attention_cell = AttentionWrapper(
            cell=cell,
            attention_mechanism=self.attention_mechanism,
            attention_layer_size=self.hidden_units,
            alignment_history=alignment_history,
            cell_input_fn=cell_input_fn,
            name='AttentionWrapper')
        # zero state
        decoder_initial_state = attention_cell.zero_state(
            batch_size, tf.float32)

        # Pass along the encoder state: the decoder's initial state is set directly from the encoder's final hidden state
        decoder_initial_state = decoder_initial_state.clone(
            cell_state=encoder_state)
        return attention_cell, decoder_initial_state
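
The zero_state(...).clone(cell_state=encoder_state) pattern above relies on AttentionWrapperState behaving like a namedtuple whose clone swaps the named fields and keeps the rest (attention, time, alignments) at their zero values. A minimal namedtuple analogue of that idea, not the TensorFlow class itself:

from collections import namedtuple

# Simplified stand-in for AttentionWrapperState (the real one has more fields).
FakeAttentionState = namedtuple('FakeAttentionState',
                                ['cell_state', 'attention', 'time'])

zero = FakeAttentionState(cell_state=None, attention=0.0, time=0)

# clone(cell_state=...) behaves much like namedtuple._replace: only the named
# field changes, the rest of the zero state is kept as-is.
warm = zero._replace(cell_state='encoder_final_state')

assert warm.attention == 0.0 and warm.time == 0
assert warm.cell_state == 'encoder_final_state'
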
Ejemplo n.º 17
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   pml_targets=None,
                   gta=False,
                   locked_alignments=None,
                   logs_enabled=True):
        '''Initializes the model for inference.

        Sets "pml_outputs", and "alignments" fields.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
          pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is number of
            steps in the PML vocoder features trajectories, P is pml_dimension, and values are PML vocoder
            features. Only needed for training.
          gta: boolean flag that is set to True when ground truth alignment is required
          locked_alignments: when explicit attention alignment is required, the locked alignments are passed in this
            parameter and the attention alignments are locked to these values
          logs_enabled: boolean flag that defaults to True, if False no construction logs output
        '''
        # fix the alignments shape to (batch_size, encoder_steps, decoder_steps) if not already including
        # batch dimension
        locked_alignments_ = locked_alignments

        if locked_alignments_ is not None:
            if np.ndim(locked_alignments_) < 3:
                locked_alignments_ = np.expand_dims(locked_alignments_, 0)

        with tf.variable_scope('inference') as scope:
            is_training = pml_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            # Encoder
            prenet_outputs = prenet(
                embedded_inputs, is_training,
                hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
            encoder_outputs = encoder_cbhg(
                prenet_outputs,
                input_lengths,
                is_training,  # [N, T_in, encoder_depth=256]
                hp.encoder_depth)

            # Attention
            attention_mechanism = BahdanauAttention(hp.attention_depth,
                                                    encoder_outputs)

            attention_cell = LockableAttentionWrapper(
                GRUCell(hp.attention_depth),
                attention_mechanism,
                alignment_history=True,
                locked_alignments=locked_alignments_,
                output_attention=False,
                name='attention_wrapper')  # [N, T_in, attention_depth=256]

            # Apply prenet before concatenation in AttentionWrapper.
            prenet_cell = DecoderPrenetWrapper(attention_cell, is_training,
                                               hp.prenet_depths)

            # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                prenet_cell)  # [N, T_in, 2*attention_depth=512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell(
                [
                    OutputProjectionWrapper(concat_cell, hp.decoder_depth),
                    ResidualWrapper(GRUCell(hp.decoder_depth)),
                    ResidualWrapper(GRUCell(hp.decoder_depth))
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Project onto r PML feature vectors (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.pml_dimension * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if is_training or gta:
                helper = TacoTrainingHelper(inputs, pml_targets,
                                            hp.pml_dimension,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.pml_dimension,
                                        hp.outputs_per_step)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

            # Reshape outputs to be one output per entry
            pml_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.pml_outputs = pml_outputs
            self.alignments = alignments
            self.pml_targets = pml_targets
            self.attention_cell = attention_cell

            if logs_enabled:
                log('Initialized Tacotron model. Dimensions: ')
                log('  embedding:               %d' %
                    embedded_inputs.shape[-1])
                log('  prenet out:              %d' % prenet_outputs.shape[-1])
                log('  encoder out:             %d' %
                    encoder_outputs.shape[-1])
                log('  attention out:           %d' %
                    attention_cell.output_size)
                log('  concat attn & out:       %d' % concat_cell.output_size)
                log('  decoder cell out:        %d' % decoder_cell.output_size)
                log('  decoder out (%d frames):  %d' %
                    (hp.outputs_per_step, decoder_outputs.shape[-1]))
                log('  decoder out (1 frame):   %d' % pml_outputs.shape[-1])
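
The alignments transpose used throughout these examples converts alignment_history.stack(), which is ordered [decoder_steps, batch, encoder_steps], into the [batch, encoder_steps, decoder_steps] layout described in the earlier comments. A numpy sketch of that permutation:

import numpy as np

T_dec, N, T_enc = 7, 2, 5
stacked_history = np.random.rand(T_dec, N, T_enc)   # like alignment_history.stack()

alignments = np.transpose(stacked_history, (1, 2, 0))  # -> [N, T_enc, T_dec]

assert alignments.shape == (N, T_enc, T_dec)
# attention weight over encoder step s at decoder step t for batch item b:
b, s, t = 1, 3, 6
assert alignments[b, s, t] == stacked_history[t, b, s]
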
Ejemplo n.º 18
    def initialize(
            self, inputs, input_lengths, num_speakers, speaker_id,
            mel_targets=None, linear_targets=None, loss_coeff=None,
            rnn_decoder_test_mode=False, is_randomly_initialized=False,
        ):

        is_training = linear_targets is not None
        self.is_randomly_initialized = is_randomly_initialized

        # when using get_variable(), variables are looked up inside the 'inference' scope
        with tf.variable_scope('inference') as scope:
            hp = self._hparams
            batch_size = tf.shape(inputs)[0]

            # Embeddings
            char_embed_table = tf.get_variable(
                    'embedding', [len(symbols), hp.embedding_size], dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
            # [N, T_in, embedding_size]
            char_embedded_inputs = \
                    tf.nn.embedding_lookup(char_embed_table, inputs)

            self.num_speakers = num_speakers
            if self.num_speakers > 1:
                if hp.speaker_embedding_size != 1:
                    speaker_embed_table = tf.get_variable(
                            'speaker_embedding',
                            [self.num_speakers, hp.speaker_embedding_size], dtype=tf.float32,
                            initializer=tf.truncated_normal_initializer(stddev=0.5))
                    # [N, T_in, speaker_embedding_size]
                    speaker_embed = tf.nn.embedding_lookup(speaker_embed_table, speaker_id)

                if hp.model_type == 'deepvoice':
                    if hp.speaker_embedding_size == 1:
                        before_highway = get_embed(
                                speaker_id, self.num_speakers, 
                                hp.enc_prenet_sizes[-1], "before_highway")
                        encoder_rnn_init_state = get_embed(
                                speaker_id, self.num_speakers, 
                                hp.enc_rnn_size * 2, "encoder_rnn_init_state")

                        attention_rnn_init_state = get_embed(
                                speaker_id, self.num_speakers, 
                                hp.attention_state_size, "attention_rnn_init_state")
                        decoder_rnn_init_states = [get_embed(
                                speaker_id, self.num_speakers, 
                                hp.dec_rnn_size, "decoder_rnn_init_states{}".format(idx + 1)) \
                                        for idx in range(hp.dec_layer_num)]
                    else:
                        deep_dense = lambda x, dim: \
                                tf.layers.dense(x, dim, activation=tf.nn.softsign)

                        before_highway = deep_dense(
                                speaker_embed, hp.enc_prenet_sizes[-1])
                        encoder_rnn_init_state = deep_dense(
                                speaker_embed, hp.enc_rnn_size * 2)

                        attention_rnn_init_state = deep_dense(
                                speaker_embed, hp.attention_state_size)
                        decoder_rnn_init_states = [deep_dense(
                                speaker_embed, hp.dec_rnn_size) for _ in range(hp.dec_layer_num)]

                    speaker_embed = None # deepvoice does not use speaker_embed directly
                elif hp.model_type == 'simple':
                    before_highway = None
                    encoder_rnn_init_state = None
                    attention_rnn_init_state = None
                    decoder_rnn_init_states = None
                else:
                    raise Exception(" [!] Unkown multi-speaker model type: {}".format(hp.model_type))
            else:
                speaker_embed = None
                before_highway = None
                encoder_rnn_init_state = None
                attention_rnn_init_state = None
                decoder_rnn_init_states = None

            ##############
            # Encoder
            ##############

            # [N, T_in, enc_prenet_sizes[-1]]
            prenet_outputs = prenet(char_embedded_inputs, is_training,
                    hp.enc_prenet_sizes, hp.dropout_prob,
                    scope='prenet')

            encoder_outputs = cbhg(
                    prenet_outputs, input_lengths, is_training,
                    hp.enc_bank_size, hp.enc_bank_channel_size,
                    hp.enc_maxpool_width, hp.enc_highway_depth, hp.enc_rnn_size,
                    hp.enc_proj_sizes, hp.enc_proj_width,
                    scope="encoder_cbhg",
                    before_highway=before_highway,
                    encoder_rnn_init_state=encoder_rnn_init_state)


            ##############
            # Attention
            ##############

            # For manual control of attention
            self.is_manual_attention = tf.placeholder(
                    tf.bool, shape=(), name='is_manual_attention',
            )
            self.manual_alignments = tf.placeholder(
                    tf.float32, shape=[None, None, None], name="manual_alignments",
            )

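            # Wrap the attention GRU cell with the decoder prenet, so decoder inputs
            # are passed through the prenet (and combined with the speaker embedding,
            # when one is provided) before reaching the cell.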
            dec_prenet_outputs = DecoderPrenetWrapper(
                    GRUCell(hp.attention_state_size),
                    speaker_embed,
                    is_training, hp.dec_prenet_sizes, hp.dropout_prob)

            if hp.attention_type == 'bah_mon':
                attention_mechanism = BahdanauMonotonicAttention(
                        hp.attention_size, encoder_outputs)
            elif hp.attention_type == 'bah_norm':
                attention_mechanism = BahdanauAttention(
                        hp.attention_size, encoder_outputs, normalize=True)
            elif hp.attention_type == 'luong_scaled':
                attention_mechanism = LuongAttention(
                        hp.attention_size, encoder_outputs, scale=True)
            elif hp.attention_type == 'luong':
                attention_mechanism = LuongAttention(
                        hp.attention_size, encoder_outputs)
            elif hp.attention_type == 'bah':
                attention_mechanism = BahdanauAttention(
                        hp.attention_size, encoder_outputs)
            elif hp.attention_type.startswith('ntm2'):
                shift_width = int(hp.attention_type.split('-')[-1])
                attention_mechanism = NTMAttention2(
                        hp.attention_size, encoder_outputs, shift_width=shift_width)
            else:
                raise Exception(" [!] Unknown attention type: {}".format(hp.attention_type))

            attention_cell = AttentionWrapper(
                    dec_prenet_outputs,
                    attention_mechanism,
                    self.is_manual_attention,
                    self.manual_alignments,
                    initial_cell_state=attention_rnn_init_state,
                    alignment_history=True,
                    output_attention=False
            )

            # Concatenate the attention context vector and the RNN cell output.
            # [N, T_in, attention_size + attention_state_size]
            concat_cell = ConcatOutputAndAttentionWrapper(
                    attention_cell, embed_to_concat=speaker_embed)
                        
            # Decoder (layers specified bottom to top):
            cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)]
            for _ in range(hp.dec_layer_num):
                cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size)))

            # [N, T_in, 256]
            decoder_cell = MultiRNNCell(cells, state_is_tuple=True)

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                    decoder_cell, hp.num_mels * hp.reduction_factor)
            decoder_init_state = output_cell.zero_state(
                    batch_size=batch_size, dtype=tf.float32)

            if hp.model_type == "deepvoice":
                # decoder_init_state[0] : AttentionWrapperState
                # = cell_state + attention + time + alignments + alignment_history
                # decoder_init_state[0][0] = attention_rnn_init_state (already applied)
                decoder_init_state = list(decoder_init_state)

                for idx, cell in enumerate(decoder_rnn_init_states):
                    shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                    shape2 = cell.get_shape().as_list()
                    if shape1 != shape2:
                        raise Exception(" [!] Shape {} and {} should be equal". \
                                format(shape1, shape2))
                    decoder_init_state[idx + 1] = cell

                decoder_init_state = tuple(decoder_init_state)

            if is_training:
                helper = TacoTrainingHelper(
                        inputs, mel_targets, hp.num_mels, hp.reduction_factor,
                        rnn_decoder_test_mode)
            else:
                helper = TacoTestHelper(
                        batch_size, hp.num_mels, hp.reduction_factor)

            (decoder_outputs, _), final_decoder_state, _ = \
                    tf.contrib.seq2seq.dynamic_decode(
                            BasicDecoder(output_cell, helper, decoder_init_state),
                            maximum_iterations=hp.max_iters)

            # [N, T_out, M]
            mel_outputs = tf.reshape(
                    decoder_outputs, [batch_size, -1, hp.num_mels])

            # Add post-processing CBHG:
            # [N, T_out, 256]
            #post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
            post_outputs = cbhg(
                    mel_outputs, None, is_training,
                    hp.post_bank_size, hp.post_bank_channel_size,
                    hp.post_maxpool_width, hp.post_highway_depth, hp.post_rnn_size,
                    hp.post_proj_sizes, hp.post_proj_width,
                    scope='post_cbhg')

            if speaker_embed is not None and hp.model_type == 'simple':
                expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
                tiled_speaker_embedding = tf.tile(
                        expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])

                # [N, T_out, 256 + alpha]
                post_outputs = \
                        tf.concat([tiled_speaker_embedding, post_outputs], axis=-1)

            linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)    # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                    final_decoder_state[0].alignment_history.stack(), [1, 2, 0])


            self.inputs = inputs
            self.speaker_id = speaker_id
            self.input_lengths = input_lengths
            self.loss_coeff = loss_coeff
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.final_decoder_state = final_decoder_state

            log('='*40)
            log(' model_type: %s' % hp.model_type)
            log('='*40)

            log('Initialized Tacotron model. Dimensions: ')
            log('    embedding:                %d' % char_embedded_inputs.shape[-1])
            if speaker_embed is not None:
                log('    speaker embedding:        %d' % speaker_embed.shape[-1])
            else:
                log('    speaker embedding:        None')
            log('    prenet out:               %d' % prenet_outputs.shape[-1])
            log('    encoder out:              %d' % encoder_outputs.shape[-1])
            log('    attention out:            %d' % attention_cell.output_size)
            log('    concat attn & out:        %d' % concat_cell.output_size)
            log('    decoder cell out:         %d' % decoder_cell.output_size)
            log('    decoder out (%d frames):  %d' % (hp.reduction_factor, decoder_outputs.shape[-1]))
            log('    decoder out (1 frame):    %d' % mel_outputs.shape[-1])
            log('    postnet out:              %d' % post_outputs.shape[-1])
            log('    linear out:               %d' % linear_outputs.shape[-1])
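
A minimal driver sketch (not part of the original listing): the exact signature of initialize() is not shown here, but the attributes assigned at the end (self.inputs, self.speaker_id, self.loss_coeff, ...) suggest a caller that builds placeholders roughly like the following. The hyperparameter values used for `hp` are illustrative assumptions.

    import tensorflow as tf
    from types import SimpleNamespace

    # hp stands in for the hyperparameter object used above; only the fields
    # needed for the target placeholders are filled in (illustrative values).
    hp = SimpleNamespace(num_mels=80, num_freq=1025)

    # Shapes follow the comments in the example: N = batch, T_in = input steps,
    # T_out = output steps, M = hp.num_mels, F = hp.num_freq.
    inputs = tf.placeholder(tf.int32, [None, None], name='inputs')            # [N, T_in] character IDs
    input_lengths = tf.placeholder(tf.int32, [None], name='input_lengths')    # [N]
    speaker_id = tf.placeholder(tf.int32, [None], name='speaker_id')          # [N]
    loss_coeff = tf.placeholder(tf.float32, [None], name='loss_coeff')        # [N] per-example loss weights
    mel_targets = tf.placeholder(tf.float32, [None, None, hp.num_mels])       # [N, T_out, M]
    linear_targets = tf.placeholder(tf.float32, [None, None, hp.num_freq])    # [N, T_out, F]
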
Example #19
    def __init__(self,
                 vocab_size,
                 positional_embeddings=False,
                 beam_width=1,
                 alignment_history=False):
        """
        Initialize global variables and compute graph
        """
        # vocabulary / decoding parameters
        self.beam_width = beam_width
        self.attention_mode = 0
        self.vocab_size = vocab_size
        self.learning_rate = tf.placeholder(tf.float32)
        # input image: [N, 46, W, 1] single-channel strips of fixed height 46
        self.input_image = tf.placeholder(tf.float32,
                                          shape=(None, 46, None, 1),
                                          name='img_data')
        self.batch_size = tf.shape(self.input_image)[0]

        # attention part placeholder
        self.att_label = tf.placeholder(tf.int32,
                                        shape=[None, None],
                                        name='att_label')
        self.att_train_length = tf.placeholder(tf.int32,
                                               shape=[None],
                                               name='att_train_length')
        # self.eight = tf.constant(8, dtype=tf.int32)
        # ctc part placeholder
        self.ctc_label = tf.sparse_placeholder(tf.int32, name='ctc_label')
        self.ctc_feature_length = tf.placeholder(tf.int32,
                                                 shape=[None],
                                                 name='ctc_feature_length')
        self.max_dec_iteration = tf.placeholder(tf.int32, shape=[1])
        self.enc_lstm_dim = 256
        self.dec_lstm_dim = 512
        self.embedding_size = 512
        self.ctc_loss_weights = 0.2
        self.att_loss_weights = 1 - self.ctc_loss_weights
        self.wd = 0.00002
        self.momentum = 0.9
        self.embedding = tf.get_variable(
            "embedding", [self.vocab_size, self.embedding_size])

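        # Convolutional feature extractor over the input image, then an RNN encoder
        # over the resulting feature sequence; `mode` is not defined in this snippet
        # and presumably comes from the enclosing module.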
        self.cnn_out, self.sequence_len = convnet_layers(
            self.input_image, self.ctc_feature_length, mode)
        self.enc_outputs = rnn_layers(self.cnn_out, self.sequence_len,
                                      self.enc_lstm_dim)

        attention_weights_depth = 2 * self.enc_lstm_dim
        attention_layer_size = 2 * self.enc_lstm_dim
        attention_states = tf.reshape(
            self.enc_outputs, [self.batch_size, -1, 2 * self.enc_lstm_dim])
        attention_states_tiled = tile_batch(
            attention_states, self.beam_width)  # tile encoder states across the beam for beam-search decoding

        attention_mechanism = BahdanauAttention(attention_weights_depth,
                                                attention_states_tiled)

        dec_lstm_cell = tf.nn.rnn_cell.LSTMCell(self.dec_lstm_dim)
        self.cell = AttentionWrapper(cell=dec_lstm_cell,
                                     attention_mechanism=attention_mechanism,
                                     attention_layer_size=attention_layer_size,
                                     alignment_history=alignment_history)
        self.setup_decoder()
        self.final_outputs, self.final_state, _ = dynamic_decode(
            self.decoder, maximum_iterations=self.max_dec_iteration[0] - 1)
        self.ctc_loss_branch()
        self.finalize_model()
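
A hedged usage sketch (not in the original listing): the class name OCRModel and the concrete feed values are assumptions, since only __init__ is shown; ctc_label is a sparse placeholder and would be fed as a tf.SparseTensorValue.

    import numpy as np
    import tensorflow as tf

    model = OCRModel(vocab_size=80)   # class name assumed; only __init__ appears above
    feed = {
        model.input_image: np.zeros((2, 46, 320, 1), np.float32),   # [N, 46, W, 1] image strips
        model.att_label: np.zeros((2, 20), np.int32),                # attention-decoder targets
        model.att_train_length: np.array([20, 20], np.int32),
        model.ctc_feature_length: np.array([320, 320], np.int32),
        model.max_dec_iteration: np.array([21], np.int32),
        model.learning_rate: 1e-3,
        # model.ctc_label: tf.SparseTensorValue(indices, values, dense_shape)
    }
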
Example #20
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None):
        '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), 256],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(embedding_table,
                                                     inputs)  # [N, T_in, 256]

            # Encoder
            prenet_outputs = prenet(embedded_inputs,
                                    is_training)  # [N, T_in, 128]
            encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths,
                                           is_training)  # [N, T_in, 256]

            # Attention
            attention_cell = AttentionWrapper(
                DecoderPrenetWrapper(GRUCell(256), is_training),
                BahdanauAttention(256, encoder_outputs),
                alignment_history=True,
                output_attention=False)  # [N, T_in, 256]

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell(
                [
                    OutputProjectionWrapper(concat_cell, 256),
                    #ResidualWrapper(GRUCell(256)),
                    #ResidualWrapper(GRUCell(256))
                    ResidualWrapper(ZoneoutWrapper(LSTMCell(256), 0.1)),
                    ResidualWrapper(ZoneoutWrapper(LSTMCell(256), 0.1))
                ],
                state_is_tuple=True)  # [N, T_in, 256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.outputs_per_step)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(mel_outputs, hp.num_mels,
                                     is_training)  # [N, T_out, 256]
            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
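
The decoder above predicts r = outputs_per_step mel frames per step, and the tf.reshape unrolls them into one frame per row. A small standalone NumPy illustration of that reshape (values arbitrary):

    import numpy as np

    N, dec_steps, M, r = 2, 5, 80, 3                                 # batch, decoder steps, num_mels, outputs_per_step
    decoder_outputs = np.zeros((N, dec_steps, M * r), np.float32)    # [N, T_out/r, M*r]
    mel_outputs = decoder_outputs.reshape(N, dec_steps * r, M)       # [N, T_out, M]
    assert mel_outputs.shape == (2, 15, 80)
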
Example #21
    def initialize(self, txt_targets, txt_lengths, mel_targets, image_targets):
        with tf.variable_scope('inference') as scope:
            is_training = mel_targets is not None
            is_teacher_force_generating = mel_targets is not None
            batch_size = tf.shape(txt_targets)[0]  # 'inputs' is not a parameter of this method; use the text targets
            hp = self._hparams

            # Embeddings for text
            embedding_table = tf.get_variable(
                'text_embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_txt_inputs = tf.nn.embedding_lookup(
                embedding_table, txt_targets)  # [N, T_in, 256]

            # Text Encoder
            prenet_outputs = prenet(embedded_txt_inputs,
                                    is_training)  # [N, T_in, 128]
            txt_encoder_outputs = encoder_cbhg(prenet_outputs, txt_lengths,
                                               is_training)  # [N, T_in, 256]
            self.z_txt = txt_encoder_outputs

            # Speech Encoder
            speech_outputs = reference_encoder(
                mel_targets,
                filters=hp.reference_filters,
                kernel_size=(3, 3),
                strides=(2, 2),
                encoder_cell=GRUCell(hp.reference_depth),
                is_training=is_training)  # [N, 256]
            self.z_speech = speech_outputs

            # Image Encoder
            img_outputs = image_encoder('E',
                                        is_training=is_training,
                                        norm='batch',
                                        image_size=128)
            self.z_img = img_outputs

            def global_body(self, input):
                # Global computing body (share weights)
                # information fusion encoder
                self.z_fuse = info_encoder(input)  # [N, 1, 256]
                # Global  tokens (GST)
                gst_tokens = tf.get_variable(
                    'global_tokens',
                    [hp.num_gst, hp.embed_depth // hp.num_heads],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
                self.gst_tokens = gst_tokens

                # Attention
                attention = MultiheadAttention(
                    tf.expand_dims(self.z_fuse, axis=1),  # [N, 1, 256]
                    tf.tanh(
                        tf.tile(tf.expand_dims(gst_tokens, axis=0),
                                [batch_size, 1, 1])),  # [N, hp.num_gst, 256/hp.num_heads]
                    num_heads=hp.num_heads,
                    num_units=hp.style_att_dim,
                    attention_type=hp.style_att_type)

                output = attention.multi_head_attention()  # [N, 1, 256]
                self.uni_embedding = output
                return self.uni_embedding
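
            # NOTE: global_body is defined above but never invoked in this snippet;
            # the fused embedding it would produce (self.uni_embedding, referenced
            # below as uni_embeddings) is presumably computed elsewhere.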

            # Domain classification network
            domain_logit_txt = domain_classifier('D', info_encoder(self.z_txt),
                                                 is_training=is_training,
                                                 norm='batch')

            domain_logit_img = domain_classifier('D', info_encoder(self.z_img),
                                                 is_training=is_training,
                                                 norm='batch')

            domain_logit_speech = domain_classifier('D', info_encoder(self.z_speech),
                                                    is_training=is_training,
                                                    norm='batch')

        # out of inference scope
        # Add style embedding to every text encoder state

        # Text Decoder scope
        with tf.variable_scope('text_decoder') as scope:

            attention_cell = AttentionWrapper(
                GRUCell(hp.attention_depth),
                BahdanauAttention(hp.attention_depth,
                                  uni_embeddings,
                                  memory_sequence_length=input_lengths),
                alignment_history=True,
                output_attention=False)  # [N, T_in, 256]

            # Concatenate attention context vector and RNN cell output.
            concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                OutputProjectionWrapper(concat_cell, hp.rnn_depth),
                ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1)),
                ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1))
            ],
                                        state_is_tuple=True)  # [N, T_in, 256]

            output_cell = OutputProjectionWrapper(decoder_cell,
                                                  hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            decoder_outputs, _ = tf.nn.dynamic_rnn(
                cell=output_cell,
                initial_state=decoder_init_state,
                maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]
            with tf.variable_scope('text_logits') as logits_scope:
                txt_logit = tf.contrib.layers.fully_connected(
                    inputs=decoder_outputs,
                    num_outputs=self.config.vocab_size,
                    activation_fn=None,
                    weights_initializer=self.initializer,
                    scope=logits_scope)

        # Image Decoder scope
        with tf.variable_scope('image_decoder') as scope:
            G = Generator('G',
                          is_train=is_training,  # local flag computed in the inference scope above
                          norm='batch',
                          image_size=128)
            fake_img = G(uni_embeddings)

        # Speech Decoder scope
        with tf.variable_scope('speech_decoder') as scope:
            # Attention
            attention_cell = AttentionWrapper(
                GRUCell(hp.attention_depth),
                BahdanauAttention(hp.attention_depth,
                                  uni_embeddings,
                                  memory_sequence_length=input_lengths),
                alignment_history=True,
                output_attention=False)  # [N, T_in, 256]

            # Concatenate attention context vector and RNN cell output.
            concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                OutputProjectionWrapper(concat_cell, hp.rnn_depth),
                ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1)),
                ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1))
            ],
                                        state_is_tuple=True)  # [N, T_in, 256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if is_training:
                helper = TacoTrainingHelper(txt_targets, mel_targets, hp)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            fake_mel = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

        self.txt_targets = txt_targets
        self.txt_lengths = txt_lengths
        self.mel_targets = mel_targets
        self.image_targets = image_targets
        self.txt_logit = txt_logit
        self.fake_mel = fake_mel
        self.fake_img = fake_img