def initialize(
        self,
        inputs,
        input_lengths,
        num_speakers,
        speaker_id,
        mel_targets=None,
        linear_targets=None,
        loss_coeff=None,
        rnn_decoder_test_mode=False,
        is_randomly_initialized=False,
    ):

        is_training2 = linear_targets is not None  # This is also True at test time (linear targets are fed) -- is that intended???
        is_training = not rnn_decoder_test_mode

        self.is_randomly_initialized = is_randomly_initialized

        with tf.variable_scope('inference') as scope:
            hp = self._hparams
            batch_size = tf.shape(inputs)[0]

            # Embeddings(256)
            char_embed_table = tf.get_variable(
                'embedding', [len(symbols), hp.embedding_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))

            zero_pad = True
            if zero_pad:  # Logic borrowed from a transformer implementation.
                # The embedding for <PAD> (index 0) is fixed to zero and is never updated during
                # training; i.e., the first row of the variable created above (the <PAD> row) is
                # effectively unused.
                char_embed_table = tf.concat(
                    (tf.zeros(shape=[1, hp.embedding_size]),
                     char_embed_table[1:, :]), 0)
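                # Effect of the zero-pad trick (illustrative): looking up index 0 (<PAD>) now
                # returns an all-zero vector, and since row 0 is rebuilt from tf.zeros on every
                # forward pass, no gradient ever updates the <PAD> embedding.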

            # [N, T_in, embedding_size]
            char_embedded_inputs = tf.nn.embedding_lookup(
                char_embed_table, inputs)

            self.num_speakers = num_speakers
            if self.num_speakers > 1:
                if hp.speaker_embedding_size != 1:  # speaker_embedding_size = f(16)
                    speaker_embed_table = tf.get_variable(
                        'speaker_embedding',
                        [self.num_speakers, hp.speaker_embedding_size],
                        dtype=tf.float32,
                        initializer=tf.truncated_normal_initializer(
                            stddev=0.5))
                    # [N, T_in, speaker_embedding_size]
                    speaker_embed = tf.nn.embedding_lookup(
                        speaker_embed_table, speaker_id)

                if hp.model_type == 'deepvoice':
                    if hp.speaker_embedding_size == 1:
                        before_highway = get_embed(
                            speaker_id, self.num_speakers,
                            hp.enc_prenet_sizes[-1], "before_highway"
                        )  # 'enc_prenet_sizes': [f(256), f(128)]
                        encoder_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers, hp.enc_rnn_size * 2,
                            "encoder_rnn_init_state")

                        attention_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers,
                            hp.attention_state_size,
                            "attention_rnn_init_state")
                        decoder_rnn_init_states = [
                            get_embed(
                                speaker_id, self.num_speakers, hp.dec_rnn_size,
                                "decoder_rnn_init_states{}".format(idx + 1))
                            for idx in range(hp.dec_layer_num)
                        ]
                    else:
                        deep_dense = lambda x, dim: tf.layers.dense(
                            x, dim, activation=tf.nn.softsign
                        )  # softsign: x / (abs(x) + 1)
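                        # Worked example of softsign: softsign(3.0) = 3.0 / (1 + 3.0) = 0.75 and
                        # softsign(-0.5) = -0.5 / 1.5 = -1/3, so the speaker-conditioned initial
                        # states stay bounded in (-1, 1).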

                        before_highway = deep_dense(speaker_embed,
                                                    hp.enc_prenet_sizes[-1])
                        encoder_rnn_init_state = deep_dense(
                            speaker_embed, hp.enc_rnn_size * 2)

                        attention_rnn_init_state = deep_dense(
                            speaker_embed, hp.attention_state_size)
                        decoder_rnn_init_states = [
                            deep_dense(speaker_embed, hp.dec_rnn_size)
                            for _ in range(hp.dec_layer_num)
                        ]

                    speaker_embed = None  # deepvoice does not use speaker_embed directly
                elif hp.model_type == 'simple':
                    # The 'simple' model feeds speaker_embed into DecoderPrenetWrapper and ConcatOutputAndAttentionWrapper and concatenates it there.
                    before_highway = None
                    encoder_rnn_init_state = None
                    attention_rnn_init_state = None
                    decoder_rnn_init_states = None
                else:
                    raise Exception(
                        " [!] Unkown multi-speaker model type: {}".format(
                            hp.model_type))
            else:
                # Case: self.num_speakers == 1
                speaker_embed = None
                before_highway = None
                encoder_rnn_init_state = None  # init state of the bidirectional GRU
                attention_rnn_init_state = None
                decoder_rnn_init_states = None

            ##############
            # Encoder
            ##############

            # [N, T_in, enc_prenet_sizes[-1]]
            prenet_outputs = prenet(
                char_embedded_inputs,
                is_training,
                hp.enc_prenet_sizes,
                hp.dropout_prob,
                scope='prenet'
            )  # 'enc_prenet_sizes': [f(256), f(128)],  dropout_prob = 0.5
            # ==> (N, T_in, 128)

            # enc_rnn_size = 128
            encoder_outputs = cbhg(
                prenet_outputs,
                input_lengths,
                is_training,
                hp.enc_bank_size,
                hp.enc_bank_channel_size,
                hp.enc_maxpool_width,
                hp.enc_highway_depth,
                hp.enc_rnn_size,
                hp.enc_proj_sizes,
                hp.enc_proj_width,
                scope="encoder_cbhg",
                before_highway=before_highway,
                encoder_rnn_init_state=encoder_rnn_init_state)

            ##############
            # Attention
            ##############

            # For manual control of attention
            self.is_manual_attention = tf.placeholder(
                tf.bool,
                shape=(),
                name='is_manual_attention',
            )
            self.manual_alignments = tf.placeholder(
                tf.float32,
                shape=[None, None, None],
                name="manual_alignments",
            )

            # single: attention_size = 128
            if hp.attention_type == 'bah_mon':
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=False)
            elif hp.attention_type == 'bah_mon_norm':  # added by hccho
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'loc_sen':  # Location Sensitive Attention
                attention_mechanism = LocationSensitiveAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'gmm':  # GMM Attention
                attention_mechanism = GmmAttention(
                    hp.attention_size,
                    memory=encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'bah_mon_norm_hccho':
                attention_mechanism = BahdanauMonotonicAttention_hccho(
                    hp.attention_size, encoder_outputs, normalize=True)
            elif hp.attention_type == 'bah_norm':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'luong_scaled':
                attention_mechanism = LuongAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    scale=True)
            elif hp.attention_type == 'luong':
                attention_mechanism = LuongAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'bah':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            else:
                raise Exception(" [!] Unkown attention type: {}".format(
                    hp.attention_type))

            # Combine a GRUCell with attention_mechanism into an AttentionWrapper (DecoderPrenetWrapper is applied around it below).
            # carpedm20 re-implemented AttentionWrapper from the TensorFlow source, whereas Keith Ito simply used TensorFlow's AttentionWrapper.
            attention_cell = AttentionWrapper(
                GRUCell(hp.attention_state_size),
                attention_mechanism,
                self.is_manual_attention,
                self.manual_alignments,
                initial_cell_state=attention_rnn_init_state,
                alignment_history=True,
                output_attention=False
            )  # Note output_attention=False; attention_layer_size is not set, so attention is the raw context vector.
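            # With output_attention=False the wrapper emits the GRU cell output
            # (size attention_state_size) and keeps the context vector in
            # state.attention; because attention_layer_size is unset, that context
            # vector has the same depth as encoder_outputs.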

            # attention_state_size = 256
            dec_prenet_outputs = DecoderPrenetWrapper(
                attention_cell, speaker_embed, is_training,
                hp.dec_prenet_sizes,
                hp.dropout_prob)  # dec_prenet_sizes =  [f(256), f(128)]

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            # [N, T_in, attention_size+attention_state_size]

            # From the AttentionWrapperState that dec_prenet_outputs passes to the next cell
            # (attention, cell_state, ...), concatenate attention with the cell output and emit that as the new output.
            # Since the output equals the cell_state here, the concat is [ output(=cell_state) | attention ].
            concat_cell = ConcatOutputAndAttentionWrapper(
                dec_prenet_outputs, embed_to_concat=speaker_embed
            )  # Builds a new output as concat(output, attention, speaker_embed).
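            # Size arithmetic (a sketch, using the sizes quoted in the comments above):
            # cell output 256 (attention_state_size) + context vector 256 (encoder depth)
            # = 512, plus speaker_embedding_size more when speaker_embed is concatenated
            # (the 'simple' model).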

            # Decoder (layers specified bottom to top):  dec_rnn_size= 256
            cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)
                     ]  # OutputProjectionWrapper does not seem to be mentioned in the paper...
            for _ in range(hp.dec_layer_num):  # hp.dec_layer_num = 2
                cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size)))

            # [N, T_in, 256]
            decoder_cell = MultiRNNCell(cells, state_is_tuple=True)

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.reduction_factor
            )  # Could this be modified to also emit a stop token, i.e. (hp.num_mels + 1) * hp.reduction_factor???
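            # Reduction factor: each decoder step predicts r frames at once, so the
            # projection width is num_mels * reduction_factor; e.g. num_mels=80 and
            # reduction_factor=5 (assumed hparams) give 80 * 5 = 400 values per step.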
            decoder_init_state = output_cell.zero_state(
                batch_size=batch_size, dtype=tf.float32
            )  # Calling zero_state here also picks up the initial_cell_state already passed to the AttentionWrapper above.

            if hp.model_type == "deepvoice":
                # decoder_init_state[0] : AttentionWrapperState
                # = cell_state + attention + time + alignments + alignment_history
                # decoder_init_state[0][0] = attention_rnn_init_state (already applied via the AttentionWrapper's initial_cell_state)
                decoder_init_state = list(decoder_init_state)

                for idx, cell in enumerate(decoder_rnn_init_states):
                    shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                    shape2 = cell.get_shape().as_list()
                    if shape1 != shape2:
                        raise Exception(
                            " [!] Shape {} and {} should be equal".format(
                                shape1, shape2))
                    decoder_init_state[idx + 1] = cell

                decoder_init_state = tuple(decoder_init_state)

            if is_training2:
                # rnn_decoder_test_mode = True in test mode, False in train mode
                helper = TacoTrainingHelper(
                    inputs, mel_targets, hp.num_mels, hp.reduction_factor,
                    rnn_decoder_test_mode)  # inputs is only used to compute batch_size
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.reduction_factor)

            (decoder_outputs, _), final_decoder_state, _ = \
                tf.contrib.seq2seq.dynamic_decode(
                    BasicDecoder(output_cell, helper, decoder_init_state),
                    maximum_iterations=hp.max_iters)  # max_iters=200

            # [N, T_out, M]
            mel_outputs = tf.reshape(decoder_outputs,
                                     [batch_size, -1, hp.num_mels])
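            # dynamic_decode returns [N, T_out/r, num_mels*r]; the reshape above
            # unfolds the r frames predicted per step back into [N, T_out, num_mels].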

            # Add post-processing CBHG:
            # [N, T_out, 256]
            #post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
            post_outputs = cbhg(mel_outputs,
                                None,
                                is_training,
                                hp.post_bank_size,
                                hp.post_bank_channel_size,
                                hp.post_maxpool_width,
                                hp.post_highway_depth,
                                hp.post_rnn_size,
                                hp.post_proj_sizes,
                                hp.post_proj_width,
                                scope='post_cbhg')

            if speaker_embed is not None and hp.model_type == 'simple':
                expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
                tiled_speaker_embedding = tf.tile(
                    expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])

                # [N, T_out, 256 + alpha]
                post_outputs = tf.concat(
                    [tiled_speaker_embedding, post_outputs], axis=-1)

            linear_outputs = tf.layers.dense(
                post_outputs, hp.num_freq)  # [N, T_out, F(1025)]

            # Grab alignments from the final decoder state:
            # The MultiRNNCell has 3 layers, so final_decoder_state is a tuple of length 3 ==> final_decoder_state[0] is the AttentionWrapperState.
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(),
                [1, 2, 0
                 ])  # batch_size, text length(encoder), target length(decoder)
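            # alignment_history.stack() has shape [decoder_steps, N, encoder_steps];
            # the [1, 2, 0] transpose gives [N, encoder_steps, decoder_steps], the
            # layout used for plotting attention alignments.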

            self.inputs = inputs
            self.speaker_id = speaker_id
            self.input_lengths = input_lengths
            self.loss_coeff = loss_coeff
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.final_decoder_state = final_decoder_state

            log('=' * 40)
            log(' model_type: %s' % hp.model_type)
            log('=' * 40)

            log('Initialized Tacotron model. Dimensions: ')
            log('    embedding:                %d' %
                char_embedded_inputs.shape[-1])
            if speaker_embed is not None:
                log('    speaker embedding:        %d' %
                    speaker_embed.shape[-1])
            else:
                log('    speaker embedding:        None')
            log('    prenet out:               %d' % prenet_outputs.shape[-1])
            log('    encoder out:              %d' % encoder_outputs.shape[-1])
            log('    attention out:            %d' %
                attention_cell.output_size)
            log('    concat attn & out:        %d' % concat_cell.output_size)
            log('    decoder cell out:         %d' % decoder_cell.output_size)
            log('    decoder out (%d frames):  %d' %
                (hp.reduction_factor, decoder_outputs.shape[-1]))
            log('    decoder out (1 frame):    %d' % mel_outputs.shape[-1])
            log('    postnet out:              %d' % post_outputs.shape[-1])
            log('    linear out:               %d' % linear_outputs.shape[-1])
Example #2
    def initialize(self,
                   inputs,
                   input_lengths,
                   num_speakers,
                   speaker_id,
                   mel_targets=None,
                   linear_targets=None,
                   loss_coeff=None,
                   rnn_decoder_test_mode=False,
                   is_randomly_initialized=False):

        is_training = linear_targets is not None
        self.is_randomly_initialized = is_randomly_initialized

        with tf.variable_scope('inference') as scope:
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            self.num_speakers = num_speakers
            if self.num_speakers > 1:
                if hp.speaker_embedding_size != 1:
                    speaker_embed_table = tf.get_variable(
                        'speaker_embedding',
                        [self.num_speakers, hp.speaker_embedding_size],
                        dtype=tf.float32,
                        initializer=tf.truncated_normal_initializer(
                            stddev=0.5))
                    # [N, T_in, speaker_embedding_size]
                    speaker_embed = tf.nn.embedding_lookup(
                        speaker_embed_table, speaker_id)

                if hp.model_type == 'deepvoice':
                    if hp.speaker_embedding_size == 1:
                        before_highway = get_embed(speaker_id,
                                                   self.num_speakers,
                                                   hp.enc_prenet_sizes[-1],
                                                   "before_highway")
                        encoder_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers, hp.enc_rnn_size * 2,
                            "encoder_rnn_init_state")

                        attention_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers,
                            hp.attention_state_size,
                            "attention_rnn_init_state")
                        decoder_rnn_init_states = [get_embed(
                                speaker_id, self.num_speakers,
                                hp.dec_rnn_size, "decoder_rnn_init_states{}".format(idx + 1)) \
                                        for idx in range(hp.dec_layer_num)]
                    else:
                        deep_dense = lambda x, dim: \
                                tf.layers.dense(x, dim, activation=tf.nn.softsign)

                        before_highway = deep_dense(speaker_embed,
                                                    hp.enc_prenet_sizes[-1])
                        encoder_rnn_init_state = deep_dense(
                            speaker_embed, hp.enc_rnn_size * 2)

                        attention_rnn_init_state = deep_dense(
                            speaker_embed, hp.attention_state_size)
                        decoder_rnn_init_states = [
                            deep_dense(speaker_embed, hp.dec_rnn_size)
                            for _ in range(hp.dec_layer_num)
                        ]

                    speaker_embed = None  # deepvoice does not use speaker_embed directly

                elif hp.model_type == 'simple':
                    before_highway = None
                    encoder_rnn_init_state = None
                    attention_rnn_init_state = None
                    decoder_rnn_init_states = None

                else:
                    raise Exception(
                        " [!] Unknown multi-speaker model type: {}".format(
                            hp.model_type))

            else:
                speaker_embed = None
                before_highway = None
                encoder_rnn_init_state = None
                attention_rnn_init_state = None
                decoder_rnn_init_states = None

            # Encoder
            prenet_outputs = prenet(
                embedded_inputs,
                is_training,
                hp.enc_prenet_sizes,
                hp.dropout_prob,
                scope='prenet')  # [N, T_in, prenet_depths[-1]=128]
            encoder_outputs = cbhg(
                prenet_outputs,
                input_lengths,
                is_training,  # [N, T_in, encoder_depth=256]
                hp.enc_bank_size,
                hp.enc_bank_channel_size,
                hp.enc_maxpool_width,
                hp.enc_highway_depth,
                hp.enc_rnn_size,
                hp.enc_proj_sizes,
                hp.enc_proj_width,
                scope="encoder_cbhg",
                before_highway=before_highway,
                encoder_rnn_init_state=encoder_rnn_init_state)

            # Attention
            # For manual control of attention
            self.is_manual_attention = tf.placeholder(
                tf.bool,
                shape=(),
                name='is_manual_attention',
            )
            self.manual_alignments = tf.placeholder(
                tf.float32,
                shape=[None, None, None],
                name="manual_alignments",
            )

            dec_prenet_outputs = DecoderPrenetWrapper(
                GRUCell(hp.attention_state_size), speaker_embed, is_training,
                hp.dec_prenet_sizes, hp.dropout_prob)

            if hp.attention_type == 'bah_mon':
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size, encoder_outputs)
            elif hp.attention_type == 'bah_norm':
                attention_mechanism = BahdanauAttention(hp.attention_size,
                                                        encoder_outputs,
                                                        normalize=True)
            elif hp.attention_type == 'luong_scaled':
                attention_mechanism = LuongAttention(hp.attention_size,
                                                     encoder_outputs,
                                                     scale=True)
            elif hp.attention_type == 'luong':
                attention_mechanism = LuongAttention(hp.attention_size,
                                                     encoder_outputs)
            elif hp.attention_type == 'bah':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size, encoder_outputs)
            elif hp.attention_type.startswith('ntm2'):
                shift_width = int(hp.attention_type.split('-')[-1])
                attention_mechanism = NTMAttention2(hp.attention_size,
                                                    encoder_outputs,
                                                    shift_width=shift_width)
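                # A hedged illustration: an attention_type string such as 'ntm2-3'
                # (hypothetical value) would yield shift_width = 3 here.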
            else:
                raise Exception(" [!] Unkown attention type: {}".format(
                    hp.attention_type))

            attention_cell = AttentionWrapper(
                dec_prenet_outputs,
                attention_mechanism,
                self.is_manual_attention,
                self.manual_alignments,
                initial_cell_state=attention_rnn_init_state,
                alignment_history=True,
                output_attention=False)

            # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
            # [N, T_in, attention_size+attention_state_size]
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell, embed_to_concat=speaker_embed)

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell(
                [
                    OutputProjectionWrapper(concat_cell, hp.dec_rnn_size),
                    ResidualWrapper(GRUCell(hp.dec_rnn_size)),
                    ResidualWrapper(GRUCell(hp.dec_rnn_size)),
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.reduction_factor)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if hp.model_type == "deepvoice":
                # decoder_init_state[0] : AttentionWrapperState
                # = cell_state + attention + time + alignments + alignment_history
                # decoder_init_state[0][0] = attention_rnn_init_state (already applied)
                decoder_init_state = list(decoder_init_state)

                for idx, cell in enumerate(decoder_rnn_init_states):
                    shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                    shape2 = cell.get_shape().as_list()
                    if shape1 != shape2:
                        raise Exception(" [!] Shape {} and {} should be equal". \
                                format(shape1, shape2))
                    decoder_init_state[idx + 1] = cell

                decoder_init_state = tuple(decoder_init_state)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.reduction_factor,
                                            rnn_decoder_test_mode)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.reduction_factor)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            # [N, T_out, postnet_depth=256]
            post_outputs = cbhg(mel_outputs,
                                None,
                                is_training,
                                hp.post_bank_size,
                                hp.post_bank_channel_size,
                                hp.post_maxpool_width,
                                hp.post_highway_depth,
                                hp.post_rnn_size,
                                hp.post_proj_sizes,
                                hp.post_proj_width,
                                scope='post_cbhg')

            if speaker_embed is not None and hp.model_type == 'simple':
                expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
                tiled_speaker_embedding = tf.tile(
                    expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])

                # [N, T_out, 256 + alpha]
                post_outputs = tf.concat(
                    [tiled_speaker_embedding, post_outputs], axis=-1)

            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.speaker_id = speaker_id
            self.input_lengths = input_lengths
            self.loss_coeff = loss_coeff
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.final_decoder_state = final_decoder_state

            log('=' * 40)
            log(' model_type: %s' % hp.model_type)
            log('=' * 40)

            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            if speaker_embed is not None:
                log('    speaker embedding:        %d' %
                    speaker_embed.shape[-1])
            else:
                log('    speaker embedding:        None')
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.reduction_factor, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
Example #3
    def initialize(self,
                   inputs,
                   input_lengths,
                   num_speakers,
                   speaker_id=None,
                   mel_targets=None,
                   linear_targets=None,
                   is_training=False,
                   loss_coeff=None,
                   stop_token_targets=None):

        with tf.variable_scope('Eembedding') as scope:
            hp = self._hparams
            batch_size = tf.shape(inputs)[0]

            # Embeddings(256)
            char_embed_table = tf.get_variable(
                'inputs_embedding', [len(symbols), hp.embedding_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))

            zero_pad = True
            if zero_pad:  # Logic borrowed from a transformer implementation.
                # The embedding for <PAD> (index 0) is fixed to zero and is never updated during
                # training; i.e., the first row of the variable created above (the <PAD> row) is
                # effectively unused.
                char_embed_table = tf.concat(
                    (tf.zeros(shape=[1, hp.embedding_size]),
                     char_embed_table[1:, :]), 0)

            # [N, T_in, embedding_size]
            char_embedded_inputs = tf.nn.embedding_lookup(
                char_embed_table, inputs)

            self.num_speakers = num_speakers
            if self.num_speakers > 1:
                speaker_embed_table = tf.get_variable(
                    'speaker_embedding',
                    [self.num_speakers, hp.speaker_embedding_size],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
                # [N, T_in, speaker_embedding_size]
                speaker_embed = tf.nn.embedding_lookup(speaker_embed_table,
                                                       speaker_id)

                deep_dense = lambda x, dim, name: tf.layers.dense(
                    x, dim, activation=tf.nn.softsign, name=name
                )  # softsign: x / (abs(x) + 1)

                encoder_rnn_init_state = deep_dense(
                    speaker_embed, hp.encoder_lstm_units * 4,
                    'encoder_init_dense')  # hp.encoder_lstm_units = 256

                decoder_rnn_init_states = [
                    deep_dense(speaker_embed, hp.decoder_lstm_units * 2,
                               'decoder_init_dense_{}'.format(i))
                    for i in range(hp.decoder_layers)
                ]  # hp.decoder_lstm_units = 1024

                speaker_embed = None
            else:
                # Case: self.num_speakers == 1
                speaker_embed = None
                encoder_rnn_init_state = None  # init state of the bidirectional encoder RNN
                attention_rnn_init_state = None
                decoder_rnn_init_states = None

        with tf.variable_scope('Encoder') as scope:
            ##############
            # Encoder
            ##############
            x = char_embedded_inputs
            for i in range(hp.enc_conv_num_layers):
                x = tf.layers.conv1d(x,
                                     filters=hp.enc_conv_channels,
                                     kernel_size=hp.enc_conv_kernel_size,
                                     padding='same',
                                     activation=tf.nn.relu,
                                     name='Encoder_{}'.format(i))
                x = tf.layers.batch_normalization(x, training=is_training)
                x = tf.layers.dropout(x,
                                      rate=hp.dropout_prob,
                                      training=is_training,
                                      name='dropout_{}'.format(i))

            if encoder_rnn_init_state is not None:
                initial_state_fw_c, initial_state_fw_h, initial_state_bw_c, initial_state_bw_h = tf.split(
                    encoder_rnn_init_state, 4, 1)
                initial_state_fw = LSTMStateTuple(initial_state_fw_c,
                                                  initial_state_fw_h)
                initial_state_bw = LSTMStateTuple(initial_state_bw_c,
                                                  initial_state_bw_h)
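                # encoder_rnn_init_state was projected to 4 * encoder_lstm_units,
                # so the split yields four [N, encoder_lstm_units] tensors: the
                # forward (c, h) and backward (c, h) initial LSTM states.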
            else:  # single mode
                initial_state_fw, initial_state_bw = None, None

            cell_fw = ZoneoutLSTMCell(
                hp.encoder_lstm_units,
                is_training,
                zoneout_factor_cell=hp.tacotron_zoneout_rate,
                zoneout_factor_output=hp.tacotron_zoneout_rate,
                name='encoder_fw_LSTM')
            cell_bw = ZoneoutLSTMCell(
                hp.encoder_lstm_units,
                is_training,
                zoneout_factor_cell=hp.tacotron_zoneout_rate,
                zoneout_factor_output=hp.tacotron_zoneout_rate,
                name='encoder_bw_LSTM')
            encoder_conv_output = x
            outputs, states = tf.nn.bidirectional_dynamic_rnn(
                cell_fw,
                cell_bw,
                encoder_conv_output,
                sequence_length=input_lengths,
                initial_state_fw=initial_state_fw,
                initial_state_bw=initial_state_bw,
                dtype=tf.float32)

            # encoder_outputs = [N, T, 2*encoder_lstm_units] = [N, T, 512]
            encoder_outputs = tf.concat(
                outputs,
                axis=2)  # Concat and return forward + backward outputs

        with tf.variable_scope('Decoder') as scope:

            ##############
            # Attention
            ##############
            if hp.attention_type == 'bah_mon':
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=False)
            elif hp.attention_type == 'bah_mon_norm':  # added by hccho
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'loc_sen':  # Location Sensitive Attention
                attention_mechanism = LocationSensitiveAttention(
                    hp.attention_size,
                    encoder_outputs,
                    hparams=hp,
                    is_training=is_training,
                    mask_encoder=hp.mask_encoder,
                    memory_sequence_length=input_lengths,
                    smoothing=hp.smoothing,
                    cumulate_weights=hp.cumulative_weights)
            elif hp.attention_type == 'gmm':  # GMM Attention
                attention_mechanism = GmmAttention(
                    hp.attention_size,
                    memory=encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'bah_norm':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'luong_scaled':
                attention_mechanism = LuongAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    scale=True)
            elif hp.attention_type == 'luong':
                attention_mechanism = LuongAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'bah':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            else:
                raise Exception(" [!] Unkown attention type: {}".format(
                    hp.attention_type))

            decoder_lstm = [
                ZoneoutLSTMCell(hp.decoder_lstm_units,
                                is_training,
                                zoneout_factor_cell=hp.tacotron_zoneout_rate,
                                zoneout_factor_output=hp.tacotron_zoneout_rate,
                                name='decoder_LSTM_{}'.format(i + 1))
                for i in range(hp.decoder_layers)
            ]

            decoder_lstm = tf.contrib.rnn.MultiRNNCell(decoder_lstm,
                                                       state_is_tuple=True)
            decoder_init_state = decoder_lstm.zero_state(
                batch_size=batch_size, dtype=tf.float32
            )  # zero_state of the stacked decoder LSTM; overwritten below with speaker-conditioned states in the multi-speaker case.

            if hp.model_type == "multi-speaker":

                decoder_init_state = list(decoder_init_state)

                for idx, cell in enumerate(decoder_rnn_init_states):
                    shape1 = decoder_init_state[idx][0].get_shape().as_list()
                    shape2 = cell.get_shape().as_list()
                    if shape1[1] * 2 != shape2[1]:
                        raise Exception(
                            " [!] Shape {} and {} should be equal".format(
                                shape1, shape2))
                    c, h = tf.split(cell, 2, 1)
                    decoder_init_state[idx] = LSTMStateTuple(c, h)

                decoder_init_state = tuple(decoder_init_state)
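                # Each speaker-conditioned init vector has 2 * decoder_lstm_units
                # values; splitting it in half gives the (c, h) LSTMStateTuple for
                # one ZoneoutLSTMCell layer of the decoder.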

            attention_cell = AttentionWrapper(
                decoder_lstm,
                attention_mechanism,
                initial_cell_state=decoder_init_state,
                alignment_history=True,
                output_attention=False
            )  # Note output_attention=False; attention_layer_size is not set, so attention is the raw context vector.

            # attention_state_size = 256
            # Decoder input -> prenet -> decoder_lstm -> concat[output, attention]
            dec_prenet_outputs = DecoderWrapper(attention_cell, is_training,
                                                hp.dec_prenet_sizes,
                                                hp.dropout_prob,
                                                hp.inference_prenet_dropout)

            dec_outputs_cell = OutputProjectionWrapper(
                dec_prenet_outputs, (hp.num_mels + 1) * hp.reduction_factor)

            if is_training:
                helper = TacoTrainingHelper(
                    mel_targets, hp.num_mels,
                    hp.reduction_factor)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.reduction_factor)

            decoder_init_state = dec_outputs_cell.zero_state(
                batch_size=batch_size, dtype=tf.float32)
            (decoder_outputs, _), final_decoder_state, _ = \
                tf.contrib.seq2seq.dynamic_decode(
                    BasicDecoder(dec_outputs_cell, helper, decoder_init_state),
                    maximum_iterations=int(hp.max_n_frame / hp.reduction_factor))

            decoder_mel_outputs = tf.reshape(
                decoder_outputs[:, :, :hp.num_mels * hp.reduction_factor],
                [batch_size, -1, hp.num_mels
                 ])  # [N,iters,400] -> [N,5*iters,80]
            stop_token_outputs = tf.reshape(
                decoder_outputs[:, :, hp.num_mels * hp.reduction_factor:],
                [batch_size, -1])  # [N, iters * reduction_factor]
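            # Size check (a sketch, assuming num_mels=80 and reduction_factor=5 as in
            # the comment above): each decoder step emits (80 + 1) * 5 = 405 values --
            # 400 mel values plus 5 stop-token logits, one per frame.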

            # Postnet
            x = decoder_mel_outputs
            for i in range(hp.postnet_num_layers):
                activation = tf.nn.tanh if i != (hp.postnet_num_layers -
                                                 1) else None
                x = tf.layers.conv1d(x,
                                     filters=hp.postnet_channels,
                                     kernel_size=hp.postnet_kernel_size,
                                     padding='same',
                                     activation=activation,
                                     name='Postnet_{}'.format(i))
                x = tf.layers.batch_normalization(x, training=is_training)
                x = tf.layers.dropout(x,
                                      rate=hp.dropout_prob,
                                      training=is_training,
                                      name='Postnet_dropout_{}'.format(i))

            residual = tf.layers.dense(x,
                                       hp.num_mels,
                                       name='residual_projection')
            mel_outputs = decoder_mel_outputs + residual
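            # Tacotron 2-style postnet: the conv stack predicts a residual that is
            # added to the coarse decoder mels to refine the final mel spectrogram.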

            # Add post-processing CBHG:
            # mel_outputs: (N,T,num_mels)
            post_outputs = cbhg(mel_outputs,
                                None,
                                is_training,
                                hp.post_bank_size,
                                hp.post_bank_channel_size,
                                hp.post_maxpool_width,
                                hp.post_highway_depth,
                                hp.post_rnn_size,
                                hp.post_proj_sizes,
                                hp.post_proj_width,
                                scope='post_cbhg')

            linear_outputs = tf.layers.dense(
                post_outputs, hp.num_freq,
                name='linear_spectogram_projection')  # [N, T_out, F(1025)]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(),
                [1, 2, 0
                 ])  # batch_size, text length(encoder), target length(decoder)

            self.inputs = inputs
            self.speaker_id = speaker_id
            self.input_lengths = input_lengths
            self.loss_coeff = loss_coeff
            self.decoder_mel_outputs = decoder_mel_outputs
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.final_decoder_state = final_decoder_state
            self.stop_token_targets = stop_token_targets
            self.stop_token_outputs = stop_token_outputs
            self.all_vars = tf.trainable_variables()
            log('=' * 40)
            log(' model_type: %s' % hp.model_type)
            log('=' * 40)

            log('Initialized Tacotron model. Dimensions: ')
            log('    embedding:                %d' %
                char_embedded_inputs.shape[-1])
            log('    encoder conv out:               %d' %
                encoder_conv_output.shape[-1])
            log('    encoder out:              %d' % encoder_outputs.shape[-1])
            log('    attention out:            %d' %
                attention_cell.output_size)
            log('    decoder prenet lstm concat out :        %d' %
                dec_prenet_outputs.output_size)
            log('    decoder cell out:         %d' %
                dec_outputs_cell.output_size)
            log('    decoder out (%d frames):  %d' %
                (hp.reduction_factor, decoder_outputs.shape[-1]))
            log('    decoder mel out:    %d' % decoder_mel_outputs.shape[-1])
            log('    mel out:    %d' % mel_outputs.shape[-1])
            log('    postnet out:              %d' % post_outputs.shape[-1])
            log('    linear out:               %d' % linear_outputs.shape[-1])
            log('  Tacotron Parameters       {:.3f} Million.'.format(
                np.sum(
                    [np.prod(v.get_shape().as_list())
                     for v in self.all_vars]) / 1000000))
Example #4
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   stop_token_targets=None):
        '''Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
        '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))

            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

        with tf.variable_scope('Encoder') as scope:

            x = embedded_inputs

            #3 Conv Layers
            for i in range(3):
                x = tf.layers.conv1d(x,
                                     filters=512,
                                     kernel_size=5,
                                     padding='same',
                                     activation=tf.nn.relu,
                                     name='Encoder_{}'.format(i))
                x = tf.layers.batch_normalization(x, training=is_training)
                x = tf.layers.dropout(x,
                                      rate=0.5,
                                      training=is_training,
                                      name='dropout_{}'.format(i))
            encoder_conv_output = x

            #bi-directional LSTM
            cell_fw = ZoneoutLSTMCell(256,
                                      is_training,
                                      zoneout_factor_cell=0.1,
                                      zoneout_factor_output=0.1,
                                      name='encoder_fw_LSTM')
            cell_bw = ZoneoutLSTMCell(256,
                                      is_training,
                                      zoneout_factor_cell=0.1,
                                      zoneout_factor_output=0.1,
                                      name='encoder_bw_LSTM')

            outputs, states = tf.nn.bidirectional_dynamic_rnn(
                cell_fw,
                cell_bw,
                encoder_conv_output,
                sequence_length=input_lengths,
                dtype=tf.float32)

            # encoder_outputs = [N, T, 2*encoder_lstm_units] = [N, T, 512]
            encoder_outputs = tf.concat(
                outputs,
                axis=2)  # Concat and return forward + backward outputs

        with tf.variable_scope('Decoder') as scope:

            if hp.attention_type == 'loc_sen':  # Location Sensitive Attention
                attention_mechanism = LocationSensitiveAttention(
                    128,
                    encoder_outputs,
                    hparams=hp,
                    is_training=is_training,
                    mask_encoder=True,
                    memory_sequence_length=input_lengths,
                    smoothing=False,
                    cumulate_weights=True)
            elif hp.attention_type == 'gmm':  # GMM Attention
                attention_mechanism = GmmAttention(
                    128,
                    memory=encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'step_bah':
                attention_mechanism = BahdanauStepwiseMonotonicAttention(
                    128,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    mode="parallel")
            elif hp.attention_type == 'mon_bah':
                attention_mechanism = BahdanauMonotonicAttention(
                    128,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'loung':
                attention_mechanism = LuongAttention(
                    128, encoder_outputs, memory_sequence_length=input_lengths)
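            # Note: this chain has no final else, so an unrecognized hp.attention_type
            # leaves attention_mechanism undefined and fails later at the
            # AttentionWrapper call.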

            # attention_mechanism = LocationSensitiveAttention(128, encoder_outputs, hparams=hp, is_training=is_training, mask_encoder=True, memory_sequence_length = input_lengths, smoothing=False, cumulate_weights=True)
            #mask_encoder: whether to mask encoder padding while computing location sensitive attention. Set to True for better prosody but slower convergence.
            #cumulate_weights: Whether to cumulate (sum) all previous attention weights or simply feed previous weights (Recommended: True)

            decoder_lstm = [
                ZoneoutLSTMCell(1024,
                                is_training,
                                zoneout_factor_cell=0.1,
                                zoneout_factor_output=0.1,
                                name='decoder_LSTM_{}'.format(i + 1))
                for i in range(2)
            ]

            decoder_lstm = tf.contrib.rnn.MultiRNNCell(decoder_lstm,
                                                       state_is_tuple=True)
            # decoder_init_state = decoder_lstm.zero_state(batch_size=batch_size, dtype=tf.float32)  # not present in the tensorflow1 version

            attention_cell = AttentionWrapper(decoder_lstm,
                                              attention_mechanism,
                                              alignment_history=True,
                                              output_attention=False)

            # attention_state_size = 256
            # Decoder input -> prenet -> decoder_lstm -> concat[output, attention]
            dec_outputs = DecoderPrenetWrapper(attention_cell, is_training,
                                               hp.prenet_depths)
            dec_outputs_cell = OutputProjectionWrapper(
                dec_outputs, (hp.num_mels) * hp.outputs_per_step)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.outputs_per_step)

            decoder_init_state = dec_outputs_cell.zero_state(
                batch_size=batch_size, dtype=tf.float32)
            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(dec_outputs_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            decoder_mel_outputs = tf.reshape(
                decoder_outputs[:, :, :hp.num_mels * hp.outputs_per_step],
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]
            #stop_token_outputs = tf.reshape(decoder_outputs[:,:,hp.num_mels * hp.outputs_per_step:], [batch_size, -1]) # [N,iters]

            # Postnet
            x = decoder_mel_outputs
            for i in range(5):
                activation = tf.nn.tanh if i != (4) else None
                x = tf.layers.conv1d(x,
                                     filters=512,
                                     kernel_size=5,
                                     padding='same',
                                     activation=activation,
                                     name='Postnet_{}'.format(i))
                x = tf.layers.batch_normalization(x, training=is_training)
                x = tf.layers.dropout(x,
                                      rate=0.5,
                                      training=is_training,
                                      name='Postnet_dropout_{}'.format(i))

            residual = tf.layers.dense(x,
                                       hp.num_mels,
                                       name='residual_projection')
            mel_outputs = decoder_mel_outputs + residual

            # Add post-processing CBHG:
            # mel_outputs: (N,T,num_mels)
            post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training,
                                     hp.postnet_depth)
            linear_outputs = tf.layers.dense(
                post_outputs, hp.num_freq)  # [N, T_out, F(1025)]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(),
                [1, 2, 0
                 ])  # batch_size, text length(encoder), target length(decoder)

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_mel_outputs = decoder_mel_outputs
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            #self.stop_token_targets = stop_token_targets
            #self.stop_token_outputs = stop_token_outputs
            self.all_vars = tf.trainable_variables()
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            # log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            #log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % dec_outputs_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
Example #5
    def Tensor_Generate(self):
        placeholder_Dict = self.pattern_Feeder.placeholder_Dict

        with tf.variable_scope('encoder') as scope:
            batch_Size = tf.shape(placeholder_Dict["Token"])[0]

            token_Embedding = tf.get_variable(
                name="token_Embedding",
                shape=(encoder_Parameters.number_of_Token,
                       encoder_Parameters.token_Embedding_Size),
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))

            embedded_Input_Pattern = tf.nn.embedding_lookup(
                token_Embedding, placeholder_Dict["Token"])
            #Shape: [batch_Size, token_Length, embedded_Pattern_Size];

            encoder_Activation = Encoder(
                input_Pattern=embedded_Input_Pattern,
                input_Length=placeholder_Dict["Token_Length"],
                is_Training=placeholder_Dict["Is_Training"],
                scope="encoder_Module")

        with tf.variable_scope('attention') as scope:
            attention_Mechanism = BahdanauMonotonicAttention(
                num_units=attention_Parameters.attention_Size,
                memory=encoder_Activation,
                normalize=True,
                name="bahdanau_Monotonic_Attention")

        with tf.variable_scope('decoder') as scope:
            linear_Projection_Activation, stop_Token, alignment_History = Decoder(
                batch_Size=batch_Size,
                attention_Mechanism=attention_Mechanism,
                is_Training=placeholder_Dict["Is_Training"],
                target_Pattern=placeholder_Dict["Mel_Spectrogram"],
                scope="decoder_Module")
            post_Net_Activation = PostNet(
                input_Pattern=linear_Projection_Activation,
                conv_Filter_Count_and_Kernal_Size_List=[
                    (decoder_Parameters.post_Net_Conv_Filter_Count,
                     decoder_Parameters.post_Net_Conv_Kernal_Size)
                ] * decoder_Parameters.post_Net_Conv_Layer_Count,
                is_Training=placeholder_Dict["Is_Training"],
                scope="post_Net")

            mel_Spectrogram_Activation = linear_Projection_Activation + post_Net_Activation

            # Tacotron 2 uses WaveNet here, but this implementation falls back to Tacotron 1's CBHG post-processing.
            post_CBHG_Activation = CBHG(
                input_Pattern=mel_Spectrogram_Activation,
                input_Length=None,
                scope="post_CBHG",
                is_Training=placeholder_Dict["Is_Training"],
                conv_Bank_Filter_Count=256,
                conv_Bank_Max_Kernal_Size=8,
                max_Pooling_Size=2,
                conv_Projection_Filter_Count_and_Kernal_Size_List=[(256, 3),
                                                                   (80, 3)],
                highway_Layer_Count=4,
                gru_Cell_Size=128)

            spectrogram_Activation = tf.layers.dense(
                post_CBHG_Activation,
                pattern_Parameters.spectrogram_Dimension,
                name="spectrogram")

        with tf.variable_scope('training_Loss') as scope:
            #Mel-spectrogram loss
            mel_Loss1 = tf.reduce_mean(
                tf.pow(
                    placeholder_Dict["Mel_Spectrogram"] -
                    linear_Projection_Activation, 2))
            mel_Loss2 = tf.reduce_mean(
                tf.pow(
                    placeholder_Dict["Mel_Spectrogram"] -
                    mel_Spectrogram_Activation, 2))
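            # Note: mel_Loss1 supervises the pre-PostNet projection and mel_Loss2 the refined
            # output (projection + PostNet residual), so both stages get a direct gradient.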

            #Stop token loss
            tiled_Range = tf.cast(
                tf.tile(tf.expand_dims(tf.range(tf.shape(stop_Token)[1]),
                                       axis=0),
                        multiples=[batch_Size, 1]), tf.float32)
            tiled_Spectrogram_Length = tf.cast(
                tf.tile(tf.expand_dims(
                    placeholder_Dict["Mel_Spectrogram_Length"] - 1, axis=1),
                        multiples=[1, tf.shape(stop_Token)[1]]), tf.float32)
            stop_Target = tf.clip_by_value(tf.sign(tiled_Range -
                                                   tiled_Spectrogram_Length),
                                           clip_value_min=0,
                                           clip_value_max=1)
            stop_Token_Loss = tf.reduce_mean(
                tf.pow(stop_Target - stop_Token, 2))
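            # Worked example (hypothetical lengths): with Mel_Spectrogram_Length = 4 and a
            # 6-step decoder output, tiled_Range = [0,1,2,3,4,5] and the last valid index is 3,
            # so sign(range - 3) = [-1,-1,-1,0,1,1] and the clipped stop_Target is
            # [0,0,0,0,1,1]: 1 only for frames past the final valid frame.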

            # Linear-spectrogram loss; only needed for the Tacotron 1-style CBHG path above.
            l1 = tf.abs(placeholder_Dict["Spectrogram"] -
                        spectrogram_Activation)

            if training_Loss_Parameters.priority_Frequencies is None:
                linear_Loss = tf.reduce_mean(l1)
            else:
                lower_Priority_Frequency_Cut, upper_Priority_Frequency_Cut = training_Loss_Parameters.priority_Frequencies
                lower_Priority_Frequency = int(
                    lower_Priority_Frequency_Cut /
                    (sound_Parameters.sample_Rate * 0.5) *
                    sound_Parameters.spectrogram_Dimension)
                upper_Priority_Frequency = int(
                    upper_Priority_Frequency_Cut /
                    (sound_Parameters.sample_Rate * 0.5) *
                    sound_Parameters.spectrogram_Dimension)
                l1_Priority = l1[:, :, lower_Priority_Frequency:
                                 upper_Priority_Frequency]
                linear_Loss = 0.5 * tf.reduce_mean(l1) + 0.5 * tf.reduce_mean(
                    l1_Priority)
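            # Worked example (assumed values): with sample_Rate = 22050, spectrogram_Dimension
            # = 1025 and priority_Frequencies = (500, 2000), the cut bins are
            # int(500 / 11025 * 1025) = 46 and int(2000 / 11025 * 1025) = 185, so l1_Priority
            # weights the roughly 0.5-2 kHz band more heavily.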

            loss = mel_Loss1 + mel_Loss2 + stop_Token_Loss + linear_Loss

            #Optimize
            global_Step = tf.Variable(0, name='global_Step', trainable=False)

            if training_Loss_Parameters.decay_Type.lower() == "noam":
                step = tf.cast(global_Step + 1, dtype=tf.float32)
                warmup_Steps = 4000.0
                learning_Rate = training_Loss_Parameters.initial_Learning_Rate * warmup_Steps**0.5 * tf.minimum(
                    step * warmup_Steps**-1.5, step**-0.5)
            elif training_Loss_Parameters.decay_Type.lower() == "exponential":
                learning_Rate = training_Loss_Parameters.initial_Learning_Rate * tf.train.exponential_decay(
                    1., global_Step, 3000, 0.95)
            elif training_Loss_Parameters.decay_Type.lower() == "static":
                learning_Rate = tf.convert_to_tensor(
                    training_Loss_Parameters.initial_Learning_Rate,
                    dtype=tf.float32)
            else:
                raise Exception("Unsupported learning rate decay type")
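            # For reference, the 'noam' branch is the Transformer schedule
            # lr = initial_Learning_Rate * warmup^0.5 * min(step * warmup^-1.5, step^-0.5):
            # a linear warm-up that peaks at initial_Learning_Rate when step == warmup_Steps
            # (4000), followed by a 1/sqrt(step) decay.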

            optimizer = tf.train.AdamOptimizer(learning_Rate)
            # 'optimizer.compute_gradients' returns a list of (gradient, variable) tuples.
            # zip(*...) unpacks it into two separate tuples: (gradient1, gradient2, ...) and (variable1, variable2, ...).
            gradients, variables = zip(*optimizer.compute_gradients(loss))
            clipped_Gradients, global_Norm = tf.clip_by_global_norm(
                gradients, 1.0)
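            # tf.clip_by_global_norm rescales every gradient by clip_norm / max(global_Norm,
            # clip_norm); e.g. with clip_norm = 1.0 and a global norm of 5.0 each gradient is
            # scaled by 0.2, while gradients with global norm <= 1.0 pass through unchanged.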

            # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See:
            # https://github.com/tensorflow/tensorflow/issues/1122
            # https://www.tensorflow.org/api_docs/python/tf/layers/batch_normalization
            with tf.control_dependencies(
                    tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                optimize = optimizer.apply_gradients(zip(
                    clipped_Gradients, variables),
                                                     global_step=global_Step)

        with tf.variable_scope('test_Inference') as scope:
            inverted_Signal = inv_spectrogram_tensorflow(
                spectrogram=spectrogram_Activation,
                num_freq=pattern_Parameters.spectrogram_Dimension,
                frame_shift_ms=sound_Parameters.frame_Shift,
                frame_length_ms=sound_Parameters.frame_Length,
                sample_rate=sound_Parameters.sample_Rate)

            alignment = tf.transpose(alignment_History, [1, 0, 2])
            #Shape: (batch_Size, max_Token, (max_Spectrogram / output_Size_per_Step))
            transposed_Spectrogram = tf.transpose(spectrogram_Activation,
                                                  [0, 2, 1])
            transposed_Mel_Spectrogram = tf.transpose(
                mel_Spectrogram_Activation, [0, 2, 1])

        self.training_Tensor_List = [
            global_Step, learning_Rate, loss, optimize
        ]
        self.test_Tensor_List = [
            global_Step, learning_Rate, inverted_Signal, alignment,
            transposed_Spectrogram, transposed_Mel_Spectrogram
        ]

        if not os.path.exists(self.extract_Dir + "/Summary"):
            os.makedirs(self.extract_Dir + "/Summary")
        graph_Writer = tf.summary.FileWriter(self.extract_Dir + "/Summary",
                                             self.tf_Session.graph)
        graph_Writer.close()
        self.tf_Session.run(tf.global_variables_initializer())
    def initialize(
        self,
        inputs,
        input_lengths,
        num_speakers,
        speaker_id,
        mel_targets=None,
        linear_targets=None,
        loss_coeff=None,
        rnn_decoder_test_mode=False,
        is_randomly_initialized=False,
    ):
        is_training = linear_targets is not None  # False when linear_targets keeps its default (None)
        self.is_randomly_initialized = is_randomly_initialized  # defaults to False

        with tf.variable_scope('inference') as scope:  # group everything under the 'inference' scope
            hp = self._hparams
            batch_size = tf.shape(inputs)[0]  # inputs is [N, T_in]; the first dimension is the batch size

            # Embeddings: a shared variable named 'embedding' of shape
            # [len(symbols), hp.embedding_size]; the initializer sets the starting weights
            # and dtype is the type of the returned tensor.
            char_embed_table = tf.get_variable(
                'embedding', [len(symbols), hp.embedding_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            # [N, T_in, embedding_size]
            char_embedded_inputs = tf.nn.embedding_lookup(
                char_embed_table, inputs)  # rows of char_embed_table selected by the indices in inputs
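            # Example: for inputs = [[3, 0, 7]], the lookup gathers rows 3, 0 and 7 of
            # char_embed_table, producing a [1, 3, hp.embedding_size] tensor.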

            self.num_speakers = num_speakers
            if self.num_speakers > 1:  # multi-speaker model
                if hp.speaker_embedding_size != 1:  # use a learned speaker embedding table
                    # Shared variable 'speaker_embedding' of shape
                    # [num_speakers, speaker_embedding_size], truncated-normal initialized.
                    speaker_embed_table = tf.get_variable(
                        'speaker_embedding',
                        [self.num_speakers, hp.speaker_embedding_size],
                        dtype=tf.float32,
                        initializer=tf.truncated_normal_initializer(stddev=0.5))
                    # [N, T_in, speaker_embedding_size]
                    speaker_embed = tf.nn.embedding_lookup(
                        speaker_embed_table,
                        speaker_id)  # rows of speaker_embed_table selected by speaker_id
############################################################## (TODO: add further explanation)
                if hp.model_type == 'deepvoice':  # deepvoice-style speaker conditioning
                    if hp.speaker_embedding_size == 1:  # look up per-speaker init vectors directly
                        # get_embed(inputs, num_inputs, embed_size, name) returns the row of an
                        # embedding table selected by speaker_id.
                        before_highway = get_embed(
                            speaker_id, self.num_speakers,
                            hp.enc_prenet_sizes[-1], "before_highway")
                        encoder_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers, hp.enc_rnn_size * 2,
                            "encoder_rnn_init_state")

                        attention_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers,
                            hp.attention_state_size,
                            "attention_rnn_init_state")
                        decoder_rnn_init_states = [
                            get_embed(
                                speaker_id, self.num_speakers, hp.dec_rnn_size,
                                "decoder_rnn_init_states{}".format(idx + 1))
                            for idx in range(hp.dec_layer_num)
                        ]
##############################################################
                    else:  # speaker_embedding_size != 1: derive init states from speaker_embed
                        deep_dense = lambda x, dim: tf.layers.dense(
                            x, dim, activation=tf.nn.softsign)
                        # tf.layers.dense(inputs, units, activation) builds a fully connected
                        # layer: `inputs` is the preceding layer, `units` its output size, and
                        # `activation` the nonlinearity (here softsign, x / (1 + |x|)).
                        # Lambda example: (lambda x, y: x + y)(10, 20) -> 30.
                        # https://bcho.tistory.com/1196
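                        # Example: deep_dense(speaker_embed, 128) computes
                        # softsign(speaker_embed @ W + b) with W of shape
                        # [hp.speaker_embedding_size, 128]; softsign(3.0) = 3 / (1 + 3) = 0.75.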

                        before_highway = deep_dense(
                            speaker_embed, hp.enc_prenet_sizes[-1]
                        )  # input: speaker_embed, units: hp.enc_prenet_sizes[-1] (default 128)
                        encoder_rnn_init_state = deep_dense(
                            speaker_embed, hp.enc_rnn_size * 2
                        )  # input: speaker_embed, units: hp.enc_rnn_size * 2 (default 128 * 2)

                        attention_rnn_init_state = deep_dense(
                            speaker_embed, hp.attention_state_size
                        )  # input: speaker_embed, units: hp.attention_state_size (default 256)
                        decoder_rnn_init_states = [
                            deep_dense(speaker_embed, hp.dec_rnn_size)
                            for _ in range(hp.dec_layer_num)
                        ]  # one dense layer per decoder RNN layer (hp.dec_layer_num, default 2)

                    speaker_embed = None  # deepvoice does not use speaker_embed directly
                elif hp.model_type == 'simple':  # 'simple' multi-speaker model
                    before_highway = None
                    encoder_rnn_init_state = None
                    attention_rnn_init_state = None
                    decoder_rnn_init_states = None  # no speaker-conditioned init states
                else:
                    raise Exception(
                        " [!] Unknown multi-speaker model type: {}".format(
                            hp.model_type))  # only 'deepvoice' and 'simple' are supported
            else:  # single speaker
                speaker_embed = None
                before_highway = None
                encoder_rnn_init_state = None
                attention_rnn_init_state = None
                decoder_rnn_init_states = None  # no speaker conditioning
            ##############
            # Encoder (turns the text of special characters and Hangul jamo into numeric features)
            ##############

            # [N, T_in, enc_prenet_sizes[-1]]
            prenet_outputs = prenet(
                char_embedded_inputs,
                is_training,
                hp.enc_prenet_sizes,
                hp.dropout_prob,
                scope='prenet')

            encoder_outputs = cbhg(
                prenet_outputs,
                input_lengths,
                is_training,
                hp.enc_bank_size,
                hp.enc_bank_channel_size,
                hp.enc_maxpool_width,
                hp.enc_highway_depth,
                hp.enc_rnn_size,
                hp.enc_proj_sizes,
                hp.enc_proj_width,
                scope="encoder_cbhg",
                before_highway=before_highway,
                encoder_rnn_init_state=encoder_rnn_init_state)
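            # encoder_outputs presumably comes out as [N, T_in, 2 * hp.enc_rnn_size], since the
            # CBHG ends in a bidirectional GRU; this matches encoder_rnn_init_state being sized
            # hp.enc_rnn_size * 2 above.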

            ##############
            # Attention (important!)
            ##############

            # For manual control of attention
            self.is_manual_attention = tf.placeholder(
                tf.bool,
                shape=(),
                name='is_manual_attention',
            )
            self.manual_alignments = tf.placeholder(
                tf.float32,
                shape=[None, None, None],
                name="manual_alignments",
            )
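            # Hypothetical usage: at inference time, feeding is_manual_attention=True together
            # with a precomputed alignment tensor in manual_alignments would let the caller
            # override the learned attention weights inside the AttentionWrapper below.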

            dec_prenet_outputs = DecoderPrenetWrapper(
                GRUCell(hp.attention_state_size), speaker_embed, is_training,
                hp.dec_prenet_sizes, hp.dropout_prob)

            if hp.attention_type == 'bah_mon':
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size, encoder_outputs)
            elif hp.attention_type == 'bah_norm':
                attention_mechanism = BahdanauAttention(hp.attention_size,
                                                        encoder_outputs,
                                                        normalize=True)
            elif hp.attention_type == 'luong_scaled':
                attention_mechanism = LuongAttention(hp.attention_size,
                                                     encoder_outputs,
                                                     scale=True)
            elif hp.attention_type == 'luong':
                attention_mechanism = LuongAttention(hp.attention_size,
                                                     encoder_outputs)
            elif hp.attention_type == 'bah':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size, encoder_outputs)
            elif hp.attention_type.startswith('ntm2'):
                shift_width = int(hp.attention_type.split('-')[-1])
                attention_mechanism = NTMAttention2(hp.attention_size,
                                                    encoder_outputs,
                                                    shift_width=shift_width)
            else:
                raise Exception(" [!] Unknown attention type: {}".format(
                    hp.attention_type))
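            # Example: hp.attention_type = 'ntm2-3' selects NTMAttention2 with shift_width = 3,
            # while 'bah_mon' selects plain BahdanauMonotonicAttention.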

            attention_cell = AttentionWrapper(
                dec_prenet_outputs,
                attention_mechanism,
                self.is_manual_attention,
                self.manual_alignments,
                initial_cell_state=attention_rnn_init_state,
                alignment_history=True,
                output_attention=False)

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            # [N, T_in, attention_size+attention_state_size]
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell, embed_to_concat=speaker_embed)

            # Decoder (layers specified bottom to top):
            cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)]
            for _ in range(hp.dec_layer_num):
                cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size)))

            # [N, T_in, 256]
            decoder_cell = MultiRNNCell(cells, state_is_tuple=True)

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.reduction_factor)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if hp.model_type == "deepvoice":
                # decoder_init_state[0] : AttentionWrapperState
                # = cell_state + attention + time + alignments + alignment_history
                # decoder_init_state[0][0] = attention_rnn_init_state (already applied)
                decoder_init_state = list(decoder_init_state)

                for idx, cell in enumerate(decoder_rnn_init_states):
                    shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                    shape2 = cell.get_shape().as_list()
                    if shape1 != shape2:
                        raise Exception(
                            " [!] Shape {} and {} should be equal".format(shape1, shape2))
                    decoder_init_state[idx + 1] = cell

                decoder_init_state = tuple(decoder_init_state)
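                # e.g. decoder_init_state[1] (the zero state of the first decoder GRU layer) and
                # decoder_rnn_init_states[0] are both expected to be [N, hp.dec_rnn_size];
                # otherwise the shape check above raises.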

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.reduction_factor,
                                            rnn_decoder_test_mode)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.reduction_factor)

            (decoder_outputs, _), final_decoder_state, _ = \
                    tf.contrib.seq2seq.dynamic_decode(
                            BasicDecoder(output_cell, helper, decoder_init_state),
                            maximum_iterations=hp.max_iters)

            # [N, T_out, M]
            mel_outputs = tf.reshape(decoder_outputs,
                                     [batch_size, -1, hp.num_mels])
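            # Example: with hp.reduction_factor = 5 and hp.num_mels = 80, each decoder step emits
            # a 400-dim vector, and this reshape unfolds it into 5 mel frames of 80 bins, so
            # T_out = decoder_steps * 5.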

            # Add post-processing CBHG:
            # [N, T_out, 256]
            #post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
            post_outputs = cbhg(mel_outputs,
                                None,
                                is_training,
                                hp.post_bank_size,
                                hp.post_bank_channel_size,
                                hp.post_maxpool_width,
                                hp.post_highway_depth,
                                hp.post_rnn_size,
                                hp.post_proj_sizes,
                                hp.post_proj_width,
                                scope='post_cbhg')

            if speaker_embed is not None and hp.model_type == 'simple':
                expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
                tiled_speaker_embedding = tf.tile(
                    expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])

                # [N, T_out, 256 + alpha]
                post_outputs = \
                        tf.concat([tiled_speaker_embedding, post_outputs], axis=-1)
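                # Shape sketch (assuming hp.speaker_embedding_size = 16): speaker_embed [N, 16]
                # -> expand_dims -> [N, 1, 16] -> tile -> [N, T_out, 16], concatenated with
                # post_outputs along the last axis.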

            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.speaker_id = speaker_id
            self.input_lengths = input_lengths
            self.loss_coeff = loss_coeff
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.final_decoder_state = final_decoder_state

            log('=' * 40)
            log(' model_type: %s' % hp.model_type)
            log('=' * 40)

            log('Initialized Tacotron model. Dimensions: ')
            log('    embedding:                %d' %
                char_embedded_inputs.shape[-1])
            if speaker_embed is not None:
                log('    speaker embedding:        %d' %
                    speaker_embed.shape[-1])
            else:
                log('    speaker embedding:        None')
            log('    prenet out:               %d' % prenet_outputs.shape[-1])
            log('    encoder out:              %d' % encoder_outputs.shape[-1])
            log('    attention out:            %d' %
                attention_cell.output_size)
            log('    concat attn & out:        %d' % concat_cell.output_size)
            log('    decoder cell out:         %d' % decoder_cell.output_size)
            log('    decoder out (%d frames):  %d' %
                (hp.reduction_factor, decoder_outputs.shape[-1]))
            log('    decoder out (1 frame):    %d' % mel_outputs.shape[-1])
            log('    postnet out:              %d' % post_outputs.shape[-1])
            log('    linear out:               %d' % linear_outputs.shape[-1])