def initialize(
        self,
        inputs,
        input_lengths,
        num_speakers,
        speaker_id,
        mel_targets=None,
        linear_targets=None,
        loss_coeff=None,
        rnn_decoder_test_mode=False,
        is_randomly_initialized=False,
):
    is_training2 = linear_targets is not None  # this is also True at test time -- is that intended???
    is_training = not rnn_decoder_test_mode

    self.is_randomly_initialized = is_randomly_initialized

    with tf.variable_scope('inference') as scope:
        hp = self._hparams
        batch_size = tf.shape(inputs)[0]

        # Embeddings (256)
        char_embed_table = tf.get_variable(
            'embedding', [len(symbols), hp.embedding_size],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))

        zero_pad = True
        if zero_pad:
            # Logic borrowed from a transformer implementation:
            # the <PAD> token (index 0) gets an embedding that is fixed to zero and
            # never changed by training. In other words, the first row (<PAD>) of the
            # variable created by get_variable above is never used.
            char_embed_table = tf.concat(
                (tf.zeros(shape=[1, hp.embedding_size]), char_embed_table[1:, :]), 0)

        # [N, T_in, embedding_size]
        char_embedded_inputs = tf.nn.embedding_lookup(char_embed_table, inputs)

        self.num_speakers = num_speakers
        if self.num_speakers > 1:
            if hp.speaker_embedding_size != 1:  # speaker_embedding_size = f(16)
                speaker_embed_table = tf.get_variable(
                    'speaker_embedding',
                    [self.num_speakers, hp.speaker_embedding_size],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
                # [N, T_in, speaker_embedding_size]
                speaker_embed = tf.nn.embedding_lookup(speaker_embed_table, speaker_id)

            if hp.model_type == 'deepvoice':
                if hp.speaker_embedding_size == 1:
                    before_highway = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_prenet_sizes[-1], "before_highway")  # 'enc_prenet_sizes': [f(256), f(128)]
                    encoder_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_rnn_size * 2, "encoder_rnn_init_state")
                    attention_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.attention_state_size, "attention_rnn_init_state")
                    decoder_rnn_init_states = [
                        get_embed(
                            speaker_id, self.num_speakers, hp.dec_rnn_size,
                            "decoder_rnn_init_states{}".format(idx + 1))
                        for idx in range(hp.dec_layer_num)
                    ]
                else:
                    deep_dense = lambda x, dim: tf.layers.dense(
                        x, dim, activation=tf.nn.softsign)  # softsign: x / (abs(x) + 1)

                    before_highway = deep_dense(speaker_embed, hp.enc_prenet_sizes[-1])
                    encoder_rnn_init_state = deep_dense(speaker_embed, hp.enc_rnn_size * 2)
                    attention_rnn_init_state = deep_dense(speaker_embed, hp.attention_state_size)
                    decoder_rnn_init_states = [
                        deep_dense(speaker_embed, hp.dec_rnn_size)
                        for _ in range(hp.dec_layer_num)
                    ]

                speaker_embed = None  # deepvoice does not use speaker_embed directly
            elif hp.model_type == 'simple':
                # The simple model instead feeds speaker_embed into DecoderPrenetWrapper and
                # ConcatOutputAndAttentionWrapper, where it is concatenated in.
                before_highway = None
                encoder_rnn_init_state = None
                attention_rnn_init_state = None
                decoder_rnn_init_states = None
            else:
                raise Exception(
                    " [!] Unknown multi-speaker model type: {}".format(hp.model_type))
        else:  # single-speaker case (self.num_speakers == 1)
            speaker_embed = None
            before_highway = None
            encoder_rnn_init_state = None  # init state of the bidirectional GRU
            attention_rnn_init_state = None
            decoder_rnn_init_states = None

        ##############
        # Encoder
        ##############

        # [N, T_in, enc_prenet_sizes[-1]]
        prenet_outputs = prenet(
            char_embedded_inputs, is_training,
            hp.enc_prenet_sizes, hp.dropout_prob,
            scope='prenet')  # 'enc_prenet_sizes': [f(256), f(128)], dropout_prob = 0.5  ==> (N, T_in, 128)

        # enc_rnn_size = 128
        encoder_outputs = cbhg(
            prenet_outputs, input_lengths, is_training,
            hp.enc_bank_size, hp.enc_bank_channel_size,
            hp.enc_maxpool_width, hp.enc_highway_depth, hp.enc_rnn_size,
            hp.enc_proj_sizes, hp.enc_proj_width,
            scope="encoder_cbhg",
            before_highway=before_highway,
            encoder_rnn_init_state=encoder_rnn_init_state)

        ##############
        # Attention
        ##############

        # For manual control of attention
        self.is_manual_attention = tf.placeholder(
            tf.bool, shape=(), name='is_manual_attention')
        self.manual_alignments = tf.placeholder(
            tf.float32, shape=[None, None, None], name="manual_alignments")

        # single: attention_size = 128
        if hp.attention_type == 'bah_mon':
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, normalize=False)
        elif hp.attention_type == 'bah_mon_norm':  # added by hccho
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, normalize=True)
        elif hp.attention_type == 'loc_sen':  # Location Sensitive Attention
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'gmm':  # GMM Attention
            attention_mechanism = GmmAttention(
                hp.attention_size, memory=encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'bah_mon_norm_hccho':
            attention_mechanism = BahdanauMonotonicAttention_hccho(
                hp.attention_size, encoder_outputs, normalize=True)
        elif hp.attention_type == 'bah_norm':
            attention_mechanism = BahdanauAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, normalize=True)
        elif hp.attention_type == 'luong_scaled':
            attention_mechanism = LuongAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, scale=True)
        elif hp.attention_type == 'luong':
            attention_mechanism = LuongAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'bah':
            attention_mechanism = BahdanauAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths)
        else:
            raise Exception(" [!] Unknown attention type: {}".format(hp.attention_type))

        # Combine DecoderPrenetWrapper and attention_mechanism into an AttentionWrapper.
        # carpedm20 re-implemented AttentionWrapper from the TensorFlow source, whereas
        # Keith Ito used the stock TensorFlow AttentionWrapper.
        attention_cell = AttentionWrapper(
            GRUCell(hp.attention_state_size),
            attention_mechanism,
            self.is_manual_attention,
            self.manual_alignments,
            initial_cell_state=attention_rnn_init_state,
            alignment_history=True,
            output_attention=False
        )  # Note output_attention=False, and attention_layer_size is not set,
           # so attention == the context vector.
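        # With output_attention=False the wrapper returns the RNN cell output (not the
        # attention) at each step, while the context vector stays available as
        # state.attention; ConcatOutputAndAttentionWrapper below re-joins the two.
        # With attention_layer_size unset, no projection is applied, so the attention
        # really is the raw context vector.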
        # attention_state_size = 256
        dec_prenet_outputs = DecoderPrenetWrapper(
            attention_cell, speaker_embed, is_training,
            hp.dec_prenet_sizes, hp.dropout_prob)  # dec_prenet_sizes = [f(256), f(128)]

        # Concatenate attention context vector and RNN cell output into a 512D vector.
        # [N, T_in, attention_size + attention_state_size]
        # Of the AttentionWrapperState members (attention, cell_state, ...) that
        # dec_prenet_outputs passes to the next cell, attention and output are
        # concatenated and emitted as the new output. Since the output equals the
        # cell state, this is concat [ output(=cell_state) | attention ].
        concat_cell = ConcatOutputAndAttentionWrapper(
            dec_prenet_outputs, embed_to_concat=speaker_embed
        )  # builds a new output as concat(output, attention, speaker_embed)

        # Decoder (layers specified bottom to top): dec_rnn_size = 256
        cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)
                 ]  # OutputProjectionWrapper does not seem to be mentioned in the paper...
        for _ in range(hp.dec_layer_num):  # hp.dec_layer_num = 2
            cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size)))

        # [N, T_in, 256]
        decoder_cell = MultiRNNCell(cells, state_is_tuple=True)

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(
            decoder_cell, hp.num_mels * hp.reduction_factor
        )  # couldn't this be extended to also emit a stop token, i.e.
           # (hp.num_mels + 1) * hp.reduction_factor???

        decoder_init_state = output_cell.zero_state(
            batch_size=batch_size, dtype=tf.float32
        )  # zero_state here already contains the value passed to AttentionWrapper above.

        if hp.model_type == "deepvoice":
            # decoder_init_state[0] : AttentionWrapperState
            #   = cell_state + attention + time + alignments + alignment_history
            # decoder_init_state[0][0] = attention_rnn_init_state (already applied:
            #   it was passed as AttentionWrapper's initial_cell_state above)
            decoder_init_state = list(decoder_init_state)

            for idx, cell in enumerate(decoder_rnn_init_states):
                shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                shape2 = cell.get_shape().as_list()
                if shape1 != shape2:
                    raise Exception(
                        " [!] Shape {} and {} should be equal".format(shape1, shape2))
                decoder_init_state[idx + 1] = cell

            decoder_init_state = tuple(decoder_init_state)

        if is_training2:
            # rnn_decoder_test_mode is True in test mode and False in train mode
            helper = TacoTrainingHelper(
                inputs, mel_targets, hp.num_mels, hp.reduction_factor,
                rnn_decoder_test_mode)  # inputs is only used to compute batch_size
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.reduction_factor)

        (decoder_outputs, _), final_decoder_state, _ = \
            tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(output_cell, helper, decoder_init_state),
                maximum_iterations=hp.max_iters)  # max_iters=200

        # [N, T_out, M]
        mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])

        # Add post-processing CBHG:
        # [N, T_out, 256]
        # post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
        post_outputs = cbhg(
            mel_outputs, None, is_training,
            hp.post_bank_size, hp.post_bank_channel_size,
            hp.post_maxpool_width, hp.post_highway_depth, hp.post_rnn_size,
            hp.post_proj_sizes, hp.post_proj_width,
            scope='post_cbhg')

        if speaker_embed is not None and hp.model_type == 'simple':
            expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
            tiled_speaker_embedding = tf.tile(
                expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])

            # [N, T_out, 256 + alpha]
            post_outputs = tf.concat([tiled_speaker_embedding, post_outputs], axis=-1)

        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F(1025)]

        # Grab alignments from the final decoder state:
        # The MultiRNNCell has 3 layers, so final_decoder_state is a len-3 tuple
        # ==> final_decoder_state[0]
        alignments = tf.transpose(
            final_decoder_state[0].alignment_history.stack(),
            [1, 2, 0])  # batch_size, text length (encoder), target length (decoder)

        self.inputs = inputs
        self.speaker_id = speaker_id
        self.input_lengths = input_lengths
        self.loss_coeff = loss_coeff
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.final_decoder_state = final_decoder_state

        log('=' * 40)
        log(' model_type: %s' % hp.model_type)
        log('=' * 40)

        log('Initialized Tacotron model. Dimensions: ')
        log('  embedding:               %d' % char_embedded_inputs.shape[-1])
        if speaker_embed is not None:
            log('  speaker embedding:       %d' % speaker_embed.shape[-1])
        else:
            log('  speaker embedding:       None')
        log('  prenet out:              %d' % prenet_outputs.shape[-1])
        log('  encoder out:             %d' % encoder_outputs.shape[-1])
        log('  attention out:           %d' % attention_cell.output_size)
        log('  concat attn & out:       %d' % concat_cell.output_size)
        log('  decoder cell out:        %d' % decoder_cell.output_size)
        log('  decoder out (%d frames): %d' % (hp.reduction_factor, decoder_outputs.shape[-1]))
        log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
        log('  postnet out:             %d' % post_outputs.shape[-1])
        log('  linear out:              %d' % linear_outputs.shape[-1])
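A minimal self-contained sketch of the zero-pad embedding trick used above (the function name and sizes are illustrative, not from the original): row 0 (<PAD>) is rebuilt from tf.zeros, a constant, so no gradient ever reaches it and only rows 1..V-1 stay trainable.

import tensorflow as tf

def zero_padded_embedding(vocab_size, embed_size, name='embedding'):
    table = tf.get_variable(
        name, [vocab_size, embed_size], dtype=tf.float32,
        initializer=tf.truncated_normal_initializer(stddev=0.5))
    # The concat output is a new tensor whose first row is constant zero;
    # backprop into table[0] is cut off because that row is never used.
    return tf.concat((tf.zeros([1, embed_size]), table[1:, :]), axis=0)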
def initialize(self,
               inputs,
               input_lengths,
               num_speakers,
               speaker_id,
               mel_targets=None,
               linear_targets=None,
               loss_coeff=None,
               rnn_decoder_test_mode=False,
               is_randomly_initialized=False):
    is_training = linear_targets is not None
    self.is_randomly_initialized = is_randomly_initialized

    with tf.variable_scope('inference') as scope:
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), hp.embed_depth],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(
            embedding_table, inputs)  # [N, T_in, embed_depth=256]

        self.num_speakers = num_speakers
        if self.num_speakers > 1:
            if hp.speaker_embedding_size != 1:
                speaker_embed_table = tf.get_variable(
                    'speaker_embedding',
                    [self.num_speakers, hp.speaker_embedding_size],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
                # [N, T_in, speaker_embedding_size]
                speaker_embed = tf.nn.embedding_lookup(speaker_embed_table, speaker_id)

            if hp.model_type == 'deepvoice':
                if hp.speaker_embedding_size == 1:
                    before_highway = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_prenet_sizes[-1], "before_highway")
                    encoder_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_rnn_size * 2, "encoder_rnn_init_state")
                    attention_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.attention_state_size, "attention_rnn_init_state")
                    decoder_rnn_init_states = [
                        get_embed(
                            speaker_id, self.num_speakers, hp.dec_rnn_size,
                            "decoder_rnn_init_states{}".format(idx + 1))
                        for idx in range(hp.dec_layer_num)
                    ]
                else:
                    deep_dense = lambda x, dim: \
                        tf.layers.dense(x, dim, activation=tf.nn.softsign)

                    before_highway = deep_dense(speaker_embed, hp.enc_prenet_sizes[-1])
                    encoder_rnn_init_state = deep_dense(speaker_embed, hp.enc_rnn_size * 2)
                    attention_rnn_init_state = deep_dense(speaker_embed, hp.attention_state_size)
                    decoder_rnn_init_states = [
                        deep_dense(speaker_embed, hp.dec_rnn_size)
                        for _ in range(hp.dec_layer_num)
                    ]

                speaker_embed = None  # deepvoice does not use speaker_embed directly
            elif hp.model_type == 'simple':
                before_highway = None
                encoder_rnn_init_state = None
                attention_rnn_init_state = None
                decoder_rnn_init_states = None
            else:
                raise Exception(
                    " [!] Unknown multi-speaker model type: {}".format(hp.model_type))
        else:
            speaker_embed = None
            before_highway = None
            encoder_rnn_init_state = None
            attention_rnn_init_state = None
            decoder_rnn_init_states = None

        # Encoder
        prenet_outputs = prenet(
            embedded_inputs, is_training,
            hp.enc_prenet_sizes, hp.dropout_prob,
            scope='prenet')  # [N, T_in, prenet_depths[-1]=128]
        encoder_outputs = cbhg(
            prenet_outputs, input_lengths, is_training,  # [N, T_in, encoder_depth=256]
            hp.enc_bank_size, hp.enc_bank_channel_size,
            hp.enc_maxpool_width, hp.enc_highway_depth, hp.enc_rnn_size,
            hp.enc_proj_sizes, hp.enc_proj_width,
            scope="encoder_cbhg",
            before_highway=before_highway,
            encoder_rnn_init_state=encoder_rnn_init_state)

        # Attention
        # For manual control of attention
        self.is_manual_attention = tf.placeholder(
            tf.bool, shape=(), name='is_manual_attention')
        self.manual_alignments = tf.placeholder(
            tf.float32, shape=[None, None, None], name="manual_alignments")

        dec_prenet_outputs = DecoderPrenetWrapper(
            GRUCell(hp.attention_state_size),
            speaker_embed, is_training,
            hp.dec_prenet_sizes, hp.dropout_prob)

        if hp.attention_type == 'bah_mon':
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size, encoder_outputs)
        elif hp.attention_type == 'bah_norm':
            attention_mechanism = BahdanauAttention(
                hp.attention_size, encoder_outputs, normalize=True)
        elif hp.attention_type == 'luong_scaled':
            attention_mechanism = LuongAttention(
                hp.attention_size, encoder_outputs, scale=True)
        elif hp.attention_type == 'luong':
            attention_mechanism = LuongAttention(
                hp.attention_size, encoder_outputs)
        elif hp.attention_type == 'bah':
            attention_mechanism = BahdanauAttention(
                hp.attention_size, encoder_outputs)
        elif hp.attention_type.startswith('ntm2'):
            shift_width = int(hp.attention_type.split('-')[-1])
            attention_mechanism = NTMAttention2(
                hp.attention_size, encoder_outputs, shift_width=shift_width)
        else:
            raise Exception(" [!] Unknown attention type: {}".format(hp.attention_type))

        attention_cell = AttentionWrapper(
            dec_prenet_outputs,
            attention_mechanism,
            self.is_manual_attention,
            self.manual_alignments,
            initial_cell_state=attention_rnn_init_state,
            alignment_history=True,
            output_attention=False)

        # Concatenate attention context vector and RNN cell output into a
        # 2*attention_depth=512D vector.
        # [N, T_in, attention_size + attention_state_size]
        concat_cell = ConcatOutputAndAttentionWrapper(
            attention_cell, embed_to_concat=speaker_embed)

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell(
            [
                OutputProjectionWrapper(concat_cell, hp.dec_rnn_size),
                ResidualWrapper(GRUCell(hp.dec_rnn_size)),
                ResidualWrapper(GRUCell(hp.dec_rnn_size)),
            ],
            state_is_tuple=True)  # [N, T_in, decoder_depth=256]

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(
            decoder_cell, hp.num_mels * hp.reduction_factor)
        decoder_init_state = output_cell.zero_state(
            batch_size=batch_size, dtype=tf.float32)

        if hp.model_type == "deepvoice":
            # decoder_init_state[0] : AttentionWrapperState
            #   = cell_state + attention + time + alignments + alignment_history
            # decoder_init_state[0][0] = attention_rnn_init_state (already applied)
            decoder_init_state = list(decoder_init_state)

            for idx, cell in enumerate(decoder_rnn_init_states):
                shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                shape2 = cell.get_shape().as_list()
                if shape1 != shape2:
                    raise Exception(
                        " [!] Shape {} and {} should be equal".format(shape1, shape2))
                decoder_init_state[idx + 1] = cell

            decoder_init_state = tuple(decoder_init_state)

        if is_training:
            helper = TacoTrainingHelper(
                inputs, mel_targets, hp.num_mels, hp.reduction_factor,
                rnn_decoder_test_mode)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.reduction_factor)

        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry
        mel_outputs = tf.reshape(
            decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

        # Add post-processing CBHG:
        # [N, T_out, postnet_depth=256]
        post_outputs = cbhg(
            mel_outputs, None, is_training,
            hp.post_bank_size, hp.post_bank_channel_size,
            hp.post_maxpool_width, hp.post_highway_depth, hp.post_rnn_size,
            hp.post_proj_sizes, hp.post_proj_width,
            scope='post_cbhg')

        if speaker_embed is not None and hp.model_type == 'simple':
            expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
            tiled_speaker_embedding = tf.tile(
                expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])

            # [N, T_out, 256 + alpha]
            post_outputs = tf.concat([tiled_speaker_embedding, post_outputs], axis=-1)

        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(
            final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.speaker_id = speaker_id
        self.input_lengths = input_lengths
        self.loss_coeff = loss_coeff
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.final_decoder_state = final_decoder_state

        log('=' * 40)
        log(' model_type: %s' % hp.model_type)
        log('=' * 40)

        log('Initialized Tacotron model. Dimensions: ')
        log('  embedding:               %d' % embedded_inputs.shape[-1])
        if speaker_embed is not None:
            log('  speaker embedding:       %d' % speaker_embed.shape[-1])
        else:
            log('  speaker embedding:       None')
        log('  prenet out:              %d' % prenet_outputs.shape[-1])
        log('  encoder out:             %d' % encoder_outputs.shape[-1])
        log('  attention out:           %d' % attention_cell.output_size)
        log('  concat attn & out:       %d' % concat_cell.output_size)
        log('  decoder cell out:        %d' % decoder_cell.output_size)
        log('  decoder out (%d frames): %d' % (hp.reduction_factor, decoder_outputs.shape[-1]))
        log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
        log('  postnet out:             %d' % post_outputs.shape[-1])
        log('  linear out:              %d' % linear_outputs.shape[-1])
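A quick numpy sketch (sizes are made up) of the reduction-factor reshape above: the decoder emits r frames per step packed into one [M*r] vector, and the reshape unstacks them back to one mel frame per row, so T_out = r * (decoder steps).

import numpy as np

N, steps, M, r = 2, 3, 80, 5
decoder_outputs = np.zeros([N, steps, M * r], np.float32)  # [N, T_out/r, M*r]
mel_outputs = decoder_outputs.reshape([N, -1, M])          # [N, T_out, M]
assert mel_outputs.shape == (N, steps * r, M)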
def initialize(self,
               inputs,
               input_lengths,
               num_speakers,
               speaker_id=None,
               mel_targets=None,
               linear_targets=None,
               is_training=False,
               loss_coeff=None,
               stop_token_targets=None):
    with tf.variable_scope('Eembedding') as scope:
        hp = self._hparams
        batch_size = tf.shape(inputs)[0]

        # Embeddings (256)
        char_embed_table = tf.get_variable(
            'inputs_embedding', [len(symbols), hp.embedding_size],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))

        zero_pad = True
        if zero_pad:
            # Logic borrowed from a transformer implementation: the <PAD> token
            # (index 0) gets an embedding fixed to zero that training never updates,
            # so the first row (<PAD>) of the variable above is never used.
            char_embed_table = tf.concat(
                (tf.zeros(shape=[1, hp.embedding_size]), char_embed_table[1:, :]), 0)

        # [N, T_in, embedding_size]
        char_embedded_inputs = tf.nn.embedding_lookup(char_embed_table, inputs)

        self.num_speakers = num_speakers
        if self.num_speakers > 1:
            speaker_embed_table = tf.get_variable(
                'speaker_embedding',
                [self.num_speakers, hp.speaker_embedding_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            # [N, T_in, speaker_embedding_size]
            speaker_embed = tf.nn.embedding_lookup(speaker_embed_table, speaker_id)

            deep_dense = lambda x, dim, name: tf.layers.dense(
                x, dim, activation=tf.nn.softsign, name=name)  # softsign: x / (abs(x) + 1)

            encoder_rnn_init_state = deep_dense(
                speaker_embed, hp.encoder_lstm_units * 4,
                'encoder_init_dense')  # hp.encoder_lstm_units = 256
            decoder_rnn_init_states = [
                deep_dense(speaker_embed, hp.decoder_lstm_units * 2,
                           'decoder_init_dense_{}'.format(i))
                for i in range(hp.decoder_layers)
            ]  # hp.decoder_lstm_units = 1024

            speaker_embed = None
        else:  # single-speaker case
            speaker_embed = None
            encoder_rnn_init_state = None  # init state of the bidirectional LSTM
            attention_rnn_init_state = None
            decoder_rnn_init_states = None

    with tf.variable_scope('Encoder') as scope:
        ##############
        # Encoder
        ##############
        x = char_embedded_inputs
        for i in range(hp.enc_conv_num_layers):
            x = tf.layers.conv1d(
                x,
                filters=hp.enc_conv_channels,
                kernel_size=hp.enc_conv_kernel_size,
                padding='same',
                activation=tf.nn.relu,
                name='Encoder_{}'.format(i))
            x = tf.layers.batch_normalization(x, training=is_training)
            x = tf.layers.dropout(
                x, rate=hp.dropout_prob, training=is_training,
                name='dropout_{}'.format(i))

        if encoder_rnn_init_state is not None:
            initial_state_fw_c, initial_state_fw_h, initial_state_bw_c, initial_state_bw_h = \
                tf.split(encoder_rnn_init_state, 4, 1)
            initial_state_fw = LSTMStateTuple(initial_state_fw_c, initial_state_fw_h)
            initial_state_bw = LSTMStateTuple(initial_state_bw_c, initial_state_bw_h)
        else:  # single mode
            initial_state_fw, initial_state_bw = None, None

        cell_fw = ZoneoutLSTMCell(
            hp.encoder_lstm_units, is_training,
            zoneout_factor_cell=hp.tacotron_zoneout_rate,
            zoneout_factor_output=hp.tacotron_zoneout_rate,
            name='encoder_fw_LSTM')
        cell_bw = ZoneoutLSTMCell(
            hp.encoder_lstm_units, is_training,
            zoneout_factor_cell=hp.tacotron_zoneout_rate,
            zoneout_factor_output=hp.tacotron_zoneout_rate,
            name='encoder_bw_LSTM')

        encoder_conv_output = x
        outputs, states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw,
            cell_bw,
            encoder_conv_output,
            sequence_length=input_lengths,
            initial_state_fw=initial_state_fw,
            initial_state_bw=initial_state_bw,
            dtype=tf.float32)

        # encoder_outputs = [N, T, 2*encoder_lstm_units] = [N, T, 512]
        encoder_outputs = tf.concat(outputs, axis=2)  # Concat and return forward + backward outputs

    with tf.variable_scope('Decoder') as scope:
        ##############
        # Attention
        ##############
        if hp.attention_type == 'bah_mon':
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, normalize=False)
        elif hp.attention_type == 'bah_mon_norm':  # added by hccho
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, normalize=True)
        elif hp.attention_type == 'loc_sen':  # Location Sensitive Attention
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_size, encoder_outputs,
                hparams=hp,
                is_training=is_training,
                mask_encoder=hp.mask_encoder,
                memory_sequence_length=input_lengths,
                smoothing=hp.smoothing,
                cumulate_weights=hp.cumulative_weights)
        elif hp.attention_type == 'gmm':  # GMM Attention
            attention_mechanism = GmmAttention(
                hp.attention_size, memory=encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'bah_norm':
            attention_mechanism = BahdanauAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, normalize=True)
        elif hp.attention_type == 'luong_scaled':
            attention_mechanism = LuongAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, scale=True)
        elif hp.attention_type == 'luong':
            attention_mechanism = LuongAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'bah':
            attention_mechanism = BahdanauAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths)
        else:
            raise Exception(" [!] Unknown attention type: {}".format(hp.attention_type))

        decoder_lstm = [
            ZoneoutLSTMCell(
                hp.decoder_lstm_units, is_training,
                zoneout_factor_cell=hp.tacotron_zoneout_rate,
                zoneout_factor_output=hp.tacotron_zoneout_rate,
                name='decoder_LSTM_{}'.format(i + 1))
            for i in range(hp.decoder_layers)
        ]
        decoder_lstm = tf.contrib.rnn.MultiRNNCell(decoder_lstm, state_is_tuple=True)
        decoder_init_state = decoder_lstm.zero_state(
            batch_size=batch_size, dtype=tf.float32)

        if hp.model_type == "multi-speaker":
            decoder_init_state = list(decoder_init_state)

            for idx, cell in enumerate(decoder_rnn_init_states):
                shape1 = decoder_init_state[idx][0].get_shape().as_list()
                shape2 = cell.get_shape().as_list()
                if shape1[1] * 2 != shape2[1]:
                    raise Exception(
                        " [!] Shape {} and {} should be equal".format(shape1, shape2))
                c, h = tf.split(cell, 2, 1)
                decoder_init_state[idx] = LSTMStateTuple(c, h)

            decoder_init_state = tuple(decoder_init_state)

        attention_cell = AttentionWrapper(
            decoder_lstm,
            attention_mechanism,
            initial_cell_state=decoder_init_state,
            alignment_history=True,
            output_attention=False
        )  # Note output_attention=False, and attention_layer_size is not set,
           # so attention == the context vector.
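        # Multi-speaker note: each decoder_rnn_init_states[i] was built with width
        # decoder_lstm_units * 2 above precisely so that tf.split(cell, 2, 1) can
        # break it into the (c, h) halves of an LSTMStateTuple, e.g. for
        # decoder_lstm_units = 1024 a [N, 2048] projection becomes two [N, 1024]
        # tensors: the cell state c and the hidden state h.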
        # attention_state_size = 256
        # Decoder input -> prenet -> decoder_lstm -> concat[output, attention]
        dec_prenet_outputs = DecoderWrapper(
            attention_cell, is_training, hp.dec_prenet_sizes,
            hp.dropout_prob, hp.inference_prenet_dropout)
        dec_outputs_cell = OutputProjectionWrapper(
            dec_prenet_outputs, (hp.num_mels + 1) * hp.reduction_factor)

        if is_training:
            helper = TacoTrainingHelper(mel_targets, hp.num_mels, hp.reduction_factor)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.reduction_factor)

        decoder_init_state = dec_outputs_cell.zero_state(
            batch_size=batch_size, dtype=tf.float32)
        (decoder_outputs, _), final_decoder_state, _ = \
            tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(dec_outputs_cell, helper, decoder_init_state),
                maximum_iterations=int(hp.max_n_frame / hp.reduction_factor))  # max_iters=200

        decoder_mel_outputs = tf.reshape(
            decoder_outputs[:, :, :hp.num_mels * hp.reduction_factor],
            [batch_size, -1, hp.num_mels])  # [N, iters, 400] -> [N, 5*iters, 80]
        stop_token_outputs = tf.reshape(
            decoder_outputs[:, :, hp.num_mels * hp.reduction_factor:],
            [batch_size, -1])  # [N, iters, 5] -> [N, 5*iters]

        # Postnet
        x = decoder_mel_outputs
        for i in range(hp.postnet_num_layers):
            activation = tf.nn.tanh if i != (hp.postnet_num_layers - 1) else None
            x = tf.layers.conv1d(
                x,
                filters=hp.postnet_channels,
                kernel_size=hp.postnet_kernel_size,
                padding='same',
                activation=activation,
                name='Postnet_{}'.format(i))
            x = tf.layers.batch_normalization(x, training=is_training)
            x = tf.layers.dropout(
                x, rate=hp.dropout_prob, training=is_training,
                name='Postnet_dropout_{}'.format(i))

        residual = tf.layers.dense(x, hp.num_mels, name='residual_projection')
        mel_outputs = decoder_mel_outputs + residual

        # Add post-processing CBHG:
        # mel_outputs: (N, T, num_mels)
        post_outputs = cbhg(
            mel_outputs, None, is_training,
            hp.post_bank_size, hp.post_bank_channel_size,
            hp.post_maxpool_width, hp.post_highway_depth, hp.post_rnn_size,
            hp.post_proj_sizes, hp.post_proj_width,
            scope='post_cbhg')
        linear_outputs = tf.layers.dense(
            post_outputs, hp.num_freq,
            name='linear_spectogram_projection')  # [N, T_out, F(1025)]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(
            final_decoder_state.alignment_history.stack(),
            [1, 2, 0])  # batch_size, text length (encoder), target length (decoder)

    self.inputs = inputs
    self.speaker_id = speaker_id
    self.input_lengths = input_lengths
    self.loss_coeff = loss_coeff
    self.decoder_mel_outputs = decoder_mel_outputs
    self.mel_outputs = mel_outputs
    self.linear_outputs = linear_outputs
    self.alignments = alignments
    self.mel_targets = mel_targets
    self.linear_targets = linear_targets
    self.final_decoder_state = final_decoder_state
    self.stop_token_targets = stop_token_targets
    self.stop_token_outputs = stop_token_outputs
    self.all_vars = tf.trainable_variables()

    log('=' * 40)
    log(' model_type: %s' % hp.model_type)
    log('=' * 40)

    log('Initialized Tacotron model. Dimensions: ')
    log('  embedding:               %d' % char_embedded_inputs.shape[-1])
    log('  encoder conv out:        %d' % encoder_conv_output.shape[-1])
    log('  encoder out:             %d' % encoder_outputs.shape[-1])
    log('  attention out:           %d' % attention_cell.output_size)
    log('  decoder prenet lstm concat out: %d' % dec_prenet_outputs.output_size)
    log('  decoder cell out:        %d' % dec_outputs_cell.output_size)
    log('  decoder out (%d frames): %d' % (hp.reduction_factor, decoder_outputs.shape[-1]))
    log('  decoder mel out:         %d' % decoder_mel_outputs.shape[-1])
    log('  mel out:                 %d' % mel_outputs.shape[-1])
    log('  postnet out:             %d' % post_outputs.shape[-1])
    log('  linear out:              %d' % linear_outputs.shape[-1])
    log('  Tacotron Parameters      {:.3f} Million.'.format(
        np.sum([np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000))
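A small numpy sketch (with the assumed sizes num_mels=80, r=5 from the comments above) of the slice-and-reshape that separates mel frames from stop tokens: each decoder step emits (num_mels + 1) * r values, of which the first num_mels * r are r packed mel frames and the last r are per-frame stop-token values.

import numpy as np

N, steps, num_mels, r = 2, 4, 80, 5
decoder_outputs = np.zeros([N, steps, (num_mels + 1) * r], np.float32)
decoder_mel_outputs = decoder_outputs[:, :, :num_mels * r].reshape([N, -1, num_mels])
stop_token_outputs = decoder_outputs[:, :, num_mels * r:].reshape([N, -1])
assert decoder_mel_outputs.shape == (N, steps * r, num_mels)
assert stop_token_outputs.shape == (N, steps * r)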
def initialize(self,
               inputs,
               input_lengths,
               mel_targets=None,
               linear_targets=None,
               stop_token_targets=None):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
        inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is the number
            of steps in the input time series, and values are character IDs
        input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
            lengths of each sequence in inputs.
        mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
            the number of steps in the output time series, M is num_mels, and values are
            entries in the mel spectrogram. Only needed for training.
        linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out
            is the number of steps in the output time series, F is num_freq, and values are
            entries in the linear spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
        is_training = linear_targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), hp.embed_depth],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(
            embedding_table, inputs)  # [N, T_in, embed_depth=256]

    with tf.variable_scope('Encoder') as scope:
        x = embedded_inputs

        # 3 conv layers
        for i in range(3):
            x = tf.layers.conv1d(
                x, filters=512, kernel_size=5, padding='same',
                activation=tf.nn.relu, name='Encoder_{}'.format(i))
            x = tf.layers.batch_normalization(x, training=is_training)
            x = tf.layers.dropout(
                x, rate=0.5, training=is_training, name='dropout_{}'.format(i))
        encoder_conv_output = x

        # bi-directional LSTM
        cell_fw = ZoneoutLSTMCell(
            256, is_training,
            zoneout_factor_cell=0.1, zoneout_factor_output=0.1,
            name='encoder_fw_LSTM')
        cell_bw = ZoneoutLSTMCell(
            256, is_training,
            zoneout_factor_cell=0.1, zoneout_factor_output=0.1,
            name='encoder_bw_LSTM')

        outputs, states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw, cell_bw, encoder_conv_output,
            sequence_length=input_lengths, dtype=tf.float32)

        # encoder_outputs = [N, T, 2*encoder_lstm_units] = [N, T, 512]
        encoder_outputs = tf.concat(outputs, axis=2)  # Concat and return forward + backward outputs

    with tf.variable_scope('Decoder') as scope:
        if hp.attention_type == 'loc_sen':  # Location Sensitive Attention
            attention_mechanism = LocationSensitiveAttention(
                128, encoder_outputs,
                hparams=hp,
                is_training=is_training,
                mask_encoder=True,
                memory_sequence_length=input_lengths,
                smoothing=False,
                cumulate_weights=True)
        elif hp.attention_type == 'gmm':  # GMM Attention
            attention_mechanism = GmmAttention(
                128, memory=encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'step_bah':
            attention_mechanism = BahdanauStepwiseMonotonicAttention(
                128, encoder_outputs,
                memory_sequence_length=input_lengths, mode="parallel")
        elif hp.attention_type == 'mon_bah':
            attention_mechanism = BahdanauMonotonicAttention(
                128, encoder_outputs,
                memory_sequence_length=input_lengths, normalize=True)
        elif hp.attention_type == 'loung':
            attention_mechanism = LuongAttention(
                128, encoder_outputs, memory_sequence_length=input_lengths)

        # attention_mechanism = LocationSensitiveAttention(
        #     128, encoder_outputs, hparams=hp, is_training=is_training,
        #     mask_encoder=True, memory_sequence_length=input_lengths,
        #     smoothing=False, cumulate_weights=True)
        # mask_encoder: whether to mask encoder padding while computing location
        #     sensitive attention. Set to True for better prosody but slower convergence.
        # cumulate_weights: whether to cumulate (sum) all previous attention weights
        #     or simply feed previous weights (recommended: True)
        decoder_lstm = [
            ZoneoutLSTMCell(
                1024, is_training,
                zoneout_factor_cell=0.1, zoneout_factor_output=0.1,
                name='decoder_LSTM_{}'.format(i + 1))
            for i in range(2)
        ]
        decoder_lstm = tf.contrib.rnn.MultiRNNCell(decoder_lstm, state_is_tuple=True)
        # decoder_init_state = decoder_lstm.zero_state(batch_size=batch_size, dtype=tf.float32)  # not present in the TensorFlow 1 version

        attention_cell = AttentionWrapper(
            decoder_lstm, attention_mechanism,
            alignment_history=True, output_attention=False)

        # attention_state_size = 256
        # Decoder input -> prenet -> decoder_lstm -> concat[output, attention]
        dec_outputs = DecoderPrenetWrapper(attention_cell, is_training, hp.prenet_depths)
        dec_outputs_cell = OutputProjectionWrapper(
            dec_outputs, hp.num_mels * hp.outputs_per_step)

        if is_training:
            helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

        decoder_init_state = dec_outputs_cell.zero_state(
            batch_size=batch_size, dtype=tf.float32)
        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(dec_outputs_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry
        decoder_mel_outputs = tf.reshape(
            decoder_outputs[:, :, :hp.num_mels * hp.outputs_per_step],
            [batch_size, -1, hp.num_mels])  # [N, T_out, M]
        # stop_token_outputs = tf.reshape(
        #     decoder_outputs[:, :, hp.num_mels * hp.outputs_per_step:], [batch_size, -1])  # [N, iters]

        # Postnet
        x = decoder_mel_outputs
        for i in range(5):
            activation = tf.nn.tanh if i != (4) else None
            x = tf.layers.conv1d(
                x, filters=512, kernel_size=5, padding='same',
                activation=activation, name='Postnet_{}'.format(i))
            x = tf.layers.batch_normalization(x, training=is_training)
            x = tf.layers.dropout(
                x, rate=0.5, training=is_training, name='Postnet_dropout_{}'.format(i))

        residual = tf.layers.dense(x, hp.num_mels, name='residual_projection')
        mel_outputs = decoder_mel_outputs + residual

        # Add post-processing CBHG:
        # mel_outputs: (N, T, num_mels)
        post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training, hp.postnet_depth)
        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F(1025)]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(
            final_decoder_state.alignment_history.stack(),
            [1, 2, 0])  # batch_size, text length (encoder), target length (decoder)

    self.inputs = inputs
    self.input_lengths = input_lengths
    self.decoder_mel_outputs = decoder_mel_outputs
    self.mel_outputs = mel_outputs
    self.linear_outputs = linear_outputs
    self.alignments = alignments
    self.mel_targets = mel_targets
    self.linear_targets = linear_targets
    # self.stop_token_targets = stop_token_targets
    # self.stop_token_outputs = stop_token_outputs
    self.all_vars = tf.trainable_variables()

    log('Initialized Tacotron model. Dimensions: ')
    log('  embedding:               %d' % embedded_inputs.shape[-1])
    # log('  prenet out:              %d' % prenet_outputs.shape[-1])
    log('  encoder out:             %d' % encoder_outputs.shape[-1])
    log('  attention out:           %d' % attention_cell.output_size)
    # log('  concat attn & out:       %d' % concat_cell.output_size)
    log('  decoder cell out:        %d' % dec_outputs_cell.output_size)
    log('  decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
    log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
    log('  postnet out:             %d' % post_outputs.shape[-1])
    log('  linear out:              %d' % linear_outputs.shape[-1])
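ZoneoutLSTMCell above is a custom cell; the core idea of zoneout is small enough to sketch in numpy (a toy illustration of the regularizer, not the cell's actual implementation): with probability z a unit keeps its previous value instead of the new one -- like dropout, except units "drop" to their previous state rather than to zero.

import numpy as np

def zoneout(prev_state, new_state, z=0.1, training=True):
    if training:
        # Sample a per-unit mask: 1 keeps the previous state, 0 takes the update.
        keep_prev = (np.random.rand(*prev_state.shape) < z).astype(prev_state.dtype)
        return keep_prev * prev_state + (1.0 - keep_prev) * new_state
    # At inference, sampling is replaced by the expectation.
    return z * prev_state + (1.0 - z) * new_state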
def Tensor_Generate(self):
    placeholder_Dict = self.pattern_Feeder.placeholder_Dict

    with tf.variable_scope('encoder') as scope:
        batch_Size = tf.shape(placeholder_Dict["Token"])[0]

        token_Embedding = tf.get_variable(
            name="token_Embedding",
            shape=(encoder_Parameters.number_of_Token,
                   encoder_Parameters.token_Embedding_Size),
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_Input_Pattern = tf.nn.embedding_lookup(
            token_Embedding, placeholder_Dict["Token"]
        )  # Shape: [batch_Size, token_Length, embedded_Pattern_Size]

        encoder_Activation = Encoder(
            input_Pattern=embedded_Input_Pattern,
            input_Length=placeholder_Dict["Token_Length"],
            is_Training=placeholder_Dict["Is_Training"],
            scope="encoder_Module")

    with tf.variable_scope('attention') as scope:
        attention_Mechanism = BahdanauMonotonicAttention(
            num_units=attention_Parameters.attention_Size,
            memory=encoder_Activation,
            normalize=True,
            name="bahdanau_Monotonic_Attention")

    with tf.variable_scope('decoder') as scope:
        linear_Projection_Activation, stop_Token, alignment_Histroy = Decoder(
            batch_Size=batch_Size,
            attention_Mechanism=attention_Mechanism,
            is_Training=placeholder_Dict["Is_Training"],
            target_Pattern=placeholder_Dict["Mel_Spectrogram"],
            scope="decoder_Module")

        post_Net_Activation = PostNet(
            input_Pattern=linear_Projection_Activation,
            conv_Filter_Count_and_Kernal_Size_List=[
                (decoder_Parameters.post_Net_Conv_Filter_Count,
                 decoder_Parameters.post_Net_Conv_Kernal_Size)
            ] * decoder_Parameters.post_Net_Conv_Layer_Count,
            is_Training=placeholder_Dict["Is_Training"],
            scope="post_Net")

        mel_Spectrogram_Activation = linear_Projection_Activation + post_Net_Activation

        # Tacotron 2 uses WaveNet here, but this code uses Tacotron 1's method (CBHG) instead.
        post_CBHG_Activation = CBHG(
            input_Pattern=mel_Spectrogram_Activation,
            input_Length=None,
            scope="post_CBHG",
            is_Training=placeholder_Dict["Is_Training"],
            conv_Bank_Filter_Count=256,
            conv_Bank_Max_Kernal_Size=8,
            max_Pooling_Size=2,
            conv_Projection_Filter_Count_and_Kernal_Size_List=[(256, 3), (80, 3)],
            highway_Layer_Count=4,
            gru_Cell_Size=128)

        spectrogram_Activation = tf.layers.dense(
            post_CBHG_Activation,
            pattern_Parameters.spectrogram_Dimension,
            name="spectrogram")

    with tf.variable_scope('training_Loss') as scope:
        # Mel-spectrogram loss
        mel_Loss1 = tf.reduce_mean(
            tf.pow(placeholder_Dict["Mel_Spectrogram"] - linear_Projection_Activation, 2))
        mel_Loss2 = tf.reduce_mean(
            tf.pow(placeholder_Dict["Mel_Spectrogram"] - mel_Spectrogram_Activation, 2))

        # Stop token loss
        tiled_Range = tf.cast(
            tf.tile(
                tf.expand_dims(tf.range(tf.shape(stop_Token)[1]), axis=0),
                multiples=[batch_Size, 1]), tf.float32)
        tiled_Spectrogram_Length = tf.cast(
            tf.tile(
                tf.expand_dims(placeholder_Dict["Mel_Spectrogram_Length"] - 1, axis=1),
                multiples=[1, tf.shape(stop_Token)[1]]), tf.float32)
        stop_Target = tf.clip_by_value(
            tf.sign(tiled_Range - tiled_Spectrogram_Length),
            clip_value_min=0, clip_value_max=1)
        stop_Token_Loss = tf.reduce_mean(tf.pow(stop_Target - stop_Token, 2))
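        # Worked example of stop_Target (illustrative values, not from the original):
        # for Mel_Spectrogram_Length L = 4 and T = 7 decoder frames,
        # tiled_Range - tiled_Spectrogram_Length = t - (L - 1) = [-3, -2, -1, 0, 1, 2, 3],
        # tf.sign gives [-1, -1, -1, 0, 1, 1, 1], and clip_by_value(., 0, 1) gives
        # [0, 0, 0, 0, 1, 1, 1]: every frame past the last real frame is labeled "stop".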
        # Spectrogram loss. This exists only for the Tacotron 1 method.
        l1 = tf.abs(placeholder_Dict["Spectrogram"] - spectrogram_Activation)

        if training_Loss_Parameters.priority_Frequencies is None:
            linear_Loss = tf.reduce_mean(l1)
        else:
            lower_Priority_Frequency_Cut, upper_Priority_Frequency_Cut = \
                training_Loss_Parameters.priority_Frequencies
            lower_Priority_Frequency = int(
                lower_Priority_Frequency_Cut / (sound_Parameters.sample_Rate * 0.5)
                * sound_Parameters.spectrogram_Dimension)
            upper_Priority_Frequency = int(
                upper_Priority_Frequency_Cut / (sound_Parameters.sample_Rate * 0.5)
                * sound_Parameters.spectrogram_Dimension)
            l1_Priority = l1[:, :, lower_Priority_Frequency:upper_Priority_Frequency]
            linear_Loss = 0.5 * tf.reduce_mean(l1) + 0.5 * tf.reduce_mean(l1_Priority)

        loss = mel_Loss1 + mel_Loss2 + stop_Token_Loss + linear_Loss

        # Optimize
        global_Step = tf.Variable(0, name='global_Step', trainable=False)

        if training_Loss_Parameters.decay_Type.lower() == "noam":
            step = tf.cast(global_Step + 1, dtype=tf.float32)
            warmup_Steps = 4000.0
            learning_Rate = training_Loss_Parameters.initial_Learning_Rate * \
                warmup_Steps ** 0.5 * tf.minimum(step * warmup_Steps ** -1.5, step ** -0.5)
        elif training_Loss_Parameters.decay_Type.lower() == "exponential":
            learning_Rate = training_Loss_Parameters.initial_Learning_Rate * \
                tf.train.exponential_decay(1., global_Step, 3000, 0.95)
        elif training_Loss_Parameters.decay_Type.lower() == "static":
            learning_Rate = tf.convert_to_tensor(
                training_Loss_Parameters.initial_Learning_Rate, dtype=tf.float32)
        else:
            raise Exception("Unsupported learning rate decay type")

        optimizer = tf.train.AdamOptimizer(learning_Rate)

        # 'optimizer.compute_gradients' returns a list of (gradient, variable) tuples;
        # zip(*...) unzips it into two separate tuples:
        # (gradient1, gradient2, ...) and (variable1, variable2, ...).
        gradients, variables = zip(*optimizer.compute_gradients(loss))
        clipped_Gradients, global_Norm = tf.clip_by_global_norm(gradients, 1.0)

        # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See:
        # https://github.com/tensorflow/tensorflow/issues/1122
        # https://www.tensorflow.org/api_docs/python/tf/layers/batch_normalization
        with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
            optimize = optimizer.apply_gradients(
                zip(clipped_Gradients, variables), global_step=global_Step)

    with tf.variable_scope('test_Inference') as scope:
        inverted_Signal = inv_spectrogram_tensorflow(
            spectrogram=spectrogram_Activation,
            num_freq=pattern_Parameters.spectrogram_Dimension,
            frame_shift_ms=sound_Parameters.frame_Shift,
            frame_length_ms=sound_Parameters.frame_Length,
            sample_rate=sound_Parameters.sample_Rate)
        alignment = tf.transpose(
            alignment_Histroy,
            [1, 0, 2])  # Shape: (batch_Size, max_Token, (max_Spectrogram / output_Size_per_Step))
        transposed_Spectrogram = tf.transpose(spectrogram_Activation, [0, 2, 1])
        transposed_Mel_Spectrogram = tf.transpose(mel_Spectrogram_Activation, [0, 2, 1])

    self.training_Tensor_List = [global_Step, learning_Rate, loss, optimize]
    self.test_Tensor_List = [
        global_Step, learning_Rate, inverted_Signal, alignment,
        transposed_Spectrogram, transposed_Mel_Spectrogram
    ]

    if not os.path.exists(self.extract_Dir + "/Summary"):
        os.makedirs(self.extract_Dir + "/Summary")
    graph_Writer = tf.summary.FileWriter(self.extract_Dir + "/Summary", self.tf_Session.graph)
    graph_Writer.close()

    self.tf_Session.run(tf.global_variables_initializer())
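The "noam" branch above is the Transformer learning-rate schedule; a plain-Python sketch (the warmup value is taken from the code, the function name is illustrative):

def noam_lr(step, initial_lr, warmup_steps=4000.0):
    # Linear warm-up until warmup_steps, then decay proportional to step ** -0.5;
    # the two terms intersect exactly at step == warmup_steps, where lr == initial_lr.
    return initial_lr * warmup_steps ** 0.5 * min(step * warmup_steps ** -1.5, step ** -0.5)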
def initialize(
        self,
        inputs,
        input_lengths,
        num_speakers,
        speaker_id,
        mel_targets=None,
        linear_targets=None,
        loss_coeff=None,
        rnn_decoder_test_mode=False,
        is_randomly_initialized=False,
):
    is_training = linear_targets is not None  # False if linear_targets is left at its default (None)
    self.is_randomly_initialized = is_randomly_initialized  # defaults to False

    with tf.variable_scope('inference') as scope:  # group everything under the name 'inference'
        hp = self._hparams
        batch_size = tf.shape(inputs)[0]  # the first dimension is the sample count, the second the
                                          # number of input features (here we take the sample count)

        # Embeddings
        char_embed_table = tf.get_variable(
            'embedding', [len(symbols), hp.embedding_size],
            dtype=tf.float32,  # dtype: type of the returned tensor; this creates a shared
                               # variable named 'embedding' of shape [symbol count, embedding size]
            initializer=tf.truncated_normal_initializer(stddev=0.5))  # initializer: how the weights are initialized

        # [N, T_in, embedding_size]
        char_embedded_inputs = \
            tf.nn.embedding_lookup(char_embed_table, inputs)  # returns the rows of char_embed_table indexed by inputs

        self.num_speakers = num_speakers
        if self.num_speakers > 1:  # multi-speaker case
            if hp.speaker_embedding_size != 1:  # when the hparams speaker_embedding_size is not 1
                speaker_embed_table = tf.get_variable(  # create a shared variable
                    'speaker_embedding',  # named 'speaker_embedding'
                    [self.num_speakers, hp.speaker_embedding_size],
                    dtype=tf.float32,  # of shape [num_speakers, speaker_embedding_size]
                    initializer=tf.truncated_normal_initializer(stddev=0.5))  # initial weights
                # [N, T_in, speaker_embedding_size]
                speaker_embed = tf.nn.embedding_lookup(
                    speaker_embed_table, speaker_id
                )  # returns the rows of speaker_embed_table indexed by speaker_id (a Tensor)

            ############################################################## more explanation needed
            if hp.model_type == 'deepvoice':  # deepvoice case
                if hp.speaker_embedding_size == 1:  # when the hparams speaker_embedding_size is 1
                    before_highway = get_embed(  # def get_embed(inputs, num_inputs, embed_size, name):
                        speaker_id, self.num_speakers,  # returns embed_table rows indexed by speaker_id
                        hp.enc_prenet_sizes[-1], "before_highway")
                    encoder_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_rnn_size * 2, "encoder_rnn_init_state")
                    attention_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.attention_state_size, "attention_rnn_init_state")
                    decoder_rnn_init_states = [
                        get_embed(
                            speaker_id, self.num_speakers, hp.dec_rnn_size,
                            "decoder_rnn_init_states{}".format(idx + 1))
                        for idx in range(hp.dec_layer_num)
                    ]
                ##############################################################
                else:  # when the hparams speaker_embedding_size is not 1
                    deep_dense = lambda x, dim: \
                        tf.layers.dense(x, dim, activation=tf.nn.softsign)  # input: x, units: dim, softsign activation
                    # lambda example: (lambda x, y: x + y)(10, 20)  =>  30
                    # tf.layers.dense(inputs, units, activation):
                    #   inputs is the preceding layer, units is the size of this layer, and
                    #   activation is an activation function such as sigmoid or ReLU;
                    #   dense implements a fully connected (hidden) layer.
                    # https://bcho.tistory.com/1196

                    before_highway = deep_dense(
                        speaker_embed, hp.enc_prenet_sizes[-1]
                    )  # input layer: speaker_embed, units: hp.enc_prenet_sizes[-1] (default 128)
                    encoder_rnn_init_state = deep_dense(
                        speaker_embed, hp.enc_rnn_size * 2
                    )  # input layer: speaker_embed, units: hp.enc_rnn_size * 2 (default 128 * 2)
                    attention_rnn_init_state = deep_dense(
                        speaker_embed, hp.attention_state_size
                    )  # input layer: speaker_embed, units: hp.attention_state_size (default 256)
                    decoder_rnn_init_states = [
                        deep_dense(speaker_embed, hp.dec_rnn_size)
                        for _ in range(hp.dec_layer_num)
                    ]  # list of hp.dec_layer_num layers (default 2)

                speaker_embed = None  # deepvoice does not use speaker_embed directly
            elif hp.model_type == 'simple':  # model_type is 'simple' rather than 'deepvoice'
                before_highway = None
                encoder_rnn_init_state = None
                attention_rnn_init_state = None
                decoder_rnn_init_states = None  # no extra layers at all
            else:
                raise Exception(
                    " [!] Unknown multi-speaker model type: {}".format(hp.model_type)
                )  # report that this is not a known multi-speaker model type
        else:  # single speaker
            speaker_embed = None
            before_highway = None
            encoder_rnn_init_state = None
            attention_rnn_init_state = None
            decoder_rnn_init_states = None  # no extra layers at all

        ##############
        # Encoder (special characters and Korean jamo text, as numbers)
        ##############

        # [N, T_in, enc_prenet_sizes[-1]]
        prenet_outputs = prenet(
            char_embedded_inputs, is_training,
            hp.enc_prenet_sizes, hp.dropout_prob,
            scope='prenet')

        encoder_outputs = cbhg(
            prenet_outputs, input_lengths, is_training,
            hp.enc_bank_size, hp.enc_bank_channel_size,
            hp.enc_maxpool_width, hp.enc_highway_depth, hp.enc_rnn_size,
            hp.enc_proj_sizes, hp.enc_proj_width,
            scope="encoder_cbhg",
            before_highway=before_highway,
            encoder_rnn_init_state=encoder_rnn_init_state)

        ##############
        # Attention (important!)
        ##############

        # For manual control of attention
        self.is_manual_attention = tf.placeholder(
            tf.bool, shape=(), name='is_manual_attention')
        self.manual_alignments = tf.placeholder(
            tf.float32, shape=[None, None, None], name="manual_alignments")

        dec_prenet_outputs = DecoderPrenetWrapper(
            GRUCell(hp.attention_state_size),
            speaker_embed, is_training,
            hp.dec_prenet_sizes, hp.dropout_prob)

        if hp.attention_type == 'bah_mon':
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size, encoder_outputs)
        elif hp.attention_type == 'bah_norm':
            attention_mechanism = BahdanauAttention(
                hp.attention_size, encoder_outputs, normalize=True)
        elif hp.attention_type == 'luong_scaled':
            attention_mechanism = LuongAttention(
                hp.attention_size, encoder_outputs, scale=True)
        elif hp.attention_type == 'luong':
            attention_mechanism = LuongAttention(
                hp.attention_size, encoder_outputs)
        elif hp.attention_type == 'bah':
            attention_mechanism = BahdanauAttention(
                hp.attention_size, encoder_outputs)
        elif hp.attention_type.startswith('ntm2'):
            shift_width = int(hp.attention_type.split('-')[-1])
            attention_mechanism = NTMAttention2(
                hp.attention_size, encoder_outputs, shift_width=shift_width)
        else:
            raise Exception(" [!] Unknown attention type: {}".format(hp.attention_type))

        attention_cell = AttentionWrapper(
            dec_prenet_outputs,
            attention_mechanism,
            self.is_manual_attention,
            self.manual_alignments,
            initial_cell_state=attention_rnn_init_state,
            alignment_history=True,
            output_attention=False)

        # Concatenate attention context vector and RNN cell output into a 512D vector.
        # [N, T_in, attention_size + attention_state_size]
        concat_cell = ConcatOutputAndAttentionWrapper(
            attention_cell, embed_to_concat=speaker_embed)

        # Decoder (layers specified bottom to top):
        cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)]
        for _ in range(hp.dec_layer_num):
            cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size)))

        # [N, T_in, 256]
        decoder_cell = MultiRNNCell(cells, state_is_tuple=True)

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(
            decoder_cell, hp.num_mels * hp.reduction_factor)
        decoder_init_state = output_cell.zero_state(
            batch_size=batch_size, dtype=tf.float32)

        if hp.model_type == "deepvoice":
            # decoder_init_state[0] : AttentionWrapperState
            #   = cell_state + attention + time + alignments + alignment_history
            # decoder_init_state[0][0] = attention_rnn_init_state (already applied)
            decoder_init_state = list(decoder_init_state)

            for idx, cell in enumerate(decoder_rnn_init_states):
                shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                shape2 = cell.get_shape().as_list()
                if shape1 != shape2:
                    raise Exception(
                        " [!] Shape {} and {} should be equal".format(shape1, shape2))
                decoder_init_state[idx + 1] = cell

            decoder_init_state = tuple(decoder_init_state)

        if is_training:
            helper = TacoTrainingHelper(
                inputs, mel_targets, hp.num_mels, hp.reduction_factor,
                rnn_decoder_test_mode)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.reduction_factor)

        (decoder_outputs, _), final_decoder_state, _ = \
            tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(output_cell, helper, decoder_init_state),
                maximum_iterations=hp.max_iters)

        # [N, T_out, M]
        mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])

        # Add post-processing CBHG:
        # [N, T_out, 256]
        # post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
        post_outputs = cbhg(
            mel_outputs, None, is_training,
            hp.post_bank_size, hp.post_bank_channel_size,
            hp.post_maxpool_width, hp.post_highway_depth, hp.post_rnn_size,
            hp.post_proj_sizes, hp.post_proj_width,
            scope='post_cbhg')

        if speaker_embed is not None and hp.model_type == 'simple':
            expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
            tiled_speaker_embedding = tf.tile(
                expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])

            # [N, T_out, 256 + alpha]
            post_outputs = \
                tf.concat([tiled_speaker_embedding, post_outputs], axis=-1)

        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(
            final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.speaker_id = speaker_id
        self.input_lengths = input_lengths
        self.loss_coeff = loss_coeff
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.final_decoder_state = final_decoder_state

        log('=' * 40)
        log(' model_type: %s' % hp.model_type)
        log('=' * 40)

        log('Initialized Tacotron model. Dimensions: ')
        log('  embedding:               %d' % char_embedded_inputs.shape[-1])
        if speaker_embed is not None:
            log('  speaker embedding:       %d' % speaker_embed.shape[-1])
        else:
            log('  speaker embedding:       None')
        log('  prenet out:              %d' % prenet_outputs.shape[-1])
        log('  encoder out:             %d' % encoder_outputs.shape[-1])
        log('  attention out:           %d' % attention_cell.output_size)
        log('  concat attn & out:       %d' % concat_cell.output_size)
        log('  decoder cell out:        %d' % decoder_cell.output_size)
        log('  decoder out (%d frames): %d' % (hp.reduction_factor, decoder_outputs.shape[-1]))
        log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
        log('  postnet out:             %d' % post_outputs.shape[-1])
        log('  linear out:              %d' % linear_outputs.shape[-1])
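softsign, the activation deep_dense applies to the speaker-conditioned initial states in the deepvoice branch above, is a tanh-like squashing x / (1 + |x|); it is bounded in (-1, 1), which suits RNN initial states. A tiny self-contained numpy check:

import numpy as np

def softsign(x):
    return x / (1.0 + np.abs(x))

x = np.array([-10.0, -1.0, 0.0, 1.0, 10.0])
print(softsign(x))  # approx. [-0.909, -0.5, 0.0, 0.5, 0.909] -- always strictly inside (-1, 1)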