def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
        lengths of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
        number of steps in the output time series, M is num_mels, and values are entries in
        the mel spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out
        is number of steps in the output time series, F is num_freq, and values are entries
        in the linear spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
        is_training = linear_targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), 256], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, 256]

        # Global style tokens (GST)
        gst_tokens = tf.get_variable(
            'style_tokens', [hp.num_gst, 256], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))

        # Encoder
        prenet_outputs = prenet(embedded_inputs, is_training)                       # [N, T_in, 128]
        encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training)  # [N, T_in, 256]

        if is_training:
            # Reference encoder
            reference_embedding = reference_encoder(
                mel_targets,
                filters=[32, 32, 64, 64, 128, 128],
                kernel_size=(3, 3),
                strides=(2, 2),
                is_training=is_training)

            # Style token layer
            style_embedding = multi_head_attention(
                num_heads=hp.num_heads,
                queries=tf.expand_dims(reference_embedding, axis=1),  # [N, 1, 128]
                memory=tf.tile(tf.expand_dims(gst_tokens, axis=0),
                               [batch_size, 1, 1]),                   # [N, hp.num_gst, 256]
                num_units=128)
        else:
            # TODO: Add support for reference mode and more effective style control
            # during inference. For now, randomly select a style embedding from
            # gst_tokens for simplicity.
            random_index = tf.random_uniform([batch_size], maxval=hp.num_gst, dtype=tf.int32)
            style_embedding = tf.expand_dims(
                tf.nn.embedding_lookup(gst_tokens, random_index),
                axis=1)  # [N, 1, 256] so the addition below broadcasts over time

        # Add the style embedding to every text encoder state, applying tanh to
        # compress both the encoder states and the style embedding to the same scale.
        encoder_outputs += tf.nn.tanh(style_embedding)

        # Attention
        attention_cell = AttentionWrapper(
            DecoderPrenetWrapper(GRUCell(256), is_training),
            BahdanauAttention(256, encoder_outputs, memory_sequence_length=input_lengths),
            alignment_history=True,
            output_attention=False)  # [N, T_in, 256]

        # Concatenate attention context vector and RNN cell output into a 512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)  # [N, T_in, 512]

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell([
            OutputProjectionWrapper(concat_cell, 256),
            ResidualWrapper(ZoneoutWrapper(LSTMBlockCell(256), (0.1, 0.1), is_training)),
            ResidualWrapper(ZoneoutWrapper(LSTMBlockCell(256), (0.1, 0.1), is_training)),
        ], state_is_tuple=True)  # [N, T_in, 256]

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        if is_training:
            helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

        if is_training:
            (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(output_cell, helper, decoder_init_state))
        else:
            (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(output_cell, helper, decoder_init_state),
                maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry
        mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

        # Add post-processing CBHG:
        post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)  # [N, T_out, 256]
        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)      # [N, T_out, F]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets

        tf.logging.info('Initialized Tacotron model. Dimensions:')
        tf.logging.info('  embedding:               %d' % embedded_inputs.shape[-1])
        tf.logging.info('  prenet out:              %d' % prenet_outputs.shape[-1])
        tf.logging.info('  encoder out:             %d' % encoder_outputs.shape[-1])
        tf.logging.info('  attention out:           %d' % attention_cell.output_size)
        tf.logging.info('  concat attn & out:       %d' % concat_cell.output_size)
        tf.logging.info('  decoder cell out:        %d' % decoder_cell.output_size)
        tf.logging.info('  decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
        tf.logging.info('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
        tf.logging.info('  postnet out:             %d' % post_outputs.shape[-1])
        tf.logging.info('  linear out:              %d' % linear_outputs.shape[-1])
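
# A minimal NumPy sketch (separate from the model above) of what the style-token
# lookup computes: single-head dot-product attention of a reference embedding over
# the GST table, broadcast-added onto every encoder state. Names and sizes here
# are illustrative assumptions, not the project's actual helpers.
import numpy as np

def gst_style_embedding(reference, tokens):
    # reference: [N, D], tokens: [num_gst, D]
    scores = reference @ tokens.T / np.sqrt(tokens.shape[-1])        # [N, num_gst]
    weights = np.exp(scores) / np.exp(scores).sum(-1, keepdims=True)  # softmax over tokens
    return weights @ tokens                                           # [N, D]

N, T_in, D, num_gst = 2, 5, 256, 10
encoder_outputs = np.random.randn(N, T_in, D)
style = gst_style_embedding(np.random.randn(N, D), np.random.randn(num_gst, D))
# Broadcast-add the [N, D] style vector onto every [N, T_in, D] encoder state,
# mirroring `encoder_outputs += tf.nn.tanh(style_embedding)` above.
conditioned = encoder_outputs + np.tanh(style)[:, None, :]
print(conditioned.shape)  # (2, 5, 256)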
def build_decoder_cell(self):
    """Builds the decoder cell."""
    encoder_outputs = self.encoder_outputs
    encoder_last_state = self.encoder_last_state
    encoder_inputs_length = self.encoder_inputs_length

    if self.time_major:
        encoder_outputs = tf.transpose(encoder_outputs, (1, 0, 2))

    # To use BeamSearchDecoder, encoder_outputs, encoder_last_state and
    # encoder_inputs_length need to be tiled so that:
    # [batch_size, .., ..] -> [batch_size x beam_width, .., ..]
    if self.use_beamsearch_decode:
        encoder_outputs = seq2seq.tile_batch(encoder_outputs, multiplier=self.beam_width)
        encoder_last_state = nest.map_structure(
            lambda s: seq2seq.tile_batch(s, self.beam_width), self.encoder_last_state)
        encoder_inputs_length = seq2seq.tile_batch(
            self.encoder_inputs_length, multiplier=self.beam_width)

    # Compute the number of hidden units for the decoder. If the encoder is
    # bidirectional, the decoder's hidden size must be doubled to match.
    num_units = self.hidden_units
    if self.bidirectional:
        num_units *= 2

    # Building attention mechanism: default is Bahdanau
    # 'Bahdanau' style attention: https://arxiv.org/abs/1409.0473
    self.attention_mechanism = BahdanauAttention(
        num_units=num_units,
        memory=encoder_outputs,
        memory_sequence_length=encoder_inputs_length)
    # 'Luong' style attention: https://arxiv.org/abs/1508.04025
    if self.attention_type.lower() == 'luong':
        self.attention_mechanism = LuongAttention(
            num_units=num_units,
            memory=encoder_outputs,
            memory_sequence_length=encoder_inputs_length)

    # Building decoder_cell
    self.decoder_cell_list = [
        self.build_single_cell(num_units, use_residual=True) for i in range(self.depth)
    ]

    def attn_decoder_input_fn(inputs, attention):
        """If attn_input_feeding is set, project [inputs; attention] back to the
        cell input size before the attention computation."""
        if not self.attn_input_feeding:
            return inputs

        # Essential when use_residual=True
        hidden_units = self.hidden_units
        if self.bidirectional:
            hidden_units *= 2
        input_layer = layers.Dense(hidden_units, dtype=tf.float32,
                                   use_bias=False, name='attn_input_feeding')
        return input_layer(array_ops.concat([inputs, attention], -1))

    # AttentionWrapper wraps an RNNCell with the attention_mechanism.
    # Note: we apply the attention mechanism only on the top decoder layer.
    self.decoder_cell_list[-1] = AttentionWrapper(
        cell=self.decoder_cell_list[-1],
        attention_mechanism=self.attention_mechanism,
        attention_layer_size=num_units,
        cell_input_fn=attn_decoder_input_fn,
        initial_cell_state=encoder_last_state[-1],
        alignment_history=self.alignment_history,
        name='Attention_Wrapper')

    # To be compatible with AttentionWrapper, the encoder's last state for the
    # top layer must be converted into the AttentionWrapperState form. We can
    # easily do this by calling AttentionWrapper.zero_state. If beam search
    # decoding is used, the batch_size argument to .zero_state must be
    # beam_width times the original batch_size.
    batch_size = self.batch_size if not self.use_beamsearch_decode \
        else self.batch_size * self.beam_width
    initial_state = [state for state in encoder_last_state]
    initial_state[-1] = self.decoder_cell_list[-1].zero_state(
        batch_size=batch_size, dtype=tf.float32)
    decoder_initial_state = tuple(initial_state)

    return MultiRNNCell(self.decoder_cell_list), decoder_initial_state
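
# A minimal NumPy sketch of what seq2seq.tile_batch does for beam search: each
# batch entry is repeated beam_width times along axis 0, so a decoder that works
# on [batch_size * beam_width, ...] sees one copy of the encoder memory per beam.
# Purely illustrative; the real helper also handles nested state structures.
import numpy as np

def tile_batch(t, multiplier):
    # [B, ...] -> [B * multiplier, ...], repeating each row in place
    return np.repeat(t, multiplier, axis=0)

memory = np.arange(6).reshape(3, 2)      # batch_size=3, depth=2
print(tile_batch(memory, multiplier=2))
# [[0 1]
#  [0 1]
#  [2 3]
#  [2 3]
#  [4 5]
#  [4 5]]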
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None,
               reference_mel=None, reference_weight=None):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
        lengths of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
        number of steps in the output time series, M is num_mels, and values are entries in
        the mel spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out
        is number of steps in the output time series, F is num_freq, and values are entries
        in the linear spectrogram. Only needed for training.
      reference_mel: float32 mel spectrogram fed to the reference encoder to extract a
        style embedding. Optional at inference.
      reference_weight: float32 weights over the hp.num_gst style tokens, used in place of
        a reference mel. Optional at inference.
    '''
    with tf.variable_scope('inference') as scope:
        is_training = linear_targets is not None
        is_teacher_force_generating = mel_targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'text_embedding', [len(symbols), hp.embed_depth], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, 256]

        if hp.use_gst:
            # Global style tokens (GST)
            gst_tokens = tf.get_variable(
                'style_tokens', [hp.num_gst, hp.style_embed_depth // hp.num_heads],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            self.gst_tokens = gst_tokens

        # Encoder
        prenet_outputs = prenet(embedded_inputs, is_training)                       # [N, T_in, 128]
        encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training)  # [N, T_in, 256]

        if is_training:
            reference_mel = mel_targets

        if reference_mel is not None:
            # Reference encoder
            refnet_outputs = reference_encoder(
                reference_mel,
                filters=hp.reference_filters,
                kernel_size=(3, 3),
                strides=(2, 2),
                encoder_cell=GRUCell(hp.reference_depth),
                is_training=is_training)  # [N, 128]
            self.refnet_outputs = refnet_outputs

            if hp.use_gst:
                # Style attention
                style_attention = MultiheadAttention(
                    tf.expand_dims(refnet_outputs, axis=1),           # [N, 1, 128]
                    tf.tanh(tf.tile(tf.expand_dims(gst_tokens, axis=0),
                                    [batch_size, 1, 1])),             # [N, hp.num_gst, 256/hp.num_heads]
                    num_heads=hp.num_heads,
                    num_units=hp.style_att_dim,
                    attention_type=hp.style_att_type)
                style_weights, style_embeddings = style_attention.multi_head_attention()  # [N, 1, 256]
            else:
                style_embeddings = tf.expand_dims(refnet_outputs, axis=1)  # [N, 1, 128]
        elif reference_weight is not None:
            print("Using specific weights for GST.")
            specific_weights = tf.expand_dims(reference_weight, axis=0)
            specific_weights = tf.tile(specific_weights, [hp.num_heads, 1],
                                       name="specific_weights")
            # specific_weights = tf.nn.softmax(specific_weights, axis=-1, name="specific_weights")
            style_embeddings = tf.matmul(specific_weights, tf.nn.tanh(gst_tokens))
            style_embeddings = tf.expand_dims(style_embeddings, axis=0)
            style_embeddings = tf.tile(style_embeddings, [batch_size, 1, 1])
            style_embeddings = tf.reshape(style_embeddings,
                                          shape=[batch_size, 1, hp.style_embed_depth])
            style_weights = tf.expand_dims(specific_weights, axis=0)
        else:
            print("Using random weights for GST.")
            random_weights = tf.random_uniform([hp.num_heads, hp.num_gst],
                                               maxval=1.0, dtype=tf.float32)
            random_weights = tf.nn.softmax(random_weights, axis=-1, name="random_weights")
            style_embeddings = tf.matmul(random_weights, tf.nn.tanh(gst_tokens))
            style_embeddings = tf.expand_dims(style_embeddings, axis=0)
            style_embeddings = tf.tile(style_embeddings, [batch_size, 1, 1])
            style_embeddings = tf.reshape(style_embeddings,
                                          shape=[batch_size, 1, hp.style_embed_depth])
            style_weights = tf.expand_dims(random_weights, axis=0)

        # Add the style embedding to every text encoder state
        style_embeddings = tf.tile(style_embeddings,
                                   [1, shape_list(encoder_outputs)[1], 1])  # [N, T_in, 128]
        # encoder_outputs = tf.concat([encoder_outputs, style_embeddings], axis=-1)
        encoder_outputs = encoder_outputs + style_embeddings

        # Attention
        attention_cell = AttentionWrapper(
            GRUCell(hp.attention_depth),
            BahdanauAttention(hp.attention_depth, encoder_outputs,
                              memory_sequence_length=input_lengths),
            alignment_history=True,
            output_attention=False)  # [N, T_in, 256]

        # Concatenate attention context vector and RNN cell output.
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell([
            OutputProjectionWrapper(concat_cell, hp.rnn_depth),
            ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1)),
            ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1)),
        ], state_is_tuple=True)  # [N, T_in, 256]

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        if is_training or is_teacher_force_generating:
            helper = TacoTrainingHelper(inputs, mel_targets, hp)
        else:
            helper = TacoTestHelper(batch_size, hp)

        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry
        mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

        # Add post-processing CBHG:
        post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)  # [N, T_out, 256]
        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)      # [N, T_out, F]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_outputs = mel_outputs
        self.encoder_outputs = encoder_outputs
        self.style_weights = style_weights
        self.style_embeddings = style_embeddings
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.reference_mel = reference_mel
        self.reference_weight = reference_weight

        log('Initialized Tacotron model. Dimensions:')
        log('  text embedding:          %d' % embedded_inputs.shape[-1])
        log('  style embedding:         %d' % style_embeddings.shape[-1])
        log('  prenet out:              %d' % prenet_outputs.shape[-1])
        log('  encoder out:             %d' % encoder_outputs.shape[-1])
        log('  attention out:           %d' % attention_cell.output_size)
        log('  concat attn & out:       %d' % concat_cell.output_size)
        log('  decoder cell out:        %d' % decoder_cell.output_size)
        log('  decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
        log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
        log('  postnet out:             %d' % post_outputs.shape[-1])
        log('  linear out:              %d' % linear_outputs.shape[-1])
encoder_final_state_c = tf.concat(
    (encoder_fw_final_state.c, encoder_bw_final_state.c), 1)
encoder_final_state_h = tf.concat(
    (encoder_fw_final_state.h, encoder_bw_final_state.h), 1)
encoder_final_state = LSTMStateTuple(c=encoder_final_state_c,
                                     h=encoder_final_state_h)

# Shape: (batch_size, time_step, hidden_units)
encoder_outputs = tf.transpose(
    tf.concat((encoder_fw_outputs, encoder_bw_outputs), 2), [1, 0, 2])

decoder_cell = LSTMCell(hidden_units * 2)
attention_mechanism = BahdanauAttention(attention_units, encoder_outputs)
attention_cell = AttentionWrapper(decoder_cell, attention_mechanism)
copynet_cell = CopyNetWrapper(attention_cell, encoder_outputs, input_ids,
                              vocab_size, gen_vocab_size)
decoder_initial_state = copynet_cell.zero_state(batch_size, tf.float32).clone(
    cell_state=attention_cell.zero_state(batch_size=batch_size, dtype=tf.float32))

helper = tf.contrib.seq2seq.TrainingHelper(targets_embedded, targets_lengths,
                                           time_major=True)
# helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
#     embeddings, tf.ones([batch_size], dtype=tf.int32), 0)
decoder = tf.contrib.seq2seq.BasicDecoder(copynet_cell, helper, decoder_initial_state)
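
# A minimal NumPy sketch of why the bidirectional final states are concatenated
# on axis 1: the forward and backward LSTMs each produce [batch, hidden_units]
# c/h states, and the (hidden_units * 2) decoder cell expects their
# concatenation. Illustrative sizes only.
import numpy as np

batch, hidden_units = 2, 3
fw_c = np.random.randn(batch, hidden_units)
bw_c = np.random.randn(batch, hidden_units)
merged_c = np.concatenate((fw_c, bw_c), axis=1)  # [batch, hidden_units * 2]
print(merged_c.shape)                            # (2, 6)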
def initialize(self, inputs, inputs_jp, input_lengths, input_jp_lengths,
               mel_targets=None, linear_targets=None):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      inputs_jp: the Japanese counterpart of inputs.
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
        lengths of each sequence in inputs.
      input_jp_lengths: int32 Tensor with shape [N]; lengths of each sequence in inputs_jp.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
        number of steps in the output time series, M is num_mels, and values are entries in
        the mel spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out
        is number of steps in the output time series, F is num_freq, and values are entries
        in the linear spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
        is_training = linear_targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings: [N, T_in, embed_depth=256]

        # Encoder
        # prenet_outputs = prenet(inputs, is_training, hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
        encoder_outputs = encoder_cbhg(
            inputs, input_lengths, is_training,  # [N, T_in, encoder_depth=256]
            hp.encoder_depth)
        encoder_outputs_jp = encoder_cbhg_jp(
            inputs_jp, input_jp_lengths, is_training,  # [N, T_in, encoder_depth=256]
            hp.encoder_depth)

        # Attention
        attention_cell = AttentionWrapper(
            DecoderPrenetWrapper(GRUCell(hp.attention_depth), is_training, hp.prenet_depths),
            BahdanauAttention(hp.attention_depth, encoder_outputs),
            alignment_history=True,
            output_attention=False)  # [N, T_in, attention_depth=256]

        # Concatenate attention context vector and RNN cell output into a
        # 2*attention_depth=512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)  # [N, T_in, 2*attention_depth=512]

        # Attention, Japanese branch. The block above is duplicated and adapted
        # for the Japanese feature input; the new concat cell is concat_cell_jp,
        # and one added line below joins the two outputs.
        attention_cell_jp = AttentionWrapper(
            DecoderPrenetWrapper(GRUCell(hp.attention_depth), is_training, hp.prenet_depths),
            BahdanauAttention(hp.attention_depth, encoder_outputs_jp),
            alignment_history=True,
            output_attention=False)  # [N, T_in, attention_depth=256]
        concat_cell_jp = ConcatOutputAndAttentionWrapper(attention_cell_jp)  # [N, T_in, 2*attention_depth=512]

        # Join the Chinese and Japanese branches. NOTE: concat_cell and
        # concat_cell_jp are RNN cells, not tensors, so they cannot be joined
        # with tf.concat; a wrapper cell that runs both and concatenates their
        # outputs is assumed here (hypothetical ConcatCellsWrapper, sketched
        # after this function).
        encoder_out = ConcatCellsWrapper([concat_cell, concat_cell_jp])

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell([
            OutputProjectionWrapper(encoder_out, hp.decoder_depth),
            ResidualWrapper(GRUCell(hp.decoder_depth)),
            ResidualWrapper(GRUCell(hp.decoder_depth)),
        ], state_is_tuple=True)  # [N, T_in, decoder_depth=256]

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        if is_training:
            helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry
        mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

        # Add post-processing CBHG:
        post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training,  # [N, T_out, postnet_depth=256]
                                 hp.postnet_depth)
        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)      # [N, T_out, F]

        # Grab alignments from the final decoder state. With the wrapper above,
        # the bottom layer's state is a pair of AttentionWrapperStates, so take
        # the first (Chinese) branch's alignment history:
        alignments = tf.transpose(
            final_decoder_state[0][0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.inputs_jp = inputs_jp
        self.input_lengths = input_lengths
        self.input_jp_lengths = input_jp_lengths
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets

        log('Initialized Tacotron model. Dimensions:')
        # log('  prenet out:            %d' % prenet_outputs.shape[-1])
        log('  encoder out:             %d' % encoder_outputs.shape[-1])
        log('  encoder out jp:          %d' % encoder_outputs_jp.shape[-1])
        log('  attention out:           %d' % attention_cell.output_size)
        log('  attention out jp:        %d' % attention_cell_jp.output_size)
        log('  concat attn & out:       %d' % concat_cell.output_size)
        log('  concat attn & out jp:    %d' % concat_cell_jp.output_size)
        log('  decoder cell out:        %d' % decoder_cell.output_size)
        log('  decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
        log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
        log('  postnet out:             %d' % post_outputs.shape[-1])
        log('  linear out:              %d' % linear_outputs.shape[-1])
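
# A hedged sketch of the hypothetical ConcatCellsWrapper assumed above: an RNN
# cell that feeds the same input to both wrapped attention cells and
# concatenates their outputs, so the two language branches can drive a single
# decoder stack. This is not part of the original project; it just follows the
# TF 1.x tf.nn.rnn_cell.RNNCell contract. With this wrapper the decoder state
# for the bottom layer becomes a pair of AttentionWrapperStates, which is why
# the alignments above are read from final_decoder_state[0][0].
import tensorflow as tf

class ConcatCellsWrapper(tf.nn.rnn_cell.RNNCell):
    def __init__(self, cells):
        super(ConcatCellsWrapper, self).__init__()
        self._cells = cells

    @property
    def state_size(self):
        return tuple(c.state_size for c in self._cells)

    @property
    def output_size(self):
        # e.g. 512 + 512 = 1024 for the two concat cells above
        return sum(c.output_size for c in self._cells)

    def zero_state(self, batch_size, dtype):
        return tuple(c.zero_state(batch_size, dtype) for c in self._cells)

    def call(self, inputs, state):
        outputs, new_states = [], []
        for cell, cell_state in zip(self._cells, state):
            out, ns = cell(inputs, cell_state)
            outputs.append(out)
            new_states.append(ns)
        return tf.concat(outputs, axis=-1), tuple(new_states)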
def initialize(
        self,
        inputs,
        input_lengths,
        num_speakers,
        speaker_id,
        mel_targets=None,
        linear_targets=None,
        loss_coeff=None,
        rnn_decoder_test_mode=False,
        is_randomly_initialized=False,
):
    is_training = linear_targets is not None  # False if linear_targets is left at its default (None)
    self.is_randomly_initialized = is_randomly_initialized  # defaults to False

    with tf.variable_scope('inference') as scope:  # group everything under the 'inference' scope
        hp = self._hparams
        batch_size = tf.shape(inputs)[0]  # first dimension is the batch size

        # Embeddings: a shared 'embedding' variable of shape
        # [number of symbols, embedding size], initialized from a truncated normal.
        char_embed_table = tf.get_variable(
            'embedding', [len(symbols), hp.embedding_size], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        # [N, T_in, embedding_size]: look up rows of char_embed_table by the indices in inputs
        char_embedded_inputs = tf.nn.embedding_lookup(char_embed_table, inputs)

        self.num_speakers = num_speakers
        if self.num_speakers > 1:  # multi-speaker case
            if hp.speaker_embedding_size != 1:
                # Shared [num_speakers, speaker_embedding_size] table
                speaker_embed_table = tf.get_variable(
                    'speaker_embedding',
                    [self.num_speakers, hp.speaker_embedding_size],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
                # [N, T_in, speaker_embedding_size]: look up by speaker_id
                speaker_embed = tf.nn.embedding_lookup(speaker_embed_table, speaker_id)

            # TODO: needs further explanation
            if hp.model_type == 'deepvoice':
                if hp.speaker_embedding_size == 1:
                    # get_embed(inputs, num_inputs, embed_size, name) returns an
                    # embedding row selected by speaker_id.
                    before_highway = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_prenet_sizes[-1], "before_highway")
                    encoder_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_rnn_size * 2, "encoder_rnn_init_state")
                    attention_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.attention_state_size, "attention_rnn_init_state")
                    decoder_rnn_init_states = [
                        get_embed(speaker_id, self.num_speakers, hp.dec_rnn_size,
                                  "decoder_rnn_init_states{}".format(idx + 1))
                        for idx in range(hp.dec_layer_num)]
                else:
                    # Project the speaker embedding through a dense layer with
                    # softsign activation to produce each initial state.
                    # tf.layers.dense(inputs, units, activation) implements a
                    # fully-connected layer (see https://bcho.tistory.com/1196);
                    # lambda example: (lambda x, y: x + y)(10, 20) -> 30.
                    deep_dense = lambda x, dim: tf.layers.dense(x, dim, activation=tf.nn.softsign)

                    before_highway = deep_dense(
                        speaker_embed, hp.enc_prenet_sizes[-1])        # default 128
                    encoder_rnn_init_state = deep_dense(
                        speaker_embed, hp.enc_rnn_size * 2)            # default 128 * 2
                    attention_rnn_init_state = deep_dense(
                        speaker_embed, hp.attention_state_size)        # default 256
                    decoder_rnn_init_states = [
                        deep_dense(speaker_embed, hp.dec_rnn_size)
                        for _ in range(hp.dec_layer_num)]              # one per decoder layer (default 2)

                speaker_embed = None  # deepvoice does not use speaker_embed directly
            elif hp.model_type == 'simple':
                # 'simple' model type: no speaker-conditioned layers at all
                before_highway = None
                encoder_rnn_init_state = None
                attention_rnn_init_state = None
                decoder_rnn_init_states = None
            else:
                raise Exception(
                    " [!] Unknown multi-speaker model type: {}".format(hp.model_type))
        else:  # single speaker
            speaker_embed = None
            before_highway = None
            encoder_rnn_init_state = None
            attention_rnn_init_state = None
            decoder_rnn_init_states = None

        ##############
        # Encoder (maps special symbols and Hangul jamo text, as integer IDs, to features)
        ##############

        # [N, T_in, enc_prenet_sizes[-1]]
        prenet_outputs = prenet(char_embedded_inputs, is_training,
                                hp.enc_prenet_sizes, hp.dropout_prob,
                                scope='prenet')

        encoder_outputs = cbhg(
            prenet_outputs, input_lengths, is_training,
            hp.enc_bank_size, hp.enc_bank_channel_size,
            hp.enc_maxpool_width, hp.enc_highway_depth, hp.enc_rnn_size,
            hp.enc_proj_sizes, hp.enc_proj_width,
            scope="encoder_cbhg",
            before_highway=before_highway,
            encoder_rnn_init_state=encoder_rnn_init_state)

        ##############
        # Attention (important!)
        ##############

        # For manual control of attention
        self.is_manual_attention = tf.placeholder(
            tf.bool, shape=(), name='is_manual_attention')
        self.manual_alignments = tf.placeholder(
            tf.float32, shape=[None, None, None], name="manual_alignments")

        dec_prenet_outputs = DecoderPrenetWrapper(
            GRUCell(hp.attention_state_size), speaker_embed, is_training,
            hp.dec_prenet_sizes, hp.dropout_prob)

        if hp.attention_type == 'bah_mon':
            attention_mechanism = BahdanauMonotonicAttention(hp.attention_size, encoder_outputs)
        elif hp.attention_type == 'bah_norm':
            attention_mechanism = BahdanauAttention(hp.attention_size, encoder_outputs, normalize=True)
        elif hp.attention_type == 'luong_scaled':
            attention_mechanism = LuongAttention(hp.attention_size, encoder_outputs, scale=True)
        elif hp.attention_type == 'luong':
            attention_mechanism = LuongAttention(hp.attention_size, encoder_outputs)
        elif hp.attention_type == 'bah':
            attention_mechanism = BahdanauAttention(hp.attention_size, encoder_outputs)
        elif hp.attention_type.startswith('ntm2'):
            shift_width = int(hp.attention_type.split('-')[-1])
            attention_mechanism = NTMAttention2(hp.attention_size, encoder_outputs,
                                                shift_width=shift_width)
        else:
            raise Exception(" [!] Unknown attention type: {}".format(hp.attention_type))

        attention_cell = AttentionWrapper(
            dec_prenet_outputs, attention_mechanism,
            self.is_manual_attention, self.manual_alignments,
            initial_cell_state=attention_rnn_init_state,
            alignment_history=True,
            output_attention=False)

        # Concatenate attention context vector and RNN cell output into a 512D vector.
        # [N, T_in, attention_size + attention_state_size]
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell,
                                                      embed_to_concat=speaker_embed)

        # Decoder (layers specified bottom to top):
        cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)]
        for _ in range(hp.dec_layer_num):
            cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size)))

        # [N, T_in, 256]
        decoder_cell = MultiRNNCell(cells, state_is_tuple=True)

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.reduction_factor)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        if hp.model_type == "deepvoice":
            # decoder_init_state[0] : AttentionWrapperState
            #   = cell_state + attention + time + alignments + alignment_history
            # decoder_init_state[0][0] = attention_rnn_init_state (already applied)
            decoder_init_state = list(decoder_init_state)
            for idx, cell in enumerate(decoder_rnn_init_states):
                shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                shape2 = cell.get_shape().as_list()
                if shape1 != shape2:
                    raise Exception(
                        " [!] Shape {} and {} should be equal".format(shape1, shape2))
                decoder_init_state[idx + 1] = cell
            decoder_init_state = tuple(decoder_init_state)

        if is_training:
            helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                        hp.reduction_factor, rnn_decoder_test_mode)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.reduction_factor)

        (decoder_outputs, _), final_decoder_state, _ = \
            tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(output_cell, helper, decoder_init_state),
                maximum_iterations=hp.max_iters)

        # [N, T_out, M]
        mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])

        # Add post-processing CBHG:
        # [N, T_out, 256]
        # post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
        post_outputs = cbhg(mel_outputs, None, is_training,
                            hp.post_bank_size, hp.post_bank_channel_size,
                            hp.post_maxpool_width, hp.post_highway_depth,
                            hp.post_rnn_size, hp.post_proj_sizes, hp.post_proj_width,
                            scope='post_cbhg')

        if speaker_embed is not None and hp.model_type == 'simple':
            expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
            tiled_speaker_embedding = tf.tile(expanded_speaker_emb,
                                              [1, tf.shape(post_outputs)[1], 1])
            # [N, T_out, 256 + alpha]
            post_outputs = tf.concat([tiled_speaker_embedding, post_outputs], axis=-1)

        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.speaker_id = speaker_id
        self.input_lengths = input_lengths
        self.loss_coeff = loss_coeff
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.final_decoder_state = final_decoder_state

        log('=' * 40)
        log(' model_type: %s' % hp.model_type)
        log('=' * 40)

        log('Initialized Tacotron model. Dimensions:')
        log('  embedding:               %d' % char_embedded_inputs.shape[-1])
        if speaker_embed is not None:
            log('  speaker embedding:       %d' % speaker_embed.shape[-1])
        else:
            log('  speaker embedding:       None')
        log('  prenet out:              %d' % prenet_outputs.shape[-1])
        log('  encoder out:             %d' % encoder_outputs.shape[-1])
        log('  attention out:           %d' % attention_cell.output_size)
        log('  concat attn & out:       %d' % concat_cell.output_size)
        log('  decoder cell out:        %d' % decoder_cell.output_size)
        log('  decoder out (%d frames): %d' % (hp.reduction_factor, decoder_outputs.shape[-1]))
        log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
        log('  postnet out:             %d' % post_outputs.shape[-1])
        log('  linear out:              %d' % linear_outputs.shape[-1])
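
# A minimal pure-Python sketch of the deepvoice-style state override above: the
# decoder's zero state is a tuple (AttentionWrapperState, layer1, layer2), and
# entries 1..n are swapped for speaker-conditioned vectors of the same shape.
# Stand-in objects and illustrative sizes only.
import numpy as np

batch, dec_rnn_size, dec_layer_num = 2, 256, 2
zero_state = tuple([object()] +  # stand-in for the AttentionWrapperState
                   [np.zeros((batch, dec_rnn_size)) for _ in range(dec_layer_num)])
speaker_states = [np.random.randn(batch, dec_rnn_size) for _ in range(dec_layer_num)]

state = list(zero_state)
for idx, s in enumerate(speaker_states):
    assert state[idx + 1].shape == s.shape  # mirrors the shape check in initialize()
    state[idx + 1] = s
decoder_init_state = tuple(state)
print(len(decoder_init_state), decoder_init_state[1].shape)  # 3 (2, 256)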
def _init(self):
    sequence = tf.placeholder(tf.int32, [None, None], name='sequence')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    authors = tf.placeholder(tf.int32, [None, None], name='authors')
    batch_size = tf.shape(sequence)[0]
    sequence_lengths = tf.cast(tf.count_nonzero(sequence, axis=1), tf.int32)

    embedding = tf.Variable(tf.random_normal((self._vocab_size, self._embed_size)))
    context = tf.Variable(tf.random_normal((self._author_size, self._ctx_size)))
    embedded_sequence = tf.nn.embedding_lookup(embedding, sequence)
    embedded_authors = tf.nn.embedding_lookup(context, authors)
    one_hot_targets = tf.one_hot(targets, self._vocab_size)

    gpu = lambda x: str(x % self._num_gpu)

    if self._attn:
        mech = BahdanauAttention(self._attn_depth, embedded_sequence, sequence_lengths)
        attn_cell = lambda x: DeviceWrapper(
            AttentionWrapper(x, mech, self._attn_size), "/gpu:" + gpu(1))
    else:
        attn_cell = lambda x: x

    if self._training:
        dropout = lambda x: DropoutWrapper(x, 1.0, 1.0 - self._dropout)
    else:
        dropout = lambda x: x

    if self._cell == 'lstm':
        base_cell = lambda x: dropout(BasicLSTMCell(x))
    elif self._cell == 'gru':
        base_cell = lambda x: dropout(GRUCell(x))

    context_cell = ContextWrapper(base_cell(self._cell_size), embedded_authors)
    # context_cell = base_cell(self._cell_size)
    bottom_cell = DeviceWrapper(attn_cell(context_cell), "/gpu:0")
    top_cells = [
        DeviceWrapper(base_cell(self._cell_size), "/gpu:" + gpu(i))
        for i in range(1, self._cell_num)
    ]
    cell = MultiRNNCell([bottom_cell] + top_cells)
    init_state = cell.zero_state(batch_size, tf.float32)

    if self._training:
        helper = TrainingHelper(embedded_sequence, sequence_lengths)
    else:
        helper = SampleEmbeddingHelper(embedding, sequence[:, 0], 1)

    dense = Dense(self._vocab_size, self._activation)
    decoder = BasicDecoder(cell, helper, init_state, dense)
    output, state, _ = dynamic_decode(decoder, swap_memory=True)
    logits = output.rnn_output

    loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=one_hot_targets)
    loss = tf.reduce_mean(loss)
    out = tf.nn.softmax(logits)

    return sequence, authors, targets, loss, out
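
# A small pure-Python sketch of the round-robin device assignment used above:
# `gpu = lambda x: str(x % self._num_gpu)` spreads the stacked RNN layers across
# the available GPUs, with the bottom (attention) cell pinned to /gpu:0.
# Illustrative values only.
num_gpu, cell_num = 2, 4
gpu = lambda x: str(x % num_gpu)
placements = ["/gpu:0"] + ["/gpu:" + gpu(i) for i in range(1, cell_num)]
print(placements)  # ['/gpu:0', '/gpu:1', '/gpu:0', '/gpu:1']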
def initialize(
        self,
        inputs,
        input_lengths,
        num_speakers,
        speaker_id,
        mel_targets=None,
        linear_targets=None,
        loss_coeff=None,
        rnn_decoder_test_mode=False,
        is_randomly_initialized=False,
):
    is_training2 = linear_targets is not None  # becomes True at test time as well -- is that intended???
    is_training = not rnn_decoder_test_mode

    self.is_randomly_initialized = is_randomly_initialized

    with tf.variable_scope('inference') as scope:
        hp = self._hparams
        batch_size = tf.shape(inputs)[0]

        # Embeddings (256)
        char_embed_table = tf.get_variable(
            'embedding', [len(symbols), hp.embedding_size], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))

        zero_pad = True
        if zero_pad:
            # Logic borrowed from a transformer implementation: the embedding for
            # <PAD> (index 0) is pinned to zeros and never trained, i.e. the first
            # row of the variable above (for <PAD>) is never used.
            char_embed_table = tf.concat(
                (tf.zeros(shape=[1, hp.embedding_size]), char_embed_table[1:, :]), 0)

        # [N, T_in, embedding_size]
        char_embedded_inputs = tf.nn.embedding_lookup(char_embed_table, inputs)

        self.num_speakers = num_speakers
        if self.num_speakers > 1:
            if hp.speaker_embedding_size != 1:  # speaker_embedding_size = f(16)
                speaker_embed_table = tf.get_variable(
                    'speaker_embedding',
                    [self.num_speakers, hp.speaker_embedding_size],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
                # [N, T_in, speaker_embedding_size]
                speaker_embed = tf.nn.embedding_lookup(speaker_embed_table, speaker_id)

            if hp.model_type == 'deepvoice':
                if hp.speaker_embedding_size == 1:
                    before_highway = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_prenet_sizes[-1],
                        "before_highway")  # 'enc_prenet_sizes': [f(256), f(128)]
                    encoder_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_rnn_size * 2, "encoder_rnn_init_state")
                    attention_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.attention_state_size, "attention_rnn_init_state")
                    decoder_rnn_init_states = [
                        get_embed(speaker_id, self.num_speakers, hp.dec_rnn_size,
                                  "decoder_rnn_init_states{}".format(idx + 1))
                        for idx in range(hp.dec_layer_num)]
                else:
                    # softsign: x / (abs(x) + 1)
                    deep_dense = lambda x, dim: tf.layers.dense(x, dim, activation=tf.nn.softsign)
                    before_highway = deep_dense(speaker_embed, hp.enc_prenet_sizes[-1])
                    encoder_rnn_init_state = deep_dense(speaker_embed, hp.enc_rnn_size * 2)
                    attention_rnn_init_state = deep_dense(speaker_embed, hp.attention_state_size)
                    decoder_rnn_init_states = [
                        deep_dense(speaker_embed, hp.dec_rnn_size)
                        for _ in range(hp.dec_layer_num)]

                speaker_embed = None  # deepvoice does not use speaker_embed directly
            elif hp.model_type == 'simple':
                # The simple model instead feeds speaker_embed into
                # DecoderPrenetWrapper and ConcatOutputAndAttentionWrapper, where
                # it is concatenated in.
                before_highway = None
                encoder_rnn_init_state = None
                attention_rnn_init_state = None
                decoder_rnn_init_states = None
            else:
                raise Exception(
                    " [!] Unknown multi-speaker model type: {}".format(hp.model_type))
        else:  # self.num_speakers == 1
            speaker_embed = None
            before_highway = None
            encoder_rnn_init_state = None  # init state of the bidirectional GRU
            attention_rnn_init_state = None
            decoder_rnn_init_states = None

        ##############
        # Encoder
        ##############

        # [N, T_in, enc_prenet_sizes[-1]]
        prenet_outputs = prenet(
            char_embedded_inputs, is_training,
            hp.enc_prenet_sizes, hp.dropout_prob,
            scope='prenet')  # 'enc_prenet_sizes': [f(256), f(128)], dropout_prob = 0.5
        # ==> (N, T_in, 128)

        # enc_rnn_size = 128
        encoder_outputs = cbhg(
            prenet_outputs, input_lengths, is_training,
            hp.enc_bank_size, hp.enc_bank_channel_size,
            hp.enc_maxpool_width, hp.enc_highway_depth, hp.enc_rnn_size,
            hp.enc_proj_sizes, hp.enc_proj_width,
            scope="encoder_cbhg",
            before_highway=before_highway,
            encoder_rnn_init_state=encoder_rnn_init_state)

        ##############
        # Attention
        ##############

        # For manual control of attention
        self.is_manual_attention = tf.placeholder(
            tf.bool, shape=(), name='is_manual_attention')
        self.manual_alignments = tf.placeholder(
            tf.float32, shape=[None, None, None], name="manual_alignments")

        # single speaker: attention_size = 128
        if hp.attention_type == 'bah_mon':
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, normalize=False)
        elif hp.attention_type == 'bah_mon_norm':  # added by hccho
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, normalize=True)
        elif hp.attention_type == 'loc_sen':  # Location Sensitive Attention
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'gmm':  # GMM Attention
            attention_mechanism = GmmAttention(
                hp.attention_size, memory=encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'bah_mon_norm_hccho':
            attention_mechanism = BahdanauMonotonicAttention_hccho(
                hp.attention_size, encoder_outputs, normalize=True)
        elif hp.attention_type == 'bah_norm':
            attention_mechanism = BahdanauAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, normalize=True)
        elif hp.attention_type == 'luong_scaled':
            attention_mechanism = LuongAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, scale=True)
        elif hp.attention_type == 'luong':
            attention_mechanism = LuongAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'bah':
            attention_mechanism = BahdanauAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths)
        else:
            raise Exception(" [!] Unknown attention type: {}".format(hp.attention_type))

        # Combine the cell and attention_mechanism into an AttentionWrapper.
        # carpedm20 re-implemented AttentionWrapper from the TensorFlow source,
        # whereas Keith Ito simply used TensorFlow's AttentionWrapper.
        # Note output_attention=False and no attention_layer_size: the attention
        # output is then the context vector itself.
        attention_cell = AttentionWrapper(
            GRUCell(hp.attention_state_size),
            attention_mechanism,
            self.is_manual_attention,
            self.manual_alignments,
            initial_cell_state=attention_rnn_init_state,
            alignment_history=True,
            output_attention=False)

        # attention_state_size = 256, dec_prenet_sizes = [f(256), f(128)]
        dec_prenet_outputs = DecoderPrenetWrapper(
            attention_cell, speaker_embed, is_training,
            hp.dec_prenet_sizes, hp.dropout_prob)

        # Concatenate attention context vector and RNN cell output into a 512D vector.
        # [N, T_in, attention_size + attention_state_size]
        # The AttentionWrapperState passed along carries (cell_state, attention, ...);
        # its output equals the cell state, so this concatenates
        # [output(=cell_state) | attention | speaker_embed] into a new output.
        concat_cell = ConcatOutputAndAttentionWrapper(
            dec_prenet_outputs, embed_to_concat=speaker_embed)

        # Decoder (layers specified bottom to top): dec_rnn_size = 256
        cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)]
        # OutputProjectionWrapper does not seem to be mentioned in the paper...
        for _ in range(hp.dec_layer_num):  # hp.dec_layer_num = 2
            cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size)))

        # [N, T_in, 256]
        decoder_cell = MultiRNNCell(cells, state_is_tuple=True)

        # Project onto r mel spectrograms (predict r outputs at each RNN step).
        # Could be extended to also emit a stop token: (hp.num_mels + 1) * hp.reduction_factor.
        output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.reduction_factor)

        # Calling zero_state here also picks up the initial_cell_state already
        # supplied to the AttentionWrapper above.
        decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        if hp.model_type == "deepvoice":
            # decoder_init_state[0] : AttentionWrapperState
            #   = cell_state + attention + time + alignments + alignment_history
            # decoder_init_state[0][0] = attention_rnn_init_state
            #   (already applied: initial_cell_state was passed to the AttentionWrapper)
            decoder_init_state = list(decoder_init_state)
            for idx, cell in enumerate(decoder_rnn_init_states):
                shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                shape2 = cell.get_shape().as_list()
                if shape1 != shape2:
                    raise Exception(
                        " [!] Shape {} and {} should be equal".format(shape1, shape2))
                decoder_init_state[idx + 1] = cell
            decoder_init_state = tuple(decoder_init_state)

        if is_training2:
            # rnn_decoder_test_mode = True in test mode, False in train mode;
            # inputs is only used to compute the batch size.
            helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                        hp.reduction_factor, rnn_decoder_test_mode)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.reduction_factor)

        (decoder_outputs, _), final_decoder_state, _ = \
            tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(output_cell, helper, decoder_init_state),
                maximum_iterations=hp.max_iters)  # max_iters=200

        # [N, T_out, M]
        mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])

        # Add post-processing CBHG:
        # [N, T_out, 256]
        # post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
        post_outputs = cbhg(mel_outputs, None, is_training,
                            hp.post_bank_size, hp.post_bank_channel_size,
                            hp.post_maxpool_width, hp.post_highway_depth,
                            hp.post_rnn_size, hp.post_proj_sizes, hp.post_proj_width,
                            scope='post_cbhg')

        if speaker_embed is not None and hp.model_type == 'simple':
            expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
            tiled_speaker_embedding = tf.tile(expanded_speaker_emb,
                                              [1, tf.shape(post_outputs)[1], 1])
            # [N, T_out, 256 + alpha]
            post_outputs = tf.concat([tiled_speaker_embedding, post_outputs], axis=-1)

        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F(1025)]

        # Grab alignments from the final decoder state. Because the MultiRNNCell
        # has three layers, final_decoder_state is a 3-tuple ==> use final_decoder_state[0].
        alignments = tf.transpose(
            final_decoder_state[0].alignment_history.stack(),
            [1, 2, 0])  # [batch_size, text length (encoder), target length (decoder)]

        self.inputs = inputs
        self.speaker_id = speaker_id
        self.input_lengths = input_lengths
        self.loss_coeff = loss_coeff
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.final_decoder_state = final_decoder_state

        log('=' * 40)
        log(' model_type: %s' % hp.model_type)
        log('=' * 40)

        log('Initialized Tacotron model. Dimensions:')
        log('  embedding:               %d' % char_embedded_inputs.shape[-1])
        if speaker_embed is not None:
            log('  speaker embedding:       %d' % speaker_embed.shape[-1])
        else:
            log('  speaker embedding:       None')
        log('  prenet out:              %d' % prenet_outputs.shape[-1])
        log('  encoder out:             %d' % encoder_outputs.shape[-1])
        log('  attention out:           %d' % attention_cell.output_size)
        log('  concat attn & out:       %d' % concat_cell.output_size)
        log('  decoder cell out:        %d' % decoder_cell.output_size)
        log('  decoder out (%d frames): %d' % (hp.reduction_factor, decoder_outputs.shape[-1]))
        log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
        log('  postnet out:             %d' % post_outputs.shape[-1])
        log('  linear out:              %d' % linear_outputs.shape[-1])
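
# A hedged sketch of how the attention-type dispatch above could be expressed as
# a table instead of an if/elif chain. Only the stock tf.contrib.seq2seq
# mechanisms are shown; the custom ones (LocationSensitiveAttention,
# GmmAttention, BahdanauMonotonicAttention_hccho, ...) would register the same
# way. An illustrative refactoring, not project code.
import functools
from tensorflow.contrib.seq2seq import (BahdanauAttention,
                                        BahdanauMonotonicAttention,
                                        LuongAttention)

ATTENTION_FACTORIES = {
    'bah': BahdanauAttention,
    'bah_norm': functools.partial(BahdanauAttention, normalize=True),
    'bah_mon': BahdanauMonotonicAttention,
    'bah_mon_norm': functools.partial(BahdanauMonotonicAttention, normalize=True),
    'luong': LuongAttention,
    'luong_scaled': functools.partial(LuongAttention, scale=True),
}

def build_attention(attention_type, attention_size, memory, memory_sequence_length):
    try:
        factory = ATTENTION_FACTORIES[attention_type]
    except KeyError:
        raise Exception(" [!] Unknown attention type: {}".format(attention_type))
    return factory(attention_size, memory, memory_sequence_length=memory_sequence_length)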
def add_prediction_op(self):
    encoder_embed_seq = embed_sequence(
        self.inputs,
        vocab_size=self.config.vocab_size + 2,
        embed_dim=self.config.embedding_size,
        scope='embed')
    decoder_input_embed_seq = embed_sequence(
        self.labels[:, :-1],
        vocab_size=self.config.vocab_size + 2,
        embed_dim=self.config.embedding_size,
        scope='embed',
        reuse=True)
    with tf.variable_scope('embed', reuse=True):
        embeddings = tf.get_variable('embeddings')

    encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
        BasicLSTMCell(self.config.num_units, name="encoder"),
        encoder_embed_seq,
        dtype=tf.float32,
        sequence_length=self.lengths)

    if self.config.train:
        tiled_encoder_outputs = encoder_outputs
        tiled_encoder_final_state = encoder_final_state
        tiled_sequence_length = self.lengths
    else:
        tiled_encoder_outputs = tile_batch(encoder_outputs,
                                           multiplier=self.config.beam_width)
        tiled_encoder_final_state = tile_batch(encoder_final_state,
                                               multiplier=self.config.beam_width)
        tiled_sequence_length = tile_batch(self.lengths,
                                           multiplier=self.config.beam_width)

    attention_mechanism = BahdanauAttention(
        num_units=self.config.num_units,
        memory=tiled_encoder_outputs,
        memory_sequence_length=tiled_sequence_length)
    attn_cell = AttentionWrapper(
        BasicLSTMCell(self.config.num_units, name="decoder"),
        attention_mechanism,
        attention_layer_size=self.config.num_units // 2)

    if self.config.train:
        batch_size = self.config.batch_size
    else:
        batch_size = self.config.batch_size * self.config.beam_width

    decoder_initial_state = attn_cell.zero_state(dtype=tf.float32, batch_size=batch_size)
    decoder_initial_state = decoder_initial_state.clone(cell_state=tiled_encoder_final_state)

    output_layer = tf.layers.Dense(self.config.vocab_size + 2, use_bias=True,
                                   name='output_projection')

    if self.config.train:
        training_helper = TrainingHelper(
            inputs=decoder_input_embed_seq,
            sequence_length=self.lengths,
            name='training_helper')
        decoder = BasicDecoder(
            cell=attn_cell,
            helper=training_helper,
            initial_state=decoder_initial_state,
            output_layer=output_layer)
    else:
        def embed_and_input_proj(inputs):
            return tf.nn.embedding_lookup(embeddings, inputs)

        start_tokens = tf.ones([self.config.batch_size], tf.int32) * (self.config.vocab_size + 1)
        decoder = BeamSearchDecoder(
            cell=attn_cell,
            embedding=embed_and_input_proj,
            start_tokens=start_tokens,
            end_token=self.config.vocab_size,
            initial_state=decoder_initial_state,
            beam_width=self.config.beam_width,
            output_layer=output_layer)

    if self.config.train:
        decoder_outputs, _, _ = dynamic_decode(
            decoder=decoder,
            impute_finished=True,
            maximum_iterations=self.config.max_sequence_length + 1)
        pred_logits = tf.identity(decoder_outputs.rnn_output, name="prediction")
    else:
        decoder_outputs, _, _ = dynamic_decode(
            decoder=decoder,
            impute_finished=False,
            maximum_iterations=self.config.max_sequence_length + 1)
        pred_logits = tf.identity(decoder_outputs.predicted_ids, name="prediction")

    return pred_logits
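
# A small sketch of the vocabulary layout implied above: the embedding has
# vocab_size + 2 rows, with the original tokens at 0..vocab_size-1, the end
# token at id vocab_size, and the start token at vocab_size + 1.
# Illustrative values only.
vocab_size = 10
END_TOKEN = vocab_size         # matches end_token=self.config.vocab_size
START_TOKEN = vocab_size + 1   # matches start_tokens=... * (vocab_size + 1)
num_embeddings = vocab_size + 2
print(START_TOKEN, END_TOKEN, num_embeddings)  # 11 10 12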
def initialize(self, inputs, input_lengths, target_lengths, prefixes=None,
               speaker_ids=None, mel_targets=None, linear_targets=None):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: float32 Tensor with shape [N, T_in, D_input] where N is batch size, T_in is
        number of steps in the input time series, and D_input is the input feature depth
        (this variant feeds features straight into the prenet, with no character embedding).
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
        lengths of each sequence in inputs.
      target_lengths: int32 Tensor with shape [N]; lengths of the target sequences, stored
        on the model.
      prefixes: optional; stored on the model as-is.
      speaker_ids: int32 Tensor with shape [N]; per-sample speaker IDs used to look up
        speaker embeddings.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
        number of steps in the output time series, M is num_mels, and values are entries in
        the mel spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out
        is number of steps in the output time series, F is num_freq, and values are entries
        in the linear spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
        is_training = linear_targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # inputs: [N, T_in, D_input]
        speaker_embedding_table = tf.get_variable(
            'speaker_embedding_table', [hp.num_speakers, hp.speaker_embedding_size],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        speaker_embedding = tf.nn.embedding_lookup(
            speaker_embedding_table, speaker_ids)  # [N, hp.speaker_embedding_size]

        # Project the speaker embedding into each initial state.
        deep_dense = lambda x, dim: tf.layers.dense(x, dim, activation=tf.nn.softsign)
        before_highway = deep_dense(speaker_embedding, 128)
        encoder_rnn_init_state = deep_dense(speaker_embedding, 128 * 2)
        attention_rnn_init_state = deep_dense(speaker_embedding, 256)
        decoder_rnn_init_states = [deep_dense(speaker_embedding, 256) for _ in range(2)]

        # Encoder
        prenet_outputs = prenet(inputs, is_training)  # [N, T_in, 128]
        encoder_outputs = encoder_cbhg(
            prenet_outputs, input_lengths, is_training,
            before_highway=before_highway,
            encoder_rnn_init_state=encoder_rnn_init_state)  # [N, T_in, 256]

        # Attention
        attention_cell = AttentionWrapper(
            DecoderPrenetWrapper(GRUCell(256), is_training),
            BahdanauAttention(256, encoder_outputs),
            initial_cell_state=attention_rnn_init_state,
            alignment_history=True,
            output_attention=False)  # [N, T_in, 256]

        # Concatenate attention context vector and RNN cell output into a 512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)  # [N, T_in, 512]

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell([
            OutputProjectionWrapper(concat_cell, 256),
            ResidualWrapper(GRUCell(256)),
            ResidualWrapper(GRUCell(256)),
        ], state_is_tuple=True)  # [N, T_in, 256]

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        # decoder_init_state is initially a tuple, so first convert it to a list;
        # decoder_init_state[0] belongs to the projection wrapper and keeps its
        # zero state, while the following entries take the speaker-conditioned
        # states; finally, convert the list back to a tuple.
        decoder_init_state = list(decoder_init_state)
        for idx, cell in enumerate(decoder_rnn_init_states):
            decoder_init_state[idx + 1] = cell
        decoder_init_state = tuple(decoder_init_state)

        if is_training:
            helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry
        mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

        # Add post-processing CBHG:
        post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)  # [N, T_out, 256]
        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)      # [N, T_out, F]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.speaker_ids = speaker_ids
        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.target_lengths = target_lengths
        self.prefixes = prefixes

        log('Initialized Tacotron model. Dimensions:')
        log('  inputs:                  %d' % inputs.shape[-1])
        log('  prenet out:              %d' % prenet_outputs.shape[-1])
        log('  encoder out:             %d' % encoder_outputs.shape[-1])
        log('  attention out:           %d' % attention_cell.output_size)
        log('  concat attn & out:       %d' % concat_cell.output_size)
        log('  decoder cell out:        %d' % decoder_cell.output_size)
        log('  decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
        log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
        log('  postnet out:             %d' % post_outputs.shape[-1])
        log('  linear out:              %d' % linear_outputs.shape[-1])
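
# A minimal NumPy sketch of the alignment bookkeeping above: the decoder's
# alignment_history stacks per-step attention weights as [T_out, N, T_in];
# transposing with [1, 2, 0] yields [N, T_in, T_out], one alignment image per
# batch entry. Illustrative sizes only.
import numpy as np

T_out, N, T_in = 7, 2, 5
stacked = np.random.rand(T_out, N, T_in)        # what alignment_history.stack() returns
alignments = np.transpose(stacked, (1, 2, 0))   # [N, T_in, T_out]
print(alignments.shape)                         # (2, 5, 7)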
def build_decoder_cell(self, encoder_outputs, encoder_states):
    '''Builds the decoder cell; returns the cell and its initial state.

    :param encoder_outputs:
    :param encoder_states:
    :return:
    '''
    encoder_input_length = self.encoder_inputs_length
    batch_size = self.batch_size

    if self.bidirectional:
        encoder_states = encoder_states[-self.depth:]
    if self.time_major:
        encoder_outputs = tf.transpose(encoder_outputs, (1, 0, 2))

    assert encoder_input_length is not None, 'encoder_input_length must not be None'
    assert isinstance(batch_size, int), 'batch_size must be an int'
    assert encoder_outputs is not None, 'encoder_outputs must not be None'
    assert encoder_states is not None, 'encoder_states must not be None'

    ######################### beam search #########################
    if self.use_beamsearch_decode:
        # tile_batch copies each tensor beam_width times, so the batch
        # effectively becomes beam_width times its original size.
        encoder_outputs = seq2seq.tile_batch(encoder_outputs, multiplier=self.beam_width)
        encoder_states = seq2seq.tile_batch(encoder_states, multiplier=self.beam_width)
        encoder_input_length = seq2seq.tile_batch(
            self.encoder_inputs_length, multiplier=self.beam_width)
        # With beam search, the effective batch size is beam_width * batch_size.
        batch_size *= self.beam_width

    ######################### attention mechanism #########################
    if self.attention_type.lower() == 'luong':
        self.attention_mechanism = LuongAttention(
            num_units=self.hidden_units,
            memory=encoder_outputs,
            memory_sequence_length=encoder_input_length)
    else:
        # For a bidirectional LSTM, encoder_outputs are its hidden states h.
        self.attention_mechanism = BahdanauAttention(
            num_units=self.hidden_units,
            memory=encoder_outputs,
            memory_sequence_length=encoder_input_length)

    # Multi-layer decoder cell.
    cell = MultiRNNCell([
        self.build_single_cell(self.hidden_units, use_residual=self.use_residual)
        for _ in range(self.depth)
    ])

    # Keep the alignment history only when not training and not beam searching.
    alignment_history = (self.mode != 'train' and not self.use_beamsearch_decode)

    def cell_input_fn(inputs, attention):
        '''Decides how input and attention are combined before each cell step.
        When use_residual is set, [inputs; attention] is projected back to
        hidden_units so the residual connection's shapes match.'''
        if not self.use_residual:
            return array_ops.concat([inputs, attention], -1)

        # attn_projection(concat) is equivalent to
        # layers.Dense(...)(array_ops.concat([inputs, attention], -1)).
        # Dense ultimately extends Layer, whose __call__ runs pre-processing,
        # then call (the dense computation itself), then post-processing.
        attn_projection = layers.Dense(self.hidden_units,
                                       dtype=tf.float32,
                                       use_bias=False,
                                       name='attention_cell_input_fn')
        return attn_projection(array_ops.concat([inputs, attention], -1))

    cell = AttentionWrapper(
        cell=cell,
        attention_mechanism=self.attention_mechanism,
        attention_layer_size=self.hidden_units,
        alignment_history=alignment_history,   # history of attention alignments
        cell_input_fn=cell_input_fn,           # how to combine input and attention
        name='Attention_Wrapper')

    # Initialize decoder_initial_state, then carry over the encoder state.
    decoder_initial_state = cell.zero_state(batch_size, tf.float32)
    decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_states)

    return cell, decoder_initial_state
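
# A minimal NumPy sketch of why cell_input_fn projects when residual connections
# are used: concat([inputs, attention]) has hidden_units + attention_size
# columns, but a residual add needs the cell's input and output widths to match,
# so the concat is projected back to hidden_units. The projection matrix here is
# a stand-in for the Dense kernel; sizes are illustrative.
import numpy as np

hidden_units, attention_size, batch = 4, 4, 2
inputs = np.random.randn(batch, hidden_units)
attention = np.random.randn(batch, attention_size)
concat = np.concatenate([inputs, attention], axis=-1)         # [batch, 8]
projection = np.random.randn(concat.shape[-1], hidden_units)  # stand-in Dense kernel
cell_input = concat @ projection                              # [batch, 4] again
print(concat.shape, cell_input.shape)                         # (2, 8) (2, 4)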
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
        lengths of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
        number of steps in the output time series, M is num_mels, and values are entries
        in the mel spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out
        is number of steps in the output time series, F is num_freq, and values are
        entries in the linear spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
        is_training = linear_targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(
            embedding_table, inputs)                    # [N, T_in, embed_depth=256]

        # Encoder, prenet_size=[256, 128]
        prenet_outputs = prenet(
            embedded_inputs, is_training,
            hp.prenet_size)                             # [N, T_in, prenet_size[-1]=128]
        encoder_outputs = encoder_cbhg(
            prenet_outputs, input_lengths, is_training, # [N, T_in, encoder_output_size=256]
            output_size=hp.encoder_output_size)

        # Attention RNN: attends over the encoder outputs given the decoder input.
        attention_cell = AttentionWrapper(
            # input_size = 128, output size = attention_depth = 256
            cell=GRUCell(num_units=hp.attention_depth),
            # input_size = output_size = 256
            attention_mechanism=BahdanauAttention(
                num_units=hp.attention_depth, memory=encoder_outputs),
            alignment_history=True,
            output_attention=False)                     # [N, T_in, attention_depth=256]

        # Apply a prenet before the attention RNN; prenet_size=[256, 128],
        # so the prenet output size is 128.
        attention_cell = DecoderPrenetWrapper(attention_cell, is_training,
                                              hp.prenet_size)

        # Concatenate the attention context vector and the RNN cell output.
        concat_cell = ConcatOutputAndAttentionWrapper(
            attention_cell)                             # [N, T_in, 2*attention_depth=512]

        # The decoder RNN is a 2-layer residual RNN (layers specified bottom to top):
        decoder_cell = MultiRNNCell(
            [
                OutputProjectionWrapper(
                    cell=concat_cell, output_size=hp.decoder_depth),  # 512 -> 256
                ResidualWrapper(cell=GRUCell(hp.decoder_depth)),
                ResidualWrapper(cell=GRUCell(hp.decoder_depth))
            ],
            state_is_tuple=True)                        # [N, T_in, decoder_depth=256]

        # Project onto r mel spectrograms (predict outputs_per_step frames per step):
        output_cell = OutputProjectionWrapper(
            decoder_cell, hp.num_mels * hp.outputs_per_step)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                    dtype=tf.float32)

        # The helper determines the initial input and the input at each next step.
        if is_training:
            helper = TacoTrainingHelper(inputs=inputs,
                                        targets=mel_targets,
                                        output_dim=hp.num_mels,
                                        r=hp.outputs_per_step)
        else:
            helper = TacoTestHelper(batch_size=batch_size,
                                    output_dim=hp.num_mels,
                                    r=hp.outputs_per_step)

        # Decode: predict non-overlapping frame groups, e.g. step r yields frames
        # (r+1..2r), step 2r yields (2r+1..3r), and so on.
        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),  # wrap into a decoder
            maximum_iterations=hp.max_iters)            # [N, T_out/r, num_mels*r]

        # Reshape outputs to be one output per entry
        mel_outputs = tf.reshape(
            decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M=80]

        # Add post-processing CBHG:
        post_outputs = post_cbhg(
            mel_outputs, hp.num_mels, is_training,      # [N, T_out, postnet_output_size=256]
            output_size=hp.postnet_output_size)
        linear_outputs = tf.layers.dense(post_outputs,
                                         hp.num_freq)   # [N, T_out, F=1025]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(
            final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets

        log('Initialized Tacotron model. Dimensions: ')
        log('  embedding:               %d' % embedded_inputs.shape[-1])
        log('  prenet out:              %d' % prenet_outputs.shape[-1])
        log('  encoder out:             %d' % encoder_outputs.shape[-1])
        log('  attention out:           %d' % attention_cell.output_size)
        log('  concat attn & out:       %d' % concat_cell.output_size)
        log('  decoder cell out:        %d' % decoder_cell.output_size)
        log('  decoder out (%d frames): %d' % (hp.outputs_per_step,
                                               decoder_outputs.shape[-1]))
        log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
        log('  postnet out:             %d' % post_outputs.shape[-1])
        log('  linear out:              %d' % linear_outputs.shape[-1])
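# --- Usage sketch (illustrative, not from the original source) ---
# Building the graph from placeholders. `Tacotron`, `hparams`, and the exact
# feature sizes (80 mels, 1025 frequency bins) are assumed here to match the
# snippet above; passing linear_targets switches the graph into training mode.
inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
input_lengths = tf.placeholder(tf.int32, [None], name='input_lengths')
mel_targets = tf.placeholder(tf.float32, [None, None, 80], name='mel_targets')
linear_targets = tf.placeholder(tf.float32, [None, None, 1025], name='linear_targets')

model = Tacotron(hparams)  # assumed constructor taking a hyperparameter object
model.initialize(inputs, input_lengths,
                 mel_targets=mel_targets,
                 linear_targets=linear_targets)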
def attention_decoder(inputs, memory, num_units=None, batch_size=1,
                      inputs_length=None, n_mels=80, reduction=1,
                      default_max_iters=200, is_training=True,
                      scope='attention_decoder', reuse=None):
    """Applies a GRU to `inputs` while attending to `memory`.

    :param inputs: A 3d tensor with shape of [N, T', C']. Decoder inputs.
    :param memory: A 3d tensor with shape of [N, T, C]. Outputs of encoder network.
    :param num_units: An int. Attention size.
    :param batch_size: An int. Batch size.
    :param inputs_length: An int32 tensor of shape [N]. Lengths of the decoder
        input sequences.
    :param n_mels: An int. Number of Mel banks to generate.
    :param reduction: An int. Reduction factor. Paper => 2, 3, 5.
    :param default_max_iters: Max number of decoding iterations at inference time.
    :param is_training: Running mode.
    :param scope: Optional scope for `variable_scope`.
    :param reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.
    :return: A tuple (decoder_outputs, final_decoder_state), where
        decoder_outputs has shape [N, T_out/r, n_mels*reduction].
    """
    with tf.variable_scope(scope, reuse=reuse):
        # At training time the decode length is given by the targets; at
        # inference time it is capped at default_max_iters.
        if is_training:
            max_iters = None
        else:
            max_iters = default_max_iters

        if num_units is None:
            num_units = inputs.get_shape().as_list()[-1]

        # Decoder cell
        decoder_cell = tf.nn.rnn_cell.GRUCell(num_units)

        # Attention
        # [N, T_in, attention_depth]
        attention_cell = AttentionWrapper(decoder_cell,
                                          BahdanauAttention(num_units, memory),
                                          alignment_history=True)

        # Concatenate attention context vector and RNN cell output into a
        # 2*attention_depth=512D vector.  [N, T_in, 2*attention_depth]
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)

        # Decoder (layers specified bottom to top):  [N, T_in, decoder_depth]
        decoder_cell = MultiRNNCell([
            OutputProjectionWrapper(concat_cell, num_units),
            ResidualWrapper(GRUCell(num_units)),
            ResidualWrapper(GRUCell(num_units))
        ], state_is_tuple=True)

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(decoder_cell, n_mels * reduction)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                    dtype=tf.float32)

        if is_training:
            # helper = TacotronTrainingHelper(batch_size, n_mels, reduction, inputs)
            helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=inputs, sequence_length=inputs_length, time_major=False)
        else:
            helper = TacotronInferenceHelper(batch_size, n_mels, reduction)

        decoder = BasicDecoder(output_cell, helper, decoder_init_state)
        # [N, T_out/r, M*r]
        (decoder_outputs, _), final_decoder_state, _ = dynamic_decode(
            decoder, maximum_iterations=max_iters)

    return decoder_outputs, final_decoder_state
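# --- Usage sketch (illustrative, not from the original source) ---
# Wiring attention_decoder to dummy tensors. Shapes follow the docstring above;
# `encoder_out` stands in for a real encoder and the sizes are assumptions.
encoder_out = tf.zeros([1, 120, 256])       # [N, T, C] dummy attention memory
go_frames = tf.zeros([1, 50, 80])           # [N, T', n_mels] teacher-forced inputs
dec_out, dec_state = attention_decoder(
    inputs=go_frames,
    memory=encoder_out,
    num_units=256,
    batch_size=1,
    inputs_length=tf.constant([50], dtype=tf.int32),
    n_mels=80,
    reduction=2,
    is_training=True)
mel_hat = tf.reshape(dec_out, [1, -1, 80])  # regroup the r frames per decoder step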
def build_decoder_cell(self, encoder_outputs, encoder_state):
    """Builds the decoder cell."""
    encoder_inputs_length = self.encoder_inputs_length
    batch_size = self.batch_size

    if self.bidirectional:
        encoder_state = encoder_state[-self.depth:]
    if self.time_major:
        encoder_outputs = tf.transpose(encoder_outputs, (1, 0, 2))

    # When using BeamSearchDecoder, some tensors must be tiled by beam_width:
    # encoder_outputs, encoder_state, encoder_inputs_length
    # need to be tiled so that:
    # [batch_size, .., ..] -> [batch_size x beam_width, .., ..]
    if self.use_beamsearch_decode:
        encoder_outputs = seq2seq.tile_batch(
            encoder_outputs, multiplier=self.beam_width)
        encoder_state = seq2seq.tile_batch(
            encoder_state, multiplier=self.beam_width)
        encoder_inputs_length = seq2seq.tile_batch(
            self.encoder_inputs_length, multiplier=self.beam_width)
        # With beam search, the decoder batch is beam_width times batch_size.
        batch_size *= self.beam_width

    # Below are two different attention mechanisms:
    if self.attention_type.lower() == 'luong':
        # 'Luong' style attention: https://arxiv.org/abs/1508.04025
        self.attention_mechanism = LuongAttention(
            num_units=self.hidden_units,
            memory=encoder_outputs,
            memory_sequence_length=encoder_inputs_length)
    else:
        # Default: 'Bahdanau' style attention: https://arxiv.org/abs/1409.0473
        self.attention_mechanism = BahdanauAttention(
            num_units=self.hidden_units,
            memory=encoder_outputs,
            memory_sequence_length=encoder_inputs_length)

    # Building decoder_cell
    cell = MultiRNNCell([
        self.build_single_cell(self.hidden_units,
                               use_residual=self.use_residual)
        for _ in range(self.depth)
    ])

    # Keep the attention history only in non-training (inference) mode
    # without beam search.
    alignment_history = (self.mode != 'train'
                         and not self.use_beamsearch_decode)

    def cell_input_fn(inputs, attention):
        """Decides, based on the attn_input_feeding behaviour, whether to
        project [inputs; attention] before the attention computation."""
        if not self.use_residual:
            return array_ops.concat([inputs, attention], -1)
        attn_projection = layers.Dense(self.hidden_units,
                                       dtype=tf.float32,
                                       use_bias=False,
                                       name='attention_cell_input_fn')
        return attn_projection(array_ops.concat([inputs, attention], -1))

    cell = AttentionWrapper(
        cell=cell,
        attention_mechanism=self.attention_mechanism,
        attention_layer_size=self.hidden_units,
        alignment_history=alignment_history,
        cell_input_fn=cell_input_fn,
        name='Attention_Wrapper')

    # Zero state, then pass in the encoder state.
    decoder_initial_state = cell.zero_state(batch_size, tf.float32)
    decoder_initial_state = decoder_initial_state.clone(
        cell_state=encoder_state)

    # if self.use_beamsearch_decode:
    #     decoder_initial_state = seq2seq.tile_batch(
    #         decoder_initial_state, multiplier=self.beam_width)

    return cell, decoder_initial_state
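# --- Beam-search usage sketch (illustrative, not from the original source) ---
# The tiling above is exactly what BeamSearchDecoder expects: the cell's state
# batch must already be batch_size * beam_width, while start_tokens keep the
# untiled batch size. `self.embeddings`, `self.vocab_size`, and the token ids
# are assumed names for this sketch.
def beam_decode_sketch(self, encoder_outputs, encoder_state):
    cell, initial_state = self.build_decoder_cell(encoder_outputs, encoder_state)
    decoder = seq2seq.BeamSearchDecoder(
        cell=cell,
        embedding=self.embeddings,                   # assumed embedding matrix
        start_tokens=tf.fill([self.batch_size], 1),  # assumed GO token id = 1
        end_token=2,                                 # assumed EOS token id = 2
        initial_state=initial_state,
        beam_width=self.beam_width,
        output_layer=layers.Dense(self.vocab_size))  # assumed vocab projection
    outputs, _, _ = seq2seq.dynamic_decode(decoder, maximum_iterations=100)
    return outputs.predicted_ids                     # [batch, T, beam_width]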
def _create_decoder_cell(self):
    enc_outputs, enc_states, enc_seq_len = (self.enc_outputs, self.enc_states,
                                            self.enc_seq_len)
    if self.use_beam_search:
        enc_outputs = tile_batch(enc_outputs, multiplier=self.cfg.beam_size)
        enc_states = nest.map_structure(
            lambda s: tile_batch(s, self.cfg.beam_size), enc_states)
        enc_seq_len = tile_batch(self.enc_seq_len, multiplier=self.cfg.beam_size)
    batch_size = (self.batch_size * self.cfg.beam_size
                  if self.use_beam_search else self.batch_size)

    with tf.variable_scope("attention"):
        if self.cfg.attention == "luong":  # Luong attention mechanism
            attention_mechanism = LuongAttention(
                num_units=self.cfg.num_units, memory=enc_outputs,
                memory_sequence_length=enc_seq_len)
        else:  # default: Bahdanau attention mechanism
            attention_mechanism = BahdanauAttention(
                num_units=self.cfg.num_units, memory=enc_outputs,
                memory_sequence_length=enc_seq_len)

    def cell_input_fn(inputs, attention):
        # Cell input function that keeps the input/output dimensions the same.
        # Reference: https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/AttentionWrapper
        if not self.cfg.use_attention_input_feeding:
            return inputs
        input_project = tf.layers.Dense(self.cfg.num_units, dtype=tf.float32,
                                        name='attn_input_feeding')
        return input_project(tf.concat([inputs, attention], axis=-1))

    if self.cfg.top_attention:
        # Apply the attention mechanism only on the top decoder layer.
        cells = [self._create_rnn_cell() for _ in range(self.cfg.num_layers)]
        cells[-1] = AttentionWrapper(
            cells[-1],
            attention_mechanism=attention_mechanism,
            name="Attention_Wrapper",
            attention_layer_size=self.cfg.num_units,
            initial_cell_state=enc_states[-1],
            cell_input_fn=cell_input_fn)
        initial_state = [state for state in enc_states]
        initial_state[-1] = cells[-1].zero_state(batch_size=batch_size,
                                                 dtype=tf.float32)
        dec_init_states = tuple(initial_state)
        cells = MultiRNNCell(cells)
    else:
        cells = MultiRNNCell(
            [self._create_rnn_cell() for _ in range(self.cfg.num_layers)])
        cells = AttentionWrapper(
            cells,
            attention_mechanism=attention_mechanism,
            name="Attention_Wrapper",
            attention_layer_size=self.cfg.num_units,
            initial_cell_state=enc_states,
            cell_input_fn=cell_input_fn)
        dec_init_states = cells.zero_state(
            batch_size=batch_size, dtype=tf.float32).clone(cell_state=enc_states)
    return cells, dec_init_states
def build_decoder_cell(self, encoder_outputs, encoder_state):
    """Builds the decoder cell."""
    encoder_inputs_length = self.encoder_inputs_length
    batch_size = self.batch_size

    if self.bidirectional:
        encoder_state = encoder_state[-self.depth:]
    if self.time_major:
        encoder_outputs = tf.transpose(encoder_outputs, (1, 0, 2))

    if self.use_beamsearch_decode:
        encoder_outputs = seq2seq.tile_batch(encoder_outputs,
                                             multiplier=self.beam_width)
        encoder_state = seq2seq.tile_batch(encoder_state,
                                           multiplier=self.beam_width)
        encoder_inputs_length = seq2seq.tile_batch(
            self.encoder_inputs_length, multiplier=self.beam_width)
        # With beam search, the decoder batch is beam_width times batch_size.
        batch_size *= self.beam_width

    if self.attention_type.lower() == 'luong':
        self.attention_mechanism = LuongAttention(
            num_units=self.hidden_units,
            memory=encoder_outputs,
            memory_sequence_length=encoder_inputs_length)
    else:
        # BahdanauAttention is constructed with num_units and the encoder
        # outputs; calling it later with a query yields the alignment weights.
        self.attention_mechanism = BahdanauAttention(
            num_units=self.hidden_units,
            memory=encoder_outputs,
            memory_sequence_length=encoder_inputs_length)

    cell = MultiRNNCell([
        self.build_single_cell(self.hidden_units,
                               use_residual=self.use_residual)
        for _ in range(self.depth)
    ])

    # Keep the attention history only in non-training (inference) mode
    # without beam search.
    alignment_history = (self.mode != 'train'
                         and not self.use_beamsearch_decode)

    def cell_input_fn(inputs, attention):
        """Decides, based on the attn_input_feeding behaviour, whether to
        project [inputs; attention] before the attention computation."""
        if not self.use_residual:
            return array_ops.concat([inputs, attention], -1)
        attn_projection = layers.Dense(self.hidden_units,
                                       dtype=tf.float32,
                                       use_bias=False,
                                       name='attention_cell_input_fn')
        return attn_projection(array_ops.concat([inputs, attention], -1))

    attention_cell = AttentionWrapper(
        cell=cell,
        attention_mechanism=self.attention_mechanism,
        attention_layer_size=self.hidden_units,
        alignment_history=alignment_history,
        cell_input_fn=cell_input_fn,
        name='AttentionWrapper')

    # Zero state; the decoder's initial state is then assigned directly from
    # the encoder's final hidden state.
    decoder_initial_state = attention_cell.zero_state(batch_size, tf.float32)
    decoder_initial_state = decoder_initial_state.clone(
        cell_state=encoder_state)
    return attention_cell, decoder_initial_state
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None,
               pml_targets=None, gta=False, locked_alignments=None,
               logs_enabled=True):
    '''Initializes the model for inference.

    Sets "pml_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
        lengths of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
        number of steps in the output time series, M is num_mels, and values are entries
        in the mel spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out
        is number of steps in the output time series, F is num_freq, and values are
        entries in the linear spectrogram. Only needed for training.
      pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is
        number of steps in the PML vocoder features trajectories, P is pml_dimension, and
        values are PML vocoder features. Only needed for training.
      gta: boolean flag that is set to True when ground truth alignment is required
      locked_alignments: when explicit attention alignment is required, the locked
        alignments are passed in this parameter and the attention alignments are locked
        to these values
      logs_enabled: boolean flag that defaults to True; if False no construction logs
        are output
    '''
    # Fix the alignments shape to (batch_size, encoder_steps, decoder_steps)
    # if it does not already include a batch dimension.
    locked_alignments_ = locked_alignments
    if locked_alignments_ is not None:
        if np.ndim(locked_alignments_) < 3:
            locked_alignments_ = np.expand_dims(locked_alignments_, 0)

    with tf.variable_scope('inference') as scope:
        is_training = pml_targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(
            embedding_table, inputs)                    # [N, T_in, embed_depth=256]

        # Encoder
        prenet_outputs = prenet(
            embedded_inputs, is_training,
            hp.prenet_depths)                           # [N, T_in, prenet_depths[-1]=128]
        encoder_outputs = encoder_cbhg(
            prenet_outputs, input_lengths, is_training, # [N, T_in, encoder_depth=256]
            hp.encoder_depth)

        # Attention
        attention_mechanism = BahdanauAttention(hp.attention_depth,
                                                encoder_outputs)
        attention_cell = LockableAttentionWrapper(
            GRUCell(hp.attention_depth),
            attention_mechanism,
            alignment_history=True,
            locked_alignments=locked_alignments_,
            output_attention=False,
            name='attention_wrapper')                   # [N, T_in, attention_depth=256]

        # Apply prenet before concatenation in AttentionWrapper.
        prenet_cell = DecoderPrenetWrapper(attention_cell, is_training,
                                           hp.prenet_depths)

        # Concatenate attention context vector and RNN cell output into a
        # 2*attention_depth=512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(
            prenet_cell)                                # [N, T_in, 2*attention_depth=512]

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell(
            [
                OutputProjectionWrapper(concat_cell, hp.decoder_depth),
                ResidualWrapper(GRUCell(hp.decoder_depth)),
                ResidualWrapper(GRUCell(hp.decoder_depth))
            ],
            state_is_tuple=True)                        # [N, T_in, decoder_depth=256]

        # Project onto r PML feature vectors (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(
            decoder_cell, hp.pml_dimension * hp.outputs_per_step)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                    dtype=tf.float32)

        if is_training or gta:
            helper = TacoTrainingHelper(inputs, pml_targets, hp.pml_dimension,
                                        hp.outputs_per_step)
        else:
            helper = TacoTestHelper(batch_size, hp.pml_dimension,
                                    hp.outputs_per_step)

        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)            # [N, T_out/r, P*r]

        # Reshape outputs to be one output per entry
        pml_outputs = tf.reshape(
            decoder_outputs, [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(
            final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.pml_outputs = pml_outputs
        self.alignments = alignments
        self.pml_targets = pml_targets
        self.attention_cell = attention_cell

        if logs_enabled:
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames): %d' % (hp.outputs_per_step,
                                                   decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % pml_outputs.shape[-1])
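# --- Alignment inspection sketch (illustrative, not from the original source) ---
# alignment_history is a TensorArray of per-step alignments; stacking yields
# [T_out/r, N, T_in], and the transpose above reorders it to [N, T_in, T_out/r],
# ready to plot as one attention heatmap per utterance. `sess` is an assumed
# tf.Session for this sketch.
import matplotlib.pyplot as plt

def plot_alignment(alignment, path='alignment.png'):
    # alignment: np.ndarray of shape [T_in, T_out/r] for a single utterance,
    # e.g. sess.run(model.alignments)[0].
    fig, ax = plt.subplots()
    im = ax.imshow(alignment, aspect='auto', origin='lower', interpolation='none')
    fig.colorbar(im, ax=ax)
    ax.set_xlabel('Decoder step')
    ax.set_ylabel('Encoder step')
    fig.savefig(path)
    plt.close(fig)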
def initialize(
        self,
        inputs,
        input_lengths,
        num_speakers,
        speaker_id,
        mel_targets=None,
        linear_targets=None,
        loss_coeff=None,
        rnn_decoder_test_mode=False,
        is_randomly_initialized=False,
):
    is_training = linear_targets is not None
    self.is_randomly_initialized = is_randomly_initialized

    # When using get_variable(), variables are looked up inside the
    # 'inference' scope.
    with tf.variable_scope('inference') as scope:
        hp = self._hparams
        batch_size = tf.shape(inputs)[0]

        # Embeddings
        char_embed_table = tf.get_variable(
            'embedding', [len(symbols), hp.embedding_size], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        # [N, T_in, embedding_size]
        char_embedded_inputs = tf.nn.embedding_lookup(char_embed_table, inputs)

        self.num_speakers = num_speakers
        if self.num_speakers > 1:
            if hp.speaker_embedding_size != 1:
                speaker_embed_table = tf.get_variable(
                    'speaker_embedding',
                    [self.num_speakers, hp.speaker_embedding_size],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
                # [N, T_in, speaker_embedding_size]
                speaker_embed = tf.nn.embedding_lookup(speaker_embed_table,
                                                       speaker_id)

            if hp.model_type == 'deepvoice':
                if hp.speaker_embedding_size == 1:
                    before_highway = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_prenet_sizes[-1], "before_highway")
                    encoder_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_rnn_size * 2, "encoder_rnn_init_state")
                    attention_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.attention_state_size, "attention_rnn_init_state")
                    decoder_rnn_init_states = [
                        get_embed(speaker_id, self.num_speakers, hp.dec_rnn_size,
                                  "decoder_rnn_init_states{}".format(idx + 1))
                        for idx in range(hp.dec_layer_num)]
                else:
                    deep_dense = lambda x, dim: tf.layers.dense(
                        x, dim, activation=tf.nn.softsign)
                    before_highway = deep_dense(speaker_embed,
                                                hp.enc_prenet_sizes[-1])
                    encoder_rnn_init_state = deep_dense(speaker_embed,
                                                        hp.enc_rnn_size * 2)
                    attention_rnn_init_state = deep_dense(
                        speaker_embed, hp.attention_state_size)
                    decoder_rnn_init_states = [
                        deep_dense(speaker_embed, hp.dec_rnn_size)
                        for _ in range(hp.dec_layer_num)]
                speaker_embed = None  # deepvoice does not use speaker_embed directly
            elif hp.model_type == 'simple':
                before_highway = None
                encoder_rnn_init_state = None
                attention_rnn_init_state = None
                decoder_rnn_init_states = None
            else:
                raise Exception(
                    " [!] Unknown multi-speaker model type: {}".format(hp.model_type))
        else:
            speaker_embed = None
            before_highway = None
            encoder_rnn_init_state = None
            attention_rnn_init_state = None
            decoder_rnn_init_states = None

        ##############
        # Encoder
        ##############
        # [N, T_in, enc_prenet_sizes[-1]]
        prenet_outputs = prenet(char_embedded_inputs, is_training,
                                hp.enc_prenet_sizes, hp.dropout_prob,
                                scope='prenet')
        encoder_outputs = cbhg(
            prenet_outputs, input_lengths, is_training,
            hp.enc_bank_size, hp.enc_bank_channel_size,
            hp.enc_maxpool_width, hp.enc_highway_depth, hp.enc_rnn_size,
            hp.enc_proj_sizes, hp.enc_proj_width,
            scope="encoder_cbhg",
            before_highway=before_highway,
            encoder_rnn_init_state=encoder_rnn_init_state)

        ##############
        # Attention
        ##############
        # For manual control of attention
        self.is_manual_attention = tf.placeholder(
            tf.bool, shape=(), name='is_manual_attention')
        self.manual_alignments = tf.placeholder(
            tf.float32, shape=[None, None, None], name="manual_alignments")

        dec_prenet_outputs = DecoderPrenetWrapper(
            GRUCell(hp.attention_state_size), speaker_embed, is_training,
            hp.dec_prenet_sizes, hp.dropout_prob)

        if hp.attention_type == 'bah_mon':
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size, encoder_outputs)
        elif hp.attention_type == 'bah_norm':
            attention_mechanism = BahdanauAttention(
                hp.attention_size, encoder_outputs, normalize=True)
        elif hp.attention_type == 'luong_scaled':
            attention_mechanism = LuongAttention(
                hp.attention_size, encoder_outputs, scale=True)
        elif hp.attention_type == 'luong':
            attention_mechanism = LuongAttention(hp.attention_size,
                                                 encoder_outputs)
        elif hp.attention_type == 'bah':
            attention_mechanism = BahdanauAttention(hp.attention_size,
                                                    encoder_outputs)
        elif hp.attention_type.startswith('ntm2'):
            shift_width = int(hp.attention_type.split('-')[-1])
            attention_mechanism = NTMAttention2(
                hp.attention_size, encoder_outputs, shift_width=shift_width)
        else:
            raise Exception(
                " [!] Unknown attention type: {}".format(hp.attention_type))

        attention_cell = AttentionWrapper(
            dec_prenet_outputs,
            attention_mechanism,
            self.is_manual_attention,
            self.manual_alignments,
            initial_cell_state=attention_rnn_init_state,
            alignment_history=True,
            output_attention=False)

        # Concatenate attention context vector and RNN cell output into a 512D vector.
        # [N, T_in, attention_size + attention_state_size]
        concat_cell = ConcatOutputAndAttentionWrapper(
            attention_cell, embed_to_concat=speaker_embed)

        # Decoder (layers specified bottom to top):
        cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)]
        for _ in range(hp.dec_layer_num):
            cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size)))
        # [N, T_in, 256]
        decoder_cell = MultiRNNCell(cells, state_is_tuple=True)

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(
            decoder_cell, hp.num_mels * hp.reduction_factor)
        decoder_init_state = output_cell.zero_state(
            batch_size=batch_size, dtype=tf.float32)

        if hp.model_type == "deepvoice":
            # decoder_init_state[0] : AttentionWrapperState
            #   = cell_state + attention + time + alignments + alignment_history
            # decoder_init_state[0][0] = attention_rnn_init_state (already applied)
            decoder_init_state = list(decoder_init_state)
            for idx, cell in enumerate(decoder_rnn_init_states):
                shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                shape2 = cell.get_shape().as_list()
                if shape1 != shape2:
                    raise Exception(
                        " [!] Shape {} and {} should be equal".format(shape1, shape2))
                decoder_init_state[idx + 1] = cell
            decoder_init_state = tuple(decoder_init_state)

        if is_training:
            helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                        hp.reduction_factor,
                                        rnn_decoder_test_mode)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels,
                                    hp.reduction_factor)

        (decoder_outputs, _), final_decoder_state, _ = \
            tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(output_cell, helper, decoder_init_state),
                maximum_iterations=hp.max_iters)

        # [N, T_out, M]
        mel_outputs = tf.reshape(decoder_outputs,
                                 [batch_size, -1, hp.num_mels])

        # Add post-processing CBHG:
        # [N, T_out, 256]
        # post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
        post_outputs = cbhg(
            mel_outputs, None, is_training,
            hp.post_bank_size, hp.post_bank_channel_size,
            hp.post_maxpool_width, hp.post_highway_depth, hp.post_rnn_size,
            hp.post_proj_sizes, hp.post_proj_width,
            scope='post_cbhg')

        if speaker_embed is not None and hp.model_type == 'simple':
            expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
            tiled_speaker_embedding = tf.tile(
                expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])
            # [N, T_out, 256 + alpha]
            post_outputs = tf.concat(
                [tiled_speaker_embedding, post_outputs], axis=-1)

        linear_outputs = tf.layers.dense(post_outputs,
                                         hp.num_freq)  # [N, T_out, F]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(
            final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.speaker_id = speaker_id
        self.input_lengths = input_lengths
        self.loss_coeff = loss_coeff
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.final_decoder_state = final_decoder_state

        log('=' * 40)
        log(' model_type: %s' % hp.model_type)
        log('=' * 40)
        log('Initialized Tacotron model. Dimensions: ')
        log('  embedding:               %d' % char_embedded_inputs.shape[-1])
        if speaker_embed is not None:
            log('  speaker embedding:       %d' % speaker_embed.shape[-1])
        else:
            log('  speaker embedding:       None')
        log('  prenet out:              %d' % prenet_outputs.shape[-1])
        log('  encoder out:             %d' % encoder_outputs.shape[-1])
        log('  attention out:           %d' % attention_cell.output_size)
        log('  concat attn & out:       %d' % concat_cell.output_size)
        log('  decoder cell out:        %d' % decoder_cell.output_size)
        log('  decoder out (%d frames): %d' % (hp.reduction_factor,
                                               decoder_outputs.shape[-1]))
        log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
        log('  postnet out:             %d' % post_outputs.shape[-1])
        log('  linear out:              %d' % linear_outputs.shape[-1])
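# --- Hypothetical helper sketch (illustrative, not from the original source) ---
# get_embed is called above but not defined in this snippet. A plausible
# minimal version is a per-speaker embedding lookup used to produce
# speaker-conditioned initial states; the exact original implementation may
# differ.
def get_embed(ids, vocab_size, dim, name):
    table = tf.get_variable(
        name, [vocab_size, dim], dtype=tf.float32,
        initializer=tf.truncated_normal_initializer(stddev=0.5))
    return tf.nn.embedding_lookup(table, ids)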
def __init__(self, vocab_size, positional_embeddings=False, beam_width=1,
             alignment_history=False):
    """Initialize global variables and the compute graph."""
    # vocabulary parameters
    self.beam_width = beam_width
    self.attention_mode = 0
    self.vocab_size = vocab_size
    self.learning_rate = tf.placeholder(tf.float32)

    # input image
    self.input_image = tf.placeholder(tf.float32, shape=(None, 46, None, 1),
                                      name='img_data')
    self.batch_size = tf.shape(self.input_image)[0]

    # attention branch placeholders
    self.att_label = tf.placeholder(tf.int32, shape=[None, None],
                                    name='att_label')
    self.att_train_length = tf.placeholder(tf.int32, shape=[None],
                                           name='att_train_length')
    # self.eight = tf.constant(8, dtype=tf.int32)

    # ctc branch placeholders
    self.ctc_label = tf.sparse_placeholder(tf.int32, name='ctc_label')
    self.ctc_feature_length = tf.placeholder(tf.int32, shape=[None],
                                             name='ctc_feature_length')
    self.max_dec_iteration = tf.placeholder(tf.int32, shape=[1])

    self.enc_lstm_dim = 256
    self.dec_lstm_dim = 512
    self.embedding_size = 512
    self.ctc_loss_weights = 0.2
    self.att_loss_weights = 1 - self.ctc_loss_weights
    self.wd = 0.00002
    self.momentum = 0.9

    self.embedding = tf.get_variable(
        "embedding", [self.vocab_size, self.embedding_size])
    # NOTE: `mode` is assumed to be defined at module level in the original source.
    self.cnn_out, self.sequence_len = convnet_layers(
        self.input_image, self.ctc_feature_length, mode)
    self.enc_outputs = rnn_layers(self.cnn_out, self.sequence_len,
                                  self.enc_lstm_dim)

    attention_weights_depth = 2 * self.enc_lstm_dim
    attention_layer_size = 2 * self.enc_lstm_dim
    attention_states = tf.reshape(
        self.enc_outputs, [self.batch_size, -1, 2 * self.enc_lstm_dim])
    attention_states_tiled = tile_batch(attention_states,
                                        self.beam_width)  # for generalization
    attention_mechanism = BahdanauAttention(attention_weights_depth,
                                            attention_states_tiled)
    dec_lstm_cell = tf.nn.rnn_cell.LSTMCell(self.dec_lstm_dim)
    self.cell = AttentionWrapper(cell=dec_lstm_cell,
                                 attention_mechanism=attention_mechanism,
                                 attention_layer_size=attention_layer_size,
                                 alignment_history=alignment_history)
    self.setup_decoder()
    self.final_outputs, self.final_state, _ = dynamic_decode(
        self.decoder, maximum_iterations=self.max_dec_iteration[0] - 1)
    self.ctc_loss_branch()
    self.finalize_model()
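# --- Joint loss sketch (illustrative, not from the original source) ---
# The weights above (ctc_loss_weights = 0.2, att_loss_weights = 0.8) suggest a
# jointly trained CTC/attention objective. A minimal sketch of how the two
# branches are commonly combined; `att_logits` and `ctc_logits` are assumed
# names standing in for the logits produced by each branch.
att_loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=self.att_label, logits=att_logits))
ctc_loss = tf.reduce_mean(
    tf.nn.ctc_loss(labels=self.ctc_label,
                   inputs=ctc_logits,  # time-major [T, N, vocab], per tf.nn.ctc_loss
                   sequence_length=self.sequence_len))
total_loss = (self.att_loss_weights * att_loss +
              self.ctc_loss_weights * ctc_loss)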
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
        lengths of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
        number of steps in the output time series, M is num_mels, and values are entries
        in the mel spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out
        is number of steps in the output time series, F is num_freq, and values are
        entries in the linear spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
        is_training = linear_targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), 256], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(embedding_table,
                                                 inputs)  # [N, T_in, 256]

        # Encoder
        prenet_outputs = prenet(embedded_inputs, is_training)  # [N, T_in, 128]
        encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths,
                                       is_training)            # [N, T_in, 256]

        # Attention
        attention_cell = AttentionWrapper(
            DecoderPrenetWrapper(GRUCell(256), is_training),
            BahdanauAttention(256, encoder_outputs),
            alignment_history=True,
            output_attention=False)                            # [N, T_in, 256]

        # Concatenate attention context vector and RNN cell output into a 512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(
            attention_cell)                                    # [N, T_in, 512]

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell(
            [
                OutputProjectionWrapper(concat_cell, 256),
                # ResidualWrapper(GRUCell(256)),
                # ResidualWrapper(GRUCell(256))
                ResidualWrapper(ZoneoutWrapper(LSTMCell(256), 0.1)),
                ResidualWrapper(ZoneoutWrapper(LSTMCell(256), 0.1))
            ],
            state_is_tuple=True)                               # [N, T_in, 256]

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(
            decoder_cell, hp.num_mels * hp.outputs_per_step)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                    dtype=tf.float32)

        if is_training:
            helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                        hp.outputs_per_step)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels,
                                    hp.outputs_per_step)

        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)                   # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry
        mel_outputs = tf.reshape(
            decoder_outputs, [batch_size, -1, hp.num_mels])    # [N, T_out, M]

        # Add post-processing CBHG:
        post_outputs = post_cbhg(mel_outputs, hp.num_mels,
                                 is_training)                  # [N, T_out, 256]
        linear_outputs = tf.layers.dense(post_outputs,
                                         hp.num_freq)          # [N, T_out, F]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(
            final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets

        log('Initialized Tacotron model. Dimensions: ')
        log('  embedding:               %d' % embedded_inputs.shape[-1])
        log('  prenet out:              %d' % prenet_outputs.shape[-1])
        log('  encoder out:             %d' % encoder_outputs.shape[-1])
        log('  attention out:           %d' % attention_cell.output_size)
        log('  concat attn & out:       %d' % concat_cell.output_size)
        log('  decoder cell out:        %d' % decoder_cell.output_size)
        log('  decoder out (%d frames): %d' % (hp.outputs_per_step,
                                               decoder_outputs.shape[-1]))
        log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
        log('  postnet out:             %d' % post_outputs.shape[-1])
        log('  linear out:              %d' % linear_outputs.shape[-1])
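# --- Hypothetical ZoneoutWrapper sketch (illustrative, not from the original source) ---
# ZoneoutWrapper is used above but not defined in this snippet. Zoneout
# (Krueger et al., 2016) stochastically preserves, rather than drops, units of
# the previous state. A minimal single-probability version for tuple-state
# cells; the original wrapper may differ in detail.
class ZoneoutWrapper(tf.nn.rnn_cell.RNNCell):
    def __init__(self, cell, zoneout_prob, is_training=True):
        self._cell = cell
        self._prob = zoneout_prob
        self._is_training = is_training

    @property
    def state_size(self):
        return self._cell.state_size

    @property
    def output_size(self):
        return self._cell.output_size

    def __call__(self, inputs, state, scope=None):
        output, new_state = self._cell(inputs, state, scope)

        def zoneout(new, old):
            if self._is_training:
                # Keep each unit's old value with probability zoneout_prob.
                keep = tf.cast(
                    tf.random_uniform(tf.shape(new)) < self._prob, tf.float32)
                return keep * old + (1.0 - keep) * new
            # At inference, use the expected value of the stochastic update.
            return self._prob * old + (1.0 - self._prob) * new

        zoned_state = tf.contrib.framework.nest.map_structure(
            zoneout, new_state, state)
        return output, zoned_state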
def initialize(self, txt_targets, txt_lengths, mel_targets, image_targets):
    with tf.variable_scope('inference') as scope:
        is_training = mel_targets is not None
        is_teacher_force_generating = mel_targets is not None
        # NOTE: the original used an undefined `inputs` here; txt_targets is
        # the assumed intent.
        batch_size = tf.shape(txt_targets)[0]
        hp = self._hparams

        # Embeddings for text
        embedding_table = tf.get_variable(
            'text_embedding', [len(symbols), hp.embed_depth], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_txt_inputs = tf.nn.embedding_lookup(
            embedding_table, txt_targets)                  # [N, T_in, 256]

        # Text Encoder
        prenet_outputs = prenet(embedded_txt_inputs,
                                is_training)               # [N, T_in, 128]
        txt_encoder_outputs = encoder_cbhg(prenet_outputs, txt_lengths,
                                           is_training)    # [N, T_in, 256]
        self.z_txt = txt_encoder_outputs

        # Speech Encoder
        speech_outputs = reference_encoder(
            mel_targets,
            filters=hp.reference_filters,
            kernel_size=(3, 3),
            strides=(2, 2),
            encoder_cell=GRUCell(hp.reference_depth),
            is_training=is_training)                       # [N, 256]
        self.z_speech = speech_outputs

        # Image Encoder
        img_outputs = image_encoder('E', is_training=is_training,
                                    norm='batch', image_size=128)
        self.z_img = img_outputs

        def global_body(self, input):
            # Global computing body (shared weights):
            # information fusion encoder
            self.z_fuse = info_encoder(input)              # [N, 1, 256]

            # Global tokens (GST)
            gst_tokens = tf.get_variable(
                'global_tokens',
                [hp.num_gst, hp.embed_depth // hp.num_heads],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            self.gst_tokens = gst_tokens

            # Attention
            attention = MultiheadAttention(
                tf.expand_dims(self.z_fuse, axis=1),       # [N, 1, 256]
                tf.tanh(tf.tile(tf.expand_dims(gst_tokens, axis=0),
                                [batch_size, 1, 1])),      # [N, hp.num_gst, 256/hp.num_heads]
                num_heads=hp.num_heads,
                num_units=hp.style_att_dim,
                attention_type=hp.style_att_type)
            output = attention.multi_head_attention()      # [N, 1, 256]
            self.uni_embedding = output
            return self.uni_embedding

        # Domain classification network
        # NOTE: the original passed the encoded tensor after the keyword
        # arguments, which is invalid Python; the positional order used here
        # is assumed.
        domain_logit_txt = domain_classifier(
            'D', info_encoder(self.z_txt), is_training=is_training, norm='batch')
        domain_logit_img = domain_classifier(
            'D', info_encoder(self.z_img), is_training=is_training, norm='batch')
        domain_logit_speech = domain_classifier(
            'D', info_encoder(self.z_speech), is_training=is_training, norm='batch')

    # out of inference scope
    # Add style embedding to every text encoder state
    # NOTE: `uni_embeddings` below is presumably self.uni_embedding produced by
    # global_body; it is left as in the original.

    # Text Decoder scope
    with tf.variable_scope('text_decoder') as scope:
        attention_cell = AttentionWrapper(
            GRUCell(hp.attention_depth),
            BahdanauAttention(hp.attention_depth, uni_embeddings,
                              memory_sequence_length=txt_lengths),
            alignment_history=True,
            output_attention=False)                        # [N, T_in, 256]

        # Concatenate attention context vector and RNN cell output.
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell([
            OutputProjectionWrapper(concat_cell, hp.rnn_depth),
            ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1)),
            ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1))
        ], state_is_tuple=True)                            # [N, T_in, 256]

        output_cell = OutputProjectionWrapper(decoder_cell, hp.outputs_per_step)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                    dtype=tf.float32)
        # NOTE: the original called tf.nn.dynamic_rnn with no `inputs` and a
        # nonexistent `maximum_iterations` argument; teacher-forced text
        # embeddings are an assumed stand-in here.
        decoder_outputs, _ = tf.nn.dynamic_rnn(
            cell=output_cell,
            inputs=embedded_txt_inputs,
            initial_state=decoder_init_state,
            dtype=tf.float32)                              # [N, T_out/r, M*r]

    with tf.variable_scope('text_logits') as scope:
        txt_logit = tf.contrib.layers.fully_connected(
            inputs=decoder_outputs,
            num_outputs=self.config.vocab_size,
            activation_fn=None,
            weights_initializer=self.initializer,
            scope=scope)

    # Image Decoder scope
    with tf.variable_scope('image_decoder') as scope:
        G = Generator('G', is_train=self.is_training, norm='batch',
                      image_size=128)
        fake_img = G(uni_embeddings)

    # Speech Decoder scope
    with tf.variable_scope('speech_decoder') as scope:
        # Attention
        attention_cell = AttentionWrapper(
            GRUCell(hp.attention_depth),
            BahdanauAttention(hp.attention_depth, uni_embeddings,
                              memory_sequence_length=txt_lengths),
            alignment_history=True,
            output_attention=False)                        # [N, T_in, 256]

        # Concatenate attention context vector and RNN cell output.
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell([
            OutputProjectionWrapper(concat_cell, hp.rnn_depth),
            ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1)),
            ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1))
        ], state_is_tuple=True)                            # [N, T_in, 256]

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(
            decoder_cell, hp.num_mels * hp.outputs_per_step)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                    dtype=tf.float32)

        if is_training:
            helper = TacoTrainingHelper(txt_targets, mel_targets, hp)
        # (no inference-time helper is defined in this snippet)

        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)               # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry
        fake_mel = tf.reshape(decoder_outputs,
                              [batch_size, -1, hp.num_mels])  # [N, T_out, M]

    self.txt_targets = txt_targets
    self.txt_lengths = txt_lengths
    self.mel_targets = mel_targets
    self.image_targets = image_targets
    self.txt_logit = txt_logit
    self.fake_mel = fake_mel
    self.fake_img = fake_img