def fnet(self, mel, is_training=True, reuse=None):
    """Build the mel-to-PPG classifier graph plus a CTC beam-search decode.

    Args:
        mel: Input features. They are summed over axis 2 to find non-zero
            frames, so a rank-3 tensor is assumed (presumably
            (N, T, n_mels) -- TODO confirm against the caller).
        is_training: Forwarded to ``prenet`` and ``cbhg``; also used as the
            ``trainable`` flag of the final projection layer.
        reuse: Forwarded to every layer for variable sharing.

    Returns:
        Tuple ``(mid, logits, ppgs, preds, decoded)``: the output of the
        first CBHG, the projection logits, their temperature-scaled
        softmax, the per-frame argmax, and the densified top beam of the
        CTC beam search.
    """
    half_units = hp.hidden_units // 2

    prenet_out = prenet(mel,
                        num_units=[hp.hidden_units, half_units],
                        dropout_rate=hp.dropout_rate,
                        is_training=is_training,
                        reuse=reuse)  # (N, T, E/2)

    # First CBHG: its output is returned unchanged as `mid`.
    mid, _ = cbhg(prenet_out, hp.num_banks, half_units,
                  hp.num_highway_blocks, hp.norm_type, is_training,
                  scope="fnet_cbhg1", reuse=reuse)

    # NOTE(review): the second CBHG also consumes `prenet_out`, not `mid`,
    # so the two CBHGs run side by side rather than stacked -- confirm this
    # is intended.
    out, _ = cbhg(prenet_out, hp.num_banks, half_units,
                  hp.num_highway_blocks, hp.norm_type, is_training,
                  scope="fnet_cbhg2", reuse=reuse)

    # Final linear projection
    logits = tf.layers.dense(out, hp.len_chinese_ppgs,
                             trainable=is_training, reuse=reuse)  # (N, T, V)
    ppgs = tf.nn.softmax(logits / hp.t, name='ppgs')  # (N, T, V)
    preds = tf.to_int32(tf.argmax(logits, axis=-1))  # (N, T)

    # The CTC decoder is fed time-major logits.
    time_major_logits = tf.transpose(logits, perm=[1, 0, 2])

    # Sequence length = number of frames whose feature sum is non-zero.
    frame_sum = tf.reduce_sum(mel, reduction_indices=2)
    nonzero_frames = tf.cast(tf.not_equal(frame_sum, 0.), tf.int32)
    sequence_len = tf.reduce_sum(nonzero_frames, reduction_indices=1)

    beams, _ = tf.nn.ctc_beam_search_decoder(time_major_logits, sequence_len,
                                             merge_repeated=False)
    top_beam = beams[0]
    decoded = tf.sparse_to_dense(top_beam.indices,
                                 top_beam.dense_shape,
                                 top_beam.values)

    return mid, logits, ppgs, preds, decoded
def network(self, ppgs, is_training):
    """Build the PPG -> mel -> linear-spectrogram prediction graph.

    Args:
        ppgs: Input tensor fed to the pre-net.
        is_training: Forwarded to ``prenet`` and ``cbhg``.

    Returns:
        Tuple ``(pred_spec, pred_mel)``. ``pred_mel`` is projected to the
        last dimension of ``self.y_mel`` and ``pred_spec`` to the last
        dimension of ``self.y_spec``.
    """
    cfg = hp.Train2
    half_units = cfg.hidden_units // 2

    # Pre-net
    prenet_out = prenet(ppgs,
                        num_units=[cfg.hidden_units, half_units],
                        dropout_rate=cfg.dropout_rate,
                        is_training=is_training)  # (N, T, E/2)

    # CBHG1: mel-scale
    mel_hidden = cbhg(prenet_out, cfg.num_banks, half_units,
                      cfg.num_highway_blocks, cfg.norm_type, is_training,
                      scope="cbhg_mel")
    pred_mel = tf.layers.dense(mel_hidden, self.y_mel.shape[-1],
                               name='pred_mel')  # (N, T, n_mels)

    # CBHG2: linear-scale, driven by the predicted mel.
    spec_hidden = tf.layers.dense(pred_mel, half_units)
    spec_hidden = cbhg(spec_hidden, cfg.num_banks, half_units,
                       cfg.num_highway_blocks, cfg.norm_type, is_training,
                       scope="cbhg_linear")

    # log magnitude: (N, T, 1+n_fft//2)
    pred_spec = tf.layers.dense(spec_hidden, self.y_spec.shape[-1],
                                name='pred_spec')

    return pred_spec, pred_mel
def gnet(self, feature, is_training=True, reuse=None):
    """Build the generator graph: feature -> mel -> spectrogram.

    Args:
        feature: Input tensor fed to the first dense layer.
        is_training: Forwarded to ``prenet`` and ``cbhg``.
        reuse: Forwarded to every layer for variable sharing.

    Returns:
        Tuple ``(g_spec, g_mel)``. ``g_mel`` is projected to the last
        dimension of ``self.x_mel`` and ``g_spec`` to the last dimension of
        ``self.x_spec``.
    """
    units = hp.hidden_units

    # NOTE(review): this dense layer and the unnamed one feeding the second
    # CBHG carry no explicit `name`. With reuse=True they may resolve to the
    # same variable scope -- confirm, and name them if so (naming changes
    # checkpoint variable names).
    hidden = tf.layers.dense(feature, units, reuse=reuse)
    hidden = prenet(hidden,
                    num_units=[units, units],
                    dropout_rate=hp.dropout_rate,
                    is_training=is_training,
                    reuse=reuse)

    # CBHG1: mel-scale
    mel_hidden, _ = cbhg(hidden, hp.num_banks, units,
                         hp.num_highway_blocks, hp.norm_type, is_training,
                         scope="cbhg_gnet_mel", reuse=reuse)
    g_mel = tf.layers.dense(mel_hidden, self.x_mel.shape[-1],
                            name='g_mel', reuse=reuse)  # (N, T, n_mel)

    # CBHG2: spectrogram branch, driven by the generated mel.
    spec_hidden = tf.layers.dense(g_mel, units, reuse=reuse)
    spec_hidden, _ = cbhg(spec_hidden, hp.num_banks, units,
                          hp.num_highway_blocks, hp.norm_type, is_training,
                          scope="cbhg_gnet_spec", reuse=reuse)
    g_spec = tf.layers.dense(spec_hidden, self.x_spec.shape[-1],
                             name='g_spec', reuse=reuse)

    return g_spec, g_mel
def _net1(self):
    """Build the MFCC -> phoneme classifier graph under the 'net1' scope.

    Reads ``self.x_mfcc`` and ``self.is_training``; the output vocabulary
    size is the length of the phoneme-to-index map from ``load_vocab()``.

    Returns:
        Tuple ``(ppgs, preds, logits)``: the temperature-scaled softmax of
        the logits, the int32 argmax over the last axis, and the raw logits.
    """
    with tf.variable_scope('net1'):
        # Load vocabulary (only the phoneme -> index map is needed here).
        phn2idx, _ = load_vocab()

        # Pre-net
        prenet_out = prenet(self.x_mfcc,
                            num_units=[hp.Train1.hidden_units,
                                       hp.Train1.hidden_units // 2],
                            dropout_rate=hp.Train1.dropout_rate,
                            is_training=self.is_training)  # (N, T, E/2)

        # CBHG
        out = cbhg(prenet_out, hp.Train1.num_banks,
                   hp.Train1.hidden_units // 2,
                   hp.Train1.num_highway_blocks, hp.Train1.norm_type,
                   self.is_training)

        # Final linear projection
        logits = tf.layers.dense(out, len(phn2idx))  # (N, T, V)
        ppgs = tf.nn.softmax(logits / hp.Train1.t)  # (N, T, V)
        # tf.arg_max / `dimension` are deprecated aliases of
        # tf.argmax / `axis`; the result is the same.
        preds = tf.to_int32(tf.argmax(logits, axis=-1))  # (N, T)

    return ppgs, preds, logits
def __init__(self, hyperparams, is_training, inputs, input_lengths):
    """Build the encoder graph: embedding lookup -> pre-net -> CBHG.

    Args:
        hyperparams: Object supplying ``num_symbols``, ``embedding_size``,
            ``dropout_prob`` and the ``enc_*`` settings read below.
        is_training: Forwarded to ``modules.prenet`` and ``modules.cbhg``.
        inputs: Symbol ids to embed, (batch, max_input_length).
        input_lengths: Per-example lengths, (batch); forwarded to
            ``modules.cbhg``.

    The CBHG output is stored on ``self.encoder_outputs``.
    """
    cfg = hyperparams

    # Embeddings
    embedding_table = tf.get_variable(
        'embedding',
        [cfg.num_symbols, cfg.embedding_size],
        dtype=tf.float32,
        initializer=tf.truncated_normal_initializer(stddev=0.5))

    # [N, T_in, embedding_size]
    embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

    # [N, T_in, enc_prenet_sizes[-1]]
    prenet_outputs = modules.prenet(embedded_inputs,
                                    is_training,
                                    layer_sizes=cfg.enc_prenet_sizes,
                                    drop_prob=cfg.dropout_prob,
                                    scope='prenet')

    self.encoder_outputs = modules.cbhg(prenet_outputs,
                                        input_lengths,
                                        is_training,
                                        cfg.enc_bank_size,
                                        cfg.enc_bank_channel_size,
                                        cfg.enc_maxpool_width,
                                        cfg.enc_highway_depth,
                                        cfg.enc_rnn_size,
                                        cfg.enc_proj_sizes,
                                        cfg.enc_proj_width,
                                        scope="encoder_cbhg")
def network(self, ppgs, is_training):
    """Build the PPG -> mixture-parameter graph for the spectrogram.

    A single dense layer emits ``3 * n_bins * n_mixtures`` values per frame,
    which are split into three equal parts and reshaped to
    (-1, n_timesteps, n_bins, n_mixtures).

    Args:
        ppgs: Input tensor fed to the pre-net.
        is_training: Forwarded to ``prenet`` and ``cbhg``. The ``normalize``
            call instead reads ``get_current_tower_context().is_training``.

    Returns:
        Tuple ``(mu, log_var, log_pi)``: the sigmoid of the first part, the
        second part floored at -7.0, and the log-softmax of the normalized
        third part.
    """
    cfg = hp.train2
    half_units = cfg.hidden_units // 2

    # Pre-net
    prenet_out = prenet(ppgs,
                        num_units=[cfg.hidden_units, half_units],
                        dropout_rate=cfg.dropout_rate,
                        is_training=is_training)  # (N, T, E/2)

    # CBHG1: mel-scale (disabled; the pre-net output is passed straight on)
    # pred_mel = cbhg(prenet_out, hp.train2.num_banks, hp.train2.hidden_units // 2,
    #                 hp.train2.num_highway_blocks, hp.train2.norm_type, is_training,
    #                 scope="cbhg_mel")
    # pred_mel = tf.layers.dense(pred_mel, self.y_mel.shape[-1])  # (N, T, n_mels)
    pred_mel = prenet_out

    # CBHG2: linear-scale
    hidden = tf.layers.dense(pred_mel, half_units)
    hidden = cbhg(hidden, cfg.num_banks, half_units,
                  cfg.num_highway_blocks, cfg.norm_type, is_training,
                  scope="cbhg_linear")

    # NOTE(review): n_timesteps comes from the static shape of self.y_spec
    # and is used in the reshapes below -- presumably the time dimension is
    # always statically known; confirm.
    _, n_timesteps, n_bins = self.y_spec.get_shape().as_list()
    n_mix = cfg.n_mixtures
    n_units = n_bins * n_mix
    mixture_shape = (-1, n_timesteps, n_bins, n_mix)

    params = tf.layers.dense(
        hidden,
        n_units * 3,
        bias_initializer=tf.random_uniform_initializer(minval=-3.,
                                                       maxval=3.))

    # (N, T, 1+hp.n_fft//2, n_mixtures)
    mu = tf.reshape(tf.nn.sigmoid(params[..., :n_units]),
                    shape=mixture_shape)

    # (N, T, 1+hp.n_fft//2, n_mixtures); values are floored at -7.0.
    log_var = tf.reshape(
        tf.maximum(params[..., n_units:2 * n_units], -7.0),
        shape=mixture_shape)

    # (N, T, 1+hp.n_fft//2, n_mixtures)
    log_pi = tf.reshape(params[..., 2 * n_units:3 * n_units],
                        shape=mixture_shape)
    log_pi = normalize(log_pi,
                       type='ins',
                       is_training=get_current_tower_context().is_training,
                       scope='normalize_pi')
    log_pi = tf.nn.log_softmax(log_pi)

    return mu, log_var, log_pi
def _net2(self):
    """Build net1, then the PPG -> mel -> spectrogram graph under 'net2'.

    Hyper-parameters are read from ``self.hparams.Train2``; the training
    flag comes from ``self.is_training``.

    Returns:
        Tuple ``(ppgs, preds_ppg, logits_ppg, pred_spec, pred_mel)``. The
        first three come straight from ``self._net1()``. ``pred_mel`` and
        ``pred_spec`` are projected to the last dimension of ``self.y_mel``
        and ``self.y_spec`` respectively.
    """
    # PPGs from net1
    ppgs, preds_ppg, logits_ppg = self._net1()

    with tf.variable_scope('net2'):
        cfg = self.hparams.Train2
        half_units = cfg.hidden_units // 2

        # Pre-net
        prenet_out = prenet(ppgs,
                            num_units=[cfg.hidden_units, half_units],
                            dropout_rate=cfg.dropout_rate,
                            is_training=self.is_training)  # (N, T, E/2)

        # CBHG1: mel-scale
        mel_hidden = cbhg(prenet_out, cfg.num_banks, half_units,
                          cfg.num_highway_blocks, cfg.norm_type,
                          self.is_training, scope="cbhg1")
        # log magnitude: (N, T, n_mels)
        pred_mel = tf.layers.dense(mel_hidden, self.y_mel.shape[-1])

        # CBHG2: linear-scale, driven by the predicted mel.
        spec_hidden = tf.layers.dense(pred_mel, half_units)
        spec_hidden = cbhg(spec_hidden, cfg.num_banks, half_units,
                           cfg.num_highway_blocks, cfg.norm_type,
                           self.is_training, scope="cbhg2")
        # log magnitude: (N, T, 1+self.hparams.n_fft//2)
        pred_spec = tf.layers.dense(spec_hidden, self.y_spec.shape[-1])

    return ppgs, preds_ppg, logits_ppg, pred_spec, pred_mel
def _net2(self):
    """Build net1, then the PPG -> mel -> spectrogram graph under 'net2'.

    Hyper-parameters are read from ``hp.Train2``; the training flag comes
    from ``self.is_training``.

    Returns:
        Tuple ``(ppgs, preds_ppg, logits_ppg, pred_spec, pred_mel)``. The
        first three come straight from ``self._net1()``. ``pred_mel`` and
        ``pred_spec`` are projected to the last dimension of ``self.y_mel``
        and ``self.y_spec`` respectively.
    """
    # PPGs from net1
    ppgs, preds_ppg, logits_ppg = self._net1()

    with tf.variable_scope('net2'):
        train2 = hp.Train2
        half_units = train2.hidden_units // 2
        training = self.is_training

        # Pre-net
        prenet_out = prenet(ppgs,
                            num_units=[train2.hidden_units, half_units],
                            dropout_rate=train2.dropout_rate,
                            is_training=training)  # (N, T, E/2)

        # CBHG1: mel-scale
        mel_features = cbhg(prenet_out, train2.num_banks, half_units,
                            train2.num_highway_blocks, train2.norm_type,
                            training, scope="cbhg1")
        # log magnitude: (N, T, n_mels)
        pred_mel = tf.layers.dense(mel_features, self.y_mel.shape[-1])

        # CBHG2: linear-scale, driven by the predicted mel.
        spec_features = tf.layers.dense(pred_mel, half_units)
        spec_features = cbhg(spec_features, train2.num_banks, half_units,
                             train2.num_highway_blocks, train2.norm_type,
                             training, scope="cbhg2")
        # log magnitude: (N, T, 1+hp.n_fft//2)
        pred_spec = tf.layers.dense(spec_features, self.y_spec.shape[-1])

    return ppgs, preds_ppg, logits_ppg, pred_spec, pred_mel
def network(self, x_mfcc, is_training):
    """Build the MFCC -> phoneme classifier graph.

    Args:
        x_mfcc: Input tensor fed to the pre-net.
        is_training: Forwarded to ``prenet`` and ``cbhg``.

    Returns:
        Tuple ``(ppgs, preds, logits)``: the temperature-scaled softmax of
        the logits (op name 'ppgs'), the int32 argmax over the last axis,
        and the raw logits. The projection width is ``len(phns)``.
    """
    cfg = hp.train1
    half_units = cfg.hidden_units // 2

    # Pre-net
    prenet_out = prenet(x_mfcc,
                        num_units=[cfg.hidden_units, half_units],
                        dropout_rate=cfg.dropout_rate,
                        is_training=is_training)  # (N, T, E/2)

    # CBHG
    encoded = cbhg(prenet_out, cfg.num_banks, half_units,
                   cfg.num_highway_blocks, cfg.norm_type, is_training)

    # Final linear projection
    logits = tf.layers.dense(encoded, len(phns))  # (N, T, V)
    ppgs = tf.nn.softmax(logits / cfg.t, name='ppgs')  # (N, T, V)
    preds = tf.to_int32(tf.argmax(logits, axis=-1))  # (N, T)

    return ppgs, preds, logits
def test_cbhg():
    """Check that CBHG maps (batch, time, in_channels) to (batch, time, 2 * gru_units).

    The module and its input are moved to the GPU when CUDA is available.
    On CPU-only hosts the same check runs on the CPU instead of failing in
    ``.cuda()``.
    """
    batch_size = 32
    # number of output features of pre-net
    in_channels = 128
    time_steps = 15

    bank_k = 16
    bank_ck = 128
    proj_dims = (128, 128)
    highway_layers = 4
    highway_units = 128
    gru_units = 128

    inp = Variable(torch.ones(batch_size, time_steps, in_channels))
    cbhg = CBHG(in_channels, bank_k, bank_ck, proj_dims, highway_layers,
                highway_units, gru_units)

    # Only touch CUDA when a device is actually present.
    if torch.cuda.is_available():
        inp = inp.cuda()
        cbhg = cbhg.cuda()

    out = cbhg(inp)

    assert out.size() == (batch_size, time_steps, 2 * gru_units)
def _net1(self):
    """Build the MFCC -> phoneme classifier graph under the 'net1' scope.

    Reads ``self.x_mfcc`` and ``self.is_training``; the projection width is
    the length of the phoneme-to-index map returned by ``load_vocab()``.

    Returns:
        Tuple ``(ppgs, preds, logits)``: the temperature-scaled softmax of
        the logits, the int32 argmax over the last axis, and the raw logits.
    """
    with tf.variable_scope('net1'):
        # Load vocabulary; the index -> phoneme map is not used here.
        phn2idx, _ = load_vocab()

        train1 = hp.Train1
        half_units = train1.hidden_units // 2

        # Pre-net
        prenet_out = prenet(self.x_mfcc,
                            num_units=[train1.hidden_units, half_units],
                            dropout_rate=train1.dropout_rate,
                            is_training=self.is_training)  # (N, T, E/2)

        # CBHG
        out = cbhg(prenet_out, train1.num_banks, half_units,
                   train1.num_highway_blocks, train1.norm_type,
                   self.is_training)

        # Final linear projection
        logits = tf.layers.dense(out, len(phn2idx))  # (N, T, V)
        ppgs = tf.nn.softmax(logits / train1.t)  # (N, T, V)
        # tf.argmax(axis=...) replaces the deprecated
        # tf.arg_max(dimension=...); the computed op is the same.
        preds = tf.to_int32(tf.argmax(logits, axis=-1))  # (N, T)

    return ppgs, preds, logits
def __init__(self, hyperparams, is_training, encoder_outputs,
             mel_targets=None):
    """Build the attention decoder and the post-processing CBHG.

    Args:
        hyperparams: Object supplying the ``attention_*``, ``dec_*``,
            ``post_*``, ``num_mels``, ``num_freq``, ``reduction_factor``,
            ``dropout_prob`` and ``max_iters`` settings read below.
        is_training: Python bool. When true the decoder is unrolled with
            ``tf.nn.dynamic_rnn`` over ``mel_targets``; otherwise it is run
            with ``dynamic_decode`` and a ``TacoTestHelper``.
        encoder_outputs: (batch, max_sentence_length, enc_rnn_size * 2).
        mel_targets: (batch, max_sample_length, num_mels); only read when
            ``is_training`` is true.

    Sets ``self.alignments``, ``self.mel_outputs`` and
    ``self.linear_outputs``; the inference branch also sets
    ``self.final_state``.
    """
    cfg = hyperparams
    batch_size = tf.shape(encoder_outputs)[0]

    # GRU = tf.contrib.cudnn_rnn.CudnnCompatibleGRUCell
    GRU = tf.contrib.rnn.GRUCell

    # Attention RNN: pre-net-wrapped GRU with monotonic Bahdanau attention.
    dec_prenet = modules.DecoderPrenetWrapper(
        GRU(cfg.attention_state_size),
        is_training,
        cfg.dec_prenet_sizes,
        cfg.dropout_prob)
    attention_mechanism = tf.contrib.seq2seq.BahdanauMonotonicAttention(
        cfg.attention_size,
        encoder_outputs,
        normalize=True,
        score_bias_init=4.)
    attention_cell = tf.contrib.seq2seq.AttentionWrapper(
        dec_prenet,
        attention_mechanism,
        alignment_history=True,
        output_attention=False)

    # Concatenate attention context vector and RNN cell output.
    # [N, T_in, attention_size+attention_state_size]
    concat_cell = modules.ConcatOutputAndAttentionWrapper(attention_cell)

    # Decoder GRU stack: the first layer is plain, later layers are
    # residual.
    decoder_layers = [
        GRU(cfg.dec_rnn_size) if layer_index == 0
        else tf.contrib.rnn.ResidualWrapper(GRU(cfg.dec_rnn_size))
        for layer_index in range(cfg.dec_layer_num)
    ]

    # [N, T_in, 256]
    decoder_cell = tf.contrib.rnn.MultiRNNCell(
        [concat_cell] + decoder_layers, state_is_tuple=True)

    # Linear projection applied to the GRU stack output.
    proj_input_size = cfg.dec_rnn_size
    proj_output_size = cfg.num_mels * cfg.reduction_factor
    decoder_proj_weights = tf.get_variable(
        'decoder_proj_weights',
        shape=[proj_input_size, proj_output_size],
        initializer=tf.contrib.layers.xavier_initializer())

    if is_training:
        # Training model: feed strided target frames for speed.
        r = cfg.reduction_factor
        # NOTE(review): for r == 1 the stop index (-r + 1) is 0, which makes
        # this slice empty -- confirm reduction_factor > 1 is always used.
        strided_targets = mel_targets[:, r - 1:-r + 1:r]
        # Prepend one all-zero frame along the time axis.
        pre_padded_mel = tf.pad(strided_targets,
                                [[0, 0], [1, 0], [0, 0]])
        gru_outputs, states = tf.nn.dynamic_rnn(decoder_cell,
                                                pre_padded_mel,
                                                dtype=tf.float32,
                                                swap_memory=True,
                                                scope='rnn')
        flat_outputs = tf.reshape(gru_outputs, (-1, cfg.dec_rnn_size))
        decoder_outputs = tf.matmul(flat_outputs, decoder_proj_weights)

        # Grab alignments (N, T_out, T_in)
        alignment_history = states[0].alignment_history
    else:
        # Synthesis model for inference
        proj_decoder_cell = modules.OutputProjectionWrapper(
            decoder_cell, decoder_proj_weights)
        helper = modules.TacoTestHelper(batch_size,
                                        cfg.num_mels,
                                        cfg.reduction_factor)
        decoder_init_state = proj_decoder_cell.zero_state(
            batch_size=batch_size, dtype=tf.float32)
        decoder = tf.contrib.seq2seq.BasicDecoder(proj_decoder_cell,
                                                  helper,
                                                  decoder_init_state)
        (decoder_outputs, _), self.final_state, _ = \
            tf.contrib.seq2seq.dynamic_decode(
                decoder,
                maximum_iterations=cfg.max_iters,
                swap_memory=True,
                scope='rnn')

        # Grab alignments from the final decoder state:
        alignment_history = self.final_state[0].alignment_history

    self.alignments = tf.transpose(alignment_history.stack(), (1, 0, 2))

    # [N, T_out, M]
    self.mel_outputs = tf.reshape(decoder_outputs,
                                  [batch_size, -1, cfg.num_mels])

    # Add post-processing CBHG:
    # [N, T_out, 256]
    post_outputs = modules.cbhg(self.mel_outputs,
                                None,
                                is_training,
                                cfg.post_bank_size,
                                cfg.post_bank_channel_size,
                                cfg.post_maxpool_width,
                                cfg.post_highway_depth,
                                cfg.post_rnn_size,
                                cfg.post_proj_sizes,
                                cfg.post_proj_width,
                                scope='post_cbhg')

    # [N, T_out, F]
    self.linear_outputs = tf.layers.dense(post_outputs, cfg.num_freq)
def __init__(self,
             inp,
             inp_mask,
             decode_time_steps,
             hyper_params=None,
             name='Tacotron'):
    """
    Build the computational graph.

    The graph is: character embedding -> encoder pre-net -> encoder CBHG ->
    attention decoder unrolled with ``tf.while_loop`` -> post-net CBHG ->
    linear projection. Results are stored on ``self.seq2seq_output``,
    ``self.alpha_output`` and ``self.post_output``.

    :param inp: Input ids fed to the embedding layer; dimension 0 is used as
        the batch size and dimension 1 as the number of input time steps.
    :param inp_mask: Passed as ``sequence_length`` to the encoder CBHG and
        to the attention module (presumably per-example input lengths --
        TODO confirm).
    :param decode_time_steps: Number of decoder loop iterations; each
        iteration emits ``reduction_rate`` output frames.
    :param hyper_params: Hyper-parameter object; a default ``HyperParams()``
        is created when None.
    :param name: Name passed to the parent constructor and used as the
        enclosing variable scope.
    """
    super(Tacotron, self).__init__(name)
    self.hyper_params = HyperParams(
    ) if hyper_params is None else hyper_params

    # NOTE(review): confirm which statements belong to each variable_scope
    # below; the nesting determines variable names.
    with tf.variable_scope(name):
        self.global_step = tf.Variable(0,
                                       name='global_step',
                                       trainable=False)

        batch_size = tf.shape(inp)[0]
        input_time_steps = tf.shape(inp)[1]
        reduc = self.hyper_params.reduction_rate
        # Total number of output frames produced by the decoder.
        output_time_steps = decode_time_steps * reduc

        ### Encoder [begin]
        with tf.variable_scope('character_embedding'):
            embed_inp = EmbeddingLayer(self.hyper_params.embed_class,
                                       self.hyper_params.embed_dim)(inp)
        with tf.variable_scope('encoder_pre_net'):
            # Dropout is created with training=False, so it is a no-op here.
            pre_ed_inp = tf.layers.dropout(tf.layers.dense(
                embed_inp, 256, tf.nn.relu),
                                           training=False)
            pre_ed_inp = tf.layers.dropout(tf.layers.dense(
                pre_ed_inp, 128, tf.nn.relu),
                                           training=False)
        encoder_output = modules.cbhg(pre_ed_inp,
                                      training=False,
                                      k=16,
                                      bank_filters=128,
                                      projection_filters=(128, 128),
                                      highway_layers=4,
                                      highway_units=128,
                                      bi_gru_units=128,
                                      sequence_length=inp_mask,
                                      name='encoder_cbhg',
                                      reuse=False)
        ### Encoder [end]

        ### Attention Module
        with tf.variable_scope('attention'):
            att_module = AttentionModule(256,
                                         encoder_output,
                                         sequence_length=inp_mask,
                                         time_major=False)

        ### Decoder [begin]
        att_cell = GRUCell(256)
        dec_cell = MultiRNNCell(
            [ResidualWrapper(GRUCell(256)) for _ in range(2)])

        # Initial loop state: cell states, output/alpha TensorArrays, the
        # all-zero "go" frame and the all-zero initial context.
        with tf.variable_scope('prepare_decode'):
            # prepare output alpha TensorArray
            # Equals decode_time_steps, since output_time_steps is
            # decode_time_steps * reduc.
            reduced_time_steps = tf.div(output_time_steps, reduc)
            init_att_cell_state = att_cell.zero_state(
                batch_size, tf.float32)
            init_dec_cell_state = dec_cell.zero_state(
                batch_size, tf.float32)
            init_state_tup = tuple(
                [init_att_cell_state, init_dec_cell_state])
            init_output_ta = tf.TensorArray(size=reduced_time_steps,
                                            dtype=tf.float32)
            init_alpha_ta = tf.TensorArray(size=reduced_time_steps,
                                           dtype=tf.float32)
            go_array = tf.zeros(
                [batch_size, self.hyper_params.seq2seq_dim],
                dtype=tf.float32)
            init_context = tf.zeros([batch_size, 256], dtype=tf.float32)
            init_time = tf.constant(0, dtype=tf.int32)

        # Loop until `reduced_time_steps` iterations have run; only the
        # first loop variable (the step counter) is inspected.
        cond = lambda x, *_: tf.less(x, reduced_time_steps)

        def body(this_time, old_output_ta, old_alpha_ta, old_state_tup,
                 last_context, last_output):
            # One decoder step: pre-net -> attention RNN -> attention ->
            # decoder RNN -> projection to `reduc` frames.
            with tf.variable_scope('decoder_pre_net'):
                dec_pre_ed_inp = last_output
                dec_pre_ed_inp = tf.layers.dropout(tf.layers.dense(
                    dec_pre_ed_inp, 256, tf.nn.relu),
                                                   training=False)
                dec_pre_ed_inp = tf.layers.dropout(tf.layers.dense(
                    dec_pre_ed_inp, 128, tf.nn.relu),
                                                   training=False)

            with tf.variable_scope('attention_rnn'):
                att_cell_inp = tf.concat([last_context, dec_pre_ed_inp],
                                         axis=-1)
                att_cell_out, att_cell_state = att_cell(
                    att_cell_inp, old_state_tup[0])

            with tf.variable_scope('attention'):
                # NOTE(review): if GRUCell returns its state as a single
                # (batch, units) tensor, `[0]` selects the first batch row
                # rather than a state component -- confirm.
                query = att_cell_state[0]
                context, alpha = att_module(query)
                new_alpha_ta = old_alpha_ta.write(this_time, alpha)

            with tf.variable_scope('decoder_rnn'):
                dec_input = tf.layers.dense(
                    tf.concat([att_cell_out, context], axis=-1), 256)
                dec_cell_out, dec_cell_state = dec_cell(
                    dec_input, old_state_tup[1])
                # One projection emits `reduc` frames of seq2seq_dim each.
                dense_out = tf.layers.dense(
                    dec_cell_out, self.hyper_params.seq2seq_dim * reduc)
                new_output_ta = old_output_ta.write(this_time, dense_out)

            # Only the last seq2seq_dim values (the final frame of this
            # step) are fed back as the next step's input.
            new_output = dense_out[:, -self.hyper_params.seq2seq_dim:]
            new_state_tup = tuple([att_cell_state, dec_cell_state])
            return tf.add(
                this_time, 1
            ), new_output_ta, new_alpha_ta, new_state_tup, context, new_output

        # run loop
        _, seq2seq_output_ta, alpha_ta, *_ = tf.while_loop(
            cond, body, [
                init_time, init_output_ta, init_alpha_ta, init_state_tup,
                init_context, go_array
            ])

        with tf.variable_scope('reshape_decode'):
            # Stacked outputs are time-major; make them batch-major and
            # unfold each step's `reduc` frames along the time axis.
            seq2seq_output = tf.reshape(
                seq2seq_output_ta.stack(),
                shape=(reduced_time_steps, batch_size,
                       self.hyper_params.seq2seq_dim * reduc))
            seq2seq_output = tf.reshape(
                tf.transpose(seq2seq_output, perm=(1, 0, 2)),
                shape=(batch_size, output_time_steps,
                       self.hyper_params.seq2seq_dim))
            self.seq2seq_output = seq2seq_output

            # Attention weights: batch-major with a trailing singleton axis.
            alpha_output = tf.reshape(alpha_ta.stack(),
                                      shape=(reduced_time_steps, batch_size,
                                             input_time_steps))
            alpha_output = tf.expand_dims(
                tf.transpose(alpha_output, perm=(1, 0, 2)), -1)
            self.alpha_output = alpha_output
        ### Decoder [end]

        ### PostNet [begin]
        post_output = modules.cbhg(
            seq2seq_output,
            training=False,
            k=8,
            bank_filters=128,
            projection_filters=(256, self.hyper_params.seq2seq_dim),
            highway_layers=4,
            highway_units=128,
            bi_gru_units=128,
            sequence_length=None,
            name='decoder_cbhg',
            reuse=False)
        post_output = tf.layers.dense(post_output,
                                      self.hyper_params.post_dim,
                                      name='post_linear_transform')
        self.post_output = post_output