def __init__(self, hyperparams, is_training, inputs, input_lengths):
    """Build the encoder graph: symbol embedding -> prenet -> CBHG.

    Args:
        hyperparams: Object exposing ``num_symbols``, ``embedding_size``,
            ``dropout_prob`` and the ``enc_*`` settings read below.
        is_training: Forwarded to ``modules.prenet`` and ``modules.cbhg``.
        inputs: Symbol ids used for the embedding lookup; per the original
            notes, shaped (batch, max_input_length).
        input_lengths: Per the original notes, shaped (batch); forwarded
            to ``modules.cbhg``.

    The CBHG output is stored on ``self.encoder_outputs``.
    """
    embedding_shape = [hyperparams.num_symbols, hyperparams.embedding_size]
    embedding_table = tf.get_variable(
        'embedding',
        embedding_shape,
        dtype=tf.float32,
        initializer=tf.truncated_normal_initializer(stddev=0.5))

    # [N, T_in, embedding_size]
    embedded = tf.nn.embedding_lookup(embedding_table, inputs)

    # [N, T_in, enc_prenet_sizes[-1]]
    prenet_kwargs = dict(
        layer_sizes=hyperparams.enc_prenet_sizes,
        drop_prob=hyperparams.dropout_prob,
        scope='prenet')
    prenet_out = modules.prenet(embedded, is_training, **prenet_kwargs)

    # Positional CBHG settings, kept in the order modules.cbhg receives them.
    cbhg_settings = (
        hyperparams.enc_bank_size,
        hyperparams.enc_bank_channel_size,
        hyperparams.enc_maxpool_width,
        hyperparams.enc_highway_depth,
        hyperparams.enc_rnn_size,
        hyperparams.enc_proj_sizes,
        hyperparams.enc_proj_width,
    )
    self.encoder_outputs = modules.cbhg(
        prenet_out,
        input_lengths,
        is_training,
        *cbhg_settings,
        scope="encoder_cbhg")
def decode2(inputs, is_training=True, scope="decoder2", reuse=None):
    '''Post-processing decoder: a prenet followed by a CBHG-style stack.

    Args:
      inputs: A 3d tensor with shape of [N, T', C'], where C'=hp.n_mels*hp.r,
        dtype of float32 (as stated by the original documentation).
      is_training: Whether or not the layers are in training mode.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      The output of a dense layer with (1+hp.n_fft//2)*hp.r units applied
      to the GRU output.
    '''
    with tf.variable_scope(scope, reuse=reuse):
        # Decoder pre-net; its output doubles as the residual shortcut below.
        shortcut = mod.prenet(inputs, is_training=is_training)

        # Conv1D bank, then max pooling (pool size 2, stride 1).
        hidden = mod.conv1d_banks(
            shortcut, K=hp.decoder_num_banks, is_training=is_training)
        hidden = tf.layers.max_pooling1d(hidden, 2, 1, padding="same")

        # Two Conv1D projections; only the first is followed by a ReLU.
        hidden = mod.conv1d(hidden, hp.embed_size, 3, scope="conv1d_1")
        hidden = mod.normalize(
            hidden,
            type=hp.norm_type,
            is_training=is_training,
            activation_fn=tf.nn.relu,
            scope="norm1")
        hidden = mod.conv1d(hidden, hp.embed_size // 2, 3, scope="conv1d_2")
        hidden = mod.normalize(
            hidden,
            type=hp.norm_type,
            is_training=is_training,
            activation_fn=None,
            scope="norm2")

        # Residual connection back to the prenet output.
        hidden = hidden + shortcut

        # Four highway blocks (the depth is fixed in this function).
        for block_index in range(4):
            hidden = mod.highwaynet(
                hidden,
                num_units=hp.embed_size // 2,
                scope='highwaynet_{}'.format(block_index))

        # GRU with the third positional argument set to True.
        hidden = mod.gru(hidden, hp.embed_size // 2, True)

        # Final projection to (1+hp.n_fft//2)*hp.r units.
        projection_units = (1 + hp.n_fft // 2) * hp.r
        outputs = tf.layers.dense(hidden, projection_units)

    return outputs
def fnet(self, mel, is_training=True, reuse=None):
    """Build the mel -> PPG classifier graph and a CTC beam-search decode.

    Args:
        mel: Input features; reduced over axis 2 below, so at least rank 3
            (presumably (N, T, n_mels) -- confirm against the caller).
        is_training: Forwarded to prenet/cbhg and used as ``trainable``
            for the final dense layer.
        reuse: Forwarded to every layer for variable reuse.

    Returns:
        Tuple ``(mid, logits, ppgs, preds, decoded)``: the first CBHG's
        output, the dense-layer logits, the temperature-scaled softmax,
        the argmax predictions as int32, and the densified top beam.
    """
    half_units = hp.hidden_units // 2

    prenet_out = prenet(
        mel,
        num_units=[hp.hidden_units, half_units],
        dropout_rate=hp.dropout_rate,
        is_training=is_training,
        reuse=reuse)

    # First CBHG: its output is only exposed to the caller as `mid`.
    mid, _ = cbhg(
        prenet_out, hp.num_banks, half_units, hp.num_highway_blocks,
        hp.norm_type, is_training, scope="fnet_cbhg1", reuse=reuse)

    # NOTE(review): the second CBHG also consumes the prenet output rather
    # than `mid` -- confirm this is intended.
    cbhg2_out, _ = cbhg(
        prenet_out, hp.num_banks, half_units, hp.num_highway_blocks,
        hp.norm_type, is_training, scope="fnet_cbhg2", reuse=reuse)

    # Final linear projection
    logits = tf.layers.dense(
        cbhg2_out, hp.len_chinese_ppgs, trainable=is_training, reuse=reuse)
    ppgs = tf.nn.softmax(logits / hp.t, name='ppgs')
    preds = tf.to_int32(tf.argmax(logits, axis=-1))

    # The beam-search decoder is fed the logits with axes 0 and 1 swapped.
    swapped_logits = tf.transpose(logits, perm=[1, 0, 2])

    # Sequence length = number of frames whose feature sum is non-zero.
    frame_sums = tf.reduce_sum(mel, reduction_indices=2)
    nonzero_mask = tf.cast(tf.not_equal(frame_sums, 0.), tf.int32)
    sequence_len = tf.reduce_sum(nonzero_mask, reduction_indices=1)

    beams, _ = tf.nn.ctc_beam_search_decoder(
        swapped_logits, sequence_len, merge_repeated=False)
    top_beam = beams[0]
    decoded = tf.sparse_to_dense(
        top_beam.indices, top_beam.dense_shape, top_beam.values)

    return mid, logits, ppgs, preds, decoded
def _net1(self):
    """Build net1: MFCC features -> prenet -> CBHG -> phoneme logits.

    Reads ``self.x_mfcc`` and ``self.is_training``; all variables are
    created under the ``'net1'`` variable scope.

    Returns:
        Tuple ``(ppgs, preds, logits)``: the softmax of the logits divided
        by ``hp.Train1.t``, the argmax over the last axis as int32, and
        the raw logits of the final dense layer.
    """
    with tf.variable_scope('net1'):
        # Only the phoneme->index map is needed (for the vocabulary size).
        phn2idx, _ = load_vocab()

        # Pre-net
        prenet_out = prenet(
            self.x_mfcc,
            num_units=[hp.Train1.hidden_units, hp.Train1.hidden_units // 2],
            dropout_rate=hp.Train1.dropout_rate,
            is_training=self.is_training)  # (N, T, E/2)

        # CBHG
        out = cbhg(
            prenet_out,
            hp.Train1.num_banks,
            hp.Train1.hidden_units // 2,
            hp.Train1.num_highway_blocks,
            hp.Train1.norm_type,
            self.is_training)

        # Final linear projection
        logits = tf.layers.dense(out, len(phn2idx))  # (N, T, V)
        ppgs = tf.nn.softmax(logits / hp.Train1.t)  # (N, T, V)

        # tf.argmax(axis=...) replaces the deprecated
        # tf.arg_max(dimension=...); the result is unchanged.
        preds = tf.to_int32(tf.argmax(logits, axis=-1))  # (N, T)

    return ppgs, preds, logits
def gnet(self, feature, is_training=True, reuse=None):
    """Build the generator: features -> mel prediction -> spectrogram prediction.

    Args:
        feature: Input tensor fed to the first dense layer.
        is_training: Forwarded to ``prenet`` and both ``cbhg`` calls.
        reuse: Forwarded to every layer for variable reuse.

    Returns:
        Tuple ``(g_spec, g_mel)``. ``g_mel`` has ``self.x_mel.shape[-1]``
        output units and ``g_spec`` has ``self.x_spec.shape[-1]``.

    NOTE(review): three dense layers here are unnamed, so their variable
    names depend on creation order; keep the statement order intact.
    """
    width = hp.hidden_units
    # Positional settings shared by both CBHG stacks.
    cbhg_settings = (
        hp.num_banks, width, hp.num_highway_blocks, hp.norm_type, is_training)

    # Input projection followed by the prenet (both layers `width` wide).
    hidden = tf.layers.dense(feature, width, reuse=reuse)
    hidden = prenet(
        hidden,
        num_units=[width, width],
        dropout_rate=hp.dropout_rate,
        is_training=is_training,
        reuse=reuse)

    # CBHG1: mel-scale
    mel_features, _ = cbhg(
        hidden, *cbhg_settings, scope="cbhg_gnet_mel", reuse=reuse)
    g_mel = tf.layers.dense(
        mel_features, self.x_mel.shape[-1], name='g_mel', reuse=reuse)

    # CBHG2: the mel prediction is projected back to `width` units first.
    spec_input = tf.layers.dense(g_mel, width, reuse=reuse)
    spec_features, _ = cbhg(
        spec_input, *cbhg_settings, scope="cbhg_gnet_spec", reuse=reuse)
    g_spec = tf.layers.dense(
        spec_features, self.x_spec.shape[-1], name='g_spec', reuse=reuse)

    return g_spec, g_mel
def network(self, ppgs, is_training):
    """Build the PPG -> (spectrogram, mel) prediction graph.

    Args:
        ppgs: Input tensor fed to the prenet.
        is_training: Forwarded to ``prenet`` and both ``cbhg`` calls.

    Returns:
        Tuple ``(pred_spec, pred_mel)``. ``pred_mel`` has
        ``self.y_mel.shape[-1]`` output units and ``pred_spec`` has
        ``self.y_spec.shape[-1]``.
    """
    cfg = hp.Train2
    half_units = cfg.hidden_units // 2
    # Positional settings shared by both CBHG stacks.
    cbhg_settings = (
        cfg.num_banks, half_units, cfg.num_highway_blocks, cfg.norm_type,
        is_training)

    # Pre-net
    prenet_out = prenet(
        ppgs,
        num_units=[cfg.hidden_units, half_units],
        dropout_rate=cfg.dropout_rate,
        is_training=is_training)

    # CBHG1: mel-scale
    mel_features = cbhg(prenet_out, *cbhg_settings, scope="cbhg_mel")
    pred_mel = tf.layers.dense(
        mel_features, self.y_mel.shape[-1], name='pred_mel')

    # CBHG2: linear-scale, fed by a projection of the mel prediction.
    spec_input = tf.layers.dense(pred_mel, half_units)
    spec_features = cbhg(spec_input, *cbhg_settings, scope="cbhg_linear")
    pred_spec = tf.layers.dense(
        spec_features, self.y_spec.shape[-1], name='pred_spec')

    return pred_spec, pred_mel
def test_prenet():
    """Check that PreNet maps (batch, time, features) to (batch, time, fc2 size)."""
    fc1_hidden_size = 256
    fc2_hidden_size = 128
    batch_size = 32

    # (time_steps, feature_size) pairs: first the decoder-style single step,
    # then the encoder-style 17-step sequence of 256-wide embeddings.
    cases = [(1, 80), (17, 256)]

    for time_steps, feature_size in cases:
        features = Variable(torch.randn(batch_size, time_steps, feature_size))
        net = PreNet(feature_size,
                     fc1_hidden_size=fc1_hidden_size,
                     fc2_hidden_size=fc2_hidden_size)
        result = net(features)
        assert result.size() == (batch_size, time_steps, fc2_hidden_size)
def call(self, inputs, state):
    """Run the inputs through the decoder prenet, then the wrapped cell.

    Args:
        inputs: Step inputs passed to ``prenet``.
        state: Cell state forwarded unchanged to the wrapped cell.

    Returns:
        Whatever ``self._cell(prenet_output, state)`` returns.
    """
    transformed = prenet(
        inputs,
        self._is_training,
        self._layer_sizes,
        scope='decoder_prenet')
    # The wrapped cell is invoked like any callable cell object.
    wrapped_cell = self._cell
    return wrapped_cell(transformed, state)
def decode(inputs, memory, is_training=True, scope='decoder_layers', reuse=None):
    """Build the decoder: prenet -> attention decoder -> two residual GRUs -> dense.

    Args:
        inputs: Decoder inputs fed to ``prenet``.
        memory: Tensor passed to ``attention_decoder`` alongside the
            prenet output.
        is_training: Forwarded to ``prenet``.
        scope: Name of the enclosing ``variable_scope``.
        reuse: Whether to reuse variables in that scope.

    Returns:
        Output of a dense layer with ``len(char2idx)`` units.
    """
    with tf.variable_scope(scope, reuse=reuse):
        hidden = prenet(inputs, is_training=is_training)
        hidden = attention_decoder(hidden, memory, embed_size)

        # Two GRU layers, each added back onto its own input.
        for gru_scope in ('gru1', 'gru2'):
            hidden = hidden + gru(hidden, embed_size, False, scope=gru_scope)

        return tf.layers.dense(hidden, len(char2idx))
def network(self, ppgs, is_training):
    """Build the PPG -> mixture-parameter graph.

    Args:
        ppgs: Input tensor fed to the prenet.
        is_training: Forwarded to ``prenet`` and ``cbhg``.

    Returns:
        Tuple ``(mu, log_var, log_pi)``, each reshaped to
        ``(-1, n_timesteps, n_bins, hp.train2.n_mixtures)`` where
        ``n_timesteps`` and ``n_bins`` come from the static shape of
        ``self.y_spec``. ``mu`` is sigmoid-squashed, ``log_var`` is
        clamped from below at -7.0, and ``log_pi`` is normalized and
        passed through ``log_softmax``.
    """
    cfg = hp.train2
    half_units = cfg.hidden_units // 2

    # Pre-net
    prenet_out = prenet(
        ppgs,
        num_units=[cfg.hidden_units, half_units],
        dropout_rate=cfg.dropout_rate,
        is_training=is_training)

    # The mel-scale CBHG stage is disabled in this variant: the prenet
    # output is passed straight to the linear-scale stage.
    hidden = tf.layers.dense(prenet_out, half_units)
    hidden = cbhg(
        hidden, cfg.num_banks, half_units, cfg.num_highway_blocks,
        cfg.norm_type, is_training, scope="cbhg_linear")

    _, n_timesteps, n_bins = self.y_spec.get_shape().as_list()
    n_mixtures = cfg.n_mixtures
    n_units = n_bins * n_mixtures
    mixture_shape = (-1, n_timesteps, n_bins, n_mixtures)

    # One dense layer emits all three parameter groups side by side.
    params = tf.layers.dense(
        hidden,
        n_units * 3,
        bias_initializer=tf.random_uniform_initializer(minval=-3., maxval=3.))

    # Slice 1: means.
    mu = tf.nn.sigmoid(params[..., :n_units])
    mu = tf.reshape(mu, shape=mixture_shape)

    # Slice 2: log-variances, floored at -7.0.
    log_var = tf.maximum(params[..., n_units:2 * n_units], -7.0)
    log_var = tf.reshape(log_var, shape=mixture_shape)

    # Slice 3: mixture weights.
    log_pi = tf.reshape(
        params[..., 2 * n_units:3 * n_units], shape=mixture_shape)
    # NOTE(review): this reads is_training from the tower context rather
    # than the `is_training` argument -- confirm that is intended.
    log_pi = normalize(
        log_pi,
        type='ins',
        is_training=get_current_tower_context().is_training,
        scope='normalize_pi')
    log_pi = tf.nn.log_softmax(log_pi)

    return mu, log_var, log_pi
def network(self, x_mfcc, is_training):
    """Build the MFCC -> phoneme classifier graph.

    Args:
        x_mfcc: Input features fed to the prenet.
        is_training: Forwarded to ``prenet`` and ``cbhg``.

    Returns:
        Tuple ``(ppgs, preds, logits)``: the softmax of the logits divided
        by ``hp.train1.t`` (op name ``'ppgs'``), the argmax over the last
        axis as int32, and the logits of a dense layer with ``len(phns)``
        units.
    """
    cfg = hp.train1
    half_units = cfg.hidden_units // 2

    # Pre-net
    prenet_out = prenet(
        x_mfcc,
        num_units=[cfg.hidden_units, half_units],
        dropout_rate=cfg.dropout_rate,
        is_training=is_training)

    # CBHG
    encoded = cbhg(
        prenet_out, cfg.num_banks, half_units, cfg.num_highway_blocks,
        cfg.norm_type, is_training)

    # Final linear projection
    logits = tf.layers.dense(encoded, len(phns))
    scaled_logits = logits / cfg.t
    ppgs = tf.nn.softmax(scaled_logits, name='ppgs')
    best_class = tf.argmax(logits, axis=-1)
    preds = tf.to_int32(best_class)

    return ppgs, preds, logits
def encode(inputs, is_training=True, scope='encoder', reuse=None):
    """Build the encoder: prenet followed by a CBHG-style stack.

    Args:
        inputs: Tensor fed to ``prenet``.
        is_training: Forwarded to ``prenet`` and ``conv1d_banks``.
        scope: Name of the enclosing ``variable_scope``.
        reuse: Whether to reuse variables in that scope.

    Returns:
        The output of the final ``gru`` call (the attention "memory").
    """
    half_units = embed_size // 2

    with tf.variable_scope(scope, reuse=reuse):
        shortcut = prenet(inputs, scope='prenet', is_training=is_training)

        # Conv1D bank, then max pooling (pool size 2, stride 1).
        hidden = conv1d_banks(
            shortcut, K=encoder_num_banks, is_training=is_training)
        hidden = tf.layers.max_pooling1d(hidden, 2, 1, padding='same')

        # Two Conv1D projections, each followed by normalize_in with ReLU.
        for conv_scope in ('conv1d_1', 'conv1d_2'):
            hidden = conv1d(hidden, half_units, 3, scope=conv_scope)
            hidden = normalize_in(hidden, activation_fn=tf.nn.relu)

        # Residual connection back to the prenet output.
        hidden = hidden + shortcut

        for block_index in range(num_highway_blocks):
            hidden = highwaynet(
                hidden,
                units=half_units,
                scope='highwaynet_%d' % (block_index))

        memory = gru(hidden, half_units, True)

    return memory
def pre_decoder(self, inputs, memory, is_training=False, scope="pre-decoder", reuse=None):
    """
    Pre Decoder: prenet, attention decoder, two residual GRUs, mel projection.

    :param inputs: Decoder inputs fed to the prenet. The original notes give
        the shape as [N, T_y / r, n_mels(*r)] -- TODO confirm dtype.
    :param memory: Tensor passed to ``attention_decoder``; the original
        notes give the shape as [N, T_x, E].
    :param is_training: A boolean, forwarded to the prenet.
    :param scope: A str, Optional scope for 'variable_scope'.
    :param reuse: A boolean. Whether to reuse the weights of a previous
        layer by the same name.
    :return: Tuple ``(mel_hats, alignments)``: the dense projection with
        ``n_mels * reduction_factor`` units, and the stacked alignment
        history transposed with permutation [1, 2, 0].
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Decoder PreNet
        hidden = prenet(inputs, is_training=is_training)

        # Decoder Attention
        hidden, attn_state = attention_decoder(
            hidden, memory, num_units=self.embed_size)
        stacked_history = attn_state.alignment_history.stack()
        alignments = tf.transpose(stacked_history, [1, 2, 0])

        # Decoder stacked GRU: each layer's output is added onto its input.
        for gru_scope in ("GRU-1", "GRU-2"):
            hidden = hidden + biGRU(
                hidden,
                num_units=self.embed_size,
                bidirection=False,
                scope=gru_scope)

        n_outputs = self.n_mels * self.reduction_factor
        mel_hats = tf.layers.dense(
            hidden,
            units=n_outputs,
            kernel_initializer=_init,
            kernel_regularizer=_reg)

    return mel_hats, alignments
def _net2(self):
    """Build net2 on top of net1: PPGs -> mel prediction -> spectrogram prediction.

    Calls ``self._net1()`` first, then creates the net2 variables under
    the ``'net2'`` variable scope using settings from
    ``self.hparams.Train2``.

    Returns:
        Tuple ``(ppgs, preds_ppg, logits_ppg, pred_spec, pred_mel)``: the
        three net1 outputs followed by dense projections with
        ``self.y_spec.shape[-1]`` and ``self.y_mel.shape[-1]`` units.
    """
    # PPGs from net1
    net1_outputs = self._net1()
    ppgs, preds_ppg, logits_ppg = net1_outputs

    cfg = self.hparams.Train2
    half_units = cfg.hidden_units // 2
    training = self.is_training

    with tf.variable_scope('net2'):
        # Pre-net
        prenet_out = prenet(
            ppgs,
            num_units=[cfg.hidden_units, half_units],
            dropout_rate=cfg.dropout_rate,
            is_training=training)

        # CBHG1: mel-scale
        mel_features = cbhg(
            prenet_out, cfg.num_banks, half_units, cfg.num_highway_blocks,
            cfg.norm_type, training, scope="cbhg1")
        pred_mel = tf.layers.dense(mel_features, self.y_mel.shape[-1])

        # CBHG2: linear-scale, fed by a projection of the mel prediction.
        spec_input = tf.layers.dense(pred_mel, half_units)
        spec_features = cbhg(
            spec_input, cfg.num_banks, half_units, cfg.num_highway_blocks,
            cfg.norm_type, training, scope="cbhg2")
        pred_spec = tf.layers.dense(spec_features, self.y_spec.shape[-1])

    return ppgs, preds_ppg, logits_ppg, pred_spec, pred_mel
def _net1(self):
    """Build net1: MFCC features -> prenet -> CBHG -> phoneme logits.

    Reads ``self.x_mfcc`` and ``self.is_training``; all variables are
    created under the ``'net1'`` variable scope.

    Returns:
        Tuple ``(ppgs, preds, logits)``: the softmax of the logits divided
        by ``hp.Train1.t``, the argmax over the last axis as int32, and
        the raw logits of the final dense layer.
    """
    with tf.variable_scope('net1'):
        # Only the phoneme->index map is needed (for the vocabulary size).
        phn2idx, _ = load_vocab()

        # Pre-net
        prenet_out = prenet(
            self.x_mfcc,
            num_units=[hp.Train1.hidden_units, hp.Train1.hidden_units // 2],
            dropout_rate=hp.Train1.dropout_rate,
            is_training=self.is_training)  # (N, T, E/2)

        # CBHG
        out = cbhg(
            prenet_out,
            hp.Train1.num_banks,
            hp.Train1.hidden_units // 2,
            hp.Train1.num_highway_blocks,
            hp.Train1.norm_type,
            self.is_training)

        # Final linear projection
        logits = tf.layers.dense(out, len(phn2idx))  # (N, T, V)
        ppgs = tf.nn.softmax(logits / hp.Train1.t)  # (N, T, V)

        # tf.argmax(axis=...) replaces the deprecated
        # tf.arg_max(dimension=...); the result is unchanged.
        preds = tf.to_int32(tf.argmax(logits, axis=-1))  # (N, T)

    return ppgs, preds, logits
def _net2(self):
    """Build net2 on top of net1: PPGs -> mel prediction -> spectrogram prediction.

    Calls ``self._net1()`` first, then creates the net2 variables under
    the ``'net2'`` variable scope using settings from ``hp.Train2``.

    Returns:
        Tuple ``(ppgs, preds_ppg, logits_ppg, pred_spec, pred_mel)``: the
        three net1 outputs followed by dense projections with
        ``self.y_spec.shape[-1]`` and ``self.y_mel.shape[-1]`` units.
    """
    # PPGs from net1
    net1_outputs = self._net1()
    ppgs, preds_ppg, logits_ppg = net1_outputs

    cfg = hp.Train2
    half_units = cfg.hidden_units // 2
    training = self.is_training

    with tf.variable_scope('net2'):
        # Pre-net
        prenet_out = prenet(
            ppgs,
            num_units=[cfg.hidden_units, half_units],
            dropout_rate=cfg.dropout_rate,
            is_training=training)

        # CBHG1: mel-scale
        mel_features = cbhg(
            prenet_out, cfg.num_banks, half_units, cfg.num_highway_blocks,
            cfg.norm_type, training, scope="cbhg1")
        pred_mel = tf.layers.dense(mel_features, self.y_mel.shape[-1])

        # CBHG2: linear-scale, fed by a projection of the mel prediction.
        spec_input = tf.layers.dense(pred_mel, half_units)
        spec_features = cbhg(
            spec_input, cfg.num_banks, half_units, cfg.num_highway_blocks,
            cfg.norm_type, training, scope="cbhg2")
        pred_spec = tf.layers.dense(spec_features, self.y_spec.shape[-1])

    return ppgs, preds_ppg, logits_ppg, pred_spec, pred_mel
def decode1(decoder_inputs, memory, is_training=True, scope="decoder1", reuse=None):
    '''Attention decoder: prenet -> attention RNN -> two residual GRUs -> dense.

    Args:
      decoder_inputs: A 3d tensor with shape of [N, T', C'], where
        C'=hp.n_mels*hp.r, dtype of float32 (as stated by the original
        documentation). Its static last dimension sets the output width.
      memory: Tensor passed to the attention decoder; the original
        documentation gives its shape as [N, T, C] with C=hp.embed_size.
      is_training: Whether or not the layer is in training mode.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      Output of a dense layer whose unit count equals the last static
      dimension of `decoder_inputs`.
    '''
    with tf.variable_scope(scope, reuse=reuse):
        # Decoder pre-net
        hidden = mod.prenet(decoder_inputs, is_training=is_training)

        # Attention RNN
        hidden = mod.attention_decoder(hidden, memory, num_units=hp.embed_size)

        # Decoder RNNs: each GRU output is added back onto its input.
        for gru_scope in ("decoder_gru1", "decoder_gru2"):
            hidden = hidden + mod.gru(
                hidden, hp.embed_size, False, scope=gru_scope)

        # Project back to the same feature width as the decoder inputs.
        static_dims = decoder_inputs.get_shape().as_list()
        outputs = tf.layers.dense(hidden, static_dims[-1])

    return outputs
def call(self, inputs, state):
    """Run the inputs through the decoder prenet, then the wrapped cell.

    Args:
        inputs: Step inputs passed to ``prenet``.
        state: Cell state forwarded unchanged to the wrapped cell.

    Returns:
        Whatever ``self._cell(prenet_output, state)`` returns.
    """
    # The import stays inside the method, as in the original.
    from modules import prenet

    transformed = prenet(
        inputs,
        self._is_training,
        scope="decoder_prenet")
    wrapped_cell = self._cell
    return wrapped_cell(transformed, state)
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None):
    '''Builds the model graph under the 'inference' variable scope.

    Training mode is inferred from the arguments: the graph is built in
    training mode when `linear_targets` is provided, and in test mode
    otherwise.

    Sets the "inputs", "input_lengths", "mel_outputs", "linear_outputs",
    "alignments", "mel_targets" and "linear_targets" fields, then logs the
    last dimension of each intermediate tensor / cell output.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
        # Training mode is switched on purely by the presence of linear targets.
        is_training = linear_targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), 256], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, 256]

        # Encoder
        prenet_outputs = prenet(embedded_inputs, is_training)  # [N, T_in, 128]
        encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training)  # [N, T_in, 256]

        # Attention
        # alignment_history=True is required so the alignments can be read
        # back from the final decoder state below.
        attention_cell = AttentionWrapper(
            DecoderPrenetWrapper(GRUCell(256), is_training),
            BahdanauAttention(256, encoder_outputs),
            alignment_history=True,
            output_attention=False)  # [N, T_in, 256]

        # Concatenate attention context vector and RNN cell output into a 512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)  # [N, T_in, 512]

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell([
            OutputProjectionWrapper(concat_cell, 256),
            ResidualWrapper(GRUCell(256)),
            ResidualWrapper(GRUCell(256))
        ], state_is_tuple=True)  # [N, T_in, 256]

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        # The training helper receives the mel targets; the test helper
        # only receives the batch size and output dimensions.
        if is_training:
            helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry
        mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

        # Add post-processing CBHG:
        post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)  # [N, T_out, 256]
        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F]

        # Grab alignments from the final decoder state:
        # (index 0 selects the first cell of the MultiRNNCell state, i.e. the
        # one wrapping the attention cell)
        alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        log('Initialized Tacotron model. Dimensions: ')
        log(' embedding: %d' % embedded_inputs.shape[-1])
        log(' prenet out: %d' % prenet_outputs.shape[-1])
        log(' encoder out: %d' % encoder_outputs.shape[-1])
        log(' attention out: %d' % attention_cell.output_size)
        log(' concat attn & out: %d' % concat_cell.output_size)
        log(' decoder cell out: %d' % decoder_cell.output_size)
        log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
        log(' decoder out (1 frame): %d' % mel_outputs.shape[-1])
        log(' postnet out: %d' % post_outputs.shape[-1])
        log(' linear out: %d' % linear_outputs.shape[-1])
def encoder(self, inputs, use_highway_network=True, is_training=True, scope="encoder", reuse=None):
    """
    Encoder: prenet, conv bank, two conv projections, optional highway blocks, GRU.

    :param inputs: Tensor fed to the prenet. The original notes describe a
        2D Tensor with shape of [Seq, E] -- TODO confirm.
    :param use_highway_network: A boolean. Whether using highway network or not
    :param is_training: A boolean, forwarded to the prenet, the conv bank
        and both batch-norm layers.
    :param scope: A str, Optional scope for 'variable_scope'.
    :param reuse: A boolean. Whether to reuse the weights of a previous
        layer by the same name.
    :return: The output of the final ``biGRU`` call (the attention memory).
    """
    half_units = self.embed_size // 2

    with tf.variable_scope(scope, reuse=reuse):
        # Encoder PreNet; its output is reused as the long skip connection.
        shortcut = prenet(inputs, is_training=is_training)

        # Encoder Convolutional Block
        hidden = conv1d_banks(
            shortcut, n_kernels=self.n_encoder_banks, is_training=is_training)
        hidden = tf.layers.max_pooling1d(
            hidden, pool_size=2, strides=1, padding='SAME')

        # Encoder PostNet: two conv projections, each with batch norm + ReLU.
        projection_scopes = (
            ("conv1d-proj-1", "bn-proj-1"),
            ("conv1d-proj-2", "bn-proj-2"),
        )
        for conv_scope, bn_scope in projection_scopes:
            hidden = conv1d(
                hidden, n_filters=half_units, kernel=3, scope=conv_scope)
            hidden = batch_norm(
                hidden,
                is_training=is_training,
                activation_fn=tf.nn.relu,
                scope=bn_scope)

        # long skip connection (LSC)
        hidden = hidden + shortcut

        # highway networks
        n_blocks = self.n_highway_blocks if use_highway_network else 0
        for block_index in range(n_blocks):
            hidden = highway_network(
                hidden,
                num_units=half_units,
                scope="highway_network-%d" % block_index)

        memory = biGRU(hidden, num_units=half_units, bidirection=True)

    return memory
def encode(inputs, is_training=True, scope="encoder", reuse=None):
    '''Encoder: pre_spectro -> prenet -> CBHG-style stack -> GRU.

    Args:
      inputs: Tensor fed to `mod.pre_spectro`.
        NOTE(review): earlier documentation described int32 character ids
        of shape [N, T], but the character-embedding step is disabled
        here -- confirm the expected input.
      is_training: Whether or not the layers are in training mode.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      The output of the final `mod.gru` call (the attention memory).
      NOTE(review): earlier comments called this GRU bidirectional with
      output (N, T, E), but the flag passed is False with E/2 units, the
      same flag value the decoder's plain GRUs use -- confirm which is
      intended.
    '''
    half_units = hp.embed_size // 2

    with tf.variable_scope(scope, reuse=reuse):
        # Input pre-processing, then the encoder pre-net whose output is
        # reused as the residual shortcut.
        features = mod.pre_spectro(inputs, is_training=is_training)
        shortcut = mod.prenet(features, is_training=is_training)

        # Conv1D bank, then max pooling (pool size 2, stride 1).
        hidden = mod.conv1d_banks(
            shortcut, K=hp.encoder_num_banks, is_training=is_training)
        hidden = tf.layers.max_pooling1d(hidden, 2, 1, padding="same")

        # Two Conv1D projections; only the first is followed by a ReLU.
        hidden = mod.conv1d(hidden, half_units, 3, scope="conv1d_1")
        hidden = mod.normalize(
            hidden,
            type=hp.norm_type,
            is_training=is_training,
            activation_fn=tf.nn.relu,
            scope="norm1")
        hidden = mod.conv1d(hidden, half_units, 3, scope="conv1d_2")
        hidden = mod.normalize(
            hidden,
            type=hp.norm_type,
            is_training=is_training,
            activation_fn=None,
            scope="norm2")

        # Residual connection back to the prenet output.
        hidden = hidden + shortcut

        # Highway nets
        for block_index in range(hp.num_highwaynet_blocks):
            hidden = mod.highwaynet(
                hidden,
                num_units=half_units,
                scope='highwaynet_{}'.format(block_index))

        # Final GRU (third positional argument is False).
        memory = mod.gru(hidden, half_units, False)

    return memory