def BLSTM(self, x_p, x_s, x_d, x_b, x_len, dropout, activation=tf.nn.tanh):
    '''Build the bidirectional LSTM graph and return the two logit tensors.

    Abbreviations: p = pitch, s = start, d = duration, b = beat_type.

    Args:
        x_p: pitch ids; shifted by `self.lowest_pitch` before one-hot encoding.
        x_s: start values; a trailing axis is appended before concatenation.
        x_d: duration values; a trailing axis is appended before concatenation.
        x_b: beat-type ids, one-hot encoded with `self.n_b_classes` classes.
        x_len: per-example sequence lengths passed to the RNN.
        dropout: drop rate; `1 - dropout` is used as the keep probability.
        activation: non-linearity applied after each normalization step.

    Returns:
        Tuple `(s_logits, p_logits)` from the 'string_out' and
        'position_out' dense layers.
    '''
    with tf.name_scope('Input_embedding'):
        pitch_onehot = tf.one_hot(x_p - self.lowest_pitch, depth=self.n_p_classes)
        beat_onehot = tf.one_hot(x_b, depth=self.n_b_classes)
        start_feature = x_s[:, :, None]
        duration_feature = x_d[:, :, None]
        features = tf.concat(
            [pitch_onehot, start_feature, duration_feature, beat_onehot], axis=2)
        embedded = tf.layers.dense(features, self.embedding_size)
        embedded = self.normalize(embedded, scope='input_ln')
        embedded = tf.nn.dropout(activation(embedded), keep_prob=1 - dropout)

    with tf.name_scope('BLSTM_cells'):
        forward_cell = LSTMBlockCell(num_units=self.hidden_size, name='cell_fw')
        backward_cell = LSTMBlockCell(num_units=self.hidden_size, name='cell_bw')

    with tf.name_scope('RNN'):
        # The final states are not needed; only the per-step outputs are kept.
        directional_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=forward_cell,
            cell_bw=backward_cell,
            inputs=embedded,
            sequence_length=x_len,
            dtype=tf.float32,
            time_major=False)
        # Forward and backward outputs are joined along the feature axis.
        hidden = tf.concat(directional_outputs, axis=-1)
        hidden = self.normalize(hidden, scope='hidden_ln')
        hidden = tf.nn.dropout(activation(hidden), keep_prob=1 - dropout)

    with tf.name_scope('Output'):
        s_logits = tf.layers.dense(hidden, self.n_str_classes, name='string_out')
        p_logits = tf.layers.dense(hidden, self.n_pos_classes, name='position_out')

    return s_logits, p_logits
def __init__(self, hparams, is_training=False, with_target=True, reuse=False):
    """Build the grapheme-to-phoneme CTC graph.

    Creates the input placeholders, an embedding, optional conv layers, a
    bidirectional and a unidirectional LSTM run in parallel on the same
    features, a second LSTM on their concatenation, and a CTC beam-search
    decoder on the resulting logits.

    Args:
        hparams: hyper-parameter object; the attributes read here are
            graphemes_num, embedding_dim, with_conv, conv_num, conv_width,
            conv_channels, dropout_rate, lstm_units1, lstm_units2,
            phonemes_num and nbest.
        is_training: Python bool; enables dropout and is forwarded to
            `conv1d` / `rnn_cell`.
        with_target: when True a sparse `targets` placeholder is created.
        reuse: forwarded to the 'g2p' variable scope.
    """
    self.with_target = with_target
    self.hparams = hparams
    self.inputs = tf.placeholder(tf.int32, (None, None), name='graphemes_ph')
    self.input_lengths = tf.placeholder(tf.int32, [None], name='grapeheme_seq_len_ph')
    if with_target:
        self.targets = tf.sparse_placeholder(tf.int32, name='phonemes_ph')

    with tf.variable_scope('g2p', reuse=reuse):
        embedding_table = tf.get_variable(
            'embedding', [hparams.graphemes_num, hparams.embedding_dim],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        outputs = tf.nn.embedding_lookup(embedding_table, self.inputs)

        if hparams.with_conv:
            for i in range(hparams.conv_num):
                outputs = conv1d(outputs, hparams.conv_width, hparams.conv_channels,
                                 tf.nn.relu, is_training, hparams.dropout_rate,
                                 'conv_%d' % i)

        # Each direction gets half the units so the concatenated width is
        # lstm_units1, matching the unidirectional branch below.
        forward_cell = rnn_cell(hparams.lstm_units1//2, hparams, is_training)
        backward_cell = rnn_cell(hparams.lstm_units1//2, hparams, is_training)
        bi_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            forward_cell, backward_cell, outputs,
            sequence_length=self.input_lengths,
            dtype=tf.float32, scope='bilstm')
        # Concatenate forward and backward outputs.
        bi_outputs = tf.concat(bi_outputs, axis=2)

        # The unidirectional LSTM reads the same pre-RNN features.
        uni_cell = rnn_cell(hparams.lstm_units1, hparams, is_training)
        uni_outputs, _ = tf.nn.dynamic_rnn(
            uni_cell, outputs, sequence_length=self.input_lengths,
            dtype=tf.float32, scope='unilstm')

        outputs = tf.concat([bi_outputs, uni_outputs], axis=2)

        # tf.layers.dropout defaults to training=False, which makes it an
        # identity op; the flag must be passed for dropout to be applied.
        dropout_rate_cond = hparams.dropout_rate if is_training else 0.0
        outputs = tf.layers.dropout(outputs, rate=dropout_rate_cond,
                                    training=is_training)

        outputs, _ = tf.nn.dynamic_rnn(
            LSTMBlockCell(hparams.lstm_units2), outputs,
            sequence_length=self.input_lengths,
            dtype=tf.float32, scope='lstm')

        self.logits = tf.layers.dense(outputs, hparams.phonemes_num)
        self.probs = tf.nn.softmax(self.logits, name='probs')

        # The CTC decoder is fed the logits with the first two axes swapped.
        logits_transp = tf.transpose(self.logits, (1, 0, 2))
        self.decoded, self.seq_probs = tf.nn.ctc_beam_search_decoder(
            logits_transp, self.input_lengths,
            top_paths=self.hparams.nbest)
        # Densify the best path, filling empty slots with the last phoneme id.
        self.decoded_best = tf.to_int32(
            tf.sparse_tensor_to_dense(
                self.decoded[0], default_value=hparams.phonemes_num-1),
            name='predicted_1best')
def _build_single_cell(cell_type, num_units, use_dropout, mode, dropout_probability, dtype, device=None): r""" :param num_units: `int` :return: """ if cell_type == 'lstm': cells = LSTMCell(num_units=num_units, use_peepholes=False, cell_clip=1.0, initializer=tf.variance_scaling_initializer(), dtype=dtype) elif cell_type == 'layernorm_lstm': cells = LayerNormLSTMCell(num_units=num_units, cell_clip=1.0) elif cell_type == 'layernorm_basiclstm': cells = LayerNormBasicLSTMCell(num_units=num_units) elif cell_type == 'gru': cells = GRUCell(num_units=num_units, kernel_initializer=tf.variance_scaling_initializer(), bias_initializer=tf.variance_scaling_initializer(), dtype=dtype) elif cell_type == 'ugrnn': cells = UGRNNCell(num_units) elif cell_type == 'lstm_block': cells = LSTMBlockCell(num_units=num_units, use_peephole=True, cell_clip=None) elif cell_type == 'gru_block': cells = GRUBlockCellV2(num_units=num_units) elif cell_type == 'nas': cells = NASCell(num_units=num_units) elif cell_type == 'lstm_masked': from tensorflow.contrib.model_pruning import MaskedLSTMCell cells = MaskedLSTMCell(num_units=num_units) else: raise Exception('cell type not supported: {}'.format(cell_type)) if use_dropout is True and mode == 'train': cells = DropoutWrapper( cells, input_keep_prob=dropout_probability[0], state_keep_prob=dropout_probability[1], output_keep_prob=dropout_probability[2], variational_recurrent=False, dtype=dtype, # input_size=self._inputs.get_shape()[1:], ) if device is not None: cells = DeviceWrapper(cells, device=device) return cells
def RNN(x, weights, biases):
    """Run a dropout-wrapped LSTM over `x` and project the last step's output.

    Relies on the module-level names `timesteps`, `n_hidden` and `x_train`.

    Args:
        x: input tensor, unstacked along axis 1 into `timesteps` slices.
        weights: mapping holding the output projection matrix under 'out'.
        biases: mapping holding the output bias under 'out'.

    Returns:
        `outputs[-1] @ weights['out'] + biases['out']`.
    """
    step_inputs = tf.unstack(x, timesteps, 1)

    base_cell = LSTMBlockCell(n_hidden, forget_bias=1.0)
    dropout_cell = DropoutWrapper(
        base_cell,
        variational_recurrent=True,
        input_size=x_train.shape[0],
        state_keep_prob=.7,
        output_keep_prob=.7,
        dtype=tf.float32)

    # Only the per-step outputs are used; the final state is discarded.
    step_outputs, _ = tf.contrib.rnn.static_rnn(
        dropout_cell, step_inputs, dtype=tf.float32)

    final_output = step_outputs[-1]
    return tf.matmul(final_output, weights['out']) + biases['out']
def rnn_cell(dim, hparams, is_training):
    """Create the recurrent cell selected by `hparams.rnn_type`.

    Args:
        dim: number of units in the cell.
        hparams: hyper-parameter object; reads `rnn_type`, plus
            `dropout_rate` for 'ln_lstm' or `zonout_prob` for 'zn_lstm'.
        is_training: Python bool; enables dropout for 'ln_lstm' and is
            forwarded to `ZoneoutWrapper` for 'zn_lstm'.

    Returns:
        A `LayerNormBasicLSTMCell` for 'ln_lstm', or an `LSTMBlockCell`
        wrapped in `ZoneoutWrapper` for 'zn_lstm'.

    Raises:
        ValueError: if `hparams.rnn_type` is not one of the supported
            values (previously this surfaced as an UnboundLocalError).
    """
    rnn_type = hparams.rnn_type

    if rnn_type == 'ln_lstm':
        # Dropout is disabled outside training by keeping everything.
        keep_prob = (1 - hparams.dropout_rate) if is_training else 1.0
        return LayerNormBasicLSTMCell(
            dim, dropout_keep_prob=keep_prob, layer_norm=True)

    if rnn_type == 'zn_lstm':
        return ZoneoutWrapper(LSTMBlockCell(dim), hparams.zonout_prob, is_training)

    raise ValueError(
        "Unsupported rnn_type {!r}; expected 'ln_lstm' or 'zn_lstm'".format(rnn_type))
def attention_decoder(inputs, num_units, input_lengths, is_training, speaker_embd=None, attention_type="bah", scope="attention_decoder", reuse=None):
    """Build the attention cell: prenet + RNN + attention, with context concatenated.

    Args:
        inputs: memory tensor the attention mechanism attends over.
        num_units: size used for the attention mechanism and the RNN cell.
        input_lengths: per-example memory lengths; forwarded to every
            mechanism as `memory_sequence_length` so padded positions are
            masked out of the alignments.
        is_training: forwarded to `PrenetWrapper`.
        speaker_embd: optional speaker embedding forwarded to `PrenetWrapper`.
        attention_type: one of 'bah', 'bah_norm', 'bah_mon', 'luong',
            'luong_scaled', 'location_sensitive'.
        scope: variable scope name.
        reuse: forwarded to the variable scope.

    Returns:
        A `ConcatOutputAndAttentionWrapper` around the attention cell.

    Raises:
        Exception: if `attention_type` is not recognised.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # All mechanisms receive the memory lengths; previously only 'bah'
        # and 'location_sensitive' did, so the others attended over padding.
        if attention_type == 'bah_mon':
            attention_mechanism = tf.contrib.seq2seq.BahdanauMonotonicAttention(
                num_units, inputs, memory_sequence_length=input_lengths)
        elif attention_type == 'bah_norm':
            attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                num_units, inputs, memory_sequence_length=input_lengths,
                normalize=True)
        elif attention_type == 'luong_scaled':
            attention_mechanism = tf.contrib.seq2seq.LuongAttention(
                num_units, inputs, memory_sequence_length=input_lengths,
                scale=True)
        elif attention_type == 'luong':
            attention_mechanism = tf.contrib.seq2seq.LuongAttention(
                num_units, inputs, memory_sequence_length=input_lengths)
        elif attention_type == 'bah':
            # Bahdanau et al. attention mechanism
            attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                num_units,  # attention units
                inputs,
                memory_sequence_length=input_lengths)
        elif attention_type == "location_sensitive":
            attention_mechanism = LocationSensitiveAttention(
                num_units, inputs, memory_sequence_length=input_lengths)
        else:
            raise Exception("Unknown attention type {}".format(attention_type))

        # Location-sensitive attention is paired with an LSTM, all other
        # mechanisms with a GRU.
        if attention_type == "location_sensitive":
            pre_mechanism_cell = LSTMBlockCell(num_units)
        else:
            pre_mechanism_cell = GRUCell(num_units)

        # Bottleneck prenet (layer sizes 256 and 128) in front of the RNN cell.
        pre_mechanism = neural_speech.models.utils.rnn_wrappers.PrenetWrapper(
            pre_mechanism_cell, [256, 128], is_training,
            speaker_embd=speaker_embd)

        attention_cell = tf.contrib.seq2seq.AttentionWrapper(
            pre_mechanism,
            attention_mechanism,
            alignment_history=True,
            output_attention=False)

        # Concatenate attention context vector and RNN cell output.
        concat_cell = neural_speech.models.utils.rnn_wrappers.ConcatOutputAndAttentionWrapper(
            attention_cell)
        return concat_cell
def conv_and_lstm(inputs, input_lengths, conv_layers, conv_width, conv_channels, lstm_units, is_training, scope):
    """Apply a stack of 1-D convolutions followed by one bidirectional LSTM.

    Args:
        inputs: input tensor fed to the first convolution.
        input_lengths: sequence lengths passed to the RNN (may be None).
        conv_layers: number of convolution layers.
        conv_width: kernel width passed to `conv1d`.
        conv_channels: channel count passed to `conv1d`.
        lstm_units: units per direction of the LSTM.
        is_training: forwarded to `conv1d`.
        scope: variable scope wrapping all layers.

    Returns:
        Forward and backward LSTM outputs concatenated along axis 2.
    """
    with tf.variable_scope(scope):
        features = inputs
        last_layer = conv_layers - 1
        for layer_idx in range(conv_layers):
            # Every layer but the last uses ReLU; the last one is linear.
            if layer_idx < last_layer:
                layer_activation = tf.nn.relu
            else:
                layer_activation = None
            features = conv1d(features, conv_width, conv_channels,
                              layer_activation, is_training,
                              'conv_%d' % layer_idx)

        # Single bidirectional LSTM layer (one cell per direction).
        forward_cell = LSTMBlockCell(lstm_units)
        backward_cell = LSTMBlockCell(lstm_units)
        directional_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            forward_cell,
            backward_cell,
            features,
            sequence_length=input_lengths,
            dtype=tf.float32,
            scope='encoder_lstm')

        # Concatenate forward and backward outputs.
        return tf.concat(directional_outputs, axis=2)
def BiLSTM_classifier(X, num_hidden, num_classes, seq_lens, fold, istate_fw=None, istate_bw=None, dtype=tf.float32):
    """Classify sequences from the first and last relevant BiLSTM outputs.

    Args:
        X: input of shape batch_size x seq_len x num_feats.
        num_hidden: units per LSTM direction.
        num_classes: number of output classes.
        seq_lens: per-example sequence lengths.
        fold: fold identifier; its string form names the variable scope so
            each fold gets its own variables.
        istate_fw: optional initial state for the forward cell.
        istate_bw: optional initial state for the backward cell.
        dtype: dtype used to build the initial RNN state when none is given.

    Returns:
        The `prediction` tensor: `features @ weights + biases`.
    """
    # Use a variable scope so each fold's variables live in their own namespace.
    with tf.variable_scope(str(fold)):
        lstm_fw_cell = LSTMBlockCell(num_hidden)
        lstm_bw_cell = LSTMBlockCell(num_hidden)

        # The initial states and dtype are now forwarded; they were accepted
        # by the signature but silently ignored before.
        outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            lstm_fw_cell, lstm_bw_cell, X,
            sequence_length=seq_lens,
            initial_state_fw=istate_fw,
            initial_state_bw=istate_bw,
            dtype=dtype)

        # Concatenate forward and backward outputs along the feature axis,
        # giving 2 * num_hidden features per step.
        features = tf.concat(outputs, axis=2)

        # Output at the first time step: batch_size x (2 * num_hidden).
        first = features[:, 0, :]
        # Output at each sequence's last valid step, chosen via seq_lens
        # (see https://danijar.com/variable-sequence-lengths-in-tensorflow).
        # NOTE(review): presumably batch_size x (2 * num_hidden) as well;
        # confirm against last_relevant.
        last = last_relevant(features, seq_lens)

        # Join both summaries along the feature axis.
        features = tf.concat([first, last], axis=1)

        weights, biases = weight_and_bias(
            features.get_shape().as_list()[1], num_classes)
        prediction = tf.matmul(features, weights) + biases
        return prediction
def conv_and_lstm(inputs, input_lengths, conv_layers, conv_width, conv_channels, lstm_units_unidirectional, is_training, scope):
    """Apply a stack of 1-D convolutions followed by a bidirectional LSTM.

    Args:
        inputs: input tensor fed to the first convolution.
        input_lengths: sequence lengths passed to the RNN.
        conv_layers: number of convolution layers.
        conv_width: kernel width passed to `conv1d`.
        conv_channels: channel count passed to `conv1d`.
        lstm_units_unidirectional: units in each LSTM direction.
        is_training: forwarded to `conv1d`.
        scope: variable scope name; also used to name the RNN scope
            '<scope>_lstm'.

    Returns:
        Forward and backward LSTM outputs concatenated along axis 2.
    """
    with tf.variable_scope(scope):
        features = inputs
        final_index = conv_layers - 1
        for layer_idx in range(conv_layers):
            # ReLU on every layer except the last, which stays linear.
            if layer_idx < final_index:
                layer_activation = tf.nn.relu
            else:
                layer_activation = None
            layer_name = 'conv_{}'.format(layer_idx)
            features = conv1d(features, conv_width, conv_channels,
                              layer_activation, is_training, layer_name)

        forward_cell = LSTMBlockCell(lstm_units_unidirectional)
        backward_cell = LSTMBlockCell(lstm_units_unidirectional)
        directional_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            forward_cell,
            backward_cell,
            features,
            sequence_length=input_lengths,
            dtype=tf.float32,
            scope='{}_lstm'.format(scope))

        # Join the forward and backward outputs on the feature axis.
        return tf.concat(directional_outputs, axis=2)
def make_cell(num_units, residual):
    """Create one RNN cell according to the enclosing object's settings.

    Reads `self.rnn_type` and `self.layer_norm` from the enclosing scope and
    prints which cell variant was chosen.

    Args:
        num_units: number of units in the cell.
        residual: when truthy, the cell is wrapped in `ResidualWrapper`.

    Returns:
        The constructed cell, residual-wrapped if requested.
    """
    if self.rnn_type == 'gru':
        print("GRU")
        cell = GRUCell(num_units)
    elif self.layer_norm:
        print("LSTM With layer norm")
        cell = LayerNormBasicLSTMCell(num_units, layer_norm=True)
    else:
        print("LSTM Without layer norm")
        cell = LSTMBlockCell(num_units)

    return ResidualWrapper(cell) if residual else cell
def get_cell(cell_type, size, layers=1, direction='unidirectional'):
    """Return an RNN cell / layer object for the given `cell_type`.

    Args:
        cell_type: one of "layer_norm_basic", "lstm_block_fused",
            "cudnn_lstm", "cudnn_gru", "lstm_block", "gru_block", "rnn",
            "cudnn_rnn"; any other value yields a `BasicLSTMCell`.
        size: number of units.
        layers: number of layers; only used by the cudnn_* types.
        direction: direction string; only used by the cudnn_* types.

    Returns:
        The constructed cell or Cudnn RNN layer.
    """
    if cell_type == "layer_norm_basic":
        return LayerNormBasicLSTMCell(size)
    if cell_type == "lstm_block_fused":
        return tf.contrib.rnn.LSTMBlockFusedCell(size)
    if cell_type == "cudnn_lstm":
        return CudnnLSTM(layers, size, direction=direction)
    if cell_type == "cudnn_gru":
        return CudnnGRU(layers, size, direction=direction)
    if cell_type == "lstm_block":
        return LSTMBlockCell(size)
    if cell_type == "gru_block":
        return GRUBlockCell(size)
    if cell_type == "rnn":
        return BasicRNNCell(size)
    if cell_type == "cudnn_rnn":
        # `direction` is forwarded like the other cudnn types; it used to be
        # dropped here, silently producing a unidirectional layer.
        return CudnnRNNTanh(layers, size, direction=direction)
    # Any unrecognised type falls back to a basic LSTM cell.
    return BasicLSTMCell(size)
def build(self):
    """Create the LSTM cell and its variational-dropout wrapper, then build them.

    The wrapped cell is run once on a random batch of shape
    (1, 1, self.inputSize); the variables of the underlying LSTM cell are then
    recorded in `self._trainable_weights`.
    """
    self.lstm_cell = LSTMBlockCell(self.units, use_peephole=True)

    dropout_settings = dict(
        variational_recurrent=True,
        input_keep_prob=0.7,
        output_keep_prob=0.7,
        state_keep_prob=0.7,
        dtype=tf.float32,
        input_size=self.inputSize)
    self.va_lstm_cell = DropoutWrapper(self.lstm_cell, **dropout_settings)

    # The result of this run is discarded; it is executed for its effect on
    # the cell before `variables` is read below.
    probe_input = tf.random_normal((1, 1, self.inputSize))
    tf.nn.dynamic_rnn(self.va_lstm_cell, probe_input, dtype=tf.float32)

    self._trainable_weights = self.lstm_cell.variables
def get_rnn_cell_list(config, name, reuse=False, seed=123, dtype=tf.float32):
    """Build one RNN cell per entry of `config['num_units']`.

    Args:
        config: mapping with keys 'num_units' (iterable of ints),
            'cell_type' ('clstm' or 'tflstm'), 'layer_norm' and 'activation'.
        name: prefix for 'clstm' cell names ('<name>_<index>').
        reuse: forwarded to the cells that accept it.
        seed: forwarded to `CustomLSTMCell`.
        dtype: forwarded to `CustomLSTMCell`.

    Returns:
        List of cells, in the same order as `config['num_units']`.

    Raises:
        ValueError: if `config['cell_type']` is not supported (previously a
            `None` entry was appended silently).
    """
    cell_list = []
    for i, units in enumerate(config['num_units']):
        cell_type = config['cell_type']
        if cell_type == 'clstm':
            cell = CustomLSTMCell(units,
                                  layer_norm=config['layer_norm'],
                                  activation=config['activation'],
                                  seed=seed,
                                  reuse=reuse,
                                  dtype=dtype,
                                  name='{}_{}'.format(name, i))
        elif cell_type == 'tflstm':
            act = get_activation(config['activation'])
            if config['layer_norm']:
                cell = LayerNormBasicLSTMCell(num_units=units,
                                              activation=act,
                                              layer_norm=config['layer_norm'],
                                              reuse=reuse)
            elif config['activation'] != 'tanh':
                # layer_norm is already known to be falsy here; comparing it
                # with `== False` let values such as None fall through to
                # the block cell, which takes no custom activation.
                cell = LSTMCell(num_units=units, activation=act, reuse=reuse)
            else:
                cell = LSTMBlockCell(num_units=units)
        else:
            raise ValueError(
                "Unsupported cell_type {!r}; expected 'clstm' or 'tflstm'".format(cell_type))
        cell_list.append(cell)
    return cell_list
def build_model(self):
    """Build the attentive-LSTM regression graph, its loss and train op.

    Creates the `X` / `Y` placeholders, a tanh dense layer, an
    attention-wrapped LSTM run with `static_rnn`, and a dense output on the
    last step. The loss is mean squared error plus L2 regularization over the
    trainable variables whose name does not contain "bias" (case-insensitive).
    Sets `self.prediction`, `self.l2_regularization`,
    `self.mean_squared_error`, `self.loss` and `self.train_step`.
    """
    self.X = tf.placeholder(
        tf.float32,
        [None, self.config.sequence_length, self.config.num_inputs])
    self.Y = tf.placeholder(tf.float32, [None, self.config.num_outputs])

    dense1 = tf.layers.dense(self.X, self.config.E, activation=tf.math.tanh)
    # static_rnn expects a Python list with one tensor per time step.
    m = tf.unstack(dense1, self.config.sequence_length, axis=1)

    attentive_lstm = AttentionCellWrapper(
        LSTMBlockCell(self.config.hidden_units), self.config.sequence_length)
    rnn_outputs, _ = tf.contrib.rnn.static_rnn(attentive_lstm, m,
                                               dtype=tf.float32)

    dense2 = tf.layers.dense(rnn_outputs[-1], self.config.num_outputs)
    self.prediction = dense2

    with tf.name_scope("loss"):
        # The previous filter, `not ("Bias" or "bias" in v.name)`, was always
        # False, so no variable was regularized and this term was always 0.
        self.l2_regularization = self.config.lambda_l2_reg * sum(
            tf.nn.l2_loss(v) for v in tf.trainable_variables()
            if "bias" not in v.name.lower())
        self.mean_squared_error = tf.losses.mean_squared_error(
            self.Y, self.prediction)
        self.loss = self.mean_squared_error + self.l2_regularization

        # Run any pending update ops together with the training step.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            self.train_step = tf.train.AdamOptimizer(
                self.config.learning_rate).minimize(
                    self.loss, global_step=self.global_step_tensor)
def initialize(self, inputs, input_lengths, num_speakers, speaker_id, mel_targets=None, linear_targets=None, loss_coeff=None, rnn_decoder_test_mode=False, is_randomly_initialized=False):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields (along with
    "decoder_outputs", "inputs", "input_lengths", "mel_targets",
    "linear_targets", "batch_size" and the manual-attention placeholders).

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      num_speakers: not referenced in this method's body.
      speaker_id: not referenced in this method's body.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training. Training mode is enabled exactly when this
        argument is not None.
      loss_coeff: not referenced in this method's body.
      rnn_decoder_test_mode: not referenced in this method's body.
      is_randomly_initialized: not referenced in this method's body.
    '''
    # NOTE: the `scope` name bound here is not used inside the block.
    with tf.variable_scope('inference') as scope:
        # Training mode is inferred from the presence of linear targets.
        is_training = linear_targets is not None
        self.batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, 512]

        # Encoder
        encoder_outputs = conv_and_lstm(
            embedded_inputs,
            input_lengths,
            conv_layers=hp.encoder_conv_layers,
            conv_width=hp.encoder_conv_width,
            conv_channels=hp.encoder_conv_channels,
            lstm_units=hp.encoder_lstm_units,
            is_training=is_training,
            scope='encoder')  # [N, T_in, 512]

        # Attention
        # For manual control of attention.
        # NOTE(review): these two placeholders are created and stored on self
        # but are not consumed anywhere else in this method.
        self.is_manual_attention = tf.placeholder(
            tf.bool, shape=(), name='is_manual_attention',
        )
        self.manual_alignments = tf.placeholder(
            tf.float32, shape=[None, None, None], name="manual_alignments",
        )

        # NOTE(review): input_lengths is not passed to the attention mechanism
        # here - confirm whether padded encoder steps should be masked.
        attention_cell = AttentionWrapper(
            DecoderPrenetWrapper(LSTMBlockCell(hp.attention_depth), is_training),
            LocationSensitiveAttention(hp.attention_depth, encoder_outputs),
            alignment_history=True,
            output_attention=False)  # [N, T_in, 128]

        # Concatenate attention context vector and RNN cell output into a 512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)  # [N, T_in, 512]

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell([
            concat_cell,
            LSTMBlockCell(hp.decoder_lstm_units),
            LSTMBlockCell(hp.decoder_lstm_units)
        ], state_is_tuple=True)  # [N, T_in, 1024]

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)

        # The helper decides what the decoder is fed at each step.
        if is_training:
            helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
        else:
            helper = TacoTestHelper(self.batch_size, hp.num_mels, hp.outputs_per_step)

        decoder_init_state = output_cell.zero_state(batch_size=self.batch_size, dtype=tf.float32)
        # Decoding is capped at hp.max_iters steps in both modes.
        (multi_decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry [N, T_out, M]
        decoder_outputs = tf.reshape(multi_decoder_outputs, [self.batch_size, -1, hp.num_mels])

        # Postnet: predicts a residual
        postnet_outputs = postnet(
            decoder_outputs,
            layers=hp.postnet_conv_layers,
            conv_width=hp.postnet_conv_width,
            channels=hp.postnet_conv_channels,
            is_training=is_training)
        mel_outputs = decoder_outputs + postnet_outputs

        # Convert to linear using a similar architecture as the encoder
        # (no sequence lengths are supplied for this second conv_and_lstm call):
        expand_outputs = conv_and_lstm(
            mel_outputs,
            None,
            conv_layers=hp.expand_conv_layers,
            conv_width=hp.expand_conv_width,
            conv_channels=hp.expand_conv_channels,
            lstm_units=hp.expand_lstm_units,
            is_training=is_training,
            scope='expand')  # [N, T_in, 512]
        linear_outputs = tf.layers.dense(expand_outputs, hp.num_freq)  # [N, T_out, F]

        # Grab alignments from the final decoder state; the first entry of the
        # state tuple belongs to the attention cell (bottom layer of decoder_cell).
        alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        # Expose the graph endpoints on the instance.
        self.inputs = inputs
        self.input_lengths = input_lengths
        self.decoder_outputs = decoder_outputs
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets

        # Log the feature dimension of each stage.
        log('Initialized Tacotron model. Dimensions: ')
        log(' embedding: %d' % embedded_inputs.shape[-1])
        log(' encoder out: %d' % encoder_outputs.shape[-1])
        log(' attention out: %d' % attention_cell.output_size)
        log(' concat attn & out: %d' % concat_cell.output_size)
        log(' decoder cell out: %d' % decoder_cell.output_size)
        log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
        log(' decoder out (1 frame): %d' % mel_outputs.shape[-1])
        log(' expand out: %d' % expand_outputs.shape[-1])
        log(' linear out: %d' % linear_outputs.shape[-1])
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields (along with
    "inputs", "input_lengths", "mel_targets" and "linear_targets").

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training. Training mode is enabled exactly when this
        argument is not None.
    '''
    # NOTE: the `scope` name bound here is not used inside the block.
    with tf.variable_scope('inference') as scope:
        # Training mode is inferred from the presence of linear targets.
        is_training = linear_targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), 256], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, 256]

        # Global style tokens (GST). When using h attention heads, we set
        # the token embedding size to be 256/h and concatenate the attention
        # outputs of each head.
        gst_tokens = tf.get_variable(
            'style_tokens', [hp.num_gst, 256 // hp.num_heads], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))

        # Encoder
        prenet_outputs = prenet(embedded_inputs, is_training)  # [N, T_in, 128]
        encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training)  # [N, T_in, 256]

        if is_training:
            # Reference encoder: summarises the target mel spectrogram.
            reference_embedding = reference_encoder(
                mel_targets,
                filters=[32, 32, 64, 64, 128, 128],
                kernel_size=(3, 3),
                strides=(2, 2),
                is_training=is_training)

            # Style token layer: the reference embedding queries the token bank,
            # which is tiled so every batch entry sees all tokens.
            style_embedding = multi_head_attention(
                num_heads=hp.num_heads,
                queries=tf.expand_dims(reference_embedding, axis=1),  # [N, 1, 128]
                memory=tf.tile(tf.expand_dims(gst_tokens, axis=0), [batch_size, 1, 1]),  # [N, hp.num_gst, 256 // hp.num_heads]
                num_units=128)
        else:
            # TODO Add support for reference mode and more effective style control during inference.
            # Randomly select style embedding from gst_tokens for simplicity.
            random_index = tf.random_uniform([batch_size], maxval=hp.num_gst, dtype=tf.int32)
            style_embedding = tf.nn.embedding_lookup(gst_tokens, random_index)

        # Add style embedding to every text encoder state, applying tanh to
        # compress both encoder state and style embedding to the same scale.
        # NOTE(review): in the inference branch style_embedding is a rank-2
        # lookup of width 256 // hp.num_heads while encoder_outputs is rank 3;
        # confirm that this addition broadcasts as intended in both branches.
        encoder_outputs += tf.nn.tanh(style_embedding)

        # Attention (padded encoder steps are masked via memory_sequence_length)
        attention_cell = AttentionWrapper(
            DecoderPrenetWrapper(GRUCell(256), is_training),
            BahdanauAttention(256, encoder_outputs, memory_sequence_length=input_lengths),
            alignment_history=True,
            output_attention=False)  # [N, T_in, 256]

        # Concatenate attention context vector and RNN cell output into a 512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)  # [N, T_in, 512]

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell([
            OutputProjectionWrapper(concat_cell, 256),
            ResidualWrapper(ZoneoutWrapper(LSTMBlockCell(256), (0.1, 0.1), is_training)),
            ResidualWrapper(ZoneoutWrapper(LSTMBlockCell(256), (0.1, 0.1), is_training)),
        ], state_is_tuple=True)  # [N, T_in, 256]

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        # The helper decides what the decoder is fed at each step.
        if is_training:
            helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

        # During training no iteration cap is passed to dynamic_decode; at
        # inference decoding is capped at hp.max_iters steps.
        if is_training:
            (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(output_cell, helper, decoder_init_state))
        else:
            (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(output_cell, helper, decoder_init_state),
                maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry
        mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

        # Add post-processing CBHG:
        post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)  # [N, T_out, 256]
        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F]

        # Grab alignments from the final decoder state; the first entry of the
        # state tuple belongs to the bottom layer, which wraps the attention cell.
        alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        # Expose the graph endpoints on the instance.
        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets

        # Log the feature dimension of each stage.
        tf.logging.info('Initialized Tacotron model. Dimensions: ')
        tf.logging.info(' embedding: %d' % embedded_inputs.shape[-1])
        tf.logging.info(' prenet out: %d' % prenet_outputs.shape[-1])
        tf.logging.info(' encoder out: %d' % encoder_outputs.shape[-1])
        tf.logging.info(' attention out: %d' % attention_cell.output_size)
        tf.logging.info(' concat attn & out: %d' % concat_cell.output_size)
        tf.logging.info(' decoder cell out: %d' % decoder_cell.output_size)
        tf.logging.info(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
        tf.logging.info(' decoder out (1 frame): %d' % mel_outputs.shape[-1])
        tf.logging.info(' postnet out: %d' % post_outputs.shape[-1])
        tf.logging.info(' linear out: %d' % linear_outputs.shape[-1])
def main():
    """Train an LSTM (optionally wrapped in an ACT cell) on the addition task.

    Parses command-line arguments, builds the graph, and runs `args.steps`
    training steps. Every `args.log_interval` steps it evaluates accuracy and
    loss (and the ponder cost with ACT), optionally prints them on the
    progress bar, and writes summaries when `args.log` is set. With
    `args.return_ponders`, the collected per-step ponder values are saved to
    'ponders_addition.txt'.
    """
    args = parser.parse_args()

    input_size = 10 * args.total_digits
    output_size = 11 * (args.total_digits + 1)
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    use_act = args.use_act

    # Placeholders for inputs.
    x = tf.placeholder(tf.float32, [batch_size, args.sequence_length, input_size])
    y = tf.placeholder(
        tf.int64,
        [batch_size*(args.sequence_length-1)*(args.total_digits+1)])

    rnn = LSTMBlockCell(hidden_size)
    if use_act:
        # Squeeze only the time axis; a bare tf.squeeze would also drop the
        # batch or feature axis whenever that dimension happens to be 1.
        inputs = [tf.squeeze(xx, axis=1)
                  for xx in tf.split(x, args.sequence_length, 1)]
        act = ACTCell(num_units=args.hidden_size, cell=rnn,
                      max_computation=20, batch_size=batch_size,
                      state_is_tuple=True,
                      return_ponders=args.return_ponders)
        outputs, final_state = static_rnn(
            act, inputs, dtype=tf.float32,
            initial_state=act.zero_state(args.batch_size, tf.float32))
        outputs = tf.stack(outputs, 1)
    else:
        outputs, final_state = tf.nn.dynamic_rnn(rnn, x, dtype=tf.float32)

    # The first time step carries no target, so it is dropped.
    output = tf.reshape(outputs[:, 1:, :], [-1, hidden_size])
    softmax_w = tf.get_variable("softmax_w", [hidden_size, output_size])
    softmax_b = tf.get_variable("softmax_b", [output_size])
    logits = tf.reshape(tf.matmul(output, softmax_w) + softmax_b, [-1, 11])

    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(tf.reshape(loss, [batch_size, -1]), axis=1)

    if use_act:
        if args.return_ponders:
            ponder, ponders_tensor = act.calculate_ponder_cost()
            ponders_tensor = tf.reduce_mean(ponders_tensor, axis=0)
        else:
            ponder = act.calculate_ponder_cost()
        ponder_mean = tf.reduce_mean(ponder)
        tf.summary.scalar('Ponder', ponder_mean)
        # Penalise the ponder cost, weighted by tau.
        loss += args.tau*ponder

    loss = tf.reduce_mean(loss)
    tf.summary.scalar('Loss', loss)

    train_step = tf.train.AdamOptimizer(args.lr).minimize(loss)

    # A sequence counts as correct only if every digit is predicted correctly.
    predicted = tf.argmax(logits, 1)
    target = tf.cast(y, tf.int64)
    correct_sequences = tf.cast(
        tf.reduce_all(
            tf.reshape(tf.equal(predicted, target),
                       [args.batch_size,
                        (args.sequence_length-1)*(args.total_digits+1)]),
            axis=1),
        tf.float32)
    accuracy = tf.reduce_mean(correct_sequences)
    tf.summary.scalar('Accuracy', accuracy)

    merged = tf.summary.merge_all()

    logdir = './logs/addition/LR={}'.format(args.lr)
    if args.use_act:
        logdir += '_Tau={}'.format(args.tau)
    else:
        logdir += '_NoACT'

    # Avoid clobbering an earlier run's directory.
    while os.path.isdir(logdir):
        logdir += '_'

    writer = tf.summary.FileWriter(logdir) if args.log else None

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.vram_fraction)

    ponders_list = []

    try:
        with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
            sess.run(tf.global_variables_initializer())
            loop = trange(args.steps)
            for i in loop:
                batch = generate(args)
                feed = {x: batch[0], y: batch[1]}

                if i % args.log_interval == 0:
                    if use_act:
                        if args.return_ponders:
                            summary, step_accuracy, step_loss, step_ponder, step_ponders_tensor \
                                = sess.run([merged, accuracy, loss, ponder_mean, ponders_tensor],
                                           feed_dict=feed)
                            ponders_list.append(step_ponders_tensor)
                            # Save a snapshot so progress survives an interrupted run.
                            stack = np.stack(ponders_list, axis=0)
                            np.savetxt('ponders_addition.txt', stack)
                        else:
                            summary, step_accuracy, step_loss, step_ponder \
                                = sess.run([merged, accuracy, loss, ponder_mean],
                                           feed_dict=feed)

                        if args.print_results:
                            loop.set_postfix(Loss='{:0.3f}'.format(step_loss),
                                             Accuracy='{:0.3f}'.format(step_accuracy),
                                             Ponder='{:0.3f}'.format(step_ponder))
                    else:
                        summary, step_accuracy, step_loss = sess.run(
                            [merged, accuracy, loss], feed_dict=feed)

                        if args.print_results:
                            loop.set_postfix(Loss='{:0.3f}'.format(step_loss),
                                             Accuracy='{:0.3f}'.format(step_accuracy))

                    if writer is not None:
                        writer.add_summary(summary, i)

                train_step.run(feed_dict=feed)

        # np.stack raises on an empty list, which happens when ponders were
        # requested without ACT or when no step was logged.
        if args.return_ponders and ponders_list:
            stack = np.stack(ponders_list, axis=0)
            np.savetxt('ponders_addition.txt', stack)
    finally:
        # Closing flushes any buffered summary events to disk.
        if writer is not None:
            writer.close()
def main():
    """Train a plain LSTM on the addition task with repeated (ponder) inputs.

    The input placeholder holds `args.ponder * args.sequence_length` steps;
    only every `args.ponder`-th RNN output is kept for the loss. Every
    `args.log_interval` steps accuracy and loss are evaluated, optionally
    printed on the progress bar, and written as summaries when `args.log`
    is set.
    """
    args = parser.parse_args()

    input_size = 10 * args.total_digits
    output_size = 11 * (args.total_digits + 1)
    batch_size = args.batch_size
    hidden_size = args.hidden_size

    # Placeholders for inputs.
    x = tf.placeholder(
        tf.float32,
        [batch_size, args.ponder * args.sequence_length, 1 + input_size])
    y = tf.placeholder(
        tf.int64,
        [batch_size * (args.sequence_length - 1) * (args.total_digits + 1)])

    rnn = LSTMBlockCell(hidden_size)
    output, final_state = tf.nn.dynamic_rnn(rnn, x, dtype=tf.float32)

    # Keep the last output of each group of `ponder` consecutive steps.
    output = output[:, args.ponder - 1::args.ponder, :]
    # The first remaining step carries no target, so it is dropped.
    output = tf.reshape(output[:, 1:, :], [-1, hidden_size])
    softmax_w = tf.get_variable("softmax_w", [hidden_size, output_size])
    softmax_b = tf.get_variable("softmax_b", [output_size])
    logits = tf.reshape(tf.matmul(output, softmax_w) + softmax_b, [-1, 11])

    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(tf.reshape(loss, [batch_size, -1]), axis=1)
    loss = tf.reduce_mean(loss)
    tf.summary.scalar('Loss', loss)

    train_step = tf.train.AdamOptimizer(args.lr).minimize(loss)

    # A sequence counts as correct only if every digit is predicted correctly.
    predicted = tf.argmax(logits, 1)
    target = tf.cast(y, tf.int64)
    correct_sequences = tf.cast(
        tf.reduce_all(
            tf.reshape(tf.equal(predicted, target), [
                args.batch_size,
                (args.sequence_length - 1) * (args.total_digits + 1)
            ]),
            axis=1),
        tf.float32)
    accuracy = tf.reduce_mean(correct_sequences)
    tf.summary.scalar('Accuracy', accuracy)

    merged = tf.summary.merge_all()

    logdir = './logs/addition_test/LR={}_Pond={}'.format(args.lr, args.ponder)

    # Avoid clobbering an earlier run's directory.
    while os.path.isdir(logdir):
        logdir += '_'

    writer = tf.summary.FileWriter(logdir) if args.log else None

    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=args.vram_fraction)

    try:
        with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
            sess.run(tf.global_variables_initializer())
            loop = trange(args.steps)
            for i in loop:
                batch = generate(args)
                feed = {x: batch[0], y: batch[1]}

                if i % args.log_interval == 0:
                    summary, step_accuracy, step_loss = sess.run(
                        [merged, accuracy, loss], feed_dict=feed)

                    if args.print_results:
                        loop.set_postfix(Loss='{:0.3f}'.format(step_loss),
                                         Accuracy='{:0.3f}'.format(step_accuracy))
                    if writer is not None:
                        writer.add_summary(summary, i)

                train_step.run(feed_dict=feed)
    finally:
        # Closing flushes any buffered summary events to disk.
        if writer is not None:
            writer.close()
def initialize(self, text_inputs, input_lengths, speaker_ids,
               mel_targets=None, linear_targets=None):
    '''Builds the inference graph and stores its endpoints on the model.

    The graph is built in training mode exactly when linear_targets is given.
    On return the model exposes "inputs", "input_lengths", "mel_outputs",
    "linear_outputs", "alignments", "mel_targets" and "linear_targets".

    Args:
      text_inputs: int32 Tensor with shape [N, T_in] where N is batch size,
        T_in is number of steps in the input time series, and values are
        character IDs.
      input_lengths: int32 Tensor with shape [N] holding the length of each
        sequence in text_inputs.
      speaker_ids: int32 Tensor containing ids of specific speakers. Only
        looked up when the hyperparameters declare more than one speaker.
      mel_targets: float32 Tensor with shape [N, T_out, M] where T_out is the
        number of steps in the output time series and M is num_mels. Only
        needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where F is
        num_freq. Only needed for training; its presence switches the graph
        into training mode.
    '''
    with tf.variable_scope('inference'):
        training = linear_targets is not None
        batch_size = tf.shape(text_inputs)[0]
        hparams = self._hparams

        # Character embedding: [N, T_in, embedding_dim]
        embedded_inputs = embedding(
            text_inputs, len(symbols), hparams.embedding_dim)

        # Per-utterance speaker vector, only for multi-speaker models.
        speaker_vector = None
        with tf.variable_scope('speaker'):
            if hparams.num_speakers > 1:
                # TODO: what about special
                # initializer=tf.truncated_normal_initializer(stddev=0.5)?
                speaker_table = tf.get_variable(
                    'speaker_embed',
                    shape=(hparams.num_speakers, hparams.speaker_embed_dim),
                    dtype=tf.float32)
                speaker_vector = tf.nn.embedding_lookup(
                    speaker_table, speaker_ids)

        # Encoder
        encoder_outputs = conv_and_lstm(
            embedded_inputs,
            input_lengths,
            conv_layers=hparams.encoder_conv_layers,
            conv_width=hparams.encoder_conv_width,
            conv_channels=hparams.encoder_conv_channels,
            lstm_units=hparams.encoder_lstm_units,
            is_training=training,
            scope='encoder')

        # Attention mechanism
        attention_cell = attention_decoder(
            encoder_outputs,
            hparams.attention_dim,
            input_lengths,
            training,
            speaker_embd=speaker_vector,
            attention_type="location_sensitive")

        # Decoder stack, listed bottom to top.
        decoder_layers = [
            attention_cell,
            LSTMBlockCell(hparams.decoder_lstm_units),
            LSTMBlockCell(hparams.decoder_lstm_units),
        ]
        decoder_cell = MultiRNNCell(decoder_layers, state_is_tuple=True)

        # Each decoder step emits outputs_per_step mel frames at once.
        frames_width = hparams.num_mels * hparams.outputs_per_step
        output_cell = OutputProjectionWrapper(decoder_cell, frames_width)
        decoder_init_state = output_cell.zero_state(
            batch_size=batch_size, dtype=tf.float32)

        if not training:
            helper = TacoTestHelper(
                batch_size, hparams.num_mels, hparams.outputs_per_step)
        else:
            helper = TacoTrainingHelper(
                text_inputs, mel_targets, hparams.num_mels,
                hparams.outputs_per_step)

        decoded, final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hparams.max_iters)
        decoder_outputs, _ = decoded  # [N, T_out/r, M*r]

        # Unfold the grouped frames so there is one mel frame per entry.
        decoder_outputs = tf.reshape(
            decoder_outputs, [batch_size, -1, hparams.num_mels])

        # The postnet predicts a residual that is added to the decoder output.
        postnet_outputs = postnet(
            decoder_outputs,
            layers=hparams.postnet_conv_layers,
            conv_width=hparams.postnet_conv_width,
            channels=hparams.postnet_conv_channels,
            is_training=training)
        mel_outputs = decoder_outputs + postnet_outputs

        # Mel -> linear conversion reuses the encoder-style conv + LSTM stack.
        expand_outputs = conv_and_lstm(
            mel_outputs,
            None,
            conv_layers=hparams.expand_conv_layers,
            conv_width=hparams.expand_conv_width,
            conv_channels=hparams.expand_conv_channels,
            lstm_units=hparams.expand_lstm_units,
            is_training=training,
            scope='expand')
        linear_outputs = tf.layers.dense(expand_outputs, hparams.num_freq)

        # Alignment history comes from the state of the bottom decoder layer.
        # TODO: seems not to work?!?
        alignment_history = final_decoder_state[0].alignment_history
        alignments = tf.transpose(alignment_history.stack(), [1, 2, 0])

        self.inputs = text_inputs
        self.input_lengths = input_lengths
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets

    log('Initialized Tacotron model. Dimensions: ')
    dimension_report = (
        (' embedding: %d', embedded_inputs.shape[-1]),
        (' encoder out: %d', encoder_outputs.shape[-1]),
        (' concat attn & out: %d', attention_cell.output_size),
        (' decoder cell out: %d', decoder_cell.output_size),
        (' decoder out (%d frames): %d',
         (hparams.outputs_per_step, decoder_outputs.shape[-1])),
        (' decoder out (1 frame): %d', mel_outputs.shape[-1]),
        (' postnet out: %d', postnet_outputs.shape[-1]),
        (' linear out: %d', linear_outputs.shape[-1]),
    )
    for template, value in dimension_report:
        log(template % value)
def initialize(self, inputs, input_lengths, mel_targets=None,
               linear_targets=None, pml_targets=None, is_training=False,
               gta=False, locked_alignments=None):
    """
    Initializes the model for inference.

    Sets the "inputs", "input_lengths", "pml_intermediates", "pml_outputs",
    "alignments" and "pml_targets" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in
        is number of steps in the input time series, and values are
        character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and
        values are the lengths of each sequence in inputs.
      mel_targets: accepted for interface compatibility; not used by this
        method.
      linear_targets: accepted for interface compatibility; not used by this
        method.
      pml_targets: float32 Tensor with shape [N, T_out, P] where N is
        batch_size, T_out is number of steps in the PML vocoder features
        trajectories, P is pml_dimension, and values are PML vocoder
        features. Fed to the training helper, so it is required when
        is_training or gta is set.
      is_training: boolean flag that is set to True during training
      gta: boolean flag that is set to True when ground truth alignment is
        required; like is_training it makes the decoder use the training
        helper (driven by pml_targets) instead of the test helper
      locked_alignments: explicit attention alignments. They are given a
        leading batch dimension if they have fewer than 3 dimensions, but
        the result is not consumed anywhere in this method (see the note in
        the body).
    """
    # Fix the alignments shape to (batch_size, encoder_steps, decoder_steps)
    # if not already including the batch dimension.
    # NOTE(review): locked_alignments_ is never passed on to the attention
    # mechanism below, so locked_alignments currently has no effect on the
    # graph built here - confirm whether it was meant to be wired in.
    locked_alignments_ = locked_alignments
    if locked_alignments_ is not None and np.ndim(locked_alignments_) < 3:
        locked_alignments_ = np.expand_dims(locked_alignments_, 0)

    with tf.variable_scope('inference'):
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), hp.embedding_dim],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(
            embedding_table, inputs)  # [N, T_in, embedding_dim]

        # Encoder
        encoder_outputs = conv_and_lstm(
            embedded_inputs,
            input_lengths,
            conv_layers=hp.encoder_conv_layers,
            conv_width=hp.encoder_conv_width,
            conv_channels=hp.encoder_conv_channels,
            lstm_units_unidirectional=hp.encoder_gru_units,
            is_training=is_training,
            scope='encoder',
        )

        # Attention
        attention_cell = AttentionWrapper(
            DecoderPrenetWrapper(LSTMBlockCell(hp.attention_depth),
                                 is_training, hp.prenet_depths),
            LocationSensitiveAttention(hp.attention_depth, encoder_outputs),
            alignment_history=True,
            output_attention=False)

        # Concatenate attention context vector and RNN cell output.
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell(
            [
                concat_cell,
                LSTMBlockCell(hp.decoder_gru_units),
                LSTMBlockCell(hp.decoder_gru_units)
            ],
            state_is_tuple=True)

        # Project onto r PML feature vectors (predict r outputs at each RNN
        # step):
        output_cell = OutputProjectionWrapper(
            decoder_cell, hp.pml_dimension * hp.outputs_per_step)

        # Both training and ground-truth-aligned synthesis drive the decoder
        # with the target features; plain synthesis uses the test helper.
        if is_training or gta:
            helper = TacoTrainingHelper(inputs, pml_targets,
                                        hp.pml_dimension,
                                        hp.outputs_per_step)
        else:
            helper = TacoTestHelper(batch_size, hp.pml_dimension,
                                    hp.outputs_per_step)

        decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                    dtype=tf.float32)

        (decoder_outputs, _), final_decoder_state, _ = \
            tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(output_cell, helper, decoder_init_state),
                maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

        # Reshape outputs to be one output per entry
        pml_intermediates = tf.reshape(
            decoder_outputs,
            [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

        # Add Post-Processing Conv and LSTM layer:
        expand_outputs = conv_and_lstm(
            pml_intermediates,
            None,
            conv_layers=hp.expand_conv_layers,
            conv_width=hp.expand_conv_width,
            conv_channels=hp.expand_conv_channels,
            lstm_units_unidirectional=hp.expand_gru_units,
            is_training=is_training,
            scope='expand',
        )
        pml_outputs = tf.layers.dense(
            expand_outputs, hp.pml_dimension)  # [N, T_out, P]

        # Grab alignments from the final decoder state (bottom layer holds
        # the attention wrapper state):
        alignments = tf.transpose(
            final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.pml_intermediates = pml_intermediates
        self.pml_outputs = pml_outputs
        self.alignments = alignments
        self.pml_targets = pml_targets

        log('Initialized Tacotron model. Dimensions: ')
        log(' Train mode: {}'.format(is_training))
        # Report the gta flag itself; this line used to repeat is_training.
        log(' GTA mode: {}'.format(gta))
        log(' Embedding: {}'.format(embedded_inputs.shape[-1]))
        log(' Encoder out: {}'.format(encoder_outputs.shape[-1]))
        log(' Attention out: {}'.format(attention_cell.output_size))
        log(' Concat attn & out: {}'.format(concat_cell.output_size))
        log(' Decoder cell out: {}'.format(decoder_cell.output_size))
        log(' Decoder out ({} frames): {}'.format(
            hp.outputs_per_step, decoder_outputs.shape[-1]))
        log(' Decoder out (1 frame): {}'.format(
            pml_intermediates.shape[-1]))
        log(' Expand out: {}'.format(expand_outputs.shape[-1]))
        log(' PML out: {}'.format(pml_outputs.shape[-1]))