def _decode(self, z, helper, max_length=None, x_input=None):
    """Decodes the given batch of latent vectors, which may be 0-length.

    Args:
      z: Batch of latent vectors, sized `[batch_size, z_size]`, where `z_size`
        may be 0 for unconditioned decoding.
      helper: A seq2seq.Helper to use. If a TrainingHelper is passed and a
        CudnnLSTM has previously been defined, it will be used instead.
      max_length: (Optional) The maximum iterations to decode.
      x_input: (Optional) The inputs to the decoder for teacher forcing.
        Required if CudnnLSTM is to be used.

    Returns:
      final_output: The final seq2seq.BasicDecoderOutput.
    """
    initial_state = initial_cell_state_from_embedding(
        self._dec_cell, z, name='decoder/z_to_initial_state')

    # CudnnLSTM does not support sampling, so it can only replace TrainingHelper.
    if self._cudnn_dec_lstm and type(helper) == seq2seq.TrainingHelper:  # pylint:disable=unidiomatic-typecheck
        rnn_output, _ = self._cudnn_dec_lstm(
            tf.transpose(x_input, [1, 0, 2]),
            initial_state=_cudnn_lstm_state(initial_state),
            training=self._is_training)
        with tf.variable_scope('decoder'):
            rnn_output = self._output_layer(rnn_output)
        final_output = seq2seq.BasicDecoderOutput(
            rnn_output=tf.transpose(rnn_output, [1, 0, 2]), sample_id=None)
    else:
        if self._cudnn_dec_lstm:
            tf.logging.warning(
                'CudnnLSTM does not support sampling. Using `dynamic_decode` '
                'instead.')
        decoder = seq2seq.BasicDecoder(
            self._dec_cell,
            helper,
            initial_state=initial_state,
            output_layer=self._output_layer)
        final_output, _, _ = seq2seq.dynamic_decode(
            decoder,
            maximum_iterations=max_length,
            swap_memory=True,
            scope='decoder')

    return final_output
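# Minimal, self-contained sketch (assuming TF 1.x and tf.contrib.seq2seq) of the
# `dynamic_decode` branch above: a TrainingHelper feeds ground-truth inputs and
# BasicDecoder unrolls the cell. All shapes and sizes here are illustrative
# placeholders, not values from the original model.
import tensorflow as tf
from tensorflow.contrib import seq2seq

batch_size, max_time, input_depth, num_units, output_depth = 8, 16, 32, 64, 90

x_input = tf.random_normal([batch_size, max_time, input_depth])
lengths = tf.fill([batch_size], max_time)

cell = tf.nn.rnn_cell.LSTMCell(num_units)
helper = seq2seq.TrainingHelper(inputs=x_input, sequence_length=lengths)
decoder = seq2seq.BasicDecoder(
    cell,
    helper,
    initial_state=cell.zero_state(batch_size, tf.float32),
    output_layer=tf.layers.Dense(output_depth))

# final_output.rnn_output has shape [batch_size, max_time, output_depth]
final_output, _, _ = seq2seq.dynamic_decode(
    decoder, maximum_iterations=max_time, swap_memory=True, scope='decoder')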
def build_graph(self): print('Building the TensorFlow graph...') opts = self.options self.graph = tf.Graph() with self.graph.as_default(): self.enc_input = tf.placeholder( tf.int32, shape=[opts.max_hist_len, opts.batch_size, opts.max_uttr_len]) self.enc_input_e = tf.placeholder( tf.float32, shape=[opts.batch_size, opts.max_hist_len, opts.n_emot]) self.dec_input = tf.placeholder( tf.int32, shape=[opts.batch_size, opts.max_uttr_len + 1]) self.target = tf.placeholder( tf.int32, shape=[opts.batch_size, opts.max_uttr_len + 1]) self.enc_input_len = tf.placeholder( tf.int32, shape=[opts.max_hist_len, opts.batch_size]) self.dec_input_len = tf.placeholder(tf.int32, shape=[opts.batch_size]) self.hist_len = tf.placeholder(tf.int32, shape=[opts.batch_size]) with tf.variable_scope('embedding', reuse=tf.AUTO_REUSE): # word_embeddings = tf.Variable(tf.random_uniform([opts.vocab_size, opts.word_embed_size], -1.0, 1.0), # name = 'word_embeddings') word_embeddings = tf.Variable(opts.word_embeddings, name='word_embeddings') enc_input_embed = tf.nn.embedding_lookup( word_embeddings, self.enc_input) dec_input_embed = tf.nn.embedding_lookup( word_embeddings, self.dec_input) with tf.variable_scope('word_level_encoding', reuse=tf.AUTO_REUSE): outputs_enc = [] cell_fw = tf.nn.rnn_cell.GRUCell(opts.n_hidden_units_enc_s) cell_bw = tf.nn.rnn_cell.GRUCell(opts.n_hidden_units_enc_s) for i in range(opts.max_hist_len): outputs, _ = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, inputs=enc_input_embed[i, :, :, :], sequence_length=self.enc_input_len[i, :], dtype=tf.float32) outputs_enc.append(tf.concat(outputs, 2)) outputs_enc = tf.stack(outputs_enc) with tf.variable_scope('emotion_encoding', reuse=tf.AUTO_REUSE): emot_input_layer = tf.layers.Dense( opts.emot_input_layer_size, activation=tf.sigmoid, kernel_initializer=tf.truncated_normal_initializer( stddev=0.1), name='emot_input_layer') enc_input_e = emot_input_layer(self.enc_input_e) cell_emot = tf.nn.rnn_cell.GRUCell(opts.n_hidden_units_enc_e) _, final_state = tf.nn.dynamic_rnn( cell_emot, inputs=enc_input_e, sequence_length=self.hist_len, dtype=tf.float32) emot_vector = final_state * opts.beta if opts.mode == 'PREDICT': outputs_enc = tf.transpose(outputs_enc, perm=[1, 0, 2, 3]) outputs_enc = tile_batch(outputs_enc, multiplier=opts.beam_width) outputs_enc = tf.transpose(outputs_enc, perm=[1, 0, 2, 3]) tiled_enc_input_len = tile_batch(tf.transpose( self.enc_input_len), multiplier=opts.beam_width) tiled_enc_input_len = tf.transpose(tiled_enc_input_len) tiled_hist_len = tile_batch(self.hist_len, multiplier=opts.beam_width) tiled_emot_vector = tile_batch(emot_vector, multiplier=opts.beam_width) else: tiled_enc_input_len = self.enc_input_len tiled_hist_len = self.hist_len tiled_emot_vector = emot_vector with tf.variable_scope('decoding', reuse=tf.AUTO_REUSE) as vs: attn_mechanism = UttrLevelAttentionMechanism( word_level_num_units=opts.word_level_attn_depth, uttr_level_num_units=opts.uttr_level_attn_depth, n_hidden_units=opts.n_hidden_units_enc_s, memory=outputs_enc, memory_sequence_length=tiled_enc_input_len, hist_length=tiled_hist_len) cell_dec = tf.nn.rnn_cell.GRUCell(opts.n_hidden_units_dec) cell_dec = MyAttentionWrapper(cell_dec, attn_mechanism, tiled_emot_vector) output_layer = tf.layers.Dense( units=opts.vocab_size - 1, kernel_initializer=tf.truncated_normal_initializer( stddev=0.1), name='output_layer') # Train if opts.mode == 'TRAIN': outputs_dec, _ = tf.nn.dynamic_rnn( cell=cell_dec, inputs=dec_input_embed, sequence_length=self.dec_input_len, 
initial_state=cell_dec.zero_state( opts.batch_size, tf.float32), dtype=tf.float32, scope=vs) logits = output_layer.apply(outputs_dec) weights = tf.sequence_mask(self.dec_input_len, maxlen=opts.max_uttr_len + 1, dtype=tf.float32) self.loss = sequence_loss(logits, self.target, weights) self.loss_batch = sequence_loss(logits, self.target, weights, average_across_batch=False) self.optimizer = tf.train.AdamOptimizer( opts.learning_rate).minimize(self.loss) self.init = tf.global_variables_initializer() # Predict if opts.mode == 'PREDICT': start_tokens = tf.constant(opts.go_index, dtype=tf.int32, shape=[opts.batch_size]) bs_decoder = BeamSearchDecoder( cell=cell_dec, embedding=word_embeddings, start_tokens=start_tokens, end_token=opts.eos_index, initial_state=cell_dec.zero_state( opts.batch_size * opts.beam_width, tf.float32), beam_width=opts.beam_width, output_layer=output_layer) final_outputs, final_state, _ = dynamic_decode( bs_decoder, impute_finished=False, maximum_iterations=opts.max_uttr_len + 1, scope=vs) self.predicted_ids = final_outputs.predicted_ids self.scores = final_outputs.beam_search_decoder_output.scores self.uttr_level_alignments = final_state[ 0].alignment_history_ul.stack() self.word_level_alignments = final_state[ 0].alignment_history_wl.stack() self.final_sequence_lengths = final_state[3] self.tvars = tf.trainable_variables() self.saver = tf.train.Saver(max_to_keep=100)
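# Sketch of the beam-search tiling used above (assuming tf.contrib.seq2seq.tile_batch):
# every tensor the attention mechanism reads must be repeated beam_width times
# along the batch axis before the BeamSearchDecoder is built. The original code
# transposes its rank-4 memory so the batch dimension is axis 0 first; the
# sketch uses a simpler rank-3 memory with placeholder sizes.
import tensorflow as tf
from tensorflow.contrib.seq2seq import tile_batch

batch_size, max_len, depth, beam_width = 4, 10, 32, 5
memory = tf.random_normal([batch_size, max_len, depth])
memory_length = tf.fill([batch_size], max_len)

tiled_memory = tile_batch(memory, multiplier=beam_width)                 # [batch*beam, max_len, depth]
tiled_memory_length = tile_batch(memory_length, multiplier=beam_width)   # [batch*beam]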
def _create_seq2seq(self): if self.core == "blstm": # Mutilayer BLSTM Encoder with tf.variable_scope('encoder', reuse=tf.AUTO_REUSE): for layer_i in range(self.encoder_layers): cell_fw, cell_bw = self._create_blstmcell(layer_i) (self.encoder_inputs_embedded, self.encoder_final_state) = tf.nn.bidirectional_dynamic_rnn( cell_fw=cell_fw, cell_bw=cell_bw, inputs=self.encoder_inputs_embedded, dtype=tf.float32) self.encoder_inputs_embedded = tf.add_n(self.encoder_inputs_embedded) if self.is_train == 0: self.encoder_inputs_embedded = tf.multiply(self.encoder_inputs_embedded, self.keep_prob) self.encoder_final_state_c = tf.concat( (self.encoder_final_state[0].c, self.encoder_final_state[1].c), 1) self.encoder_final_state_h = tf.concat( (self.encoder_final_state[0].h, self.encoder_final_state[1].h), 1) self.encoder_final_state = contrib.rnn.LSTMStateTuple( c=self.encoder_final_state_c, h=self.encoder_final_state_h) # Basic Attention based LSTM Decoder(train and infer) with tf.variable_scope('decoder', reuse=tf.AUTO_REUSE): self.decoder_cell = tf.nn.rnn_cell.LSTMCell(num_units=self.decoder_hidden_units, state_is_tuple=True) self.attention_state = self.encoder_inputs_embedded self.attention_mechanism = contrib.seq2seq.LuongAttention(num_units=self.decoder_hidden_units, memory=self.attention_state, memory_sequence_length=self.encoder_length) self.attn_cell = contrib.seq2seq.AttentionWrapper(cell=self.decoder_cell, attention_mechanism=self.attention_mechanism, name="decoder_attention_cell", alignment_history=False ) self.fc_layer = tf.layers.Dense(self.vocab_size, name='dense_layer') # for train with tf.variable_scope('decoder_train', reuse=tf.AUTO_REUSE): self.helper_train = contrib.seq2seq.TrainingHelper(inputs=self.decoder_inputs_embedded, sequence_length=self.decoder_length) self.decoder_initial_state = self.attn_cell.zero_state(self.batch_size, dtype=tf.float32).clone( cell_state=self.encoder_final_state) self.decoder_train = contrib.seq2seq.BasicDecoder(cell=self.attn_cell, initial_state=self.decoder_initial_state, helper=self.helper_train, output_layer=self.fc_layer ) self.decoder_train_logits, _, _ = s2s.dynamic_decode(decoder=self.decoder_train ) # for infer with tf.variable_scope('decoder_infer', reuse=tf.AUTO_REUSE): self.start_tokens = tf.tile([19654], [self.batch_size]) self.end_tokens = 19655 self.helper_infer = contrib.seq2seq.GreedyEmbeddingHelper(embedding=self.embeddings_trainable, start_tokens=self.start_tokens, end_token=self.end_tokens) self.decoder_infer = contrib.seq2seq.BasicDecoder(cell=self.attn_cell, initial_state=self.decoder_initial_state, helper=self.helper_infer, output_layer=self.fc_layer) self.decoder_infer_logits, _, _ = s2s.dynamic_decode(decoder=self.decoder_infer, maximum_iterations=20 ) elif self.core == "bgru": # single layer bgru encoder with tf.variable_scope('encoder', reuse=tf.AUTO_REUSE): inputs = self.encoder_inputs_embedded cell_fw, cell_bw = self._create_bgrucell() with tf.variable_scope(None, default_name="encoder"): (output, self.encoder_final_state) = tf.nn.bidirectional_dynamic_rnn( cell_fw=cell_fw, cell_bw=cell_bw, inputs=inputs, dtype=tf.float32) self.encoder_final_state = tf.concat(self.encoder_final_state, 1) # basic gru Decoder for train and infer with tf.variable_scope('decoder', reuse=tf.AUTO_REUSE): self.decoder_cell = tf.nn.rnn_cell.GRUCell(num_units=self.decoder_hidden_units, name='decoder_cell') self.attention_state = self.encoder_inputs_embedded self.attention_mechanism = contrib.seq2seq.LuongAttention(num_units=self.decoder_hidden_units, 
memory=self.attention_state, memory_sequence_length=self.encoder_length) self.attn_cell = contrib.seq2seq.AttentionWrapper(cell=self.decoder_cell, attention_mechanism=self.attention_mechanism, name="decoder_attention_cell", alignment_history=False ) self.fc_layer = tf.layers.Dense(self.vocab_size, name='dense_layer') with tf.variable_scope('decoder_train', reuse=tf.AUTO_REUSE): # for train self.helper_train = contrib.seq2seq.TrainingHelper(inputs=self.decoder_inputs_embedded, sequence_length=self.decoder_length) self.decoder_initial_state = self.attn_cell.zero_state(self.batch_size, dtype=tf.float32).clone( cell_state=self.encoder_final_state) self.decoder_train = contrib.seq2seq.BasicDecoder(cell=self.attn_cell, initial_state=self.decoder_initial_state, helper=self.helper_train, output_layer=self.fc_layer ) self.decoder_train_logits, _, _ = s2s.dynamic_decode(decoder=self.decoder_train ) with tf.variable_scope('decoder_infer', reuse=tf.AUTO_REUSE): # for infer self.start_tokens = tf.fill([self.batch_size], 19654) self.end_tokens = 19655 self.helper_infer = contrib.seq2seq.GreedyEmbeddingHelper(embedding=self.embeddings_trainable, start_tokens=self.start_tokens, end_token=self.end_tokens) self.decoder_infer = contrib.seq2seq.BasicDecoder(cell=self.attn_cell, initial_state=self.decoder_initial_state, helper=self.helper_infer, output_layer=self.fc_layer) self.decoder_infer_logits, _, _ = s2s.dynamic_decode(self.decoder_infer, maximum_iterations=20 )
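# Minimal inference-decoder sketch (assuming TF 1.x contrib API). GO_ID/EOS_ID
# are hypothetical stand-ins for the hard-coded 19654/19655 ids above; shapes
# and sizes are placeholders.
import tensorflow as tf
from tensorflow.contrib import seq2seq

batch_size, vocab_size, embed_dim, num_units = 4, 20000, 128, 256
GO_ID, EOS_ID = 1, 2  # placeholder ids, not taken from the original code

embeddings = tf.get_variable('embeddings', [vocab_size, embed_dim])
cell = tf.nn.rnn_cell.GRUCell(num_units)

helper = seq2seq.GreedyEmbeddingHelper(
    embedding=embeddings,
    start_tokens=tf.fill([batch_size], GO_ID),
    end_token=EOS_ID)
decoder = seq2seq.BasicDecoder(
    cell, helper,
    initial_state=cell.zero_state(batch_size, tf.float32),
    output_layer=tf.layers.Dense(vocab_size))

# outputs.sample_id: predicted ids, [batch_size, <= 20]
outputs, _, _ = seq2seq.dynamic_decode(decoder, maximum_iterations=20)
predicted_ids = outputs.sample_id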
def build_predict_decoder(self):
    # start_tokens: [batch_size,]
    start_tokens = tf.ones([self.batch_size, ], tf.int32) * self.start_token
    end_token = self.end_token

    if not self.use_beamsearch_decode:
        # Helper to feed inputs for greedy decoding: use the argmax of the output
        if self.predict_mode == 'sample':
            print('Building sample decoder...')
            decoding_helper = seq2seq.SampleEmbeddingHelper(
                start_tokens=start_tokens,
                end_token=end_token,
                embedding=lambda inputs: tf.nn.embedding_lookup(
                    self.embedding, inputs))
        elif self.predict_mode == 'greedy':
            print('Building greedy decoder...')
            decoding_helper = seq2seq.GreedyEmbeddingHelper(
                start_tokens=start_tokens,
                end_token=end_token,
                embedding=lambda inputs: tf.nn.embedding_lookup(
                    self.embedding, inputs))
        else:
            raise NotImplementedError(
                'Predict mode: {} is not yet implemented'.format(
                    self.predict_mode))

        inference_decoder = seq2seq.BasicDecoder(
            cell=self.decoder_cell,
            helper=decoding_helper,
            initial_state=self.decoder_initial_state,
            output_layer=self.output_layer)
    else:
        raise NotImplementedError('Beamsearch decode is not yet implemented.')

    (self.decoder_outputs_decode, self.decoder_last_state_decode,
     self.decoder_outputs_length_decode) = seq2seq.dynamic_decode(
         decoder=inference_decoder,
         output_time_major=False,
         maximum_iterations=self.max_decode_step)

    if not self.use_beamsearch_decode:
        self.decoder_pred_decode = tf.expand_dims(
            self.decoder_outputs_decode.sample_id, -1)
    else:
        raise NotImplementedError('{} mode is not recognized.'.format(
            self.mode))
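# Hedged note: SampleEmbeddingHelper differs from GreedyEmbeddingHelper only in
# drawing the next id from the output softmax instead of taking the argmax, and
# it also accepts an optional softmax_temperature. A tiny sketch (TF 1.x contrib
# assumed; the start/end ids and sizes below are placeholders):
import tensorflow as tf
from tensorflow.contrib import seq2seq

vocab_size, embed_dim = 1000, 64
embedding = tf.get_variable('embedding', [vocab_size, embed_dim])
sample_helper = seq2seq.SampleEmbeddingHelper(
    embedding=lambda ids: tf.nn.embedding_lookup(embedding, ids),
    start_tokens=tf.fill([4], 1),    # assumed start id
    end_token=2,                     # assumed end id
    softmax_temperature=0.8)         # <1.0 sharpens, >1.0 flattens the distribution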
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False, global_step=None, is_training=False, is_evaluating=False): """ Initializes the model for inference sets "mel_outputs" and "alignments" fields. Args: - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. """ if mel_targets is None and stop_token_targets is not None: raise ValueError( 'no mel targets were provided but token_targets were given') if mel_targets is not None and stop_token_targets is None and not gta: raise ValueError( 'Mel targets are provided without corresponding token_targets') if not gta and self._hparams.predict_linear == True and linear_targets is None and is_training: raise ValueError( 'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!' ) if gta and linear_targets is not None: raise ValueError( 'Linear spectrogram prediction is not supported in GTA mode!') if is_training and self._hparams.mask_decoder and targets_lengths is None: raise RuntimeError( 'Model set to mask paddings but no targets lengths provided for the mask!' ) if is_training and is_evaluating: raise RuntimeError( 'Model can not be in training and evaluation modes at the same time!' ) with tf.variable_scope('inference') as scope: batch_size = tf.shape(inputs)[0] hp = self._hparams assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled') if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training: assert global_step is not None #GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis post_condition = hp.predict_linear and not gta # Embeddings ==> [batch_size, sequence_length, embedding_dim] embedding_table = tf.get_variable('inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32) embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units] encoder_cell = TacotronEncoderCell( EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'), EncoderRNN(is_training, size=hp.encoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM')) encoder_outputs = encoder_cell(embedded_inputs, input_lengths) #For shape visualization purpose enc_conv_output_shape = encoder_cell.conv_output_shape #Decoder Parts #Attention Decoder Prenet prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet') #Attention Mechanism attention_mechanism = LocationSensitiveAttention( hp.attention_dim, encoder_outputs, hparams=hp, mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths, smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights) #Decoder LSTM Cells decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers, size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_LSTM') #Frames Projection layer frame_projection = FrameProjection( hp.num_mels * hp.outputs_per_step, scope='linear_transform_projection') #<stop_token> projection 
layer stop_projection = StopProjection(is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection') #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding) decoder_cell = TacotronDecoderCell(prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection) #Define the helper for our decoder if is_training or is_evaluating or gta: self.helper = TacoTrainingHelper(batch_size, mel_targets, stop_token_targets, hp, gta, is_evaluating, global_step) else: self.helper = TacoTestHelper(batch_size, hp) #initial decoder state decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32) #Only use max iterations at synthesis time max_iters = hp.max_iters if not (is_training or is_evaluating) else None #Decode (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode( CustomDecoder(decoder_cell, self.helper, decoder_init_state), impute_finished=False, maximum_iterations=max_iters, swap_memory=hp.tacotron_swap_with_cpu) # Reshape outputs to be one output per entry #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels] decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels]) stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1]) #Postnet postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions') #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels] residual = postnet(decoder_output) #Project residual to same dimension as mel spectrogram #==> [batch_size, decoder_steps * r, num_mels] residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection') projected_residual = residual_projection(residual) #Compute the mel spectrogram mel_outputs = decoder_output + projected_residual if post_condition: # Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs. post_cbhg = CBHG(hp.cbhg_kernels, hp.cbhg_conv_channels, hp.cbhg_pool_size, [hp.cbhg_projection, hp.num_mels], hp.cbhg_projection_kernel_size, hp.cbhg_highwaynet_layers, hp.cbhg_highway_units, hp.cbhg_rnn_units, is_training, name='CBHG_postnet') #[batch_size, decoder_steps(mel_frames), cbhg_channels] post_outputs = post_cbhg(mel_outputs, None) #Linear projection of extracted features to make linear spectrogram linear_specs_projection = FrameProjection( hp.num_freq, scope='cbhg_linear_specs_projection') #[batch_size, decoder_steps(linear_frames), num_freq] linear_outputs = linear_specs_projection(post_outputs) #Grab alignments from the final decoder state alignments = tf.transpose( final_decoder_state.alignment_history.stack(), [1, 2, 0]) self.all_vars = tf.trainable_variables() if is_training: self.ratio = self.helper._ratio self.inputs = inputs self.input_lengths = input_lengths self.decoder_output = decoder_output self.alignments = alignments self.stop_token_prediction = stop_token_prediction self.stop_token_targets = stop_token_targets self.mel_outputs = mel_outputs if post_condition: self.linear_outputs = linear_outputs self.linear_targets = linear_targets self.mel_targets = mel_targets self.targets_lengths = targets_lengths log('Initialized Tacotron model. Dimensions (? 
= dynamic shape): ') log(' Train mode: {}'.format(is_training)) log(' Eval mode: {}'.format(is_evaluating)) log(' GTA mode: {}'.format(gta)) log(' Synthesis mode: {}'.format(not ( is_training or is_evaluating))) log(' embedding: {}'.format(embedded_inputs.shape)) log(' enc conv out: {}'.format(enc_conv_output_shape)) log(' encoder out: {}'.format(encoder_outputs.shape)) log(' decoder out: {}'.format(decoder_output.shape)) log(' residual out: {}'.format(residual.shape)) log(' projected residual out: {}'.format( projected_residual.shape)) log(' mel out: {}'.format(mel_outputs.shape)) if post_condition: log(' linear out: {}'.format( linear_outputs.shape)) log(' <stop_token> out: {}'.format( stop_token_prediction.shape)) log(' Tacotron Parameters {:.3f} Million.'.format( np.sum( [np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1_000_000))
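# Sketch (illustrative shapes only) of the two post-decode rearrangements used
# above: the decoder emits r frames per step, so [N, T_dec, num_mels * r] is
# folded back to one frame per row, and the alignment TensorArray stacked as
# [T_dec, N, T_in] is transposed to batch-major [N, T_in, T_dec].
import tensorflow as tf

N, T_dec, T_in, num_mels, r = 2, 5, 7, 80, 3
frames_prediction = tf.random_normal([N, T_dec, num_mels * r])
stacked_alignments = tf.random_normal([T_dec, N, T_in])

decoder_output = tf.reshape(frames_prediction, [N, -1, num_mels])  # [N, T_dec * r, num_mels]
alignments = tf.transpose(stacked_alignments, [1, 2, 0])           # [N, T_in, T_dec]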
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, linear_targets=None, gta=False): """ Initializes the model for inference sets "mel_outputs" and "alignments" fields. Args: - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. """ if mel_targets is None and stop_token_targets is not None: raise ValueError( 'no mel targets were provided but token_targets were given') if mel_targets is not None and stop_token_targets is None and not gta: raise ValueError( 'Mel targets are provided without corresponding token_targets') if gta == False and self._hparams.predict_linear == True and linear_targets is None: raise ValueError( 'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!' ) if gta and linear_targets is not None: raise ValueError( 'Linear spectrogram prediction is not supported in GTA mode!') with tf.variable_scope('inference') as scope: is_training = mel_targets is not None and not gta batch_size = tf.shape(inputs)[0] hp = self._hparams #GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis post_condition = hp.predict_linear and not gta # Embeddings ==> [batch_size, sequence_length, embedding_dim] embedding_table = tf.get_variable('inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32) embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units] encoder_cell = TacotronEncoderCell( EncoderConvolutions(is_training, kernel_size=hp.enc_conv_kernel_size, channels=hp.enc_conv_channels, scope='encoder_convolutions'), EncoderRNN(is_training, size=hp.encoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM')) encoder_outputs = encoder_cell(embedded_inputs, input_lengths) #For shape visualization purpose enc_conv_output_shape = encoder_cell.conv_output_shape #Decoder Parts #Attention Decoder Prenet prenet = Prenet(is_training, layer_sizes=hp.prenet_layers, scope='decoder_prenet') #Attention Mechanism attention_mechanism = LocationSensitiveAttention( hp.attention_dim, encoder_outputs, mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths, smoothing=hp.smoothing) #Decoder LSTM Cells decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers, size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_lstm') #Frames Projection layer frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, scope='linear_transform') #<stop_token> projection layer stop_projection = StopProjection(is_training, scope='stop_token_projection') #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding) decoder_cell = TacotronDecoderCell(prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection, mask_finished=hp.mask_finished) #Define the helper for our decoder if (is_training or gta) == True: self.helper = TacoTrainingHelper( batch_size, mel_targets, stop_token_targets, hp.num_mels, hp.outputs_per_step, hp.tacotron_teacher_forcing_ratio, gta) else: self.helper = 
TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step) #initial decoder state decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32) #Only use max iterations at synthesis time max_iters = hp.max_iters if not is_training else None #Decode (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode( CustomDecoder(decoder_cell, self.helper, decoder_init_state), impute_finished=hp.impute_finished, maximum_iterations=max_iters) # Reshape outputs to be one output per entry #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels] decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels]) stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1]) #Postnet postnet = Postnet(is_training, kernel_size=hp.postnet_kernel_size, channels=hp.postnet_channels, scope='postnet_convolutions') #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels] residual = postnet(decoder_output) #Project residual to same dimension as mel spectrogram #==> [batch_size, decoder_steps * r, num_mels] residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection') projected_residual = residual_projection(residual) #Compute the mel spectrogram mel_outputs = decoder_output + projected_residual if post_condition: #Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py #Post-processing Network to map mels to linear spectrograms using same architecture as the encoder post_processing_cell = TacotronEncoderCell( EncoderConvolutions(is_training, kernel_size=hp.enc_conv_kernel_size, channels=hp.enc_conv_channels, scope='post_processing_convolutions'), EncoderRNN(is_training, size=hp.encoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='post_processing_LSTM')) expand_outputs = post_processing_cell(mel_outputs) linear_outputs = FrameProjection( hp.num_freq, scope='post_processing_projection')(expand_outputs) #Grab alignments from the final decoder state alignments = tf.transpose( final_decoder_state.alignment_history.stack(), [1, 2, 0]) self.inputs = inputs self.input_lengths = input_lengths self.decoder_output = decoder_output self.alignments = alignments self.stop_token_prediction = stop_token_prediction self.stop_token_targets = stop_token_targets self.mel_outputs = mel_outputs if post_condition: self.linear_outputs = linear_outputs self.linear_targets = linear_targets self.mel_targets = mel_targets log('Initialized Tacotron model. Dimensions (? = dynamic shape): ') log(' embedding: {}'.format(embedded_inputs.shape)) log(' enc conv out: {}'.format(enc_conv_output_shape)) log(' encoder out: {}'.format(encoder_outputs.shape)) log(' decoder out: {}'.format(decoder_output.shape)) log(' residual out: {}'.format(residual.shape)) log(' projected residual out: {}'.format( projected_residual.shape)) log(' mel out: {}'.format(mel_outputs.shape)) if post_condition: log(' linear out: {}'.format( linear_outputs.shape)) log(' <stop_token> out: {}'.format( stop_token_prediction.shape))
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False, global_step=None, is_training=False, is_evaluating=False): hp = self._hparams batch_size = tf.shape(inputs)[0] gta = False T2_output_range = (-hp.max_abs_value, hp.max_abs_value) if hp.symmetric_mels else ( 0, hp.max_abs_value) with tf.variable_scope('inference') as scope: assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled') if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training: assert global_step is not None # Embeddings ==> [batch_size, sequence_length, embedding_dim] self.embedding_table = tf.get_variable( 'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32) embedded_inputs = tf.nn.embedding_lookup(self.embedding_table, inputs) #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units] encoder_cell = TacotronEncoderCell( EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'), EncoderRNN(is_training, size=hp.encoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM')) self.encoder_outputs = encoder_cell(embedded_inputs, input_lengths) #For shape visualization purpose self.enc_conv_output_shape = encoder_cell.conv_output_shape #Decoder Parts #Attention Decoder Prenet prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet') #Attention Mechanism attention_mechanism = GMMAttention(self.encoder_outputs, input_lengths, is_training) #Decoder LSTM Cells decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers, size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_LSTM') #Frames Projection layer frame_projection = FrameProjection( hp.num_mels * hp.outputs_per_step, scope='linear_transform_projection') #<stop_token> projection layer stop_projection = StopProjection(is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection') #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding) decoder_cell = TacotronDecoderCell(prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection, num_attn_mixture=5) #Define the helper for our decoder if is_training or is_evaluating or gta: self.helper = TacoTrainingHelper(batch_size, mel_targets, hp, gta, is_evaluating, global_step) else: self.helper = TacoTestHelper(batch_size, hp) #initial decoder state decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32) #Only use max iterations at synthesis time max_iters = hp.max_iters if not (is_training or is_evaluating) else None #Decode (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode( CustomDecoder(decoder_cell, self.helper, decoder_init_state), impute_finished=False, maximum_iterations=max_iters, swap_memory=hp.tacotron_swap_with_cpu) # Reshape outputs to be one output per entry #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels] self.decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels]) self.stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1]) if hp.clip_outputs: self.decoder_output = tf.minimum( tf.maximum(self.decoder_output, T2_output_range[0] - hp.lower_bound_decay), T2_output_range[1]) #Postnet postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions') #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels] residual = postnet(self.decoder_output) #Project 
residual to same dimension as mel spectrogram #==> [batch_size, decoder_steps * r, num_mels] residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection') self.projected_residual = residual_projection(residual) #Compute the mel spectrogram self.mel_outputs = self.decoder_output + self.projected_residual if hp.clip_outputs: self.mel_outputs = tf.minimum( tf.maximum(self.mel_outputs, T2_output_range[0] - hp.lower_bound_decay), T2_output_range[1]) # Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs. post_cbhg = CBHG(hp.cbhg_kernels, hp.cbhg_conv_channels, hp.cbhg_pool_size, [hp.cbhg_projection, hp.num_mels], hp.cbhg_projection_kernel_size, hp.cbhg_highwaynet_layers, hp.cbhg_highway_units, hp.cbhg_rnn_units, hp.batch_norm_position, is_training, name='CBHG_postnet') #[batch_size, decoder_steps(mel_frames), cbhg_channels] self.post_outputs = post_cbhg(self.mel_outputs, None) #Linear projection of extracted features to make linear spectrogram linear_specs_projection = FrameProjection( hp.num_freq, scope='cbhg_linear_specs_projection') #[batch_size, decoder_steps(linear_frames), num_freq] self.linear_outputs = linear_specs_projection(self.post_outputs) if hp.clip_outputs: self.linear_outputs = tf.minimum( tf.maximum(self.linear_outputs, T2_output_range[0] - hp.lower_bound_decay), T2_output_range[1]) #Grab alignments from the final decoder state self.alignments = tf.transpose( final_decoder_state.alignment_history.stack(), [1, 2, 0]) log('initialisation done.') if is_training: self.ratio = self.helper._ratio self.inputs = inputs self.input_lengths = input_lengths self.mel_targets = mel_targets self.linear_targets = linear_targets self.targets_lengths = targets_lengths self.stop_token_targets = stop_token_targets self.gta = gta self.all_vars = tf.trainable_variables() self.is_training = is_training self.is_evaluating = is_evaluating log('Initialized Tacotron model. Dimensions (? = dynamic shape): ') log(' Train mode: {}'.format(is_training)) log(' Eval mode: {}'.format(is_evaluating)) log(' GTA mode: {}'.format(gta)) log(' Synthesis mode: {}'.format(not ( is_training or is_evaluating))) log(' Input: {}'.format(inputs.shape)) log(' embedding: {}'.format(embedded_inputs.shape)) log(' enc conv out: {}'.format( self.enc_conv_output_shape)) log(' encoder out: {}'.format( self.encoder_outputs.shape)) log(' decoder out: {}'.format(self.decoder_output.shape)) log(' residual out: {}'.format(residual.shape)) log(' projected residual out: {}'.format( self.projected_residual.shape)) log(' mel out: {}'.format(self.mel_outputs.shape)) log(' linear out: {}'.format(self.linear_outputs.shape)) log(' <stop_token> out: {}'.format( self.stop_token_prediction.shape)) #1_000_000 is causing syntax problems for some people?! Python please :) log(' Tacotron Parameters {:.3f} Million.'.format( np.sum([np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000))
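# Hedged note: when hp.lower_bound_decay is 0, the tf.minimum(tf.maximum(...))
# pattern used for clip_outputs above is the same as tf.clip_by_value.
import tensorflow as tf

x = tf.constant([-6.0, 0.0, 6.0])
lo, hi = -4.0, 4.0
clipped_a = tf.minimum(tf.maximum(x, lo), hi)
clipped_b = tf.clip_by_value(x, lo, hi)  # equivalent result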
def __init__(self, vocab_size, hidden_size, dropout, num_layers,
             max_gradient_norm, batch_size, learning_rate, lr_decay_factor,
             max_target_length, max_source_length, decoder_mode=False):
    '''
    vocab_size: number of vocab tokens
    buckets: buckets of max sequence lengths
    hidden_size: dimension of hidden layers
    num_layers: number of hidden layers
    max_gradient_norm: maximum gradient magnitude
    batch_size: number of training examples fed to network at once
    learning_rate: starting learning rate of network
    lr_decay_factor: amount by which to decay learning rate
    num_samples: number of samples for sampled softmax
    decoder_mode: Whether to build backpass nodes or not
    '''
    GO_ID = config.GO_ID
    EOS_ID = config.EOS_ID
    self.max_source_length = max_source_length
    self.max_target_length = max_target_length
    self.vocab_size = vocab_size
    self.batch_size = batch_size
    self.global_step = tf.Variable(0, trainable=False)
    self.learning_rate = learning_rate

    self.encoder_inputs = tf.placeholder(shape=(None, None), dtype=tf.int32,
                                         name='encoder_inputs')
    self.source_lengths = tf.placeholder(shape=(None,), dtype=tf.int32,
                                         name='source_lengths')
    self.decoder_targets = tf.placeholder(shape=(None, None), dtype=tf.int32,
                                          name='decoder_targets')
    self.target_lengths = tf.placeholder(shape=(None,), dtype=tf.int32,
                                         name='target_lengths')

    with tf.variable_scope('embeddings') as scope:
        embeddings = tf.Variable(
            tf.random_uniform([vocab_size, hidden_size], -1.0, 1.0),
            dtype=tf.float32)
        encoder_inputs_embedded = tf.nn.embedding_lookup(
            embeddings, self.encoder_inputs)
        targets_embedding = tf.nn.embedding_lookup(
            embeddings, self.decoder_targets)

    with tf.variable_scope('encoder') as scope:
        encoder_cell = rnn.LSTMCell(hidden_size)
        encoder_cell = rnn.DropoutWrapper(encoder_cell, input_keep_prob=dropout)
        encoder_cell = rnn.MultiRNNCell([encoder_cell] * num_layers)
        _, encoder_state = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=encoder_cell,
            cell_bw=encoder_cell,
            sequence_length=self.source_lengths,
            inputs=encoder_inputs_embedded,
            dtype=tf.float32,
            time_major=False)

    with tf.variable_scope('decoder') as scope:
        decoder_cell = rnn.LSTMCell(hidden_size)
        decoder_cell = rnn.DropoutWrapper(decoder_cell, input_keep_prob=dropout)
        decoder_cell = rnn.MultiRNNCell([decoder_cell] * num_layers)
        # TODO add attention
        # seq2seq.BahdanauAttention(num_units=, memory=encoder_output)
        # decoder_cell = seq2seq.AttentionWrapper(cell=decoder_cell,
        #                                         attention_mechanism=)
        if decoder_mode:
            decoder = seq2seq.BeamSearchDecoder(
                cell=decoder_cell,
                embedding=embeddings,
                start_tokens=tf.tile([GO_ID], [batch_size]),
                end_token=EOS_ID,
                initial_state=encoder_state[0],
                beam_width=2)
        else:
            helper = seq2seq.TrainingHelper(
                inputs=targets_embedding,
                sequence_length=self.target_lengths)
            decoder = seq2seq.BasicDecoder(cell=decoder_cell,
                                           helper=helper,
                                           initial_state=encoder_state[-1],
                                           output_layer=Dense(vocab_size))

        final_outputs, final_state, final_sequence_lengths = \
            seq2seq.dynamic_decode(decoder=decoder)
        self.logits = final_outputs.rnn_output

    if not decoder_mode:
        with tf.variable_scope("loss") as scope:
            # have to pad logits, dynamic decode produces results not
            # consistent in shape with targets
            pad_size = self.max_target_length - tf.reduce_max(
                final_sequence_lengths)
            self.logits = tf.pad(self.logits, [[0, 0], [0, pad_size], [0, 0]])
            weights = tf.sequence_mask(lengths=final_sequence_lengths,
                                       maxlen=self.max_target_length,
                                       dtype=tf.float32,
                                       name='weights')
            x_entropy_loss = seq2seq.sequence_loss(logits=self.logits,
                                                   targets=self.decoder_targets,
                                                   weights=weights)
            self.loss = tf.reduce_mean(x_entropy_loss)
            optimizer = tf.train.AdamOptimizer()
            gradients = optimizer.compute_gradients(x_entropy_loss)
            capped_grads = [(tf.clip_by_value(grad, -max_gradient_norm,
                                              max_gradient_norm), var)
                            for grad, var in gradients]
            self.train_op = optimizer.apply_gradients(
                capped_grads, global_step=self.global_step)
    self.saver = tf.train.Saver(tf.global_variables())
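# Sketch (hedged, TF 1.x contrib assumed) of the pad-then-mask trick used in the
# loss block above: dynamic_decode only unrolls to the longest decoded length,
# so the logits are padded along the time axis to line up with the fixed-length
# targets before seq2seq.sequence_loss. Shapes below are placeholders.
import tensorflow as tf
from tensorflow.contrib import seq2seq

batch, max_target_length, vocab = 4, 12, 50
logits = tf.zeros([batch, 9, vocab])                      # 9 = decoded steps this batch
targets = tf.zeros([batch, max_target_length], tf.int32)
lengths = tf.constant([9, 7, 12, 3])

pad_size = max_target_length - tf.shape(logits)[1]
logits = tf.pad(logits, [[0, 0], [0, pad_size], [0, 0]])
weights = tf.sequence_mask(lengths, maxlen=max_target_length, dtype=tf.float32)
loss = seq2seq.sequence_loss(logits=logits, targets=targets, weights=weights)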
def buildModel(self): T_in = self.args.T_in T_out = self.args.T_out D_in = self.args.D_in D_out = self.args.D_out E = self.args.embedding_dim H = self.args.hidden_dim SOS = self.args.SOS EOS = self.args.EOS PAD = self.args.PAD beam_width = 3 # Input with tf.name_scope('input'): x = tf.placeholder(shape=(None, T_in), dtype=tf.int32, name='encoder_inputs') # N, T_out y = tf.placeholder(shape=(None, T_out), dtype=tf.int32, name='decoder_inputs') # N x_len = tf.placeholder(shape=(None, ), dtype=tf.int32) # N y_len = tf.placeholder(shape=(None, ), dtype=tf.int32) # dynamic sample num batch_size = tf.shape(x)[0] # symbol mask sos = tf.ones(shape=(batch_size, 1), dtype=tf.int32) * SOS eos = tf.ones(shape=(batch_size, 1), dtype=tf.int32) * EOS pad = tf.ones(shape=(batch_size, 1), dtype=tf.int32) * PAD # input mask x_mask = tf.sequence_mask(x_len, T_in, dtype=tf.float32) y_with_sos_mask = tf.sequence_mask(y_len, T_out + 1, dtype=tf.float32) y_with_pad = tf.concat([y, pad], axis=1) eos_mask = tf.one_hot(y_len, depth=T_out + 1, dtype=tf.int32) * EOS # masked inputs y_with_eos = y_with_pad + eos_mask y_with_sos = tf.concat([sos, y], axis=1) ## Embedding with tf.name_scope('embedding'): if self.args.use_pretrained: embedding_pretrained = np.fromfile(self.args.pretrained_file, dtype=np.float32).reshape( (-1, E)) embedding = tf.Variable(embedding_pretrained, trainable=False) else: embedding = tf.get_variable(name='embedding', shape=(D_in, E), dtype=tf.float32, initializer=xavier_initializer()) e_x = tf.nn.embedding_lookup(embedding, x) e_y = tf.nn.embedding_lookup(embedding, y_with_sos) if self.args.mode == 'train': e_x = tf.nn.dropout(e_x, self.args.keep_prob) ## Encoder with tf.name_scope('encoder'): ## Multi-BiLSTM fw_cell = rnn.MultiRNNCell([ rnn.BasicLSTMCell(num_units=H) for i in range(self.args.layer_size) ]) bw_cell = rnn.MultiRNNCell([ rnn.BasicLSTMCell(num_units=H) for i in range(self.args.layer_size) ]) bi_encoder_output, bi_encoder_state = tf.nn.bidirectional_dynamic_rnn( fw_cell, bw_cell, e_x, sequence_length=x_len, dtype=tf.float32, time_major=False, scope=None) encoder_output = bi_encoder_output[0] + bi_encoder_output[1] encoder_final_state = bi_encoder_state[0] ## Decoder with tf.name_scope('decoder'): decoder_cell = rnn.MultiRNNCell([ rnn.BasicLSTMCell(num_units=H) for i in range(self.args.layer_size) ]) decoder_lengths = tf.ones(shape=[batch_size], dtype=tf.int32) * (T_out + 1) ## Trainning decoder with tf.variable_scope('attention'): attention_mechanism = LuongAttention( num_units=H, memory=encoder_output, memory_sequence_length=x_len, name='attention_fn') projection_layer = Dense(units=D_out, kernel_initializer=xavier_initializer()) train_decoder_cell = AttentionWrapper( cell=decoder_cell, attention_mechanism=attention_mechanism, attention_layer_size=H) train_decoder_init_state = train_decoder_cell.zero_state( batch_size=batch_size, dtype=tf.float32).clone(cell_state=encoder_final_state) training_helper = TrainingHelper(e_y, decoder_lengths, time_major=False) train_decoder = BasicDecoder( cell=train_decoder_cell, helper=training_helper, initial_state=train_decoder_init_state, output_layer=projection_layer) train_decoder_outputs, _, _ = dynamic_decode( train_decoder, impute_finished=True, maximum_iterations=T_out + 1) # N, T_out+1, D_out train_decoder_outputs = ln(train_decoder_outputs.rnn_output) ## Beam_search decoder beam_memory = tile_batch(encoder_output, beam_width) beam_memory_state = tile_batch(encoder_final_state, beam_width) beam_memory_length = tile_batch(x_len, beam_width) 
with tf.variable_scope('attention', reuse=True): beam_attention_mechanism = LuongAttention( num_units=H, memory=beam_memory, memory_sequence_length=beam_memory_length, name='attention_fn') beam_decoder_cell = AttentionWrapper( cell=decoder_cell, attention_mechanism=beam_attention_mechanism, attention_layer_size=None) beam_decoder_init_state = beam_decoder_cell.zero_state( batch_size=batch_size * beam_width, dtype=tf.float32).clone(cell_state=beam_memory_state) start_tokens = tf.ones((batch_size), dtype=tf.int32) * SOS beam_decoder = BeamSearchDecoder( cell=beam_decoder_cell, embedding=embedding, start_tokens=start_tokens, end_token=EOS, initial_state=beam_decoder_init_state, beam_width=beam_width, output_layer=projection_layer) beam_decoder_outputs, _, _ = dynamic_decode( beam_decoder, scope=tf.get_variable_scope(), maximum_iterations=T_out + 1) beam_decoder_result_ids = beam_decoder_outputs.predicted_ids with tf.name_scope('loss'): logits = tf.nn.softmax(train_decoder_outputs) cross_entropy = tf.keras.losses.sparse_categorical_crossentropy( y_with_eos, logits) loss_mask = tf.sequence_mask(y_len + 1, T_out + 1, dtype=tf.float32) loss = tf.reduce_sum(cross_entropy * loss_mask) / tf.cast( batch_size, dtype=tf.float32) prediction = tf.argmax(logits, 2) ## train_op with tf.name_scope('train'): global_step = tf.train.get_or_create_global_step() lr = noam_scheme(self.args.lr, global_step, self.args.warmup_steps) optimizer = tf.train.AdamOptimizer(lr) ## gradient clips trainable_params = tf.trainable_variables() gradients = tf.gradients(loss, trainable_params) clip_gradients, _ = tf.clip_by_global_norm( gradients, self.args.gradient_clip_num) train_op = optimizer.apply_gradients(zip(clip_gradients, trainable_params), global_step=global_step) # Summary with tf.name_scope('summary'): tf.summary.scalar('lr', lr) tf.summary.scalar('loss', loss) tf.summary.scalar('global_step', global_step) summaries = tf.summary.merge_all() return x, y, x_len, y_len, logits, loss, prediction, beam_decoder_result_ids, global_step, train_op, summaries
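# Sketch (hedged) of the global-norm gradient clipping used in the train_op
# block above, reduced to a self-contained toy graph.
import tensorflow as tf

w = tf.get_variable('w', shape=[10], initializer=tf.zeros_initializer())
loss = tf.reduce_sum(tf.square(w - 1.0))

params = tf.trainable_variables()
grads = tf.gradients(loss, params)
clipped_grads, _ = tf.clip_by_global_norm(grads, clip_norm=5.0)
train_op = tf.train.AdamOptimizer(1e-3).apply_gradients(
    zip(clipped_grads, params),
    global_step=tf.train.get_or_create_global_step())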
def build_decoder(self):
    print('Building Decoder')
    with tf.variable_scope('decoder'):
        self.decoder_cell, self.decoder_initial_state = self.build_decoder_cell()

        initializer = tf.random_uniform_initializer(
            -math.sqrt(3), math.sqrt(3), dtype=tf.float32)
        self.decoder_embeddings = tf.get_variable(
            "decoder_embeddings",
            [self.tgt_vocab_size, self.input_embedding_size],
            initializer=initializer, dtype=tf.float32)

        # Input projection layer to feed embedded inputs to the cell
        input_layer = Dense(self.decoder_hidden_units, dtype=tf.float32,
                            name='input_projection')
        # Output projection layer to convert cell_outputs to logits
        output_layer = Dense(self.tgt_vocab_size, name='output_projection')

        if self.mode == 'train':
            # Train Mode
            self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                params=self.decoder_embeddings, ids=self.decoder_inputs_train)
            self.decoder_inputs_embedded = input_layer(
                self.decoder_inputs_embedded)

            training_helper = seq2seq.TrainingHelper(
                inputs=self.decoder_inputs_embedded,
                sequence_length=self.decoder_inputs_length_train,
                time_major=False, name='training_helper')
            training_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell, helper=training_helper,
                initial_state=self.decoder_initial_state,
                output_layer=output_layer)

            # Maximum decoder time_steps in current batch
            max_decoder_length = tf.reduce_max(self.decoder_inputs_length_train)

            (self.decoder_output_train, self.decoder_last_state_train,
             self.decoder_outputs_length_train) = seq2seq.dynamic_decode(
                 decoder=training_decoder, output_time_major=False,
                 impute_finished=True, maximum_iterations=max_decoder_length)

            # [batch_size, max_time_step + 1, num_decoder_symbols]
            self.decoder_logits_train = tf.identity(
                self.decoder_output_train.rnn_output)
            # Use argmax over the vocabulary axis to extract decoder symbols to emit
            self.decoder_pred_train = tf.argmax(self.decoder_logits_train,
                                                axis=-1,
                                                name='decoder_pred_train')

            # [batch_size, max_time_steps + 1]
            masks = tf.sequence_mask(
                lengths=self.decoder_inputs_length_train,
                maxlen=max_decoder_length, dtype=tf.float32, name='masks')

            self.loss = tf.contrib.seq2seq.sequence_loss(
                logits=self.decoder_logits_train,
                targets=self.decoder_targets_train,
                weights=masks,
                average_across_timesteps=True,
                average_across_batch=True)

            # Training summary for the current batch_loss
            tf.summary.scalar('loss', self.loss)

            # Construct graphs for minimizing loss
            self.init_optimizer()

        elif self.mode == 'decode':
            # Decode mode: start_token / end_token are assumed to be defined
            # in the enclosing module
            start_tokens = tf.ones([self.batch_size, ], tf.int32) * start_token
            # end_token = end_token

            def embed_and_input_proj(inputs):
                return input_layer(
                    tf.nn.embedding_lookup(self.decoder_embeddings, inputs))

            # Feeds input for greedy decoding: uses argmax for the output
            decoding_helper = seq2seq.GreedyEmbeddingHelper(
                start_tokens=start_tokens, end_token=end_token,
                embedding=embed_and_input_proj)
            inference_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell, helper=decoding_helper,
                initial_state=self.decoder_initial_state,
                output_layer=output_layer)

            (self.decoder_outputs_decode, self.decoder_last_state_decode,
             self.decoder_outputs_length_decode) = seq2seq.dynamic_decode(
                 decoder=inference_decoder, output_time_major=False,
                 maximum_iterations=self.max_decode_step)

            # To be compatible in case of use of beam search
            # decoder_pred_decode: [batch_size, max_time_step, 1] (output_major=False)
            self.decoder_pred_decode = tf.expand_dims(
                self.decoder_outputs_decode.sample_id, -1)
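# Hedged note: GreedyEmbeddingHelper accepts either an embedding matrix or a
# callable, which is why embed_and_input_proj above can apply the input
# projection on the fly. A minimal stand-alone sketch (TF 1.x contrib assumed;
# ids and sizes are placeholders):
import tensorflow as tf
from tensorflow.contrib import seq2seq

vocab, embed_dim, hidden = 100, 32, 64
embeddings = tf.get_variable('emb', [vocab, embed_dim])
input_layer = tf.layers.Dense(hidden, name='input_projection')

def embed_and_project(ids):
    return input_layer(tf.nn.embedding_lookup(embeddings, ids))

helper = seq2seq.GreedyEmbeddingHelper(
    embedding=embed_and_project,   # callable instead of a matrix
    start_tokens=tf.fill([4], 1),  # assumed start id
    end_token=2)                   # assumed end id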
def build_model(self):
    """
    build model
    :return:
    """
    with tf.variable_scope('g_model'):
        # 1. Define the model placeholders
        # encoder
        self.encoder_inputs = tf.placeholder(
            tf.int32, [self.max_length_encoder, None], name='encoder_inputs')
        self.encoder_inputs_length = tf.placeholder(
            tf.int32, [None], name='encoder_inputs_length')
        # decoder
        self.decoder_targets = tf.placeholder(
            tf.int32, [self.max_length_decoder, None], name='decoder_targets')
        self.decoder_targets_length = tf.placeholder(
            tf.int32, [None], name='decoder_targets_length')
        self.max_target_sequence_length = tf.reduce_max(
            self.decoder_targets_length, name='max_target_len')
        self.mask = tf.sequence_mask(self.decoder_targets_length,
                                     self.max_target_sequence_length,
                                     dtype=tf.float32, name='masks')
        # for updating
        self.reward = tf.placeholder(tf.float32,
                                     [self.max_length_decoder, None],
                                     name='reward')
        self.start_tokens = tf.placeholder(
            tf.int32, [None], name='start_tokens')  # for partial-sampling
        self.max_inference_length = tf.placeholder(
            tf.int32, [None], name='max_inference_length')  # for inference

        # 2. Define the encoder part of the model
        with tf.variable_scope('encoder'):
            encoder_cell = self.create_rnn_cell()
            embedding = tf.get_variable(
                'embedding', [self.vocab_size, self.embedding_size])
            encoder_inputs_embedded = tf.nn.embedding_lookup(
                embedding, self.encoder_inputs)
            encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
                encoder_cell, encoder_inputs_embedded,
                sequence_length=self.encoder_inputs_length, dtype=tf.float32)

        # 3. Define the decoder part of the model
        with tf.variable_scope('decoder'):
            encoder_inputs_length = self.encoder_inputs_length
            # Define the attention mechanism to use
            attention_mechanism = seq2seq.BahdanauAttention(
                num_units=self.lstm_size, memory=encoder_outputs,
                memory_sequence_length=encoder_inputs_length)
            decoder_cell = self.create_rnn_cell()
            decoder_cell = seq2seq.AttentionWrapper(
                cell=decoder_cell, attention_mechanism=attention_mechanism,
                attention_layer_size=self.lstm_size, name='Attention_Wrapper')
            # Initial state of the decoder: assign it directly from the
            # encoder's final hidden state
            decoder_initial_state = decoder_cell.zero_state(
                batch_size=self.batch_size,
                dtype=tf.float32).clone(cell_state=encoder_state)
            output_layer = tf.layers.Dense(
                self.vocab_size,
                kernel_initializer=tf.truncated_normal_initializer(
                    mean=0.0, stddev=0.1))

            ending = tf.strided_slice(self.decoder_targets, [0, 0],
                                      [self.batch_size, -1], [1, 1])
            decoder_inputs = tf.concat([
                tf.fill([self.batch_size, 1],
                        tf.cast(GO_ID, dtype=tf.int32)), ending
            ], 1)
            decoder_inputs_embedded = tf.nn.embedding_lookup(
                embedding, decoder_inputs)

            # train
            helper_train = seq2seq.TrainingHelper(decoder_inputs_embedded,
                                                  self.decoder_targets_length,
                                                  time_major=True)
            decoder_train = seq2seq.BasicDecoder(decoder_cell, helper_train,
                                                 decoder_initial_state,
                                                 output_layer=output_layer)
            # maximum_iterations must be a scalar, so use the reduced maximum
            # target length rather than the per-example length vector
            decoder_output_train, decoder_state_train, _ = seq2seq.dynamic_decode(
                decoder_train, swap_memory=True, output_time_major=True,
                impute_finished=True,
                maximum_iterations=self.max_target_sequence_length)
            self.decoder_logits_train = tf.identity(
                decoder_output_train.rnn_output)
            self.decoder_predict_train = tf.argmax(self.decoder_logits_train,
                                                   axis=-1,
                                                   name='decoder_pred_train')

            self.loss_pretrain = seq2seq.sequence_loss(
                logits=self.decoder_logits_train,
                targets=self.decoder_targets,
                weights=self.mask)
            tf.summary.scalar('loss', self.loss_pretrain)
            self.summary_op = tf.summary.merge_all()
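# Sketch (hedged, TF 1.x contrib assumed) of the attention wiring used above:
# the decoder cell is wrapped with BahdanauAttention, and its zero_state is
# seeded with the encoder's final state via .clone(cell_state=...). Shapes and
# the zero stand-in for the encoder state are illustrative only.
import tensorflow as tf
from tensorflow.contrib import seq2seq

batch, src_len, units = 4, 10, 32
memory = tf.random_normal([batch, src_len, units])
memory_lengths = tf.fill([batch], src_len)
encoder_state = tf.zeros([batch, units])  # stand-in for the encoder's final GRU state

attention = seq2seq.BahdanauAttention(
    num_units=units, memory=memory, memory_sequence_length=memory_lengths)
decoder_cell = seq2seq.AttentionWrapper(
    tf.nn.rnn_cell.GRUCell(units), attention, attention_layer_size=units)
decoder_initial_state = decoder_cell.zero_state(
    batch, tf.float32).clone(cell_state=encoder_state)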
def build_decoder(self, decoder_init_embed): print("building attention and decoder...") with tf.variable_scope('decoder'): self.decoder_cell, self.decoder_initial_state, self.beam_decoder_cell, self.beam_decoder_initial_state \ = self._build_decoder_cell() # initializer self.decoder_embeddings = tf.Variable(decoder_init_embed, name="decoder_embedding", dtype=self.dtype) self.decoder_vocab_size = len(decoder_init_embed) input_layer = Dense(self.hidden_units, dtype=self.dtype, name="input_projection") output_layer = Dense(decoder_init_embed.shape[0], name="output_projection") # generate_mode decoder_start_tokens = tf.ones(shape=[self.batch_size, ], dtype=tf.int32) * params.start_token decoder_end_token = params.end_token def embed_and_input_proj(inputs): return input_layer(tf.nn.embedding_lookup(self.decoder_embeddings, inputs)) print('greedy decoding...') generate_decoding_helper = seq2seq.GreedyEmbeddingHelper(start_tokens=decoder_start_tokens, \ end_token=decoder_end_token, \ embedding=embed_and_input_proj) generate_inference_decoder = seq2seq.BasicDecoder(cell=self.decoder_cell, helper=generate_decoding_helper, initial_state=self.decoder_initial_state, output_layer=output_layer) with tf.variable_scope('decode_with_shared_attention'): self.gen_outputs, decoder_last_state, gen_outputs_len = (seq2seq.dynamic_decode( decoder=generate_inference_decoder, \ output_time_major=False, \ maximum_iterations=self.max_sent_len)) # params.max_decoder_len # self.gen_x: batch_size, max_decoder_len self.gen_x = self.gen_outputs.sample_id print("beam decoding...") beam_generate_inference_decoder = beam_search_decoder.BeamSearchDecoder(cell=self.beam_decoder_cell, \ embedding=embed_and_input_proj, \ start_tokens=decoder_start_tokens, \ end_token=decoder_end_token, \ initial_state=self.beam_decoder_initial_state, \ beam_width=self.beam_width, \ output_layer=output_layer) with tf.variable_scope('decode_with_shared_attention', reuse=True): self.beam_gen_outputs, beam_decoder_last_state, beam_gen_outputs_len = (seq2seq.dynamic_decode( decoder=beam_generate_inference_decoder, \ output_time_major=False, \ maximum_iterations=self.max_sent_len)) # params.max_decoder_len self.beam_gen_x = self.beam_gen_outputs.predicted_ids print("decoder for rollout") # decoder inputs in train and rollout mode self.decoder_inputs_embedded = input_layer(tf.nn.embedding_lookup(params=self.decoder_embeddings, \ ids=self.decoder_inputs)) # rollout mode rollout_helper = seq2seq.GreedyEmbeddingHelper(start_tokens=decoder_start_tokens, \ end_token=decoder_end_token, \ embedding=embed_and_input_proj) rollout_decoder = seq2seq.BasicDecoder(cell=self.decoder_cell, \ helper=rollout_helper, \ initial_state=self.decoder_initial_state, \ output_layer=output_layer) # calc samples for each time step (fix sent[:given_time+1], roll out sent[given_time+1:]) self.rollout_decoder_state = self.decoder_initial_state # rollout_outputs shape: max_decoder_len, batch_size rollout_outputs = tensor_array_ops.TensorArray(dtype=tf.int32, size=self.max_sent_len, \ dynamic_size=False, infer_shape=True) init_inputs_embedded = embed_and_input_proj(decoder_start_tokens) i = tf.constant(0) while_condition = lambda i, inputs_embedded, decoder_state, rollout_outputs, given_time: tf.less(i, given_time) def feed_body(i, inputs_embedded, decoder_state, rollout_outputs, given_time): print("feed body iter:", i) next_outputs, decoder_state, next_inputs, decoder_finished = rollout_decoder.step(i, inputs_embedded, decoder_state) inputs = 
tf.reshape(tf.gather(params=self.decoder_inputs, indices=[i], axis=1), shape=[self.batch_size, ]) inputs_embedded = embed_and_input_proj(inputs) rollout_outputs = rollout_outputs.write(i, inputs) return i+1, inputs_embedded, decoder_state, rollout_outputs, given_time i, inputs_embedded, self.rollout_decoder_state, self.rollout_outputs, _ = tf.while_loop(while_condition, feed_body, \ (0, init_inputs_embedded, \ self.rollout_decoder_state, rollout_outputs, self.given_time)) # next_outputs shape: (batch_size, decoder_vocab_size) inputs = tf.reshape(tf.gather(params=self.decoder_inputs, indices=[self.given_time], axis=1), shape=[self.batch_size, ]) inputs_embedded = input_layer(tf.nn.embedding_lookup(params=self.decoder_embeddings, \ ids=inputs)) # rollout outputs: sample from output probability i = self.given_time while_condition = lambda i, inputs_embedded, decoder_state, rollout_outputs, max_len: tf.less(i, self.max_sent_len) def pred_body(i, inputs_embedded, decoder_state, rollout_outputs, max_len): print("pred body iter", i) # record rollout sentences next_outputs, decoder_state, next_inputs, decoder_finished = rollout_decoder.step(i, inputs_embedded, \ decoder_state) inputs = tf.cast(tf.reshape(tf.multinomial(next_outputs.rnn_output, 1), [self.batch_size, ]), tf.int32) inputs_embedded = embed_and_input_proj(inputs) rollout_outputs = rollout_outputs.write(i, inputs) return i+1, inputs_embedded, decoder_state, rollout_outputs, max_len i, inputs_embedded, self.rollout_decoder_state, self.rollout_outputs, _ = tf.while_loop(while_condition, pred_body, (i, inputs_embedded, self.rollout_decoder_state, \ self.rollout_outputs, self.max_sent_len)) self.rollout_outputs = self.rollout_outputs.stack() self.rollout_outputs = tf.transpose(self.rollout_outputs, perm=[1,0]) # train mode print("decoder for both pre-training and RL training") decoder_start_token_train= tf.ones(shape=[self.batch_size, 1], dtype=tf.int32) * params.start_token decoder_end_token_train= tf.ones(shape=[self.batch_size, 1], dtype=tf.int32) * params.end_token self.decoder_inputs_train = tf.concat([decoder_start_token_train, self.decoder_inputs], axis=1) self.decoder_inputs_length_train= self.decoder_inputs_length + 1 self.decoder_targets_train = tf.concat([self.decoder_inputs, decoder_end_token_train], axis=1) self.decoder_inputs_embedded_train = tf.nn.embedding_lookup(params=self.decoder_embeddings, \ ids=self.decoder_inputs_train) self.decoder_inputs_embedded_train = input_layer(self.decoder_inputs_embedded_train) training_helper = seq2seq.TrainingHelper(inputs=self.decoder_inputs_embedded_train, \ sequence_length=self.decoder_inputs_length_train, \ time_major=False, name="training_helper") training_decoder = seq2seq.BasicDecoder(cell=self.decoder_cell, \ helper=training_helper, \ initial_state=self.decoder_initial_state, \ output_layer=output_layer) max_decoder_length = tf.reduce_max(self.decoder_inputs_length_train) self.decoder_outputs_train, self.decoder_last_state_train, self.decoder_ouputs_len_train \ = seq2seq.dynamic_decode(\ decoder = training_decoder, \ output_time_major = False, \ impute_finished = True, \ maximum_iterations = max_decoder_length) # flat-and-pad: rnn_output: batch_size * max_decoder_length * decoder_vocab_size self.decoder_logits_train = tf.identity(self.decoder_outputs_train.rnn_output) logits_padding = tf.one_hot(indices=tf.ones(shape=[self.batch_size, self.max_sent_len+1-max_decoder_length], dtype=tf.int32) * params.end_token, \ depth=self.decoder_vocab_size, on_value=10.0, off_value=-20.0, axis=-1, 
dtype=self.dtype) # decoder_logits_train_pad: batch_size * (params.max_decoder_len+1 )* decoder_vocab_size self.decoder_logits_train_pad = tf.concat([self.decoder_logits_train, logits_padding], axis=1) # pre-train loss masks = tf.sequence_mask(lengths=self.decoder_inputs_length_train, maxlen=self.max_sent_len+1, \ dtype=self.dtype, name="masks") self.pretrain_g_loss = seq2seq.sequence_loss(logits=tf.identity(self.decoder_logits_train_pad), \ targets=self.decoder_targets_train, \ weights=masks,\ average_across_timesteps=True,\ average_across_batch=True) # rl loss self.gen_prob = tf.nn.softmax(self.decoder_logits_train_pad) self.g_loss = -1.0 * tf.reduce_sum( tf.reduce_sum( tf.one_hot(tf.to_int32(tf.reshape(self.decoder_targets_train, [-1])), self.decoder_vocab_size, 1.0, 0.0) * tf.log( tf.clip_by_value(tf.reshape(self.gen_prob, [-1, self.decoder_vocab_size]), 1e-20, 1.0)), 1) \ * tf.reshape(self.rewards, [-1])) self.init_optimizer()
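# --- Illustrative sketch (not part of the model above): the two objectives used here are
# a masked MLE loss from seq2seq.sequence_loss for pre-training and a REINFORCE-style
# loss that weights per-token negative log-likelihood by rollout rewards. All shapes,
# placeholder names and the per-token reward layout are assumptions for illustration.
import tensorflow as tf
from tensorflow.contrib import seq2seq

batch_size, max_len, vocab_size = 4, 10, 50
logits = tf.placeholder(tf.float32, [batch_size, max_len, vocab_size])
targets = tf.placeholder(tf.int32, [batch_size, max_len])
lengths = tf.placeholder(tf.int32, [batch_size])
rewards = tf.placeholder(tf.float32, [batch_size, max_len])  # e.g. from Monte Carlo rollouts

masks = tf.sequence_mask(lengths, maxlen=max_len, dtype=tf.float32)

# Pre-training objective: average masked cross-entropy over time steps and batch.
pretrain_loss = seq2seq.sequence_loss(
    logits=logits, targets=targets, weights=masks,
    average_across_timesteps=True, average_across_batch=True)

# RL objective: minimize -sum(reward * log p(target token)) over valid positions.
neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=targets, logits=logits)            # -log p(target), shape [batch, time]
rl_loss = tf.reduce_sum(neg_log_prob * rewards * masks)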
def decode(self, encoder_outputs, batch_size): # Attention attention_cell = AttentionWrapper( DecoderPrenetWrapper(GRUCell(self._hparams.get('attention_depth')), self._is_training, self._hparams.get('prenet_depths')), BahdanauAttention(self._hparams.get('attention_depth'), encoder_outputs), alignment_history=True, output_attention=False) # [N, T_in, attention_depth=256] # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector. concat_cell = ConcatOutputAndAttentionWrapper( attention_cell) # [N, T_in, 2*attention_depth=512] # Decoder (layers specified bottom to top): decoder_cell = MultiRNNCell( [ OutputProjectionWrapper(concat_cell, self._hparams.get('decoder_depth')), ResidualWrapper(GRUCell(self._hparams.get('decoder_depth'))), ResidualWrapper(GRUCell(self._hparams.get('decoder_depth'))) ], state_is_tuple=True) # [N, T_in, decoder_depth=256] # Project onto r mel spectrograms (predict r outputs at each RNN step): output_cell = OutputProjectionWrapper( decoder_cell, self._hparams.get('num_mels') * self._hparams.get('outputs_per_step')) decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32) (decoder_outputs, _), final_decoder_state, _ = dynamic_decode( BasicDecoder(output_cell, self._helper, decoder_init_state), maximum_iterations=self._hparams.get( 'max_iters')) # [N, T_out/r, M*r] mel_outputs = tf.reshape( decoder_outputs, [batch_size, -1, self._hparams.get('num_mels')]) # Post processing CHBG kwargs = { 'K': self._hparams.get('decoder_K'), 'bank_num_filters': self._hparams.get('decoder_bank_num_filters'), 'pooling_stride': self._hparams.get('decoder_pooling_stride'), 'pooling_width': self._hparams.get('decoder_pooling_width'), 'proj_num_filters': self._hparams.get('decoder_proj_num_filters'), 'proj_filter_width': self._hparams.get('decoder_proj_filter_width'), 'num_highway_layers': self._hparams.get('decoder_num_highway_layers'), 'highway_depth': self._hparams.get('decoder_highway_depth'), 'gru_num_cells': self._hparams.get('decoder_gru_num_cells') } post_out = cbhg(mel_outputs, None, self._is_training, 'post_cbhg', **kwargs) lin_outputs = tf.layers.dense(post_out, self._hparams.get('num_freq')) return mel_outputs, lin_outputs, final_decoder_state
def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim, target_dict_dim, is_generating, beam_size, max_generation_length): src_word_idx = tf.placeholder(tf.int32, shape=[None, None]) src_sequence_length = tf.placeholder(tf.int32, shape=[None, ]) src_embedding_weights = tf.get_variable("source_word_embeddings", [source_dict_dim, embedding_dim]) src_embedding = tf.nn.embedding_lookup(src_embedding_weights, src_word_idx) src_forward_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size) src_reversed_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size) # no peephole encoder_outputs, _ = tf.nn.bidirectional_dynamic_rnn( cell_fw=src_forward_cell, cell_bw=src_reversed_cell, inputs=src_embedding, sequence_length=src_sequence_length, dtype=tf.float32) # concat the forward outputs and backward outputs encoded_vec = tf.concat(encoder_outputs, axis=2) # project the encoder outputs to size of decoder lstm encoded_proj = tf.contrib.layers.fully_connected( inputs=tf.reshape( encoded_vec, shape=[-1, embedding_dim * 2]), num_outputs=decoder_size, activation_fn=None, biases_initializer=None) encoded_proj_reshape = tf.reshape( encoded_proj, shape=[-1, tf.shape(encoded_vec)[1], decoder_size]) # get init state for decoder lstm's H backword_first = tf.slice(encoder_outputs[1], [0, 0, 0], [-1, 1, -1]) decoder_boot = tf.contrib.layers.fully_connected( inputs=tf.reshape( backword_first, shape=[-1, embedding_dim]), num_outputs=decoder_size, activation_fn=tf.nn.tanh, biases_initializer=None) # prepare the initial state for decoder lstm cell_init = tf.zeros(tf.shape(decoder_boot), tf.float32) initial_state = LSTMStateTuple(cell_init, decoder_boot) # create decoder lstm cell decoder_cell = LSTMCellWithSimpleAttention( decoder_size, encoded_vec if not is_generating else seq2seq.tile_batch(encoded_vec, beam_size), encoded_proj_reshape if not is_generating else seq2seq.tile_batch(encoded_proj_reshape, beam_size), src_sequence_length if not is_generating else seq2seq.tile_batch(src_sequence_length, beam_size), forget_bias=0.0) output_layer = Dense(target_dict_dim, name='output_projection') if not is_generating: trg_word_idx = tf.placeholder(tf.int32, shape=[None, None]) trg_sequence_length = tf.placeholder(tf.int32, shape=[None, ]) trg_embedding_weights = tf.get_variable( "target_word_embeddings", [target_dict_dim, embedding_dim]) trg_embedding = tf.nn.embedding_lookup(trg_embedding_weights, trg_word_idx) training_helper = seq2seq.TrainingHelper( inputs=trg_embedding, sequence_length=trg_sequence_length, time_major=False, name='training_helper') training_decoder = seq2seq.BasicDecoder( cell=decoder_cell, helper=training_helper, initial_state=initial_state, output_layer=output_layer) # get the max length of target sequence max_decoder_length = tf.reduce_max(trg_sequence_length) decoder_outputs_train, _, _ = seq2seq.dynamic_decode( decoder=training_decoder, output_time_major=False, impute_finished=True, maximum_iterations=max_decoder_length) decoder_logits_train = tf.identity(decoder_outputs_train.rnn_output) decoder_pred_train = tf.argmax( decoder_logits_train, axis=-1, name='decoder_pred_train') masks = tf.sequence_mask( lengths=trg_sequence_length, maxlen=max_decoder_length, dtype=tf.float32, name='masks') # place holder of label sequence lbl_word_idx = tf.placeholder(tf.int32, shape=[None, None]) # compute the loss loss = seq2seq.sequence_loss( logits=decoder_logits_train, targets=lbl_word_idx, weights=masks, average_across_timesteps=True, average_across_batch=True) # return feeding list and loss operator 
return { 'src_word_idx': src_word_idx, 'src_sequence_length': src_sequence_length, 'trg_word_idx': trg_word_idx, 'trg_sequence_length': trg_sequence_length, 'lbl_word_idx': lbl_word_idx }, loss else: start_tokens = tf.ones([tf.shape(src_word_idx)[0], ], tf.int32) * START_TOKEN_IDX # share the same embedding weights with target word trg_embedding_weights = tf.get_variable( "target_word_embeddings", [target_dict_dim, embedding_dim]) inference_decoder = beam_search_decoder.BeamSearchDecoder( cell=decoder_cell, embedding=lambda tokens: tf.nn.embedding_lookup(trg_embedding_weights, tokens), start_tokens=start_tokens, end_token=END_TOKEN_IDX, initial_state=tf.nn.rnn_cell.LSTMStateTuple( tf.contrib.seq2seq.tile_batch(initial_state[0], beam_size), tf.contrib.seq2seq.tile_batch(initial_state[1], beam_size)), beam_width=beam_size, output_layer=output_layer) decoder_outputs_decode, _, _ = seq2seq.dynamic_decode( decoder=inference_decoder, output_time_major=False, #impute_finished=True,# error occurs maximum_iterations=max_generation_length) predicted_ids = decoder_outputs_decode.predicted_ids return { 'src_word_idx': src_word_idx, 'src_sequence_length': src_sequence_length }, predicted_ids
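# --- Illustrative sketch of the beam-search plumbing used above: the attention memory,
# its sequence lengths and the decoder initial state are all tiled beam_width times with
# seq2seq.tile_batch before a BeamSearchDecoder is built. Sizes, start/end token ids and
# the GRU/attention choice are assumptions, not taken from the function above.
import tensorflow as tf
from tensorflow.contrib import seq2seq

beam_width, vocab_size, hidden = 5, 1000, 128
encoder_outputs = tf.placeholder(tf.float32, [None, None, hidden])
source_lengths = tf.placeholder(tf.int32, [None])
embedding = tf.get_variable('embedding', [vocab_size, hidden])
batch_size = tf.shape(encoder_outputs)[0]

tiled_memory = seq2seq.tile_batch(encoder_outputs, multiplier=beam_width)
tiled_lengths = seq2seq.tile_batch(source_lengths, multiplier=beam_width)

cell = tf.nn.rnn_cell.GRUCell(hidden)
attention = seq2seq.BahdanauAttention(hidden, tiled_memory,
                                      memory_sequence_length=tiled_lengths)
cell = seq2seq.AttentionWrapper(cell, attention)
initial_state = cell.zero_state(batch_size * beam_width, tf.float32)

decoder = seq2seq.BeamSearchDecoder(
    cell=cell,
    embedding=embedding,
    start_tokens=tf.fill([batch_size], 1),    # assumed <s> id
    end_token=2,                              # assumed </s> id
    initial_state=initial_state,
    beam_width=beam_width,
    output_layer=tf.layers.Dense(vocab_size, use_bias=False))
outputs, _, _ = seq2seq.dynamic_decode(decoder, maximum_iterations=50)
predicted_ids = outputs.predicted_ids         # [batch_size, time, beam_width]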
def BuildNetwork(self, learningRate): self.dataInput = tensorflow.placeholder(dtype=tensorflow.float32, shape=[None, None, self.featureShape], name='DataInput') self.seqInput = tensorflow.placeholder(dtype=tensorflow.int32, shape=[None], name='SeqInput') ############################################################################# # Batch Parameters ############################################################################# self.parameters['BatchSize'], self.parameters['TimeStep'], _ = tensorflow.unstack( tensorflow.shape(input=self.dataInput, name='DataShape')) ################################################################################################### # Encoder ################################################################################################### with tensorflow.variable_scope('Encoder_AE'): self.parameters['Encoder_Cell_Forward_AE'] = tensorflow.nn.rnn_cell.MultiRNNCell( cells=[rnn.LSTMCell(num_units=self.hiddenNodules) for _ in range(self.rnnLayers)], state_is_tuple=True) self.parameters['Encoder_Cell_Backward_AE'] = tensorflow.nn.rnn_cell.MultiRNNCell( cells=[rnn.LSTMCell(num_units=self.hiddenNodules) for _ in range(self.rnnLayers)], state_is_tuple=True) self.parameters['Encoder_Output_AE'], self.parameters['Encoder_FinalState_AE'] = \ tensorflow.nn.bidirectional_dynamic_rnn( cell_fw=self.parameters['Encoder_Cell_Forward_AE'], cell_bw=self.parameters['Encoder_Cell_Backward_AE'], inputs=self.dataInput, sequence_length=self.seqInput, dtype=tensorflow.float32) if self.attention is None: self.parameters['Decoder_InitalState_AE'] = [] for index in range(self.rnnLayers): self.parameters['Encoder_Cell_Layer%d_AE' % index] = rnn.LSTMStateTuple( c=tensorflow.concat([self.parameters['Encoder_FinalState_AE'][index][0].c, self.parameters['Encoder_FinalState_AE'][index][1].c], axis=1), h=tensorflow.concat([self.parameters['Encoder_FinalState_AE'][index][0].h, self.parameters['Encoder_FinalState_AE'][index][1].h], axis=1)) self.parameters['Decoder_InitalState_AE'].append(self.parameters['Encoder_Cell_Layer%d_AE' % index]) self.parameters['Decoder_InitalState_AE'] = tuple(self.parameters['Decoder_InitalState_AE']) else: self.attentionList = self.attention(dataInput=self.parameters['Encoder_Output_AE'], scopeName=self.attentionName, hiddenNoduleNumber=2 * self.hiddenNodules, attentionScope=self.attentionScope, blstmFlag=True) self.parameters['Decoder_InitalState_AE'] = [] for index in range(self.rnnLayers): self.parameters['Encoder_Cell_Layer%d_AE' % index] = rnn.LSTMStateTuple( c=self.attentionList['FinalResult'], h=tensorflow.concat( [self.parameters['Encoder_FinalState_AE'][index][0].h, self.parameters['Encoder_FinalState_AE'][index][1].h], axis=1)) self.parameters['Decoder_InitalState_AE'].append(self.parameters['Encoder_Cell_Layer%d_AE' % index]) self.parameters['Decoder_InitalState_AE'] = tuple(self.parameters['Decoder_InitalState_AE']) ############################################################################# # Decoder Label Pretreatment ############################################################################# self.parameters['Decoder_Helper_AE'] = seq2seq.TrainingHelper( inputs=self.dataInput, sequence_length=self.seqInput, name='Decoder_Helper_AE') with tensorflow.variable_scope('Decoder_AE'): self.parameters['Decoder_FC_AE'] = Dense(self.featureShape) self.parameters['Decoder_Cell_AE'] = tensorflow.nn.rnn_cell.MultiRNNCell( cells=[rnn.LSTMCell(num_units=self.hiddenNodules * 2) for _ in range(self.rnnLayers)], state_is_tuple=True) self.parameters['Decoder_AE'] = 
seq2seq.BasicDecoder( cell=self.parameters['Decoder_Cell_AE'], helper=self.parameters['Decoder_Helper_AE'], initial_state=self.parameters['Decoder_InitalState_AE'], output_layer=self.parameters['Decoder_FC_AE']) self.parameters['Decoder_Logits_AE'], self.parameters['Decoder_FinalState_AE'], self.parameters[ 'Decoder_FinalSeq_AE'] = seq2seq.dynamic_decode(decoder=self.parameters['Decoder_AE']) ############################################################################# # Losses ############################################################################# self.parameters['Loss_AE'] = tensorflow.losses.absolute_difference( labels=self.dataInput, predictions=self.parameters['Decoder_Logits_AE'][0], weights=self.weight) self.train = tensorflow.train.AdamOptimizer(learning_rate=learningRate).minimize(self.parameters['Loss_AE'])
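# --- Illustrative sketch of the seq2seq autoencoder pattern above: a TrainingHelper is
# fed the raw feature frames directly (no embedding lookup), the output layer projects
# back to the feature size, and an L1 reconstruction loss is taken against the inputs.
# Feature/cell sizes are assumptions; it also assumes max(lengths) equals the padded
# time dimension, as the model above does.
import tensorflow as tf
from tensorflow.contrib import seq2seq, rnn

feature_dim, hidden = 40, 128
features = tf.placeholder(tf.float32, [None, None, feature_dim])
lengths = tf.placeholder(tf.int32, [None])

helper = seq2seq.TrainingHelper(inputs=features, sequence_length=lengths)
cell = rnn.LSTMCell(hidden)
decoder = seq2seq.BasicDecoder(
    cell, helper,
    initial_state=cell.zero_state(tf.shape(features)[0], tf.float32),
    output_layer=tf.layers.Dense(feature_dim))
outputs, _, _ = seq2seq.dynamic_decode(decoder)

mask = tf.expand_dims(tf.sequence_mask(lengths, dtype=tf.float32), -1)
reconstruction_loss = tf.losses.absolute_difference(
    labels=features, predictions=outputs.rnn_output, weights=mask)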
def BuildNetwork(self, learningRate): self.dataInput = tensorflow.placeholder( dtype=tensorflow.float32, shape=[self.batchSize, 1000, 40], name='dataInput') self.dataSeqInput = tensorflow.placeholder(dtype=tensorflow.int32, shape=[self.batchSize], name='dataSeqInput') self.labelInput = tensorflow.placeholder(dtype=tensorflow.int32, shape=[self.batchSize, None], name='labelInput') self.labelSeqInput = tensorflow.placeholder(dtype=tensorflow.int32, shape=[self.batchSize], name='labelSeqInput') self.parameters['EmbeddingDictionary'] = tensorflow.Variable( initial_value=tensorflow.truncated_normal([50, 256]), dtype=tensorflow.float32, name='EmbeddingDictionary') self.parameters['EmbeddingResult'] = tensorflow.nn.embedding_lookup( params=self.parameters['EmbeddingDictionary'], ids=self.labelInput, name='EmbeddingResult') with tensorflow.name_scope('Encoder'): self.parameters['Encoder_FW_Cell'] = rnn.LSTMCell( num_units=self.hiddenNoduleNumber, name='Encoder_FW_Cell') self.parameters['Encoder_BW_Cell'] = rnn.LSTMCell( num_units=self.hiddenNoduleNumber, name='Encoder_BW_Cell') [self.parameters['Encoder_FW_Output'], self.parameters['Encoder_BW_Output']], \ [self.parameters['Encoder_FW_FinalState'], self.parameters['Encoder_BW_FinalState']] = \ tensorflow.nn.bidirectional_dynamic_rnn( cell_fw=self.parameters['Encoder_FW_Cell'], cell_bw=self.parameters['Encoder_BW_Cell'], inputs=self.dataInput, sequence_length=self.dataSeqInput, dtype=tensorflow.float32) self.parameters['EncoderOutput'] = tensorflow.concat( [ self.parameters['Encoder_FW_Output'], self.parameters['Encoder_BW_Output'] ], axis=2, name='EncoderOutput') self.parameters['Encoder_FinalState_C'] = tensorflow.concat( [ self.parameters['Encoder_FW_FinalState'].c, self.parameters['Encoder_BW_FinalState'].c ], axis=1, name='Encoder_FinalState_C') self.parameters['Encoder_FinalState_H'] = tensorflow.concat( [ self.parameters['Encoder_FW_FinalState'].h, self.parameters['Encoder_BW_FinalState'].h ], axis=1, name='Encoder_FinalState_H') self.parameters['Encoder_FinalState'] = rnn.LSTMStateTuple( c=self.parameters['Encoder_FinalState_C'], h=self.parameters['Encoder_FinalState_H']) ################################################################################# self.parameters['Helper'] = seq2seq.GreedyEmbeddingHelper( embedding=self.parameters['EmbeddingDictionary'], start_tokens=tensorflow.ones(self.batchSize, dtype=tensorflow.int32) * 40, end_token=0) self.parameters['Decoder_Cell'] = rnn.LSTMCell(num_units=2 * self.hiddenNoduleNumber) self.parameters['Decoder'] = seq2seq.BasicDecoder( cell=self.parameters['Decoder_Cell'], helper=self.parameters['Helper'], initial_state=self.parameters['Encoder_FinalState']) self.parameters['DecoderOutput'], self.parameters['DecoderFinalState'], self.parameters['DecoderSeqLen'] = \ seq2seq.dynamic_decode(decoder=self.parameters['Decoder'], output_time_major=False, maximum_iterations=tensorflow.reduce_max(self.labelSeqInput)) self.parameters['Logits'] = tensorflow.layers.dense( inputs=self.parameters['DecoderOutput'][0], units=50, activation=None, name='Logits') # self.parameters['Mask'] = tensorflow.to_float(tensorflow.not_equal(self.labelInput, 0)) self.parameters['Loss'] = tensorflow.reduce_mean( tensorflow.nn.softmax_cross_entropy_with_logits_v2( labels=tensorflow.one_hot(self.labelInput, depth=50, dtype=tensorflow.float32), logits=self.parameters['Logits']), name='Loss') self.train = tensorflow.train.AdamOptimizer( learning_rate=learningRate).minimize(self.parameters['Loss'])
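# --- Illustrative sketch of the inference pattern above: a GreedyEmbeddingHelper feeds
# the embedding of the previous argmax token back into a BasicDecoder until the end
# token is produced. Vocabulary size, start/end ids and cell size are assumptions.
import tensorflow as tf
from tensorflow.contrib import seq2seq, rnn

vocab_size, embed_dim, hidden, batch_size = 50, 64, 128, 8
embedding = tf.get_variable('embedding', [vocab_size, embed_dim])

helper = seq2seq.GreedyEmbeddingHelper(
    embedding=embedding,
    start_tokens=tf.fill([batch_size], 1),    # assumed start-of-sequence id
    end_token=0)                              # assumed end-of-sequence id

cell = rnn.LSTMCell(hidden)
decoder = seq2seq.BasicDecoder(
    cell=cell,
    helper=helper,
    initial_state=cell.zero_state(batch_size, tf.float32),
    output_layer=tf.layers.Dense(vocab_size))

outputs, final_state, lengths = seq2seq.dynamic_decode(
    decoder, maximum_iterations=30)
predicted_ids = outputs.sample_id             # [batch_size, <= 30]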
def _init_tensorflow(self, infer: bool=False) -> 'tf': """ Deferred importing of tensorflow and initializing model for training or sampling. This is necessary for two reasons: first, the tensorflow graph is different for training and inference, so must be reset when switching between modes. Second, importing tensorflow takes a long time, so we only want to do it if we actually need to. Parameters ---------- infer : bool If True, initialize model for inference. If False, initialize model for training. Returns ------- module TensorFlow module. """ # quiet tensorflow. See: https://github.com/tensorflow/tensorflow/issues/1258 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' self.cell_fn = { "lstm": rnn.BasicLSTMCell, "gru": rnn.GRUCell, "rnn": rnn.BasicRNNCell }.get(self.model_type, None) if self.cell_fn is None: raise clgen.UserError("Unrecognized model type") # reset the graph when switching between training and inference tf.reset_default_graph() # corpus info: batch_size = 1 if infer else self.corpus.batch_size seq_length = 1 if infer else self.corpus.seq_length vocab_size = self.corpus.vocab_size cells_lst = [self.cell_fn(self.rnn_size, state_is_tuple=True) for _ in range(self.num_layers)] self.cell = rnn.MultiRNNCell(cells_lst, state_is_tuple=True) with tf.device("/cpu:0"): # Inputs self.encoder_input = tf.placeholder(tf.int32, [batch_size, seq_length]) self.decoder_input = tf.placeholder(tf.int32, [batch_size, seq_length]) self.target_weights = tf.placeholder(tf.int32, [batch_size, seq_length]) self.lengths = tf.placeholder(tf.int32, [batch_size]) self.q = tf.FIFOQueue(capacity=4, dtypes=[tf.int32, tf.int32, tf.int32, tf.int32], shapes=[tf.TensorShape([batch_size, seq_length]), tf.TensorShape([batch_size, seq_length]), tf.TensorShape([batch_size, seq_length]), tf.TensorShape([batch_size])]) self.enqueue_op = self.q.enqueue((self.encoder_input, self.decoder_input, self.target_weights, self.lengths)) next_example = self.q.dequeue() self.inputs = next_example[0] self.dec_inp = next_example[1] self.tweights = tf.to_float(next_example[2]) self.lens = next_example[3] scope_name = 'rnnlm' with tf.variable_scope(scope_name): softmax_w = tf.get_variable("softmax_w", [self.rnn_size, vocab_size]) softmax_b = tf.get_variable("softmax_b", [vocab_size]) with tf.device("/cpu:0"): embedding_dec = tf.get_variable("embedding_dec", [vocab_size, self.rnn_size]) dec_inp2 = tf.nn.embedding_lookup(embedding_dec, self.dec_inp) encoder = SeqEncoder(self.model_type, self.rnn_size, self.num_layers, batch_size, vocab_size) encoder_state = encoder.encode(self.inputs, self.lens) self.mean_latent, self.logvar_latent = encoder_to_latent(encoder_state, self.rnn_size, 32, self.num_layers, tf.float32) self.latent, self.KL_obj, self.KL_cost = sample(self.mean_latent, self.logvar_latent, 32) self.decoder_initial_state = latent_to_decoder(self.latent, self.rnn_size, 32, self.num_layers, tf.float32) decoder_initial_state2 = tuple([rnn.LSTMStateTuple(*single_layer_state) for single_layer_state in self.decoder_initial_state]) helper = seq2seq.TrainingHelper(dec_inp2, self.lens, time_major=False) decoder = seq2seq.BasicDecoder(self.cell, helper, decoder_initial_state2, Dense(vocab_size)) self.final_outputs, self.final_state = seq2seq.dynamic_decode(decoder, output_time_major=False, impute_finished=True, swap_memory=True, scope='rnnlm') self.final_out = self.final_outputs.rnn_output self.probs = tf.nn.softmax(self.final_out) self.cost = seq2seq.sequence_loss(self.final_out, self.inputs, self.tweights) self.learning_rate = 
tf.Variable(0.0, trainable=False) self.epoch = tf.Variable(0, trainable=False) tvars = tf.trainable_variables() grads, _ = tf.clip_by_global_norm( tf.gradients(self.cost + self.KL_obj, tvars, aggregation_method = 2), self.grad_clip) optimizer = tf.train.AdamOptimizer(self.learning_rate) self.train_op = optimizer.apply_gradients(zip(grads, tvars)) return tf
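# --- Generic sketch of the reparameterization and KL term that the latent step above
# relies on (the project's own `sample`/`encoder_to_latent` helpers are not reproduced
# here). The latent dimension matches the 32 used above; everything else is illustrative.
import tensorflow as tf

latent_dim = 32
mean = tf.placeholder(tf.float32, [None, latent_dim])
logvar = tf.placeholder(tf.float32, [None, latent_dim])

eps = tf.random_normal(tf.shape(mean))
z = mean + tf.exp(0.5 * logvar) * eps          # reparameterized latent sample

# KL(q(z|x) || N(0, I)), averaged over the batch.
kl_cost = tf.reduce_mean(
    -0.5 * tf.reduce_sum(1.0 + logvar - tf.square(mean) - tf.exp(logvar), axis=1))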
def _build(self, decoding_strategy="train_greedy", initial_state=None, inputs=None, sequence_length=None, embedding=None, start_tokens=None, end_token=None, softmax_temperature=None, max_decoding_length=None, impute_finished=False, output_time_major=False, input_time_major=False, helper=None, mode=None, **kwargs): """Performs decoding. This is a shared interface for both :class:`~texar.tf.modules.BasicRNNDecoder` and :class:`~texar.tf.modules.AttentionRNNDecoder`. The function provides **3 ways** to specify the decoding method, with varying flexibility: 1. The :attr:`decoding_strategy` argument: A string taking value of: - **"train_greedy"**: decoding in teacher-forcing fashion \ (i.e., feeding \ `ground truth` to decode the next step), and each sample is \ obtained by taking the `argmax` of the RNN output logits. \ Arguments :attr:`(inputs, sequence_length, input_time_major)` \ are required for this strategy, and argument :attr:`embedding` \ is optional. - **"infer_greedy"**: decoding in inference fashion (i.e., feeding \ the `generated` sample to decode the next step), and each sample\ is obtained by taking the `argmax` of the RNN output logits.\ Arguments :attr:`(embedding, start_tokens, end_token)` are \ required for this strategy, and argument \ :attr:`max_decoding_length` is optional. - **"infer_sample"**: decoding in inference fashion, and each sample is obtained by `random sampling` from the RNN output distribution. Arguments \ :attr:`(embedding, start_tokens, end_token)` are \ required for this strategy, and argument \ :attr:`max_decoding_length` is optional. This argument is used only when argument :attr:`helper` is `None`. Example: .. code-block:: python embedder = WordEmbedder(vocab_size=data.vocab.size) decoder = BasicRNNDecoder(vocab_size=data.vocab.size) # Teacher-forcing decoding outputs_1, _, _ = decoder( decoding_strategy='train_greedy', inputs=embedder(data_batch['text_ids']), sequence_length=data_batch['length']-1) # Random sample decoding. Gets 100 sequence samples outputs_2, _, sequence_length = decoder( decoding_strategy='infer_sample', start_tokens=[data.vocab.bos_token_id]*100, end_token=data.vocab.eos.token_id, embedding=embedder, max_decoding_length=60) 2. The :attr:`helper` argument: An instance of subclass of \ :class:`texar.tf.modules.Helper`. This provides a superset of decoding strategies than above, for example: - :class:`~texar.tf.modules.TrainingHelper` corresponding to the \ "train_greedy" strategy. - :class:`~texar.tf.modules.GreedyEmbeddingHelper` and \ :class:`~texar.tf.modules.SampleEmbeddingHelper` corresponding to \ the "infer_greedy" and "infer_sample", respectively. - :class:`~texar.tf.modules.TopKSampleEmbeddingHelper` for Top-K \ sample decoding. - :class:`ScheduledEmbeddingTrainingHelper` and \ :class:`ScheduledOutputTrainingHelper` for scheduled \ sampling. - :class:`~texar.tf.modules.SoftmaxEmbeddingHelper` and \ :class:`~texar.tf.modules.GumbelSoftmaxEmbeddingHelper` for \ soft decoding and gradient backpropagation. Helpers give the maximal flexibility of configuring the decoding\ strategy. Example: .. 
code-block:: python embedder = WordEmbedder(vocab_size=data.vocab.size) decoder = BasicRNNDecoder(vocab_size=data.vocab.size) # Teacher-forcing decoding, same as above with # `decoding_strategy='train_greedy'` helper_1 = tx.modules.TrainingHelper( inputs=embedder(data_batch['text_ids']), sequence_length=data_batch['length']-1) outputs_1, _, _ = decoder(helper=helper_1) # Gumbel-softmax decoding helper_2 = GumbelSoftmaxEmbeddingHelper( embedding=embedder, start_tokens=[data.vocab.bos_token_id]*100, end_token=data.vocab.eos_token_id, tau=0.1) outputs_2, _, sequence_length = decoder( max_decoding_length=60, helper=helper_2) 3. :attr:`hparams["helper_train"]` and :attr:`hparams["helper_infer"]`:\ Specifying the helper through hyperparameters. Train and infer \ strategy is toggled based on :attr:`mode`. Appropriate arguments \ (e.g., :attr:`inputs`, :attr:`start_tokens`, etc.) are selected to \ construct the helper. Additional arguments for helper constructor \ can be provided either through :attr:`**kwargs`, or through \ :attr:`hparams["helper_train/infer"]["kwargs"]`. This way of specifying the helper is used only when both :attr:`decoding_strategy` and \ :attr:`helper` are `None`. Example: .. code-block:: python h = { "helper_infer": { "type": "GumbelSoftmaxEmbeddingHelper", "kwargs": { "tau": 0.1 } } } embedder = WordEmbedder(vocab_size=data.vocab.size) decoder = BasicRNNDecoder(vocab_size=data.vocab.size, hparams=h) # Gumbel-softmax decoding output, _, _ = decoder( decoding_strategy=None, # Set to None explicitly embedding=embedder, start_tokens=[data.vocab.bos_token_id]*100, end_token=data.vocab.eos_token_id, max_decoding_length=60, mode=tf.estimator.ModeKeys.PREDICT) # PREDICT mode also shuts down dropout Args: decoding_strategy (str): A string specifying the decoding strategy. Different arguments are required based on the strategy. Ignored if :attr:`helper` is given. initial_state (optional): Initial state of decoding. If `None` (default), zero state is used. inputs (optional): Input tensors for teacher forcing decoding. Used when `decoding_strategy` is set to "train_greedy", or when `hparams`-configured helper is used. - If :attr:`embedding` is `None`, `inputs` is directly \ fed to the decoder. E.g., in `"train_greedy"` strategy, \ `inputs` must be a 3D Tensor of shape \ `[batch_size, max_time, emb_dim]` (or \ `[max_time, batch_size, emb_dim]` if `input_time_major`==True). - If `embedding` is given, `inputs` is used as index \ to look up embeddings and feed in the decoder. \ E.g., if `embedding` is an instance of \ :class:`~texar.tf.modules.WordEmbedder`, \ then :attr:`inputs` is usually a 2D int Tensor \ `[batch_size, max_time]` (or \ `[max_time, batch_size]` if `input_time_major`==True) \ containing the token indexes. sequence_length (optional): A 1D int Tensor containing the sequence length of :attr:`inputs`. Used when `decoding_strategy="train_greedy"` or `hparams`-configured helper is used. embedding (optional): Embedding used when: - "infer_greedy" or "infer_sample" `decoding_strategy` is \ used. This can be a callable or the `params` argument for \ :tf_main:`embedding_lookup <nn/embedding_lookup>`. \ If a callable, it can take a vector tensor of token `ids`, \ or take two arguments (`ids`, `times`), where `ids` \ is a vector tensor of token ids, and `times` is a vector tensor\ of time steps (i.e., position ids). The latter case can be used\ when :attr:`embedding` is a combination of word embedding and\ position embedding. `embedding` is required in this case.
- "train_greedy" `decoding_strategy` is used.\ This can be a callable or the `params` argument for \ :tf_main:`embedding_lookup <nn/embedding_lookup>`. \ If a callable, it can take :attr:`inputs` and returns \ the input embedding. `embedding` is optional in this case. start_tokens (optional): A int Tensor of shape `[batch_size]`, the start tokens. Used when `decoding_strategy="infer_greedy"` or `"infer_sample"`, or when the helper specified in `hparams` is used. Example: .. code-block:: python data = tx.data.MonoTextData(hparams) iterator = DataIterator(data) batch = iterator.get_next() bos_token_id = data.vocab.bos_token_id start_tokens=tf.ones_like(batch['length'])*bos_token_id end_token (optional): A int 0D Tensor, the token that marks end of decoding. Used when `decoding_strategy="infer_greedy"` or `"infer_sample"`, or when the helper specified in `hparams` is used. softmax_temperature (optional): A float 0D Tensor, value to divide the logits by before computing the softmax. Larger values (above 1.0) result in more random samples. Must > 0. If `None`, 1.0 is used. Used when `decoding_strategy="infer_sample"`. max_decoding_length: A int scalar Tensor indicating the maximum allowed number of decoding steps. If `None` (default), either `hparams["max_decoding_length_train"]` or `hparams["max_decoding_length_infer"]` is used according to :attr:`mode`. impute_finished (bool): If `True`, then states for batch entries which are marked as finished get copied through and the corresponding outputs get zeroed out. This causes some slowdown at each time step, but ensures that the final state and outputs have the correct values and that backprop ignores time steps that were marked as finished. output_time_major (bool): If `True`, outputs are returned as time major tensors. If `False` (default), outputs are returned as batch major tensors. input_time_major (optional): Whether the :attr:`inputs` tensor is time major. Used when `decoding_strategy="train_greedy"` or `hparams`-configured helper is used. helper (optional): An instance of :class:`texar.tf.modules.Helper` that defines the decoding strategy. If given, `decoding_strategy` and helper configs in :attr:`hparams` are ignored. mode (str, optional): A string taking value in :tf_main:`tf.estimator.ModeKeys <estimator/ModeKeys>`. If `TRAIN`, training related hyperparameters are used (e.g., `hparams['max_decoding_length_train']`), otherwise, inference related hyperparameters are used (e.g., `hparams['max_decoding_length_infer']`). If `None` (default), `TRAIN` mode is used. **kwargs: Other keyword arguments for constructing helpers defined by `hparams["helper_trainn"]` or `hparams["helper_infer"]`. Returns: `(outputs, final_state, sequence_lengths)`, where - **`outputs`**: an object containing the decoder output on all \ time steps. - **`final_state`**: is the cell state of the final time step. - **`sequence_lengths`**: is an int Tensor of shape `[batch_size]` \ containing the length of each sample. 
""" # Helper if helper is not None: pass elif decoding_strategy is not None: if decoding_strategy == "train_greedy": helper = rnn_decoder_helpers._get_training_helper( inputs, sequence_length, embedding, input_time_major) elif decoding_strategy == "infer_greedy": helper = tx_helper.GreedyEmbeddingHelper( embedding, start_tokens, end_token) elif decoding_strategy == "infer_sample": helper = tx_helper.SampleEmbeddingHelper( embedding, start_tokens, end_token, softmax_temperature) else: raise ValueError( "Unknown decoding strategy: {}".format(decoding_strategy)) else: if is_train_mode_py(mode): kwargs_ = copy.copy(self._hparams.helper_train.kwargs.todict()) helper_type = self._hparams.helper_train.type else: kwargs_ = copy.copy(self._hparams.helper_infer.kwargs.todict()) helper_type = self._hparams.helper_infer.type kwargs_.update({ "inputs": inputs, "sequence_length": sequence_length, "time_major": input_time_major, "embedding": embedding, "start_tokens": start_tokens, "end_token": end_token, "softmax_temperature": softmax_temperature }) kwargs_.update(kwargs) helper = rnn_decoder_helpers.get_helper(helper_type, **kwargs_) self._helper = helper # Initial state if initial_state is not None: self._initial_state = initial_state else: self._initial_state = self.zero_state(batch_size=self.batch_size, dtype=tf.float32) # Maximum decoding length max_l = max_decoding_length if max_l is None: max_l_train = self._hparams.max_decoding_length_train if max_l_train is None: max_l_train = utils.MAX_SEQ_LENGTH max_l_infer = self._hparams.max_decoding_length_infer if max_l_infer is None: max_l_infer = utils.MAX_SEQ_LENGTH max_l = tf.cond(is_train_mode(mode), lambda: max_l_train, lambda: max_l_infer) self.max_decoding_length = max_l # Decode outputs, final_state, sequence_lengths = dynamic_decode( decoder=self, impute_finished=impute_finished, maximum_iterations=max_l, output_time_major=output_time_major) if not self._built: self._add_internal_trainable_variables() # Add trainable variables of `self._cell` which may be # constructed externally. self._add_trainable_variable( layers.get_rnn_cell_trainable_variables(self._cell)) if isinstance(self._output_layer, tf.layers.Layer): self._add_trainable_variable( self._output_layer.trainable_variables) # Add trainable variables of `self._beam_search_rnn_cell` which # may already be constructed and used. if self._beam_search_cell is not None: self._add_trainable_variable( self._beam_search_cell.trainable_variables) self._built = True return outputs, final_state, sequence_lengths
def decode(self, encoder_outputs, encoder_state, source_sequence_length): with tf.variable_scope("Decoder") as scope: beam_width = self.beam_width decoder_type = self.decoder_type seq_max_len = self.seq_max_len batch_size = tf.shape(encoder_outputs)[0] if self.path_embed_method == "lstm": self.decoder_cell = self._build_decode_cell() if self.mode == "test" and beam_width > 0: memory = seq2seq.tile_batch(self.encoder_outputs, multiplier=beam_width) source_sequence_length = seq2seq.tile_batch(self.source_sequence_length, multiplier=beam_width) encoder_state = seq2seq.tile_batch(self.encoder_state, multiplier=beam_width) batch_size = self.batch_size * beam_width else: memory = encoder_outputs source_sequence_length = source_sequence_length encoder_state = encoder_state attention_mechanism = seq2seq.BahdanauAttention(self.hidden_layer_dim, memory, memory_sequence_length=source_sequence_length) self.decoder_cell = seq2seq.AttentionWrapper(self.decoder_cell, attention_mechanism, attention_layer_size=self.hidden_layer_dim) self.decoder_initial_state = self.decoder_cell.zero_state(batch_size, tf.float32).clone(cell_state=encoder_state) projection_layer = Dense(self.word_vocab_size, use_bias=False) """For training the model""" if self.mode == "train": decoder_train_helper = tf.contrib.seq2seq.TrainingHelper(self.decoder_train_inputs_embedded, self.decoder_train_length) decoder_train = seq2seq.BasicDecoder(self.decoder_cell, decoder_train_helper, self.decoder_initial_state, projection_layer) decoder_outputs_train, decoder_states_train, decoder_seq_len_train = seq2seq.dynamic_decode(decoder_train) decoder_logits_train = decoder_outputs_train.rnn_output self.decoder_logits_train = tf.reshape(decoder_logits_train, [batch_size, -1, self.word_vocab_size]) """For test the model""" # if self.mode == "infer" or self.if_pred_on_dev: if decoder_type == "greedy": decoder_infer_helper = seq2seq.GreedyEmbeddingHelper(self.word_embeddings, tf.ones([batch_size], dtype=tf.int32), self.EOS) decoder_infer = seq2seq.BasicDecoder(self.decoder_cell, decoder_infer_helper, self.decoder_initial_state, projection_layer) elif decoder_type == "beam": decoder_infer = seq2seq.BeamSearchDecoder(cell=self.decoder_cell, embedding=self.word_embeddings, start_tokens=tf.ones([batch_size], dtype=tf.int32), end_token=self.EOS, initial_state=self.decoder_initial_state, beam_width=beam_width, output_layer=projection_layer) decoder_outputs_infer, decoder_states_infer, decoder_seq_len_infer = seq2seq.dynamic_decode(decoder_infer, maximum_iterations=seq_max_len) if decoder_type == "beam": self.decoder_logits_infer = tf.no_op() self.sample_id = decoder_outputs_infer.predicted_ids elif decoder_type == "greedy": self.decoder_logits_infer = decoder_outputs_infer.rnn_output self.sample_id = decoder_outputs_infer.sample_id
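# --- Illustrative sketch of the initial-state handling above: an AttentionWrapper state
# is a namedtuple, so the encoder final state is injected by cloning the zero state
# rather than being passed directly. Sizes below are assumptions; a GRU cell is used so
# the encoder state is a single tensor.
import tensorflow as tf
from tensorflow.contrib import seq2seq

hidden = 128
memory = tf.placeholder(tf.float32, [None, None, hidden])
memory_lengths = tf.placeholder(tf.int32, [None])
encoder_state = tf.placeholder(tf.float32, [None, hidden])
batch_size = tf.shape(memory)[0]

cell = tf.nn.rnn_cell.GRUCell(hidden)
attention = seq2seq.BahdanauAttention(hidden, memory,
                                      memory_sequence_length=memory_lengths)
cell = seq2seq.AttentionWrapper(cell, attention, attention_layer_size=hidden)

# zero_state builds the full AttentionWrapperState; clone() swaps in the encoder state.
decoder_initial_state = cell.zero_state(batch_size, tf.float32).clone(
    cell_state=encoder_state)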
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, gta=False): """ Initializes the model for inference sets "mel_outputs" and "alignments" fields. Args: - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. """ if mel_targets is None and stop_token_targets is not None: raise ValueError( 'no mel targets were provided but token_targets were given') if mel_targets is not None and stop_token_targets is None: raise ValueError( 'Mel targets are provided without corresponding token_targets') print(stop_token_targets[0]) with tf.variable_scope('inference') as scope: is_training = mel_targets is not None and not gta batch_size = tf.shape(inputs)[0] hp = self._hparams # Embeddings ==> [batch_size, sequence_length, embedding_dim] embedding_table = tf.get_variable( 'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer()) embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units] encoder_cell = TacotronEncoderCell( EncoderConvolutions(is_training, kernel_size=hp.enc_conv_kernel_size, channels=hp.enc_conv_channels), EncoderRNN(is_training, size=hp.encoder_lstm_units, zoneout=hp.zoneout_rate, scope='encoder_LSTM')) encoder_outputs = encoder_cell(embedded_inputs, input_lengths) #For shape visualization purpose enc_conv_output_shape = encoder_cell.conv_output_shape #Decoder Parts #Attention Decoder Prenet prenet = Prenet(is_training, layer_sizes=hp.prenet_layers) #Attention Mechanism attention_mechanism = LocationSensitiveAttention( hp.attention_dim, encoder_outputs) #Decoder LSTM Cells decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers, size=hp.decoder_lstm_units, zoneout=hp.zoneout_rate) #Frames Projection layer frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step) #<stop_token> projection layer stop_projection = StopProjection(is_training) #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding) decoder_cell = TacotronDecoderCell(prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection) #Define the helper for our decoder if (is_training or gta) == True: self.helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step) else: self.helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step) #We'll only limit decoder time steps during inference (consult hparams.py to modify the value) max_iterations = None if is_training else hp.max_iters #initial decoder state decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32) #Decode (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode( CustomDecoder(decoder_cell, self.helper, decoder_init_state), impute_finished=hp.impute_finished, maximum_iterations=max_iterations) # Reshape outputs to be one output per entry #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels] decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels]) stop_token_prediction = 
tf.reshape(stop_token_prediction, [batch_size, -1]) #Postnet postnet = Postnet(is_training, kernel_size=hp.postnet_kernel_size, channels=hp.postnet_channels) #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels] residual = postnet(decoder_output) #Project residual to same dimension as mel spectrogram #==> [batch_size, decoder_steps * r, num_mels] residual_projection = FrameProjection(hp.num_mels) projected_residual = residual_projection(residual) #Compute the mel spectrogram mel_outputs = decoder_output + projected_residual #Grab alignments from the final decoder state alignments = tf.transpose( final_decoder_state.alignment_history.stack(), [1, 2, 0]) self.inputs = inputs self.input_lengths = input_lengths self.decoder_output = decoder_output self.alignments = alignments self.stop_token_prediction = stop_token_prediction self.stop_token_targets = stop_token_targets self.mel_outputs = mel_outputs self.mel_targets = mel_targets log('Initialized Tacotron model. Dimensions: ') log(' embedding: {}'.format(embedded_inputs.shape)) log(' enc conv out: {}'.format(enc_conv_output_shape)) log(' encoder out: {}'.format(encoder_outputs.shape)) log(' decoder out: {}'.format(decoder_output.shape)) log(' residual out: {}'.format(residual.shape)) log(' projected residual out: {}'.format( projected_residual.shape)) log(' mel out: {}'.format(mel_outputs.shape)) log(' <stop_token> out: {}'.format( stop_token_prediction.shape))
def rbmE_gruD(mode, features, labels, params): inp = features["x"] if state != "Infering": ids = features["ids"] weights = features["weights"] batch_size = params["batch_size"] #Encoder enc_cell = rnn.NASCell(num_units=NUM_UNITS) enc_out, enc_state = tf.nn.dynamic_rnn(enc_cell, inp, time_major=False, dtype=tf.float32) #Decoder cell = rnn.NASCell(num_units=NUM_UNITS) _, embeddings = load_processed_embeddings(sess=tf.InteractiveSession()) out_lengths = tf.constant(seq_len, shape=[batch_size]) if state != "Infering": #sampling method for training train_helper = seq2seq.TrainingHelper(labels, out_lengths, time_major=False) ''' train_helper=seq2seq.ScheduledEmbeddingTrainingHelper(inputs=labels, sequence_length=out_lengths, embedding=embeddings, sampling_probability=probs) ''' #sampling method for evaluation start_tokens = tf.zeros([batch_size], dtype=tf.int32) infer_helper = seq2seq.GreedyEmbeddingHelper(embedding=embeddings, start_tokens=start_tokens, end_token=END) #infer_helper = seq2seq.SampleEmbeddingHelper(embeddings,start_tokens=start_tokens,end_token=END) #infer_helper=seq2seq.ScheduledEmbeddingTrainingHelper(inputs=inp,sequence_length=out_lengths,embedding=embeddings,sampling_probability=1.0) projection_layer = layers_core.Dense(vocab_size, use_bias=False) def decode(helper): decoder = seq2seq.BasicDecoder(cell=cell, helper=helper, initial_state=enc_state, output_layer=projection_layer) #decoder.tracks_own_finished=True (dec_outputs, _, _) = seq2seq.dynamic_decode(decoder, maximum_iterations=seq_len) #(dec_outputs,_,_) = seq2seq.dynamic_decode(decoder) dec_ids = dec_outputs.sample_id logits = dec_outputs.rnn_output return dec_ids, logits #equalize logits, labels and weight lengths incase of early finish in decoder def norm_logits_loss(logts, ids, weights): current_ts = tf.to_int32( tf.minimum(tf.shape(ids)[1], tf.shape(logts)[1])) logts = tf.slice(logts, begin=[0, 0, 0], size=[-1, current_ts, -1]) ids = tf.slice(ids, begin=[0, 0], size=[-1, current_ts]) weights = tf.slice(weights, begin=[0, 0], size=[-1, current_ts]) return logts, ids, weights #training mode if state == "Training": dec_ids, logits = decode(train_helper) # some sample_id are overwritten with '-1's #dec_ids = tf.argmax(logits, axis=2) tf.identity(dec_ids, name="predictions") logits, ids, weights = norm_logits_loss(logits, ids, weights) loss = tf.contrib.seq2seq.sequence_loss(logits, ids, weights=weights) learning_rate = 0.001 #0.0001 tf.identity(learning_rate, name="learning_rate") #evaluation mode if state == "Evaluating" or state == "Testing": eval_dec_ids, eval_logits = decode(infer_helper) #eval_dec_ids = tf.argmax(eval_logits, axis=2) tf.identity(eval_dec_ids, name="predictions") #equalize logits, labels and weight lengths incase of early finish in decoder eval_logits, ids, weights = norm_logits_loss(eval_logits, ids, weights) ''' current_ts = tf.to_int32(tf.minimum(tf.shape(ids)[1], tf.shape(eval_logits)[1])) ids = tf.slice(ids, begin=[0, 0], size=[-1, current_ts]) weights = tf.slice(weights, begin=[0, 0], size=[-1, current_ts]) #mask_ = tf.sequence_mask(lengths=target_sequence_length, maxlen=current_ts, dtype=eval_logits.dtype) eval_logits = tf.slice(eval_logits, begin=[0,0,0], size=[-1, current_ts, -1]) ''' eval_loss = tf.contrib.seq2seq.sequence_loss(eval_logits, ids, weights=weights) #beamSearch decoder init_state = tf.contrib.seq2seq.tile_batch(enc_state, multiplier=5) beamSearch_decoder = seq2seq.BeamSearchDecoder( cell, embeddings, start_tokens, end_token=END, initial_state=init_state, beam_width=5, 
output_layer=projection_layer) (infer_outputs, _, _) = seq2seq.dynamic_decode(beamSearch_decoder, maximum_iterations=seq_len) infer_ids = infer_outputs.predicted_ids infer_probs = infer_outputs.beam_search_decoder_output.scores infer_probs = tf.reduce_prod(infer_probs, axis=1) infer_pos = tf.argmax(infer_probs, axis=1) infers = {"ids": infer_ids, "pos": infer_pos} if mode == tf.estimator.ModeKeys.TRAIN: train_op = layers.optimize_loss(loss, tf.train.get_global_step(), optimizer='Adam', learning_rate=learning_rate, clip_gradients=5.0) spec = tf.estimator.EstimatorSpec(mode=mode, predictions=dec_ids, loss=loss, train_op=train_op) #evaluation mode elif mode == tf.estimator.ModeKeys.EVAL: spec = tf.estimator.EstimatorSpec(mode=mode, loss=eval_loss, predictions=eval_dec_ids) else: spec = tf.estimator.EstimatorSpec(mode=mode, predictions=infers) return spec
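# --- Illustrative sketch of the length-alignment step above (`norm_logits_loss`): when
# dynamic_decode finishes early, the logits can be shorter than the targets, so logits,
# targets and weights are sliced to the shared length before seq2seq.sequence_loss.
# Shapes and the vocabulary size are assumptions.
import tensorflow as tf
from tensorflow.contrib import seq2seq

logits = tf.placeholder(tf.float32, [None, None, 50])   # [batch, decode_steps, vocab]
targets = tf.placeholder(tf.int32, [None, None])        # [batch, target_steps]
weights = tf.placeholder(tf.float32, [None, None])

current_ts = tf.minimum(tf.shape(targets)[1], tf.shape(logits)[1])
logits = tf.slice(logits, begin=[0, 0, 0], size=[-1, current_ts, -1])
targets = tf.slice(targets, begin=[0, 0], size=[-1, current_ts])
weights = tf.slice(weights, begin=[0, 0], size=[-1, current_ts])

loss = seq2seq.sequence_loss(logits=logits, targets=targets, weights=weights)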
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False, global_step=None, is_training=False, is_evaluating=False, split_infos=None): """ Initializes the model for inference sets "mel_outputs" and "alignments" fields. Args: - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. """ log("tacotron.py:initialize():row42") if mel_targets is None and stop_token_targets is not None: raise ValueError('no multi targets were provided but token_targets were given') if mel_targets is not None and stop_token_targets is None and not gta: raise ValueError('Mel targets are provided without corresponding token_targets') if not gta and self._hparams.predict_linear==True and linear_targets is None and is_training: raise ValueError('Model is set to use post processing to predict linear spectrograms in training but no linear targets given!') if gta and linear_targets is not None: raise ValueError('Linear spectrogram prediction is not supported in GTA mode!') if is_training and self._hparams.mask_decoder and targets_lengths is None: raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!') if is_training and is_evaluating: raise RuntimeError('Model can not be in training and evaluation modes at the same time!') log("tacotron.py:initialize():row56") split_device = '/cpu:0' if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:{}'.format(self._hparams.tacotron_gpu_start_idx) with tf.device(split_device): hp = self._hparams lout_int = [tf.int32]*hp.tacotron_num_gpus lout_float = [tf.float32]*hp.tacotron_num_gpus tower_input_lengths = tf.split(input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0) tower_targets_lengths = tf.split(targets_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0) if targets_lengths is not None else targets_lengths p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]], lout_int) p_mel_targets = tf.py_func(split_func, [mel_targets, split_infos[:,1]], lout_float) if mel_targets is not None else mel_targets p_stop_token_targets = tf.py_func(split_func, [stop_token_targets, split_infos[:,2]], lout_float) if stop_token_targets is not None else stop_token_targets p_linear_targets = tf.py_func(split_func, [linear_targets, split_infos[:,3]], lout_float) if linear_targets is not None else linear_targets tower_inputs = [] tower_mel_targets = [] tower_stop_token_targets = [] tower_linear_targets = [] batch_size = tf.shape(inputs)[0] mel_channels = hp.num_mels linear_channels = hp.num_freq for i in range (hp.tacotron_num_gpus): tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1])) if p_mel_targets is not None: tower_mel_targets.append(tf.reshape(p_mel_targets[i], [batch_size, -1, mel_channels])) if p_stop_token_targets is not None: tower_stop_token_targets.append(tf.reshape(p_stop_token_targets[i], [batch_size, -1])) if p_linear_targets is not None: tower_linear_targets.append(tf.reshape(p_linear_targets[i], [batch_size, -1, linear_channels])) self.tower_decoder_output = [] self.tower_alignments = [] 
self.tower_stop_token_prediction = [] self.tower_mel_outputs = [] self.tower_linear_outputs = [] tower_embedded_inputs = [] tower_enc_conv_output_shape = [] tower_encoder_outputs = [] tower_residual = [] tower_projected_residual = [] log("tacotron.py:initialize():row100") # 1. Declare GPU Devices gpus = ["/gpu:{}".format(i) for i in range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx+hp.tacotron_num_gpus)] for i in range(hp.tacotron_num_gpus): with tf.device(tf.train.replica_device_setter(ps_tasks=1,ps_device="/cpu:0",worker_device=gpus[i])): with tf.variable_scope('inference') as scope: assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled') if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training: assert global_step is not None #GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis post_condition = hp.predict_linear and not gta log("tacotron.py:initialize():row113") # Embeddings ==> [batch_size, sequence_length, embedding_dim] self.embedding_table = tf.get_variable( 'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32) embedded_inputs = tf.nn.embedding_lookup(self.embedding_table, tower_inputs[i]) log("tacotron.py:initialize():row119") #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units] encoder_cell = TacotronEncoderCell( EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'), EncoderRNN(is_training, size=hp.encoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM')) encoder_outputs = encoder_cell(embedded_inputs, tower_input_lengths[i]) #For shape visualization purpose enc_conv_output_shape = encoder_cell.conv_output_shape log("tacotron.py:initialize():row131") #Decoder Parts #Attention Decoder Prenet prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet') #Attention Mechanism attention_mechanism = LocationSensitiveAttention(hp.attention_dim, encoder_outputs, hparams=hp, mask_encoder=hp.mask_encoder, memory_sequence_length=tf.reshape(tower_input_lengths[i], [-1]), smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights) #Decoder LSTM Cells decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers, size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_LSTM') #Frames Projection layer frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, scope='linear_transform_projection') #<stop_token> projection layer stop_projection = StopProjection(is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection') log("tacotron.py:initialize():row147") #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding) decoder_cell = TacotronDecoderCell( prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection) #Define the helper for our decoder if is_training or is_evaluating or gta: self.helper = TacoTrainingHelper(batch_size, tower_mel_targets[i], hp, gta, is_evaluating, global_step) else: self.helper = TacoTestHelper(batch_size, hp) #initial decoder state decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32) #Only use max iterations at synthesis time max_iters = hp.max_iters if not (is_training or is_evaluating) else None log("tacotron.py:initialize():row170") #Decode (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode( CustomDecoder(decoder_cell, self.helper, decoder_init_state), impute_finished=False, 
maximum_iterations=max_iters, swap_memory=hp.tacotron_swap_with_cpu) log("tacotron.py:initialize():row178") # Reshape outputs to be one output per entry #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels] decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels]) stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1]) log("tacotron.py:initialize():row184") #Postnet postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions') log("tacotron.py:initialize():row188") #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels] residual = postnet(decoder_output) #Project residual to same dimension as mel spectrogram #==> [batch_size, decoder_steps * r, num_mels] residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection') projected_residual = residual_projection(residual) log("tacotron.py:initialize():row197") #Compute the mel spectrogram mel_outputs = decoder_output + projected_residual if post_condition: # Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs. post_cbhg = CBHG(hp.cbhg_kernels, hp.cbhg_conv_channels, hp.cbhg_pool_size, [hp.cbhg_projection, hp.num_mels], hp.cbhg_projection_kernel_size, hp.cbhg_highwaynet_layers, hp.cbhg_highway_units, hp.cbhg_rnn_units, is_training, name='CBHG_postnet') #[batch_size, decoder_steps(mel_frames), cbhg_channels] post_outputs = post_cbhg(mel_outputs, None) #Linear projection of extracted features to make linear spectrogram linear_specs_projection = FrameProjection(hp.num_freq, scope='cbhg_linear_specs_projection') #[batch_size, decoder_steps(linear_frames), num_freq] linear_outputs = linear_specs_projection(post_outputs) #Grab alignments from the final decoder state alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0]) self.tower_decoder_output.append(decoder_output) self.tower_alignments.append(alignments) self.tower_stop_token_prediction.append(stop_token_prediction) self.tower_mel_outputs.append(mel_outputs) tower_embedded_inputs.append(embedded_inputs) tower_enc_conv_output_shape.append(enc_conv_output_shape) tower_encoder_outputs.append(encoder_outputs) tower_residual.append(residual) tower_projected_residual.append(projected_residual) if post_condition: self.tower_linear_outputs.append(linear_outputs) log('initialisation done {}'.format(gpus[i])) if is_training: self.ratio = self.helper._ratio self.tower_inputs = tower_inputs self.tower_input_lengths = tower_input_lengths self.tower_mel_targets = tower_mel_targets self.tower_linear_targets = tower_linear_targets self.tower_targets_lengths = tower_targets_lengths self.tower_stop_token_targets = tower_stop_token_targets self.all_vars = tf.trainable_variables() log('Initialized Tacotron model. Dimensions (? 
= dynamic shape): ') log(' Train mode: {}'.format(is_training)) log(' Eval mode: {}'.format(is_evaluating)) log(' GTA mode: {}'.format(gta)) log(' Synthesis mode: {}'.format(not (is_training or is_evaluating))) log(' Input: {}'.format(inputs.shape)) for i in range(hp.tacotron_num_gpus+hp.tacotron_gpu_start_idx): log(' device: {}'.format(i)) log(' embedding: {}'.format(tower_embedded_inputs[i].shape)) log(' enc conv out: {}'.format(tower_enc_conv_output_shape[i])) log(' encoder out: {}'.format(tower_encoder_outputs[i].shape)) log(' decoder out: {}'.format(self.tower_decoder_output[i].shape)) log(' residual out: {}'.format(tower_residual[i].shape)) log(' projected residual out: {}'.format(tower_projected_residual[i].shape)) log(' mel out: {}'.format(self.tower_mel_outputs[i].shape)) if post_condition: log(' linear out: {}'.format(self.tower_linear_outputs[i].shape)) log(' <stop_token> out: {}'.format(self.tower_stop_token_prediction[i].shape)) #1_000_000 is causing syntax problems for some people?! Python please :) log(' Tacotron Parameters {:.3f} Million.'.format(np.sum([np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000))
def __call__(self, mean_image_features=None, mean_object_features=None, spatial_image_features=None, spatial_object_features=None, seq_inputs=None, lengths=None): assert (mean_image_features is not None or mean_object_features is not None or spatial_image_features is not None or spatial_object_features is not None) use_beam_search = (seq_inputs is None or lengths is None) if mean_image_features is not None: batch_size = tf.shape(mean_image_features)[0] elif mean_object_features is not None: batch_size = tf.shape(mean_object_features)[0] elif spatial_image_features is not None: batch_size = tf.shape(spatial_image_features)[0] elif spatial_object_features is not None: batch_size = tf.shape(spatial_object_features)[0] initial_state = self.image_caption_cell.zero_state( batch_size, tf.float32) if use_beam_search: if mean_image_features is not None: mean_image_features = seq2seq.tile_batch( mean_image_features, multiplier=self.beam_size) self.image_caption_cell.mean_image_features = mean_image_features if mean_object_features is not None: mean_object_features = seq2seq.tile_batch( mean_object_features, multiplier=self.beam_size) self.image_caption_cell.mean_object_features = mean_object_features if spatial_image_features is not None: spatial_image_features = seq2seq.tile_batch( spatial_image_features, multiplier=self.beam_size) self.image_caption_cell.spatial_image_features = spatial_image_features if spatial_object_features is not None: spatial_object_features = seq2seq.tile_batch( spatial_object_features, multiplier=self.beam_size) self.image_caption_cell.spatial_object_features = spatial_object_features initial_state = seq2seq.tile_batch(initial_state, multiplier=self.beam_size) decoder = seq2seq.BeamSearchDecoder( self.image_caption_cell, self.embeddings_map, tf.fill([batch_size], self.word_vocabulary.start_id), self.word_vocabulary.end_id, initial_state, self.beam_size, output_layer=self.logits_layer) outputs, state, lengths = seq2seq.dynamic_decode( decoder, maximum_iterations=self.maximum_iterations) ids = tf.transpose(outputs.predicted_ids, [0, 2, 1]) sequence_length = tf.shape(ids)[2] flat_ids = tf.reshape( ids, [batch_size * self.beam_size, sequence_length]) seq_inputs = tf.concat([ tf.fill([batch_size * self.beam_size, 1], self.word_vocabulary.start_id), flat_ids ], 1) if mean_image_features is not None: self.image_caption_cell.mean_image_features = mean_image_features if mean_object_features is not None: self.image_caption_cell.mean_object_features = mean_object_features if spatial_image_features is not None: self.image_caption_cell.spatial_image_features = spatial_image_features if spatial_object_features is not None: self.image_caption_cell.spatial_object_features = spatial_object_features activations, _state = tf.nn.dynamic_rnn( self.image_caption_cell, tf.nn.embedding_lookup(self.embeddings_map, seq_inputs), sequence_length=tf.reshape(lengths, [-1]), initial_state=initial_state) logits = self.logits_layer(activations) if use_beam_search: length = tf.shape(logits)[1] logits = tf.reshape( logits, [batch_size, self.beam_size, length, self.vocab_size]) return logits, tf.argmax(logits, axis=-1, output_type=tf.int32)
def build_train_decoder(self): self.decoder_inputs_embedded = tf.nn.embedding_lookup( params=self.embedding, ids=self.decoder_inputs_train) if self.train_mode == 'ground_truth': training_helper = seq2seq.TrainingHelper( inputs=self.decoder_inputs_embedded, sequence_length=self.decoder_inputs_length_train, time_major=False, name='training_helper') elif self.train_mode == 'scheduled_sampling': training_helper = seq2seq.ScheduledEmbeddingTrainingHelper( inputs=self.decoder_inputs_embedded, sequence_length=self.decoder_inputs_length_train, embedding=lambda inputs: tf.nn.embedding_lookup( self.embedding, inputs), sampling_probability=self.sampling_probability, name='scheduled_embedding_training_helper') else: raise NotImplementedError( 'Train mode: {} is not yet implemented'.format( self.train_mode)) training_decoder = seq2seq.BasicDecoder( cell=self.decoder_cell, helper=training_helper, initial_state=self.decoder_initial_state, output_layer=self.output_layer) max_decoder_length = tf.reduce_max(self.decoder_inputs_length_train) self.decoder_outputs_train, self.decoder_last_state_train, self.decoder_outputs_length_train = seq2seq.dynamic_decode( decoder=training_decoder, output_time_major=False, impute_finished=True, maximum_iterations=max_decoder_length) # NOTE(sdsuo): Not sure why this is necessary self.decoder_logits_train = tf.identity( self.decoder_outputs_train.rnn_output) # Use argmax to extract decoder symbols to emit self.decoder_pred_train = tf.argmax(self.decoder_logits_train, axis=-1, name='decoder_pred_train') # masks: masking for valid and padded time steps, [batch_size, max_time_step + 1] masks = tf.sequence_mask(lengths=self.decoder_inputs_length_train, maxlen=max_decoder_length, dtype=self.dtype, name='masks') # Computes per word average cross-entropy over a batch # Internally calls 'nn_ops.sparse_softmax_cross_entropy_with_logits' by default self.loss = seq2seq.sequence_loss(logits=self.decoder_logits_train, targets=self.decoder_targets_train, weights=masks, average_across_timesteps=True, average_across_batch=True) # Training summary for the current batch_loss tf.summary.scalar('loss', self.loss) # Construct graphs for minimizing loss self.init_optimizer()
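# `self.sampling_probability` above is typically annealed over training rather than held
# fixed. A small sketch, assuming a linear ramp over 100k steps (any monotone schedule
# driven by the global step works the same way).
import tensorflow as tf

global_step = tf.train.get_or_create_global_step()
sampling_probability = tf.minimum(1.0, tf.to_float(global_step) / 100000.0)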
def _build_forward(self): config = self.config N, M, JX, JQ, VW, VC, d, W = \ config.batch_size, config.max_num_sents, config.max_sent_size, \ config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \ config.max_word_size beam_width = config.beam_width GO_TOKEN = 0 EOS_TOKEN = 1 JX = tf.shape(self.x)[2] JQ = tf.shape(self.q)[1] M = tf.shape(self.x)[1] dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size with tf.variable_scope("emb"): if config.use_char_emb: with tf.variable_scope("emb_var"), tf.device("/cpu:0"): char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float') with tf.variable_scope("char"): Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx) # [N, M, JX, W, dc] Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq) # [N, JQ, W, dc] Acx = tf.reshape(Acx, [-1, JX, W, dc]) Acq = tf.reshape(Acq, [-1, JQ, W, dc]) filter_sizes = list( map(int, config.out_channel_dims.split(','))) heights = list(map(int, config.filter_heights.split(','))) assert sum(filter_sizes) == dco, (filter_sizes, dco) with tf.variable_scope("conv"): xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx") if config.share_cnn_weights: tf.get_variable_scope().reuse_variables() qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx") else: qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq") xx = tf.reshape(xx, [-1, M, JX, dco]) qq = tf.reshape(qq, [-1, JQ, dco]) if config.use_word_emb: with tf.variable_scope("emb_var"), tf.device("/cpu:0"): if config.mode == 'train': word_emb_mat = tf.get_variable( "word_emb_mat", dtype='float', shape=[VW, dw], initializer=get_initializer(config.emb_mat), trainable=True) else: word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float') if config.use_glove_for_unk: word_emb_mat = tf.concat( axis=0, values=[word_emb_mat, self.new_emb_mat]) with tf.name_scope("word"): Ax = tf.nn.embedding_lookup(word_emb_mat, self.x) # [N, M, JX, d] Aq = tf.nn.embedding_lookup(word_emb_mat, self.q) # [N, JQ, d] self.tensor_dict['x'] = Ax self.tensor_dict['q'] = Aq if config.use_char_emb: xx = tf.concat(axis=3, values=[xx, Ax]) # [N, M, JX, di] qq = tf.concat(axis=2, values=[qq, Aq]) # [N, JQ, di] else: xx = Ax qq = Aq # highway network if config.highway: with tf.variable_scope("highway"): xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train) tf.get_variable_scope().reuse_variables() qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train) self.tensor_dict['xx'] = xx self.tensor_dict['qq'] = qq cell_fw = BasicLSTMCell(d, state_is_tuple=True) cell_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell_fw = SwitchableDropoutWrapper( cell_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell_bw = SwitchableDropoutWrapper( cell_bw, self.is_train, input_keep_prob=config.input_keep_prob) cell2_fw = BasicLSTMCell(d, state_is_tuple=True) cell2_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell2_fw = SwitchableDropoutWrapper( cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell2_bw = SwitchableDropoutWrapper( cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob) cell3_fw = BasicLSTMCell(d, state_is_tuple=True) cell3_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell3_fw = SwitchableDropoutWrapper( cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell3_bw = 
SwitchableDropoutWrapper( cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob) cell4_fw = BasicLSTMCell(d, state_is_tuple=True) cell4_bw = BasicLSTMCell(d, state_is_tuple=True) d_cell4_fw = SwitchableDropoutWrapper( cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob) d_cell4_bw = SwitchableDropoutWrapper( cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob) x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2) # [N, M] q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1) # [N] with tf.variable_scope("prepro"): (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn( d_cell_fw, d_cell_bw, qq, q_len, dtype='float', scope='u1') # [N, J, d], [N, d] u = tf.concat(axis=2, values=[fw_u, bw_u]) if config.share_lstm_weights: tf.get_variable_scope().reuse_variables() (fw_h, bw_h), ((_, fw_h_f), (_, bw_h_f)) = bidirectional_dynamic_rnn( cell_fw, cell_bw, xx, x_len, dtype='float', scope='u1') # [N, M, JX, 2d] h = tf.concat(axis=3, values=[fw_h, bw_h]) # [N, M, JX, 2d] else: (fw_h, bw_h), ((_, fw_h_f), (_, bw_h_f)) = bidirectional_dynamic_rnn( cell_fw, cell_bw, xx, x_len, dtype='float', scope='h1') # [N, M, JX, 2d] h = tf.concat(axis=3, values=[fw_h, bw_h]) # [N, M, JX, 2d] self.tensor_dict['u'] = u self.tensor_dict['h'] = h with tf.variable_scope("main"): if config.dynamic_att: p0 = h u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d]) q_mask = tf.reshape( tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ]) first_cell_fw = AttentionCell( cell2_fw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) first_cell_bw = AttentionCell( cell2_bw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) second_cell_fw = AttentionCell( cell3_fw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) second_cell_bw = AttentionCell( cell3_bw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train) else: p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict) first_cell_fw = d_cell2_fw second_cell_fw = d_cell3_fw first_cell_bw = d_cell2_bw second_cell_bw = d_cell3_bw (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn( first_cell_fw, first_cell_bw, p0, x_len, dtype='float', scope='g0') # [N, M, JX, 2d] g0 = tf.concat(axis=3, values=[fw_g0, bw_g0]) (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn( second_cell_fw, second_cell_bw, g0, x_len, dtype='float', scope='g1') # [N, M, JX, 2d] g1 = tf.concat(axis=3, values=[fw_g1, bw_g1]) logits = get_logits([g1, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits1') a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]), tf.reshape(logits, [N, M * JX])) a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1]) (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn( d_cell4_fw, d_cell4_bw, tf.concat(axis=3, values=[p0, g1, a1i, g1 * a1i]), x_len, dtype='float', scope='g2') # [N, M, JX, 2d] g2 = tf.concat(axis=3, values=[fw_g2, bw_g2]) logits2 = get_logits([g2, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits2') flat_logits = tf.reshape(logits, [-1, M * JX]) flat_yp = tf.nn.softmax(flat_logits) # [-1, M*JX] flat_logits2 = tf.reshape(logits2, [-1, M * 
JX]) flat_yp2 = tf.nn.softmax(flat_logits2) if config.na: na_bias = tf.get_variable("na_bias", shape=[], dtype='float') na_bias_tiled = tf.tile(tf.reshape(na_bias, [1, 1]), [N, 1]) # [N, 1] concat_flat_logits = tf.concat( axis=1, values=[na_bias_tiled, flat_logits]) concat_flat_yp = tf.nn.softmax(concat_flat_logits) na_prob = tf.squeeze(tf.slice(concat_flat_yp, [0, 0], [-1, 1]), [1]) flat_yp = tf.slice(concat_flat_yp, [0, 1], [-1, -1]) concat_flat_logits2 = tf.concat( axis=1, values=[na_bias_tiled, flat_logits2]) concat_flat_yp2 = tf.nn.softmax(concat_flat_logits2) na_prob2 = tf.squeeze( tf.slice(concat_flat_yp2, [0, 0], [-1, 1]), [1]) # [N] flat_yp2 = tf.slice(concat_flat_yp2, [0, 1], [-1, -1]) self.concat_logits = concat_flat_logits self.concat_logits2 = concat_flat_logits2 self.na_prob = na_prob * na_prob2 yp = tf.reshape(flat_yp, [-1, M, JX]) yp2 = tf.reshape(flat_yp2, [-1, M, JX]) wyp = tf.nn.sigmoid(logits2) self.tensor_dict['g1'] = g1 self.tensor_dict['g2'] = g2 self.logits = flat_logits self.logits2 = flat_logits2 self.yp = yp self.yp2 = yp2 self.wyp = wyp with tf.variable_scope("q_gen"): # Question Generation Using (Paragraph & Predicted Ans Pos) NM = config.max_num_sents * config.batch_size # Separated encoder #ss = tf.reshape(xx, (-1, JX, dw+dco)) q_worthy = tf.reduce_sum( tf.to_int32(self.y), axis=2 ) # so we get probability distribution of answer-likely. (N, M) q_worthy = tf.expand_dims(tf.to_int32(tf.argmax(q_worthy, axis=1)), axis=1) # (N) -> (N, 1) q_worthy = tf.concat([ tf.expand_dims(tf.range(0, N, dtype=tf.int32), axis=1), q_worthy ], axis=1) # example : [0, 9], [1, 11], [2, 8], [3, 5], [4, 0], [5, 1] ... ss = tf.gather_nd(xx, q_worthy) syp = tf.expand_dims(tf.gather_nd(yp, q_worthy), axis=-1) syp2 = tf.expand_dims(tf.gather_nd(yp2, q_worthy), axis=-1) ss_with_ans = tf.concat([ss, syp, syp2], axis=2) qg_dim = 600 cell_fw, cell_bw = rnn.DropoutWrapper(rnn.GRUCell(qg_dim), input_keep_prob=config.input_keep_prob), \ rnn.DropoutWrapper(rnn.GRUCell(qg_dim), input_keep_prob=config.input_keep_prob) s_outputs, s_states = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, ss_with_ans, dtype=tf.float32) s_outputs = tf.concat(s_outputs, axis=2) s_states = tf.concat(s_states, axis=1) start_tokens = tf.zeros([N], dtype=tf.int32) self.inp_q_with_GO = tf.concat( [tf.expand_dims(start_tokens, axis=1), self.q], axis=1) # supervise if mode is train if config.mode == "train": emb_q = tf.nn.embedding_lookup(params=word_emb_mat, ids=self.inp_q_with_GO) #emb_q = tf.reshape(tf.tile(tf.expand_dims(emb_q, axis=1), [1, M, 1, 1]), (NM, JQ+1, dw)) train_helper = seq2seq.TrainingHelper(emb_q, [JQ] * N) else: s_outputs = seq2seq.tile_batch(s_outputs, multiplier=beam_width) s_states = seq2seq.tile_batch(s_states, multiplier=beam_width) cell = rnn.DropoutWrapper(rnn.GRUCell(num_units=qg_dim * 2), input_keep_prob=config.input_keep_prob) attention_mechanism = seq2seq.BahdanauAttention(num_units=qg_dim * 2, memory=s_outputs) attn_cell = seq2seq.AttentionWrapper(cell, attention_mechanism, attention_layer_size=qg_dim * 2, output_attention=True, alignment_history=False) total_glove_vocab_size = 78878 #72686 out_cell = rnn.OutputProjectionWrapper(attn_cell, VW + total_glove_vocab_size) if config.mode == "train": decoder_initial_states = out_cell.zero_state( batch_size=N, dtype=tf.float32).clone(cell_state=s_states) decoder = seq2seq.BasicDecoder( cell=out_cell, helper=train_helper, initial_state=decoder_initial_states) else: decoder_initial_states = out_cell.zero_state( batch_size=N * beam_width, 
dtype=tf.float32).clone(cell_state=s_states) decoder = seq2seq.BeamSearchDecoder( cell=out_cell, embedding=word_emb_mat, start_tokens=start_tokens, end_token=EOS_TOKEN, initial_state=decoder_initial_states, beam_width=beam_width, length_penalty_weight=0.0) outputs = seq2seq.dynamic_decode(decoder=decoder, maximum_iterations=JQ) if config.mode == "train": gen_q = outputs[0].sample_id gen_q_prob = outputs[0].rnn_output gen_q_states = outputs[1] else: gen_q = outputs[0].predicted_ids[:, :, 0] gen_q_prob = tf.nn.embedding_lookup( params=word_emb_mat, ids=outputs[0].predicted_ids[:, :, 0]) gen_q_states = outputs[1] self.gen_q = gen_q self.gen_q_prob = gen_q_prob self.gen_q_states = gen_q_states
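# The question-generation branch above stacks BahdanauAttention, AttentionWrapper and
# OutputProjectionWrapper around a GRU cell. A condensed, self-contained sketch of that
# stack; the sizes and tensors are illustrative assumptions, not the model's real values.
import tensorflow as tf
from tensorflow.contrib import rnn, seq2seq

num_units, vocab_size = 1200, 80000              # assumed sizes
memory = tf.zeros([16, 40, num_units])           # encoder outputs [N, T, dim]
memory_lengths = tf.fill([16], 40)

attention = seq2seq.BahdanauAttention(num_units=num_units, memory=memory,
                                      memory_sequence_length=memory_lengths)
cell = seq2seq.AttentionWrapper(rnn.GRUCell(num_units), attention,
                                attention_layer_size=num_units,
                                output_attention=True)
cell = rnn.OutputProjectionWrapper(cell, vocab_size)

# zero_state(...).clone(cell_state=...) is how the encoder's final state is injected
# while keeping the attention bookkeeping fields of the wrapper state intact.
initial_state = cell.zero_state(batch_size=16, dtype=tf.float32)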
def sample(self, n, max_length=None, z=None, temperature=None, start_inputs=None, beam_width=None, end_token=None): """Overrides BaseLstmDecoder `sample` method to add optional beam search. Args: n: Scalar number of samples to return. max_length: (Optional) Scalar maximum sample length to return. Required if data representation does not include end tokens. z: (Optional) Latent vectors to sample from. Required if model is conditional. Sized `[n, z_size]`. temperature: (Optional) The softmax temperature to use when not doing beam search. Defaults to 1.0. Ignored when `beam_width` is provided. start_inputs: (Optional) Initial inputs to use for batch. Sized `[n, output_depth]`. beam_width: (Optional) Width of beam to use for beam search. Beam search is disabled if not provided. end_token: (Optional) Scalar token signaling the end of the sequence to use for early stopping. Returns: samples: Sampled sequences. Sized `[n, max_length, output_depth]`. final_state: The final states of the decoder. Raises: ValueError: If `z` is provided and its first dimension does not equal `n`. """ if beam_width is None: end_fn = (None if end_token is None else lambda x: tf.equal(tf.argmax(x, axis=-1), end_token)) return super(CategoricalLstmDecoder, self).sample(n, max_length, z, temperature, start_inputs, end_fn) # If `end_token` is not given, use an impossible value. end_token = self._output_depth if end_token is None else end_token if z is not None and z.shape[0].value != n: raise ValueError( '`z` must have a first dimension that equals `n` when given. ' 'Got: %d vs %d' % (z.shape[0].value, n)) if temperature is not None: tf.logging.warning( '`temperature` is ignored when using beam search.') # Use a dummy Z in unconditional case. z = tf.zeros((n, 0), tf.float32) if z is None else z # If not given, start with dummy `-1` token and replace with zero vectors in # `embedding_fn`. start_tokens = (tf.argmax(start_inputs, axis=-1, output_type=tf.int32) if start_inputs is not None else -1 * tf.ones([n], dtype=tf.int32)) initial_state = initial_cell_state_from_embedding( self._dec_cell, z, name='decoder/z_to_initial_state') beam_initial_state = seq2seq.tile_batch(initial_state, multiplier=beam_width) # Tile `z` across beams. beam_z = tf.tile(tf.expand_dims(z, 1), [1, beam_width, 1]) def embedding_fn(tokens): # If tokens are the start_tokens (negative), replace with zero vectors. next_inputs = tf.cond( tf.less(tokens[0, 0], 0), lambda: tf.zeros([n, beam_width, self._output_depth]), lambda: tf.one_hot(tokens, self._output_depth)) # Concatenate `z` to next inputs. next_inputs = tf.concat([next_inputs, beam_z], axis=-1) return next_inputs decoder = seq2seq.BeamSearchDecoder(self._dec_cell, embedding_fn, start_tokens, end_token, beam_initial_state, beam_width, output_layer=self._output_layer, length_penalty_weight=0.0) final_output, final_state, _ = seq2seq.dynamic_decode( decoder, maximum_iterations=max_length, swap_memory=True, scope='decoder') # Returns samples and final states from the best beams. return (tf.one_hot(final_output.predicted_ids[:, :, 0], self._output_depth), nest.map_structure(lambda x: x[:, 0], final_state.cell_state))
def model_fn(features, labels, mode, params): embedding_encoder = tf.get_variable('embedding_encoder', shape=(params.vocab_size, params.emb_size)) table = lookup_ops.index_to_string_table_from_file(params.word_vocab_file) question_emb = tf.nn.embedding_lookup(embedding_encoder, features['question_words']) passage_emb = tf.nn.embedding_lookup(embedding_encoder, features['passage_words']) question_words_length = features['question_length'] passage_words_length = features['passage_length'] answer_start, answer_end = features['answer_start'], features['answer_end'] answer_start = tf.concat([tf.expand_dims(answer_start, -1)] * 50, -1) answer_end = tf.concat([tf.expand_dims(answer_end, -1)] * 50, -1) with tf.variable_scope('passage_encoding'): passage_enc, (_, passage_bw_state) = biGRU(tf.concat( [passage_emb, answer_start, answer_end], -1), passage_words_length, params, layers=params.layers) with tf.variable_scope('question_encoding'): question_enc, (_, question_bw_state) = biGRU(question_emb, question_words_length, params, layers=params.layers) # output_enc = masked_concat(question_enc, passage_enc, question_words_length, passage_words_length) decoder_state_layer = Dense(params.units, activation=tf.tanh, use_bias=True, name='decoder_state_init') decoder_init_state = tuple( decoder_state_layer( tf.concat([passage_bw_state[i], question_bw_state[i]], -1)) for i in range(params.layers)) question_att = BahdanauAttention( params.units, question_enc, memory_sequence_length=question_words_length) passage_att = BahdanauAttention( params.units, passage_enc, memory_sequence_length=passage_words_length) decoder_cell = AttentionWrapper(MultiRNNCell( [GRUCell(params.units) for _ in range(params.layers)]), [question_att, passage_att], initial_cell_state=decoder_init_state) batch_size = params.batch_size # if mode != tf.estimator.ModeKeys.PREDICT else 1 if mode == tf.estimator.ModeKeys.TRAIN: answer_emb = tf.nn.embedding_lookup(embedding_encoder, features['answer_words']) helper = TrainingHelper(answer_emb, features['answer_length']) else: helper = GreedyEmbeddingHelper( embedding_encoder, tf.fill([batch_size], params.tgt_sos_id), params.tgt_eos_id) projection_layer = Dense(params.vocab_size, use_bias=False) decoder = SNetDecoder(decoder_cell, helper, decoder_cell.zero_state(batch_size, tf.float32), output_layer=projection_layer, params=params) outputs, _, outputs_length = dynamic_decode( decoder, maximum_iterations=params.answer_max_words) logits = outputs.rnn_output if mode == tf.estimator.ModeKeys.PREDICT: predictions = { 'answer': table.lookup(tf.cast(outputs.sample_id, tf.int64)) } export_outputs = { 'prediction': tf.estimator.export.PredictOutput(predictions) } return tf.estimator.EstimatorSpec(mode, predictions=predictions, export_outputs=export_outputs) # logits = tf.Print(logits, [outputs.sample_id, labels], summarize=1000) labels = tf.stop_gradient(labels[:, :tf.reduce_max(outputs_length)]) crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits) target_weights = tf.sequence_mask(outputs_length, dtype=logits.dtype) loss = tf.reduce_sum(crossent * target_weights) / params.batch_size if mode == tf.estimator.ModeKeys.TRAIN: optimizer = tf.train.AdadeltaOptimizer(learning_rate=1) global_step = tf.train.get_or_create_global_step() grads = optimizer.compute_gradients(loss) gradients, variables = zip(*grads) capped_grads, _ = tf.clip_by_global_norm(gradients, params.grad_clip) train_op = optimizer.apply_gradients(zip(capped_grads, variables), global_step=global_step) return 
EstimatorSpec( mode, loss=loss, train_op=train_op, ) if mode == tf.estimator.ModeKeys.EVAL: return EstimatorSpec(mode, loss=loss, eval_metric_ops={ 'rouge-l': rouge_l(outputs.sample_id, labels, outputs_length, features['answer_length'], params, table), })
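# The model_fn above follows the standard three-mode Estimator contract: predictions for
# PREDICT, a train_op for TRAIN, metrics for EVAL. A stripped-down sketch of that
# contract with a toy one-layer model; names and features are illustrative only.
import tensorflow as tf

def toy_model_fn(features, labels, mode, params):
    logits = tf.layers.dense(tf.to_float(features['x']), params['num_classes'])
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {'class': tf.argmax(logits, axis=-1)}
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = tf.train.AdadeltaOptimizer(1.0).minimize(
            loss, global_step=tf.train.get_or_create_global_step())
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
    # EVAL: labels are assumed to be int32 class ids here.
    metrics = {'accuracy': tf.metrics.accuracy(
        labels, tf.argmax(logits, axis=-1, output_type=tf.int32))}
    return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)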
def build_decoder(self, encoder_outputs, encoder_state): """Builds the decoder.""" with tf.variable_scope('decoder') as decoder_scope: (self.decoder_cell, self.decoder_initial_state) = self.build_decoder_cell( encoder_outputs, encoder_state) # Decoder embedding with tf.device(_get_embed_device(self.target_vocab_size)): if self.share_embedding: self.decoder_embeddings = self.encoder_embeddings elif self.pretrained_embedding: self.decoder_embeddings = tf.Variable(tf.constant( 0.0, shape=(self.target_vocab_size, self.embedding_size)), trainable=True, name='embeddings') self.decoder_embeddings_placeholder = tf.placeholder( tf.float32, (self.target_vocab_size, self.embedding_size)) self.decoder_embeddings_init = self.decoder_embeddings.assign( self.decoder_embeddings_placeholder) else: self.decoder_embeddings = tf.get_variable( name='embeddings', shape=(self.target_vocab_size, self.embedding_size), initializer=self.initializer, dtype=tf.float32) self.decoder_output_projection = layers.Dense( self.target_vocab_size, dtype=tf.float32, use_bias=False, name='decoder_output_projection') # Training -- train if self.mode == 'train': self.decoder_inputs_embedded = tf.nn.embedding_lookup( params=self.decoder_embeddings, ids=self.decoder_inputs_train) inputs = self.decoder_inputs_embedded if self.time_major: inputs = tf.transpose(inputs, (1, 0, 2)) training_helper = seq2seq.TrainingHelper( inputs=inputs, sequence_length=self.decoder_inputs_length, time_major=self.time_major, name='training_helper') # During training, do not apply output_layer here, # because projecting at every time_step is slow. # Note: for this trick to work, the scope argument of dynamic_decode must be set training_decoder = seq2seq.BasicDecoder( cell=self.decoder_cell, helper=training_helper, initial_state=self.decoder_initial_state, ) # Maximum decoder time_steps in current batch max_decoder_length = tf.reduce_max(self.decoder_inputs_length) ( outputs, self.final_state, # contain attention _ # self.final_sequence_lengths ) = seq2seq.dynamic_decode( decoder=training_decoder, output_time_major=self.time_major, impute_finished=True, maximum_iterations=max_decoder_length, parallel_iterations=self.parallel_iterations, swap_memory=True, scope=decoder_scope) self.decoder_logits_train = self.decoder_output_projection( outputs.rnn_output) # masks: masking for valid and padded time steps, # [batch_size, max_time_step + 1] self.masks = tf.sequence_mask( lengths=self.decoder_inputs_length, maxlen=max_decoder_length, dtype=tf.float32, name='masks') decoder_logits_train = self.decoder_logits_train if self.time_major: decoder_logits_train = tf.transpose( decoder_logits_train, (1, 0, 2)) self.decoder_pred_train = tf.argmax(decoder_logits_train, axis=-1, name='decoder_pred_train') # The variables below are used for special training setups # Custom rewards; in practice this just modifies the masks # train_entropy = cross entropy self.train_entropy = \ tf.nn.sparse_softmax_cross_entropy_with_logits( labels=self.decoder_inputs, logits=decoder_logits_train) self.masks_rewards = self.masks * self.rewards self.loss_rewards = seq2seq.sequence_loss( logits=decoder_logits_train, targets=self.decoder_inputs, weights=self.masks_rewards, average_across_timesteps=True, average_across_batch=True, ) self.loss = seq2seq.sequence_loss( logits=decoder_logits_train, targets=self.decoder_inputs, weights=self.masks, average_across_timesteps=True, average_across_batch=True, ) self.loss_add = self.loss + self.add_loss elif self.mode == 'decode': # Prediction mode, not training start_tokens = tf.tile([WordSequence.START], [self.batch_size]) end_token = WordSequence.END def embed_and_input_proj(inputs): """Wrapper for the input projection layer.""" return
tf.nn.embedding_lookup(self.decoder_embeddings, inputs) if not self.use_beamsearch_decode: # Helper to feed inputs for greedy decoding: # uses the argmax of the output decoding_helper = seq2seq.GreedyEmbeddingHelper( start_tokens=start_tokens, end_token=end_token, embedding=embed_and_input_proj) # Basic decoder performs greedy decoding at each time step # print("building greedy decoder..") inference_decoder = seq2seq.BasicDecoder( cell=self.decoder_cell, helper=decoding_helper, initial_state=self.decoder_initial_state, output_layer=self.decoder_output_projection) else: # Beamsearch is used to approximately # find the most likely translation # print("building beamsearch decoder..") inference_decoder = BeamSearchDecoder( cell=self.decoder_cell, embedding=embed_and_input_proj, start_tokens=start_tokens, end_token=end_token, initial_state=self.decoder_initial_state, beam_width=self.beam_width, output_layer=self.decoder_output_projection, ) if self.max_decode_step is not None: max_decode_step = self.max_decode_step else: # By default, decode up to 4x the input length max_decode_step = tf.round( tf.reduce_max(self.encoder_inputs_length) * 4) ( self.decoder_outputs_decode, self.final_state, _ # self.decoder_outputs_length_decode ) = ( seq2seq.dynamic_decode( decoder=inference_decoder, output_time_major=self.time_major, # impute_finished=True, # error occurs maximum_iterations=max_decode_step, parallel_iterations=self.parallel_iterations, swap_memory=True, scope=decoder_scope)) if not self.use_beamsearch_decode: dod = self.decoder_outputs_decode self.decoder_pred_decode = dod.sample_id if self.time_major: self.decoder_pred_decode = tf.transpose( self.decoder_pred_decode, (1, 0)) else: self.decoder_pred_decode = \ self.decoder_outputs_decode.predicted_ids if self.time_major: self.decoder_pred_decode = tf.transpose( self.decoder_pred_decode, (1, 0, 2)) self.decoder_pred_decode = tf.transpose( self.decoder_pred_decode, perm=[0, 2, 1]) dod = self.decoder_outputs_decode self.beam_prob = dod.beam_search_decoder_output.scores
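# When beam search is disabled, the decode branch above falls back to a greedy decoder
# capped at four times the encoder length. A compact sketch of that fallback; every
# identifier below is a placeholder, not the class's real attribute.
import tensorflow as tf
from tensorflow.contrib import seq2seq

def greedy_decode(cell, initial_state, embeddings, batch_size,
                  start_id, end_id, encoder_lengths, output_layer=None):
    helper = seq2seq.GreedyEmbeddingHelper(
        embedding=embeddings,
        start_tokens=tf.fill([batch_size], start_id),
        end_token=end_id)
    decoder = seq2seq.BasicDecoder(cell, helper, initial_state, output_layer)
    # Decode for at most 4x the longest source sequence, as in the snippet above.
    max_steps = tf.to_int32(tf.reduce_max(encoder_lengths) * 4)
    outputs, _, _ = seq2seq.dynamic_decode(
        decoder, maximum_iterations=max_steps, swap_memory=True)
    return outputs.sample_id        # [batch, time] int32 token ids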
def construct(self): self.saved_session_name = os.path.join(self.tmp_folder, self.uuid_code) self.input_data = tf.placeholder(tf.float32, [None, None, self.input_dim]) self.output_data = tf.placeholder(tf.float32, [None, None, self.output_dim]) self.start_tokens = tf.placeholder(tf.float32, [None, self.output_dim]) self.go_tokens = tf.placeholder(tf.float32, [None, 1, self.output_dim]) self.sequence_length = tf.placeholder(tf.int32, [None]) self.mask = tf.placeholder(tf.float32, [None, None]) self.target_sequence_length = tf.placeholder( tf.int32, (None, ), name='target_sequence_length') self.max_target_sequence_length = tf.reduce_max( self.target_sequence_length, name='max_target_len') self.source_sequence_length = tf.placeholder( tf.int32, (None, ), name='source_sequence_length') self.x_stopping = np.full((self.stop_pad_length, self.input_dim), self.stop_pad_token, dtype=np.float32) self.y_stopping = np.full((self.stop_pad_length, self.output_dim), self.stop_pad_token, dtype=np.float32) self.learning_rate = tf.placeholder(tf.float32) self.batch_size = tf.placeholder(tf.float32) enc_cell = make_cell(self.layer_sizes, self.keep_prob) # We want to train the decoder to learn the stopping point as well, # so the sequence lengths is extended for both the decoder and the encoder # logic: the encoder will learn that the stopping token is the signal that the input is finished # the decoder will learn to produce the stopping token to match the expected output # the inferer will learn to produce the stopping token for us to recognise that and stop inferring self.source_sequence_length_padded = self.source_sequence_length + self.stop_pad_length self.target_sequence_length_padded = self.target_sequence_length + self.stop_pad_length max_target_sequence_length_padded = self.max_target_sequence_length + self.stop_pad_length _, self.enc_state = dynamic_rnn( enc_cell, self.input_data, sequence_length=self.source_sequence_length_padded, dtype=tf.float32, time_major=False, swap_memory=True) self.enc_state_centre = self.enc_state[-1] if self.symmetric: self.enc_state = self.enc_state[::-1] dec_cell = make_cell(self.layer_sizes[::-1], self.keep_prob) else: dec_cell = make_cell(self.layer_sizes, self.keep_prob) # 3. Dense layer to translate the decoder's output at each time # step into a choice from the target vocabulary projection_layer = tf.layers.Dense( units=self.output_dim, # kernel_initializer=tf.initializers.he_normal(), # kernel_regularizer=regularizer, kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1)) # 4. Set up a training decoder and an inference decoder # Training Decoder with tf.variable_scope("decode"): # During PREDICT mode, the output data is none so we can't have a training model. # Helper for the training process. Used by BasicDecoder to read inputs. dec_input = tf.concat([self.go_tokens, self.output_data], 1) training_helper = TrainingHelper( inputs=dec_input, sequence_length=self.target_sequence_length_padded, time_major=False) # Basic decoder training_decoder = BasicDecoder(dec_cell, training_helper, self.enc_state, projection_layer) # Perform dynamic decoding using the decoder self.training_decoder_output\ = dynamic_decode(training_decoder, # True because we're using variable length sequences, which have finish points impute_finished=True, maximum_iterations=max_target_sequence_length_padded)[0] # 5. 
Inference Decoder # Reuses the same parameters trained by the training process with tf.variable_scope("decode", reuse=True): def end_fn(time_step_value): # Ideally, the inferer should produce the stopping token, # which can be checked for equality with the modelled stop token, and that result should be returned: # return tf.reduce_all(tf.equal(time_step_value, self.y_stopping)) # However, due to the nature of training, the produced stop token will never be exactly the same # as the modelled one. If we used an embedding layer, this stop token could be learned; # however, as we are not using an embedding layer, this function should return False, # meaning there is no early stopping return False inference_helper = InferenceHelper(sample_fn=lambda x: x, sample_shape=[self.output_dim], sample_dtype=dtypes.float32, start_inputs=self.start_tokens, end_fn=end_fn) # Basic decoder inference_decoder = BasicDecoder(dec_cell, inference_helper, self.enc_state, projection_layer) # Perform dynamic decoding using the decoder self.inference_decoder_output = dynamic_decode( inference_decoder, # True because we're using variable length sequences, which have finish points impute_finished=True, maximum_iterations=max_target_sequence_length_padded)[0]
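# Because the decoder above emits continuous frames rather than token ids, it uses
# InferenceHelper with an identity sample_fn and an end_fn that never fires. A
# self-contained sketch of that helper; the dimensions are assumptions.
import tensorflow as tf
from tensorflow.python.framework import dtypes
from tensorflow.contrib import seq2seq

output_dim, batch_size = 60, 8                       # assumed dimensions
start_frames = tf.zeros([batch_size, output_dim])    # GO frames

inference_helper = seq2seq.InferenceHelper(
    sample_fn=lambda outputs: outputs,               # continuous outputs: identity
    sample_shape=[output_dim],
    sample_dtype=dtypes.float32,
    start_inputs=start_frames,
    # No learned stop token, so never finish early; rely on maximum_iterations.
    end_fn=lambda sample: tf.zeros([tf.shape(sample)[0]], dtype=tf.bool))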
def build_decoder(self): print("building decoder and attention..") with tf.variable_scope('decoder'): # Building decoder_cell and decoder_initial_state self.decoder_cell, self.decoder_initial_state = self.build_decoder_cell() # Initialize decoder embeddings to have variance=1. sqrt3 = math.sqrt(3) # Uniform(-sqrt(3), sqrt(3)) has variance=1. initializer = tf.random_uniform_initializer(-sqrt3, sqrt3, dtype=self.dtype) self.decoder_embeddings = tf.get_variable(name='embedding', shape=[self.num_decoder_symbols, self.embedding_size], initializer=initializer, dtype=self.dtype) # Input projection layer to feed embedded inputs to the cell # ** Essential when use_residual=True to match input/output dims input_layer = Dense(self.hidden_units, dtype=self.dtype, name='input_projection') # Output projection layer to convert cell_outputs to logits output_layer = Dense(self.num_decoder_symbols, name='output_projection') if self.mode == 'train': # decoder_inputs_embedded: [batch_size, max_time_step + 1, embedding_size] self.decoder_inputs_embedded = tf.nn.embedding_lookup( params=self.decoder_embeddings, ids=self.decoder_inputs_train) # Embedded inputs having gone through input projection layer self.decoder_inputs_embedded = input_layer(self.decoder_inputs_embedded) # Helper to feed inputs for training: read inputs from dense ground truth vectors training_helper = seq2seq.TrainingHelper(inputs=self.decoder_inputs_embedded, sequence_length=self.decoder_inputs_length_train, time_major=False, name='training_helper') training_decoder = seq2seq.BasicDecoder(cell=self.decoder_cell, helper=training_helper, initial_state=self.decoder_initial_state, output_layer=output_layer) #output_layer=None) # Maximum decoder time_steps in current batch max_decoder_length = tf.reduce_max(self.decoder_inputs_length_train) # decoder_outputs_train: BasicDecoderOutput # namedtuple(rnn_outputs, sample_id) # decoder_outputs_train.rnn_output: [batch_size, max_time_step + 1, num_decoder_symbols] if output_time_major=False # [max_time_step + 1, batch_size, num_decoder_symbols] if output_time_major=True # decoder_outputs_train.sample_id: [batch_size], tf.int32 (self.decoder_outputs_train, self.decoder_last_state_train, self.decoder_outputs_length_train) = (seq2seq.dynamic_decode( decoder=training_decoder, output_time_major=False, impute_finished=True, maximum_iterations=max_decoder_length)) # More efficient to do the projection on the batch-time-concatenated tensor # logits_train: [batch_size, max_time_step + 1, num_decoder_symbols] # self.decoder_logits_train = output_layer(self.decoder_outputs_train.rnn_output) self.decoder_logits_train = tf.identity(self.decoder_outputs_train.rnn_output) # Use argmax to extract decoder symbols to emit self.decoder_pred_train = tf.argmax(self.decoder_logits_train, axis=-1, name='decoder_pred_train') # masks: masking for valid and padded time steps, [batch_size, max_time_step + 1] masks = tf.sequence_mask(lengths=self.decoder_inputs_length_train, maxlen=max_decoder_length, dtype=self.dtype, name='masks') # Computes per word average cross-entropy over a batch # Internally calls 'nn_ops.sparse_softmax_cross_entropy_with_logits' by default self.loss = seq2seq.sequence_loss(logits=self.decoder_logits_train, targets=self.decoder_targets_train, weights=masks, average_across_timesteps=True, average_across_batch=True,) # Training summary for the current batch_loss tf.summary.scalar('loss', self.loss) # Contruct graphs for minimizing loss self.init_optimizer() elif self.mode == 'decode': # Start_tokens: 
[batch_size,] `int32` vector start_tokens = tf.ones([self.batch_size,], tf.int32) * data_utils.start_token end_token = data_utils.end_token def embed_and_input_proj(inputs): return input_layer(tf.nn.embedding_lookup(self.decoder_embeddings, inputs)) if not self.use_beamsearch_decode: # Helper to feed inputs for greedy decoding: uses the argmax of the output decoding_helper = seq2seq.GreedyEmbeddingHelper(start_tokens=start_tokens, end_token=end_token, embedding=embed_and_input_proj) # Basic decoder performs greedy decoding at each time step print("building greedy decoder..") inference_decoder = seq2seq.BasicDecoder(cell=self.decoder_cell, helper=decoding_helper, initial_state=self.decoder_initial_state, output_layer=output_layer) else: # Beamsearch is used to approximately find the most likely translation print("building beamsearch decoder..") inference_decoder = beam_search_decoder.BeamSearchDecoder(cell=self.decoder_cell, embedding=embed_and_input_proj, start_tokens=start_tokens, end_token=end_token, initial_state=self.decoder_initial_state, beam_width=self.beam_width, output_layer=output_layer,) # For GreedyDecoder, return # decoder_outputs_decode: BasicDecoderOutput instance # namedtuple(rnn_outputs, sample_id) # decoder_outputs_decode.rnn_output: [batch_size, max_time_step, num_decoder_symbols] if output_time_major=False # [max_time_step, batch_size, num_decoder_symbols] if output_time_major=True # decoder_outputs_decode.sample_id: [batch_size, max_time_step], tf.int32 if output_time_major=False # [max_time_step, batch_size], tf.int32 if output_time_major=True # For BeamSearchDecoder, return # decoder_outputs_decode: FinalBeamSearchDecoderOutput instance # namedtuple(predicted_ids, beam_search_decoder_output) # decoder_outputs_decode.predicted_ids: [batch_size, max_time_step, beam_width] if output_time_major=False # [max_time_step, batch_size, beam_width] if output_time_major=True # decoder_outputs_decode.beam_search_decoder_output: BeamSearchDecoderOutput instance # namedtuple(scores, predicted_ids, parent_ids) (self.decoder_outputs_decode, self.decoder_last_state_decode, self.decoder_outputs_length_decode) = (seq2seq.dynamic_decode( decoder=inference_decoder, output_time_major=False, #impute_finished=True, # error occurs maximum_iterations=self.max_decode_step)) if not self.use_beamsearch_decode: # decoder_outputs_decode.sample_id: [batch_size, max_time_step] # Or use argmax to find decoder symbols to emit: # self.decoder_pred_decode = tf.argmax(self.decoder_outputs_decode.rnn_output, # axis=-1, name='decoder_pred_decode') # Here, we use expand_dims to be compatible with the result of the beamsearch decoder # decoder_pred_decode: [batch_size, max_time_step, 1] (output_major=False) self.decoder_pred_decode = tf.expand_dims(self.decoder_outputs_decode.sample_id, -1) else: # Use beam search to approximately find the most likely translation # decoder_pred_decode: [batch_size, max_time_step, beam_width] (output_major=False) self.decoder_pred_decode = self.decoder_outputs_decode.predicted_ids
def _build( self, # pylint: disable=arguments-differ, too-many-statements decoding_strategy='train_greedy', inputs=None, memory=None, memory_sequence_length=None, memory_attention_bias=None, beam_width=None, length_penalty=0., start_tokens=None, end_token=None, context=None, context_sequence_length=None, softmax_temperature=None, max_decoding_length=None, impute_finished=False, embedding=None, helper=None, mode=None): """Performs decoding. The interface is mostly the same with that of RNN decoders (see :meth:`~texar.modules.RNNDecoderBase._build`). The main difference is that, here, `sequence_length` is not needed, and continuation generation is additionally supported. The function provides **3 ways** to specify the decoding method, with varying flexibility: 1. The :attr:`decoding_strategy` argument. - **"train_greedy"**: decoding in teacher-forcing fashion (i.e., feeding ground truth to decode the next step), and for each step sample is obtained by taking the `argmax` of logits. Argument :attr:`inputs` is required for this strategy. - **"infer_greedy"**: decoding in inference fashion (i.e., feeding `generated` sample to decode the next step), and for each step sample is obtained by taking the `argmax` of logits. Arguments :attr:`(start_tokens, end_token)` are required for this strategy, and argument :attr:`max_decoding_length` is optional. - **"infer_sample"**: decoding in inference fashion, and for each step sample is obtained by `random sampling` from the logits. Arguments :attr:`(start_tokens, end_token)` are required for this strategy, and argument :attr:`max_decoding_length` is optional. This argument is used only when arguments :attr:`helper` and :attr:`beam_width` are both `None`. 2. The :attr:`helper` argument: An instance of subclass of :class:`texar.modules.Helper`. This provides a superset of decoding strategies than above. The interface is the same as in RNN decoders. Please refer to :meth:`texar.modules.RNNDecoderBase._build` for detailed usage and examples. Note that, here, though using a :class:`~texar.modules.TrainingHelper` corresponds to the "train_greedy" strategy above and will get the same output results, the implementation is *slower* than directly setting `decoding_strategy="train_greedy"`. Argument :attr:`max_decoding_length` is optional. 3. **Beam search**: set :attr:`beam_width` to use beam search decoding. Arguments :attr:`(start_tokens, end_token)` are required, and argument :attr:`max_decoding_length` is optional. Args: memory (optional): The memory to attend, e.g., the output of an RNN encoder. A Tensor of shape `[batch_size, memory_max_time, dim]`. memory_sequence_length (optional): A Tensor of shape `[batch_size]` containing the sequence lengths for the batch entries in memory. Used to create attention bias of :attr:`memory_attention_bias` is not given. Ignored if `memory_attention_bias` is provided. memory_attention_bias (optional): A Tensor of shape `[batch_size, num_heads, memory_max_time, dim]`. An attention bias typically sets the value of a padding position to a large negative value for masking. If not given, :attr:`memory_sequence_length` is used to automatically create an attention bias. inputs (optional): Input tensor for teacher forcing decoding, of shape `[batch_size, target_max_time, emb_dim]` containing the target sequence word embeddings. Used when :attr:`decoding_strategy` is set to "train_greedy". decoding_strategy (str): A string specifying the decoding strategy, including "train_greedy", "infer_greedy", "infer_sample". 
Different arguments are required based on the strategy. See above for details. Ignored if :attr:`beam_width` or :attr:`helper` is set. beam_width (int): Set to use beam search. If given, :attr:`decoding_strategy` is ignored. length_penalty (float): Length penalty coefficient used in beam search decoding. Refer to https://arxiv.org/abs/1609.08144 for more details. It Should be larger if longer sentences are wanted. start_tokens (optional): An int Tensor of shape `[batch_size]`, containing the start tokens. Used when :attr:`decoding_strategy` = "infer_greedy" or "infer_sample", or :attr:`beam_width` is set. Ignored if :attr:`context` is given. end_token (optional): An int 0D Tensor, the token that marks end of decoding. Used when :attr:`decoding_strategy` = "infer_greedy" or "infer_sample", or :attr:`beam_width` is set. context (optional): An int Tensor of shape `[batch_size, length]`, containing the starting tokens for decoding. If context is set, :attr:`start_tokens` will be ignored. context_sequence_length (optional): specify the length of context. softmax_temperature (optional): A float 0D Tensor, value to divide the logits by before computing the softmax. Larger values (above 1.0) result in more random samples. Must > 0. If `None`, 1.0 is used. Used when :attr:`decoding_strategy` = "infer_sample"`. max_decoding_length (optional): An int scalar Tensor indicating the maximum allowed number of decoding steps. If `None` (default), use "max_decoding_length" defined in :attr:`hparams`. Ignored in "train_greedy" decoding. impute_finished (bool): If `True`, then states for batch entries which are marked as finished get copied through and the corresponding outputs get zeroed out. This causes some slowdown at each time step, but ensures that the final state and outputs have the correct values and that backprop ignores time steps that were marked as finished. Ignored in "train_greedy" decoding. embedding (optional): Embedding used when "infer_greedy" or "infer_sample" `decoding_strategy`, or beam search, is used. This can be a callable or the `params` argument for :tf_main:`embedding_lookup <nn/embedding_lookup>`. If a callable, it can take a vector tensor of token `ids`, or take two arguments (`ids`, `times`), where `ids` is a vector tensor of token ids, and `times` is a vector tensor of time steps (i.e., position ids). The latter case can be used when attr:`embedding` is a combination of word embedding and position embedding. helper (optional): An instance of :tf_main:`Helper <contrib/seq2seq/Helper>` that defines the decoding strategy. If given, :attr:`decoding_strategy` is ignored. mode (optional): A tensor taking value in :tf_main:`tf.estimator.ModeKeys <estimator/ModeKeys>`, including `TRAIN`, `EVAL`, and `PREDICT`. Controls dropout mode. If `None` (default), :func:`texar.global_mode` is used. Returns: - For **"train_greedy"** decoding, returns an instance of \ :class:`~texar.modules.TransformerDecoderOutput` which contains\ `sample_id` and `logits`. - For **"infer_greedy"** and **"infer_sample"** decoding or\ decoding with :attr:`helper`, returns\ a tuple `(outputs, sequence_lengths)`, where `outputs` is an \ instance of :class:`~texar.modules.TransformerDecoderOutput` as\ in "train_greedy", and `sequence_lengths` is a Tensor of shape\ `[batch_size]` containing the length of each sample. - For **beam search** decoding, returns a `dict` containing keys\ "sample_id" and "log_prob". 
- **"sample_id"** is an int Tensor of shape \ `[batch_size, max_time, beam_width]` containing generated\ token indexes. `sample_id[:,:,0]` is the highest-probable \ sample. - **"log_prob"** is a float Tensor of shape \ `[batch_size, beam_width]` containing the log probability \ of each sequence sample. """ if memory is not None: if memory_attention_bias is None: if memory_sequence_length is None: raise ValueError("`memory_sequence_length` is required if " "`memory_attention_bias` is not given.") enc_padding = 1 - tf.sequence_mask(memory_sequence_length, shape_list(memory)[1], dtype=tf.float32) memory_attention_bias = attn.attention_bias_ignore_padding( enc_padding) # context will be used in step function for dynamic_decode if context is not None: start_tokens = context[:, 0] self.context = context[:, 1:] self.context_sequence_length = context_sequence_length - 1 else: self.context = None self.embedding = embedding if helper is None and beam_width is None and \ decoding_strategy == 'train_greedy': # Teacher-forcing decoder_self_attention_bias = (attn.attention_bias_lower_triangle( shape_list(inputs)[1])) decoder_output = self._self_attention_stack( inputs, memory, decoder_self_attention_bias=decoder_self_attention_bias, memory_attention_bias=memory_attention_bias, cache=None, mode=mode) logits = self._output_layer(decoder_output) preds = tf.to_int32(tf.argmax(logits, axis=-1)) rets = TransformerDecoderOutput(logits=logits, sample_id=preds) else: if max_decoding_length is None: max_decoding_length = self._hparams.max_decoding_length self.max_decoding_length = max_decoding_length if beam_width is None: # Inference-like decoding # Prepare helper if helper is None: if decoding_strategy == "infer_greedy": helper = tx_helper.GreedyEmbeddingHelper( embedding, start_tokens, end_token) elif decoding_strategy == "infer_sample": helper = tx_helper.SampleEmbeddingHelper( embedding, start_tokens, end_token, softmax_temperature) else: raise ValueError( "Unknown decoding strategy: {}".format( decoding_strategy)) self._helper = helper self._cache = self._init_cache(memory, memory_attention_bias, beam_search_decoding=False) if context is not None: # To avoid out-of-range in `step` paddings = [[0, 0] for _ in range(get_rank(self.context))] paddings[1][1] = \ max_decoding_length - shape_list(self.context)[1] self.context = tf.pad(self.context, paddings=paddings) outputs, _, sequence_lengths = dynamic_decode( decoder=self, impute_finished=impute_finished, maximum_iterations=max_decoding_length, output_time_major=False, scope=self.variable_scope) if context is not None: # Here the length of sample_id will be larger than that # of logit by 1, because there will be a additional # start_token in the returned sample_id. 
# the start_id should be the first token of the # given context outputs = TransformerDecoderOutput( logits=outputs.logits, sample_id=tf.concat([ tf.expand_dims(start_tokens, 1), outputs.sample_id ], axis=1)) sequence_lengths = sequence_lengths + 1 rets = outputs, sequence_lengths else: # Beam-search decoding # Ignore `decoding_strategy`; Assume `helper` is not set if helper is not None: raise ValueError("Must not set 'beam_width' and 'helper' " "simultaneously.") _batch_size = shape_list(start_tokens)[0] self._cache = self._init_cache(memory, memory_attention_bias, beam_search_decoding=True, batch_size=_batch_size) # The output format is different when running beam search sample_id, log_prob = self._beam_decode( start_tokens, end_token, beam_width=beam_width, length_penalty=length_penalty, decode_length=max_decoding_length, ) rets = {'sample_id': sample_id, 'log_prob': log_prob} if not self._built: self._add_internal_trainable_variables() self._built = True return rets
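# A usage sketch based only on the call signature and return types documented in the
# docstring above; `decoder` is a built TransformerDecoder instance and all other names
# (target_embeds, encoder_outputs, src_lengths, embedding_fn, start_tokens, eos_id)
# are placeholders, not the library's own example.

# Teacher forcing ("train_greedy"): requires `inputs`, returns logits and sample_id.
train_outputs = decoder(
    decoding_strategy='train_greedy',
    inputs=target_embeds,                  # [batch, target_time, emb_dim]
    memory=encoder_outputs,                # [batch, memory_time, dim]
    memory_sequence_length=src_lengths)

# Beam search: requires (start_tokens, end_token, embedding); returns a dict.
beam_outputs = decoder(
    memory=encoder_outputs,
    memory_sequence_length=src_lengths,
    embedding=embedding_fn,
    start_tokens=start_tokens,
    end_token=eos_id,
    beam_width=5,
    max_decoding_length=100)
top_sample = beam_outputs['sample_id'][:, :, 0]    # highest-probability beam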
def build_decoder(self): print("building decoder and attention..") with tf.variable_scope('decoder'): self.decoder_cell, self.decoder_initial_state = self.build_decoder_cell() output_layer = Dense(self.num_symbols, name='output_projection') start_tokens = tf.ones([self.batch_size,], tf.int32) * data_utils.start_token end_token = data_utils.end_token helper = GumbelSoftmaxEmbeddingHelper(embedding=self.embeddings, start_tokens=start_tokens,end_token= end_token, tau=self.tau) max_decoder_length = tf.reduce_max(self.encoder_inputs_length) decoder = tf.contrib.seq2seq.BasicDecoder(cell=self.decoder_cell, helper=helper, initial_state=self.decoder_initial_state)#, output_layer=output_layer) (self.decoder_outputs_train, self.decoder_last_state_train, self.decoder_outputs_length_train) = (seq2seq.dynamic_decode(decoder=decoder, maximum_iterations=max_decoder_length,impute_finished=True)) self.decoder_logits_train = tf.identity(self.decoder_outputs_train.rnn_output)#IMPORTANT self.decoder_pred_decode = tf.argmax(self.decoder_outputs_train.sample_id, axis=-1, output_type=tf.int32)#IMPORTANT #newintput = data_utils.insertSequence(self.decoder_pred_decode.eval(), self.encoder_inputs.eval(),1, self.total_num) ''' _loss = 0 for i in range(self.detector.batch_size): source, source_len = data_utils.prepare_batch(newintput[i:i*self.detector.batch_size], self.detector.stride, self.detector.maxlen, self.detector.batch_size) _, logits = self.detector.predict(self.sess, source, source_len) _loss += logits[0] - logits[1] ''' self.accuracy = tf.reduce_mean(tf.cast(tf.equal(self.encoder_inputs, self.decoder_pred_decode), self.dtype)) masks = tf.sequence_mask(lengths=self.encoder_inputs_length, maxlen=max_decoder_length, dtype=self.dtype, name='masks') self.loss = seq2seq.sequence_loss(logits=self.decoder_logits_train, targets=self.encoder_inputs, weights=masks) #self.loss = _loss + np.sum(self.decoder_pred_decode**masks**2)/np.sum(masks)/2 tf.summary.scalar('loss', self.loss) self.init_optimizer()
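# The loss above is the usual masked sequence loss; a minimal standalone sketch of the
# pattern (shapes assumed), with the accuracy additionally restricted to unpadded steps.
import tensorflow as tf
from tensorflow.contrib import seq2seq

batch, time, vocab = 8, 20, 5000                         # assumed shapes
logits = tf.zeros([batch, time, vocab])
targets = tf.zeros([batch, time], dtype=tf.int32)
lengths = tf.fill([batch], 15)

# Padded steps get zero weight, so they contribute to neither loss nor accuracy.
masks = tf.sequence_mask(lengths, maxlen=time, dtype=tf.float32)
loss = seq2seq.sequence_loss(logits=logits, targets=targets, weights=masks)
preds = tf.argmax(logits, axis=-1, output_type=tf.int32)
correct = tf.to_float(tf.equal(preds, targets)) * masks
accuracy = tf.reduce_sum(correct) / tf.reduce_sum(masks)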
def _test_beam_search(self, decoder, initial_state=None, tiled_initial_state=None, tf_initial_state=None, beam_width_1=1, initiated=False): # Compare with tf built-in BeamSearchDecoder outputs, final_state, _ = beam_search_decode(decoder_or_cell=decoder, embedding=self._embedding, start_tokens=[1] * self._batch_size, end_token=2, beam_width=beam_width_1, max_decoding_length=20) self.assertIsInstance(outputs, tf.contrib.seq2seq.FinalBeamSearchDecoderOutput) self.assertIsInstance(final_state, tf.contrib.seq2seq.BeamSearchDecoderState) num_trainable_variables = len(tf.trainable_variables()) _ = decoder(decoding_strategy='infer_greedy', embedding=self._embedding, start_tokens=[1] * self._batch_size, end_token=2, max_decoding_length=20) self.assertEqual(num_trainable_variables, len(tf.trainable_variables())) if tf_initial_state is None: tf_initial_state = decoder.cell.zero_state( self._batch_size * beam_width_1, tf.float32) beam_decoder = BeamSearchDecoder(cell=decoder.cell, embedding=self._embedding, start_tokens=[1] * self._batch_size, end_token=2, initial_state=tf_initial_state, beam_width=beam_width_1, output_layer=decoder.output_layer) outputs_1, final_state_1, _ = dynamic_decode(decoder=beam_decoder, maximum_iterations=20) ## Tests time major outputs_2, _, _ = beam_search_decode( decoder_or_cell=decoder, embedding=self._embedding, start_tokens=[1] * self._batch_size, end_token=2, beam_width=self._beam_width, initial_state=initial_state, tiled_initial_state=tiled_initial_state, max_decoding_length=21) outputs_3, _, _ = beam_search_decode( decoder_or_cell=decoder, embedding=self._embedding, start_tokens=[1] * self._batch_size, end_token=2, beam_width=self._beam_width, initial_state=initial_state, tiled_initial_state=tiled_initial_state, max_decoding_length=21, output_time_major=True) with self.test_session() as sess: if not initiated: sess.run(tf.global_variables_initializer()) outputs_, final_state_, outputs_1_, final_state_1_ = sess.run( [outputs, final_state, outputs_1, final_state_1], feed_dict={ context.global_mode(): tf.estimator.ModeKeys.PREDICT }) np.testing.assert_array_equal(outputs_.predicted_ids, outputs_1_.predicted_ids) np.testing.assert_array_equal( outputs_.beam_search_decoder_output.scores, outputs_1_.beam_search_decoder_output.scores) np.testing.assert_array_equal( outputs_.beam_search_decoder_output.predicted_ids, outputs_1_.beam_search_decoder_output.predicted_ids) np.testing.assert_array_equal( outputs_.beam_search_decoder_output.parent_ids, outputs_1_.beam_search_decoder_output.parent_ids) np.testing.assert_array_equal(final_state_.log_probs, final_state_1_.log_probs) np.testing.assert_array_equal(final_state_.lengths, final_state_1_.lengths) outputs_2_, outputs_3_ = sess.run([outputs_2, outputs_3], feed_dict={ context.global_mode(): tf.estimator.ModeKeys.PREDICT }) self.assertEqual(outputs_2_.predicted_ids.shape, tuple([self._batch_size, 21, 11])) self.assertEqual(outputs_3_.predicted_ids.shape, tuple([21, self._batch_size, 11]))
def sample(self, n, max_length=None, z=None, temperature=None, start_inputs=None, beam_width=None, end_token=None): """Overrides BaseLstmDecoder `sample` method to add optional beam search. Args: n: Scalar number of samples to return. max_length: (Optional) Scalar maximum sample length to return. Required if data representation does not include end tokens. z: (Optional) Latent vectors to sample from. Required if model is conditional. Sized `[n, z_size]`. temperature: (Optional) The softmax temperature to use when not doing beam search. Defaults to 1.0. Ignored when `beam_width` is provided. start_inputs: (Optional) Initial inputs to use for batch. Sized `[n, output_depth]`. beam_width: (Optional) Width of beam to use for beam search. Beam search is disabled if not provided. end_token: (Optional) Scalar token signaling the end of the sequence to use for early stopping. Returns: samples: Sampled sequences. Sized `[n, max_length, output_depth]`. Raises: ValueError: If `z` is provided and its first dimension does not equal `n`. """ if beam_width is None: end_fn = (None if end_token is None else lambda x: tf.equal(tf.argmax(x, axis=-1), end_token)) return super(CategoricalLstmDecoder, self).sample( n, max_length, z, temperature, start_inputs, end_fn) # If `end_token` is not given, use an impossible value. end_token = self._output_depth if end_token is None else end_token if z is not None and z.shape[0].value != n: raise ValueError( '`z` must have a first dimension that equals `n` when given. ' 'Got: %d vs %d' % (z.shape[0].value, n)) if temperature is not None: tf.logging.warning('`temperature` is ignored when using beam search.') # Use a dummy Z in unconditional case. z = tf.zeros((n, 0), tf.float32) if z is None else z # If not given, start with dummy `-1` token and replace with zero vectors in # `embedding_fn`. start_tokens = ( tf.argmax(start_inputs, axis=-1, output_type=tf.int32) if start_inputs is not None else -1 * tf.ones([n], dtype=tf.int32)) initial_state = initial_cell_state_from_embedding( self._dec_cell, z, name='decoder/z_to_initial_state') beam_initial_state = seq2seq.tile_batch( initial_state, multiplier=beam_width) # Tile `z` across beams. beam_z = tf.tile(tf.expand_dims(z, 1), [1, beam_width, 1]) def embedding_fn(tokens): # If tokens are the start_tokens (negative), replace with zero vectors. next_inputs = tf.cond( tf.less(tokens[0, 0], 0), lambda: tf.zeros([n, beam_width, self._output_depth]), lambda: tf.one_hot(tokens, self._output_depth)) # Concatenate `z` to next inputs. next_inputs = tf.concat([next_inputs, beam_z], axis=-1) return next_inputs decoder = seq2seq.BeamSearchDecoder( self._dec_cell, embedding_fn, start_tokens, end_token, beam_initial_state, beam_width, output_layer=self._output_layer, length_penalty_weight=0.0) final_output, _, _ = seq2seq.dynamic_decode( decoder, maximum_iterations=max_length, swap_memory=True, scope='decoder') return tf.one_hot( final_output.predicted_ids[:, :, 0], self._output_depth)
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False, global_step=None, is_training=False, is_evaluating=False): """ Initializes the model for inference sets "mel_outputs" and "alignments" fields. Args: - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. """ if mel_targets is None and stop_token_targets is not None: raise ValueError('no mel targets were provided but token_targets were given') if mel_targets is not None and stop_token_targets is None and not gta: raise ValueError('Mel targets are provided without corresponding token_targets') if not gta and self._hparams.predict_linear==True and linear_targets is None and is_training: raise ValueError('Model is set to use post processing to predict linear spectrograms in training but no linear targets given!') if gta and linear_targets is not None: raise ValueError('Linear spectrogram prediction is not supported in GTA mode!') if is_training and self._hparams.mask_decoder and targets_lengths is None: raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!') if is_training and is_evaluating: raise RuntimeError('Model can not be in training and evaluation modes at the same time!') with tf.variable_scope('inference') as scope: batch_size = tf.shape(inputs)[0] hp = self._hparams assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled') if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training: assert global_step is not None #GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis post_condition = hp.predict_linear and not gta # Embeddings ==> [batch_size, sequence_length, embedding_dim] embedding_table = tf.get_variable( 'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32) embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units] encoder_cell = TacotronEncoderCell( EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'), EncoderRNN(is_training, size=hp.encoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM')) encoder_outputs = encoder_cell(embedded_inputs, input_lengths) #For shape visualization purpose enc_conv_output_shape = encoder_cell.conv_output_shape #Decoder Parts #Attention Decoder Prenet prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet') #Attention Mechanism attention_mechanism = LocationSensitiveAttention(hp.attention_dim, encoder_outputs, hparams=hp, mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths, smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights) #Decoder LSTM Cells decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers, size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_lstm') #Frames Projection layer frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, scope='linear_transform') #<stop_token> projection layer stop_projection 
        # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
        decoder_cell = TacotronDecoderCell(
            prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection)

        # Define the helper for our decoder
        if is_training or is_evaluating or gta:
            self.helper = TacoTrainingHelper(batch_size, mel_targets, stop_token_targets,
                                             hp, gta, is_evaluating, global_step)
        else:
            self.helper = TacoTestHelper(batch_size, hp)

        # Initial decoder state
        decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        # Only use max iterations at synthesis time
        max_iters = hp.max_iters if not (is_training or is_evaluating) else None

        # Decode
        (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
            CustomDecoder(decoder_cell, self.helper, decoder_init_state),
            impute_finished=False,
            maximum_iterations=max_iters,
            swap_memory=hp.tacotron_swap_with_cpu)

        # Reshape outputs to be one output per entry
        # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
        decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
        stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])

        # Postnet
        postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions')

        # Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
        residual = postnet(decoder_output)

        # Project residual to same dimension as mel spectrogram
        # ==> [batch_size, decoder_steps * r, num_mels]
        residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection')
        projected_residual = residual_projection(residual)

        # Compute the mel spectrogram
        mel_outputs = decoder_output + projected_residual

        if post_condition:
            # Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
            # Post-processing network to map mels to linear spectrograms, using the same
            # architecture as the encoder
            post_processing_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training, hparams=hp, scope='post_processing_convolutions'),
                EncoderRNN(is_training, size=hp.encoder_lstm_units,
                           zoneout=hp.tacotron_zoneout_rate, scope='post_processing_LSTM'))

            expand_outputs = post_processing_cell(mel_outputs)
            linear_outputs = FrameProjection(hp.num_freq,
                                             scope='post_processing_projection')(expand_outputs)

        # Grab alignments from the final decoder state
        alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0])

        if is_training:
            self.ratio = self.helper._ratio
        self.inputs = inputs
        self.input_lengths = input_lengths
        self.decoder_output = decoder_output
        self.alignments = alignments
        self.stop_token_prediction = stop_token_prediction
        self.stop_token_targets = stop_token_targets
        self.mel_outputs = mel_outputs
        if post_condition:
            self.linear_outputs = linear_outputs
            self.linear_targets = linear_targets
        self.mel_targets = mel_targets
        self.targets_lengths = targets_lengths

        log('Initialized Tacotron model. Dimensions (? = dynamic shape):')
        log('  Train mode:             {}'.format(is_training))
        log('  Eval mode:              {}'.format(is_evaluating))
        log('  GTA mode:               {}'.format(gta))
        log('  Synthesis mode:         {}'.format(not (is_training or is_evaluating)))
        log('  embedding:              {}'.format(embedded_inputs.shape))
        log('  enc conv out:           {}'.format(enc_conv_output_shape))
        log('  encoder out:            {}'.format(encoder_outputs.shape))
        log('  decoder out:            {}'.format(decoder_output.shape))
        log('  residual out:           {}'.format(residual.shape))
        log('  projected residual out: {}'.format(projected_residual.shape))
        log('  mel out:                {}'.format(mel_outputs.shape))
        if post_condition:
            log('  linear out:             {}'.format(linear_outputs.shape))
        log('  <stop_token> out:       {}'.format(stop_token_prediction.shape))
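A minimal usage sketch for this method, assuming a `Tacotron` wrapper class that exposes `initialize()` and an `hparams` object matching the fields referenced above (`Tacotron`, `hparams`, and the placeholder names are illustrative assumptions, not definitions from this file). With no mel or stop-token targets and `is_training=False`, the model falls back to `TacoTestHelper` and decodes freely until the stop token fires or `hparams.max_iters` is reached, exposing `mel_outputs` and `alignments` on the instance.

# Hypothetical synthesis-time setup; names other than the fields set by initialize()
# (mel_outputs, alignments) are assumptions for illustration.
import tensorflow as tf

with tf.Graph().as_default():
    inputs = tf.placeholder(tf.int32, shape=[None, None], name='inputs')          # [N, T_in] character IDs
    input_lengths = tf.placeholder(tf.int32, shape=[None], name='input_lengths')  # [N]

    model = Tacotron(hparams)  # assumed wrapper class holding self._hparams
    model.initialize(inputs, input_lengths, is_training=False, is_evaluating=False)

    # Tensors made available by initialize():
    mel_op = model.mel_outputs        # [N, decoder_steps * r, num_mels]
    alignments_op = model.alignments  # [N, T_in, decoder_steps]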