def _build_bidirectional_rnn(self, inputs, dtype, hparams, num_bi_layers):
  # Construct forward and backward cells.
  # Each one has num_bi_layers layers, and each layer has num_units.
  fw_cell = model_helper.create_rnn_cell(
      hparams.unit_type, hparams.num_units, num_bi_layers,
      hparams.forget_bias, hparams.in_to_hidden_dropout, self.mode)
  bw_cell = model_helper.create_rnn_cell(
      hparams.unit_type, hparams.num_units, num_bi_layers,
      hparams.forget_bias, hparams.in_to_hidden_dropout, self.mode)

  # initial_state_fw and initial_state_bw default to zero states.
  # bi_outputs is a tuple (output_fw, output_bw) containing the forward and
  # the backward RNN output Tensors.
  # bi_state is a tuple (output_state_fw, output_state_bw) with the forward
  # and the backward final states; each state has num_units.
  # If num_bi_layers > 1, each element is a list of num_bi_layers states.
  bi_outputs, bi_state = tf.nn.bidirectional_dynamic_rnn(
      fw_cell,
      bw_cell,
      inputs,
      dtype=dtype,
      sequence_length=self.input_sequence_length,
      time_major=self.time_major)

  # Return the fw and bw outputs, i.e. ([h1_fw;h1_bw], ..., [hT_fw;hT_bw]),
  # concatenated along the last dimension.
  return tf.concat(bi_outputs, -1), bi_state
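# --- Usage sketch (not part of the original code) ---
# A minimal, self-contained illustration of the shape contract that
# _build_bidirectional_rnn above relies on, assuming the TF 1.x API.
# All names and sizes here are illustrative placeholders.
import tensorflow as tf

def bidi_concat_demo(max_time=7, batch_size=4, num_units=16):
  """Shows that concatenating fw/bw outputs doubles the feature dim."""
  inputs = tf.zeros([max_time, batch_size, num_units])  # time-major input
  fw_cell = tf.nn.rnn_cell.LSTMCell(num_units)
  bw_cell = tf.nn.rnn_cell.LSTMCell(num_units)
  bi_outputs, bi_state = tf.nn.bidirectional_dynamic_rnn(
      fw_cell, bw_cell, inputs, dtype=tf.float32, time_major=True)
  concat = tf.concat(bi_outputs, -1)
  # concat shape: [max_time, batch_size, 2 * num_units]
  return concat, bi_state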
def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state,
                        source_sequence_length):
  """Build an RNN cell that can be used by the decoder."""
  # We only make use of encoder_outputs in attention-based models.
  if hparams.attention:
    raise ValueError("BasicModel doesn't support attention.")

  cell = model_helper.create_rnn_cell(
      unit_type=hparams.unit_type,
      num_units=hparams.num_units,
      num_layers=self.num_decoder_layers,
      num_residual_layers=self.num_decoder_residual_layers,
      forget_bias=hparams.forget_bias,
      dropout=hparams.dropout,
      num_gpus=self.num_gpus,
      mode=self.mode,
      single_cell_fn=self.single_cell_fn)

  # For beam search, we need to replicate encoder info beam_width times.
  if self.mode == tf.contrib.learn.ModeKeys.INFER and hparams.beam_width > 0:
    decoder_initial_state = tf.contrib.seq2seq.tile_batch(
        encoder_state, multiplier=hparams.beam_width)
  else:
    decoder_initial_state = encoder_state

  return cell, decoder_initial_state
def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state,
                        source_sequence_length, base_gpu=0):
  """Build an RNN cell that can be used by the decoder."""
  # We only make use of encoder_outputs in attention-based models.
  if hparams.attention:
    raise ValueError("BasicModel doesn't support attention.")

  cell = model_helper.create_rnn_cell(
      unit_type=hparams.unit_type,
      num_units=self.num_units,
      num_layers=self.num_decoder_layers,
      num_residual_layers=self.num_decoder_residual_layers,
      forget_bias=hparams.forget_bias,
      dropout=hparams.dropout,
      num_gpus=self.num_gpus,
      mode=self.mode,
      single_cell_fn=self.single_cell_fn,
      base_gpu=base_gpu)

  if hparams.language_model:
    encoder_state = cell.zero_state(self.batch_size, self.dtype)
  elif not hparams.pass_hidden_state:
    raise ValueError("For non-attentional model, "
                     "pass_hidden_state needs to be set to True")

  # For beam search, we need to replicate encoder info beam_width times.
  if (self.mode == tf.contrib.learn.ModeKeys.INFER and
      hparams.infer_mode == "beam_search"):
    decoder_initial_state = tf.contrib.seq2seq.tile_batch(
        encoder_state, multiplier=hparams.beam_width)
  else:
    decoder_initial_state = encoder_state

  return cell, decoder_initial_state
def _build_all_encoder_layers(self, bi_encoder_outputs, num_uni_layers,
                              dtype, hparams):
  """Build encoder layers all at once."""
  uni_cell = model_helper.create_rnn_cell(
      unit_type=hparams.unit_type,
      num_units=hparams.num_units,
      num_layers=num_uni_layers,
      num_residual_layers=self.num_encoder_residual_layers,
      forget_bias=hparams.forget_bias,
      dropout=hparams.dropout,
      num_gpus=self.num_gpus,
      base_gpu=1,
      mode=self.mode,
      single_cell_fn=self.single_cell_fn)
  encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
      uni_cell,
      bi_encoder_outputs,
      dtype=dtype,
      sequence_length=self.iterator.source_sequence_length,
      time_major=self.time_major)

  # Use the top layer for now.
  self.encoder_state_list = [encoder_outputs]

  return encoder_state, encoder_outputs
def _build_all_encoder_layers(self, bi_encoder_outputs, num_uni_layers,
                              dtype, hparams):
  """Build encoder layers all at once."""
  uni_cell = model_helper.create_rnn_cell(
      unit_type=hparams.unit_type,
      num_units=hparams.num_units,
      num_layers=num_uni_layers,
      num_residual_layers=self.num_encoder_residual_layers,
      forget_bias=hparams.forget_bias,
      dropout=hparams.dropout,
      mode=self.mode,
      single_cell_fn=self.single_cell_fn,
      global_step=self.global_step)
  if hparams.use_dynamic_rnn:
    encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
        uni_cell,
        bi_encoder_outputs,
        dtype=dtype,
        sequence_length=self.features["source_sequence_length"],
        time_major=self.time_major)
  else:
    encoder_outputs, encoder_state = tf.contrib.recurrent.functional_rnn(
        uni_cell,
        bi_encoder_outputs,
        dtype=dtype,
        sequence_length=self.features["source_sequence_length"],
        time_major=self.time_major,
        use_tpu=hparams.use_tpu)

  # Use the top layer for now.
  self.encoder_state_list = [encoder_outputs]

  return encoder_state, encoder_outputs
def _build_encoder_cell(self, hparams, num_layers):
  return model_helper.create_rnn_cell(
      unit_type=hparams.unit_type,
      num_units=hparams.num_units,
      num_layers=num_layers,
      forget_bias=hparams.forget_bias,
      dropout=hparams.dropout,
      mode=self.mode,
      single_cell_fn=self.single_cell_fn)
def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state,
                        source_sequence_length):
  """Build an RNN cell with an attention mechanism for the decoder."""
  attention_option = hparams.attention
  attention_architecture = hparams.attention_architecture
  if attention_architecture != "standard":
    raise ValueError(
        "Unknown attention architecture %s" % attention_architecture)

  num_units = hparams.num_units
  num_layers = hparams.num_layers
  num_residual_layers = hparams.num_residual_layers
  beam_width = hparams.beam_width
  dtype = tf.float32

  # The attention mechanism expects batch-major memory.
  if self.time_major:
    memory = tf.transpose(encoder_outputs, [1, 0, 2])
  else:
    memory = encoder_outputs

  if self.mode == tf.contrib.learn.ModeKeys.INFER and beam_width > 0:
    memory = tf.contrib.seq2seq.tile_batch(memory, multiplier=beam_width)
    source_sequence_length = tf.contrib.seq2seq.tile_batch(
        source_sequence_length, multiplier=beam_width)
    encoder_state = tf.contrib.seq2seq.tile_batch(
        encoder_state, multiplier=beam_width)
    batch_size = self.batch_size * beam_width
  else:
    batch_size = self.batch_size

  attention_mechanism = create_attention_mechanism(
      attention_option, num_units, memory, source_sequence_length)

  cell = model_helper.create_rnn_cell(
      unit_type=hparams.unit_type,
      num_units=num_units,
      num_layers=num_layers,
      num_residual_layers=num_residual_layers,
      forget_bias=hparams.forget_bias,
      dropout=hparams.dropout,
      base_gpu=hparams.base_gpu,
      mode=self.mode,
      single_cell_fn=self.single_cell_fn)

  # Only generate alignment in greedy INFER mode.
  alignment_history = (self.mode == tf.contrib.learn.ModeKeys.INFER and
                       beam_width == 0)
  cell = tf.contrib.seq2seq.AttentionWrapper(
      cell,
      attention_mechanism,
      attention_layer_size=num_units,
      alignment_history=alignment_history,
      name="attention")
  cell = tf.contrib.rnn.DeviceWrapper(
      cell, model_helper.get_device_str(hparams.base_gpu))

  if hparams.pass_hidden_state:
    decoder_initial_state = cell.zero_state(batch_size, dtype).clone(
        cell_state=encoder_state)
  else:
    decoder_initial_state = cell.zero_state(batch_size, dtype)

  return cell, decoder_initial_state
def _build_encoder_cell(self, hparams, num_layers, num_residual_layers,
                        base_gpu=0):
  """Build a multi-layer RNN cell that can be used by the encoder."""
  if hparams.model == 'model3':
    if hparams.mann == 'ntm':
      return NTMCell(hparams.num_layers, hparams.num_units,
                     use_att_memory=False,
                     att_memory=False,
                     att_memory_size=None,
                     att_memory_vector_dim=None,
                     use_ext_memory=True,
                     ext_memory_size=hparams.num_memory_locations,
                     ext_memory_vector_dim=hparams.memory_unit_size,
                     ext_read_head_num=hparams.read_heads,
                     ext_write_head_num=hparams.write_heads,
                     dropout=hparams.dropout,
                     batch_size=hparams.batch_size,
                     mode=self.mode,
                     shift_range=1,
                     output_dim=hparams.num_units,
                     reuse=False,
                     record_w_history=hparams.record_w_history)
    elif hparams.mann == 'dnc':
      access_config = {
          'memory_size': hparams.num_memory_locations,
          'word_size': hparams.memory_unit_size,
          'num_reads': hparams.read_heads,
          'num_writes': hparams.write_heads,
      }
      controller_config = {
          'num_units': hparams.num_units,
          'num_layers': hparams.num_layers,
      }
      return DNC(access_config, controller_config, hparams.num_units, 20,
                 hparams.dropout, self.mode, hparams.batch_size)
  else:
    return model_helper.create_rnn_cell(
        unit_type=hparams.unit_type,
        num_units=hparams.num_units,
        num_layers=num_layers,
        num_residual_layers=num_residual_layers,
        forget_bias=hparams.forget_bias,
        dropout=hparams.dropout,
        num_gpus=hparams.num_gpus,
        mode=self.mode,
        base_gpu=base_gpu,
        single_cell_fn=self.single_cell_fn,
        num_proj=None)
def create_rnn_cell(self):
  """Create an RNN cell."""
  return model_helper.create_rnn_cell(
      num_layers=self.rnn_num_layers,
      cell_type=self.cell_type,
      num_units=self.num_units,
      dropout=self.dropout,
      mode=self.mode,
      residual_connect=self.residual_connect,
      residual_fn=None,
      num_gpus=self.num_gpus)
def _build_encoder_cell(self, hparams, num_layers, base_gpu=0):
  """Build a multi-layer RNN cell that can be used by the encoder."""
  return model_helper.create_rnn_cell(
      unit_type=hparams.unit_type,
      num_units=hparams.num_units,
      num_layers=num_layers,
      forget_bias=hparams.forget_bias,
      dropout=hparams.dropout,
      mode=self.mode,
      base_gpu=base_gpu,
      single_cell_fn=self.single_cell_fn)
def _build_encoder_cell(self, hparams, num_layers, num_residual_layers,
                        dtype=None):
  """Build a multi-layer RNN cell that can be used by the encoder."""
  return model_helper.create_rnn_cell(
      unit_type=hparams.unit_type,
      num_units=self.num_units,
      num_layers=num_layers,
      num_residual_layers=num_residual_layers,
      forget_bias=hparams.forget_bias,
      dropout=hparams.dropout,
      mode=self.mode,
      dtype=dtype,
      single_cell_fn=self.single_cell_fn,
      use_block_lstm=hparams.use_block_lstm)
def _build_encoder_cell(model, hparams, num_layers, num_residual_layers,
                        base_gpu=0, all_layer_outputs=False):
  """Multi-layer RNN cell for the seq2seq encoder."""
  return model_helper.create_rnn_cell(
      num_units=hparams.num_units,
      num_layers=num_layers,
      num_residual_layers=num_residual_layers,
      dropout=hparams.dropout,
      num_gpus=hparams.num_gpus,
      mode=model.mode,
      base_gpu=base_gpu,
      single_cell_fn=model.single_cell_fn,
      all_layer_outputs=all_layer_outputs)
def _build_rnn(self, hparams):
  if self.time_major:
    self.inputs = tf.transpose(self.inputs)

  # Look up embedding, emb_inp: [max_time, batch_size, num_units]
  # when time_major = True.
  emb_inp = tf.nn.embedding_lookup(self.input_embedding, self.inputs)

  last_hidden_state = []
  # RNN outputs: [max_time, batch_size, num_units]
  with tf.variable_scope("rnn") as scope:
    dtype = scope.dtype
    if hparams.rnn_type == "uni":
      cell = model_helper.create_rnn_cell(
          hparams.unit_type, hparams.num_units, hparams.num_layers,
          hparams.forget_bias, hparams.in_to_hidden_dropout, self.mode)
      # last_hidden_state --> a Tensor of shape [batch_size, cell.state_size],
      # or a list of such Tensors for multiple layers.
      _, last_hidden_state = tf.nn.dynamic_rnn(
          cell,
          emb_inp,
          dtype=dtype,
          sequence_length=self.input_sequence_length,
          time_major=self.time_major)
    elif hparams.rnn_type == "bi":
      num_bi_layers = int(hparams.num_layers / 2)
      print("num_bi_layers %d" % num_bi_layers)

      _, bi_last_hidden_state = self._build_bidirectional_rnn(
          emb_inp, dtype, hparams, num_bi_layers)

      # If the encoder has one layer per direction, it has one fwd and one
      # bwd layer, i.e. two layers in total, and each direction has
      # enc_units, i.e. 2 * enc_units in total.
      if num_bi_layers == 1:
        last_hidden_state = bi_last_hidden_state
      else:
        # Otherwise, interleave the forward and backward states layer by
        # layer.
        last_hidden_state = []
        for layer_id in range(num_bi_layers):
          # bi_last_hidden_state[0] holds the num_layers/2 fwd states.
          last_hidden_state.append(bi_last_hidden_state[0][layer_id])  # forward
          # bi_last_hidden_state[1] holds the num_layers/2 bwd states.
          last_hidden_state.append(bi_last_hidden_state[1][layer_id])  # backward
        last_hidden_state = tuple(last_hidden_state)
    else:
      raise ValueError("Unknown rnn type: %s" % hparams.rnn_type)

  return last_hidden_state
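# --- Usage sketch (not part of the original code) ---
# Plain-Python illustration of the fw/bw state interleaving in the "bi"
# branch above. bi_last_hidden_state mimics the (fw_states, bw_states) pair
# returned by tf.nn.bidirectional_dynamic_rnn for num_bi_layers=2; the
# string values are placeholders.
def interleave_demo():
  fw_states = ("fw_layer0", "fw_layer1")
  bw_states = ("bw_layer0", "bw_layer1")
  bi_last_hidden_state = (fw_states, bw_states)

  last_hidden_state = []
  for layer_id in range(2):
    last_hidden_state.append(bi_last_hidden_state[0][layer_id])  # forward
    last_hidden_state.append(bi_last_hidden_state[1][layer_id])  # backward
  # Result: ("fw_layer0", "bw_layer0", "fw_layer1", "bw_layer1"),
  # i.e. directions alternate layer by layer.
  return tuple(last_hidden_state)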
def build_rnn(self, hparams):
  # Look up embedding: emb_inp.shape = [batch_size, max_seq_length, num_units]
  emb_inp = tf.nn.embedding_lookup(self.input_embedding, self.inputs)

  # rnn_outputs.shape = [batch_size, max_seq_length, num_units]
  with tf.variable_scope("rnn") as scope:
    dtype = scope.dtype
    cell = model_helper.create_rnn_cell(
        hparams.unit_type, hparams.num_units, hparams.num_layers,
        hparams.forget_bias, hparams.in_to_hidden_dropout, self.mode)
    # last_hidden_state --> a Tensor of shape [batch_size, num_units], or a
    # list of such Tensors for multiple layers.
    # rnn_outputs --> a Tensor of shape [batch_size, max_seq_length, num_units].
    # sequence_length is used to zero out outputs past each batch element's
    # true sequence length.
    rnn_outputs, last_hidden_state = tf.nn.dynamic_rnn(
        cell,
        emb_inp,
        dtype=dtype,
        sequence_length=self.input_sequence_length)

  return rnn_outputs, last_hidden_state
def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state,
                        source_sequence_length):
  cell = model_helper.create_rnn_cell(
      unit_type=hparams.unit_type,
      num_units=hparams.num_units,
      num_layers=hparams.num_decoder_layers,
      forget_bias=hparams.forget_bias,
      dropout=hparams.dropout,
      mode=self.mode,
      single_cell_fn=self.single_cell_fn)

  # For beam search, we need to replicate encoder info beam_width times.
  if self.mode == tf.estimator.ModeKeys.PREDICT and hparams.beam_width > 0:
    decoder_initial_state = tf.contrib.seq2seq.tile_batch(
        encoder_state, multiplier=hparams.beam_width)
  else:
    decoder_initial_state = encoder_state

  return cell, decoder_initial_state
def _build_encoder_cell(self, hparams, num_layers, num_residual_layers,
                        fast_reverse=False, reverse=False):
  """Build a multi-layer RNN cell that can be used by the encoder."""
  mlperf_log.gnmt_print(key=mlperf_log.MODEL_HP_DROPOUT,
                        value=hparams.dropout)
  return model_helper.create_rnn_cell(
      unit_type=hparams.unit_type,
      num_units=self.num_units,
      num_layers=num_layers,
      num_residual_layers=num_residual_layers,
      forget_bias=hparams.forget_bias,
      dropout=hparams.dropout,
      mode=self.mode,
      single_cell_fn=self.single_cell_fn,
      global_step=self.global_step,
      fast_reverse=fast_reverse,
      seq_len=self.features["source_sequence_length"] if reverse else None)
def _build_action_decoder_cell(model, hparams, encoder_state, base_gpu):
  """Decoder cell constructor for action states."""
  num_residual_layers = hparams.num_residual_layers
  cell = model_helper.create_rnn_cell(
      num_units=hparams.num_units,
      num_layers=1,
      num_residual_layers=num_residual_layers,
      dropout=hparams.dropout,
      num_gpus=hparams.num_gpus,
      mode=model.mode,
      single_cell_fn=model.single_cell_fn,
      base_gpu=base_gpu)

  # For beam search, we need to replicate encoder info beam_width times.
  if model.mode == tf.contrib.learn.ModeKeys.INFER and hparams.beam_width > 0:
    decoder_initial_state = tf.contrib.seq2seq.tile_batch(
        encoder_state[-1], multiplier=hparams.beam_width)
  else:
    decoder_initial_state = encoder_state[-1]

  return cell, decoder_initial_state
def _build_decoder_cell(model, hparams, encoder_state, base_gpu):
  """Multi-layer RNN cell for the seq2seq decoder."""
  num_layers = hparams.num_layers
  num_residual_layers = hparams.num_residual_layers
  cell = model_helper.create_rnn_cell(
      num_units=hparams.num_units,
      num_layers=num_layers,
      num_residual_layers=num_residual_layers,
      dropout=hparams.dropout,
      num_gpus=hparams.num_gpus,
      mode=model.mode,
      single_cell_fn=model.single_cell_fn,
      base_gpu=base_gpu)

  # For beam search, we need to replicate encoder info beam_width times.
  if model.mode == tf.estimator.ModeKeys.PREDICT and hparams.beam_width > 0:
    decoder_initial_state = seq2seq.tile_batch(
        encoder_state, multiplier=hparams.beam_width)
  else:
    decoder_initial_state = encoder_state

  return cell, decoder_initial_state
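# --- Usage sketch (not part of the original code) ---
# Self-contained illustration of the tile_batch pattern used in the
# beam-search branches above (TF 1.x contrib API assumed). tile_batch
# repeats each batch entry beam_width times, so the decoder can track
# beam_width hypotheses per source sentence. Sizes are illustrative.
import tensorflow as tf

def tile_batch_demo(batch_size=2, num_units=4, beam_width=3):
  encoder_state = tf.zeros([batch_size, num_units])
  tiled = tf.contrib.seq2seq.tile_batch(encoder_state, multiplier=beam_width)
  # tiled shape: [batch_size * beam_width, num_units] = [6, 4], ordered
  # [b0, b0, b0, b1, b1, b1] along the batch axis.
  return tiled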
def _build_encoder(self, hparams):
  """Build a GNMT encoder."""
  if hparams.encoder_type == "uni" or hparams.encoder_type == "bi":
    return super(GNMTModel, self)._build_encoder(hparams)
  if hparams.encoder_type != "gnmt":
    raise ValueError("Unknown encoder_type %s" % hparams.encoder_type)

  # Build GNMT encoder.
  num_layers = hparams.num_layers
  num_residual_layers = hparams.num_residual_layers
  num_bi_layers = 1
  num_uni_layers = num_layers - num_bi_layers
  utils.print_out(" num_bi_layers = %d" % num_bi_layers)
  utils.print_out(" num_uni_layers = %d" % num_uni_layers)

  iterator = self.iterator
  source = iterator.source
  if self.time_major:
    source = tf.transpose(source)

  with tf.variable_scope("encoder") as scope:
    dtype = scope.dtype

    # Look up embedding, emb_inp: [max_time, batch_size, num_units]
    # when time_major = True.
    encoder_emb_inp = tf.nn.embedding_lookup(self.embedding_encoder, source)

    # Execute _build_bidirectional_rnn from the Model class.
    bi_encoder_outputs, bi_encoder_state = self._build_bidirectional_rnn(
        inputs=encoder_emb_inp,
        sequence_length=iterator.source_sequence_length,
        dtype=dtype,
        hparams=hparams,
        num_bi_layers=num_bi_layers,
        num_bi_residual_layers=0,  # no residual connection
    )

    uni_cell = model_helper.create_rnn_cell(
        unit_type=hparams.unit_type,
        num_units=hparams.num_units,
        num_layers=num_uni_layers,
        num_residual_layers=num_residual_layers,
        forget_bias=hparams.forget_bias,
        dropout=hparams.dropout,
        num_gpus=hparams.num_gpus,
        base_gpu=1,
        mode=self.mode,
        single_cell_fn=self.single_cell_fn)

    # encoder_outputs: size [max_time, batch_size, num_units]
    # when time_major = True.
    encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
        uni_cell,
        bi_encoder_outputs,
        dtype=dtype,
        sequence_length=iterator.source_sequence_length,
        time_major=self.time_major)

    # Pass all encoder states except the first bi-directional layer's state
    # to the decoder; only its backward state is kept.
    encoder_state = (bi_encoder_state[1],) + (
        (encoder_state,) if num_uni_layers == 1 else encoder_state)

  return encoder_outputs, encoder_state
def _build_encoder_layers_unidi(self, inputs, sequence_length,
                                num_uni_layers, hparams, dtype):
  """Build unidirectional encoder layers all at once."""
  encoder_outputs = None
  encoder_state = tuple()

  if hparams.use_fused_lstm:
    for i in range(num_uni_layers):
      if (not np.isclose(hparams.dropout, 0.) and
          self.mode == tf.contrib.learn.ModeKeys.TRAIN):
        cell_inputs = tf.nn.dropout(inputs, keep_prob=1 - hparams.dropout)
      else:
        cell_inputs = inputs

      cell = block_lstm.LSTMBlockFusedCell(
          hparams.num_units, hparams.forget_bias, dtype=dtype)
      encoder_outputs, (final_c, final_h) = cell(
          cell_inputs, dtype=dtype, sequence_length=sequence_length)
      encoder_state += (tf.nn.rnn_cell.LSTMStateTuple(final_c, final_h),)

      if i >= num_uni_layers - self.num_encoder_residual_layers:
        # Add the pre-dropout inputs. The residual wrapper is applied after
        # the dropout wrapper.
        encoder_outputs += inputs
      inputs = encoder_outputs
  elif hparams.use_cudnn_lstm:
    # Single-layer cudnn rnn; dropout isn't applied in the kernel.
    for i in range(num_uni_layers):
      if (not np.isclose(hparams.dropout, 0.) and
          self.mode == tf.contrib.learn.ModeKeys.TRAIN):
        inputs = tf.nn.dropout(inputs, keep_prob=1 - hparams.dropout)

      encoder_outputs, encoder_states = self._build_unidi_rnn_cudnn(
          inputs,
          None,  # initial_state
          sequence_length,
          dtype,
          hparams,
          1,  # num_layer
          is_fwd=True)
      encoder_state += (tf.nn.rnn_cell.LSTMStateTuple(encoder_states.c,
                                                      encoder_states.h),)
      if i >= num_uni_layers - self.num_encoder_residual_layers:
        encoder_outputs += inputs
      inputs = encoder_outputs
  else:
    uni_cell = model_helper.create_rnn_cell(
        unit_type=hparams.unit_type,
        num_units=hparams.num_units,
        num_layers=num_uni_layers,
        num_residual_layers=self.num_encoder_residual_layers,
        forget_bias=hparams.forget_bias,
        dropout=hparams.dropout,
        dtype=dtype,
        mode=self.mode,
        single_cell_fn=self.single_cell_fn,
        use_block_lstm=hparams.use_block_lstm)

    if hparams.use_dynamic_rnn:
      encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
          uni_cell,
          inputs,
          dtype=dtype,
          sequence_length=sequence_length,
          time_major=self.time_major)
    else:
      encoder_outputs, encoder_state = tf.contrib.recurrent.functional_rnn(
          uni_cell,
          inputs,
          dtype=dtype,
          sequence_length=sequence_length,
          time_major=self.time_major,
          use_tpu=False)

  return encoder_state, encoder_outputs
def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state,
                        source_sequence_length):
  """Build an RNN cell with an attention mechanism for the decoder."""
  # No attention.
  if not self.has_attention:
    return super(AttentionModel, self)._build_decoder_cell(
        hparams, encoder_outputs, encoder_state, source_sequence_length)
  elif hparams.attention_architecture != "standard":
    raise ValueError(
        "Unknown attention architecture %s" % hparams.attention_architecture)

  num_units = hparams.num_units
  num_layers = self.num_decoder_layers
  # num_residual_layers = self.num_decoder_residual_layers
  infer_mode = hparams.infer_mode
  dtype = tf.float32

  # Ensure memory is batch-major.
  if self.time_major:
    memory = tf.transpose(encoder_outputs, [1, 0, 2])
  else:
    memory = encoder_outputs

  if (self.mode == tf.contrib.learn.ModeKeys.INFER and
      infer_mode == "beam_search"):
    memory, source_sequence_length, encoder_state, batch_size = (
        self._prepare_beam_search_decoder_inputs(
            hparams.beam_width, memory, source_sequence_length,
            encoder_state))
  else:
    batch_size = self.batch_size

  # Attention
  attention_mechanism = self.attention_mechanism_fn(
      hparams.attention, num_units, memory, source_sequence_length, self.mode)

  cell = model_helper.create_rnn_cell(
      unit_type=hparams.unit_type,
      num_units=num_units,
      num_layers=num_layers,
      forget_bias=hparams.forget_bias,
      dropout=hparams.dropout,
      num_gpus=self.num_gpus,
      mode=self.mode,
      single_cell_fn=self.single_cell_fn)

  # Only generate alignment in greedy INFER mode.
  alignment_history = (self.mode == tf.contrib.learn.ModeKeys.INFER and
                       infer_mode != "beam_search")
  cell = tf.contrib.seq2seq.AttentionWrapper(
      cell,
      attention_mechanism,
      attention_layer_size=num_units,
      alignment_history=alignment_history,
      output_attention=hparams.output_attention,
      name="attention")

  # TODO(thangluong): do we need num_layers, num_gpus?
  cell = tf.contrib.rnn.DeviceWrapper(
      cell, model_helper.get_device_str(num_layers - 1, self.num_gpus))

  if hparams.pass_hidden_state:
    decoder_initial_state = cell.zero_state(batch_size, dtype).clone(
        cell_state=encoder_state)
  else:
    decoder_initial_state = cell.zero_state(batch_size, dtype)

  return cell, decoder_initial_state
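# --- Usage sketch (not part of the original code) ---
# Self-contained illustration (TF 1.x contrib API assumed) of the
# pass_hidden_state pattern above. AttentionWrapper has its own state type
# (AttentionWrapperState), so the encoder's final state cannot be fed to
# the decoder directly; zero_state() builds the full wrapper state and
# .clone(cell_state=...) swaps the encoder state into it. LuongAttention
# stands in for whatever attention_mechanism_fn returns; sizes are
# illustrative.
import tensorflow as tf

def attention_initial_state_demo(batch_size=2, num_units=8, max_time=5):
  memory = tf.zeros([batch_size, max_time, num_units])  # batch-major memory
  attention_mechanism = tf.contrib.seq2seq.LuongAttention(num_units, memory)
  cell = tf.contrib.seq2seq.AttentionWrapper(
      tf.nn.rnn_cell.LSTMCell(num_units), attention_mechanism,
      attention_layer_size=num_units)
  encoder_state = tf.nn.rnn_cell.LSTMStateTuple(
      c=tf.zeros([batch_size, num_units]),
      h=tf.zeros([batch_size, num_units]))
  return cell.zero_state(batch_size, tf.float32).clone(
      cell_state=encoder_state)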
def _build_graph(self, hparams, scope=None):
  """Construct the train, evaluation, and inference graphs.

  Args:
    hparams: The hyperparameters for configuration.
    scope: The variable scope name for this subgraph, default
      "dynamic_seq2seq".

  Returns:
    A tuple with (logits, loss, metrics, update_ops).
  """
  enc_inputs, dec_inputs, dec_outputs, seq_len = self.iterator.get_next()

  # Get the size of the batch.
  batch_size = tf.shape(enc_inputs)[0]

  with tf.variable_scope(scope or "dynamic_seq2seq", dtype=tf.float32):
    # Create the encoder.
    dense_input_layer = tf.layers.Dense(hparams.num_units, use_bias=False)

    if hparams.dense_input:
      enc_inputs = dense_input_layer(enc_inputs)

    enc_cells = mdl_help.create_rnn_cell(
        unit_type=hparams.unit_type,
        num_units=hparams.num_units,
        num_layers=hparams.num_layers,
        depth=hparams.depth,
        num_residual_layers=hparams.num_residual_layers,
        forget_bias=hparams.forget_bias,
        dropout=hparams.dropout,
        mode=self.mode,
        use_highway_as_residual=hparams.use_highway_as_residual)

    # Run the encoder.
    enc_outputs, enc_state = tf.nn.dynamic_rnn(
        cell=enc_cells,
        inputs=enc_inputs,
        sequence_length=seq_len,
        swap_memory=True,
        dtype=tf.float32,
        scope="encoder")

    tgt_seq_len = tf.add(seq_len, tf.constant(1, tf.int32))

    # TODO: Add inference decoder.

    # Create the decoder.
    dec_cells = mdl_help.create_rnn_cell(
        unit_type=hparams.unit_type,
        num_units=hparams.num_units,
        num_layers=hparams.num_layers,
        depth=hparams.depth,
        num_residual_layers=hparams.num_residual_layers,
        forget_bias=hparams.forget_bias,
        dropout=hparams.dropout,
        mode=self.mode,
        use_highway_as_residual=hparams.use_highway_as_residual)

    # Decoder embedding.
    decoder_embedding = tf.get_variable(
        "decoder_embedding", [hparams.num_labels, hparams.num_units])

    if hparams.dense_input:
      # Convert to int32 argmax values for the embedding lookup to work.
      dec_inputs = tf.argmax(dec_inputs, axis=-1, output_type=tf.int32)
      dec_inputs = tf.nn.embedding_lookup(decoder_embedding, dec_inputs)

    # Output projection layer.
    projection_layer = tf.layers.Dense(hparams.num_labels, use_bias=False)

    if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
      if hparams.train_helper == "teacher":
        # Teacher forcing.
        helper = tf.contrib.seq2seq.TrainingHelper(
            inputs=dec_inputs, sequence_length=tgt_seq_len)
      elif hparams.train_helper == "sched":
        if hparams.dense_input:
          embedding = decoder_embedding
        else:
          embedding = tf.eye(hparams.num_labels)
        # Scheduled sampling.
        helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
            inputs=dec_inputs,
            sequence_length=tgt_seq_len,
            embedding=embedding,
            sampling_probability=self.sample_probability)
    elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
      if hparams.dense_input:
        embedding = decoder_embedding
      else:
        embedding = tf.eye(hparams.num_labels)
      helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
          inputs=dec_inputs,
          sequence_length=tgt_seq_len,
          embedding=embedding,
          sampling_probability=tf.constant(1.0))

    decoder = tf.contrib.seq2seq.BasicDecoder(
        cell=dec_cells,
        helper=helper,
        initial_state=enc_state,
        output_layer=projection_layer)

    # Run the decoder.
    final_outputs, final_states, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder=decoder,
        impute_finished=True,
        swap_memory=True,
        scope="decoder")

    logits = final_outputs.rnn_output

    # Mask out entries longer than the target sequence length.
    mask = tf.sequence_mask(tgt_seq_len, dtype=tf.float32)

    # Stop gradients through the labels at the cross-entropy op.
    labels = tf.stop_gradient(dec_outputs)

    crossent = tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=logits, labels=labels, name="crossent")

    # loss = (tf.reduce_sum(crossent * mask) /
    #         (hparams.batch_size *
    #          tf.reduce_mean(tf.cast(tgt_seq_len, tf.float32))))
    # Normalize each example's summed cross-entropy by its own length, then
    # average over the batch. crossent and mask are [batch, time], so the
    # length divisor needs a single trailing expand_dims to broadcast as
    # [batch, 1]; the original double expand_dims produced a [batch, 1, 1]
    # divisor that broadcast incorrectly.
    loss = tf.reduce_sum(
        (crossent * mask) /
        tf.expand_dims(tf.cast(tgt_seq_len, tf.float32), -1)
    ) / tf.cast(batch_size, tf.float32)

    metrics = []
    update_ops = []
    if self.mode == tf.contrib.learn.ModeKeys.EVAL:
      predictions = tf.argmax(input=logits, axis=-1)
      targets = tf.argmax(input=dec_outputs, axis=-1)
      acc, acc_update = tf.metrics.accuracy(
          predictions=predictions, labels=targets, weights=mask)
      # Flatten for the confusion matrix.
      targets_flat = tf.reshape(targets, [-1])
      predictions_flat = tf.reshape(predictions, [-1])
      mask_flat = tf.reshape(mask, [-1])
      cm, cm_update = streaming_confusion_matrix(
          labels=targets_flat,
          predictions=predictions_flat,
          num_classes=hparams.num_labels,
          weights=mask_flat)
      tf.add_to_collection("eval", cm_summary(cm, hparams.num_labels))
      metrics = [acc, cm]
      update_ops = [acc_update, cm_update]

  return logits, loss, metrics, update_ops
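# --- Usage sketch (not part of the original code) ---
# Self-contained illustration (TF 1.x assumed) of the masked,
# per-example length-normalized loss computed above, on toy shapes. The
# mask zeroes out padded timesteps; each example's summed cross-entropy is
# divided by its own length before averaging over the batch. All values
# are illustrative.
import tensorflow as tf

def masked_loss_demo(batch_size=2, max_time=5, num_labels=4):
  logits = tf.zeros([batch_size, max_time, num_labels])
  labels = tf.one_hot(tf.zeros([batch_size, max_time], tf.int32), num_labels)
  tgt_seq_len = tf.constant([5, 3])  # lengths for batch_size=2

  crossent = tf.nn.softmax_cross_entropy_with_logits_v2(
      logits=logits, labels=labels)  # [batch, time]
  mask = tf.sequence_mask(tgt_seq_len, max_time, dtype=tf.float32)
  loss = tf.reduce_sum(
      (crossent * mask) /
      tf.expand_dims(tf.cast(tgt_seq_len, tf.float32), -1)
  ) / tf.cast(batch_size, tf.float32)
  return loss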
def _build_graph(self, hparams, scope=None):
  """Construct the train, evaluation, and inference graphs.

  Args:
    hparams: The hyperparameters for configuration.
    scope: The variable scope name for this subgraph.

  Returns:
    A tuple with (logits, loss, metrics, update_ops).
  """
  sample = self.iterator.get_next()
  inputs, tgt_outputs, seq_len = sample

  with tf.variable_scope(scope or "dynamic_bdrnn", dtype=tf.float32):
    # TODO: hidden activations are passed thru FC net
    # TODO: hidden-to-hidden network has skip connections (residual)
    # TODO: initial hidden and cell states are learned

    # Create the bdrnn.
    fw_cells = mdl_help.create_rnn_cell(
        unit_type=hparams.unit_type,
        num_units=hparams.num_units,
        num_layers=hparams.num_layers,
        depth=0,
        num_residual_layers=0,
        forget_bias=hparams.forget_bias,
        dropout=0.,
        mode=self.mode,
        num_gpus=1,
        base_gpu=0)
    bw_cells = mdl_help.create_rnn_cell(
        unit_type=hparams.unit_type,
        num_units=hparams.num_units,
        num_layers=hparams.num_layers,
        depth=0,
        num_residual_layers=0,
        forget_bias=hparams.forget_bias,
        dropout=0.,
        mode=self.mode,
        num_gpus=1,
        base_gpu=0)

    # print(fw_cells.zero_state(1, dtype=tf.float32))
    # initial_fw_state = tf.get_variable("initial_fw_state", shape=fw_cells.state_size)
    # initial_bw_state = tf.get_variable("initial_bw_state", shape=bw_cells.state_size)
    # initial_fw_state_tiled = tf.tile(initial_fw_state, [hparams.batch_size, 1])
    # initial_bw_state_tiled = tf.tile(initial_bw_state, [hparams.batch_size, 1])

    # Run the bdrnn.
    outputs, output_states = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=fw_cells,
        cell_bw=bw_cells,
        inputs=inputs,
        sequence_length=seq_len,
        initial_state_fw=None,
        initial_state_bw=None,
        dtype=tf.float32)

    # outputs is a tuple (output_fw, output_bw);
    # output_fw/output_bw are tensors [batch_size, max_time, cell.output_size].
    # output_states is a tuple (output_state_fw, output_state_bw) containing
    # the final states of the forward and backward rnn.

    # Concatenate the outputs of each direction.
    combined_outputs = tf.concat([outputs[0], outputs[1]], axis=-1)

    # Dense output layers.
    dense1 = tf.layers.dense(
        inputs=combined_outputs,
        units=hparams.num_dense_units,
        activation=tf.nn.relu,
        use_bias=True)
    drop1 = tf.layers.dropout(
        inputs=dense1,
        rate=hparams.dropout,
        training=self.mode == tf.contrib.learn.ModeKeys.TRAIN)
    dense2 = tf.layers.dense(
        inputs=drop1,
        units=hparams.num_dense_units,
        activation=tf.nn.relu,
        use_bias=True)
    drop2 = tf.layers.dropout(
        inputs=dense2,
        rate=hparams.dropout,
        training=self.mode == tf.contrib.learn.ModeKeys.TRAIN)

    logits = tf.layers.dense(
        inputs=drop2, units=hparams.num_labels, use_bias=False)

    # Mask out entries longer than the target sequence length.
    mask = tf.sequence_mask(seq_len, dtype=tf.float32)

    # Stop gradients through the labels at the cross-entropy op.
    tgt_outputs = tf.stop_gradient(tgt_outputs)

    crossent = tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=logits, labels=tgt_outputs, name="crossent")

    # Divide the loss by batch_size * mean(seq_len).
    loss = (tf.reduce_sum(crossent * mask) /
            (hparams.batch_size *
             tf.reduce_mean(tf.cast(seq_len, tf.float32))))

    metrics = []
    update_ops = []
    if self.mode == tf.contrib.learn.ModeKeys.EVAL:
      predictions = tf.argmax(input=logits, axis=-1)
      tgt_labels = tf.argmax(input=tgt_outputs, axis=-1)
      acc, acc_update = tf.metrics.accuracy(
          predictions=predictions, labels=tgt_labels, weights=mask)
      # Confusion matrix.
      targets_flat = tf.reshape(tgt_labels, [-1])
      predictions_flat = tf.reshape(predictions, [-1])
      mask_flat = tf.reshape(mask, [-1])
      cm, cm_update = streaming_confusion_matrix(
          labels=targets_flat,
          predictions=predictions_flat,
          num_classes=hparams.num_labels,
          weights=mask_flat)
      tf.add_to_collection("eval", cm_summary(cm, hparams.num_labels))
      metrics = [acc, cm]
      update_ops = [acc_update, cm_update]

  return logits, loss, metrics, update_ops
def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state,
                        source_sequence_length):
  """Build an RNN cell with an attention mechanism for the decoder."""
  attention_option = hparams.attention
  attention_architecture = hparams.attention_architecture
  if attention_architecture != "standard":
    raise ValueError(
        "Unknown attention architecture %s" % attention_architecture)

  num_units = hparams.num_units
  num_layers = hparams.num_layers
  num_residual_layers = hparams.num_residual_layers
  num_gpus = hparams.num_gpus
  beam_width = hparams.beam_width
  dtype = tf.float32

  if self.time_major:
    memory = tf.transpose(encoder_outputs, [1, 0, 2])
  else:
    memory = encoder_outputs

  if self.mode == tf.contrib.learn.ModeKeys.INFER and beam_width > 0:
    memory = tf.contrib.seq2seq.tile_batch(memory, multiplier=beam_width)
    source_sequence_length = tf.contrib.seq2seq.tile_batch(
        source_sequence_length, multiplier=beam_width)
    encoder_state = tf.contrib.seq2seq.tile_batch(
        encoder_state, multiplier=beam_width)
    batch_size = self.batch_size * beam_width
  else:
    batch_size = self.batch_size

  if hparams.model in ('model0', 'model1', 'model2'):
    att_memory = tf.contrib.layers.fully_connected(
        memory,
        num_units,
        activation_fn=None,
        weights_initializer=tf.random_uniform_initializer(-0.1, 0.1))
    cell = NTMCell(num_layers, num_units,
                   use_att_memory=True,
                   att_memory=att_memory,
                   att_memory_size=hparams.src_max_len,
                   att_memory_vector_dim=num_units,
                   use_ext_memory=(hparams.model == 'model2'),
                   ext_memory_size=hparams.num_memory_locations
                   if hparams.model == 'model2' else None,
                   ext_memory_vector_dim=hparams.memory_unit_size
                   if hparams.model == 'model2' else None,
                   ext_read_head_num=hparams.read_heads
                   if hparams.model == 'model2' else None,
                   ext_write_head_num=hparams.write_heads
                   if hparams.model == 'model2' else None,
                   dropout=hparams.dropout,
                   batch_size=batch_size,
                   mode=self.mode,
                   output_dim=num_units,
                   addressing_mode='content' if hparams.model == 'model0'
                   else 'content_and_location')

    decoder_initial_state = cell.zero_state(batch_size, dtype)
    if hparams.pass_hidden_state:
      decoder_initial_state = tuple(
          [encoder_state] + list(decoder_initial_state[1:]))
  else:
    attention_mechanism = create_attention_mechanism(
        attention_option, num_units, memory, source_sequence_length)

    cell = model_helper.create_rnn_cell(
        unit_type=hparams.unit_type,
        num_units=num_units,
        num_layers=num_layers,
        num_residual_layers=num_residual_layers,
        forget_bias=hparams.forget_bias,
        dropout=hparams.dropout,
        num_gpus=num_gpus,
        mode=self.mode,
        single_cell_fn=self.single_cell_fn,
        num_proj=None,
        num_cells=2 if (hparams.encoder_type == "bi") else 1)

    # Only generate alignment in greedy INFER mode.
    alignment_history = (self.mode == tf.contrib.learn.ModeKeys.INFER and
                         beam_width == 0)
    cell = tf.contrib.seq2seq.AttentionWrapper(
        cell,
        attention_mechanism,
        attention_layer_size=num_units,
        alignment_history=alignment_history,
        name="attention")

    # TODO(thangluong): do we need num_layers, num_gpus?
    cell = tf.contrib.rnn.DeviceWrapper(
        cell, model_helper.get_device_str(num_layers - 1, num_gpus))

    if hparams.pass_hidden_state:
      decoder_initial_state = cell.zero_state(batch_size, dtype).clone(
          cell_state=encoder_state)
    else:
      decoder_initial_state = cell.zero_state(batch_size, dtype)

  return cell, decoder_initial_state