def rcnn_base(inputs, hp, rois, roi_scores, bbox_labels, roi_pool_layer,
              head_to_tail, trainable=True, anchor_labels=None,
              cls_weights_initializer=None, reg_weights_initializer=None):
  """The RCNN head of Faster-RCNN: pools each region proposal produced by
  the RPN, then classifies it and regresses a refined bounding box.

  :param inputs: image
  :param hp: hyper parameters
  :param rois: regions of interest
  :param roi_scores: scores of all rois
  :param roi_pool_layer: roi pooling function (layer)
  :param head_to_tail: fully connected network
  :param trainable: whether to train this network
  :param bbox_labels: target bounding boxes
  :param anchor_labels: anchor labels when using the RPN to generate rois
  :param cls_weights_initializer: weights initializer for the classification layer
  :param reg_weights_initializer: weights initializer for the regression layer
  :return: rcnn info (dict), print_pool (dict) for debugging,
      activation for visualization
  """
  # For debugging
  print_pool = dict()
  # For activation
  activation = None
  with tf.variable_scope("rcnn"):
    with tf.device(helper.get_device_str(device_id=0, num_gpus=hp.num_gpus)):
      # Fill rcnn's bbox_label, class_label, in_weights, out_weights, rois
      rcnn_info, rois, _ = helper.pack_proposal_info(
          anchor_labels, rois,
          bbox_scores=roi_scores,
          bbox_targets=tf.squeeze(bbox_labels, axis=0),
          num_class=hp.num_class)
      pool = roi_pool_layer(inputs, rois)
      fc = head_to_tail(pool)
      probs, predicts, scores = _rcnn_cls_layer(
          fc, hp.num_class, trainable=trainable,
          weights_initializer=cls_weights_initializer)
      deltas = _rcnn_reg_layer(
          fc, 4 * hp.num_class, trainable=trainable,
          weights_initializer=reg_weights_initializer)
      # Pack rcnn info into a dict for calculating the loss (training only)
      misc.append_params(rcnn_info, class_scores=scores,
                         class_predicts=predicts, class_probs=probs,
                         bbox_predicts=deltas)
  return rcnn_info, print_pool, activation
def _build_encoder_simple(model, intent, intent_length, num_units):
  """Build an encoder for the intent."""
  with tf.variable_scope("encoder") as scope:
    dtype = scope.dtype
    # Look up embedding; emb_inp: [batch_size, max_time, num_units]
    # (time_major=False below, so the batch dimension comes first).
    encoder_emb_inp = tf.nn.embedding_lookup(model.embedding_encoder, intent)
    cell = model_helper._single_cell(
        num_units,
        model.hparams.dropout,
        model.mode,
        residual_connection=False,
        device_str=model_helper.get_device_str(model.global_gpu_num,
                                               model.hparams.num_gpus))
    model.global_gpu_num += 1
    encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
        cell,
        encoder_emb_inp,
        dtype=dtype,
        sequence_length=intent_length,
        time_major=False,
        swap_memory=True)
  return encoder_outputs, encoder_state, encoder_emb_inp
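# Every snippet in this section places ops via model_helper.get_device_str,
# which is not shown here. The following is a minimal sketch, assuming the
# same round-robin behaviour as the helper of this name in the TensorFlow NMT
# codebase; it is an illustration, not this repository's code.
def get_device_str_sketch(device_id, num_gpus):
  """Return a device string for multi-GPU setups, e.g. "/gpu:1"."""
  if num_gpus == 0:
    return "/cpu:0"
  # Cycle layer/worker ids across the available GPUs.
  return "/gpu:%d" % (device_id % num_gpus)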
def _build_encoder_hierarchial(model, data_source, num_units):
  """Build a two-level (hierarchical) encoder for the kb."""
  source = data_source  # [batch_size * num_entry, 13]
  with tf.variable_scope("encoder") as scope:
    dtype = scope.dtype
    # Look up embedding; emb_inp: [batch_size * num_entry, max_time, num_units]
    encoder_emb_inp = tf.nn.embedding_lookup(model.embedding_encoder, source)
    # Level 1: encode each kb entry independently.
    cell_0 = model_helper._single_cell(
        num_units,
        model.hparams.dropout,
        model.mode,
        residual_connection=False,
        device_str=model_helper.get_device_str(model.global_gpu_num,
                                               model.hparams.num_gpus))
    model.global_gpu_num += 1
    with tf.variable_scope("hierarchial_rnn_1"):
      _, encoder_final_states_0 = tf.nn.dynamic_rnn(
          cell_0,
          encoder_emb_inp,
          dtype=dtype,
          time_major=False,
          swap_memory=True)
    # Regroup per-entry final states: [batch_size, num_entry, num_units]
    encoder_final_states_0 = tf.reshape(encoder_final_states_0,
                                        [model.batch_size, -1, num_units])
    # Level 2: encode the sequence of per-entry states.
    cell_1 = model_helper._single_cell(
        num_units,
        model.hparams.dropout,
        model.mode,
        residual_connection=False,
        device_str=model_helper.get_device_str(model.global_gpu_num,
                                               model.hparams.num_gpus))
    model.global_gpu_num += 1
    with tf.variable_scope("hierarchial_rnn_2"):
      encoder_outputs_1, encoder_state_1 = tf.nn.dynamic_rnn(
          cell_1,
          encoder_final_states_0,
          dtype=dtype,
          time_major=False,
          swap_memory=True)
  return encoder_outputs_1, encoder_state_1, encoder_emb_inp
def build_graph(self, hparams, scope=None):
  """Subclass must implement this method.

  Creates a sequence-to-sequence model with dynamic RNN decoder API.

  Args:
    hparams: Hyperparameter configurations.
    scope: VariableScope for the created subgraph; default "dynamic_seq2seq".

  Returns:
    A tuple of the form (logits, loss_tuple, final_context_state, sample_id),
    where:
      logits: float32 Tensor [batch_size x num_decoder_symbols].
      loss: the total loss / batch_size.
      final_context_state: the final state of the decoder RNN.
      sample_id: sampling indices.

  Raises:
    ValueError: if encoder_type differs from mono and bi, or
      attention_option is not (luong | scaled_luong | bahdanau |
      normed_bahdanau).
  """
  utils.print_out("# Creating %s graph ..." % self.mode)

  # Projection
  if not self.extract_encoder_layers:
    with tf.variable_scope(scope or "build_network"):
      with tf.variable_scope("decoder/output_projection"):
        self.output_layer = tf.layers.Dense(
            self.tgt_vocab_size, use_bias=False, name="output_projection")

  with tf.variable_scope(scope or "dynamic_seq2seq", dtype=self.dtype):
    # Encoder
    if hparams.language_model:  # no encoder for language modeling
      utils.print_out("  language modeling: no encoder")
      self.encoder_outputs = None
      encoder_state = None
    else:
      self.encoder_outputs, encoder_state = self._build_encoder(hparams)

    # Skip decoder if extracting only encoder layers
    if self.extract_encoder_layers:
      return

    ## Decoder
    logits, decoder_cell_outputs, sample_id, final_context_state = (
        self._build_decoder(self.encoder_outputs, encoder_state, hparams))

    ## Loss
    if self.mode != tf.contrib.learn.ModeKeys.INFER:
      with tf.device(model_helper.get_device_str(self.num_encoder_layers - 1,
                                                 self.num_gpus)):
        loss = self._compute_loss(logits, decoder_cell_outputs)
    else:
      loss = tf.constant(0.0)

  return logits, loss, final_context_state, sample_id
def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state,
                        source_sequence_length):
  """Build an RNN cell with an attention mechanism for the decoder."""
  attention_option = hparams.attention
  attention_architecture = hparams.attention_architecture
  if attention_architecture != "standard":
    raise ValueError(
        "Unknown attention architecture %s" % attention_architecture)

  num_units = hparams.num_units
  num_layers = hparams.num_layers
  num_residual_layers = hparams.num_residual_layers
  beam_width = hparams.beam_width
  dtype = tf.float32

  # Attention mechanisms expect batch-major memory.
  if self.time_major:
    memory = tf.transpose(encoder_outputs, [1, 0, 2])
  else:
    memory = encoder_outputs

  if self.mode == tf.contrib.learn.ModeKeys.INFER and beam_width > 0:
    memory = tf.contrib.seq2seq.tile_batch(memory, multiplier=beam_width)
    source_sequence_length = tf.contrib.seq2seq.tile_batch(
        source_sequence_length, multiplier=beam_width)
    encoder_state = tf.contrib.seq2seq.tile_batch(
        encoder_state, multiplier=beam_width)
    batch_size = self.batch_size * beam_width
  else:
    batch_size = self.batch_size

  attention_mechanism = create_attention_mechanism(
      attention_option, num_units, memory, source_sequence_length)

  cell = model_helper.create_rnn_cell(
      unit_type=hparams.unit_type,
      num_units=num_units,
      num_layers=num_layers,
      num_residual_layers=num_residual_layers,
      forget_bias=hparams.forget_bias,
      dropout=hparams.dropout,
      base_gpu=hparams.base_gpu,
      mode=self.mode,
      single_cell_fn=self.single_cell_fn)

  # Only generate alignment in greedy INFER mode.
  alignment_history = (self.mode == tf.contrib.learn.ModeKeys.INFER and
                       beam_width == 0)
  cell = tf.contrib.seq2seq.AttentionWrapper(
      cell,
      attention_mechanism,
      attention_layer_size=num_units,
      alignment_history=alignment_history,
      name="attention")
  cell = tf.contrib.rnn.DeviceWrapper(
      cell, model_helper.get_device_str(hparams.base_gpu))

  if hparams.pass_hidden_state:
    decoder_initial_state = cell.zero_state(batch_size, dtype).clone(
        cell_state=encoder_state)
  else:
    decoder_initial_state = cell.zero_state(batch_size, dtype)

  return cell, decoder_initial_state
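# `create_attention_mechanism` is referenced above but not shown. A minimal
# sketch, assuming it mirrors the dispatch in the TensorFlow NMT tutorial
# over the four options named in the docstrings of this section
# (luong | scaled_luong | bahdanau | normed_bahdanau).
def create_attention_mechanism_sketch(attention_option, num_units, memory,
                                      source_sequence_length):
  if attention_option == "luong":
    return tf.contrib.seq2seq.LuongAttention(
        num_units, memory, memory_sequence_length=source_sequence_length)
  elif attention_option == "scaled_luong":
    return tf.contrib.seq2seq.LuongAttention(
        num_units, memory, memory_sequence_length=source_sequence_length,
        scale=True)
  elif attention_option == "bahdanau":
    return tf.contrib.seq2seq.BahdanauAttention(
        num_units, memory, memory_sequence_length=source_sequence_length)
  elif attention_option == "normed_bahdanau":
    return tf.contrib.seq2seq.BahdanauAttention(
        num_units, memory, memory_sequence_length=source_sequence_length,
        normalize=True)
  raise ValueError("Unknown attention option %s" % attention_option)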
def build_graph(self, hparams, scope=None):
  """Subclass must implement this method.

  Creates an RNN encoder with a linear classification layer over the final
  encoder states (for intent classification).

  Args:
    hparams: Hyperparameter configurations.
    scope: VariableScope for the created subgraph; default "rnn".

  Returns:
    A tuple of the form (label_logits, loss, label_pred), where:
      label_logits: float32 Tensor [batch_size x lbl_vocab_size].
      loss: the total loss / batch_size.
      label_pred: predicted label ids.
  """
  utils.print_out("\n# Creating %s graph ..." % self.mode)

  with tf.variable_scope(scope or "rnn", dtype=self.dtype):
    # Encoder
    self.encoder_outputs, encoder_state = self._build_encoder(hparams)
    fw_state, bw_state = encoder_state
    print('encoder_outputs: ', self.encoder_outputs.shape)
    print('fw_state.h: ', fw_state.h.shape)
    print('bw_state.h: ', bw_state.h.shape)

    # Linear layer for classification of intent
    encoder_last_state = tf.concat([fw_state.h, bw_state.h], axis=1)
    print('encoder_last_state: ', encoder_last_state.shape)
    print()

    encoder_output_size = encoder_last_state.get_shape()[1].value
    print('encoder_output_size: ', encoder_output_size)
    w = tf.get_variable('w', [encoder_output_size, self.lbl_vocab_size],
                        dtype=tf.float32)
    w_t = tf.transpose(w)  # unused
    v = tf.get_variable('v', [self.lbl_vocab_size], dtype=tf.float32)

    # Apply the linear layer
    label_logits = tf.nn.xw_plus_b(encoder_last_state, w, v)
    label_pred = tf.argmax(label_logits, 1)
    print('label_logits: ', label_logits.shape)
    print()

    ## Loss
    if self.mode != tf.contrib.learn.ModeKeys.INFER:
      with tf.device(
          model_helper.get_device_str(
              self.num_encoder_layers - 1, self.num_gpus)):
        loss = self._compute_loss(label_logits)
    else:
      loss = tf.constant(0.0)

  return label_logits, loss, label_pred
def create_train_model(model_creator, hparams, scope=None, single_cell_fn=None):
  """Create train graph, model, and iterator."""
  src_file = "%s.%s" % (hparams.train_prefix, hparams.src)
  tgt_file = "%s.%s" % (hparams.train_prefix, hparams.tgt)
  tgt_vocab_file = hparams.tgt_vocab_file

  graph = tf.Graph()
  with graph.as_default():
    tgt_vocab_table = vocab_utils.create_tgt_vocab_table(tgt_vocab_file)
    src_dataset = tf.contrib.data.TextLineDataset(src_file)
    tgt_dataset = tf.contrib.data.TextLineDataset(tgt_file)
    skip_count_placeholder = tf.placeholder(shape=(), dtype=tf.int64)

    iterator = iterator_utils.get_iterator(
        src_dataset,
        tgt_dataset,
        tgt_vocab_table,
        sos=hparams.sos,
        eos=hparams.eos,
        source_reverse=hparams.source_reverse,
        random_seed=hparams.random_seed,
        src_max_len=hparams.src_max_len,
        tgt_max_len=hparams.tgt_max_len,
        skip_count=skip_count_placeholder)

    # Note: one can set model_device_fn to
    # `tf.train.replica_device_setter(ps_tasks)` for distributed training.
    with tf.device(model_helper.get_device_str(hparams.base_gpu)):
      # model_creator: the model class/constructor
      model = model_creator(
          hparams,
          iterator=iterator,
          mode=tf.contrib.learn.ModeKeys.TRAIN,
          target_vocab_table=tgt_vocab_table,
          scope=scope,
          single_cell_fn=single_cell_fn)

  return TrainModel(
      graph=graph,
      model=model,
      iterator=iterator,
      skip_count_placeholder=skip_count_placeholder)
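# A hypothetical usage sketch of create_train_model: build the bundle, then
# initialize tables and the iterator inside the bundled graph and run one
# training step. The session/loop details below are assumptions for
# illustration, not code from this repository.
def train_sketch(model_creator, hparams):
  train_model = create_train_model(model_creator, hparams)
  with tf.Session(graph=train_model.graph) as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    # skip_count lets training resume mid-epoch by skipping consumed lines.
    sess.run(train_model.iterator.initializer,
             feed_dict={train_model.skip_count_placeholder: 0})
    _ = sess.run(train_model.model.update)  # one optimizer step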
def build_graph(self, hparams, scope=None):
  """Subclass must implement this method.

  Creates a sequence-to-sequence model with dynamic RNN decoder API.

  Args:
    hparams: Hyperparameter configurations.
    scope: VariableScope for the created subgraph; default "dynamic_seq2seq".

  Returns:
    A tuple of the form (logits, loss, final_context_state, sample_id), where:
      logits: float32 Tensor [batch_size x num_decoder_symbols].
      loss: the total loss / batch_size.
      final_context_state: the final state of the decoder RNN.
      sample_id: sampling indices.

  Raises:
    ValueError: if encoder_type differs from mono and bi, or
      attention_option is not (luong | scaled_luong | bahdanau |
      normed_bahdanau).
  """
  utils.print_out("# creating %s graph ..." % self.mode)
  dtype = tf.float32

  with tf.variable_scope(scope or "dynamic_seq2seq", dtype=dtype):
    # Encoder
    encoder_outputs, encoder_state = self._build_encoder(hparams)

    ## Decoder
    logits, sample_id, final_context_state = self._build_decoder(
        encoder_outputs, encoder_state, hparams)
    print("logits", logits)
    print("sample_id", sample_id)
    print("final_context_state", final_context_state)

    ## Loss
    if self.mode != tf.contrib.learn.ModeKeys.INFER:
      with tf.device(
          model_helper.get_device_str(
              self.num_encoder_layers - 1, self.num_gpus)):
        loss = self._compute_loss(logits)
    else:
      # No loss at inference time; pass the logits through.
      loss = logits

  return logits, loss, final_context_state, sample_id
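# `_compute_loss` is called above but not shown. A minimal sketch, assuming
# the standard masked cross-entropy of the TensorFlow NMT tutorial; the
# iterator fields it uses appear elsewhere in this section, but this exact
# body is an assumption about the repository.
def _compute_loss_sketch(self, logits):
  target_output = self.iterator.target_output
  if self.time_major:
    target_output = tf.transpose(target_output)
  max_time = tf.shape(target_output)[0 if self.time_major else 1]
  crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
      labels=target_output, logits=logits)
  # Mask out positions beyond each target sequence's length.
  target_weights = tf.sequence_mask(
      self.iterator.target_sequence_length, max_time, dtype=logits.dtype)
  if self.time_major:
    target_weights = tf.transpose(target_weights)
  return tf.reduce_sum(crossent * target_weights) / tf.to_float(self.batch_size)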
def build_graph(self, flags):
  """Build the A2C graph."""
  with tf.variable_scope(flags.model_name):
    # TODO: fix tf.device for multiple GPUs.
    with tf.device(model_helper.get_device_str(0, self.num_gpus)):
      c1 = self.conv2d(self.state, self.cv_num_outputs, self.f_height,
                       self.f_width, self.stride, scope="conv2d_1")
      # Integer division: conv parameters must stay ints under Python 3.
      c2 = self.conv2d(c1, self.cv_num_outputs * 2, self.f_height // 2,
                       self.f_width // 2, self.stride // 2, scope="conv2d_2")
      fc = self.linear(self.flatten(c2), self.num_units,
                       activation_fn=tf.nn.relu, scope='flat')

      # Reshape fc for the RNN: [1, None, self.num_units]
      rnn_input = tf.reshape(fc, [1, -1, self.num_units])
      step_size = tf.shape(rnn_input)[1:2]

      cell = self.create_rnn_cell()
      self.h_in = cell.zero_state(1, tf.float32)
      rnn_output, self.h_out = tf.nn.dynamic_rnn(
          cell, rnn_input, initial_state=self.h_in, sequence_length=step_size)
      rnn_output = tf.reshape(rnn_output, [-1, self.num_units])

      # Policy head
      self.policy = self.linear(
          rnn_output, self.action_size, activation_fn=tf.nn.softmax,
          scope='policy')
      # Value head
      self.value = self.linear(rnn_output, 1, scope='value')

      # Compute loss
      if self.mode != tf.estimator.ModeKeys.PREDICT:
        loss = self.compute_loss()
      else:
        loss = tf.constant(0.0)
  return loss
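# `compute_loss` is referenced above but not shown. A minimal sketch of a
# standard A2C objective, assuming hypothetical placeholders self.actions,
# self.advantages and self.returns exist on the model; the coefficients are
# illustrative defaults, not this repository's values.
def compute_loss_sketch(self):
  action_one_hot = tf.one_hot(self.actions, self.action_size, dtype=tf.float32)
  # Probability the policy assigned to the action actually taken.
  responsible = tf.reduce_sum(self.policy * action_one_hot, axis=1)
  policy_loss = -tf.reduce_sum(tf.log(responsible + 1e-8) * self.advantages)
  value_loss = 0.5 * tf.reduce_sum(
      tf.square(self.returns - tf.reshape(self.value, [-1])))
  # Entropy bonus encourages exploration.
  entropy = -tf.reduce_sum(self.policy * tf.log(self.policy + 1e-8))
  return policy_loss + 0.5 * value_loss - 0.01 * entropy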
def __init__(self, hparams, mode, iterator, target_vocab_table,
             reverse_target_vocab_table=None, scope=None, single_cell_fn=None):
  """Create the model.

  Args:
    hparams: Hyperparameter configurations.
    mode: TRAIN | EVAL | INFER
    iterator: Dataset Iterator that feeds data.
    target_vocab_table: Lookup table mapping target words to ids.
    reverse_target_vocab_table: Lookup table mapping ids to target words.
      Only required in INFER mode. Defaults to None.
    scope: scope of the model.
    single_cell_fn: allows adding a customized cell. When not specified,
      we default to model_helper._single_cell.
  """
  assert isinstance(iterator, iterator_utils.BatchedInput)
  self.iterator = iterator
  self.mode = mode
  self.tgt_vocab_table = target_vocab_table

  self.tgt_vocab_size = hparams.tgt_vocab_size
  self.num_layers = hparams.num_layers
  self.num_gpus = hparams.num_gpus
  self.time_major = hparams.time_major

  self.cnn_input = self.iterator.source
  if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
    self.cnn = AlexNet(self.cnn_input, (1 - hparams.dropout),
                       model_helper.get_device_str(hparams.base_gpu))
  else:
    self.cnn = AlexNet(self.cnn_input, 1,
                       model_helper.get_device_str(hparams.base_gpu))

  # Initializer
  initializer = model_helper.get_initializer(
      hparams.init_op, hparams.random_seed, hparams.init_weight)
  tf.get_variable_scope().set_initializer(initializer)

  # Embeddings
  self.init_embeddings(hparams, scope)
  self.batch_size = tf.size(self.iterator.source_sequence_length)

  # Projection
  with tf.variable_scope(scope or "build_network"):
    with tf.variable_scope("decoder/output_projection"):
      self.output_layer = layers_core.Dense(
          hparams.tgt_vocab_size, use_bias=False, name="output_projection")

  # To make it flexible for external code to add other cell types.
  # If not specified, we will later use model_helper._single_cell.
  self.single_cell_fn = single_cell_fn

  ## Train graph
  res = self.build_graph(hparams, scope=scope)

  if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
    self.train_loss = res[1]
    self.word_count = tf.reduce_sum(self.iterator.target_sequence_length)
  elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
    self.eval_loss = res[1]
  elif self.mode == tf.contrib.learn.ModeKeys.INFER:
    self.infer_logits, _, self.final_context_state, self.sample_id = res
    self.sample_words = reverse_target_vocab_table.lookup(
        tf.to_int64(self.sample_id))

  if self.mode != tf.contrib.learn.ModeKeys.INFER:
    ## Count the number of predicted words to compute ppl.
    self.predict_count = tf.reduce_sum(self.iterator.target_sequence_length)

  ## Learning rate
  print("  start_decay_step=%d, learning_rate=%g, decay_steps %d, "
        "decay_factor %g" % (hparams.start_decay_step, hparams.learning_rate,
                             hparams.decay_steps, hparams.decay_factor))
  self.global_step = tf.Variable(0, trainable=False)

  params = tf.trainable_variables()

  # Gradients and SGD update operation for training the model.
  # Arrange for the embedding vars to appear at the beginning.
  if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
    if hparams.optimizer == "sgd":
      self.learning_rate = tf.cond(
          self.global_step < hparams.start_decay_step,
          lambda: tf.constant(hparams.learning_rate),
          lambda: tf.train.exponential_decay(
              hparams.learning_rate,
              self.global_step - hparams.start_decay_step,
              hparams.decay_steps,
              hparams.decay_factor,
              staircase=True),
          name="learning_rate")
      opt = tf.train.GradientDescentOptimizer(self.learning_rate)
      tf.summary.scalar("lr", self.learning_rate)
    elif hparams.optimizer == "adam":
      assert float(hparams.learning_rate) <= 0.001, (
          "! High Adam learning rate %g" % hparams.learning_rate)
      self.learning_rate = tf.constant(hparams.learning_rate)
      opt = tf.train.AdamOptimizer(self.learning_rate)

    gradients = tf.gradients(
        self.train_loss, params,
        colocate_gradients_with_ops=hparams.colocate_gradients_with_ops)
    clipped_gradients, gradient_norm_summary = model_helper.gradient_clip(
        gradients, max_gradient_norm=hparams.max_gradient_norm)
    self.update = opt.apply_gradients(
        zip(clipped_gradients, params), global_step=self.global_step)

    # Summary
    self.train_summary = tf.summary.merge(
        [tf.summary.scalar("lr", self.learning_rate),
         tf.summary.scalar("train_loss", self.train_loss)] +
        gradient_norm_summary)

  if self.mode == tf.contrib.learn.ModeKeys.INFER:
    self.infer_summary = self._get_infer_summary(hparams)

  # Saver
  if hparams.eval_on_fly:
    self.saver = tf.train.Saver(tf.global_variables(),
                                save_relative_paths=True)
  else:
    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=None,
                                save_relative_paths=True)

  # Print trainable variables
  utils.print_out("# Trainable variables")
  for param in params:
    utils.print_out("  %s, %s, %s" % (param.name, str(param.get_shape()),
                                      param.op.device))
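# model_helper.gradient_clip is used above but not shown. A minimal sketch,
# assuming it matches the helper of the same name in the TensorFlow NMT
# codebase: global-norm clipping plus summaries of the norms.
def gradient_clip_sketch(gradients, max_gradient_norm):
  clipped_gradients, gradient_norm = tf.clip_by_global_norm(
      gradients, max_gradient_norm)
  gradient_norm_summary = [tf.summary.scalar("grad_norm", gradient_norm)]
  gradient_norm_summary.append(
      tf.summary.scalar("clipped_gradient", tf.global_norm(clipped_gradients)))
  return clipped_gradients, gradient_norm_summary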
def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state,
                        source_sequence_length):
  """Build an RNN cell with an attention mechanism for the decoder."""
  # No attention
  if not self.has_attention:
    return super(AttentionModel, self)._build_decoder_cell(
        hparams, encoder_outputs, encoder_state, source_sequence_length)
  elif hparams.attention_architecture != "standard":
    raise ValueError("Unknown attention architecture %s" %
                     hparams.attention_architecture)

  num_units = hparams.num_units
  num_layers = self.num_decoder_layers
  # num_residual_layers = self.num_decoder_residual_layers
  infer_mode = hparams.infer_mode
  dtype = tf.float32

  # Ensure memory is batch-major
  if self.time_major:
    memory = tf.transpose(encoder_outputs, [1, 0, 2])
  else:
    memory = encoder_outputs

  if (self.mode == tf.contrib.learn.ModeKeys.INFER and
      infer_mode == "beam_search"):
    memory, source_sequence_length, encoder_state, batch_size = (
        self._prepare_beam_search_decoder_inputs(
            hparams.beam_width, memory, source_sequence_length,
            encoder_state))
  else:
    batch_size = self.batch_size

  # Attention
  attention_mechanism = self.attention_mechanism_fn(
      hparams.attention, num_units, memory, source_sequence_length, self.mode)

  cell = model_helper.create_rnn_cell(
      unit_type=hparams.unit_type,
      num_units=num_units,
      num_layers=num_layers,
      forget_bias=hparams.forget_bias,
      dropout=hparams.dropout,
      num_gpus=self.num_gpus,
      mode=self.mode,
      single_cell_fn=self.single_cell_fn)

  # Only generate alignment in greedy INFER mode.
  alignment_history = (self.mode == tf.contrib.learn.ModeKeys.INFER and
                       infer_mode != "beam_search")
  cell = tf.contrib.seq2seq.AttentionWrapper(
      cell,
      attention_mechanism,
      attention_layer_size=num_units,
      alignment_history=alignment_history,
      output_attention=hparams.output_attention,
      name="attention")

  # TODO(thangluong): do we need num_layers, num_gpus?
  cell = tf.contrib.rnn.DeviceWrapper(
      cell, model_helper.get_device_str(num_layers - 1, self.num_gpus))

  if hparams.pass_hidden_state:
    decoder_initial_state = cell.zero_state(batch_size, dtype).clone(
        cell_state=encoder_state)
  else:
    decoder_initial_state = cell.zero_state(batch_size, dtype)

  return cell, decoder_initial_state
def build_graph(model, hparams, scope=None):
  """Build the computation graph."""
  utils.print_out("# creating %s graph ..." % model.mode)
  dtype = tf.float32
  num_layers = hparams.num_layers
  num_gpus = hparams.num_gpus

  with tf.variable_scope(scope or "dynamic_seq2seq", dtype=dtype):
    # Encoder
    # Look up embedding; emb_inp: [batch_size, max_time, num_units]
    with tf.variable_scope("encoder_emb_inp"):
      encoder_emb_inp = tf.nn.embedding_lookup(model.embedding_encoder,
                                               model.iterator.source)
      action_emb_inp = tf.nn.embedding_lookup(model.embedding_encoder,
                                              model.iterator.action)
    with tf.variable_scope("encoder1_intent"):
      res = _build_encoder_simple(
          model,
          model.iterator.intent,
          model.iterator.intent_len,
          num_units=hparams.encoder_intent_unit)
      _, encoder_state1_aux, _ = res
    with tf.variable_scope("encoder2_kb"):
      res = _build_encoder_hierarchial(
          model, model.iterator.kb, num_units=hparams.encoder_kb_unit)
      _, encoder_state2_aux, _ = res

    # Encoder 1 = customer, encoder 2 = agent.
    with tf.variable_scope("encoder1"):
      model.encoder_input_projection1 = layers_core.Dense(
          hparams.num_units, use_bias=False,
          name="encoder_1_input_projection")
      tiled_encoder_state1_aux = tf.reshape(
          encoder_state1_aux,
          [model.batch_size, 1, hparams.encoder_intent_unit])
      time_step = tf.shape(encoder_emb_inp)[1]
      tiled_encoder_state1_aux = tf.tile(tiled_encoder_state1_aux,
                                         [1, time_step, 1])
      # [batch, time, emb_intent + num_units]
      concat1 = tf.concat([encoder_emb_inp, tiled_encoder_state1_aux], 2)
      encoder1_input = model.encoder_input_projection1(concat1)
      encoder_outputs1, encoder_state1 = _build_encoder(
          model, encoder1_input, hparams)

    with tf.variable_scope("encoder2"):
      model.encoder_input_projection2 = layers_core.Dense(
          hparams.num_units, use_bias=False,
          name="encoder_2_input_projection")
      tiled_encoder_state2_aux = tf.reshape(
          encoder_state2_aux,
          [model.batch_size, 1, hparams.encoder_kb_unit])
      time_step = tf.shape(encoder_emb_inp)[1]
      tiled_encoder_state2_aux = tf.tile(tiled_encoder_state2_aux,
                                         [1, time_step, 1])
      # [batch, time, emb_kb + num_units]
      concat2 = tf.concat([encoder_emb_inp, tiled_encoder_state2_aux], 2)
      encoder2_input = model.encoder_input_projection2(concat2)
      encoder_outputs2, encoder_state2 = _build_encoder(
          model, encoder2_input, hparams)

    ## Decoder
    with tf.variable_scope("decoder1"):
      res = _build_decoder(model, encoder_outputs1, encoder_state1, hparams,
                           vocab_utils.start_of_turn1,
                           vocab_utils.start_of_turn2,
                           model.output_layer1, encoder_state1_aux)
      logits_trian1, _, sample_id_train1, sample_id_infer1 = res
    with tf.variable_scope("decoder2"):
      res = _build_decoder(model, encoder_outputs2, encoder_state2, hparams,
                           vocab_utils.start_of_turn2,
                           vocab_utils.start_of_turn1,
                           model.output_layer2, encoder_state2_aux)
      logits_trian2, _, sample_id_train2, sample_id_infer2 = res
    with tf.variable_scope("decoder_action"):
      res = _build_decoder_action(
          model, encoder_state2, hparams,
          hparams.t1,  # dialogue ends with t2, action starts with t1
          hparams.t2,
          model.output_layer_action)
      logits_trian3, _, sample_id_train3, sample_id_infer3 = res

    with tf.variable_scope("value_network1"):
      res = _build_value_network(model, encoder_emb_inp, action_emb_inp,
                                 encoder_state1_aux, model.vn_project11,
                                 model.vn_project12, hparams)
      dialogue1_val, _ = res
    with tf.variable_scope("value_network2"):
      res = _build_value_network(model, encoder_emb_inp, action_emb_inp,
                                 encoder_state2_aux, model.vn_project21,
                                 model.vn_project22, hparams, True)
      dialogue2_val, action_val = res

    model.logits_trian1 = logits_trian1
    model.logits_trian2 = logits_trian2
    model.dialogue1_val = dialogue1_val
    model.dialogue2_val = dialogue2_val

    if model.mode in [
        tf.contrib.learn.ModeKeys.TRAIN,
        tf.contrib.learn.ModeKeys.EVAL,
        dialogue_utils.mode_self_play_mutable
    ]:
      with tf.device(model_helper.get_device_str(num_layers - 1, num_gpus)):
        sl_loss, sl_loss_arr = _compute_loss(model, logits_trian1,
                                             logits_trian2, logits_trian3)
      with tf.device(model_helper.get_device_str(num_layers - 1, num_gpus)):
        rl_loss_arr = _compute_loss_selfplay(
            model, logits_trian1, logits_trian2, logits_trian3,
            dialogue1_val, dialogue2_val, action_val)
    elif (model.mode == tf.contrib.learn.ModeKeys.INFER or
          model.mode == dialogue_utils.mode_self_play_immutable):
      sl_loss, sl_loss_arr, rl_loss_arr = None, None, None
    else:
      raise ValueError("mode not known")

    sample_id_arr_train = [sample_id_train1, sample_id_train2,
                           sample_id_train3]
    sample_id_arr_infer = [sample_id_infer1, sample_id_infer2,
                           sample_id_infer3]

  return (sl_loss, sl_loss_arr, rl_loss_arr, sample_id_arr_train,
          sample_id_arr_infer)
def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state,
                        source_sequence_length):
  """Build an RNN cell with an attention mechanism for the decoder."""
  attention_option = hparams.attention
  attention_architecture = hparams.attention_architecture
  if attention_architecture != "standard":
    raise ValueError("Unknown attention architecture %s" %
                     attention_architecture)

  num_units = hparams.num_units
  num_layers = hparams.num_layers
  num_residual_layers = hparams.num_residual_layers
  num_gpus = hparams.num_gpus
  beam_width = hparams.beam_width
  dtype = tf.float32

  if self.time_major:
    memory = tf.transpose(encoder_outputs, [1, 0, 2])
  else:
    memory = encoder_outputs

  if self.mode == tf.contrib.learn.ModeKeys.INFER and beam_width > 0:
    memory = tf.contrib.seq2seq.tile_batch(memory, multiplier=beam_width)
    source_sequence_length = tf.contrib.seq2seq.tile_batch(
        source_sequence_length, multiplier=beam_width)
    encoder_state = tf.contrib.seq2seq.tile_batch(
        encoder_state, multiplier=beam_width)
    batch_size = self.batch_size * beam_width
  else:
    batch_size = self.batch_size

  if hparams.model in ('model0', 'model1', 'model2'):
    att_memory = tf.contrib.layers.fully_connected(
        memory, num_units, activation_fn=None,
        weights_initializer=tf.random_uniform_initializer(-0.1, 0.1))
    cell = NTMCell(
        num_layers, num_units,
        use_att_memory=True,
        att_memory=att_memory,
        att_memory_size=hparams.src_max_len,
        att_memory_vector_dim=num_units,
        use_ext_memory=(hparams.model == 'model2'),
        ext_memory_size=(hparams.num_memory_locations
                         if hparams.model == 'model2' else None),
        ext_memory_vector_dim=(hparams.memory_unit_size
                               if hparams.model == 'model2' else None),
        ext_read_head_num=(hparams.read_heads
                           if hparams.model == 'model2' else None),
        ext_write_head_num=(hparams.write_heads
                            if hparams.model == 'model2' else None),
        dropout=hparams.dropout,
        batch_size=batch_size,
        mode=self.mode,
        output_dim=num_units,
        addressing_mode=('content' if hparams.model == 'model0'
                         else 'content_and_location'))
    decoder_initial_state = cell.zero_state(batch_size, dtype)
    if hparams.pass_hidden_state:
      decoder_initial_state = tuple(
          [encoder_state] + list(decoder_initial_state[1:]))
  else:
    attention_mechanism = create_attention_mechanism(
        attention_option, num_units, memory, source_sequence_length)
    cell = model_helper.create_rnn_cell(
        unit_type=hparams.unit_type,
        num_units=num_units,
        num_layers=num_layers,
        num_residual_layers=num_residual_layers,
        forget_bias=hparams.forget_bias,
        dropout=hparams.dropout,
        num_gpus=num_gpus,
        mode=self.mode,
        single_cell_fn=self.single_cell_fn,
        num_proj=None,
        num_cells=2 if (hparams.encoder_type == "bi") else 1)
    # Only generate alignment in greedy INFER mode.
    alignment_history = (self.mode == tf.contrib.learn.ModeKeys.INFER and
                         beam_width == 0)
    cell = tf.contrib.seq2seq.AttentionWrapper(
        cell,
        attention_mechanism,
        attention_layer_size=num_units,
        alignment_history=alignment_history,
        name="attention")
    # TODO(thangluong): do we need num_layers, num_gpus?
    cell = tf.contrib.rnn.DeviceWrapper(
        cell, model_helper.get_device_str(num_layers - 1, num_gpus))
    if hparams.pass_hidden_state:
      decoder_initial_state = cell.zero_state(batch_size, dtype).clone(
          cell_state=encoder_state)
    else:
      decoder_initial_state = cell.zero_state(batch_size, dtype)

  return cell, decoder_initial_state
def rpn_base(inputs, hp, im_info, bbox_labels, feat_stride=16, anchor_count=9,
             trainable=True):
  """A Region Proposal Network (RPN) takes an image (of any size) as input
  and outputs a set of rectangular object proposals, each with an objectness
  score [quoted from Faster-RCNN].

  :param inputs: image
  :param hp: hyper parameters
  :param im_info: image size [height, width, channel]
  :param bbox_labels: target bounding boxes
  :param feat_stride: stride between anchor centres on the input image
  :param anchor_count: number of base anchors generated per location
  :param trainable: whether to train this network
  :return: rect object proposals, rpn info, print_pool (dict) for debugging,
      activation for visualization
  """
  # For debugging
  print_pool = dict()
  # For activation
  activation = None
  with tf.variable_scope("rpn"):
    with tf.device(helper.get_device_str(device_id=0, num_gpus=hp.num_gpus)):
      # Build the anchors for the image
      anchors, all_count = helper.generate_img_anchors(
          im_info, feat_stride, ratios=hp.anchor_ratios,
          scales=hp.anchor_scales)
      print_pool.update(anchor_shape=tf.shape(anchors))
      # rpn_conv = layers.conv2d(inputs, hp.rpn_channel, [3, 3],
      #                          trainable=self.trainable,
      #                          weights_initializer=self.initializer,
      #                          scope="rpn_conv_3x3")
      rpn_conv = slim.conv2d(inputs, hp.rpn_channel, [3, 3],
                             trainable=trainable, scope="rpn_conv_3x3")
      # Visualize rpn
      activation = rpn_conv
      probs, predicts, scores, reshaped_scores = _rpn_cls_layer(
          rpn_conv, anchor_count * 2)
      deltas = _rpn_reg_layer(rpn_conv, anchor_count * 4)
      print_pool.update(deltas_shape=tf.shape(deltas))
      rpn_info = dict()
      if trainable:
        # Generate rois and roi scores on the image
        rois, roi_scores = helper.sample_rois_from_anchors(
            probs, deltas, im_info, anchors, anchor_count)
        # Gather info for calculating rpn's loss.
        # Fill rpn's bbox_label, class_label, in_weights, out_weights
        rpn_info = helper.pack_anchor_info(
            im_info, anchors,
            ori_anchor_count=anchor_count,
            bbox_targets=tf.squeeze(bbox_labels, axis=0),
            anchor_scores=scores)
      else:
        # Why use probs as scores?
        rois, _ = helper.sample_rois_from_anchors(
            probs, deltas, im_info, anchors, anchor_count)
      # Fill in the rest of rpn's info
      misc.append_params(
          rpn_info,
          rois=rois,
          class_probs=probs,
          class_predicts=predicts,
          class_reshaped_scores=reshaped_scores,
          sigma=hp.rpn_sigma,
          # Use the full scores instead of roi_scores so that the gradient
          # can back-propagate through all params in _rpn_cls_layer.
          bbox_predicts=deltas,
          bbox_scores=scores)
  return rois, rpn_info, print_pool, activation
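# helper.generate_img_anchors is used above but not shown. A hypothetical
# numpy sketch of the usual Faster-RCNN construction: tile a base set of
# anchors over every feature-map cell. The name and signature are
# illustrative, not this repository's helper.
import numpy as np

def generate_img_anchors_sketch(im_info, feat_stride, base_anchors):
  """base_anchors: [A, 4] boxes centred at the origin."""
  height = int(np.ceil(im_info[0] / feat_stride))
  width = int(np.ceil(im_info[1] / feat_stride))
  shift_x = np.arange(width) * feat_stride
  shift_y = np.arange(height) * feat_stride
  sx, sy = np.meshgrid(shift_x, shift_y)
  # One (x1, y1, x2, y2) shift per feature-map cell: [K, 4]
  shifts = np.stack([sx.ravel(), sy.ravel(), sx.ravel(), sy.ravel()], axis=1)
  # Broadcast every cell shift over every base anchor -> [K * A, 4]
  anchors = (base_anchors[None, :, :] + shifts[:, None, :]).reshape(-1, 4)
  return anchors, anchors.shape[0]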
def _build_decoder(self, encoder_outputs, encoder_state, hparams):
  """Build and run an RNN decoder with a final projection layer.

  Args:
    encoder_outputs: The outputs of the encoder for every time step.
    encoder_state: The final state of the encoder.
    hparams: The Hyperparameter configurations.

  Returns:
    A tuple of final logits and final decoder state:
      logits: size [time, batch_size, vocab_size] when time_major=True.
  """
  tgt_sos_id = tf.cast(
      self.tgt_vocab_table.lookup(tf.constant(hparams.sos)), tf.int32)
  tgt_eos_id = tf.cast(
      self.tgt_vocab_table.lookup(tf.constant(hparams.eos)), tf.int32)

  num_layers = hparams.num_layers
  num_gpus = hparams.num_gpus
  iterator = self.iterator

  # maximum_iterations: the maximum number of decoding steps.
  maximum_iterations = self._get_infer_maximum_iterations(
      hparams, iterator.source_sequence_length)

  ## Decoder.
  with tf.variable_scope("decoder") as decoder_scope:
    cell, decoder_initial_state = self._build_decoder_cell(
        hparams, encoder_outputs, encoder_state,
        iterator.source_sequence_length)

    ## Train or eval
    if self.mode != tf.contrib.learn.ModeKeys.INFER:
      # decoder_emb_inp: [max_time, batch_size, num_units]
      target_input = iterator.target_input
      if self.time_major:
        target_input = tf.transpose(target_input)
      decoder_emb_inp = tf.nn.embedding_lookup(
          self.embedding_decoder, target_input)

      # Helper
      helper = tf.contrib.seq2seq.TrainingHelper(
          decoder_emb_inp, iterator.target_sequence_length,
          time_major=self.time_major)

      # Decoder
      my_decoder = tf.contrib.seq2seq.BasicDecoder(
          cell,
          helper,
          decoder_initial_state,)

      # Dynamic decoding
      outputs, final_context_state, _ = tf.contrib.seq2seq.dynamic_decode(
          my_decoder,
          output_time_major=self.time_major,
          swap_memory=True,
          scope=decoder_scope)

      sample_id = outputs.sample_id

      # Note: there's a subtle difference here between train and inference.
      # We could have set output_layer when create my_decoder
      #   and shared more code between train and inference.
      # We chose to apply the output_layer to all timesteps for speed:
      #   10% improvements for small models & 20% for larger ones.
      # If memory is a concern, we should apply output_layer per timestep.
      device_id = num_layers if num_layers < num_gpus else (num_layers - 1)
      with tf.device(model_helper.get_device_str(device_id, num_gpus)):
        logits = self.output_layer(outputs.rnn_output)

    ## Inference
    else:
      beam_width = hparams.beam_width
      length_penalty_weight = hparams.length_penalty_weight
      start_tokens = tf.fill([self.batch_size], tgt_sos_id)
      end_token = tgt_eos_id

      if beam_width > 0:
        my_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
            cell=cell,
            embedding=self.embedding_decoder,
            start_tokens=start_tokens,
            end_token=end_token,
            initial_state=decoder_initial_state,
            beam_width=beam_width,
            output_layer=self.output_layer,
            length_penalty_weight=length_penalty_weight)
      else:
        # Helper
        helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
            self.embedding_decoder, start_tokens, end_token)

        # Decoder
        my_decoder = tf.contrib.seq2seq.BasicDecoder(
            cell,
            helper,
            decoder_initial_state,
            output_layer=self.output_layer  # applied per timestep
        )

      # Dynamic decoding
      outputs, final_context_state, _ = tf.contrib.seq2seq.dynamic_decode(
          my_decoder,
          maximum_iterations=maximum_iterations,
          output_time_major=self.time_major,
          swap_memory=True,
          scope=decoder_scope)

      if beam_width > 0:
        logits = tf.no_op()
        sample_id = outputs.predicted_ids
      else:
        logits = outputs.rnn_output
        sample_id = outputs.sample_id

  return logits, sample_id, final_context_state
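# `_get_infer_maximum_iterations` caps the number of decoding steps above.
# A minimal sketch, assuming the heuristic of the TensorFlow NMT tutorial:
# a fixed hparam if given (tgt_max_len_infer is an assumed name), otherwise
# twice the longest source sequence in the batch.
def _get_infer_maximum_iterations_sketch(self, hparams,
                                         source_sequence_length):
  if hparams.tgt_max_len_infer:
    return hparams.tgt_max_len_infer
  decoding_length_factor = 2.0
  max_encoder_length = tf.reduce_max(source_sequence_length)
  return tf.to_int32(tf.round(
      tf.to_float(max_encoder_length) * decoding_length_factor))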
def build_graph(self, hparams, scope=None):
  """Subclass must implement this method.

  Creates a sequence-to-sequence model with dynamic RNN decoder API, plus a
  linear intent-classification layer over the final encoder states.

  Args:
    hparams: Hyperparameter configurations.
    scope: VariableScope for the created subgraph; default "dynamic_seq2seq".

  Returns:
    A tuple of the form ([label_logits, slot_logits], loss,
    final_context_state, sample_id, label_pred), where:
      label_logits: float32 Tensor [batch_size x lbl_vocab_size].
      slot_logits: float32 Tensor [batch_size x num_decoder_symbols].
      loss: the total loss / batch_size.
      final_context_state: the final state of the decoder RNN.
      sample_id: sampling indices.
      label_pred: predicted intent label ids.

  Raises:
    ValueError: if encoder_type differs from mono and bi, or
      attention_option is not (luong | scaled_luong | bahdanau |
      normed_bahdanau).
  """
  utils.print_out("\n# Creating %s graph ..." % self.mode)

  # Projection
  with tf.variable_scope(scope or "build_network"):
    with tf.variable_scope("decoder/output_projection"):
      self.output_layer = tf.layers.Dense(
          self.tgt_vocab_size, use_bias=False, name="output_projection")

  with tf.variable_scope(scope or "dynamic_seq2seq", dtype=self.dtype):
    # Encoder
    self.encoder_outputs, encoder_state = self._build_encoder(hparams)
    fw_state, bw_state = encoder_state
    print('encoder_outputs: ', self.encoder_outputs.shape)
    print('fw_state.h: ', fw_state.h.shape)
    print('bw_state.h: ', bw_state.h.shape)

    # Linear layer for classification of intent
    encoder_last_state = tf.concat([fw_state.h, bw_state.h], axis=1)
    print('encoder_last_state: ', encoder_last_state.shape)
    print()

    encoder_output_size = encoder_last_state.get_shape()[1].value
    print('encoder_output_size: ', encoder_output_size)
    w = tf.get_variable('w', [encoder_output_size, self.lbl_vocab_size],
                        dtype=tf.float32)
    w_t = tf.transpose(w)  # unused
    v = tf.get_variable('v', [self.lbl_vocab_size], dtype=tf.float32)

    # Apply the linear layer
    label_logits = tf.nn.xw_plus_b(encoder_last_state, w, v)
    label_pred = tf.argmax(label_logits, 1)
    print('label_logits: ', label_logits.shape)
    print()

    ## Decoder
    slot_logits, decoder_cell_outputs, sample_id, final_context_state = (
        self._build_decoder(self.encoder_outputs, encoder_state, hparams))

    ## Loss
    if self.mode != tf.contrib.learn.ModeKeys.INFER:
      with tf.device(
          model_helper.get_device_str(
              self.num_encoder_layers - 1, self.num_gpus)):
        loss = self._compute_loss(label_logits, slot_logits,
                                  decoder_cell_outputs)
    else:
      loss = [tf.constant(0.0), tf.constant(0.0)]

  return [label_logits, slot_logits], loss, final_context_state, \
      sample_id, label_pred
def _build_decoder(self, encoder_outputs, encoder_state, hparams):
  """Build and run an RNN decoder with a final projection layer.

  Args:
    encoder_outputs: The outputs of the encoder for every time step.
    encoder_state: The final state of the encoder.
    hparams: The Hyperparameter configurations.

  Returns:
    A tuple of final logits and final decoder state:
      logits: size [time, batch_size, vocab_size] when time_major=True.
  """
  tgt_sos_id = tf.cast(
      self.tgt_vocab_table.lookup(tf.constant(hparams.sos)), tf.int32)
  tgt_eos_id = tf.cast(
      self.tgt_vocab_table.lookup(tf.constant(hparams.eos)), tf.int32)
  iterator = self.iterator

  # maximum_iterations: the maximum number of decoding steps.
  maximum_iterations = self._get_infer_maximum_iterations(
      hparams, iterator.source_sequence_length)

  ## Decoder.
  with tf.variable_scope("decoder") as decoder_scope:
    cell, decoder_initial_state = self._build_decoder_cell(
        hparams, encoder_outputs, encoder_state,
        iterator.source_sequence_length)

    # Optional ops depend on which mode we are in and which loss function
    # we are using.
    logits = tf.no_op()
    decoder_cell_outputs = None

    ## Train or eval
    if self.mode != tf.contrib.learn.ModeKeys.INFER:
      # decoder_emb_inp: [max_time, batch_size, num_units]
      target_input = iterator.target_input
      if self.time_major:
        target_input = tf.transpose(target_input)
      decoder_emb_inp = tf.nn.embedding_lookup(
          self.embedding_decoder, target_input)

      # Helper
      helper = tf.contrib.seq2seq.TrainingHelper(
          decoder_emb_inp, iterator.target_sequence_length,
          time_major=self.time_major)

      # Decoder
      my_decoder = tf.contrib.seq2seq.BasicDecoder(
          cell,
          helper,
          decoder_initial_state,)

      # Dynamic decoding
      outputs, final_context_state, _ = tf.contrib.seq2seq.dynamic_decode(
          my_decoder,
          output_time_major=self.time_major,
          swap_memory=True,
          scope=decoder_scope)

      sample_id = outputs.sample_id

      if self.num_sampled_softmax > 0:
        # Note: this is required when using sampled_softmax_loss.
        decoder_cell_outputs = outputs.rnn_output

      # Note: there's a subtle difference here between train and inference.
      # We could have set output_layer when create my_decoder
      #   and shared more code between train and inference.
      # We chose to apply the output_layer to all timesteps for speed:
      #   10% improvements for small models & 20% for larger ones.
      # If memory is a concern, we should apply output_layer per timestep.
      num_layers = self.num_decoder_layers
      num_gpus = self.num_gpus
      device_id = num_layers if num_layers < num_gpus else (num_layers - 1)
      # Colocate output layer with the last RNN cell if there is no extra GPU
      # available. Otherwise, put last layer on a separate GPU.
      with tf.device(model_helper.get_device_str(device_id, num_gpus)):
        logits = self.output_layer(outputs.rnn_output)

      if self.num_sampled_softmax > 0:
        logits = tf.no_op()  # unused when using sampled softmax loss.

    ## Inference
    else:
      infer_mode = hparams.infer_mode
      start_tokens = tf.fill([self.batch_size], tgt_sos_id)
      end_token = tgt_eos_id
      utils.print_out(
          "  decoder: infer_mode=%s, beam_width=%d, "
          "length_penalty=%f, coverage_penalty=%f" %
          (infer_mode, hparams.beam_width, hparams.length_penalty_weight,
           hparams.coverage_penalty_weight))

      if infer_mode == "beam_search":
        beam_width = hparams.beam_width
        length_penalty_weight = hparams.length_penalty_weight
        coverage_penalty_weight = hparams.coverage_penalty_weight

        my_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
            cell=cell,
            embedding=self.embedding_decoder,
            start_tokens=start_tokens,
            end_token=end_token,
            initial_state=decoder_initial_state,
            beam_width=beam_width,
            output_layer=self.output_layer,
            length_penalty_weight=length_penalty_weight,
            coverage_penalty_weight=coverage_penalty_weight)
      elif infer_mode == "sample":
        # Helper
        sampling_temperature = hparams.sampling_temperature
        assert sampling_temperature > 0.0, (
            "sampling_temperature must be greater than 0.0 when using the"
            " sample decoder.")
        helper = tf.contrib.seq2seq.SampleEmbeddingHelper(
            self.embedding_decoder, start_tokens, end_token,
            softmax_temperature=sampling_temperature,
            seed=self.random_seed)
      elif infer_mode == "greedy":
        helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
            self.embedding_decoder, start_tokens, end_token)
      else:
        raise ValueError("Unknown infer_mode '%s'" % infer_mode)

      if infer_mode != "beam_search":
        my_decoder = tf.contrib.seq2seq.BasicDecoder(
            cell,
            helper,
            decoder_initial_state,
            output_layer=self.output_layer  # applied per timestep
        )

      # Dynamic decoding
      outputs, final_context_state, _ = tf.contrib.seq2seq.dynamic_decode(
          my_decoder,
          maximum_iterations=maximum_iterations,
          output_time_major=self.time_major,
          swap_memory=True,
          scope=decoder_scope)

      if infer_mode == "beam_search":
        sample_id = outputs.predicted_ids
      else:
        logits = outputs.rnn_output
        sample_id = outputs.sample_id

  return logits, decoder_cell_outputs, sample_id, final_context_state
def build_graph(self, hparams, scope=None):
  """Subclass must implement this method.

  Creates a sequence-to-sequence model with dynamic RNN decoder API.

  Args:
    hparams: Hyperparameter configurations.
    scope: VariableScope for the created subgraph; default "dynamic_seq2seq".

  Returns:
    A tuple of the form (logits, loss, final_context_state, sample_id), where:
      logits: float32 Tensor [batch_size x num_decoder_symbols].
      loss: the total loss / batch_size.
      final_context_state: the final state of the decoder RNN.

  Raises:
    ValueError: if encoder_type differs from mono and bi, or
      attention_option is not (luong | scaled_luong | bahdanau |
      normed_bahdanau).
  """
  utils.print_out("# creating %s graph ..." % self.mode)
  dtype = tf.float32
  num_layers = hparams.num_layers
  num_gpus = hparams.num_gpus

  if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
    self.use_fed_source = tf.placeholder(tf.bool)
    self.fed_source = tf.placeholder(tf.int32,
                                     shape=(None, hparams.src_max_len))

  with tf.variable_scope(scope or "dynamic_seq2seq", dtype=dtype):
    # Encoder
    encoder_outputs, encoder_state = self._build_encoder(hparams)

    ## Decoder
    logits, sample_id, final_context_state = self._build_decoder(
        encoder_outputs, encoder_state, hparams)

    if (hparams.beam_width > 0 and
        self.mode == tf.contrib.learn.ModeKeys.INFER):
      cell_state = final_context_state.cell_state
    else:
      cell_state = final_context_state

    if hparams.mann == 'ntm':
      if hparams.model in ('model0', 'model1'):
        print('here', final_context_state)
        final_state = Model1NTMState(*cell_state)
      elif hparams.model == 'model2':
        final_state = Model2NTMState(*cell_state)
      else:
        final_state = Model3NTMState(*cell_state)

    self.att_w_history = tf.no_op()
    self.ext_w_history = tf.no_op()
    if hparams.record_w_history:
      if hparams.mann == 'ntm' and hparams.model in (
          'model0', 'model1', 'model2', 'model3'):
        att_w_history = final_state.att_w_history.stack()
        self.att_w_history = tf.transpose(att_w_history, [1, 2, 0])
      if hparams.mann == 'ntm' and hparams.model in ('model2', 'model3'):
        # list(...) so the transposes are materialized under Python 3,
        # where map() returns a lazy iterator.
        self.ext_w_history = list(map(
            lambda hist: tf.transpose(hist.stack(), [1, 2, 0]),
            final_state.ext_w_history))

    ## Loss
    if self.mode != tf.contrib.learn.ModeKeys.INFER:
      with tf.device(model_helper.get_device_str(num_layers - 1, num_gpus)):
        loss = self._compute_loss(logits)
    else:
      loss = None

  return logits, loss, final_context_state, sample_id