def generate_optimizer(self, loss, params, name, learning_rate, max_gradient_norm):
  """Builds a clipped-gradient update op for `loss` over `params`.

  Args:
    loss: Scalar loss tensor to differentiate.
    params: Variables to update.
    name: Suffix used to name the created ops.
    learning_rate: Learning rate (tensor or float) for the optimizer.
    max_gradient_norm: Global-norm clipping threshold.

  Returns:
    A tuple (update_op, gradient_norm_summary).
  """
  # Select the optimizer configured in hparams; anything other than
  # "sgd" falls back to Adam.
  if self.hparams.optimizer == "sgd":
    optimizer = tf.train.GradientDescentOptimizer(
        learning_rate, name="SGD_self_play_" + name)
  else:
    optimizer = tf.train.AdamOptimizer(
        learning_rate, name="ADAM_self_play_" + name)

  grads = tf.gradients(
      loss,
      params,
      colocate_gradients_with_ops=self.hparams.colocate_gradients_with_ops,
      name="gradients_" + name)

  clipped, norm_summary = model_helper.gradient_clip(
      grads, max_gradient_norm=max_gradient_norm)

  update_op = optimizer.apply_gradients(
      zip(clipped, params), global_step=self.global_step, name=name)
  return update_op, norm_summary
def _set_train_or_infer(self, res, reverse_target_vocab_table,
                        reverse_target_intent_vocab_table, hparams):
  """Set up training and inference.

  Args:
    res: Graph-construction result; res[1] is the loss in TRAIN/EVAL mode,
      and res unpacks to (logits, _, label_pred) in INFER mode.
    reverse_target_vocab_table: NOTE(review): unused in this method —
      presumably kept for signature compatibility; confirm with callers.
    reverse_target_intent_vocab_table: Lookup table mapping intent ids back
      to intent strings (used in INFER mode).
    hparams: Hyperparameters (learning rate, optimizer, clipping settings).
  """
  if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
    self.train_loss = res[1]
  elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
    self.eval_loss = res[1]
  elif self.mode == tf.contrib.learn.ModeKeys.INFER:
    self.infer_logits, _, self.label_pred = res
    # Map predicted intent ids back to human-readable intent strings.
    self.sample_intent = reverse_target_intent_vocab_table.lookup(
        tf.to_int64(self.label_pred))

  params = tf.trainable_variables()

  # Gradients and SGD update operation for training the model.
  # Arrange for the embedding vars to appear at the beginning.
  if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
    self.learning_rate = tf.constant(hparams.learning_rate)
    # warm-up
    self.learning_rate = self._get_learning_rate_warmup(hparams)
    # decay
    self.learning_rate = self._get_learning_rate_decay(hparams)

    # Optimizer
    if hparams.optimizer == "sgd":
      opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    elif hparams.optimizer == "adam":
      opt = tf.train.AdamOptimizer(self.learning_rate)
    else:
      raise ValueError("Unknown optimizer type %s" % hparams.optimizer)

    # Gradients
    gradients = tf.gradients(
        self.train_loss,
        params,
        colocate_gradients_with_ops=hparams.colocate_gradients_with_ops)

    # Clip by global norm; gradient_clip here returns (grads, summary, norm).
    clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip(
        gradients, max_gradient_norm=hparams.max_gradient_norm)
    self.grad_norm_summary = grad_norm_summary
    self.grad_norm = grad_norm

    self.update = opt.apply_gradients(
        zip(clipped_grads, params), global_step=self.global_step)

    # Summary
    self.train_summary = self._get_train_summary()
  elif self.mode == tf.contrib.learn.ModeKeys.INFER:
    self.infer_summary = self._get_infer_summary(hparams)

  # Print trainable variables
  utils.print_out("# Trainable variables")
  utils.print_out("Format: <name>, <shape>, <(soft) device placement>")
  for param in params:
    utils.print_out(" %s, %s, %s" % (param.name, str(param.get_shape()),
                                     param.op.device))
def _set_train_or_infer(self, res, hparams):
  """Set up training.

  Args:
    res: Graph-construction result; res[0] is the loss in TRAIN mode and
      res[1] the predicted ids in INFER mode.
    hparams: Hyperparameters (learning rate, optimizer, TPU, clipping).
  """
  if self.mode == tf.contrib.learn.ModeKeys.INFER:
    self.predicted_ids = res[1]

  params = tf.trainable_variables()

  # Gradients and SGD update operation for training the model.
  # Arrange for the embedding vars to appear at the beginning.
  if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
    loss = res[0]
    self.loss = loss
    mlperf_log.gnmt_print(key=mlperf_log.OPT_LR, value=hparams.learning_rate)
    if hparams.lottery_force_learning_rate is not None:
      # NOTE(review): when a learning rate is forced, warm-up/decay below
      # are skipped so the forced schedule is not overridden — confirm
      # against lottery.get_lr_tensor's contract.
      self.learning_rate = lottery.get_lr_tensor(hparams.values())
    else:
      self.learning_rate = tf.constant(hparams.learning_rate)
      # warm-up
      self.learning_rate = self._get_learning_rate_warmup(hparams)
      # decay
      self.learning_rate = self._get_learning_rate_decay(hparams)

    # Optimizer
    mlperf_log.gnmt_print(key=mlperf_log.OPT_NAME, value=hparams.optimizer)
    if hparams.optimizer == "sgd":
      opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    elif hparams.optimizer == "adam":
      # Adam hyperparameters logged here are TF's defaults for AdamOptimizer.
      mlperf_log.gnmt_print(key=mlperf_log.OPT_HP_ADAM_BETA1, value=0.9)
      mlperf_log.gnmt_print(key=mlperf_log.OPT_HP_ADAM_BETA2, value=0.999)
      mlperf_log.gnmt_print(key=mlperf_log.OPT_HP_ADAM_EPSILON, value=1e-8)
      opt = tf.train.AdamOptimizer(self.learning_rate)
    else:
      raise ValueError("Unknown optimizer type %s" % hparams.optimizer)
    if hparams.use_tpu:
      # Aggregates gradients across TPU shards before applying them.
      opt = tf.contrib.tpu.CrossShardOptimizer(opt)

    # Gradients
    gradients = tf.gradients(
        loss,
        params,
        colocate_gradients_with_ops=hparams.colocate_gradients_with_ops)

    # gradient_clip in this variant returns (clipped_grads, grad_norm).
    clipped_grads, grad_norm = model_helper.gradient_clip(
        gradients, max_gradient_norm=hparams.max_gradient_norm)
    self.update = opt.apply_gradients(
        zip(clipped_grads, params), global_step=self.global_step)

  # Print trainable variables
  utils.print_out("# Trainable variables")
  utils.print_out("Format: <name>, <shape>, <(soft) device placement>")
  for param in params:
    utils.print_out(" %s, %s, %s" % (param.name, str(param.get_shape()),
                                     param.op.device))
def optimizer(hparams, loss, global_step):
  """Builds the training op for `loss` with EMA-tracked variables.

  Args:
    hparams: Hyperparameters; `hparams.opttype` selects the optimizer
      ('SGD' | 'Adam' | 'Nadam' | 'Lazy').
    loss: Scalar loss tensor to minimize.
    global_step: Global step variable, incremented by the update.

  Returns:
    A tuple (train_op, learning_rate) where train_op applies clipped
    gradients and updates the exponential moving averages.

  Raises:
    ValueError: If `hparams.opttype` is not one of the supported values.
  """
  learning_rate = learning_rate_update(hparams, global_step)
  # Floor the schedule so the learning rate never decays below 4e-5.
  learning_rate = tf.maximum(tf.constant(0.00004), learning_rate)
  if hparams.opttype == 'SGD':
    opt = tf.train.GradientDescentOptimizer(learning_rate)
  elif hparams.opttype == 'Adam':
    opt = tf.train.AdamOptimizer(learning_rate)
  elif hparams.opttype == 'Nadam':
    opt = tf.contrib.opt.NadamOptimizer(learning_rate)
  elif hparams.opttype == 'Lazy':
    # The lazy optimizer supplies its own learning-rate tensor.
    opt, learning_rate = get_lazy_opt(hparams)
  else:
    # Fail fast: previously an unknown opttype left `opt` as None and
    # crashed later with an opaque AttributeError on compute_gradients.
    raise ValueError("Unknown opttype %s" % hparams.opttype)

  gradient, vars_param = zip(*opt.compute_gradients(loss))
  clip_gradient, _ = _mh.gradient_clip(gradient)
  apply_gradient_op = opt.apply_gradients(
      zip(clip_gradient, vars_param), global_step=global_step)

  # Track an exponential moving average of all trainable variables; the
  # train op completes only after both the update and the EMA apply run.
  variable_averages = tf.train.ExponentialMovingAverage(0.9999, global_step)
  variables_averages_op = variable_averages.apply(tf.trainable_variables())
  with tf.control_dependencies([apply_gradient_op, variables_averages_op]):
    train_op = tf.no_op(name='train')
  return train_op, learning_rate
def __init__(self, hparams, mode, iterator, source_vocab_table, target_vocab_table,
             reverse_target_vocab_table=None, scope=None, extra_args=None):
  """Create the model.

  Args:
    hparams: Hyperparameter configurations.
    mode: TRAIN | EVAL | INFER
    iterator: Dataset Iterator that feeds data.
    source_vocab_table: Lookup table mapping source words to ids.
    target_vocab_table: Lookup table mapping target words to ids.
    reverse_target_vocab_table: Lookup table mapping ids to target words.
      Only required in INFER mode. Defaults to None.
    scope: scope of the model.
    extra_args: model_helper.ExtraArgs, for passing customizable functions.
  """
  assert isinstance(iterator, iterator_utils.BatchedInput)
  self.iterator = iterator
  self.mode = mode
  self.src_vocab_table = source_vocab_table
  self.tgt_vocab_table = target_vocab_table
  self.src_vocab_size = hparams.src_vocab_size
  self.tgt_vocab_size = hparams.tgt_vocab_size
  self.num_gpus = hparams.num_gpus
  self.time_major = hparams.time_major

  # extra_args: to make it flexible for adding external customizable code
  self.single_cell_fn = None
  if extra_args:
    self.single_cell_fn = extra_args.single_cell_fn

  # Set num layers
  self.num_encoder_layers = hparams.num_encoder_layers
  self.num_decoder_layers = hparams.num_decoder_layers
  assert self.num_encoder_layers
  assert self.num_decoder_layers

  # Set num residual layers
  if hasattr(hparams, "num_residual_layers"):  # compatible common_test_utils
    self.num_encoder_residual_layers = hparams.num_residual_layers
    self.num_decoder_residual_layers = hparams.num_residual_layers
  else:
    self.num_encoder_residual_layers = hparams.num_encoder_residual_layers
    self.num_decoder_residual_layers = hparams.num_decoder_residual_layers

  # Initializer
  initializer = model_helper.get_initializer(hparams.init_op, hparams.random_seed,
                                             hparams.init_weight)
  tf.get_variable_scope().set_initializer(initializer)

  # Embeddings
  self.init_embeddings(hparams, scope)
  self.batch_size = tf.size(self.iterator.source_sequence_length)

  # Projection
  with tf.variable_scope(scope or "build_network"):
    with tf.variable_scope("decoder/output_projection"):
      self.output_layer = layers_core.Dense(hparams.tgt_vocab_size,
                                            use_bias=False,
                                            name="output_projection")

  ## Train graph
  res = self.build_graph(hparams, scope=scope)

  if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
    self.train_loss = res[1]
    self.word_count = tf.reduce_sum(
        self.iterator.source_sequence_length) + tf.reduce_sum(
            self.iterator.target_sequence_length)
  elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
    self.eval_loss = res[1]
  elif self.mode == tf.contrib.learn.ModeKeys.INFER:
    self.infer_logits, _, self.final_context_state, self.sample_id = res
    self.sample_words = reverse_target_vocab_table.lookup(
        tf.to_int64(self.sample_id))

  if self.mode != tf.contrib.learn.ModeKeys.INFER:
    ## Count the number of predicted words for compute ppl.
    self.predict_count = tf.reduce_sum(self.iterator.target_sequence_length)

  self.global_step = tf.Variable(0, trainable=False)
  params = tf.trainable_variables()

  # Gradients and SGD update operation for training the model.
  # Arrage for the embedding vars to appear at the beginning.
  if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
    self.learning_rate = tf.constant(hparams.learning_rate)
    # warm-up
    self.learning_rate = self._get_learning_rate_warmup(hparams)
    # decay
    self.learning_rate = self._get_learning_rate_decay(hparams)

    # Optimizer
    # NOTE(review): no `else` branch — an hparams.optimizer other than
    # "sgd"/"adam" leaves `opt` unbound and fails below with NameError;
    # confirm whether an explicit ValueError is wanted here.
    if hparams.optimizer == "sgd":
      opt = tf.train.GradientDescentOptimizer(self.learning_rate)
      tf.summary.scalar("lr", self.learning_rate)
    elif hparams.optimizer == "adam":
      opt = tf.train.AdamOptimizer(self.learning_rate)

    # Gradients
    gradients = tf.gradients(
        self.train_loss,
        params,
        colocate_gradients_with_ops=hparams.colocate_gradients_with_ops)

    clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip(
        gradients, max_gradient_norm=hparams.max_gradient_norm)
    self.grad_norm = grad_norm

    self.update = opt.apply_gradients(
        zip(clipped_grads, params), global_step=self.global_step)

    # Summary
    self.train_summary = tf.summary.merge([
        tf.summary.scalar("lr", self.learning_rate),
        tf.summary.scalar("train_loss", self.train_loss),
    ] + grad_norm_summary)

  if self.mode == tf.contrib.learn.ModeKeys.INFER:
    self.infer_summary = self._get_infer_summary(hparams)

  # Saver
  self.saver = tf.train.Saver(tf.global_variables(),
                              max_to_keep=hparams.num_keep_ckpts)

  # Print trainable variables
  utils.print_out("# Trainable variables")
  for param in params:
    utils.print_out(" %s, %s, %s" % (param.name, str(param.get_shape()),
                                     param.op.device))
def __init__(self, hparams, mode, iterator, source_vocab_table, target_vocab_table,
             reverse_target_vocab_table=None):
  """Builds the seq2seq model graph for TRAIN / EVAL / PREDICT mode.

  Args:
    hparams: Hyperparameter configurations (vocab sizes, layer counts,
      optimizer and clipping settings, etc.).
    mode: One of tf.estimator.ModeKeys.{TRAIN, EVAL, PREDICT}.
    iterator: iterator_utils.BatchedInput feeding the model.
    source_vocab_table: Lookup table mapping source words to ids.
    target_vocab_table: Lookup table mapping target words to ids.
    reverse_target_vocab_table: Lookup table mapping ids back to target
      words; only required in PREDICT mode.

  Raises:
    ValueError: If hparams.optimizer is neither "sgd" nor "adam"
      (TRAIN mode only).
  """
  assert isinstance(iterator, iterator_utils.BatchedInput)
  self.iterator = iterator
  self.mode = mode
  self.src_vocab_table = source_vocab_table
  self.tgt_vocab_table = target_vocab_table
  self.src_vocab_size = hparams.src_vocab_size
  self.tgt_vocab_size = hparams.tgt_vocab_size
  self.time_major = hparams.time_major
  self.single_cell_fn = None

  # Set num layers
  self.num_encoder_layers = hparams.num_encoder_layers
  self.num_decoder_layers = hparams.num_decoder_layers
  assert self.num_encoder_layers
  assert self.num_decoder_layers

  # Initializer
  initializer = model_helper.get_initializer(hparams.init_op, hparams.random_seed,
                                             hparams.init_weight)
  tf.get_variable_scope().set_initializer(initializer)

  # Embeddings
  self.init_embeddings(hparams)
  self.batch_size = tf.size(self.iterator.source_sequence_length)

  # Projection
  # NOTE: "build_netword" is a historical typo kept on purpose — renaming
  # the scope would rename variables and break existing checkpoints.
  with tf.variable_scope("build_netword"):
    with tf.variable_scope("decoder/output_projection"):
      self.output_layer = layers_core.Dense(hparams.tgt_vocab_size,
                                            use_bias=False,
                                            name="output_projection")

  ## Train graph
  res = self.build_graph(hparams)

  if self.mode == tf.estimator.ModeKeys.TRAIN:
    self.train_loss = res[1]
    self.word_count = tf.reduce_sum(
        self.iterator.source_sequence_length) + tf.reduce_sum(
            self.iterator.target_sequence_length)
  elif self.mode == tf.estimator.ModeKeys.EVAL:
    self.eval_loss = res[1]
  elif self.mode == tf.estimator.ModeKeys.PREDICT:
    self.infer_logits, _, self.final_context_state, self.sample_id = res
    self.sample_words = reverse_target_vocab_table.lookup(
        tf.to_int64(self.sample_id))

  if self.mode != tf.estimator.ModeKeys.PREDICT:
    ## Count the number of predicted words for compute ppl.
    self.predict_count = tf.reduce_sum(self.iterator.target_sequence_length)

  self.global_step = tf.Variable(0, trainable=False)
  params = tf.trainable_variables()

  # Gradients and SGD update operation for training the model.
  # Arrange for the embedding vars to appear at the beginning.
  if self.mode == tf.estimator.ModeKeys.TRAIN:
    self.learning_rate = tf.constant(hparams.learning_rate)
    # warm-up
    self.learning_rate = self._get_learning_rate_warmup(hparams)
    # decay
    self.learning_rate = self._get_learning_rate_decay(hparams)

    # Optimizer
    if hparams.optimizer == "sgd":
      opt = tf.train.GradientDescentOptimizer(self.learning_rate)
      tf.summary.scalar("lr", self.learning_rate)
    elif hparams.optimizer == "adam":
      # BUG FIX: this branch previously compared the undefined name
      # `optimizer` instead of `hparams.optimizer`, raising NameError
      # whenever Adam was selected.
      opt = tf.train.AdamOptimizer(self.learning_rate)
    else:
      # Fail fast: previously an unknown optimizer left `opt` unbound.
      raise ValueError("Unknown optimizer type %s" % hparams.optimizer)

    # Gradients
    gradients = tf.gradients(
        self.train_loss,
        params,
        colocate_gradients_with_ops=hparams.colocate_gradients_with_ops)

    clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip(
        gradients, max_gradient_norm=hparams.max_gradient_norm)
    self.grad_norm = grad_norm

    self.update = opt.apply_gradients(
        zip(clipped_grads, params), global_step=self.global_step)

    # Summary
    self.train_summary = tf.summary.merge([
        tf.summary.scalar("lr", self.learning_rate),
        tf.summary.scalar("train_loss", self.train_loss),
    ] + grad_norm_summary)

  if self.mode == tf.estimator.ModeKeys.PREDICT:
    self.infer_summary = self._get_infer_summary(hparams)

  # Saver
  self.saver = tf.train.Saver(tf.global_variables())

  # Print trainable variables
  utils.print_out("# Trainable variables")
  for param in params:
    utils.print_out(" %s, %s, %s" % (param.name, str(param.get_shape()),
                                     param.op.device))
def __init__(self, hparams, mode, iterator, target_vocab_table,
             reverse_target_vocab_table=None, scope=None, single_cell_fn=None):
  """Create the model.

  Args:
    hparams: Hyperparameter configurations.
    mode: TRAIN | EVAL | INFER
    iterator: Dataset Iterator that feeds data.
    target_vocab_table: Lookup table mapping target words to ids.
    reverse_target_vocab_table: Lookup table mapping ids to target words.
      Only required in INFER mode. Defaults to None.
    scope: scope of the model.
    single_cell_fn: allow for adding customized cell. When not specified,
      we default to model_helper._single_cell
  """
  assert isinstance(iterator, iterator_utils.BatchedInput)
  self.iterator = iterator
  self.mode = mode
  self.tgt_vocab_table = target_vocab_table
  self.tgt_vocab_size = hparams.tgt_vocab_size
  self.num_layers = hparams.num_layers
  self.num_gpus = hparams.num_gpus
  self.time_major = hparams.time_major

  # CNN encoder input: images come in via the iterator's source field.
  self.cnn_input = self.iterator.source
  if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
    # Keep probability is (1 - dropout) during training.
    self.cnn = AlexNet(self.cnn_input, (1 - hparams.dropout),
                       model_helper.get_device_str(hparams.base_gpu))
  else:
    # No dropout (keep probability 1) for EVAL/INFER.
    self.cnn = AlexNet(self.cnn_input, 1,
                       model_helper.get_device_str(hparams.base_gpu))

  # Initializer
  initializer = model_helper.get_initializer(hparams.init_op, hparams.random_seed,
                                             hparams.init_weight)
  tf.get_variable_scope().set_initializer(initializer)

  # Embeddings
  self.init_embeddings(hparams, scope)
  self.batch_size = tf.size(self.iterator.source_sequence_length)

  # Projection
  with tf.variable_scope(scope or "build_network"):
    with tf.variable_scope("decoder/output_projection"):
      self.output_layer = layers_core.Dense(hparams.tgt_vocab_size,
                                            use_bias=False,
                                            name="output_projection")

  # To make it flexible for external code to add other cell types
  # If not specified, we will later use model_helper._single_cell
  self.single_cell_fn = single_cell_fn

  ## Train graph
  res = self.build_graph(hparams, scope=scope)

  if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
    self.train_loss = res[1]
    self.word_count = tf.reduce_sum(self.iterator.target_sequence_length)
  elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
    self.eval_loss = res[1]
  elif self.mode == tf.contrib.learn.ModeKeys.INFER:
    self.infer_logits, _, self.final_context_state, self.sample_id = res
    self.sample_words = reverse_target_vocab_table.lookup(
        tf.to_int64(self.sample_id))

  if self.mode != tf.contrib.learn.ModeKeys.INFER:
    ## Count the number of predicted words for compute ppl.
    self.predict_count = tf.reduce_sum(self.iterator.target_sequence_length)

  ## Learning rate
  print("  start_decay_step=%d, learning_rate=%g, decay_steps %d, decay_factor %g" %
        (hparams.start_decay_step, hparams.learning_rate, hparams.decay_steps,
         hparams.decay_factor))
  self.global_step = tf.Variable(0, trainable=False)

  params = tf.trainable_variables()

  # Gradients and SGD update operation for training the model.
  # Arrage for the embedding vars to appear at the beginning.
  if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
    # NOTE(review): no `else` branch — an optimizer other than "sgd"/"adam"
    # leaves `opt` and `self.learning_rate` unbound (NameError below).
    if hparams.optimizer == "sgd":
      # Constant LR until start_decay_step, then staircase exponential decay.
      self.learning_rate = tf.cond(
          self.global_step < hparams.start_decay_step,
          lambda: tf.constant(hparams.learning_rate),
          lambda: tf.train.exponential_decay(
              hparams.learning_rate,
              (self.global_step - hparams.start_decay_step),
              hparams.decay_steps,
              hparams.decay_factor,
              staircase=True),
          name="learning_rate")
      opt = tf.train.GradientDescentOptimizer(self.learning_rate)
      tf.summary.scalar("lr", self.learning_rate)
    elif hparams.optimizer == "adam":
      assert float(hparams.learning_rate) <= 0.001, "! High Adam learning rate %g" % hparams.learning_rate
      self.learning_rate = tf.constant(hparams.learning_rate)
      opt = tf.train.AdamOptimizer(self.learning_rate)

    gradients = tf.gradients(
        self.train_loss,
        params,
        colocate_gradients_with_ops=hparams.colocate_gradients_with_ops)

    # gradient_clip in this variant returns (clipped, summary) only.
    clipped_gradients, gradient_norm_summary = model_helper.gradient_clip(
        gradients, max_gradient_norm=hparams.max_gradient_norm)

    self.update = opt.apply_gradients(
        zip(clipped_gradients, params), global_step=self.global_step)

    # Summary
    self.train_summary = tf.summary.merge(
        [tf.summary.scalar("lr", self.learning_rate),
         tf.summary.scalar("train_loss", self.train_loss)] +
        gradient_norm_summary)

  if self.mode == tf.contrib.learn.ModeKeys.INFER:
    self.infer_summary = self._get_infer_summary(hparams)

  # Saver
  if hparams.eval_on_fly:
    self.saver = tf.train.Saver(tf.global_variables(),
                                save_relative_paths=True)
  else:
    # Keep every checkpoint when evaluation happens offline.
    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=None,
                                save_relative_paths=True)

  # Print trainable variables
  utils.print_out("# Trainable variables")
  for param in params:
    utils.print_out(" %s, %s, %s" % (param.name, str(param.get_shape()),
                                     param.op.device))
def _deploy_exe_info(self, losses, info):
  """Wires losses and detection outputs into train / infer / eval ops.

  Args:
    losses: Loss tensor(s) from graph construction; used as the train loss
      when `self.trainable`, or the eval loss otherwise.
    info: Dict of graph outputs; keys used here include "class_predicts",
      "class_scores", "bbox_predicts", "bbox_labels" and "rois".
  """
  with tf.name_scope("deploy_exe_info"):
    hp = self.hparams
    if self.trainable:  # Train
      self.train_loss = losses
      params = tf.trainable_variables()
      # Tuning mode uses its own (typically different) base learning rate.
      if hp.tunable:
        learning_rate = hp.tune_rate
      else:
        learning_rate = hp.learning_rate
      self.learning_rate = tf.constant(learning_rate, dtype=tf.float32)
      # Warm-up
      self.learning_rate = self._get_learning_rate_warmup()
      # Decay
      self.learning_rate = self._get_learning_rate_decay()
      # Optimizer
      opt = tf.train.MomentumOptimizer(self.learning_rate, hp.momentum_factor)
      # Gradient
      gradients = tf.gradients(self.train_loss, params)
      # Gradient clip
      clipped_grads, grad_norm_summaries, grad_norm = helper.gradient_clip(
          gradients, max_gradient_norm=hp.max_grad_norm)
      # Gradient norm
      for summary in grad_norm_summaries:
        self._add_to_summaries(summary)
      self.grad_norm = grad_norm
      # Apply update to params
      self.update = opt.apply_gradients(
          zip(clipped_grads, params), global_step=self.global_step)
      # Trainable params summary
      print("# Trainable variables")
      print("Format: <name>, <shape>, <(soft) device placement>")
      for param in params:
        self.histogram.update({param.name: param})
        print(" %s, %s, %s" % (param.name, str(param.get_shape()),
                               param.op.device))
      self.histogram.update(train_loss=self.train_loss,
                            learning_rate=self.learning_rate)
      if hp.forward_rcnn:
        # Map predicted class ids back to category names and render the
        # predicted boxes onto the input images for the summary.
        self.class_predicts = self.reverse_cate_table.lookup(
            tf.to_int64(info["class_predicts"]))
        self.detected_images = tf.py_function(
            misc.draw_boxes_on_image,
            [self.images_data, info["bbox_labels"], info["class_scores"],
             self.class_predicts, self.im_info, hp.pixel_mean],
            Tout=tf.float32)
      self.train_summary = self._config_train_summary()
    elif self.predicable:  # Infer
      # Undo bbox-target normalization: delta = delta * stddev + mean,
      # tiled per class.
      stddevs = tf.tile(tf.constant(hp.bbox_norm_stddevs),
                        multiples=hp.num_class)
      means = tf.tile(tf.constant(hp.bbox_norm_means),
                      multiples=hp.num_class)
      deltas = info["bbox_predicts"]
      # Restore bbox predicts
      deltas = tf.add(tf.multiply(deltas, stddevs), means)
      info["bbox_predicts"] = deltas
      rois = info["rois"]
      self.class_scores = info["class_scores"]
      self.class_predicts = self.reverse_cate_table.lookup(
          tf.to_int64(info["class_predicts"]))
      # Get predicted ground-truth bbox
      self.bboxes = proposal_util.bboxes_regression(rois, deltas)
      self.detected_images = tf.py_function(
          misc.draw_boxes_on_image,
          [self.images_data, self.bboxes, self.class_scores,
           self.class_predicts, self.im_info, hp.pixel_mean],
          Tout=tf.float32)
      self.infer_summary = self._config_infer_summary()
    else:  # Eval
      # NOTE(review): unlike the Infer branch, deltas are used here
      # without de-normalization — confirm whether that is intentional.
      rois = info["rois"]
      deltas = info["bbox_predicts"]
      self.eval_loss = losses
      bboxes = proposal_util.bboxes_regression(rois, deltas)
      self.accuracy = misc.mean_avg_overlap(bboxes, self.bbox_labels)
      self.eval_summary = self._config_eval_summary()
def __init__(self, hparams, mode, iterator, handle, vocab_table,
             reverse_vocab_table=None, scope=None, extra_args=None):
  """Builds the supervised + self-play dialogue model graph.

  Args:
    hparams: Hyperparameter configurations (vocab size, layer counts,
      three learning rates, self-play settings, etc.).
    mode: A tf.contrib.learn.ModeKeys value or one of
      dialogue_utils.self_play_modes.
    iterator: iterator_utils.BatchedInput feeding the model.
    handle: Iterator handle tensor (stored; used elsewhere in the class).
    vocab_table: Lookup table mapping words to ids.
    reverse_vocab_table: Lookup table mapping ids back to words.
    scope: Variable scope for the model.
    extra_args: Optional; may supply a custom single_cell_fn.
  """
  assert isinstance(iterator, iterator_utils.BatchedInput)
  self.iterator = iterator
  self.handle = handle
  self.mode = mode
  self.vocab_table = vocab_table
  self.vocab_size = hparams.vocab_size
  self.num_layers = hparams.num_layers
  self.num_gpus = hparams.num_gpus
  self.hparams = hparams
  self.single_cell_fn = None
  self.global_gpu_num = 0
  if extra_args:
    self.single_cell_fn = extra_args.single_cell_fn

  # Initializer
  initializer = model_helper.get_initializer(hparams.init_op, hparams.random_seed,
                                             hparams.init_weight)
  tf.get_variable_scope().set_initializer(initializer)

  # Embeddings
  self.init_embeddings(hparams, scope)
  self.batch_size = tf.shape(self.iterator.source)[0]

  # Projection: separate output projections for the two agents and the
  # action decoder, plus four value-network projection layers.
  with tf.variable_scope(scope or "build_network"):
    with tf.variable_scope("decoder/output_projection"):
      self.output_layer1 = layers_core.Dense(
          hparams.vocab_size, use_bias=False, name="output_projection_1")
      self.output_layer2 = layers_core.Dense(
          hparams.vocab_size, use_bias=False, name="output_projection_2")
      self.output_layer_action = layers_core.Dense(
          hparams.vocab_size, use_bias=False, name="output_projection_action")
      self.vn_project11 = layers_core.Dense(
          hparams.unit_value_network, use_bias=False, name="vn_project_11")
      self.vn_project12 = layers_core.Dense(
          hparams.unit_value_network, use_bias=False, name="vn_project_12")
      self.vn_project21 = layers_core.Dense(
          hparams.unit_value_network, use_bias=False, name="vn_project_21")
      self.vn_project22 = layers_core.Dense(
          hparams.unit_value_network, use_bias=False, name="vn_project_22")

  ## Train graph
  sl_loss, sl_loss_arr, rl_loss_arr, sample_id_arr_train, sample_id_arr_infer = build_graph(
      self, hparams, scope=scope)

  if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
    self.train_loss = sl_loss
    self.all_train_loss = sl_loss_arr
    self.word_count = tf.reduce_sum(self.iterator.dialogue_len)
    self.sample_ids_arr = sample_id_arr_train
    self.sample_words_arr1 = []
    self.sample_words_arr2 = []
    source = self.iterator.source
    for i in range(len(self.sample_ids_arr)):
      element_infer = self.sample_ids_arr[i]
      element_src = source[0]  # element_src=0
      src = reverse_vocab_table.lookup(tf.to_int64(element_src))
      infer = reverse_vocab_table.lookup(
          tf.to_int64(element_infer)
      )[0]  # src can only get the first one so I only get the first inference
      if i == 0:
        self.sample_words_arr1.append((tf.constant(i), src, infer))
      elif i == 1:
        self.sample_words_arr2.append((tf.constant(i), src, infer))
    self.vl1, self.vl2, self.pl1, self.pl2, self.eq11, self.eq12, self.eq2 = rl_loss_arr  # reinforcement updates
  elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
    self.eval_loss = sl_loss
    self.all_eval_loss = sl_loss_arr
  elif self.mode == tf.contrib.learn.ModeKeys.INFER:
    self.sample_ids_arr = sample_id_arr_infer
    self.sample_words_arr = []
    self.source = reverse_vocab_table.lookup(tf.to_int64(iterator.source))
    for element in self.sample_ids_arr:
      self.sample_words_arr.append(
          reverse_vocab_table.lookup(tf.to_int64(element)))
  elif self.mode in dialogue_utils.self_play_modes:
    #### self play
    self.train_loss = sl_loss
    self.all_train_loss = sl_loss_arr
    self.selfplay_agent_1_utt = reverse_vocab_table.lookup(
        tf.to_int64(sample_id_arr_infer[0]))
    self.selfplay_agent_2_utt = reverse_vocab_table.lookup(
        tf.to_int64(sample_id_arr_infer[1]))
    self.selfplay_action = reverse_vocab_table.lookup(
        tf.to_int64(sample_id_arr_infer[2]))
    if self.mode == dialogue_utils.mode_self_play_mutable:
      self.vl1, self.vl2, self.pl1, self.pl2, self.eq11, self.eq12, self.eq2 = rl_loss_arr  # reinforcement updates

  if self.mode != tf.contrib.learn.ModeKeys.INFER:
    ## Count the number of predicted words for compute ppl.
    self.predict_count = tf.reduce_sum(self.iterator.dialogue_len)

  ## Learning rate
  warmup_steps = hparams.learning_rate_warmup_steps
  warmup_factor = hparams.learning_rate_warmup_factor
  print(" start_decay_step=%d, learning_rate=%g, decay_steps %d, "
        "decay_factor %g, learning_rate_warmup_steps=%d, "
        "learning_rate_warmup_factor=%g, starting_learning_rate=%g" %
        (hparams.start_decay_step, hparams.learning_rate, hparams.decay_steps,
         hparams.decay_factor, warmup_steps, warmup_factor,
         (hparams.learning_rate * warmup_factor**warmup_steps)))
  self.global_step = tf.Variable(0, trainable=False)

  params = tf.trainable_variables()

  # Gradients and SGD update operation for training the model.
  # Arrage for the embedding vars to appear at the beginning.
  if self.mode == tf.contrib.learn.ModeKeys.TRAIN or self.mode == dialogue_utils.mode_self_play_mutable:
    self.learning_rate = tf.constant(hparams.learning_rate)
    # Inverse-decay warm-up: scale the LR down before warmup_steps.
    inv_decay = warmup_factor**(tf.to_float(warmup_steps - self.global_step))
    self.learning_rate = tf.cond(
        self.global_step < hparams.learning_rate_warmup_steps,
        lambda: inv_decay * self.learning_rate,
        lambda: self.learning_rate,
        name="learning_rate_decay_warump_cond")

    if hparams.optimizer == "sgd":
      self.learning_rate = tf.cond(
          self.global_step < hparams.start_decay_step,
          lambda: self.learning_rate,
          lambda: tf.train.exponential_decay(self.learning_rate, (
              self.global_step - hparams.start_decay_step),
              hparams.decay_steps, hparams.decay_factor, staircase=True),
          name="sgd_learning_rate_supervised")
      opt = tf.train.GradientDescentOptimizer(self.learning_rate,
                                              name="SGD_supervised")
      tf.summary.scalar("lr", self.learning_rate)
    elif hparams.optimizer == "adam":
      assert float(hparams.learning_rate) <= 0.001, "! High Adam learning rate %g" % hparams.learning_rate
      opt = tf.train.AdamOptimizer(self.learning_rate, name="Adam_supervised")

    gradients = tf.gradients(
        self.train_loss,
        params,
        colocate_gradients_with_ops=hparams.colocate_gradients_with_ops,
        name="gradients_adam")
    clipped_gradients, gradient_norm_summary = model_helper.gradient_clip(
        gradients, max_gradient_norm=hparams.max_gradient_norm)
    self.update = opt.apply_gradients(
        zip(clipped_gradients, params),
        global_step=self.global_step,
        name="adam_apply_gradients")
    # Summary
    self.train_summary = tf.summary.merge([
        tf.summary.scalar("lr", self.learning_rate),
        tf.summary.scalar("train_loss", self.train_loss),
    ] + gradient_norm_summary)

  # second part of the learning rate
  if self.mode == tf.contrib.learn.ModeKeys.TRAIN or self.mode == dialogue_utils.mode_self_play_mutable:
    # learning_rate2 drives the value networks, learning_rate3 the policy
    # updates (see the generate_optimizer calls below).
    self.learning_rate2 = tf.constant(hparams.learning_rate2)
    self.learning_rate3 = tf.constant(hparams.learning_rate3)
    if hparams.optimizer == "sgd":
      self.learning_rate2 = tf.cond(
          self.global_step < hparams.start_decay_step,
          lambda: self.learning_rate2,
          lambda: tf.train.exponential_decay(self.learning_rate2, (
              self.global_step - hparams.start_decay_step),
              hparams.decay_steps, hparams.decay_factor, staircase=True),
          name="sgd_learning_rate_supervised2")
      self.learning_rate3 = tf.cond(
          self.global_step < hparams.start_decay_step,
          lambda: self.learning_rate3,
          lambda: tf.train.exponential_decay(self.learning_rate3, (
              self.global_step - hparams.start_decay_step),
              hparams.decay_steps, hparams.decay_factor, staircase=True),
          name="sgd_learning_rate_supervised3")
      tf.summary.scalar("self_play_lr", self.learning_rate)
    elif hparams.optimizer == "adam":
      assert float(hparams.learning_rate2) <= 0.001, "! High Adam learning rate2 %g" % hparams.learning_rate2
      assert float(hparams.learning_rate3) <= 0.001, "! High Adam learning rate3 %g" % hparams.learning_rate3
    # params=[]
    print("params=")
    for element in params:
      print(element.name)
    # Partition the trainable variables by sub-network name prefix.
    val1_params = self.patial_params(params, ["dynamic_seq2seq/value_network1"])
    val2_params = self.patial_params(params, ["dynamic_seq2seq/value_network2"])
    embedding_params = self.patial_params(params, ["embeddings"])
    main_dec_enc_params1 = self.patial_params(
        params, ["dynamic_seq2seq/encoder1/", "dynamic_seq2seq/decoder1/"])
    main_dec_enc_params2 = self.patial_params(
        params, ["dynamic_seq2seq/encoder2/", "dynamic_seq2seq/decoder2/"])
    action_params = self.patial_params(params, ["dynamic_seq2seq/decoder_action"])
    encoder_kb_params = self.patial_params(params, ["dynamic_seq2seq/encoder2_kb"])
    encoder_intent_params = self.patial_params(params, ["dynamic_seq2seq/encoder1_intent"])
    print("val1_params", "\n".join(map(lambda a: a.name, val1_params)))
    print("val2_params", "\n".join(map(lambda a: a.name, val2_params)))
    print("embedding_params", "\n".join(map(lambda a: a.name, embedding_params)))
    print("main_dec_enc_params1", "\n".join(map(lambda a: a.name, main_dec_enc_params1)))
    print("main_dec_enc_params2", "\n".join(map(lambda a: a.name, main_dec_enc_params2)))
    print("action_params", "\n".join(map(lambda a: a.name, action_params)))
    print("encoder_kb_params", "\n".join(map(lambda a: a.name, encoder_kb_params)))
    print("encoder_intent_params", "\n".join(map(lambda a: a.name, encoder_intent_params)))
    # Value-network optimizers (learning_rate2 / max_gradient_norm2).
    self.optimizer_vl1, self.v1_sum = self.generate_optimizer(
        self.vl1, params, "vl1", self.learning_rate2,
        self.hparams.max_gradient_norm2)
    self.optimizer_vl2, self.v2_sum = self.generate_optimizer(
        self.vl2, params, "vl2", self.learning_rate2,
        self.hparams.max_gradient_norm2)
    # NOTE(review): rl_param1/rl_param2 are selected below but never passed
    # to generate_optimizer (which receives the full `params` list) —
    # confirm whether the restricted variable sets were meant to be used.
    if hparams.self_play_variable_method == 0:
      rl_param1, rl_param2 = encoder_intent_params, encoder_kb_params + action_params
    elif hparams.self_play_variable_method == 1:
      rl_param1, rl_param2 = main_dec_enc_params1, main_dec_enc_params2
    elif hparams.self_play_variable_method == 2:
      rl_param1, rl_param2 = main_dec_enc_params1 + encoder_intent_params, main_dec_enc_params2 + encoder_kb_params + action_params
    elif hparams.self_play_variable_method == 3:
      rl_param1, rl_param2 = [main_dec_enc_params1[0]] + encoder_intent_params, [main_dec_enc_params2[0]] + encoder_kb_params
    elif hparams.self_play_variable_method == 4:
      rl_param1, rl_param2 = [main_dec_enc_params1[0]], [main_dec_enc_params2[0]]
    elif hparams.self_play_variable_method == 5:
      rl_param1, rl_param2 = params, params
    # Policy optimizers (learning_rate3 / max_gradient_norm3).
    self.optimizer_pl1, self.p1_sum = self.generate_optimizer(
        self.pl1, params, "pl1", self.learning_rate3,
        self.hparams.max_gradient_norm3)
    self.optimizer_pl2, self.p2_sum = self.generate_optimizer(
        self.pl2, params, "pl2", self.learning_rate3,
        self.hparams.max_gradient_norm3)
    print("self.learning", self.learning_rate, self.learning_rate2,
          self.learning_rate3)

  ################################
  ### supervised learning######'
  ###########################
  # Saver
  self.saver = tf.train.Saver(tf.global_variables())

  # Print trainable variables
  utils.print_out("# Trainable variables")
  for param in params:
    utils.print_out(" %s, %s, %s" % (param.name, str(param.get_shape()),
                                     param.op.device))
def _set_train_or_infer(self, res, hparams):
    """Set up per-mode handles (losses, counts, learning rate, and optionally bprop).

    Args:
      res: tuple from build_graph; res[1] is the loss in every mode,
        res[0] the logits and res[2] the sample ids in INFER mode.
      hparams: hyperparameter object; reads learning_rate, use_dist_strategy,
        optimizer, colocate_gradients_with_ops, max_gradient_norm.
    """
    loss = res[1]
    if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
        self.train_loss = loss
        # Total source+target tokens in the batch (used for wps-style stats).
        self.word_count = tf.reduce_sum(
            self.features["source_sequence_length"]) + tf.reduce_sum(
                self.features["target_sequence_length"])
    elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
        self.eval_loss = loss
    elif self.mode == tf.contrib.learn.ModeKeys.INFER:
        self.infer_logits = res[0]
        self.infer_loss = loss
        self.sample_id = res[2]

    if self.mode != tf.contrib.learn.ModeKeys.INFER:
        ## Count the number of predicted words for compute ppl.
        self.predict_count = tf.reduce_sum(
            self.features["target_sequence_length"])

    # Gradients and SGD update operation for training the model.
    # Arrange for the embedding vars to appear at the beginning.
    # Only build bprop if running on GPU and using dist_strategy, in which
    # case learning rate, grads and train_op are created in estimator model
    # function.
    with tf.name_scope("learning_rate"):
        self.learning_rate = tf.constant(hparams.learning_rate)
        # warm-up
        self.learning_rate = self._get_learning_rate_warmup(hparams)
        # decay
        self.learning_rate = self._get_learning_rate_decay(hparams)

    if (hparams.use_dist_strategy and
            self.mode == tf.contrib.learn.ModeKeys.TRAIN):
        # Gradients
        params = tf.trainable_variables()
        # Print trainable variables
        utils.print_out("# Trainable variables")
        utils.print_out(
            "Format: <name>, <shape>, <dtype>, <(soft) device placement>")
        for param in params:
            utils.print_out(
                " %s, %s, %s, %s" % (param.name, str(
                    param.get_shape()), param.dtype.name, param.op.device))
        # 4 bytes per element (assumes fp32 storage) / 2**30 bytes per GB.
        # Variables with partially-unknown shapes are skipped in the count.
        utils.print_out("Total params size: %.2f GB" % (4. * np.sum([
            p.get_shape().num_elements()
            for p in params
            if p.shape.is_fully_defined()
        ]) / 2**30))
        # Optimizer
        if hparams.optimizer == "sgd":
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        elif hparams.optimizer == "adam":
            opt = tf.train.AdamOptimizer(self.learning_rate)
        else:
            raise ValueError("Unknown optimizer type %s" % hparams.optimizer)
        assert opt is not None

        grads_and_vars = opt.compute_gradients(
            self.train_loss,
            params,
            colocate_gradients_with_ops=hparams.colocate_gradients_with_ops)
        gradients = [x for (x, _) in grads_and_vars]

        # NOTE: this gradient_clip variant returns (grads, norm) — other call
        # sites in this file use a 3-tuple variant that also returns summaries.
        clipped_grads, grad_norm = model_helper.gradient_clip(
            gradients, max_gradient_norm=hparams.max_gradient_norm)
        self.grad_norm = grad_norm
        self.params = params
        self.grads = clipped_grads

        self.update = opt.apply_gradients(list(zip(clipped_grads, params)),
                                          global_step=self.global_step)
    else:
        # Bprop is built elsewhere (estimator model_fn) in this configuration.
        self.grad_norm = None
        self.update = None
        self.params = None
        self.grads = None
def build_graph(self, features, labels, mode, params):
    """Build the INFER or TRAIN graph and return estimator-style handles.

    Returns a 6-tuple (loss, vars, grads, predictions, train_op, scaffold);
    unused slots are None depending on mode.  `labels` and `params` are
    unused (data comes through `features`; hparams through `self.hparams`).
    """
    del labels, params
    misc_utils.print_out("Running fast mode_fn")
    hparams = self.hparams
    # Create global_step
    tf.train.get_or_create_global_step()
    if mode == tf.contrib.learn.ModeKeys.INFER:
        # Doing inference only on one GPU
        inf_hparams = tf.contrib.training.HParams(**hparams.values())
        inf_hparams.set_hparam("num_gpus", 1)
        # Inference is done in fp32 and in the same way as that of dist_strategy.
        inf_hparams.set_hparam("use_fp16", False)
        misc_utils.print_out("inference hparmas:")
        misc_utils.print_hparams(inf_hparams)
        # Create variable_mgr
        var_mgr = self._get_variable_mgr(inf_hparams)
        with mixed_precision_scope(), tf.device("gpu:0"), tf.name_scope(
                "tower_0"), var_mgr.create_outer_variable_scope(0):
            model = gnmt_model.GNMTModel(inf_hparams, mode=mode, features=features)
            sample_ids = model.sample_id
            # Map predicted ids back to target-vocabulary strings.
            reverse_target_vocab_table = lookup_ops.index_to_string_table_from_file(
                inf_hparams.tgt_vocab_file, default_value=vocab_utils.UNK)
            sample_words = reverse_target_vocab_table.lookup(
                tf.to_int64(sample_ids))
            # make sure outputs is of shape [batch_size, time] or [beam_width,
            # batch_size, time] when using beam search.
            if inf_hparams.time_major:
                sample_words = tf.transpose(sample_words)
            elif sample_words.shape.ndims == 3:
                # beam search output in [batch_size, time, beam_width] shape.
                sample_words = tf.transpose(sample_words, [2, 0, 1])
            predictions = {"predictions": sample_words}
            # return loss, vars, grads, predictions, train_op, scaffold
            return None, None, None, predictions, None, None
    elif mode == tf.contrib.learn.ModeKeys.TRAIN:
        num_towers = hparams.num_gpus
        # Shard inputs
        tower_features = self._shard_inputs(features, num_towers)
        # Create loss scale vars if necessary
        loss_scale, loss_scale_normal_steps = self._create_loss_scale_vars()
        # Create variable_mgr
        var_mgr = self._get_variable_mgr(hparams)
        # Build per-tower fprop and bprop
        devices = var_mgr.get_devices()
        tower_gradvars = []
        tower_scopes = []
        var_scopes = []
        train_losses = []
        learning_rates = []
        batch_sizes = []
        opts = []

        def fprop_and_bprop(tid):
            """Build one tower's forward pass and per-tower gradients.

            Returns a flat list [train_loss, learning_rate, batch_size,
            grad_0, grad_1, ...] so it can pass through maybe_xla_compile;
            appends the tower's optimizer to `opts` as a side effect.
            """
            model = gnmt_model.GNMTModel(hparams, mode=mode,
                                         features=tower_features[tid])
            # sync training.
            assert model.learning_rate is not None
            # The following handles shouldn't be built in when doing manual
            assert model.grad_norm is None
            assert model.update is None
            tower_loss = model.train_loss
            # Only check loss numerics if in fp16
            if hparams.use_fp16 and hparams.check_tower_loss_numerics:
                tower_loss = tf.check_numerics(
                    tower_loss, "tower_%d has Inf/NaN loss" % tid)
            # Cast to fp32, otherwise would easily overflow.
            tower_loss = tf.to_float(tower_loss)
            var_params, grads, opt = self._compute_tower_grads(
                tower_loss,
                var_mgr.trainable_variables_on_device(tid, tid),
                model.learning_rate,
                use_fp16=hparams.use_fp16,
                loss_scale=loss_scale,
                colocate_gradients_with_ops=hparams.colocate_gradients_with_ops)
            self._print_varinfo(var_params, tid)
            res = [model.train_loss, model.learning_rate, model.batch_size]
            res.extend(grads)
            opts.append(opt)
            return res

        def unpack_fprop_and_bprop_output(output):
            # Inverse of the flat packing produced by fprop_and_bprop above.
            train_loss = output[0]
            learning_rate = output[1]
            batch_size = output[2]
            grads = output[3:]
            return train_loss, learning_rate, batch_size, grads

        with mixed_precision_scope():
            for tid in range(num_towers):
                # Round-robin towers over available devices.
                with tf.device(devices[tid % len(devices)]), tf.name_scope(
                        "tower_%s" % tid) as scope:
                    tower_scopes.append(scope)
                    with var_mgr.create_outer_variable_scope(tid) as var_scope:
                        var_scopes.append(var_scope)
                        outputs = maybe_xla_compile(hparams, fprop_and_bprop, tid)
                        (train_loss, learning_rate, batch_size,
                         grads) = unpack_fprop_and_bprop_output(outputs)
                        train_losses.append(train_loss)
                        learning_rates.append(learning_rate)
                        batch_sizes.append(batch_size)
                        var_params = var_mgr.trainable_variables_on_device(
                            tid, tid)
                        tower_gradvars.append(list(zip(grads, var_params)))
        # Add summaries
        if hparams.show_metrics:
            tf.summary.scalar("learning_rate", learning_rates[0])
            if loss_scale:
                tf.summary.scalar("loss_scale", loss_scale)
                if hparams.enable_auto_loss_scale:
                    tf.summary.scalar("loss_scale_normal_steps",
                                      loss_scale_normal_steps)
        misc_utils.print_out("Finish building fprop and per-tower bprop.")
        # Aggregate gradients
        # The following compute the aggregated grads for each tower, stored in
        # opaque grad_states structure.
        apply_grads_devices, grad_states = var_mgr.preprocess_device_grads(
            tower_gradvars)
        master_grads = None
        master_params = None
        update_ops = []
        for i, device in enumerate(apply_grads_devices):
            with tf.device(device), tf.name_scope(tower_scopes[i]):
                # Get per-tower grads.
                with tf.name_scope("get_gradients_to_apply"):
                    avg_gradvars = var_mgr.get_gradients_to_apply(i, grad_states)
                    avg_grads = [gv[0] for gv in avg_gradvars]
                # gradients post-processing
                with tf.name_scope("clip_gradients"):
                    if hparams.clip_grads:
                        clipped_grads, grad_norm = model_helper.gradient_clip(
                            avg_grads,
                            max_gradient_norm=hparams.max_gradient_norm)
                        # summary the grad on the 1st tower
                        if i == 0 and hparams.show_metrics:
                            tf.summary.scalar("grad_norm", grad_norm)
                            tf.summary.scalar("clipped_grad_norm",
                                              tf.global_norm(clipped_grads))
                    else:
                        clipped_grads = avg_grads
                    if i == 0:
                        master_grads = clipped_grads
                # Build apply-gradients ops
                clipped_gradvars = list(
                    zip(clipped_grads, [gv[1] for gv in avg_gradvars]))
                if i == 0:
                    master_params = [gv[1] for gv in avg_gradvars]
                with tf.name_scope("append_gradient_ops"):
                    loss_scale_params = variable_mgr_util.AutoLossScaleParams(
                        enable_auto_loss_scale=hparams.enable_auto_loss_scale,
                        loss_scale=loss_scale,
                        loss_scale_normal_steps=loss_scale_normal_steps,
                        inc_loss_scale_every_n=hparams.fp16_inc_loss_scale_every_n,
                        is_chief=True)
                    opt = opts[i]
                    var_mgr.append_apply_gradients_ops(grad_states, opt,
                                                       clipped_gradvars,
                                                       update_ops,
                                                       loss_scale_params)
        misc_utils.print_out("Finish building grad aggregation.")
        assert len(update_ops) == num_towers
        train_op = tf.group(update_ops)
        with tf.control_dependencies([train_op]):
            # Bump global_step only after all towers' updates have run.
            global_step = tf.train.get_global_step()
            train_op = global_step.assign_add(1)
        # Compute loss on the first gpu
        # TODO(jamesqin): optimize it?
        with tf.device("gpu:0"):
            loss = misc_utils.weighted_avg(train_losses, batch_sizes)
        # Create local init_ops
        # TODO(jamesqin): handle resource variables!
        # At present if not using mirror strategy, not using resource vars.
        local_init_ops = []
        local_init_op = tf.local_variables_initializer()
        with tf.control_dependencies([local_init_op]):
            local_init_ops.append(var_mgr.get_post_init_ops())
        local_init_ops.extend([local_init_op, tf.tables_initializer()])
        saveable_vars = var_mgr.savable_variables()
        # Add saveables for cudnn vars in master tower.
        saveable_objects = tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS)
        saveable_objects = [x for x in saveable_objects if "v0" in x.name]
        misc_utils.print_out("Saveable vars(%d): " % len(saveable_vars))
        for mv in saveable_vars:
            misc_utils.print_out(mv.name)
        misc_utils.print_out("All global trainable vars(%d): " %
                             len(tf.trainable_variables()))
        for tv in tf.trainable_variables():
            misc_utils.print_out(tv.name)
        misc_utils.print_out("All global vars(%d): " %
                             len(tf.global_variables()))
        for gv in tf.global_variables():
            misc_utils.print_out(gv.name)
        misc_utils.print_out("master backproped params(%d): " %
                             len(master_params))
        for mp in master_params:
            misc_utils.print_out(mp.name)
        # Note the cudnn vars are skipped the init check. :(
        scaffold = tf.train.Scaffold(
            ready_op=tf.report_uninitialized_variables(saveable_vars),
            ready_for_local_init_op=tf.report_uninitialized_variables(
                saveable_vars),
            local_init_op=tf.group(*local_init_ops),
            saver=tf.train.Saver(saveable_vars + saveable_objects,
                                 save_relative_paths=True))
        misc_utils.print_out("Finish building model_fn")
        # return loss, vars, grads, predictions, train_op, scaffold
        return loss, master_params, master_grads, None, train_op, scaffold
def __init__(self, hparams, mode, iterator, vocab_table, scope=None,
             extra_args=None):
    """Create the model.

    Args:
      hparams: Hyperparameter configurations.
      mode: TRAIN | EVAL | INFER
      iterator: Dataset Iterator that feeds data.
      vocab_table: Lookup table mapping source words to ids.
      scope: scope of the model.
      extra_args: model_helper.ExtraArgs, for passing customizable functions.

    Raises:
      ValueError: in TRAIN mode, if hparams.optimizer is neither "sgd" nor
        "adam".  (Previously an unknown optimizer crashed later with a
        NameError on `opt`; other model classes in this file already raise
        ValueError for this case.)
    """
    self.iterator = iterator
    self.mode = mode
    self.vocab_table = vocab_table
    #self.vocab_size = len(vocab_table)
    self.time_major = hparams.time_major
    self.single_cell_fn = None
    # Initializer
    initializer = model_helper.get_initializer(hparams.init_op,
                                               hparams.random_seed,
                                               hparams.init_weight)
    tf.get_variable_scope().set_initializer(initializer)
    # Embeddings
    self.init_embeddings(hparams, scope)
    self.batch_size = tf.size(self.iterator.source_sequence_length)
    # Projection: 2-unit sigmoid output layer (binary decision head).
    with tf.variable_scope(scope or "build_network"):
        with tf.variable_scope("decoder/output_projection"):
            self.output_layer = layers_core.Dense(2,
                                                  use_bias=False,
                                                  activation=tf.nn.sigmoid,
                                                  name="output_projection")
    ## Train graph
    loss, accuracy = self.build_graph(hparams, scope=scope)
    if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
        self.train_loss = loss
        self.train_accuracy = accuracy
    elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
        self.eval_loss = loss
    self.global_step = tf.Variable(0, trainable=False)
    params = tf.trainable_variables()
    # Gradients and SGD update operation for training the model.
    # Arrange for the embedding vars to appear at the beginning.
    if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
        self.learning_rate = tf.constant(hparams.learning_rate)
        # warm-up
        self.learning_rate = self._get_learning_rate_warmup(hparams)
        # decay
        self.learning_rate = self._get_learning_rate_decay(hparams)
        # Optimizer
        if hparams.optimizer == "sgd":
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            tf.summary.scalar("lr", self.learning_rate)
        elif hparams.optimizer == "adam":
            opt = tf.train.AdamOptimizer(self.learning_rate)
        else:
            # Fail fast with a clear message instead of NameError on `opt`.
            raise ValueError("Unknown optimizer type %s" % hparams.optimizer)
        # Gradients
        gradients = tf.gradients(self.train_loss,
                                 params,
                                 colocate_gradients_with_ops=True)
        clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip(
            gradients, max_gradient_norm=hparams.max_gradient_norm)
        self.grad_norm = grad_norm
        self.update = opt.apply_gradients(zip(clipped_grads, params),
                                          global_step=self.global_step)
        # Summary
        self.train_summary = tf.summary.merge([
            tf.summary.scalar("lr", self.learning_rate),
            tf.summary.scalar("train_loss", self.train_loss),
        ] + grad_norm_summary)
    if self.mode == tf.contrib.learn.ModeKeys.INFER:
        self.infer_summary = self._get_infer_summary(hparams)
    # Saver
    self.saver = tf.train.Saver(tf.global_variables(),
                                max_to_keep=hparams.num_keep_ckpts)
    # Print trainable variables
    print("# Trainable variables")
    for param in params:
        print(" %s, %s, %s" %
              (param.name, str(param.get_shape()), param.op.device))
def __init__(self, hparams, mode, iterator, input_vocab_table=None):
    """Build a CRF sequence-tagging model graph.

    Args:
      hparams: Hyperparameter configurations (n_classes, vocab_size,
        init_weight, random_seed, embedding/optimizer settings, ...).
      mode: TRAIN | EVAL | INFER (tf.contrib.learn.ModeKeys).
      iterator: data iterator providing input, target, input_sequence_length
        and batch_size.
      input_vocab_table: Lookup table mapping input words to ids.

    Raises:
      ValueError: in TRAIN mode, if hparams.optimizer is neither "sgd" nor
        "adam".  (Previously an unknown optimizer crashed later with a
        NameError on `opt`.)
    """
    self.n_classes = hparams.n_classes
    self.vocab_size = hparams.vocab_size
    self.input_sequence_length = iterator.input_sequence_length
    self.mode = mode
    self.inputs = iterator.input
    self.targets = iterator.target
    self.input_vocab_table = input_vocab_table
    self.batch_size = iterator.batch_size
    # Initializer for all model parameters.
    initializer = tf.random_uniform_initializer(-hparams.init_weight,
                                                hparams.init_weight,
                                                seed=hparams.random_seed)
    tf.get_variable_scope().set_initializer(initializer)
    # Create embedding layer.
    self.input_embedding, self.input_emb_init, self.input_emb_placeholder = \
        model_helper.create_embeddings(
            vocab_size=self.vocab_size,
            emb_size=hparams.input_emb_size,
            emb_trainable=hparams.input_emb_trainable,
            emb_pretrain=hparams.input_emb_pretrain)
    # Computing the log likelihood using tf.crf function
    log_likelihood, transition_params, logits = self.build_graph(hparams)
    self.transition_params = transition_params
    self.predictions = {
        "probabilities": self.compute_probabilities(logits),
        "labels": tf.cast(self.compute_labels(logits), tf.int32)
    }
    self.accuracy = self.compute_accuracy(self.predictions["labels"])
    # Computing the training loss: CRF loss is the negative log likelihood.
    if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
        self.train_loss = tf.reduce_mean(-log_likelihood)
    elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
        self.eval_loss = tf.reduce_mean(-log_likelihood)
    self.logits = logits
    ## Learning rate
    print(" start_decay_step=%d, learning_rate=%g, decay_steps %d,"
          " decay_factor %g" %
          (hparams.start_decay_step, hparams.learning_rate,
           hparams.decay_steps, hparams.decay_factor))
    self.global_step = tf.Variable(0, trainable=False)
    params = tf.trainable_variables()
    # Gradients and sgd update operation for model training.
    if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
        # Optimizer
        if hparams.optimizer == "sgd":
            # perform SGD with a learning rate with exponential decay
            self.learning_rate = tf.cond(
                self.global_step < hparams.start_decay_step,
                lambda: tf.constant(hparams.learning_rate),
                lambda: tf.train.exponential_decay(
                    hparams.learning_rate,
                    (self.global_step - hparams.start_decay_step),
                    hparams.decay_steps,
                    hparams.decay_factor,
                    staircase=True),
                name="learning_rate")
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            tf.summary.scalar("lr", self.learning_rate)
        elif hparams.optimizer == "adam":
            self.learning_rate = tf.constant(hparams.learning_rate)
            opt = tf.train.AdamOptimizer(self.learning_rate)
        else:
            # Fail fast with a clear message instead of NameError on `opt`.
            raise ValueError("Unknown optimizer type %s" % hparams.optimizer)
        # compute the gradients of train_loss w.r.t to the model trainable parameters.
        # if colocate_gradients_with_ops is true, the gradients will be computed in the same gpu/cpu device with the
        # original (forward-pass) operator
        gradients = tf.gradients(self.train_loss,
                                 params,
                                 colocate_gradients_with_ops=hparams.
                                 colocate_gradients_with_ops)
        # clip gradients below a threshold to avoid explosion
        clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip(
            gradients, max_gradient_norm=hparams.max_gradient_norm)
        self.grad_norm = grad_norm
        # ask the optimizer to apply the processed gradients. We give as argument a list of pairs (gradient,variable).
        self.update = opt.apply_gradients(zip(clipped_grads, params),
                                          global_step=self.global_step)
        self.train_summary = tf.summary.merge([
            tf.summary.scalar("lr", self.learning_rate),
            tf.summary.scalar("train_loss", self.train_loss),
        ] + grad_norm_summary)
    # Saver. As argument, we give the variables that are going to be saved and restored.
    # The Saver op will save the variables of the graph within it is defined. All graphs (train/eval/predict)
    # have a Saver operator.
    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=50)
    # Print trainable variables
    print("# Trainable variables")
    for param in params:
        print(" %s, %s" % (param.name, str(param.get_shape())))
    import numpy as np
    total_params = np.sum([
        np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()
    ])
    print("Total number of parameters: %d" % total_params)
def __init__(self, iterator, hps, mode, vocab_table,
             reverse_target_vocab_table=None, scope=None):
    """Build a seq2seq model graph (optionally with a coverage loss term).

    Args:
      iterator: Dataset iterator feeding source/target sequences.
      hps: Hyperparameter configurations.
      mode: TRAIN | EVAL | INFER (tf.contrib.learn.ModeKeys).
      vocab_table: Lookup table mapping words to ids.
      reverse_target_vocab_table: Lookup table mapping ids back to words;
        used in INFER mode.
      scope: scope of the model.

    Raises:
      ValueError: in TRAIN mode, if hps.optimizer is neither "sgd" nor
        "adam".  (Previously an unknown optimizer crashed later with a
        NameError on `opt`.)
    """
    self.init_iter = iterator.initializer
    self.hps = hps
    self.vocab_table = vocab_table
    self.reverse_target_vocab_table = reverse_target_vocab_table
    self.iterator = iterator
    self.use_test_set = False
    self.mode = mode
    self.single_cell_fn = None
    self.time_major = hps.time_major
    self.batch_size = hps.batch_size
    self.init_embeddings(hps, scope)
    # Dynamic batch size from the actual iterator (overrides hps.batch_size).
    self.batch_size = tf.size(self.iterator.source_sequence_length)
    # Projection
    with tf.variable_scope(scope or "build_network"):
        with tf.variable_scope("decoder/output_projection"):
            self.output_layer = layers_core.Dense(hps.vocab_size,
                                                  use_bias=False,
                                                  name="output_projection")
    ## Train graph
    # res may carry an optional 5th element (coverage loss).
    res = self.build_graph(hps, scope=scope)
    if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
        self.train_loss = res[1]
        self.word_count = tf.reduce_sum(
            self.iterator.source_sequence_length) + tf.reduce_sum(
                self.iterator.target_sequence_length)
        if (len(res) > 4):
            self.coverage_loss = res[4]
        else:
            self.coverage_loss = tf.constant(0)
    elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
        self.eval_loss = res[1]
    elif self.mode == tf.contrib.learn.ModeKeys.INFER:
        if (len(res) > 4):
            self.infer_logits, _, self.final_context_state, self.sample_id, _ = res
        else:
            self.infer_logits, _, self.final_context_state, self.sample_id = res
        self.sample_words = reverse_target_vocab_table.lookup(
            tf.to_int64(self.sample_id))
    if self.mode != tf.contrib.learn.ModeKeys.INFER:
        ## Count the number of predicted words for compute ppl.
        self.predict_count = tf.reduce_sum(
            self.iterator.target_sequence_length)
    self.global_step = tf.Variable(0, trainable=False)
    params = tf.trainable_variables()
    # Gradients and SGD update operation for training the model.
    # Arrange for the embedding vars to appear at the beginning.
    if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
        self.learning_rate = tf.constant(hps.learning_rate)
        # warm-up
        self.learning_rate = self._get_learning_rate_warmup(hps)
        # decay
        self.learning_rate = self._get_learning_rate_decay(hps)
        # Optimizer
        if hps.optimizer == "sgd":
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            tf.summary.scalar("lr", self.learning_rate)
        elif hps.optimizer == "adam":
            opt = tf.train.AdamOptimizer(self.learning_rate)
        else:
            # Fail fast with a clear message instead of NameError on `opt`.
            raise ValueError("Unknown optimizer type %s" % hps.optimizer)
        # Gradients
        gradients = tf.gradients(
            self.train_loss,
            params,
            colocate_gradients_with_ops=hps.colocate_gradients_with_ops)
        clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip(
            gradients, max_gradient_norm=hps.max_gradient_norm)
        self.grad_norm = grad_norm
        self.update = opt.apply_gradients(
            zip(clipped_grads, params), global_step=self.global_step)
        # Summary (coverage_loss is always set in TRAIN mode above, so the
        # first branch is the one normally taken).
        if (self.coverage_loss is not None):
            self.train_summary = tf.summary.merge([
                tf.summary.scalar("lr", self.learning_rate),
                tf.summary.scalar("train_loss", self.train_loss),
                tf.summary.scalar("coverage_loss", self.coverage_loss)
            ] + grad_norm_summary)
        else:
            self.train_summary = tf.summary.merge([
                tf.summary.scalar("lr", self.learning_rate),
                tf.summary.scalar("train_loss", self.train_loss)
            ] + grad_norm_summary)
    if self.mode == tf.contrib.learn.ModeKeys.INFER:
        self.infer_summary = self._get_infer_summary(hps)
    # Saver
    self.saver = tf.train.Saver(tf.global_variables())
    # Print trainable variables
    utils.print_out("# Trainable variables")
    for param in params:
        utils.print_out(" %s, %s, %s" %
                        (param.name, str(param.get_shape()), param.op.device))
def __init__(self, iterator, hparams, mode, scope=None):
    """Build a seq2seq model graph trained with Adam.

    Args:
      iterator: data iterator providing src_seq / tar_seq tensors.
      hparams: hyperparameters (vocab_size, num_units, init_op, init_weight,
        max_gradient_norm, colocate_gradients_with_ops, max_to_keep, ...).
      mode: TRAIN | EVAL | PREDICT (module-level mode constants).
      scope: optional variable scope for the model.
    """
    self.iterator = iterator
    self.hparams = hparams
    self.mode = mode
    self.scope = scope
    # Initializer (seed is fixed to None here, i.e. non-deterministic init).
    initializer = model_helper.get_initializer(self.hparams.init_op, None,
                                               self.hparams.init_weight)
    tf.get_variable_scope().set_initializer(initializer)
    # Embeddings
    with tf.variable_scope(scope or 'embedding'):
        self.embedding = tf.get_variable(
            'embedding', [self.hparams.vocab_size, self.hparams.num_units],
            dtype=tf.float32)
    # Output Layer
    with tf.variable_scope(scope or "build_network"):
        with tf.variable_scope('decoder/output_projection'):
            self.output_layer = tf.layers.Dense(self.hparams.vocab_size,
                                                use_bias=False)
    # Batch Size
    self.batch_size = tf.size(self.iterator.src_seq)
    # Build Graph
    print("# Building graph for the model ...")
    res = self.build_graph(self.scope)
    if self.mode == TRAIN:
        self.train_loss = res[1]
        self.word_count = tf.reduce_sum(
            tf.reduce_sum(self.iterator.src_seq) +
            tf.reduce_sum(self.iterator.tar_seq))
    elif self.mode == EVAL:
        self.eval_loss = res[1]
    elif self.mode == PREDICT:
        self.infer_logits, _, self.final_state, self.sample_id = res
    if self.mode != PREDICT:
        # Count the number of predicted words for compute perplexity.
        self.predict_count = tf.reduce_sum(self.iterator.tar_seq)
    # Define variables
    self.global_step = tf.Variable(0, trainable=False)
    params = tf.trainable_variables()
    # Optimizer: Adam only; learning rate is fed at run time via placeholder.
    if self.mode == TRAIN:
        self.learning_rate = tf.placeholder(tf.float32,
                                            shape=[],
                                            name='learning_rate')
        # self.learning_rate = tf.train.exponential_decay(
        #     0.001, self.global_step, 1000, 0.9)
        opt = tf.train.AdamOptimizer(self.learning_rate)
        # Gradient
        gradients = tf.gradients(self.train_loss,
                                 params,
                                 colocate_gradients_with_ops=self.hparams.
                                 colocate_gradients_with_ops)
        clipped_gradients, gradient_norm_summary, _ = model_helper.gradient_clip(
            gradients, self.hparams.max_gradient_norm)
        self.update = opt.apply_gradients(zip(clipped_gradients, params),
                                          self.global_step)
        # Summary
        self.train_summary = tf.summary.merge([
            tf.summary.scalar('train_loss', self.train_loss),
            tf.summary.scalar('learning_rate', self.learning_rate)
        ] + gradient_norm_summary)
    else:
        # EVAL and PREDICT both fall through here: summary is a no-op.
        self.infer_summary = tf.no_op()
    # Saver
    self.saver = tf.train.Saver(tf.global_variables(),
                                max_to_keep=self.hparams.max_to_keep)
def __init__(self, hparams, mode, iterator, source_vocab_table,
             target_vocab_table, reverse_target_vocab_table=None, scope=None,
             extra_args=None):
    """Create the model.

    Args:
      hparams: Hyperparameter configurations.
      mode: TRAIN | EVAL | INFER
      iterator: Dataset Iterator that feeds data.
      source_vocab_table: Lookup table mapping source words to ids.
      target_vocab_table: Lookup table mapping target words to ids.
      reverse_target_vocab_table: Lookup table mapping ids to target words.
        Only required in INFER mode. Defaults to None.
      scope: scope of the model.
      extra_args: model_helper.ExtraArgs, for passing customizable functions.

    Raises:
      ValueError: in TRAIN mode, if hparams.optimizer is neither "sgd" nor
        "adam".  (Previously an unknown optimizer crashed later with a
        NameError on `opt`; other model classes in this file already raise
        ValueError for this case.)
    """
    assert isinstance(iterator, iterator_utils.BatchedInput)
    self.iterator = iterator
    self.mode = mode
    self.src_vocab_table = source_vocab_table
    self.tgt_vocab_table = target_vocab_table
    self.src_vocab_size = hparams.src_vocab_size
    self.tgt_vocab_size = hparams.tgt_vocab_size
    self.num_layers = hparams.num_layers
    self.num_gpus = hparams.num_gpus
    self.time_major = hparams.time_major
    # extra_args: to make it flexible for adding external customizable code
    self.single_cell_fn = None
    if extra_args:
        self.single_cell_fn = extra_args.single_cell_fn
    # Initializer
    initializer = model_helper.get_initializer(hparams.init_op,
                                               hparams.random_seed,
                                               hparams.init_weight)
    tf.get_variable_scope().set_initializer(initializer)
    # Embeddings
    # TODO(ebrevdo): Only do this if the mode is TRAIN?
    self.init_embeddings(hparams, scope)
    self.batch_size = tf.size(self.iterator.source_sequence_length)
    # Projection
    with tf.variable_scope(scope or "build_network"):
        with tf.variable_scope("decoder/output_projection"):
            self.output_layer = layers_core.Dense(hparams.tgt_vocab_size,
                                                  use_bias=False,
                                                  name="output_projection")
    ## Train graph
    res = self.build_graph(hparams, scope=scope)
    if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
        self.train_loss = res[1]
        self.word_count = tf.reduce_sum(
            self.iterator.source_sequence_length) + tf.reduce_sum(
                self.iterator.target_sequence_length)
    elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
        self.eval_loss = res[1]
    elif self.mode == tf.contrib.learn.ModeKeys.INFER:
        self.infer_logits, _, self.final_context_state, self.sample_id = res
        self.sample_words = reverse_target_vocab_table.lookup(
            tf.to_int64(self.sample_id))
    if self.mode != tf.contrib.learn.ModeKeys.INFER:
        ## Count the number of predicted words for compute ppl.
        self.predict_count = tf.reduce_sum(
            self.iterator.target_sequence_length)
    ## Learning rate
    warmup_steps = hparams.learning_rate_warmup_steps
    warmup_factor = hparams.learning_rate_warmup_factor
    print(" start_decay_step=%d, learning_rate=%g, decay_steps %d, "
          "decay_factor %g, learning_rate_warmup_steps=%d, "
          "learning_rate_warmup_factor=%g, starting_learning_rate=%g" %
          (hparams.start_decay_step, hparams.learning_rate,
           hparams.decay_steps, hparams.decay_factor, warmup_steps,
           warmup_factor, (hparams.learning_rate *
                           warmup_factor**warmup_steps)))
    self.global_step = tf.Variable(0, trainable=False)
    params = tf.trainable_variables()
    # Gradients and SGD update operation for training the model.
    # Arrange for the embedding vars to appear at the beginning.
    if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
        self.learning_rate = tf.constant(hparams.learning_rate)
        # Apply inverse decay if global steps less than warmup steps.
        # Inspired by https://arxiv.org/pdf/1706.03762.pdf (Section 5.3)
        # When step < warmup_steps,
        #   learing_rate *= warmup_factor ** (warmup_steps - step)
        inv_decay = warmup_factor**(tf.to_float(warmup_steps -
                                                self.global_step))
        self.learning_rate = tf.cond(
            self.global_step < hparams.learning_rate_warmup_steps,
            lambda: inv_decay * self.learning_rate,
            lambda: self.learning_rate,
            name="learning_rate_decay_warump_cond")
        if hparams.optimizer == "sgd":
            self.learning_rate = tf.cond(
                self.global_step < hparams.start_decay_step,
                lambda: self.learning_rate,
                lambda: tf.train.exponential_decay(
                    self.learning_rate,
                    (self.global_step - hparams.start_decay_step),
                    hparams.decay_steps,
                    hparams.decay_factor,
                    staircase=True),
                name="learning_rate")
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            tf.summary.scalar("lr", self.learning_rate)
        elif hparams.optimizer == "adam":
            assert float(
                hparams.learning_rate
            ) <= 0.001, "! High Adam learning rate %g" % hparams.learning_rate
            opt = tf.train.AdamOptimizer(self.learning_rate)
        else:
            # Fail fast with a clear message instead of NameError on `opt`.
            raise ValueError("Unknown optimizer type %s" % hparams.optimizer)
        gradients = tf.gradients(self.train_loss,
                                 params,
                                 colocate_gradients_with_ops=hparams.
                                 colocate_gradients_with_ops)
        clipped_gradients, gradient_norm_summary = model_helper.gradient_clip(
            gradients, max_gradient_norm=hparams.max_gradient_norm)
        self.update = opt.apply_gradients(zip(clipped_gradients, params),
                                          global_step=self.global_step)
        # Summary
        self.train_summary = tf.summary.merge([
            tf.summary.scalar("lr", self.learning_rate),
            tf.summary.scalar("train_loss", self.train_loss),
        ] + gradient_norm_summary)
    if self.mode == tf.contrib.learn.ModeKeys.INFER:
        self.infer_summary = self._get_infer_summary(hparams)
    # Saver
    self.saver = tf.train.Saver(tf.global_variables())
    # Print trainable variables
    utils.print_out("# Trainable variables")
    for param in params:
        utils.print_out(" %s, %s, %s" %
                        (param.name, str(param.get_shape()), param.op.device))