def fprop_and_bprop(tid):
  """Builds forward and backward passes for one training tower.

  Args:
    tid: int, the tower (GPU) index; used to pick the tower's input shard
      and its device-local trainable variables.

  Returns:
    A flat list `[train_loss, learning_rate, batch_size, *grads]` — flat so
    it can pass through XLA compilation boundaries; unpacked by the caller.
    Side effect: appends this tower's optimizer to the enclosing `opts` list.

  NOTE(review): relies on free variables from the enclosing scope
  (`self`, `hparams`, `mode`, `tower_features`, `var_mgr`, `loss_scale`,
  `opts`) — confirm it is only called as a closure inside `build_graph`.
  """
  model = gnmt_model.GNMTModel(hparams, mode=mode, features=tower_features[tid])
  # sync training: a learning rate must have been built by the model.
  assert model.learning_rate is not None
  # The following handles shouldn't be built when doing manual gradient
  # aggregation — the model must not have created its own update ops.
  assert model.grad_norm is None
  assert model.update is None
  tower_loss = model.train_loss
  # Only check loss numerics if in fp16 (where Inf/NaN are most likely).
  if hparams.use_fp16 and hparams.check_tower_loss_numerics:
    tower_loss = tf.check_numerics(
        tower_loss, "tower_%d has Inf/NaN loss" % tid)
  # Cast to fp32, otherwise would easily overflow.
  tower_loss = tf.to_float(tower_loss)
  var_params, grads, opt = self._compute_tower_grads(
      tower_loss,
      var_mgr.trainable_variables_on_device(tid, tid),
      model.learning_rate,
      use_fp16=hparams.use_fp16,
      loss_scale=loss_scale,
      colocate_gradients_with_ops=hparams.colocate_gradients_with_ops)
  self._print_varinfo(var_params, tid)
  # Flatten scalars + per-variable grads into one list (see docstring).
  res = [model.train_loss, model.learning_rate, model.batch_size]
  res.extend(grads)
  opts.append(opt)
  return res
def build_graph_dist_strategy(self, features, labels, mode, params):
  """Builds the GNMT graph for DistributionStrategy-based training/inference.

  Args:
    features: input features tensors/dict fed to the GNMT model.
    labels: unused (required by the model_fn signature).
    mode: a `tf.contrib.learn.ModeKeys` value (INFER or TRAIN).
    params: unused (required by the model_fn signature).

  Returns:
    A 6-tuple `(loss, vars, grads, predictions, train_op, scaffold)`;
    unused slots are `None` (INFER fills only `predictions`, TRAIN fills
    loss/vars/grads/train_op).

  Raises:
    ValueError: if `mode` is neither INFER nor TRAIN.
  """
  del labels, params
  misc_utils.print_out("Running dist_strategy mode_fn")
  hparams = self.hparams
  # Create a GNMT model for training.
  # assert (hparams.encoder_type == "gnmt" or
  #         hparams.attention_architecture in ["gnmt", "gnmt_v2"])
  with mixed_precision_scope():
    model = gnmt_model.GNMTModel(hparams, mode=mode, features=features)
    if mode == tf.contrib.learn.ModeKeys.INFER:
      sample_ids = model.sample_id
      # Map predicted token ids back to target-vocabulary strings.
      reverse_target_vocab_table = lookup_ops.index_to_string_table_from_file(
          hparams.tgt_vocab_file, default_value=vocab_utils.UNK)
      sample_words = reverse_target_vocab_table.lookup(
          tf.to_int64(sample_ids))
      # make sure outputs is of shape [batch_size, time] or [beam_width,
      # batch_size, time] when using beam search.
      if hparams.time_major:
        sample_words = tf.transpose(sample_words)
      elif sample_words.shape.ndims == 3:
        # beam search output in [batch_size, time, beam_width] shape.
        sample_words = tf.transpose(sample_words, [2, 0, 1])
      predictions = {"predictions": sample_words}
      # return loss, vars, grads, predictions, train_op, scaffold
      return None, None, None, predictions, None, None
    elif mode == tf.contrib.learn.ModeKeys.TRAIN:
      loss = model.train_loss
      train_op = model.update
      return loss, model.params, model.grads, None, train_op, None
    else:
      raise ValueError("Unknown mode in model_fn: %s" % mode)
def _model_fn(features, labels, mode, params):
  """Estimator model function for GNMT (TPU-compatible variant).

  Args:
    features: input features fed to the GNMT model.
    labels: unused (required by the Estimator model_fn signature).
    mode: a `tf.contrib.learn.ModeKeys` value (INFER or TRAIN).
    params: unused (required by the Estimator model_fn signature).

  Returns:
    A `tf.contrib.tpu.TPUEstimatorSpec` (or a plain `tf.estimator.
    EstimatorSpec` when `hparams.use_tpu` is False for TRAIN mode).

  Raises:
    ValueError: if `mode` is neither INFER nor TRAIN.

  NOTE(review): relies on `hparams` from the enclosing scope.
  """
  del labels, params
  # Create a GNMT model for training.
  # assert (hparams.encoder_type == "gnmt" or
  #         hparams.attention_architecture in ["gnmt", "gnmt_v2"])
  model = gnmt_model.GNMTModel(hparams, mode=mode, features=features)
  if mode == tf.contrib.learn.ModeKeys.INFER:
    predicted_ids = model.predicted_ids
    # make sure outputs is of shape [batch_size, time] or [beam_width,
    # batch_size, time] when using beam search.
    if hparams.time_major:
      predicted_ids = tf.transpose(predicted_ids, [2, 1, 0])
    elif predicted_ids.shape.ndims == 3:
      # beam search output in [batch_size, time, beam_width] shape.
      predicted_ids = tf.transpose(predicted_ids, [2, 0, 1])
    # Get the top predictions from beam search.
    predicted_ids = tf.gather_nd(predicted_ids, [0])
    predictions = {"predictions": predicted_ids}
    return tf.contrib.tpu.TPUEstimatorSpec(mode=mode, predictions=predictions)
  elif mode == tf.contrib.learn.ModeKeys.TRAIN:
    # NOTE(review): loss is a constant zero scalar here, not
    # model.train_loss — presumably intentional for TPU (the real loss is
    # driven by model.update in-graph); confirm against the sibling
    # _model_fn, which reports model.loss.
    loss = tf.zeros([], dtype=tf.float32)
    train_op = model.update
  else:
    raise ValueError("Unknown mode in model_fn: %s" % mode)
  if hparams.use_tpu:
    return tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode, loss=loss, train_op=train_op)
  else:
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
def _model_fn(features, labels, mode, params):
  """Estimator model function for GNMT with TPU host-call summaries.

  Like the plain `_model_fn`, but in TRAIN mode reports the real model
  loss and wires a `host_call` that writes loss / learning-rate scalar
  summaries from the TPU host.

  Args:
    features: input features fed to the GNMT model.
    labels: unused (required by the Estimator model_fn signature).
    mode: a `tf.contrib.learn.ModeKeys` value (INFER or TRAIN).
    params: unused (required by the Estimator model_fn signature).

  Returns:
    A `tf.contrib.tpu.TPUEstimatorSpec` (or `tf.estimator.EstimatorSpec`
    when `hparams.use_tpu` is False for TRAIN mode).

  Raises:
    ValueError: if `mode` is neither INFER nor TRAIN.

  NOTE(review): relies on `hparams` from the enclosing scope, and on a
  `hooks` variable (`training_hooks=hooks` below) that is NOT defined in
  this function — confirm it exists in the enclosing scope, otherwise the
  TPU TRAIN path raises NameError.
  """
  del labels, params
  # Create a GNMT model for training.
  # assert (hparams.encoder_type == "gnmt" or
  #         hparams.attention_architecture in ["gnmt", "gnmt_v2"])
  model = gnmt_model.GNMTModel(hparams, mode=mode, features=features)
  if mode == tf.contrib.learn.ModeKeys.INFER:
    predicted_ids = model.predicted_ids
    # make sure outputs is of shape [batch_size, time] or [beam_width,
    # batch_size, time] when using beam search.
    if hparams.time_major:
      predicted_ids = tf.transpose(predicted_ids, [2, 1, 0])
    elif predicted_ids.shape.ndims == 3:
      # beam search output in [batch_size, time, beam_width] shape.
      predicted_ids = tf.transpose(predicted_ids, [2, 0, 1])
    # Get the top predictions from beam search.
    predicted_ids = tf.gather_nd(predicted_ids, [0])
    predictions = {"predictions": predicted_ids}
    return tf.contrib.tpu.TPUEstimatorSpec(mode=mode, predictions=predictions)
  elif mode == tf.contrib.learn.ModeKeys.TRAIN:
    loss = model.loss
    train_op = model.update
  else:
    raise ValueError("Unknown mode in model_fn: %s" % mode)

  def host_call_fn(gs, loss, lr):
    # Runs on the host; each argument arrives shaped [1] (see the
    # tf.reshape calls below), so unwrap the scalar with [0].
    gs = gs[0]
    with tf.contrib.summary.create_file_writer(
        hparams.model_dir).as_default():
      with tf.contrib.summary.always_record_summaries():
        tf.contrib.summary.scalar('loss', loss[0], step=gs)
        tf.contrib.summary.scalar('learning_rate', lr[0], step=gs)
        return tf.contrib.summary.all_summary_ops()

  # host_call tensors must be at least rank 1; reshape scalars to [1].
  gs_t = tf.reshape(tf.train.get_global_step(), [1])
  loss_t = tf.reshape(model.loss, [1])
  lr_t = tf.reshape(model.learning_rate, [1])
  host_call = (host_call_fn, [gs_t, loss_t, lr_t])
  if hparams.use_tpu:
    return tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        training_hooks=hooks,  # NOTE(review): `hooks` is a free variable — verify.
        host_call=host_call)
  else:
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
def build_graph(self, features, labels, mode, params):
  """Builds the multi-tower GNMT graph for the "fast" (non-strategy) path.

  INFER: builds a single fp32 tower on gpu:0 and returns string
  predictions. TRAIN: shards the input over `hparams.num_gpus` towers,
  builds per-tower fprop/bprop, aggregates gradients through the
  variable manager, and builds the apply-gradients / init / saver
  machinery.

  Args:
    features: input features; sharded across towers in TRAIN mode.
    labels: unused (required by the model_fn signature).
    mode: a `tf.contrib.learn.ModeKeys` value (INFER or TRAIN).
    params: unused (required by the model_fn signature).

  Returns:
    A 6-tuple `(loss, vars, grads, predictions, train_op, scaffold)`;
    INFER fills only `predictions`, TRAIN fills everything but
    `predictions`.
  """
  del labels, params
  misc_utils.print_out("Running fast mode_fn")
  hparams = self.hparams
  # Create global_step
  tf.train.get_or_create_global_step()
  if mode == tf.contrib.learn.ModeKeys.INFER:
    # Doing inference only on one GPU
    inf_hparams = tf.contrib.training.HParams(**hparams.values())
    inf_hparams.set_hparam("num_gpus", 1)
    # Inference is done in fp32 and in the same way as that of dist_strategy.
    inf_hparams.set_hparam("use_fp16", False)
    misc_utils.print_out("inference hparmas:")
    misc_utils.print_hparams(inf_hparams)
    # Create variable_mgr
    var_mgr = self._get_variable_mgr(inf_hparams)
    with mixed_precision_scope(), tf.device("gpu:0"), tf.name_scope(
        "tower_0"), var_mgr.create_outer_variable_scope(0):
      model = gnmt_model.GNMTModel(inf_hparams, mode=mode, features=features)
      sample_ids = model.sample_id
      # Map predicted token ids back to target-vocabulary strings.
      reverse_target_vocab_table = lookup_ops.index_to_string_table_from_file(
          inf_hparams.tgt_vocab_file, default_value=vocab_utils.UNK)
      sample_words = reverse_target_vocab_table.lookup(
          tf.to_int64(sample_ids))
      # make sure outputs is of shape [batch_size, time] or [beam_width,
      # batch_size, time] when using beam search.
      if inf_hparams.time_major:
        sample_words = tf.transpose(sample_words)
      elif sample_words.shape.ndims == 3:
        # beam search output in [batch_size, time, beam_width] shape.
        sample_words = tf.transpose(sample_words, [2, 0, 1])
      predictions = {"predictions": sample_words}
      # return loss, vars, grads, predictions, train_op, scaffold
      return None, None, None, predictions, None, None
  elif mode == tf.contrib.learn.ModeKeys.TRAIN:
    num_towers = hparams.num_gpus
    # Shard inputs
    tower_features = self._shard_inputs(features, num_towers)
    # Create loss scale vars if necessary
    loss_scale, loss_scale_normal_steps = self._create_loss_scale_vars()
    # Create variable_mgr
    var_mgr = self._get_variable_mgr(hparams)
    # Build per-tower fprop and bprop
    devices = var_mgr.get_devices()
    tower_gradvars = []
    tower_scopes = []
    var_scopes = []
    train_losses = []
    learning_rates = []
    batch_sizes = []
    opts = []

    def fprop_and_bprop(tid):
      """Builds fprop/bprop for tower `tid`; returns a flat list
      [train_loss, learning_rate, batch_size, *grads] (flat so it can
      cross the XLA-compile boundary) and appends the tower optimizer
      to `opts`."""
      model = gnmt_model.GNMTModel(
          hparams, mode=mode, features=tower_features[tid])
      # sync training: a learning rate must have been built by the model.
      assert model.learning_rate is not None
      # The following handles shouldn't be built when doing manual
      # gradient aggregation — the model must not create its own updates.
      assert model.grad_norm is None
      assert model.update is None
      tower_loss = model.train_loss
      # Only check loss numerics if in fp16
      if hparams.use_fp16 and hparams.check_tower_loss_numerics:
        tower_loss = tf.check_numerics(
            tower_loss, "tower_%d has Inf/NaN loss" % tid)
      # Cast to fp32, otherwise would easily overflow.
      tower_loss = tf.to_float(tower_loss)
      var_params, grads, opt = self._compute_tower_grads(
          tower_loss,
          var_mgr.trainable_variables_on_device(tid, tid),
          model.learning_rate,
          use_fp16=hparams.use_fp16,
          loss_scale=loss_scale,
          colocate_gradients_with_ops=hparams.colocate_gradients_with_ops)
      self._print_varinfo(var_params, tid)
      res = [model.train_loss, model.learning_rate, model.batch_size]
      res.extend(grads)
      opts.append(opt)
      return res

    def unpack_fprop_and_bprop_output(output):
      # Inverse of the flat packing done by fprop_and_bprop above.
      train_loss = output[0]
      learning_rate = output[1]
      batch_size = output[2]
      grads = output[3:]
      return train_loss, learning_rate, batch_size, grads

    with mixed_precision_scope():
      for tid in range(num_towers):
        # Round-robin towers over the available devices.
        with tf.device(devices[tid % len(devices)]), tf.name_scope(
            "tower_%s" % tid) as scope:
          tower_scopes.append(scope)
          with var_mgr.create_outer_variable_scope(tid) as var_scope:
            var_scopes.append(var_scope)
            # Optionally wraps the tower build in xla.compile.
            outputs = maybe_xla_compile(hparams, fprop_and_bprop, tid)
            (train_loss, learning_rate, batch_size,
             grads) = unpack_fprop_and_bprop_output(outputs)
            train_losses.append(train_loss)
            learning_rates.append(learning_rate)
            batch_sizes.append(batch_size)
            var_params = var_mgr.trainable_variables_on_device(tid, tid)
            tower_gradvars.append(list(zip(grads, var_params)))
    # Add summaries
    if hparams.show_metrics:
      tf.summary.scalar("learning_rate", learning_rates[0])
      if loss_scale:
        tf.summary.scalar("loss_scale", loss_scale)
        if hparams.enable_auto_loss_scale:
          tf.summary.scalar("loss_scale_normal_steps",
                            loss_scale_normal_steps)
    misc_utils.print_out("Finish building fprop and per-tower bprop.")
    # Aggregate gradients
    # The following compute the aggregated grads for each tower, stored in
    # opaque grad_states structure.
    apply_grads_devices, grad_states = var_mgr.preprocess_device_grads(
        tower_gradvars)
    master_grads = None
    master_params = None
    update_ops = []
    for i, device in enumerate(apply_grads_devices):
      with tf.device(device), tf.name_scope(tower_scopes[i]):
        # Get per-tower grads.
        with tf.name_scope("get_gradients_to_apply"):
          avg_gradvars = var_mgr.get_gradients_to_apply(i, grad_states)
          avg_grads = [gv[0] for gv in avg_gradvars]
        # gradients post-processing
        with tf.name_scope("clip_gradients"):
          if hparams.clip_grads:
            clipped_grads, grad_norm = model_helper.gradient_clip(
                avg_grads, max_gradient_norm=hparams.max_gradient_norm)
            # summary the grad on the 1st tower
            if i == 0 and hparams.show_metrics:
              tf.summary.scalar("grad_norm", grad_norm)
              tf.summary.scalar("clipped_grad_norm",
                                tf.global_norm(clipped_grads))
          else:
            clipped_grads = avg_grads
          if i == 0:
            # Tower 0's clipped grads are what this function returns.
            master_grads = clipped_grads
        # Build apply-gradients ops
        clipped_gradvars = list(
            zip(clipped_grads, [gv[1] for gv in avg_gradvars]))
        if i == 0:
          master_params = [gv[1] for gv in avg_gradvars]
        with tf.name_scope("append_gradient_ops"):
          loss_scale_params = variable_mgr_util.AutoLossScaleParams(
              enable_auto_loss_scale=hparams.enable_auto_loss_scale,
              loss_scale=loss_scale,
              loss_scale_normal_steps=loss_scale_normal_steps,
              inc_loss_scale_every_n=hparams.fp16_inc_loss_scale_every_n,
              is_chief=True)
          opt = opts[i]
          var_mgr.append_apply_gradients_ops(
              grad_states, opt, clipped_gradvars, update_ops,
              loss_scale_params)
    misc_utils.print_out("Finish building grad aggregation.")
    assert len(update_ops) == num_towers
    train_op = tf.group(update_ops)
    # Increment global_step only after every tower's update has run.
    with tf.control_dependencies([train_op]):
      global_step = tf.train.get_global_step()
      train_op = global_step.assign_add(1)
    # Compute loss on the first gpu
    # TODO(jamesqin): optimize it?
    with tf.device("gpu:0"):
      # Batch-size-weighted average of the per-tower losses.
      loss = misc_utils.weighted_avg(train_losses, batch_sizes)
    # Create local init_ops
    # TODO(jamesqin): handle resource variables!
    # At present if not using mirror strategy, not using resource vars.
    local_init_ops = []
    local_init_op = tf.local_variables_initializer()
    with tf.control_dependencies([local_init_op]):
      # Post-init ops (e.g. variable broadcast) must run after local init.
      local_init_ops.append(var_mgr.get_post_init_ops())
    local_init_ops.extend([local_init_op, tf.tables_initializer()])
    saveable_vars = var_mgr.savable_variables()
    # Add saveables for cudnn vars in master tower.
    saveable_objects = tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS)
    # "v0" selects the master (tower-0) copies only — presumably to avoid
    # saving per-tower replicas; verify against the variable manager.
    saveable_objects = [x for x in saveable_objects if "v0" in x.name]
    misc_utils.print_out("Saveable vars(%d): " % len(saveable_vars))
    for mv in saveable_vars:
      misc_utils.print_out(mv.name)
    misc_utils.print_out("All global trainable vars(%d): " %
                         len(tf.trainable_variables()))
    for tv in tf.trainable_variables():
      misc_utils.print_out(tv.name)
    misc_utils.print_out("All global vars(%d): " % len(tf.global_variables()))
    for gv in tf.global_variables():
      misc_utils.print_out(gv.name)
    misc_utils.print_out("master backproped params(%d): " %
                         len(master_params))
    for mp in master_params:
      misc_utils.print_out(mp.name)
    # Note the cudnn vars are skipped the init check. :(
    scaffold = tf.train.Scaffold(
        ready_op=tf.report_uninitialized_variables(saveable_vars),
        ready_for_local_init_op=tf.report_uninitialized_variables(
            saveable_vars),
        local_init_op=tf.group(*local_init_ops),
        saver=tf.train.Saver(saveable_vars + saveable_objects,
                             save_relative_paths=True))
    misc_utils.print_out("Finish building model_fn")
    # return loss, vars, grads, predictions, train_op, scaffold
    return loss, master_params, master_grads, None, train_op, scaffold