def nth_model(n):
    """Build the graph for problem `n` together with its loss statistics.

    Uses names from the enclosing scope (`model`, `my_hp`, `mode`, `dp`,
    `features`, `train`, `FLAGS`, `loss_variable_names`).

    Args:
      n: Index of the problem in `my_hp.problems`.

    Returns:
      In INFER mode, whatever `infer` returns. Otherwise a flat list made of
      the scalar total loss followed by the elements of the sharded logits.
    """
    problem_model = registry.model(model)(
        my_hp,
        mode,
        my_hp.problems[n],
        n,
        dp,
        devices.ps_devices(all_workers=True))

    if mode == tf.contrib.learn.ModeKeys.INFER:
        beam_size = FLAGS.decode_beam_size
        top_beams = beam_size if FLAGS.decode_return_beams else 1
        return problem_model.infer(
            features,
            beam_size=beam_size,
            top_beams=top_beams,
            last_position_only=FLAGS.decode_use_last_position_only,
            alpha=FLAGS.decode_alpha,
            decode_length=FLAGS.decode_extra_length)

    # Distributed training builds the graph only for problem 0 and for the
    # problem that maps to this worker.
    distributed_training = my_hp.problem_choice == "distributed" and train
    assigned_problem = FLAGS.worker_id % len(my_hp.problems)
    # Worker 0 additionally builds the graph for problems <= 1.
    # TODO(lukaszkaiser): why is this hack needed for variables init? Repair.
    skip_problem = (
        n != 0
        and n % FLAGS.worker_replicas != assigned_problem
        and (FLAGS.worker_id != 0 or n > 1))

    run_autoregressive = (
        FLAGS.eval_run_autoregressive
        and mode == tf.contrib.learn.ModeKeys.EVAL)
    if run_autoregressive:
        sharded_logits, losses_dict = problem_model.eval_autoregressive(
            features)
    else:
        sharded_logits, losses_dict = problem_model.model_fn(
            features, skip=(distributed_training and skip_problem))

    def _decay_towards(average, value):
        """Return the op `average <- 0.9 * average + 0.1 * value`."""
        return average.assign(average * 0.9 + value * 0.1)

    update_ops = []
    total_loss = 0.0
    with tf.variable_scope("losses_avg"):
        for loss_key, loss_value in six.iteritems(losses_dict):
            loss_name = "problem_%d/%s_loss" % (n, loss_key)
            loss_average = tf.get_variable(
                loss_name, initializer=100.0, trainable=False)
            loss_variable_names.append(loss_name)
            update_ops.append(_decay_towards(loss_average, loss_value))
            total_loss += loss_value
        with tf.variable_scope(tf.get_variable_scope(), reuse=True):
            # The total-loss average was already constructed on input, so it
            # is looked up with reuse=True instead of being created here.
            total_average = tf.get_variable("problem_%d/total_loss" % n)
        update_ops.append(_decay_towards(total_average, total_loss))

    # Count steps for this problem.
    with tf.variable_scope("train_stats"):
        step_counter = tf.get_variable(
            "problem_%d_steps" % n, initializer=0, trainable=False)
        update_ops.append(step_counter.assign_add(1))

    # The control dependency makes sure the update ops run whenever the loss
    # is evaluated; the reshape ensures the loss is a scalar.
    with tf.control_dependencies(update_ops):
        total_loss = tf.reshape(total_loss, [], name="total_loss_control_id")

    # Need to flatten for cond later.
    return [total_loss] + sharded_logits
def nth_model(n):
    """Build the graph for problem `n` together with its loss statistics.

    Uses names from the enclosing scope (`model`, `hparams`, `mode`, `dp`,
    `features`, `decode_hparams`, `decode_hp`, `is_training`, `worker_id`,
    `worker_replicas`, `eval_run_autoregressive`, `loss_variable_names`).

    Args:
      n: Index of the problem in `hparams.problems`.

    Returns:
      In PREDICT mode, whatever `infer` returns. Otherwise the two-element
      list `[total_loss, logits]`, with `total_loss` reshaped to a scalar.
    """
    problem_model = registry.model(model)(
        hparams,
        mode,
        hparams.problems[n],
        n,
        dp,
        devices.ps_devices(all_workers=True),
        decode_hparams=decode_hparams)

    if mode == tf.estimator.ModeKeys.PREDICT:
        beam_size = decode_hp.beam_size
        top_beams = beam_size if decode_hp.return_beams else 1
        return problem_model.infer(
            features,
            beam_size=beam_size,
            top_beams=top_beams,
            alpha=decode_hp.alpha,
            decode_length=decode_hp.extra_length)

    # Distributed training builds the graph only for problem 0 and for the
    # problem that maps to this worker.
    distributed_training = (
        hparams.problem_choice == "distributed" and is_training)
    assigned_problem = worker_id % len(hparams.problems)
    # Worker 0 additionally builds the graph for problems <= 1.
    # TODO(lukaszkaiser): why is this hack needed for variables init? Repair.
    skip_problem = (
        n != 0
        and n % worker_replicas != assigned_problem
        and (worker_id != 0 or n > 1))

    if eval_run_autoregressive and mode == tf.estimator.ModeKeys.EVAL:
        logits, losses_dict = problem_model.eval_autoregressive(features)
    else:
        logits, losses_dict = problem_model(
            features, skip=(distributed_training and skip_problem))

    def _decay_towards(average, value):
        """Return the op `average <- 0.9 * average + 0.1 * value`."""
        return average.assign(average * 0.9 + value * 0.1)

    update_ops = []
    total_loss = 0.0
    total_name = "problem_%d/total_loss" % n
    with tf.variable_scope("losses_avg"):
        for loss_key, loss_value in six.iteritems(losses_dict):
            loss_name = "problem_%d/%s_loss" % (n, loss_key)
            loss_average = tf.get_variable(
                loss_name, initializer=100.0, trainable=False)
            loss_variable_names.append(loss_name)
            update_ops.append(_decay_towards(loss_average, loss_value))
            total_loss += loss_value
        # The total-loss average may or may not exist already (it can be
        # constructed on input), so try reusing it first and create it only
        # when the reuse lookup fails.
        try:
            with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                total_average = tf.get_variable(total_name)
        except ValueError:
            total_average = tf.get_variable(
                total_name, initializer=100.0, trainable=False)
        update_ops.append(_decay_towards(total_average, total_loss))

    # Count steps for this problem.
    with tf.variable_scope("train_stats"):
        step_counter = tf.get_variable(
            "problem_%d_steps" % n, initializer=0, trainable=False)
        update_ops.append(step_counter.assign_add(1))

    # The control dependency makes sure the update ops run whenever the loss
    # is evaluated; the reshape ensures the loss is a scalar.
    with tf.control_dependencies(update_ops):
        total_loss = tf.reshape(total_loss, [], name="total_loss_control_id")

    return [total_loss, logits]
def __init__(self,
             t2t_usr_dir,
             src_vocab_size,
             trg_vocab_size,
             model_name,
             problem_name,
             hparams_set_name,
             checkpoint_dir,
             t2t_unk_id=None,
             single_cpu_thread=False):
    """Creates a new T2T predictor.

    The constructor prepares everything needed for predict_next() calls:

    - Build the hyper parameters via `_create_hparams`
    - Create TF placeholders for the source sequence and the target prefix
    - Instantiate the registered T2T model in PREDICT mode
    - Build the graph that turns the model logits into log probs
    - Create the session with `create_session()` (described by the original
      documentation as a MonitoredSession which also restores checkpoints)

    Args:
        t2t_usr_dir (string): See --t2t_usr_dir in tensor2tensor.
        src_vocab_size (int): Source vocabulary size.
        trg_vocab_size (int): Target vocabulary size.
        model_name (string): T2T model name.
        problem_name (string): T2T problem name.
        hparams_set_name (string): T2T hparams set name.
        checkpoint_dir (string): Path to the T2T checkpoint directory. The
                                 predictor will load the top most checkpoint
                                 in the `checkpoints` file.
        t2t_unk_id (int): If set, use this ID to get UNK scores. If None,
                          UNK is always scored with -inf.
        single_cpu_thread (bool): If true, prevent tensorflow from doing
                                  multithreading.
    """
    super(T2TPredictor, self).__init__(
        t2t_usr_dir, checkpoint_dir, t2t_unk_id, single_cpu_thread)
    self.consumed = []
    self.src_sentence = []

    def _add_t2t_dims(tensor):
        """Wraps a 1-D id tensor into the 4-D layout fed to the model."""
        # Axis 0 is the batch dimension; the first trailing axis is there
        # because of the modality, the second one for a reason the original
        # author left unexplained.
        for axis in (0, -1, -1):
            tensor = tf.expand_dims(tensor, axis)
        return tensor

    graph = tf.Graph()
    with graph.as_default():
        hparams = self._create_hparams(
            src_vocab_size, trg_vocab_size, hparams_set_name, problem_name)
        p_hparams = hparams.problems[0]

        self._inputs_var = tf.placeholder(
            dtype=tf.int32, shape=[None], name="sgnmt_inputs")
        self._targets_var = tf.placeholder(
            dtype=tf.int32, shape=[None], name="sgnmt_targets")

        features = dict([
            ("problem_choice", tf.constant(0)),
            ("input_space_id", tf.constant(p_hparams.input_space_id)),
            ("target_space_id", tf.constant(p_hparams.target_space_id)),
            ("inputs", _add_t2t_dims(self._inputs_var)),
            ("targets", _add_t2t_dims(self._targets_var)),
        ])

        model_cls = registry.model(model_name)
        model = model_cls(
            hparams,
            tf.estimator.ModeKeys.PREDICT,
            p_hparams,
            0,
            devices.data_parallelism(),
            devices.ps_devices(all_workers=True))

        # Only the first shard's logits are turned into log probabilities.
        logits_per_shard, _ = model.model_fn(
            features, last_position_only=True)
        self._log_probs = log_prob_from_logits(logits_per_shard[0])
        self.mon_sess = self.create_session()
def __init__(self,
             src_vocab_size,
             trg_vocab_size,
             model_name,
             problem_name,
             hparams_set_name,
             t2t_usr_dir,
             checkpoint_dir,
             t2t_unk_id=None,
             single_cpu_thread=False,
             max_terminal_id=-1,
             pop_id=-1):
    """Creates a new simultaneous T2T predictor.

    The constructor prepares everything needed for predict_next() calls:

    - Build the hyper parameters via `_create_hparams`
    - Create TF placeholders for the source sequence and the target prefix
    - Instantiate the registered T2T model in PREDICT mode
    - Build the graph that turns the model logits into log probs
    - Keep handles to the model's encoder output, attention bias and
      decoder output
    - Create the session with `create_session()` (described by the original
      documentation as a MonitoredSession which also restores checkpoints)

    Args:
        src_vocab_size (int): Source vocabulary size.
        trg_vocab_size (int): Target vocabulary size.
        model_name (string): T2T model name.
        problem_name (string): T2T problem name.
        hparams_set_name (string): T2T hparams set name.
        t2t_usr_dir (string): See --t2t_usr_dir in tensor2tensor.
        checkpoint_dir (string): Path to the T2T checkpoint directory. The
                                 predictor will load the top most checkpoint
                                 in the `checkpoints` file.
        t2t_unk_id (int): If set, use this ID to get UNK scores. If None,
                          UNK is always scored with -inf.
        single_cpu_thread (bool): If true, prevent tensorflow from doing
                                  multithreading.
        max_terminal_id (int): If positive, maximum terminal ID. Needs to be
                               set for syntax-based T2T models.
        pop_id (int): If positive, ID of the POP or closing bracket symbol.
                      Needs to be set for syntax-based T2T models.
    """
    super(SimT2TPredictor_v2, self).__init__(
        t2t_usr_dir, checkpoint_dir, t2t_unk_id, single_cpu_thread)
    self.consumed = []
    self.src_sentence = []
    self.pop_id = pop_id
    self.max_terminal_id = max_terminal_id
    # Both start at -1; how they are consumed is not visible in this method.
    self.previous_encode = -1
    self.previous_decode = -1

    graph = tf.Graph()
    with graph.as_default():
        hparams = self._create_hparams(
            src_vocab_size, trg_vocab_size, hparams_set_name, problem_name)
        p_hparams = hparams.problems[0]

        self._inputs_var = tf.placeholder(
            dtype=tf.int32, shape=[None], name="sgnmt_inputs")
        self._targets_var = tf.placeholder(
            dtype=tf.int32, shape=[None], name="sgnmt_targets")

        features = dict([
            ("problem_choice", tf.constant(0)),
            ("input_space_id", tf.constant(p_hparams.input_space_id)),
            ("target_space_id", tf.constant(p_hparams.target_space_id)),
            ("inputs", expand_input_dims_for_t2t(self._inputs_var)),
            ("targets", expand_input_dims_for_t2t(self._targets_var)),
        ])

        model_cls = registry.model(model_name)
        model = model_cls(
            hparams,
            tf.estimator.ModeKeys.PREDICT,
            p_hparams,
            0,
            devices.data_parallelism(),
            devices.ps_devices(all_workers=True))

        # Only the first shard's logits are turned into log probabilities.
        logits_per_shard, _ = model.model_fn(features)
        self._log_probs = log_prob_from_logits(logits_per_shard[0])

        # Expose the intermediate tensors that the model object publishes
        # after model_fn has been called.
        self._encoder_output = model.encoder_output
        self._encoder_decoder_attention_bias = model.attention_bias
        self._decoder_output = model.decoder_output

        self.mon_sess = self.create_session()
def nth_model(n):
    """Build the model for the n-th problem, plus some added variables.

    Uses names from the enclosing scope (`model`, `hparams`, `mode`, `dp`,
    `features`, `decode_hparams`, `decode_hp`, `is_training`, `worker_id`,
    `worker_replicas`, `eval_run_autoregressive`, `loss_variable_names`).

    When `hparams.rl` is set (a missing attribute counts as disabled) and
    the model is not being evaluated autoregressively, translations are
    sampled from the model first, scored with the approximate-BLEU training
    metric, and stored in `features["samples"]` / `features["values"]`
    before `model_fn` is called. Note that `features` is mutated in place,
    including `features["inputs"]` / `features["targets"]` on some paths.

    Args:
      n: Index of the problem in `hparams.problems`.

    Returns:
      In PREDICT mode, whatever `infer` returns. Otherwise the two-element
      list `[total_loss, logits]`, where `total_loss` is reshaped to a
      scalar and `logits` is the sharded logits concatenated along axis 0.
    """
    # Decode length handed to every sampling call of the RL branch.
    sample_decode_length = 50

    # Optional RL settings: all are read with a default so that hparams sets
    # which do not define them still work.
    use_rl = getattr(hparams, 'rl', False)
    mrt_samples = getattr(hparams, 'mrt_samples', None)

    # Initialize the model class: hparams, modalities.
    model_class = registry.model(model)(
        hparams,
        mode,
        hparams.problems[n],
        n,
        dp,
        devices.ps_devices(all_workers=True),
        decode_hparams=decode_hparams)

    if mode == tf.estimator.ModeKeys.PREDICT:
        return model_class.infer(
            features,
            beam_size=decode_hp.beam_size,
            top_beams=(decode_hp.beam_size if decode_hp.return_beams else 1),
            alpha=decode_hp.alpha,
            decode_length=decode_hp.extra_length)

    # In distributed mode, we build graph for problem=0 and problem=worker_id.
    skipping_is_on = hparams.problem_choice == "distributed" and is_training
    problem_worker_id = worker_id % len(hparams.problems)
    skip_this_one = n != 0 and n % worker_replicas != problem_worker_id
    # On worker 0 also build graph for problems <= 1.
    # TODO(lukaszkaiser): why is this hack needed for variables init? Repair.
    skip_this_one = skip_this_one and (worker_id != 0 or n > 1)

    if eval_run_autoregressive and mode == tf.estimator.ModeKeys.EVAL:
        # Evaluation mode.
        sharded_logits, losses_dict = model_class.eval_autoregressive(
            features)
    else:
        # Training mode.
        if use_rl:
            # Generate sample data; the result is brought to rank 4 below.
            if model_class._num_datashards == 1:
                # Single data shard: the fast decoder can be used.
                print("###Work on Single GPU card, Use Fast Decode.###")
                train_beam = getattr(hparams, 'train_beam', None)
                if mrt_samples:
                    samples, _ = model_class._fast_decode(
                        features,
                        decode_length=sample_decode_length,
                        beam_size=mrt_samples,
                        top_beams=mrt_samples)
                    # Drop the two trailing singleton axes, repeat every row
                    # `mrt_samples` times (tile along the length axis, then
                    # fold the copies into the batch axis) and restore the
                    # trailing axes.
                    inputs = tf.squeeze(
                        tf.squeeze(features["inputs"], axis=-1), axis=-1)
                    targets = tf.squeeze(
                        tf.squeeze(features["targets"], axis=-1), axis=-1)
                    batch_size = tf.shape(inputs)[0]
                    inputs_len = tf.shape(inputs)[1]
                    targets_len = tf.shape(targets)[1]
                    inputs_tile = tf.tile(inputs, [1, mrt_samples])
                    targets_tile = tf.tile(targets, [1, mrt_samples])
                    inputs_reshape = tf.reshape(
                        inputs_tile, [batch_size * mrt_samples, inputs_len])
                    targets_reshape = tf.reshape(
                        targets_tile, [batch_size * mrt_samples, targets_len])
                    features["inputs"] = tf.expand_dims(
                        tf.expand_dims(inputs_reshape, axis=-1), axis=-1)
                    features["targets"] = tf.expand_dims(
                        tf.expand_dims(targets_reshape, axis=-1), axis=-1)
                elif train_beam and train_beam != 1:
                    # Beam search with the train_beam size; `top_beams` is
                    # left at the decoder's default.
                    samples, _ = model_class._fast_decode(
                        features,
                        decode_length=sample_decode_length,
                        beam_size=train_beam)
                else:
                    targets_beam = getattr(hparams, 'targets_beam', None)
                    if targets_beam:
                        # Replace the reference targets by a model decode.
                        targets_samples, _ = model_class._fast_decode(
                            features,
                            decode_length=sample_decode_length,
                            beam_size=4,
                            sampling_method='argmax')
                        targets_samples = tf.reshape(
                            targets_samples,
                            [tf.shape(targets_samples)[0],
                             tf.shape(targets_samples)[1], 1, 1])
                        features["targets"] = targets_samples
                    samples, _ = model_class._fast_decode(
                        features, decode_length=sample_decode_length)
                # Add two additional dimensions to make it compatible.
                samples = tf.expand_dims(samples, axis=-1)
                samples = tf.expand_dims(samples, axis=-1)
            else:
                # Several data shards: only the slow sampler is supported.
                print("###Work on Multi GPU cards, Use Slow Decode.###")
                samples, _, _ = model_class._slow_greedy_infer(
                    features, decode_length=sample_decode_length)
            samples = tf.stop_gradient(samples)

            # Score the samples against the targets with the metric function
            # registered under Metrics.APPROX_BLEU_TRAIN.
            train_metric_fn = metrics.METRICS_FNS[
                metrics.Metrics.APPROX_BLEU_TRAIN]
            labels = features.get("targets", None)
            samples.set_shape([None, None, 1, 1])
            # hparams.delta_reward = True for delta reward; False for total
            # reward.
            # NOTE(review): the keyword is spelled `delat_reward`; it is kept
            # because it has to match the metric function's signature, which
            # is not visible here - confirm before renaming.
            metric_value = train_metric_fn(
                samples, labels, delat_reward=hparams.delta_reward)
            # To be more strict about the gradient.
            metric_value = tf.stop_gradient(metric_value)
            metric_value.set_shape([None, None, 1, 1])
            # According to metrics.py, the tf.metrics.mean function assures
            # correct aggregation.
            features["samples"] = samples
            features["values"] = metric_value

        sharded_logits, losses_dict = model_class.model_fn(
            features,
            skip=(skipping_is_on and skip_this_one),
            mrt=mrt_samples)

    # Per-loss scaling factors for RL training, looked up once instead of on
    # every loop iteration. Loss keys that are not listed stay unscaled.
    rl_loss_weights = {}
    if use_rl:
        rl_loss_weights = {
            "training": getattr(hparams, 'training_loss_weight', 1.0),
            "training_baseline": getattr(
                hparams, 'baseline_loss_weight', 1.0),
            "mle_training": getattr(
                hparams, 'mle_training_loss_weight', 0.3),
        }

    with tf.variable_scope("losses_avg"):
        total_loss, ops = 0.0, []
        for loss_key, loss_value in six.iteritems(losses_dict):
            if loss_key in rl_loss_weights:
                loss_value = loss_value * rl_loss_weights[loss_key]
            loss_name = "problem_%d/%s_loss" % (n, loss_key)
            loss_moving_avg = tf.get_variable(
                loss_name, initializer=100.0, trainable=False)
            loss_variable_names.append(loss_name)
            ops.append(
                loss_moving_avg.assign(
                    loss_moving_avg * 0.9 + loss_value * 0.1))
            total_loss += loss_value
        try:  # Total loss avg might be reused or not, we try both.
            with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                # Total loss was already constructed on input.
                loss_moving_avg = tf.get_variable(
                    "problem_%d/total_loss" % n)
        except ValueError:
            loss_moving_avg = tf.get_variable(
                "problem_%d/total_loss" % n,
                initializer=100.0,
                trainable=False)
        ops.append(
            loss_moving_avg.assign(loss_moving_avg * 0.9 + total_loss * 0.1))

    with tf.variable_scope("train_stats"):  # Count steps for this problem.
        problem_steps = tf.get_variable(
            "problem_%d_steps" % n, initializer=0, trainable=False)
        ops.append(problem_steps.assign_add(1))

    with tf.control_dependencies(ops):  # Make sure the ops run.
        # Ensure the loss is a scalar here.
        total_loss = tf.reshape(total_loss, [], name="total_loss_control_id")

    return [total_loss, tf.concat(sharded_logits, 0)]