Example #1
 def fprop_and_bprop(tid):
     """docstring."""
     model = gnmt_model.GNMTModel(hparams,
                                  mode=mode,
                                  features=tower_features[tid])
     # sync training.
     assert model.learning_rate is not None
     # The following handles shouldn't be built when doing manual
     # gradient aggregation.
     assert model.grad_norm is None
     assert model.update is None
     tower_loss = model.train_loss
     # Only check loss numerics if in fp16
     if hparams.use_fp16 and hparams.check_tower_loss_numerics:
         tower_loss = tf.check_numerics(
             tower_loss, "tower_%d has Inf/NaN loss" % tid)
     # Cast to fp32; otherwise it would easily overflow.
     tower_loss = tf.to_float(tower_loss)
     var_params, grads, opt = self._compute_tower_grads(
         tower_loss,
         var_mgr.trainable_variables_on_device(tid, tid),
         model.learning_rate,
         use_fp16=hparams.use_fp16,
         loss_scale=loss_scale,
         colocate_gradients_with_ops=hparams.colocate_gradients_with_ops)
     self._print_varinfo(var_params, tid)
     res = [model.train_loss, model.learning_rate, model.batch_size]
     res.extend(grads)
     opts.append(opt)
     return res
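
The `loss_scale` handed to `_compute_tower_grads` is the usual fp16 loss-scaling trick; `_compute_tower_grads` itself is not shown in this example, so the helper below is only a minimal sketch of what static loss scaling typically looks like (the function name and the plain `tf.gradients` call are assumptions, not the code above):

import tensorflow as tf

def scaled_tower_grads(tower_loss, variables, loss_scale):
    """Illustrative static loss scaling (not the real _compute_tower_grads):
    scale the fp32 loss up before differentiation so small fp16 gradients
    don't flush to zero, then scale the gradients back down."""
    scaled_loss = tower_loss * loss_scale
    grads = tf.gradients(scaled_loss, variables)
    return [g / loss_scale if g is not None else None for g in grads]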
Example #2
    def build_graph_dist_strategy(self, features, labels, mode, params):
        """Model function."""
        del labels, params
        misc_utils.print_out("Running dist_strategy mode_fn")

        hparams = self.hparams

        # Create a GNMT model for training.
        # assert (hparams.encoder_type == "gnmt" or
        #        hparams.attention_architecture in ["gnmt", "gnmt_v2"])
        with mixed_precision_scope():
            model = gnmt_model.GNMTModel(hparams, mode=mode, features=features)
            if mode == tf.contrib.learn.ModeKeys.INFER:
                sample_ids = model.sample_id
                reverse_target_vocab_table = lookup_ops.index_to_string_table_from_file(
                    hparams.tgt_vocab_file, default_value=vocab_utils.UNK)
                sample_words = reverse_target_vocab_table.lookup(
                    tf.to_int64(sample_ids))
                # Make sure the output is of shape [batch_size, time] or
                # [beam_width, batch_size, time] when using beam search.
                if hparams.time_major:
                    sample_words = tf.transpose(sample_words)
                elif sample_words.shape.ndims == 3:
                    # beam search output in [batch_size, time, beam_width] shape.
                    sample_words = tf.transpose(sample_words, [2, 0, 1])
                predictions = {"predictions": sample_words}
                # return loss, vars, grads, predictions, train_op, scaffold
                return None, None, None, predictions, None, None
            elif mode == tf.contrib.learn.ModeKeys.TRAIN:
                loss = model.train_loss
                train_op = model.update
                return loss, model.params, model.grads, None, train_op, None
            else:
                raise ValueError("Unknown mode in model_fn: %s" % mode)
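
The INFER branch above turns sampled ids back into strings with a reverse vocabulary table. A minimal standalone sketch of that lookup pattern (the vocab file name, default token, and id values are made up for illustration):

import tensorflow as tf
from tensorflow.python.ops import lookup_ops

# Hypothetical vocab file with one token per line.
table = lookup_ops.index_to_string_table_from_file(
    "tgt_vocab.txt", default_value="<unk>")
sample_ids = tf.constant([[3, 7, 1]])                 # [batch, time]
sample_words = table.lookup(tf.to_int64(sample_ids))  # same shape, tf.string

with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    print(sess.run(sample_words))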
Example #3
  def _model_fn(features, labels, mode, params):
    """Model function."""
    del labels, params
    # Create a GNMT model for training.
    # assert (hparams.encoder_type == "gnmt" or
    #        hparams.attention_architecture in ["gnmt", "gnmt_v2"])
    model = gnmt_model.GNMTModel(hparams, mode=mode, features=features)
    if mode == tf.contrib.learn.ModeKeys.INFER:
      predicted_ids = model.predicted_ids
      # Make sure the output is of shape [batch_size, time] or
      # [beam_width, batch_size, time] when using beam search.
      if hparams.time_major:
        predicted_ids = tf.transpose(predicted_ids, [2, 1, 0])
      elif predicted_ids.shape.ndims == 3:
        # beam search output in [batch_size, time, beam_width] shape.
        predicted_ids = tf.transpose(predicted_ids, [2, 0, 1])
      # Get the top predictions from beam search.
      predicted_ids = tf.gather_nd(predicted_ids, [0])
      predictions = {"predictions": predicted_ids}
      return tf.contrib.tpu.TPUEstimatorSpec(mode=mode, predictions=predictions)

    elif mode == tf.contrib.learn.ModeKeys.TRAIN:
      loss = tf.zeros([], dtype=tf.float32)
      train_op = model.update

    else:
      raise ValueError("Unknown mode in model_fn: %s" % mode)

    if hparams.use_tpu:
      return tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode, loss=loss, train_op=train_op)
    else:
      return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
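
This `_model_fn` returns `TPUEstimatorSpec` / `EstimatorSpec` objects, so it plugs directly into an estimator. A rough sketch of the wiring, assuming hypothetical batch sizes, model dir, and input functions (none of which appear above; TPU cluster/master settings are also omitted):

import tensorflow as tf

run_config = tf.contrib.tpu.RunConfig(
    model_dir="/tmp/gnmt",                                 # hypothetical
    tpu_config=tf.contrib.tpu.TPUConfig(iterations_per_loop=100))

estimator = tf.contrib.tpu.TPUEstimator(
    model_fn=_model_fn,
    config=run_config,
    use_tpu=hparams.use_tpu,
    train_batch_size=128,                                  # hypothetical
    predict_batch_size=128)                                # hypothetical

# estimator.train(input_fn=train_input_fn, max_steps=hparams.num_train_steps)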
Example #4
    def _model_fn(features, labels, mode, params):
        """Model function."""
        del labels, params
        # Create a GNMT model for training.
        # assert (hparams.encoder_type == "gnmt" or
        #        hparams.attention_architecture in ["gnmt", "gnmt_v2"])
        model = gnmt_model.GNMTModel(hparams, mode=mode, features=features)
        if mode == tf.contrib.learn.ModeKeys.INFER:
            predicted_ids = model.predicted_ids
            # Make sure the output is of shape [batch_size, time] or
            # [beam_width, batch_size, time] when using beam search.
            if hparams.time_major:
                predicted_ids = tf.transpose(predicted_ids, [2, 1, 0])
            elif predicted_ids.shape.ndims == 3:
                # beam search output in [batch_size, time, beam_width] shape.
                predicted_ids = tf.transpose(predicted_ids, [2, 0, 1])
            # Get the top predictions from beam search.
            predicted_ids = tf.gather_nd(predicted_ids, [0])
            predictions = {"predictions": predicted_ids}
            return tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                                   predictions=predictions)

        elif mode == tf.contrib.learn.ModeKeys.TRAIN:
            loss = model.loss
            train_op = model.update
        else:
            raise ValueError("Unknown mode in model_fn: %s" % mode)

        def host_call_fn(gs, loss, lr):
            gs = gs[0]
            with tf.contrib.summary.create_file_writer(
                    hparams.model_dir).as_default():
                with tf.contrib.summary.always_record_summaries():
                    tf.contrib.summary.scalar('loss', loss[0], step=gs)
                    tf.contrib.summary.scalar('learning_rate', lr[0], step=gs)

                    return tf.contrib.summary.all_summary_ops()

        gs_t = tf.reshape(tf.train.get_global_step(), [1])
        loss_t = tf.reshape(model.loss, [1])
        lr_t = tf.reshape(model.learning_rate, [1])
        host_call = (host_call_fn, [gs_t, loss_t, lr_t])

        if hparams.use_tpu:
            return tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                                   loss=loss,
                                                   train_op=train_op,
                                                   training_hooks=hooks,
                                                   host_call=host_call)
        else:
            return tf.estimator.EstimatorSpec(mode=mode,
                                              loss=loss,
                                              train_op=train_op)
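
On the TPU path above, scalar summaries have to go through `host_call`; on the GPU path (Example #5 below) the same metrics are logged with classic graph summaries. A minimal sketch of that simpler pattern, with placeholder tensors standing in for the real loss and learning rate:

import tensorflow as tf

loss = tf.constant(0.25)            # stand-in for model.loss
learning_rate = tf.constant(1e-3)   # stand-in for model.learning_rate
tf.summary.scalar("loss", loss)
tf.summary.scalar("learning_rate", learning_rate)
# Usually collected and written by a tf.train.SummarySaverHook or a
# MonitoredTrainingSession rather than run by hand.
summary_op = tf.summary.merge_all()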
Example #5
    def build_graph(self, features, labels, mode, params):
        """docstring."""
        del labels, params
        misc_utils.print_out("Running fast mode_fn")

        hparams = self.hparams

        # Create global_step
        tf.train.get_or_create_global_step()

        if mode == tf.contrib.learn.ModeKeys.INFER:
            # Doing inference only on one GPU
            inf_hparams = tf.contrib.training.HParams(**hparams.values())
            inf_hparams.set_hparam("num_gpus", 1)
            # Inference is done in fp32 and in the same way as that of dist_strategy.
            inf_hparams.set_hparam("use_fp16", False)

            misc_utils.print_out("inference hparmas:")
            misc_utils.print_hparams(inf_hparams)

            # Create variable_mgr
            var_mgr = self._get_variable_mgr(inf_hparams)

            with mixed_precision_scope(), tf.device("gpu:0"), tf.name_scope(
                    "tower_0"), var_mgr.create_outer_variable_scope(0):
                model = gnmt_model.GNMTModel(inf_hparams,
                                             mode=mode,
                                             features=features)
                sample_ids = model.sample_id
                reverse_target_vocab_table = lookup_ops.index_to_string_table_from_file(
                    inf_hparams.tgt_vocab_file, default_value=vocab_utils.UNK)
                sample_words = reverse_target_vocab_table.lookup(
                    tf.to_int64(sample_ids))
                # Make sure the output is of shape [batch_size, time] or
                # [beam_width, batch_size, time] when using beam search.
                if inf_hparams.time_major:
                    sample_words = tf.transpose(sample_words)
                elif sample_words.shape.ndims == 3:
                    # beam search output in [batch_size, time, beam_width] shape.
                    sample_words = tf.transpose(sample_words, [2, 0, 1])
                predictions = {"predictions": sample_words}
                # return loss, vars, grads, predictions, train_op, scaffold
                return None, None, None, predictions, None, None
        elif mode == tf.contrib.learn.ModeKeys.TRAIN:
            num_towers = hparams.num_gpus
            # Shard inputs
            tower_features = self._shard_inputs(features, num_towers)
            # Create loss scale vars if necessary
            loss_scale, loss_scale_normal_steps = (
                self._create_loss_scale_vars())

            # Create variable_mgr
            var_mgr = self._get_variable_mgr(hparams)

            # Build per-tower fprop and bprop
            devices = var_mgr.get_devices()
            tower_gradvars = []
            tower_scopes = []
            var_scopes = []
            train_losses = []
            learning_rates = []
            batch_sizes = []
            opts = []

            def fprop_and_bprop(tid):
                """docstring."""
                model = gnmt_model.GNMTModel(hparams,
                                             mode=mode,
                                             features=tower_features[tid])
                # sync training.
                assert model.learning_rate is not None
                # The following handles shouldn't be built when doing manual
                # gradient aggregation.
                assert model.grad_norm is None
                assert model.update is None
                tower_loss = model.train_loss
                # Only check loss numerics if in fp16
                if hparams.use_fp16 and hparams.check_tower_loss_numerics:
                    tower_loss = tf.check_numerics(
                        tower_loss, "tower_%d has Inf/NaN loss" % tid)
                # Cast to fp32; otherwise it would easily overflow.
                tower_loss = tf.to_float(tower_loss)
                var_params, grads, opt = self._compute_tower_grads(
                    tower_loss,
                    var_mgr.trainable_variables_on_device(tid, tid),
                    model.learning_rate,
                    use_fp16=hparams.use_fp16,
                    loss_scale=loss_scale,
                    colocate_gradients_with_ops=(
                        hparams.colocate_gradients_with_ops))
                self._print_varinfo(var_params, tid)
                res = [model.train_loss, model.learning_rate, model.batch_size]
                res.extend(grads)
                opts.append(opt)
                return res

            def unpack_fprop_and_bprop_output(output):
                train_loss = output[0]
                learning_rate = output[1]
                batch_size = output[2]
                grads = output[3:]
                return train_loss, learning_rate, batch_size, grads

            with mixed_precision_scope():
                for tid in range(num_towers):
                    with tf.device(devices[tid % len(devices)]), tf.name_scope(
                            "tower_%s" % tid) as scope:
                        tower_scopes.append(scope)
                        with var_mgr.create_outer_variable_scope(
                                tid) as var_scope:
                            var_scopes.append(var_scope)

                            outputs = maybe_xla_compile(
                                hparams, fprop_and_bprop, tid)
                            (train_loss, learning_rate, batch_size,
                             grads) = unpack_fprop_and_bprop_output(outputs)
                            train_losses.append(train_loss)
                            learning_rates.append(learning_rate)
                            batch_sizes.append(batch_size)
                            var_params = var_mgr.trainable_variables_on_device(
                                tid, tid)
                            tower_gradvars.append(list(zip(grads, var_params)))

            # Add summaries
            if hparams.show_metrics:
                tf.summary.scalar("learning_rate", learning_rates[0])
                if loss_scale:
                    tf.summary.scalar("loss_scale", loss_scale)
                    if hparams.enable_auto_loss_scale:
                        tf.summary.scalar("loss_scale_normal_steps",
                                          loss_scale_normal_steps)
            misc_utils.print_out("Finish building fprop and per-tower bprop.")
            # Aggregate gradients
            # The following computes the aggregated grads for each tower,
            # stored in an opaque grad_states structure.
            apply_grads_devices, grad_states = var_mgr.preprocess_device_grads(
                tower_gradvars)
            master_grads = None
            master_params = None
            update_ops = []
            for i, device in enumerate(apply_grads_devices):
                with tf.device(device), tf.name_scope(tower_scopes[i]):
                    # Get per-tower grads.
                    with tf.name_scope("get_gradients_to_apply"):
                        avg_gradvars = var_mgr.get_gradients_to_apply(
                            i, grad_states)
                    avg_grads = [gv[0] for gv in avg_gradvars]

                    # gradients post-processing
                    with tf.name_scope("clip_gradients"):
                        if hparams.clip_grads:
                            clipped_grads, grad_norm = model_helper.gradient_clip(
                                avg_grads,
                                max_gradient_norm=hparams.max_gradient_norm)
                            # Summarize the grad norm on the 1st tower.
                            if i == 0 and hparams.show_metrics:
                                tf.summary.scalar("grad_norm", grad_norm)
                                tf.summary.scalar(
                                    "clipped_grad_norm",
                                    tf.global_norm(clipped_grads))
                        else:
                            clipped_grads = avg_grads
                        if i == 0:
                            master_grads = clipped_grads

                    # Build apply-gradients ops
                    clipped_gradvars = list(
                        zip(clipped_grads, [gv[1] for gv in avg_gradvars]))
                    if i == 0:
                        master_params = [gv[1] for gv in avg_gradvars]
                    with tf.name_scope("append_gradient_ops"):
                        loss_scale_params = variable_mgr_util.AutoLossScaleParams(
                            enable_auto_loss_scale=(
                                hparams.enable_auto_loss_scale),
                            loss_scale=loss_scale,
                            loss_scale_normal_steps=loss_scale_normal_steps,
                            inc_loss_scale_every_n=(
                                hparams.fp16_inc_loss_scale_every_n),
                            is_chief=True)
                        opt = opts[i]
                        var_mgr.append_apply_gradients_ops(
                            grad_states, opt, clipped_gradvars, update_ops,
                            loss_scale_params)
            misc_utils.print_out("Finish building grad aggregation.")

            assert len(update_ops) == num_towers
            train_op = tf.group(update_ops)
            with tf.control_dependencies([train_op]):
                global_step = tf.train.get_global_step()
                train_op = global_step.assign_add(1)

            # Compute loss on the first gpu
            # TODO(jamesqin): optimize it?
            with tf.device("gpu:0"):
                loss = misc_utils.weighted_avg(train_losses, batch_sizes)

            # Create local init_ops
            # TODO(jamesqin): handle resource variables!
            # At present, resource vars are only used with mirrored strategy.
            local_init_ops = []
            local_init_op = tf.local_variables_initializer()
            with tf.control_dependencies([local_init_op]):
                local_init_ops.append(var_mgr.get_post_init_ops())
            local_init_ops.extend([local_init_op, tf.tables_initializer()])

            saveable_vars = var_mgr.savable_variables()
            # Add saveables for cudnn vars in master tower.
            saveable_objects = tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS)
            saveable_objects = [x for x in saveable_objects if "v0" in x.name]

            misc_utils.print_out("Saveable vars(%d): " % len(saveable_vars))
            for mv in saveable_vars:
                misc_utils.print_out(mv.name)

            misc_utils.print_out("All global trainable vars(%d): " %
                                 len(tf.trainable_variables()))
            for tv in tf.trainable_variables():
                misc_utils.print_out(tv.name)

            misc_utils.print_out("All global vars(%d): " %
                                 len(tf.global_variables()))
            for gv in tf.global_variables():
                misc_utils.print_out(gv.name)

            misc_utils.print_out("master backproped params(%d): " %
                                 len(master_params))
            for mp in master_params:
                misc_utils.print_out(mp.name)

            # Note the cudnn vars skip the init check. :(
            scaffold = tf.train.Scaffold(
                ready_op=tf.report_uninitialized_variables(saveable_vars),
                ready_for_local_init_op=tf.report_uninitialized_variables(
                    saveable_vars),
                local_init_op=tf.group(*local_init_ops),
                saver=tf.train.Saver(saveable_vars + saveable_objects,
                                     save_relative_paths=True))

            misc_utils.print_out("Finish building model_fn")
            # return loss, vars, grads, predictions, train_op, scaffold
            return loss, master_params, master_grads, None, train_op, scaffold
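
`misc_utils.weighted_avg` is not shown in this example; given how it is called here (per-tower losses weighted by per-tower batch sizes), a plausible sketch is simply a weight-proportional average:

import tensorflow as tf

def weighted_avg(values, weights):
    """Plausible shape of misc_utils.weighted_avg (an assumption, not the
    actual implementation): average `values` in proportion to `weights`."""
    values = [tf.to_float(v) for v in values]
    weights = [tf.to_float(w) for w in weights]
    weighted_sum = tf.add_n([v * w for v, w in zip(values, weights)])
    return weighted_sum / tf.add_n(weights)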