Beispiel #1
0
    def test_get_results_for_epoch(self):
        """Check that inference inputs are invariant to batching and GPU count.

        For each usable ``(num_gpus, batch_size_per_gpu)`` combination a model
        is built in "infer" mode with ``infer`` and ``finalize_inference``
        replaced by pass-through functions, so ``get_results_for_epoch``
        returns the inputs it was fed. Entry 1 of ``source_tensors``
        (presumably the sequence lengths -- confirm against the data layer)
        is gathered over the epoch, reordered by ``source_ids`` and compared
        between consecutive configurations.
        """
        # this will take all gpu memory, but that's probably fine for tests
        visible_gpus = get_available_gpus()
        ordered_lengths = []

        def return_inputs(inputs, outputs):
            # Stand-in for model.infer: echo the inputs unchanged.
            return inputs

        def return_results(results):
            # Stand-in for model.finalize_inference: no post-processing.
            return results

        # Only GPU counts that the machine can actually provide are exercised.
        usable_gpu_counts = [n for n in (1, 2, 3) if n <= len(visible_gpus)]
        for gpu_count in usable_gpu_counts:
            # Combinations with batch_size * num_gpus above 10 are skipped.
            batch_sizes = [b for b in (1, 2, 3, 5, 7) if b * gpu_count <= 10]
            for batch_size in batch_sizes:
                with tf.Graph().as_default() as graph:
                    self.eval_config['batch_size_per_gpu'] = batch_size
                    self.eval_config['num_gpus'] = gpu_count
                    model = base_model(params=self.eval_config,
                                       mode="infer",
                                       hvd=None)
                    model.compile()
                    model.infer = return_inputs
                    model.finalize_inference = return_results

                    with self.test_session(graph, use_gpu=True) as sess:
                        sess.run(tf.global_variables_initializer())
                        epoch_inputs = get_results_for_epoch(
                            model, sess, False, "infer")
                        lengths = np.hstack([
                            batch['source_tensors'][1]
                            for batch in epoch_inputs
                        ])
                        sample_ids = np.hstack([
                            batch['source_ids'] for batch in epoch_inputs
                        ])
                        # Sorting by id makes runs comparable regardless of
                        # the order in which batches were produced.
                        order = np.argsort(sample_ids)
                        ordered_lengths.append(lengths[order])

        # Every configuration must agree with the one that ran after it.
        for current, following in zip(ordered_lengths, ordered_lengths[1:]):
            npt.assert_allclose(current, following)
Beispiel #2
0
def restore_and_get_results(model, checkpoint, mode):
    """Restore ``checkpoint`` into a fresh session and run one epoch.

    Unless ``model.params["use_trt"]`` is truthy, the checkpoint is loaded
    inside the new session: through ``run_assign_and_saver`` when
    ``get_assign_ops_and_restore_dict`` yields assign ops, otherwise through
    a plain ``tf.train.Saver``. The Saver is only constructed on the branch
    that actually uses it, so no unused save/restore ops are added to the
    graph.

    Args:
        model: compiled model; its ``params`` and ``hvd`` attributes are read.
        checkpoint: checkpoint path handed to the restore helpers.
        mode: mode string forwarded to ``get_results_for_epoch``.

    Returns:
        Whatever ``get_results_for_epoch`` returns with ``compute_loss=False``.
    """
    use_trt = model.params.get("use_trt", False)

    sess_config = tf.ConfigProto(allow_soft_placement=True)
    # pylint: disable=no-member
    sess_config.gpu_options.allow_growth = True
    if model.hvd:
        # Pin this process to the GPU matching its Horovod local rank.
        # pylint: disable=no-member
        sess_config.gpu_options.visible_device_list = str(
            model.hvd.local_rank())

    with tf.Session(config=sess_config) as sess:
        # Checkpoint is restored prior to freezing graph when using TRT,
        # so nothing is loaded here in that case.
        if not use_trt:
            assign_ops, restore_dict = get_assign_ops_and_restore_dict(
                checkpoint, True)
            if assign_ops:
                run_assign_and_saver(sess, checkpoint, assign_ops,
                                     restore_dict)
            else:
                saver = tf.train.Saver()
                saver.restore(sess, checkpoint)
        results_per_batch = get_results_for_epoch(
            model,
            sess,
            mode=mode,
            compute_loss=False,
            verbose=True,
        )
    return results_per_batch
Beispiel #3
0
  def test_get_batches_for_epoch(self):
    """Check that evaluation inputs do not depend on batch size or GPU count.

    For each usable ``(num_gpus, batch_size_per_gpu)`` combination a model is
    built in "eval" mode with ``evaluate`` and ``finalize_evaluation`` replaced
    by pass-through functions, so ``get_results_for_epoch`` returns the inputs
    it was fed. Entry 1 of ``source_tensors`` (presumably the sequence
    lengths -- confirm against the data layer) is concatenated over the epoch
    and compared between consecutive configurations.
    """
    # this will take all gpu memory, but that's probably fine for tests
    visible_gpus = get_available_gpus()
    collected_lengths = []

    def return_inputs(inputs, outputs):
      # Stand-in for model.evaluate: echo the inputs unchanged.
      return inputs

    def return_results(results):
      # Stand-in for model.finalize_evaluation: no post-processing.
      return results

    # Only GPU counts that the machine can actually provide are exercised.
    usable_gpu_counts = [n for n in (1, 2, 3) if n <= len(visible_gpus)]
    for gpu_count in usable_gpu_counts:
      # Combinations with batch_size * num_gpus above 10 are skipped.
      batch_sizes = [b for b in (1, 2, 3, 5, 7) if b * gpu_count <= 10]
      for batch_size in batch_sizes:
        with tf.Graph().as_default() as graph:
          self.eval_config['batch_size_per_gpu'] = batch_size
          self.eval_config['num_gpus'] = gpu_count
          model = base_model(params=self.eval_config, mode="eval", hvd=None)
          model.compile()
          model.evaluate = return_inputs
          model.finalize_evaluation = return_results

          with self.test_session(graph, use_gpu=True) as sess:
            sess.run(tf.global_variables_initializer())
            epoch_inputs = get_results_for_epoch(model, sess, False, "eval")
            lengths = np.hstack(
              [batch['source_tensors'][1] for batch in epoch_inputs])
            collected_lengths.append(lengths)

    # Every configuration must agree with the one that ran after it.
    for current, following in zip(collected_lengths, collected_lengths[1:]):
      npt.assert_allclose(current, following)
Beispiel #4
0
    def after_run(self, run_context, run_values):
        """Run validation after a training step when evaluation is due.

        Evaluation happens only when the hook has been triggered or when
        ``step`` equals ``self._last_step - 1``. The validation epoch is run
        with ``get_results_for_epoch``; then, when not running on Horovod or
        on Horovod rank 0, the loss is printed, the model's
        ``finalize_evaluation`` is called, the best checkpoint so far is
        saved, and the collected metrics are written as summaries.

        Args:
            run_context: provides the ``session`` used for evaluation and
                for saving checkpoints.
            run_values: its ``results`` must unpack into two items, the
                second being the current step.
        """
        _, step = run_values.results
        self._iter_count = step

        if not self._triggered and step != self._last_step - 1:
            return
        self._timer.update_last_triggered_step(self._iter_count - 1)

        if not self._model.on_horovod or self._model.hvd.rank() == 0:
            deco_print("Running evaluation on a validation set:")

        results_per_batch, total_loss = get_results_for_epoch(
            self._model,
            run_context.session,
            mode="eval",
            compute_loss=True,
            detailed_inference_outputs=False,
        )

        if not self._model.on_horovod or self._model.hvd.rank() == 0:
            if self._print_ppl:
                # Perplexity and bits-per-character are derived once so that
                # the console output and the logged summaries always agree.
                ppl = math.exp(total_loss)
                bpc = total_loss / math.log(2)
                deco_print(
                    "Validation loss: {:.4f} | ppl = {:.4f} | bpc = {:.4f}".
                    format(total_loss, ppl, bpc),
                    offset=4)
            else:
                deco_print("Validation loss: {:.4f} ".format(total_loss),
                           offset=4)

            dict_to_log = self._model.finalize_evaluation(
                results_per_batch, step)
            dict_to_log['eval_loss'] = total_loss

            if self._print_ppl:
                # Add bpc and ppl metrics to tensorboard
                dict_to_log['ppl'] = ppl
                dict_to_log['bpc'] = bpc

            # saving the best validation model
            if self._model.params['save_checkpoint_steps'] and \
               total_loss < self._best_eval_loss:
                self._best_eval_loss = total_loss
                self._eval_saver.save(
                    run_context.session,
                    os.path.join(self._model.params['logdir'], 'best_models',
                                 'val_loss={:.4f}-step'.format(total_loss)),
                    global_step=step + 1,
                )

            # optionally logging to tensorboard any values
            # returned from finalize_evaluation
            if self._model.params['save_summaries_steps']:
                log_summaries_from_dict(
                    dict_to_log,
                    self._model.params['logdir'],
                    step,
                )
Beispiel #5
0
def restore_and_get_results(model, checkpoint, mode):
  """Restore ``checkpoint`` in a new session and run one epoch in ``mode``.

  Args:
    model: compiled model; its ``hvd`` attribute selects the visible GPU.
    checkpoint: checkpoint path passed to ``tf.train.Saver.restore``.
    mode: mode string forwarded to ``get_results_for_epoch``.

  Returns:
    Whatever ``get_results_for_epoch`` returns with ``compute_loss=False``.
  """
  restorer = tf.train.Saver()

  config = tf.ConfigProto(allow_soft_placement=True)
  gpu_options = config.gpu_options
  gpu_options.allow_growth = True
  if model.hvd:
    # Pin this process to the GPU matching its Horovod local rank.
    gpu_options.visible_device_list = str(model.hvd.local_rank())

  with tf.Session(config=config) as session:
    restorer.restore(session, checkpoint)
    return get_results_for_epoch(
      model, session, mode=mode, compute_loss=False, verbose=True,
    )
Beispiel #6
0
def restore_and_get_results(model, checkpoint, mode):
    """Load ``checkpoint`` into a fresh session and collect epoch results.

    Args:
        model: compiled model; its ``hvd`` attribute selects the visible GPU.
        checkpoint: checkpoint path passed to ``tf.train.Saver.restore``.
        mode: mode string forwarded to ``get_results_for_epoch``.

    Returns:
        Whatever ``get_results_for_epoch`` returns with ``compute_loss=False``.
    """
    checkpoint_saver = tf.train.Saver()

    session_config = tf.ConfigProto(allow_soft_placement=True)
    session_config.gpu_options.allow_growth = True
    horovod = model.hvd
    if horovod:
        # Pin this process to the GPU matching its Horovod local rank.
        local_gpu = str(horovod.local_rank())
        session_config.gpu_options.visible_device_list = local_gpu

    epoch_kwargs = dict(mode=mode, compute_loss=False, verbose=True)
    with tf.Session(config=session_config) as session:
        checkpoint_saver.restore(session, checkpoint)
        epoch_results = get_results_for_epoch(model, session, **epoch_kwargs)
    return epoch_results
Beispiel #7
0
  def after_run(self, run_context, run_values):
    """Run validation after a training step when evaluation is due.

    Evaluation happens only when the hook has been triggered or when ``step``
    equals ``self._last_step - 1``. After the validation epoch, the worker
    that is not on Horovod (or is Horovod rank 0) prints the loss, calls the
    model's ``finalize_evaluation``, saves a checkpoint whenever the loss
    improves on the best one seen, and writes the metrics as summaries.

    Args:
      run_context: provides the ``session`` used for evaluation and saving.
      run_values: its ``results`` must unpack into two items, the second
        being the current step.
    """
    _, step = run_values.results
    self._iter_count = step

    evaluation_due = self._triggered or step == self._last_step - 1
    if not evaluation_due:
      return
    self._timer.update_last_triggered_step(self._iter_count - 1)

    model = self._model
    session = run_context.session

    if not model.on_horovod or model.hvd.rank() == 0:
      deco_print("Running evaluation on a validation set:")

    results_per_batch, total_loss = get_results_for_epoch(
      model, session, mode="eval", compute_loss=True,
    )

    # Reporting, checkpointing and summaries are left to a single worker.
    if model.on_horovod and model.hvd.rank() != 0:
      return

    deco_print("Validation loss: {:.4f}".format(total_loss), offset=4)

    dict_to_log = model.finalize_evaluation(results_per_batch)
    dict_to_log['eval_loss'] = total_loss

    # saving the best validation model
    if total_loss < self._best_eval_loss:
      self._best_eval_loss = total_loss
      checkpoint_prefix = os.path.join(
        model.params['logdir'], 'best_models',
        'val_loss={:.4f}-step'.format(total_loss),
      )
      self._eval_saver.save(session, checkpoint_prefix, global_step=step + 1)

    # optionally logging to tensorboard the values gathered above
    if dict_to_log:
      log_summaries_from_dict(dict_to_log, model.params['logdir'], step)