Ejemplo n.º 1
0
    def Run(self, sess):
        """Runs one decode loop on the TPU and writes the resulting summaries.

        Launches the infeed loop on a worker thread, runs `_steps_per_loop`
        decode steps while post-processing each batch, then aggregates the
        decoder metrics into summaries and finalizes the decode output.

        Args:
          sess: The TF session used to run the decode and infeed ops.

        Returns:
          False, i.e. this program never asks the outer loop to stop.
        """
        step_at_start = sess.run(self._model.global_step)
        self.SetStatusMessage('Executing decode program at step %d' %
                              step_at_start)
        # Run infeed on a pool thread so host->device feeding overlaps with
        # the decode steps below.
        infeed_result = self._infeed_pool.apply_async(self._InfeedLoop,
                                                      args=(sess, ))
        decoder_metrics = self._task.CreateDecoderMetrics()
        loop_start = time.time()
        all_decode_out = []
        for step in range(self._steps_per_loop):
            fetched_metrics = sess.run(self.metrics)
            step_decode_out = self._task.PostProcessDecodeOut(
                fetched_metrics, decoder_metrics)
            tf.logging.info(
                'step: %d %f' %
                (step, decoder_metrics['num_samples_in_batch'].total_value))
            if step_decode_out:
                all_decode_out.extend(step_decode_out)
        # Make sure the infeed thread has drained before reading metrics.
        infeed_result.wait()

        samples_metric = decoder_metrics['num_samples_in_batch']
        summary_map = {k: v.Summary(k) for k, v in decoder_metrics.items()}
        secs_elapsed = time.time() - loop_start
        rate = samples_metric.total_value / secs_elapsed
        summary_map['examples/sec'] = tf.Summary(value=[
            tf.Summary.Value(tag='examples/sec', simple_value=rate)
        ])
        self._WriteSummaries(os.path.basename(self._program_dir),
                             step_at_start, summary_map)
        out_path = os.path.join(self._program_dir,
                                'decoder_out_%09d' % step_at_start)
        finalize_args = base_model.DecodeFinalizeArgs(
            decode_out_path=out_path, decode_out=all_decode_out)
        self._task.DecodeFinalize(finalize_args)
        return False
Ejemplo n.º 2
0
    def Run(self, sess):
        """Runs one decode loop and emits MLPerf eval log entries.

        Launches the infeed loop asynchronously, runs `_steps_per_loop`
        decode steps, writes metric summaries, finalizes decode output, and
        — when MLPerf logging is enabled — brackets the work with
        `eval_start`/`eval_stop` and reports `eval_accuracy`, emitting
        `run_stop` once the success threshold is exceeded.

        Args:
          sess: The TF session used to run the decode and infeed ops.
        """
        tf.logging.info('Executing decode program for %s.', self._task_name)
        gsteps = py_utils.GetGlobalStep()
        global_step = sess.run(gsteps)

        if self._ml_perf_log:
            steps_per_epoch = self._ml_perf.steps_per_epoch
            epoch = int(global_step) // steps_per_epoch
            # MLPerf epoch_num metadata is 1-based.
            mlp_log.mlperf_print('eval_start',
                                 None,
                                 metadata={'epoch_num': (epoch + 1)})

        # Run infeed on a pool thread so feeding overlaps with decoding.
        infeed_future = self._infeed_pool.apply_async(self._InfeedLoop,
                                                      args=(sess, ))
        dec_metrics = self._model_task.CreateDecoderMetrics()
        start_time = time.time()
        buffered_decode_out = []
        for i in range(self._steps_per_loop):
            metrics_values = sess.run(self.metrics)
            decode_out = self._model_task.PostProcessDecodeOut(
                metrics_values, dec_metrics)
            tf.logging.info(
                'step: %d %f' %
                (i, dec_metrics['num_samples_in_batch'].total_value))
            if decode_out:
                buffered_decode_out.extend(decode_out)
        infeed_future.wait()

        if self._ml_perf_log:
            mlp_log.mlperf_print('eval_stop',
                                 None,
                                 metadata={'epoch_num': (epoch + 1)})

        num_examples_metric = dec_metrics['num_samples_in_batch']
        # Use dict.items() directly for consistency with the rest of the
        # file; six.iteritems is unnecessary on Python 3.
        summaries = {k: v.Summary(k) for k, v in dec_metrics.items()}
        elapsed_secs = time.time() - start_time
        example_rate = num_examples_metric.total_value / elapsed_secs
        summaries['examples/sec'] = tf.Summary(value=[
            tf.Summary.Value(tag='examples/sec', simple_value=example_rate)
        ])
        self._WriteSummaries(os.path.basename(self._program_dir), global_step,
                             summaries)
        decode_out_path = os.path.join(self._program_dir,
                                       'decoder_out_%09d' % global_step)
        decode_finalize_args = base_model.DecodeFinalizeArgs(
            decode_out_path=decode_out_path, decode_out=buffered_decode_out)
        self._model_task.DecodeFinalize(decode_finalize_args)

        if self._ml_perf_log:
            mlperf_metric = self._ml_perf.decoder_metric_name
            mlperf_metric_value = dec_metrics[mlperf_metric].value
            # Fix: use the same 1-based epoch_num as the eval_start/eval_stop
            # entries above so the MLPerf compliance checker can pair the
            # eval_accuracy record with its surrounding eval block.
            mlp_log.mlperf_print('eval_accuracy',
                                 mlperf_metric_value,
                                 metadata={'epoch_num': (epoch + 1)})
            if mlperf_metric_value > self._ml_perf.decoder_metric_success_threshold:
                tf.logging.info('ml_perf_final_threshold: %f exceeded',
                                self._ml_perf.decoder_metric_success_threshold)
                mlp_log.mlperf_print('run_stop',
                                     None,
                                     metadata={'status': 'success'})
Ejemplo n.º 3
0
  def DecodeCheckpoint(self, sess, checkpoint_path):
    """Decodes `samples_per_summary` examples using `checkpoint_path`.

    Restores the checkpoint into `sess`, runs the decode graph until the
    requested number of examples has been processed (or the resettable
    input is exhausted), writes metric summaries, exports metrics, and
    finalizes the buffered decode output.

    Args:
      sess: TF session to restore the checkpoint into and run decode with.
      checkpoint_path: Path of the checkpoint file to decode.

    Returns:
      False when decoding is skipped because the checkpoint id precedes
      `p.eval.start_decoder_after`; otherwise a bool telling the caller
      whether to stop (global_step reached `train.max_steps`, or the trial
      requested early stopping).
      NOTE(review): the empty-metrics branch returns None rather than
      False — confirm callers only rely on truthiness.
    """
    p = self._task.params
    ckpt_id_from_file = self.GetCkptIdFromFile(checkpoint_path)
    # Skip checkpoints that are too early to be worth decoding.
    if ckpt_id_from_file < p.eval.start_decoder_after:
      return False
    samples_per_summary = p.eval.decoder_samples_per_summary
    if samples_per_summary is None:
      samples_per_summary = p.eval.samples_per_summary
    # samples_per_summary == 0 means "decode the entire input"; that only
    # terminates if the input pipeline is resettable (raises OutOfRange).
    if samples_per_summary == 0:
      assert self._task.params.input.resettable
    self.checkpointer.RestoreFromPath(sess, checkpoint_path)

    global_step = sess.run(py_utils.GetGlobalStep())

    if self._task.params.input.resettable:
      tf.logging.info('Resetting input_generator.')
      self._task.input.Reset(sess)

    dec_metrics = self._task.CreateDecoderMetrics()
    if not dec_metrics:
      tf.logging.info('Empty decoder metrics')
      return
    buffered_decode_out = []
    num_examples_metric = dec_metrics['num_samples_in_batch']
    start_time = time.time()
    # Decode until enough examples are processed; with
    # samples_per_summary == 0 the loop only ends via OutOfRangeError.
    while samples_per_summary == 0 or (num_examples_metric.total_value <
                                       samples_per_summary):
      try:
        tf.logging.info('Fetching dec_output.')
        fetch_start = time.time()
        run_options = tf.RunOptions(report_tensor_allocations_upon_oom=False)
        if self._summary_op is None:
          # No summaries were collected.
          dec_out = sess.run(self._dec_output, options=run_options)
        else:
          dec_out, summary = sess.run([self._dec_output, self._summary_op],
                                      options=run_options)
          self._summary_writer.add_summary(summary, global_step)
        post_process_start = time.time()
        tf.logging.info('Done fetching (%f seconds)' %
                        (post_process_start - fetch_start))
        decode_out = self._task.PostProcessDecodeOut(dec_out, dec_metrics)
        if decode_out:
          buffered_decode_out.extend(decode_out)
        tf.logging.info(
            'Total examples done: %d/%d '
            '(%f seconds decode postprocess)', num_examples_metric.total_value,
            samples_per_summary,
            time.time() - post_process_start)
      except tf.errors.OutOfRangeError:
        # End of input: only expected for resettable inputs; otherwise
        # exhaustion is an error, so re-raise.
        if not self._task.params.input.resettable:
          raise
        break
    tf.logging.info('Done decoding ckpt: %s', checkpoint_path)

    summaries = {k: v.Summary(k) for k, v in dec_metrics.items()}
    elapsed_secs = time.time() - start_time
    example_rate = num_examples_metric.total_value / elapsed_secs
    summaries['examples/sec'] = metrics.CreateScalarSummary(
        'examples/sec', example_rate)
    summaries['total_samples'] = metrics.CreateScalarSummary(
        'total_samples', num_examples_metric.total_value)
    self._WriteSummaries(
        self._summary_writer,
        os.path.basename(self._decoder_dir),
        global_step,
        summaries,
        text_filename=os.path.join(self._decoder_dir,
                                   'score-{:08d}.txt'.format(global_step)))
    self._ExportMetrics(
        # Metrics expects python int, but global_step is numpy.int64.
        decode_checkpoint=int(global_step),
        dec_metrics=dec_metrics,
        example_rate=example_rate)
    # global_step and the checkpoint id from the checkpoint file might be
    # different. For consistency of checkpoint filename and decoder_out
    # file, use the checkpoint id as derived from the checkpoint filename.
    checkpoint_id = _GetCheckpointIdForDecodeOut(ckpt_id_from_file, global_step)
    decode_out_path = self.GetDecodeOutPath(self._decoder_dir, checkpoint_id)

    decode_finalize_args = base_model.DecodeFinalizeArgs(
        decode_out_path=decode_out_path, decode_out=buffered_decode_out)
    self._task.DecodeFinalize(decode_finalize_args)

    should_stop = global_step >= self.params.train.max_steps
    if self._should_report_metrics:
      tf.logging.info('Reporting eval measure for step %d.' % global_step)
      trial_should_stop = self._trial.ReportEvalMeasure(global_step,
                                                        dec_metrics,
                                                        checkpoint_path)
      should_stop = should_stop or trial_should_stop
    return should_stop
Ejemplo n.º 4
0
  def _DecodeOnce(self, sess=None, path=''):
    """Decode a single checkpoint.

    Eager-mode decode: restores `path`, repeatedly calls the decode
    function until enough samples are processed (or the resettable input
    is exhausted), accumulates decoder metrics, writes TF2 summaries, and
    finalizes the buffered decode output.

    Args:
      sess: Unused here; kept for signature compatibility with the
        graph-mode variant.
      path: Path of the checkpoint to restore and decode.
    """
    with self._cluster:
      # Attempt to restore the checkpoint
      self._checkpointer.RestoreFromPath(checkpoint_path=path)

      global_step = self._model.global_step.numpy()
      # Skip checkpoints that are too early to be worth decoding.
      if global_step < self._task.params.eval.start_decoder_after:
        return

      if self._task.input.params.resettable:
        tf.logging.info('Resetting input_generator.')
        self._task.input_generator.Reset()

      dec_metrics = self._task.CreateDecoderMetrics()
      if not dec_metrics:
        tf.logging.info('Empty decoder metrics')
        return
      buffered_decode_out = []
      num_samples_metric = dec_metrics['num_samples_in_batch']

      samples_per_summary = self._task.params.eval.decoder_samples_per_summary
      if samples_per_summary is None:
        samples_per_summary = self._task.params.eval.samples_per_summary
      # samples_per_summary == 0 means "decode the entire input"; that
      # only terminates if the input can raise OutOfRange.
      if samples_per_summary == 0:
        assert self._task.input.params.resettable

      start_time = time.time()
      while samples_per_summary == 0 or (num_samples_metric.total_value <
                                         samples_per_summary):
        try:
          tf.logging.info('Fetching dec_output.')
          fetch_start = time.time()
          # Decoder calls FProp multiple times for each checkpoint. Multiple
          # summaries at the same step is often confusing.  Instead, models
          # should generate aggregate summaries using PostProcessDecodeOut.
          # Other types of summaries (images, audio etc.) will be generated for
          # the first batch only.
          is_first_loop = num_samples_metric.total_value == 0
          decode_fn = (
              self._decode_fn_with_summary
              if is_first_loop else self._decode_fn)
          input_batch, dec_output = decode_fn()

          # Copy CPU-passthrough values from the input batch into the
          # decode output, but never overwrite keys the decoder produced.
          for key in self._task.input_generator.GetCpuPassthroughKeys():
            if key in input_batch:
              if key in dec_output:
                tf.logging.warning(
                    f'Key {key} already present in decode output. '
                    f'Not adding from input batch.')
              else:
                dec_output[key] = input_batch[key]

          # Convert tensors to numpy before post-processing.
          dec_output = py_utils.Transform(lambda x: x.numpy(), dec_output)

          post_process_start = time.time()
          tf.logging.info('Done fetching (%f seconds)' %
                          (post_process_start - fetch_start))
          decode_out = self._task.PostProcessDecodeOut(dec_output, dec_metrics)

          if decode_out:
            if isinstance(decode_out, dict):
              decode_out = decode_out.items()

            if is_first_loop:
              # Add summaries only for the first batch of data.
              with self._summary_writer.as_default():
                for key, value in decode_out:
                  if isinstance(value, tf.Summary):
                    tf.logging.info(f'Adding summary {key} with tags '
                                    f'{[x.tag for x in value.value]}.')
                    tf.compat.v2.summary.experimental.write_raw_pb(
                        tf.constant(value.SerializeToString()), global_step)

            # Summary entries are written above, not buffered for output.
            buffered_decode_out.extend(
                kv for kv in decode_out if not isinstance(kv[1], tf.Summary))

          tf.logging.info(
              'Total examples done: %d/%d '
              '(%f seconds decode postprocess)', num_samples_metric.total_value,
              samples_per_summary,
              time.time() - post_process_start)

        except tf.errors.OutOfRangeError:
          # End of input: only expected for resettable inputs; otherwise
          # exhaustion is an error, so re-raise.
          if not self._task.input.params.resettable:
            raise
          break

      tf.logging.info('Done decoding ckpt: %s', path)

      elapsed_secs = time.time() - start_time
      example_rate = num_samples_metric.total_value / elapsed_secs
      msg = 'step:%6d, elapsed_secs: %0.2f, examples/sec: %0.2f' % (
          global_step, elapsed_secs, example_rate)
      with self._summary_writer.as_default():
        tf.compat.v2.summary.scalar(
            'decode_secs', elapsed_secs, step=global_step)
        tf.compat.v2.summary.scalar(
            'examples/sec', example_rate, step=global_step)
        tf.compat.v2.summary.scalar(
            'total_samples', num_samples_metric.total_value, step=global_step)
        for key, metric in sorted(dec_metrics.items()):
          msg += ' %s:%.8g' % (key, metric.value)
          tf.compat.v2.summary.scalar(key, metric.value, step=global_step)
        self._summary_writer.flush()
      self._SetStatusMessage(msg)

      self._ExportMetrics(
          # Metrics expects python int, but global_step is numpy.int64.
          decode_checkpoint=int(global_step),
          dec_metrics=dec_metrics,
          example_rate=example_rate)

      decode_out_path = self.GetDecodeOutPath(self._decoder_dir, global_step)
      decode_finalize_args = base_model.DecodeFinalizeArgs(
          decode_out_path=decode_out_path, decode_out=buffered_decode_out)
      self._task.DecodeFinalize(decode_finalize_args)
Ejemplo n.º 5
0
    def DecodeCheckpoint(self, sess, checkpoint_path):
        """Decodes `samples_per_summary` examples using `checkpoint_path`.

        Restores the checkpoint into `sess`, runs the decode graph until
        the requested number of examples has been processed (or the
        resettable input is exhausted), writes metric summaries (non-scalar
        graph summaries only for the first batch), exports metrics, and
        finalizes the buffered decode output.

        Args:
          sess: TF session to restore the checkpoint into and run decode
            with.
          checkpoint_path: Path of the checkpoint file to decode.

        Returns:
          None. Returns early when the checkpoint precedes
          `p.eval.start_decoder_after` or when the task has no decoder
          metrics.
        """
        p = self._task.params
        ckpt_id_from_file = self.GetCkptIdFromFile(checkpoint_path)
        # Skip checkpoints that are too early to be worth decoding.
        if ckpt_id_from_file < p.eval.start_decoder_after:
            return

        samples_per_summary = p.eval.decoder_samples_per_summary
        if samples_per_summary is None:
            samples_per_summary = p.eval.samples_per_summary
        # samples_per_summary == 0 means "decode the entire input"; that
        # only terminates if the input can raise OutOfRange.
        if samples_per_summary == 0:
            assert self._task.input.params.resettable
        self.checkpointer.RestoreFromPath(sess, checkpoint_path)

        global_step = sess.run(py_utils.GetGlobalStep())

        if self._task.input.params.resettable:
            tf.logging.info('Resetting input_generator.')
            self._task.input.Reset(sess)

        dec_metrics = self._task.CreateDecoderMetrics()
        if not dec_metrics:
            tf.logging.info('Empty decoder metrics')
            return
        buffered_decode_out = []
        num_examples_metric = dec_metrics['num_samples_in_batch']
        start_time = time.time()
        # Decode until enough examples are processed; with
        # samples_per_summary == 0 the loop only ends via OutOfRangeError.
        while samples_per_summary == 0 or (num_examples_metric.total_value <
                                           samples_per_summary):
            try:
                is_first_loop = num_examples_metric.total_value == 0
                tf.logging.info('Fetching dec_output.')
                fetch_start = time.time()
                run_options = tf.RunOptions(
                    report_tensor_allocations_upon_oom=False)

                # NOTE: We intentionally do not generate scalar summaries by
                # default, because decoder is run  multiple times for each
                # checkpoint. Multiple summaries at the same step is often confusing.
                # Instead, models should generate aggregate summaries using
                # PostProcessDecodeOut. Other types of summaries (images, audio etc.)
                # will be generated for the first eval batch.
                if self._summary_op is not None and is_first_loop:
                    dec_out, summaries = sess.run(
                        [self._dec_output, self._summary_op],
                        options=run_options)
                    summaries = self._RemoveScalarSummaries(summaries)

                    # Add non-scalar summaries only for the first batch of data.
                    self._summary_writer.add_summary(summaries, global_step)
                    self._summary_writer.flush()
                else:
                    dec_out = sess.run(self._dec_output, options=run_options)

                self._RunTF2SummaryOps(sess)
                post_process_start = time.time()
                tf.logging.info('Done fetching (%f seconds)' %
                                (post_process_start - fetch_start))
                decode_out = self._task.PostProcessDecodeOut(
                    dec_out, dec_metrics)
                if decode_out:
                    if isinstance(decode_out, dict):
                        decode_out = decode_out.items()

                    if is_first_loop:
                        # Add summaries only for the first batch of data.
                        for key, value in decode_out:
                            if isinstance(value, tf.Summary):
                                tf.logging.info(
                                    f'Adding summary {key} with tags '
                                    f'{[x.tag for x in value.value]}.')
                                self._summary_writer.add_summary(
                                    value, global_step)
                        self._summary_writer.flush()

                    # Summary entries were written above; buffer the rest.
                    buffered_decode_out.extend(
                        kv for kv in decode_out
                        if not isinstance(kv[1], tf.Summary))
                tf.logging.info(
                    'Total examples done: %d/%d '
                    '(%f seconds decode postprocess)',
                    num_examples_metric.total_value, samples_per_summary,
                    time.time() - post_process_start)
            except tf.errors.OutOfRangeError:
                # End of input: only expected for resettable inputs;
                # otherwise exhaustion is an error, so re-raise.
                if not self._task.input.params.resettable:
                    raise
                break
        tf.logging.info('Done decoding ckpt: %s', checkpoint_path)

        summaries = {k: v.Summary(k) for k, v in dec_metrics.items()}
        elapsed_secs = time.time() - start_time
        example_rate = num_examples_metric.total_value / elapsed_secs
        summaries['examples/sec'] = metrics.CreateScalarSummary(
            'examples/sec', example_rate)
        summaries['total_samples'] = metrics.CreateScalarSummary(
            'total_samples', num_examples_metric.total_value)
        self._WriteSummaries(self._summary_writer,
                             os.path.basename(self._decoder_dir),
                             global_step,
                             summaries,
                             text_filename=os.path.join(
                                 self._decoder_dir,
                                 'score-{:08d}.txt'.format(global_step)))
        self._ExportMetrics(
            # Metrics expects python int, but global_step is numpy.int64.
            decode_checkpoint=int(global_step),
            dec_metrics=dec_metrics,
            example_rate=example_rate)
        # global_step and the checkpoint id from the checkpoint file might be
        # different. For consistency of checkpoint filename and decoder_out
        # file, use the checkpoint id as derived from the checkpoint filename.
        checkpoint_id = _GetCheckpointIdForDecodeOut(ckpt_id_from_file,
                                                     global_step)
        decode_out_path = self.GetDecodeOutPath(self._decoder_dir,
                                                checkpoint_id)

        decode_finalize_args = base_model.DecodeFinalizeArgs(
            decode_out_path=decode_out_path, decode_out=buffered_decode_out)
        self._task.DecodeFinalize(decode_finalize_args)

        if self._should_report_metrics:
            tf.logging.info('Reporting eval measure for step %d.' %
                            global_step)
            self._trial.ReportEvalMeasure(global_step, dec_metrics,
                                          checkpoint_path)