def Run(self, sess):
  """Runs one decode loop of `self._steps_per_loop` steps.

  Infeed runs asynchronously on a thread pool while this thread drains the
  decode metric tensors, post-processes them, and finally writes summaries
  and the decoder output file.

  Args:
    sess: the tf.Session to run ops in.

  Returns:
    False, i.e. this program never requests that training stop.
  """
  global_step = sess.run(self._model.global_step)
  self.SetStatusMessage('Executing decode program at step %d' % global_step)
  # Kick off infeed on the pool; it is joined after the decode loop below.
  infeed_future = self._infeed_pool.apply_async(
      self._InfeedLoop, args=(sess,))
  dec_metrics = self._task.CreateDecoderMetrics()
  start_time = time.time()
  buffered_decode_out = []
  for step in range(self._steps_per_loop):
    metrics_values = sess.run(self.metrics)
    decode_out = self._task.PostProcessDecodeOut(metrics_values, dec_metrics)
    tf.logging.info(
        'step: %d %f' %
        (step, dec_metrics['num_samples_in_batch'].total_value))
    if decode_out:
      buffered_decode_out.extend(decode_out)
  infeed_future.wait()
  num_examples_metric = dec_metrics['num_samples_in_batch']
  elapsed_secs = time.time() - start_time
  example_rate = num_examples_metric.total_value / elapsed_secs
  summaries = {
      name: metric.Summary(name) for name, metric in dec_metrics.items()
  }
  summaries['examples/sec'] = tf.Summary(
      value=[tf.Summary.Value(tag='examples/sec', simple_value=example_rate)])
  self._WriteSummaries(
      os.path.basename(self._program_dir), global_step, summaries)
  decode_out_path = os.path.join(self._program_dir,
                                 'decoder_out_%09d' % global_step)
  self._task.DecodeFinalize(
      base_model.DecodeFinalizeArgs(
          decode_out_path=decode_out_path, decode_out=buffered_decode_out))
  return False
def Run(self, sess):
  """Runs one decode loop and emits MLPerf compliance logging if enabled.

  Same structure as the plain decode program's `Run`: infeed is driven
  asynchronously while this thread drains and post-processes decode
  metrics, then summaries and the decoder output file are written. When
  `self._ml_perf_log` is set, `eval_start`/`eval_stop`/`eval_accuracy`
  markers are printed, and `run_stop` is printed once the configured
  success threshold is exceeded.

  Args:
    sess: the tf.Session to run ops in.
  """
  tf.logging.info('Executing decode program for %s.', self._task_name)
  gsteps = py_utils.GetGlobalStep()
  global_step = sess.run(gsteps)
  if self._ml_perf_log:
    steps_per_epoch = self._ml_perf.steps_per_epoch
    epoch = int(global_step) // steps_per_epoch
    mlp_log.mlperf_print(
        'eval_start', None, metadata={'epoch_num': (epoch + 1)})
  # Kick off infeed on the pool; joined after the decode loop below.
  infeed_future = self._infeed_pool.apply_async(
      self._InfeedLoop, args=(sess,))
  dec_metrics = self._model_task.CreateDecoderMetrics()
  start_time = time.time()
  buffered_decode_out = []
  for i in range(self._steps_per_loop):
    metrics_values = sess.run(self.metrics)
    decode_out = self._model_task.PostProcessDecodeOut(
        metrics_values, dec_metrics)
    tf.logging.info(
        'step: %d %f' % (i, dec_metrics['num_samples_in_batch'].total_value))
    if decode_out:
      buffered_decode_out.extend(decode_out)
  infeed_future.wait()
  if self._ml_perf_log:
    mlp_log.mlperf_print(
        'eval_stop', None, metadata={'epoch_num': (epoch + 1)})
  num_examples_metric = dec_metrics['num_samples_in_batch']
  # Fixed: use dict.items() instead of six.iteritems() — Python-3-only
  # codebase, and this matches the sibling decode program's Run().
  summaries = {k: v.Summary(k) for k, v in dec_metrics.items()}
  elapsed_secs = time.time() - start_time
  example_rate = num_examples_metric.total_value / elapsed_secs
  summaries['examples/sec'] = tf.Summary(
      value=[tf.Summary.Value(tag='examples/sec', simple_value=example_rate)])
  self._WriteSummaries(
      os.path.basename(self._program_dir), global_step, summaries)
  decode_out_path = os.path.join(self._program_dir,
                                 'decoder_out_%09d' % global_step)
  decode_finalize_args = base_model.DecodeFinalizeArgs(
      decode_out_path=decode_out_path, decode_out=buffered_decode_out)
  self._model_task.DecodeFinalize(decode_finalize_args)
  if self._ml_perf_log:
    mlperf_metric = self._ml_perf.decoder_metric_name
    mlperf_metric_value = dec_metrics[mlperf_metric].value
    # NOTE(review): eval_start/eval_stop log `epoch + 1` but eval_accuracy
    # logs `epoch` — kept as-is; confirm against MLPerf logging rules.
    mlp_log.mlperf_print(
        'eval_accuracy', mlperf_metric_value, metadata={'epoch_num': epoch})
    if mlperf_metric_value > self._ml_perf.decoder_metric_success_threshold:
      tf.logging.info('ml_perf_final_threshold: %f exceeded',
                      self._ml_perf.decoder_metric_success_threshold)
      mlp_log.mlperf_print('run_stop', None, metadata={'status': 'success'})
def DecodeCheckpoint(self, sess, checkpoint_path):
  """Decodes `samples_per_summary` examples using `checkpoint_path`.

  Restores the checkpoint, runs the decode graph until the sample budget
  is reached (or the input is exhausted, for resettable inputs), writes
  metric summaries and the decoder output file, and optionally reports
  the eval measure to the tuning trial.

  Args:
    sess: the tf.Session to run ops in.
    checkpoint_path: path of the checkpoint to restore and decode.

  Returns:
    bool, True if the caller should stop (max train steps reached or the
    trial requested a stop), False otherwise.
  """
  p = self._task.params
  ckpt_id_from_file = self.GetCkptIdFromFile(checkpoint_path)
  if ckpt_id_from_file < p.eval.start_decoder_after:
    return False
  samples_per_summary = p.eval.decoder_samples_per_summary
  if samples_per_summary is None:
    samples_per_summary = p.eval.samples_per_summary
  if samples_per_summary == 0:
    # 0 means "decode until OutOfRange", which needs a resettable input.
    assert self._task.params.input.resettable
  self.checkpointer.RestoreFromPath(sess, checkpoint_path)
  global_step = sess.run(py_utils.GetGlobalStep())
  if self._task.params.input.resettable:
    tf.logging.info('Resetting input_generator.')
    self._task.input.Reset(sess)
  dec_metrics = self._task.CreateDecoderMetrics()
  if not dec_metrics:
    tf.logging.info('Empty decoder metrics')
    # Fixed: return an explicit bool; the bare `return` here broke the
    # bool contract (`should_stop`) that every other exit honors.
    return False
  buffered_decode_out = []
  num_examples_metric = dec_metrics['num_samples_in_batch']
  start_time = time.time()
  while samples_per_summary == 0 or (num_examples_metric.total_value <
                                     samples_per_summary):
    try:
      tf.logging.info('Fetching dec_output.')
      fetch_start = time.time()
      run_options = tf.RunOptions(report_tensor_allocations_upon_oom=False)
      if self._summary_op is None:
        # No summaries were collected.
        dec_out = sess.run(self._dec_output, options=run_options)
      else:
        dec_out, summary = sess.run([self._dec_output, self._summary_op],
                                    options=run_options)
        self._summary_writer.add_summary(summary, global_step)
      post_process_start = time.time()
      tf.logging.info('Done fetching (%f seconds)' %
                      (post_process_start - fetch_start))
      decode_out = self._task.PostProcessDecodeOut(dec_out, dec_metrics)
      if decode_out:
        buffered_decode_out.extend(decode_out)
      tf.logging.info(
          'Total examples done: %d/%d '
          '(%f seconds decode postprocess)', num_examples_metric.total_value,
          samples_per_summary,
          time.time() - post_process_start)
    except tf.errors.OutOfRangeError:
      # Exhausting a resettable input ends the decode; otherwise re-raise.
      if not self._task.params.input.resettable:
        raise
      break
  tf.logging.info('Done decoding ckpt: %s', checkpoint_path)
  summaries = {k: v.Summary(k) for k, v in dec_metrics.items()}
  elapsed_secs = time.time() - start_time
  example_rate = num_examples_metric.total_value / elapsed_secs
  summaries['examples/sec'] = metrics.CreateScalarSummary(
      'examples/sec', example_rate)
  summaries['total_samples'] = metrics.CreateScalarSummary(
      'total_samples', num_examples_metric.total_value)
  self._WriteSummaries(
      self._summary_writer,
      os.path.basename(self._decoder_dir),
      global_step,
      summaries,
      text_filename=os.path.join(self._decoder_dir,
                                 'score-{:08d}.txt'.format(global_step)))
  self._ExportMetrics(
      # Metrics expects python int, but global_step is numpy.int64.
      decode_checkpoint=int(global_step),
      dec_metrics=dec_metrics,
      example_rate=example_rate)
  # global_step and the checkpoint id from the checkpoint file might be
  # different. For consistency of checkpoint filename and decoder_out
  # file, use the checkpoint id as derived from the checkpoint filename.
  checkpoint_id = _GetCheckpointIdForDecodeOut(ckpt_id_from_file, global_step)
  decode_out_path = self.GetDecodeOutPath(self._decoder_dir, checkpoint_id)
  decode_finalize_args = base_model.DecodeFinalizeArgs(
      decode_out_path=decode_out_path, decode_out=buffered_decode_out)
  self._task.DecodeFinalize(decode_finalize_args)
  should_stop = global_step >= self.params.train.max_steps
  if self._should_report_metrics:
    tf.logging.info('Reporting eval measure for step %d.' % global_step)
    trial_should_stop = self._trial.ReportEvalMeasure(global_step, dec_metrics,
                                                      checkpoint_path)
    should_stop = should_stop or trial_should_stop
  return should_stop
def _DecodeOnce(self, sess=None, path=''):
  """Decode a single checkpoint.

  Restores the checkpoint at `path`, decodes until the configured sample
  budget is reached (or the resettable input is exhausted), then writes
  scalar summaries, status, exported metrics, and the decoder output file.
  Eager/TF2 variant: no session is run; `sess` is unused here.
  """
  with self._cluster:
    # Attempt to restore the checkpoint
    self._checkpointer.RestoreFromPath(checkpoint_path=path)
    global_step = self._model.global_step.numpy()
    # Skip decoding entirely for checkpoints earlier than the configured
    # warm-up step.
    if global_step < self._task.params.eval.start_decoder_after:
      return
    if self._task.input.params.resettable:
      tf.logging.info('Resetting input_generator.')
      self._task.input_generator.Reset()
    dec_metrics = self._task.CreateDecoderMetrics()
    if not dec_metrics:
      tf.logging.info('Empty decoder metrics')
      return
    buffered_decode_out = []
    num_samples_metric = dec_metrics['num_samples_in_batch']
    samples_per_summary = self._task.params.eval.decoder_samples_per_summary
    if samples_per_summary is None:
      samples_per_summary = self._task.params.eval.samples_per_summary
    if samples_per_summary == 0:
      # 0 means "decode the whole input", which requires a resettable input.
      assert self._task.input.params.resettable
    start_time = time.time()
    while samples_per_summary == 0 or (num_samples_metric.total_value <
                                       samples_per_summary):
      try:
        tf.logging.info('Fetching dec_output.')
        fetch_start = time.time()
        # Decoder calls FProp multiple times for each checkpoint. Multiple
        # summaries at the same step is often confusing. Instead, models
        # should generate aggregate summaries using PostProcessDecodeOut.
        # Other types of summaries (images, audio etc.) will be generated for
        # the first batch only.
        is_first_loop = num_samples_metric.total_value == 0
        decode_fn = (
            self._decode_fn_with_summary if is_first_loop else self._decode_fn)
        input_batch, dec_output = decode_fn()
        # Copy host-only (CPU passthrough) fields from the input batch into
        # the decode output, without clobbering keys the decoder produced.
        for key in self._task.input_generator.GetCpuPassthroughKeys():
          if key in input_batch:
            if key in dec_output:
              tf.logging.warning(f'Key {key} already present in decode output. '
                                 f'Not adding from input batch.')
            else:
              dec_output[key] = input_batch[key]
        # Materialize tensors to numpy before host-side post-processing.
        dec_output = py_utils.Transform(lambda x: x.numpy(), dec_output)
        post_process_start = time.time()
        tf.logging.info('Done fetching (%f seconds)' %
                        (post_process_start - fetch_start))
        decode_out = self._task.PostProcessDecodeOut(dec_output, dec_metrics)
        if decode_out:
          if isinstance(decode_out, dict):
            decode_out = decode_out.items()
          if is_first_loop:
            # Add summaries only for the first batch of data.
            with self._summary_writer.as_default():
              for key, value in decode_out:
                if isinstance(value, tf.Summary):
                  tf.logging.info(f'Adding summary {key} with tags '
                                  f'{[x.tag for x in value.value]}.')
                  tf.compat.v2.summary.experimental.write_raw_pb(
                      tf.constant(value.SerializeToString()), global_step)
          # Summary protos are written above, not kept in the decode output.
          buffered_decode_out.extend(
              kv for kv in decode_out if not isinstance(kv[1], tf.Summary))
        tf.logging.info(
            'Total examples done: %d/%d '
            '(%f seconds decode postprocess)', num_samples_metric.total_value,
            samples_per_summary,
            time.time() - post_process_start)
      except tf.errors.OutOfRangeError:
        # Exhausting a resettable input ends the decode; otherwise re-raise.
        if not self._task.input.params.resettable:
          raise
        break
    tf.logging.info('Done decoding ckpt: %s', path)
    elapsed_secs = time.time() - start_time
    example_rate = num_samples_metric.total_value / elapsed_secs
    msg = 'step:%6d, elapsed_secs: %0.2f, examples/sec: %0.2f' % (
        global_step, elapsed_secs, example_rate)
    with self._summary_writer.as_default():
      tf.compat.v2.summary.scalar(
          'decode_secs', elapsed_secs, step=global_step)
      tf.compat.v2.summary.scalar(
          'examples/sec', example_rate, step=global_step)
      tf.compat.v2.summary.scalar(
          'total_samples', num_samples_metric.total_value, step=global_step)
      for key, metric in sorted(dec_metrics.items()):
        msg += ' %s:%.8g' % (key, metric.value)
        tf.compat.v2.summary.scalar(key, metric.value, step=global_step)
    self._summary_writer.flush()
    self._SetStatusMessage(msg)
    self._ExportMetrics(
        # Metrics expects python int, but global_step is numpy.int64.
        decode_checkpoint=int(global_step),
        dec_metrics=dec_metrics,
        example_rate=example_rate)
    decode_out_path = self.GetDecodeOutPath(self._decoder_dir, global_step)
    decode_finalize_args = base_model.DecodeFinalizeArgs(
        decode_out_path=decode_out_path, decode_out=buffered_decode_out)
    self._task.DecodeFinalize(decode_finalize_args)
def DecodeCheckpoint(self, sess, checkpoint_path):
  """Decodes `samples_per_summary` examples using `checkpoint_path`.

  Restores the checkpoint, decodes until the sample budget is reached (or
  a resettable input is exhausted), writes summaries, exports metrics,
  writes the decoder output file, and optionally reports the eval measure
  to the tuning trial.

  Args:
    sess: the tf.Session to run ops in.
    checkpoint_path: path of the checkpoint to restore and decode.
  """
  p = self._task.params
  ckpt_id_from_file = self.GetCkptIdFromFile(checkpoint_path)
  # Skip checkpoints earlier than the configured warm-up step.
  if ckpt_id_from_file < p.eval.start_decoder_after:
    return
  samples_per_summary = p.eval.decoder_samples_per_summary
  if samples_per_summary is None:
    samples_per_summary = p.eval.samples_per_summary
  if samples_per_summary == 0:
    # 0 means "decode the whole input", which requires a resettable input.
    assert self._task.input.params.resettable
  self.checkpointer.RestoreFromPath(sess, checkpoint_path)
  global_step = sess.run(py_utils.GetGlobalStep())
  if self._task.input.params.resettable:
    tf.logging.info('Resetting input_generator.')
    self._task.input.Reset(sess)
  dec_metrics = self._task.CreateDecoderMetrics()
  if not dec_metrics:
    tf.logging.info('Empty decoder metrics')
    return
  buffered_decode_out = []
  num_examples_metric = dec_metrics['num_samples_in_batch']
  start_time = time.time()
  while samples_per_summary == 0 or (num_examples_metric.total_value <
                                     samples_per_summary):
    try:
      is_first_loop = num_examples_metric.total_value == 0
      tf.logging.info('Fetching dec_output.')
      fetch_start = time.time()
      run_options = tf.RunOptions(
          report_tensor_allocations_upon_oom=False)
      # NOTE: We intentionally do not generate scalar summaries by
      # default, because decoder is run multiple times for each
      # checkpoint. Multiple summaries at the same step is often confusing.
      # Instead, models should generate aggregate summaries using
      # PostProcessDecodeOut. Other types of summaries (images, audio etc.)
      # will be generated for the first eval batch.
      if self._summary_op is not None and is_first_loop:
        dec_out, summaries = sess.run(
            [self._dec_output, self._summary_op], options=run_options)
        summaries = self._RemoveScalarSummaries(summaries)
        # Add non-scalar summaries only for the first batch of data.
        self._summary_writer.add_summary(summaries, global_step)
        self._summary_writer.flush()
      else:
        dec_out = sess.run(self._dec_output, options=run_options)
      self._RunTF2SummaryOps(sess)
      post_process_start = time.time()
      tf.logging.info('Done fetching (%f seconds)' %
                      (post_process_start - fetch_start))
      decode_out = self._task.PostProcessDecodeOut(
          dec_out, dec_metrics)
      if decode_out:
        if isinstance(decode_out, dict):
          decode_out = decode_out.items()
        if is_first_loop:
          # Add summaries only for the first batch of data.
          for key, value in decode_out:
            if isinstance(value, tf.Summary):
              tf.logging.info(
                  f'Adding summary {key} with tags '
                  f'{[x.tag for x in value.value]}.')
              self._summary_writer.add_summary(
                  value, global_step)
          self._summary_writer.flush()
        # Summary protos are written above, not kept in the decode output.
        buffered_decode_out.extend(
            kv for kv in decode_out if not isinstance(kv[1], tf.Summary))
      tf.logging.info(
          'Total examples done: %d/%d '
          '(%f seconds decode postprocess)', num_examples_metric.total_value,
          samples_per_summary,
          time.time() - post_process_start)
    except tf.errors.OutOfRangeError:
      # Exhausting a resettable input ends the decode; otherwise re-raise.
      if not self._task.input.params.resettable:
        raise
      break
  tf.logging.info('Done decoding ckpt: %s', checkpoint_path)
  summaries = {k: v.Summary(k) for k, v in dec_metrics.items()}
  elapsed_secs = time.time() - start_time
  example_rate = num_examples_metric.total_value / elapsed_secs
  summaries['examples/sec'] = metrics.CreateScalarSummary(
      'examples/sec', example_rate)
  summaries['total_samples'] = metrics.CreateScalarSummary(
      'total_samples', num_examples_metric.total_value)
  self._WriteSummaries(self._summary_writer,
                       os.path.basename(self._decoder_dir),
                       global_step,
                       summaries,
                       text_filename=os.path.join(
                           self._decoder_dir,
                           'score-{:08d}.txt'.format(global_step)))
  self._ExportMetrics(
      # Metrics expects python int, but global_step is numpy.int64.
      decode_checkpoint=int(global_step),
      dec_metrics=dec_metrics,
      example_rate=example_rate)
  # global_step and the checkpoint id from the checkpoint file might be
  # different. For consistency of checkpoint filename and decoder_out
  # file, use the checkpoint id as derived from the checkpoint filename.
  checkpoint_id = _GetCheckpointIdForDecodeOut(ckpt_id_from_file, global_step)
  decode_out_path = self.GetDecodeOutPath(self._decoder_dir, checkpoint_id)
  decode_finalize_args = base_model.DecodeFinalizeArgs(
      decode_out_path=decode_out_path, decode_out=buffered_decode_out)
  self._task.DecodeFinalize(decode_finalize_args)
  if self._should_report_metrics:
    tf.logging.info('Reporting eval measure for step %d.' % global_step)
    self._trial.ReportEvalMeasure(global_step, dec_metrics, checkpoint_path)