Example #1
    def test_log_multiple_metrics(self):
        log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
        log = logger.BenchmarkLogger(log_dir)
        log.log_metric("accuracy",
                       0.999,
                       global_step=1e4,
                       extras={"name": "value"})
        log.log_metric("loss", 0.02, global_step=1e4)

        metric_log = os.path.join(log_dir, "metric.log")
        self.assertTrue(tf.gfile.Exists(metric_log))
        with tf.gfile.GFile(metric_log) as f:
            accuracy = json.loads(f.readline())
            self.assertEqual(accuracy["name"], "accuracy")
            self.assertEqual(accuracy["value"], 0.999)
            self.assertEqual(accuracy["unit"], None)
            self.assertEqual(accuracy["global_step"], 1e4)
            self.assertEqual(accuracy["extras"], [{
                "name": "name",
                "value": "value"
            }])

            loss = json.loads(f.readline())
            self.assertEqual(loss["name"], "loss")
            self.assertEqual(loss["value"], 0.02)
            self.assertEqual(loss["unit"], None)
            self.assertEqual(loss["global_step"], 1e4)
            self.assertEqual(loss["extras"], [])
Example #2
    def test_log_evaluation_result_with_invalid_type(self):
        eval_result = "{'loss': 0.46237424, 'global_step': 207082}"
        log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
        log = logger.BenchmarkLogger(log_dir)
        log.log_estimator_evaluation_result(eval_result)

        metric_log = os.path.join(log_dir, "metric.log")
        self.assertFalse(tf.gfile.Exists(metric_log))
Example #3
    def test_log_non_number_value(self):
        log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
        log = logger.BenchmarkLogger(log_dir)
        const = tf.constant(1)
        log.log_metric("accuracy", const)

        metric_log = os.path.join(log_dir, "metric.log")
        self.assertFalse(tf.gfile.Exists(metric_log))
Example #4
    def __init__(self,
                 tensors,
                 log_dir=None,
                 metric_logger=None,
                 every_n_iter=None,
                 every_n_secs=None,
                 at_end=False):
        """Initializer for LoggingMetricHook.

        Args:
          tensors: `dict` that maps string-valued tags to tensors/tensor names,
              or `iterable` of tensors/tensor names.
          log_dir: `string`, directory path that metric hook should write log to.
          metric_logger: instance of `BenchmarkLogger`, the benchmark logger that
              hook should use to write the log. Exactly one of the `log_dir` and
              `metric_logger` should be provided.
          every_n_iter: `int`, print the values of `tensors` once every N local
              steps taken on the current worker.
          every_n_secs: `int` or `float`, print the values of `tensors` once every N
              seconds. Exactly one of `every_n_iter` and `every_n_secs` should be
              provided.
          at_end: `bool` specifying whether to print the values of `tensors` at the
              end of the run.

        Raises:
          ValueError: if
            1. `every_n_iter` is non-positive, or
            2. neither or both of `every_n_iter` and `every_n_secs` are
               provided, or
            3. neither or both of `log_dir` and `metric_logger` are provided.
        """
        super(LoggingMetricHook, self).__init__(tensors=tensors,
                                                every_n_iter=every_n_iter,
                                                every_n_secs=every_n_secs,
                                                at_end=at_end)

        if (log_dir is None) == (metric_logger is None):
            raise ValueError(
                "exactly one of log_dir and metric_logger should be provided.")

        if log_dir is not None:
            self._logger = logger.BenchmarkLogger(log_dir)
        else:
            self._logger = metric_logger
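
A hedged usage sketch for this hook: the super() call suggests it behaves like tf.train.LoggingTensorHook, so construction looks the same apart from the log_dir/metric_logger pair, of which exactly one must be given. The tensor name "cross_entropy" and the paths below are placeholders, not values from the source:

# Option 1: let the hook build its own BenchmarkLogger from a directory.
hook = LoggingMetricHook(
    tensors={"loss": "cross_entropy"},
    log_dir="/tmp/benchmark_logs",
    every_n_iter=100)

# Option 2: reuse an existing BenchmarkLogger instance instead.
metric_logger = logger.BenchmarkLogger("/tmp/benchmark_logs")
hook = LoggingMetricHook(
    tensors={"loss": "cross_entropy"},
    metric_logger=metric_logger,
    every_n_secs=60)

# Passing both (or neither) of log_dir and metric_logger raises ValueError,
# as does passing both (or neither) of every_n_iter and every_n_secs.
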
Example #5
    def test_log_evaluation_result(self):
        eval_result = {
            "loss": 0.46237424,
            "global_step": 207082,
            "accuracy": 0.9285
        }
        log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
        log = logger.BenchmarkLogger(log_dir)
        log.log_estimator_evaluation_result(eval_result)

        metric_log = os.path.join(log_dir, "metric.log")
        self.assertTrue(tf.gfile.Exists(metric_log))
        with tf.gfile.GFile(metric_log) as f:
            accuracy = json.loads(f.readline())
            self.assertEqual(accuracy["name"], "accuracy")
            self.assertEqual(accuracy["value"], 0.9285)
            self.assertEqual(accuracy["unit"], None)
            self.assertEqual(accuracy["global_step"], 207082)

            loss = json.loads(f.readline())
            self.assertEqual(loss["name"], "loss")
            self.assertEqual(loss["value"], 0.46237424)
            self.assertEqual(loss["unit"], None)
            self.assertEqual(loss["global_step"], 207082)
Example #6
    def test_create_logging_dir(self):
        non_exist_temp_dir = os.path.join(self.get_temp_dir(), "unknown_dir")
        self.assertFalse(tf.gfile.IsDirectory(non_exist_temp_dir))

        logger.BenchmarkLogger(non_exist_temp_dir)
        self.assertTrue(tf.gfile.IsDirectory(non_exist_temp_dir))
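
This test implies the constructor creates the logging directory on demand. A plausible constructor, hedged as an assumption and written with the same tf.gfile API the tests use:

import tensorflow as tf


class BenchmarkLoggerSketch(object):
    """Illustrative constructor only; not the library's implementation."""

    def __init__(self, logging_dir):
        self._logging_dir = logging_dir
        # Create the directory (and any missing parents) on first use.
        if not tf.gfile.IsDirectory(self._logging_dir):
            tf.gfile.MakeDirs(self._logging_dir)
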
Example #7
def resnet_main(flags, model_function, input_function, shape=None):
    """Shared main loop for ResNet Models.

    Args:
      flags: FLAGS object that contains the params for running. See
          ResnetArgParser for created flags.
      model_function: the function that instantiates the Model and builds the
          ops for train/eval. This will be passed directly into the estimator.
      input_function: the function that processes the dataset and returns a
          dataset that the estimator can train on. This will be wrapped with
          all the relevant flags for running and passed to estimator.
      shape: list of ints representing the shape of the images used for training.
          This is only used if flags.export_dir is passed.
    """

    # Using the Winograd non-fused algorithms provides a small performance boost.
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    if flags.multi_gpu:
        validate_batch_size_for_multi_gpu(flags.batch_size)

        # There are two steps required if using multi-GPU: (1) wrap the model_fn,
        # and (2) wrap the optimizer. The first happens here, and (2) happens
        # in the model_fn itself when the optimizer is defined.
        model_function = tf.contrib.estimator.replicate_model_fn(
            model_function, loss_reduction=tf.losses.Reduction.MEAN)

    # Create session config based on values of inter_op_parallelism_threads and
    # intra_op_parallelism_threads. Note that we default to having
    # allow_soft_placement = True, which is required for multi-GPU and not
    # harmful for other modes.
    session_config = tf.ConfigProto(
        inter_op_parallelism_threads=flags.inter_op_parallelism_threads,
        intra_op_parallelism_threads=flags.intra_op_parallelism_threads,
        allow_soft_placement=True)

    # Set up a RunConfig to save checkpoint and set session config.
    run_config = tf.estimator.RunConfig().replace(
        save_checkpoints_secs=1e9, session_config=session_config)
    classifier = tf.estimator.Estimator(model_fn=model_function,
                                        model_dir=flags.model_dir,
                                        config=run_config,
                                        params={
                                            'resnet_size': flags.resnet_size,
                                            'data_format': flags.data_format,
                                            'batch_size': flags.batch_size,
                                            'multi_gpu': flags.multi_gpu,
                                            'version': flags.version,
                                            'loss_scale': flags.loss_scale,
                                            'dtype': flags.dtype
                                        })

    if flags.benchmark_log_dir is not None:
        benchmark_logger = logger.BenchmarkLogger(flags.benchmark_log_dir)
        benchmark_logger.log_run_info('resnet')
    else:
        benchmark_logger = None

    for _ in range(flags.train_epochs // flags.epochs_between_evals):
        train_hooks = hooks_helper.get_train_hooks(
            flags.hooks,
            batch_size=flags.batch_size,
            benchmark_log_dir=flags.benchmark_log_dir)

        print('Starting a training cycle.')

        def input_fn_train():
            return input_function(True, flags.data_dir, flags.batch_size,
                                  flags.epochs_between_evals,
                                  flags.num_parallel_calls, flags.multi_gpu)

        classifier.train(input_fn=input_fn_train,
                         hooks=train_hooks,
                         max_steps=flags.max_train_steps)

        print('Starting to evaluate.')

        # Evaluate the model and print results
        def input_fn_eval():
            return input_function(False, flags.data_dir, flags.batch_size, 1,
                                  flags.num_parallel_calls, flags.multi_gpu)

        # flags.max_train_steps is generally associated with testing and profiling.
        # As a result it is frequently called with synthetic data, which will
        # iterate forever. Passing steps=flags.max_train_steps allows the eval
        # (which is generally unimportant in those circumstances) to terminate.
        # Note that eval will run for max_train_steps each loop, regardless of the
        # global_step count.
        eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                           steps=flags.max_train_steps)
        print(eval_results)

        if benchmark_logger:
            benchmark_logger.log_estimator_evaluation_result(eval_results)

        if model_helpers.past_stop_threshold(flags.stop_threshold,
                                             eval_results['accuracy']):
            break

    if flags.export_dir is not None:
        warn_on_multi_gpu_export(flags.multi_gpu)

        # Exports a saved model for the given classifier.
        input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
            shape, batch_size=flags.batch_size)
        classifier.export_savedmodel(flags.export_dir, input_receiver_fn)
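
The docstring above points to ResnetArgParser for flag definitions; a hedged sketch of how a model script might drive this loop (the CIFAR-10 names and image shape are illustrative, not taken from the source):

import sys

import tensorflow as tf


def main(argv):
    parser = ResnetArgParser()  # defines --model_dir, --batch_size, etc.
    flags = parser.parse_args(args=argv[1:])
    # model_fn builds the graph ops, input_fn builds the tf.data pipeline;
    # both are passed straight through to the Estimator by resnet_main.
    resnet_main(flags, cifar10_model_fn, cifar10_input_fn, shape=[32, 32, 3])


if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)
    main(argv=sys.argv)
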
Example #8
def resnet_main(seed, flags, model_function, input_function, shape=None):
    """Shared main loop for ResNet Models.

    Args:
      seed: `int`, random seed passed to the `RunConfig` as `tf_random_seed`
          and reported via `mlperf_log`.
      flags: FLAGS object that contains the params for running. See
          ResnetArgParser for created flags.
      model_function: the function that instantiates the Model and builds the
          ops for train/eval. This will be passed directly into the estimator.
      input_function: the function that processes the dataset and returns a
          dataset that the estimator can train on. This will be wrapped with
          all the relevant flags for running and passed to estimator.
      shape: list of ints representing the shape of the images used for training.
          This is only used if flags.export_dir is passed.
    """

    mlperf_log.resnet_print(key=mlperf_log.RUN_START)

    # Using the Winograd non-fused algorithms provides a small performance boost.
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    # Create session config based on values of inter_op_parallelism_threads and
    # intra_op_parallelism_threads. Note that we default to having
    # allow_soft_placement = True, which is required for multi-GPU and not
    # harmful for other modes.
    session_config = tf.ConfigProto(
        inter_op_parallelism_threads=flags.inter_op_parallelism_threads,
        intra_op_parallelism_threads=flags.intra_op_parallelism_threads,
        allow_soft_placement=True)

    if flags.num_gpus == 0:
        distribution = tf.contrib.distribute.OneDeviceStrategy('device:CPU:0')
    elif flags.num_gpus == 1:
        distribution = tf.contrib.distribute.OneDeviceStrategy('device:GPU:0')
    else:
        distribution = tf.contrib.distribute.MirroredStrategy(
            num_gpus=flags.num_gpus)

    mlperf_log.resnet_print(key=mlperf_log.RUN_SET_RANDOM_SEED, value=seed)
    run_config = tf.estimator.RunConfig(train_distribute=distribution,
                                        save_summary_steps=2000,
                                        save_checkpoints_steps=1000,
                                        session_config=session_config,
                                        tf_random_seed=seed,
                                        keep_checkpoint_max=2)

    mlperf_log.resnet_print(key=mlperf_log.INPUT_BATCH_SIZE,
                            value=flags.batch_size)

    classifier = tf.estimator.Estimator(model_fn=model_function,
                                        model_dir=flags.model_dir,
                                        config=run_config,
                                        params={
                                            'resnet_size': flags.resnet_size,
                                            'final_size': flags.final_size,
                                            'pickle_model': flags.pickle_model,
                                            'random_init': flags.random_init,
                                            'data_format': flags.data_format,
                                            'batch_size': flags.batch_size,
                                            'train_epochs': flags.train_epochs,
                                            'version': flags.version,
                                            'version_t': flags.version_t,
                                            'loss_scale': flags.loss_scale,
                                            'gap_train': flags.gap_train,
                                            'gap_lambda': flags.gap_lambda,
                                            'gap_ft': flags.gap_ft,
                                            'gap_start': flags.gap_start,
                                            'dtype': flags.dtype,
                                            'learn_rate': flags.learn_rate,
                                            'label_smoothing':
                                            flags.label_smoothing,
                                            'enable_lars': flags.enable_lars,
                                            'enable_cos': flags.enable_cos,
                                            'cos_alpha': flags.cos_alpha,
                                            'warm_up': flags.warm_up,
                                            'weight_decay': flags.weight_decay,
                                            'fine_tune': flags.fine_tune,
                                            'enable_kd': flags.enable_kd,
                                            'kd_size': flags.kd_size,
                                            'temp_dst': flags.temp_dst,
                                            'w_dst': flags.w_dst,
                                            'mix_up': flags.mix_up,
                                            'mx_mode': flags.mx_mode,
                                            'enable_quantize':
                                            flags.enable_quantize,
                                            'online_quantize':
                                            flags.online_quantize,
                                            'enable_at': flags.enable_at,
                                            'w_at': flags.w_at,
                                        })

    if flags.benchmark_log_dir is not None:
        benchmark_logger = logger.BenchmarkLogger(flags.benchmark_log_dir)
        benchmark_logger.log_run_info('resnet')
    else:
        benchmark_logger = None

    mlperf_log.resnet_print(key=mlperf_log.TRAIN_LOOP)

    # The reference performs the first evaluation on the fourth epoch. (offset
    # eval by 3 epochs)
    mlperf_log.resnet_print(key=mlperf_log.EVAL_EPOCH_OFFSET, value=3)
    success = False
    print('Training epochs: {}'.format(flags.train_epochs))
    iter_train_epochs = flags.train_epochs
    for i in range(iter_train_epochs // flags.epochs_between_evals):
        # Data for epochs_between_evals (i.e. 4 epochs between evals) worth of
        # epochs is concatenated and run as a single block inside a session. For
        # this reason we declare all of the epochs that will be run at the start.
        # Submitters may report in a way which is reasonable for their control flow.
        for j in range(flags.epochs_between_evals):
            mlperf_log.resnet_print(key=mlperf_log.TRAIN_EPOCH,
                                    value=i * flags.epochs_between_evals + j)

        # input functions
        def input_fn_eval():
            return input_function(is_training=False,
                                  data_dir=flags.data_dir,
                                  batch_size=per_device_batch_size(
                                      flags.batch_size, flags.num_gpus),
                                  num_epochs=1,
                                  dtype=flags.dtype,
                                  oss_load=flags.oss_load)

        def input_fn_train():
            return input_function(is_training=True,
                                  data_dir=flags.data_dir,
                                  batch_size=per_device_batch_size(
                                      flags.batch_size, flags.num_gpus),
                                  num_epochs=flags.epochs_between_evals,
                                  num_gpus=flags.num_gpus,
                                  dtype=flags.dtype,
                                  mix_up=flags.mix_up,
                                  oss_load=flags.oss_load)

        # hooks for training
        train_hooks = hooks_helper.get_train_hooks(
            flags.hooks,
            batch_size=flags.batch_size,
            benchmark_log_dir=flags.benchmark_log_dir)

        _log_cache = []

        def formatter(x):
            """Abuse side effects to get tensors out of the model_fn."""
            if _log_cache:
                _log_cache.pop()
            _log_cache.append(x.copy())
            return str(x)

        compliance_hook = tf.train.LoggingTensorHook(
            tensors={_NUM_EXAMPLES_NAME: _NUM_EXAMPLES_NAME},
            every_n_iter=int(1e10),
            at_end=True,
            formatter=formatter)

        extra_hooks = [compliance_hook]
        if flags.enable_quantize:
            if flags.online_quantize:
                # online calculate the KL scale before train-eval
                quant_online_hook = QuantHook(bits=flags.q_bits,
                                              online=True,
                                              quant_mode=flags.q_mode)
                eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                                   steps=1,
                                                   hooks=[quant_online_hook])

            quant_train_hook = QuantHook(bits=flags.q_bits,
                                         quant_copy_num=flags.copy_num,
                                         quant_mode=flags.q_mode)
            extra_hooks.append(quant_train_hook)

        print('Starting a training cycle.')
        classifier.train(input_fn=input_fn_train,
                         hooks=train_hooks + extra_hooks,
                         max_steps=flags.max_train_steps)

        train_examples = int(_log_cache.pop()[_NUM_EXAMPLES_NAME])
        mlperf_log.resnet_print(key=mlperf_log.INPUT_SIZE,
                                value=train_examples)

        # Evaluate the model and print results
        print('Starting to evaluate.')
        mlperf_log.resnet_print(key=mlperf_log.EVAL_START)
        # flags.max_train_steps is generally associated with testing and profiling.
        # As a result it is frequently called with synthetic data, which will
        # iterate forever. Passing steps=flags.max_train_steps allows the eval
        # (which is generally unimportant in those circumstances) to terminate.
        # Note that eval will run for max_train_steps each loop, regardless of the
        # global_step count.
        eval_hooks = None
        if flags.enable_quantize:
            quant_eval_hook = QuantHook(bits=flags.q_bits,
                                        quant_mode=flags.q_mode)
            eval_hooks = [quant_eval_hook]
        eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                           steps=flags.max_train_steps,
                                           hooks=eval_hooks)
        mlperf_log.resnet_print(key=mlperf_log.EVAL_STOP)
        mlperf_log.resnet_print(key=mlperf_log.EVAL_SIZE,
                                value=int(eval_results[_NUM_EXAMPLES_NAME]))
        mlperf_log.resnet_print(key=mlperf_log.EVAL_ACCURACY,
                                value=float(eval_results['accuracy']))
        mlperf_log.resnet_print(key=mlperf_log.EVAL_TARGET,
                                value=flags.stop_threshold)
        print(eval_results)

        if benchmark_logger:
            benchmark_logger.log_estimator_evaluation_result(eval_results)

        if model_helpers.past_stop_threshold(flags.stop_threshold,
                                             eval_results['accuracy']):
            success = True
            break

    mlperf_log.resnet_print(key=mlperf_log.RUN_STOP,
                            value={"success": success})
    mlperf_log.resnet_print(key=mlperf_log.RUN_FINAL)
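
Example #8 splits the global batch across replicas with per_device_batch_size before building its input pipelines. A minimal sketch of that helper, assuming it simply divides the batch evenly and rejects indivisible batch sizes:

def per_device_batch_size(batch_size, num_gpus):
    """Per-replica batch size for MirroredStrategy input pipelines."""
    if num_gpus <= 1:
        return batch_size
    remainder = batch_size % num_gpus
    if remainder:
        raise ValueError(
            'Batch size {} is not divisible by the number of GPUs {} '
            '(remainder {}).'.format(batch_size, num_gpus, remainder))
    return batch_size // num_gpus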