Example 1
    def test_log_multiple_metrics(self):
        log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
        log = logger.BenchmarkLogger(log_dir)
        log.log_metric("accuracy",
                       0.999,
                       global_step=1e4,
                       extras={"name": "value"})
        log.log_metric("loss", 0.02, global_step=1e4)

        metric_log = os.path.join(log_dir, "metric.log")
        self.assertTrue(tf.gfile.Exists(metric_log))
        with tf.gfile.GFile(metric_log) as f:
            accuracy = json.loads(f.readline())
            self.assertEqual(accuracy["name"], "accuracy")
            self.assertEqual(accuracy["value"], 0.999)
            self.assertEqual(accuracy["unit"], None)
            self.assertEqual(accuracy["global_step"], 1e4)
            self.assertEqual(accuracy["extras"], [{
                "name": "name",
                "value": "value"
            }])

            loss = json.loads(f.readline())
            self.assertEqual(loss["name"], "loss")
            self.assertEqual(loss["value"], 0.02)
            self.assertEqual(loss["unit"], None)
            self.assertEqual(loss["global_step"], 1e4)
            self.assertEqual(loss["extras"], [])
Example 2
    def test_log_non_number_value(self):
        log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
        log = logger.BenchmarkLogger(log_dir)
        const = tf.constant(1)
        log.log_metric("accuracy", const)

        metric_log = os.path.join(log_dir, "metric.log")
        self.assertFalse(tf.gfile.Exists(metric_log))
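
This test passes a tf.Tensor where a plain number is expected and asserts that no metric.log is created, i.e. non-numeric values are silently skipped. A guard consistent with that behavior might look like the sketch below (an assumption about the implementation, not the library's actual code):

import numbers

def _is_loggable_metric_value(value):
    # Only plain Python numbers are written; tensors and other objects
    # are dropped without creating the log file.
    return isinstance(value, numbers.Number)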
Example 3
    def test_log_evaluation_result_with_invalid_type(self):
        eval_result = "{'loss': 0.46237424, 'global_step': 207082}"
        log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
        log = logger.BenchmarkLogger(log_dir)
        log.log_estimator_evaluation_result(eval_result)

        metric_log = os.path.join(log_dir, "metric.log")
        self.assertFalse(tf.gfile.Exists(metric_log))
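
Here the evaluation result is a string representation of a dict rather than a dict, and the logger is expected to write nothing. A type check consistent with this test (a hypothetical sketch; the real method may validate more) could be:

def _is_valid_eval_result(eval_results):
    # Reject anything that is not a real dict; Example 5 below suggests a
    # "global_step" entry is also expected in valid results.
    return isinstance(eval_results, dict) and "global_step" in eval_results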
Example 4
    def __init__(self,
                 tensors,
                 log_dir=None,
                 metric_logger=None,
                 every_n_iter=None,
                 every_n_secs=None,
                 at_end=False):
        """Initializer for LoggingMetricHook.

    Args:
      tensors: `dict` that maps string-valued tags to tensors/tensor names,
          or `iterable` of tensors/tensor names.
      log_dir: `string`, directory path that metric hook should write log to.
      metric_logger: instance of `BenchmarkLogger`, the benchmark logger that
          hook should use to write the log. Exactly one of the `log_dir` and
          `metric_logger` should be provided.
      every_n_iter: `int`, print the values of `tensors` once every N local
          steps taken on the current worker.
      every_n_secs: `int` or `float`, print the values of `tensors` once every N
          seconds. Exactly one of `every_n_iter` and `every_n_secs` should be
          provided.
      at_end: `bool` specifying whether to print the values of `tensors` at the
          end of the run.

    Raises:
      ValueError:
        1. `every_n_iter` is non-positive, or
        2. Exactly one of every_n_iter and every_n_secs should be provided.
        3. Exactly one of log_dir and metric_logger should be provided.
    """
        super(LoggingMetricHook, self).__init__(tensors=tensors,
                                                every_n_iter=every_n_iter,
                                                every_n_secs=every_n_secs,
                                                at_end=at_end)

        if (log_dir is None) == (metric_logger is None):
            raise ValueError(
                "exactly one of log_dir and metric_logger should be provided.")

        if log_dir is not None:
            self._logger = logger.BenchmarkLogger(log_dir)
        else:
            self._logger = metric_logger
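
A minimal usage sketch for this hook, assuming a graph tensor named "cross_entropy" exists and using an illustrative log directory; the estimator call is shown only as a comment:

hook = LoggingMetricHook(
    tensors={"cross_entropy": "cross_entropy"},
    log_dir="/tmp/benchmark_logs",
    every_n_iter=100)

# estimator.train(input_fn=train_input_fn, hooks=[hook])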
Example 5
  def test_log_evaluation_result(self):
    eval_result = {"loss": 0.46237424,
                   "global_step": 207082,
                   "accuracy": 0.9285}
    log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
    log = logger.BenchmarkLogger(log_dir)
    log.log_estimator_evaluation_result(eval_result)

    metric_log = os.path.join(log_dir, "metric.log")
    self.assertTrue(tf.gfile.Exists(metric_log))
    with tf.gfile.GFile(metric_log) as f:
      accuracy = json.loads(f.readline())
      self.assertEqual(accuracy["name"], "accuracy")
      self.assertEqual(accuracy["value"], 0.9285)
      self.assertEqual(accuracy["unit"], None)
      self.assertEqual(accuracy["global_step"], 207082)

      loss = json.loads(f.readline())
      self.assertEqual(loss["name"], "loss")
      self.assertEqual(loss["value"], 0.46237424)
      self.assertEqual(loss["unit"], None)
      self.assertEqual(loss["global_step"], 207082)
Example 6
def resnet_main(flags, model_function, input_function, shape=None):
    """Shared main loop for ResNet Models.

  Args:
    flags: FLAGS object that contains the params for running. See
      ResnetArgParser for created flags.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags.export_dir is passed.
  """

    # Using the Winograd non-fused algorithms provides a small performance boost.
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    # Create session config based on values of inter_op_parallelism_threads and
    # intra_op_parallelism_threads. Note that we default to having
    # allow_soft_placement = True, which is required for multi-GPU and not
    # harmful for other modes.
    session_config = tf.ConfigProto(
        inter_op_parallelism_threads=flags.inter_op_parallelism_threads,
        intra_op_parallelism_threads=flags.intra_op_parallelism_threads,
        allow_soft_placement=True)

    if ALLOW_MULTIPLE_MODELS:
        session_config.gpu_options.allow_growth = True

    # Set up a RunConfig to save checkpoint and set session config.
    run_config = tf.estimator.RunConfig().replace(
        save_checkpoints_secs=5 * 60,  # Save checkpoints every X minutes.
        keep_checkpoint_max=1000,  # Retain the 1000 most recent checkpoints.
        #tf_random_seed = 5739,         # Set random seed for "reproducible" results
        save_summary_steps=10000,  # Number of steps between summaries
        session_config=session_config)

    classifier = tf.estimator.Estimator(
        model_fn=model_function,
        model_dir=flags.model_dir,
        config=run_config,
        params={
            'resnet_size': flags.resnet_size,
            'data_format': flags.data_format,
            'batch_size': flags.batch_size,
            'multi_gpu': flags.multi_gpu,
            'version': flags.version,
            'ncmmethod': flags.ncmmethod,
            'ncmparam': flags.ncmparam,
            'initial_learning_scale': flags.initial_learning_scale,
        })

    if flags.benchmark_log_dir is not None:
        benchmark_logger = logger.BenchmarkLogger(flags.benchmark_log_dir)
        benchmark_logger.log_run_info("resnet")
    else:
        benchmark_logger = None

    for _ in range(flags.train_epochs // flags.epochs_between_evals):
        train_hooks = hooks_helper.get_train_hooks(
            flags.hooks,
            batch_size=flags.batch_size,
            benchmark_log_dir=flags.benchmark_log_dir)
        #tensors_to_log = {"iter": "m_iter","deep-cnt": "m_cnt", "deep-sum": "m_sum"}
        #logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=1)

        print('Starting a training cycle.')

        def input_fn_train():
            return input_function(True, flags.data_dir, flags.batch_size,
                                  flags.epochs_between_evals,
                                  flags.num_parallel_calls, flags.multi_gpu)

        classifier.train(input_fn=input_fn_train,
                         hooks=train_hooks,
                         max_steps=flags.max_train_steps)

        print('Starting to evaluate.')

        # Evaluate the model and print results
        def input_fn_eval():
            return input_function(False, flags.data_dir, flags.batch_size, 1,
                                  flags.num_parallel_calls, flags.multi_gpu)

        # flags.max_train_steps is generally associated with testing and profiling.
        # As a result it is frequently called with synthetic data, which will
        # iterate forever. Passing steps=flags.max_train_steps allows the eval
        # (which is generally unimportant in those circumstances) to terminate.
        # Note that eval will run for max_train_steps each loop, regardless of the
        # global_step count.
        eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                           steps=flags.max_train_steps)
        print(eval_results)

        if benchmark_logger:
            benchmark_logger.log_estimator_evaluation_result(eval_results)

        if flags.export_dir is not None:
            # Exports a saved model for the given classifier.
            input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
                shape, batch_size=flags.batch_size)
            classifier.export_savedmodel(flags.export_dir, input_receiver_fn)
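
A hypothetical driver for resnet_main, shown only to illustrate the call shape; ResnetArgParser is named in the docstring above, while my_model_fn, my_input_fn, and the 32x32x3 image shape are placeholder assumptions:

def main(argv):
    parser = ResnetArgParser()
    flags = parser.parse_args(args=argv[1:])
    resnet_main(flags, my_model_fn, my_input_fn, shape=[32, 32, 3])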
Example 7
    def test_create_logging_dir(self):
        non_exist_temp_dir = os.path.join(self.get_temp_dir(), "unknown_dir")
        self.assertFalse(tf.gfile.IsDirectory(non_exist_temp_dir))

        logger.BenchmarkLogger(non_exist_temp_dir)
        self.assertTrue(tf.gfile.IsDirectory(non_exist_temp_dir))
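
The test shows that constructing a BenchmarkLogger creates the logging directory when it does not already exist. Equivalent behavior, sketched with the tf.gfile calls already used in these tests (an assumption about the constructor, not its actual code):

def _ensure_logging_dir(logging_dir):
    # Create the directory if it is missing, mirroring what the test expects
    # from BenchmarkLogger(non_exist_temp_dir).
    if not tf.gfile.IsDirectory(logging_dir):
        tf.gfile.MakeDirs(logging_dir)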
Example 8
def resnet_main(flags, model_function, input_function):
    """Shared main loop for ResNet Models."""

    # Using the Winograd non-fused algorithms provides a small performance boost.
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    if flags.multi_gpu:
        validate_batch_size_for_multi_gpu(flags.batch_size)

        # There are two steps required if using multi-GPU: (1) wrap the model_fn,
        # and (2) wrap the optimizer. The first happens here, and (2) happens
        # in the model_fn itself when the optimizer is defined.
        model_function = tf.contrib.estimator.replicate_model_fn(
            model_function, loss_reduction=tf.losses.Reduction.MEAN)

    # Create session config based on values of inter_op_parallelism_threads and
    # intra_op_parallelism_threads. Note that we default to having
    # allow_soft_placement = True, which is required for multi-GPU and not
    # harmful for other modes.
    session_config = tf.ConfigProto(
        inter_op_parallelism_threads=flags.inter_op_parallelism_threads,
        intra_op_parallelism_threads=flags.intra_op_parallelism_threads,
        allow_soft_placement=True)

    # Set up a RunConfig to save checkpoint and set session config.
    run_config = tf.estimator.RunConfig().replace(
        save_checkpoints_secs=1e9, session_config=session_config)
    classifier = tf.estimator.Estimator(model_fn=model_function,
                                        model_dir=flags.model_dir,
                                        config=run_config,
                                        params={
                                            'resnet_size': flags.resnet_size,
                                            'data_format': flags.data_format,
                                            'batch_size': flags.batch_size,
                                            'multi_gpu': flags.multi_gpu,
                                            'version': flags.version,
                                        })

    if flags.benchmark_log_dir is not None:
        benchmark_logger = logger.BenchmarkLogger(flags.benchmark_log_dir)
        benchmark_logger.log_run_info("resnet")
    else:
        benchmark_logger = None

    for _ in range(flags.train_epochs // flags.epochs_between_evals):
        train_hooks = hooks_helper.get_train_hooks(
            flags.hooks,
            batch_size=flags.batch_size,
            benchmark_log_dir=flags.benchmark_log_dir)

        print('Starting a training cycle.')

        def input_fn_train():
            return input_function(True, flags.data_dir, flags.batch_size,
                                  flags.epochs_between_evals,
                                  flags.num_parallel_calls, flags.multi_gpu)

        classifier.train(input_fn=input_fn_train,
                         hooks=train_hooks,
                         max_steps=flags.max_train_steps)

        print('Starting to evaluate.')

        # Evaluate the model and print results
        def input_fn_eval():
            return input_function(False, flags.data_dir, flags.batch_size, 1,
                                  flags.num_parallel_calls, flags.multi_gpu)

        # flags.max_train_steps is generally associated with testing and profiling.
        # As a result it is frequently called with synthetic data, which will
        # iterate forever. Passing steps=flags.max_train_steps allows the eval
        # (which is generally unimportant in those circumstances) to terminate.
        # Note that eval will run for max_train_steps each loop, regardless of the
        # global_step count.
        eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                           steps=flags.max_train_steps)
        print(eval_results)

        if benchmark_logger:
            benchmark_logger.log_estimator_evaluation_result(eval_results)
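
This variant also calls validate_batch_size_for_multi_gpu before wrapping the model_fn with tf.contrib.estimator.replicate_model_fn. A check of this kind typically requires the global batch size to split evenly across the visible GPUs; the sketch below is illustrative only and may differ from the real helper:

from tensorflow.python.client import device_lib

def validate_batch_size_for_multi_gpu_sketch(batch_size):
    # Count visible GPUs and require the batch size to divide evenly among them.
    local_devices = device_lib.list_local_devices()
    num_gpus = sum(1 for d in local_devices if d.device_type == "GPU")
    if num_gpus > 0 and batch_size % num_gpus != 0:
        raise ValueError(
            "batch_size (%d) must be divisible by the number of GPUs (%d)" %
            (batch_size, num_gpus))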