예제 #1
0
 def testShiftRatio(self):
   """Checks image_preprocessor.shift_ratio for two worker task indices.

   Four worker hosts are configured; task_index 0 is expected to produce a
   shift_ratio of 0.0 and task_index 3 a shift_ratio of 0.75.
   """
   test_util.monkey_patch_base_cluster_manager()
   fake_data_dir = os.path.join(platforms_util.get_test_data_dir(),
                                'fake_tf_record_data')
   params = benchmark_cnn.make_params(
       data_name='imagenet',
       data_dir=fake_data_dir,
       job_name='worker',
       worker_hosts='w1,w2,w3,w4',
       ps_hosts='p1',
       task_index=0)
   for task_index, expected_ratio in ((0, 0.0), (3, 0.75)):
     params = params._replace(task_index=task_index)
     bench = benchmark_cnn.BenchmarkCNN(params)
     self.assertEqual(bench.image_preprocessor.shift_ratio, expected_ratio)
예제 #2
0
def main(params):
  """Run benchmarks for TensorFlow.

  When `params.ip_list` is set, it is split on commas, every entry is paired
  with port 5001, and the resulting worker cluster description is exported
  through the TF_CONFIG environment variable. `params.all_reduce_spec` is set
  to True in that case and to False otherwise.

  Args:
    params: Mutable parameter object; `ip_list`, `job_index` and `model` are
      read and `all_reduce_spec` is overwritten.
  """
  print("Run benchmarks for TensorFlow.")
  version = utils.get_tensorflow_version()
  print('TensorFlow:  %i.%i' % (version[0], version[1]))

  params.all_reduce_spec = False
  if params.ip_list:
    worker_port = '5001'
    workers = [host + ":" + worker_port for host in params.ip_list.split(',')]
    cluster_config = {
        'cluster': {'worker': workers},
        'task': {'type': 'worker', 'index': params.job_index},
    }
    os.environ["TF_CONFIG"] = json.dumps(cluster_config)
    params.all_reduce_spec = True

  # NLP models get their own benchmark driver; everything else uses the CNN one.
  benchmark_cls = (benchmark_nlp.BenchmarkNLP if params.model in NLP_MODELS
                   else benchmark_cnn.BenchmarkCNN)
  benchmark_cls(params).run()
예제 #3
0
def main(positional_arguments):
  """Builds a BenchmarkCNN from flags and runs it, optionally enabling DMO.

  Args:
    positional_arguments: Argument list. Element 0 must be present; any
      further element is rejected.

  Raises:
    ValueError: If more than one positional argument is received.
  """
  # '--distortions False' parses as '--distortions=True' followed by the
  # positional argument 'False', which would silently run with distortions.
  # Refusing all extra positional arguments turns that mistake into an error.
  num_arguments = len(positional_arguments)
  assert num_arguments >= 1
  if num_arguments > 1:
    raise ValueError('Received unknown positional arguments: %s'
                     % positional_arguments[1:])

  params = benchmark_cnn.setup(benchmark_cnn.make_params_from_flags())

  import sys
  # The explicit comparisons against True/False are kept exactly as written.
  if params.enable_dmo == True:
    if LoadFileSystem() == False:
      sys.exit(-1)
    print("\n*******DMO enabled********\n")

  bench = benchmark_cnn.BenchmarkCNN(params)

  version = cnn_util.tensorflow_version_tuple()
  log_fn('TensorFlow:  %i.%i' % (version[0], version[1]))

  bench.print_info()
  bench.run()
예제 #4
0
def main(positional_arguments):
    """Builds a BenchmarkCNN from flags, prints threading settings, and runs it.

    Args:
        positional_arguments: Argument list. Element 0 must be present; any
            further element is rejected.

    Raises:
        ValueError: If more than one positional argument is received.
    """
    # '--distortions False' parses as '--distortions=True' followed by the
    # positional argument 'False', which would silently run with distortions.
    # Refusing all extra positional arguments turns that mistake into an error.
    num_arguments = len(positional_arguments)
    assert num_arguments >= 1
    if num_arguments > 1:
        raise ValueError('Received unknown positional arguments: %s' %
                         positional_arguments[1:])

    params = benchmark_cnn.setup(benchmark_cnn.make_params_from_flags())
    bench = benchmark_cnn.BenchmarkCNN(params)

    version = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow:  %i.%i' % (version[0], version[1]))

    bench.print_info()
    # Echo the threading / input-pipeline settings, one "name: value" per line.
    for field in ('num_inter_threads',
                  'num_intra_threads',
                  'datasets_num_private_threads',
                  'datasets_use_prefetch',
                  'datasets_prefetch_buffer_size'):
        print(field + ': ' + str(getattr(params, field)))

    bench.run()
예제 #5
0
  def testLearningRate(self):
    """Checks the learning rate at selected global steps for several configs.

    Each scenario maps global-step values to the learning rate expected at
    that step and delegates the comparison to `_test_learning_rate`.
    """
    check = self._test_learning_rate

    # resnet50 with batch size 256 and otherwise default parameters.
    params = benchmark_cnn.make_params(model='resnet50', batch_size=256)
    default_schedule = {
        0: 0,
        150136: 0.016,
        150137: 0.0016,
        300273: 0.0016,
        300274: 0.00016,
        10000000: 0.0000016
    }
    check(params, default_schedule)

    # An explicit initial learning rate.
    params = params._replace(init_learning_rate=1.)
    constant_schedule = {
        0: 1.,
        10000000: 1.
    }
    check(params, constant_schedule)

    # Same initial learning rate with five warmup epochs.
    params = params._replace(init_learning_rate=1.,
                             num_learning_rate_warmup_epochs=5)
    warmup_schedule = {
        0: 0.,
        12511: 0.5,
        25022: 1.,
        10000000: 1.
    }
    check(params, warmup_schedule)

    # Decay by 0.5 every two epochs, with a minimum learning rate of 0.375.
    params = params._replace(
        num_learning_rate_warmup_epochs=0,
        learning_rate_decay_factor=0.5,
        num_epochs_per_decay=2,
        minimum_learning_rate=0.3750,
        batch_size=32)
    decay_schedule = {
        0: 1.,
        80071: 1.,
        80072: 0.5,
        160143: 0.5,
        160144: 0.375,
        10000000: 0.375
    }
    check(params, decay_schedule)

    # params.learning_rate_decay_factor cannot be nonzero when
    # params.num_epochs_per_decay is zero, so building the model must fail.
    params = params._replace(num_epochs_per_decay=0.)
    with self.assertRaises(ValueError):
      with tf.Graph().as_default():
        benchmark_cnn.BenchmarkCNN(params)._build_model()

    # Piecewise schedule given as a string.
    params = benchmark_cnn.make_params(
        model='trivial',
        batch_size=32,
        piecewise_learning_rate_schedule='1;3;.1;5;.01')
    piecewise_schedule = {
        0: 1.,
        120108: 1.,
        120109: 0.1,
        200181: 0.1,
        200182: 0.01,
        100000000: 0.01
    }
    check(params, piecewise_schedule)
예제 #6
0
    def testDistributedReplicatedSavableVars(self):
        """Checks savable_variables() under distributed_replicated updates.

        Every global variable, and every local variable whose name starts
        with 'v0/', must be present in the savable-variables mapping.
        """
        test_util.monkey_patch_base_cluster_manager()
        fake_data_dir = os.path.join(platforms_util.get_test_data_dir(),
                                     'fake_tf_record_data')
        params = benchmark_cnn.make_params(
            variable_update='distributed_replicated',
            model='inception4',
            data_name='imagenet',
            data_dir=fake_data_dir,
            job_name='worker',
            worker_hosts='w1,w2,w3,w4',
            ps_hosts='p1')

        bench = benchmark_cnn.BenchmarkCNN(params)
        with tf.Graph().as_default():
            bench._build_model()
            savable_vars = bench.variable_mgr.savable_variables()
            shadow_prefix = variable_mgr_util.PS_SHADOW_VAR_PREFIX

            # Every global variable must be in savable_vars.
            for var in tf.global_variables():
                is_v0_shadow = var.name.startswith(shadow_prefix + '/v0')
                if not is_v0_shadow:
                    # The only global variable outside the shadow scope is
                    # expected to be the global step.
                    self.assertEqual(var.name, 'global_step:0')
                stripped = bench.variable_mgr._strip_port(var.name)
                if stripped.startswith(shadow_prefix):
                    # Drop the prefix and the '/' that follows it.
                    stripped = stripped[len(shadow_prefix) + 1:]
                self.assertIn(stripped, savable_vars)
                self.assertIn(savable_vars[stripped], tf.global_variables())

            # Every local variable under 'v0/' must be in savable_vars.
            v0_locals = (var for var in tf.local_variables()
                         if var.name.startswith('v0/'))
            for var in v0_locals:
                stripped = bench.variable_mgr._strip_port(var.name)
                self.assertIn(stripped, savable_vars)
예제 #7
0
def main(positional_arguments):
    """Builds a BenchmarkCNN from flags and runs it inside a log_context.

    Args:
        positional_arguments: Argument list. Element 0 must be present; any
            further element is rejected.

    Raises:
        ValueError: If more than one positional argument is received.
    """
    # '--distortions False' parses as '--distortions=True' followed by the
    # positional argument 'False', which would silently run with distortions.
    # Refusing all extra positional arguments turns that mistake into an error.
    num_arguments = len(positional_arguments)
    assert num_arguments >= 1
    if num_arguments > 1:
        raise ValueError('Received unknown positional arguments: %s' %
                         positional_arguments[1:])

    params = benchmark_cnn.setup(benchmark_cnn.make_params_from_flags())
    bench = benchmark_cnn.BenchmarkCNN(params)

    version = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow:  %i.%i' % (version[0], version[1]))

    bench.print_info()
    # Only the run itself happens inside the logging context.
    logging_context = log_context(LOGGER_URL,
                                  LOGGER_USRENAME,
                                  LOGGER_PASSWORD,
                                  LOGGER_DB,
                                  LOGGER_SERIES,
                                  machine=LOGGER_VM)
    with logging_context:
        bench.run()
예제 #8
0
def main(positional_arguments):
    """Logs the environment, then sets up and runs BenchmarkCNN under mlperf.

    Args:
        positional_arguments: Argument list. Element 0 must be present; any
            further element is rejected.

    Raises:
        ValueError: If more than one positional argument is received.
    """
    # '--distortions False' parses as '--distortions=True' followed by the
    # positional argument 'False', which would silently run with distortions.
    # Refusing all extra positional arguments turns that mistake into an error.
    num_arguments = len(positional_arguments)
    assert num_arguments >= 1
    if num_arguments > 1:
        raise ValueError('Received unknown positional arguments: %s' %
                         positional_arguments[1:])

    params = benchmark_cnn.make_params_from_flags()

    # Dump every environment variable at debug level.
    rule = '=' * 20
    tf.logging.debug(rule + ' Environment Variables ' + rule)
    for name, value in os.environ.items():
        tf.logging.debug('{}: {}'.format(name, value))

    # Setup, construction and the run all happen inside the mlperf context.
    with mlperf.mlperf_logger(absl_flags.FLAGS.ml_perf_compliance_logging,
                              params.model):
        params = benchmark_cnn.setup(params)
        bench = benchmark_cnn.BenchmarkCNN(params)

        version = cnn_util.tensorflow_version_tuple()
        log_fn('TensorFlow:  %i.%i' % (version[0], version[1]))

        bench.print_info()
        bench.run()
예제 #9
0
def main(positional_arguments):
    """Runs BenchmarkCNN on a GTSRB dataset with a custom model.

    Flag-derived params are overridden with values taken from the options
    object, a dataset is chosen from `options.data_dir`, and the benchmark is
    run. The default TensorFlow graph is reset afterwards.

    Args:
        positional_arguments: Argument list. Element 0 must be present; any
            further element is rejected.

    Raises:
        ValueError: If more than one positional argument is received.
    """
    # '--distortions False' parses as '--distortions=True' followed by the
    # positional argument 'False', which would silently run with distortions.
    # Refusing all extra positional arguments turns that mistake into an error.
    num_arguments = len(positional_arguments)
    assert num_arguments >= 1
    if num_arguments > 1:
        raise ValueError('Received unknown positional arguments: %s' %
                         positional_arguments[1:])

    options = make_options_from_flags(FLAGS)

    # Overrides left disabled in this variant: all_reduce_spec='nccl',
    # bottom_file / affine_files / affine_classes, print_training_accuracy,
    # summary_verbosity and save_summaries_steps.
    params = benchmark_cnn.make_params_from_flags()._replace(
        batch_size=options.batch_size,
        model='MY_GTSRB',
        num_epochs=options.num_epochs,
        num_gpus=options.num_gpus,
        data_format='NHWC',
        train_dir=options.checkpoint_folder,
        allow_growth=True,
        variable_update='replicated',
        local_parameter_device='gpu',
        use_tf_layers=False,
        optimizer=options.optimizer,
        weight_decay=options.weight_decay,
        backbone_model_path=options.backbone_model_path,
        # Checkpoint every 60 seconds.
        save_model_secs=60)
    params = benchmark_cnn.setup(params)

    # A data_dir containing 'test' selects the test dataset.
    dataset_cls = (GTSRBTestDataset if 'test' in options.data_dir
                   else GTSRBDataset)
    dataset = dataset_cls(options)
    model = Model_Builder(options.model_name, dataset.num_classes, options,
                          params)

    bench = benchmark_cnn.BenchmarkCNN(params, dataset=dataset, model=model)

    version = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow:  %i.%i' % (version[0], version[1]))

    bench.print_info()
    bench.run()

    tf.reset_default_graph()
예제 #10
0
def main(positional_arguments):
    """Runs BenchmarkCNN with hooks supplied by a benchmark_handler.Handler.

    The handler provides the params, dataset and model; several BenchmarkCNN
    attributes are then replaced by the handler's implementations before the
    benchmark is run.

    Args:
        positional_arguments: Argument list. Element 0 must be present; any
            further element is rejected.

    Raises:
        ValueError: If more than one positional argument is received.
    """
    # '--distortions False' parses as '--distortions=True' followed by the
    # positional argument 'False', which would silently run with distortions.
    # Refusing all extra positional arguments turns that mistake into an error.
    num_arguments = len(positional_arguments)
    assert num_arguments >= 1
    if num_arguments > 1:
        raise ValueError('Received unknown positional arguments: %s' %
                         positional_arguments[1:])

    handler = benchmark_handler.Handler(benchmark_cnn.make_params_from_flags())
    params = benchmark_cnn.setup(handler.params)
    bench = benchmark_cnn.BenchmarkCNN(params,
                                       dataset=handler.dataset,
                                       model=handler.model)
    handler.set_bench(bench)

    # Only preprocessors that expose set_aug_list receive the augmentations.
    set_aug_list = getattr(bench.input_preprocessor, 'set_aug_list', None)
    if set_aug_list:
        set_aug_list(params.aug_list)

    # Swap in the handler's implementations of these benchmark hooks.
    for hook_name in ('benchmark_one_step',
                      'print_eval_results',
                      'check_early_stop',
                      'accum_grads',
                      'build_fetches_forward'):
        setattr(bench, hook_name, getattr(handler, hook_name))

    if params.memory_saving_method == 'recomputing':
        bench.memory_saving = ms.Memory_Saving(benchmark_cnn=bench)

    # The TensorFlow version log line is intentionally not emitted here.
    bench.print_info()
    bench.run()
 def _test_run_benchmark(self, params):
   """Checks that run_benchmark() completes and logs an average step time.

   Args:
     params: Params used to construct the BenchmarkCNN under test.
   """
   captured_logs = []
   log_capture = test_util.print_and_add_to_list(captured_logs)
   with test_util.monkey_patch(all_reduce_benchmark, log_fn=log_capture):
     bench = benchmark_cnn.BenchmarkCNN(params)
     all_reduce_benchmark.run_benchmark(bench, num_iters=5)
     # The last line logged must be the average-time summary.
     last_line = captured_logs[-1]
     self.assertRegex(last_line, '^Average time per step: [0-9.]+$')
예제 #12
0
    def _binary_search_batch_size(self, params, init_batch_size):
        """Find the max batch_size using binary search.

        The batch size is doubled, starting from `init_batch_size`, until a
        run raises ResourceExhaustedError. The largest size that still runs
        is then located by binary search between the last size that
        succeeded and the first that failed, and reported through
        `self.report_benchmark` under the 'max_batch_size' extra (0 when even
        `init_batch_size` fails).

        Args:
            params: Params used for every trial; `num_batches`,
                `num_warmup_batches` and `batch_size` are overridden.
            init_batch_size: Positive batch size to start the search from.
        """
        assert init_batch_size > 0

        # No need to run a warmup or many batches; if it doesn't OOM after 10
        # batches, it should work in general.
        params = params._replace(num_batches=10, num_warmup_batches=0)

        def runs_without_oom(candidate):
            """Return True if a short run at `candidate` batch size succeeds."""
            tf.logging.info('Trying batch_size %d' % candidate)
            bench = benchmark_cnn.BenchmarkCNN(
                params._replace(batch_size=candidate))
            bench.print_info()
            try:
                bench.run()
            except tf.errors.ResourceExhaustedError:
                return False
            return True

        # Find high_batch_size first: keep doubling until a run fails.
        low_batch_size = 0
        batch_size = init_batch_size
        tf.logging.info(
            'Looking for upper bound to batch size, starting with %d' %
            batch_size)
        while runs_without_oom(batch_size):
            low_batch_size = batch_size
            batch_size *= 2
        high_batch_size = batch_size - 1

        # Binary search. low_batch_size is known to work (or is 0) and
        # high_batch_size + 1 is known to fail, so both bounds are inclusive.
        tf.logging.info(
            'Max batch size is in range [%d, %d].  Starting binary search to find '
            'exact max batch size.' % (low_batch_size, high_batch_size))
        while low_batch_size < high_batch_size:
            # Round up so that the interval shrinks when the trial succeeds.
            batch_size = (low_batch_size + high_batch_size + 1) // 2
            if runs_without_oom(batch_size):
                low_batch_size = batch_size
            else:
                high_batch_size = batch_size - 1
        self.report_benchmark(extras={'max_batch_size': low_batch_size})
예제 #13
0
def main(_):
    """Builds a BenchmarkCNN from command-line flags and runs it.

    The single positional parameter is ignored.
    """
    params = benchmark_cnn.setup(benchmark_cnn.make_params_from_flags())
    bench = benchmark_cnn.BenchmarkCNN(params)

    # Log the major.minor TensorFlow version before the benchmark details.
    version = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow:  %i.%i' % (version[0], version[1]))

    bench.print_info()
    bench.run()
예제 #14
0
  def testInvalidFlags(self):
    """Checks that each listed flag combination makes BenchmarkCNN raise."""
    # Flags shared by every automatic-loss-scaling case below.
    auto_loss_scale = dict(
        use_fp16=True,
        fp16_vars=True,
        fp16_enable_auto_loss_scale=True)

    invalid_flag_sets = [
        dict(device='cpu', data_format='NCHW'),
        dict(use_fp16=True, fp16_vars=True,
             variable_update='replicated',
             all_reduce_spec='nccl'),
        # Automatic loss scaling is only supported for 'replicated', 'ps',
        # and 'independent' variable_updates.
        dict(auto_loss_scale, variable_update='distributed_replicated'),
        dict(auto_loss_scale, variable_update='distributed_all_reduce'),
        # Automatic loss scaling is not supported for 'nccl'.
        dict(auto_loss_scale, all_reduce_spec='nccl'),
        # Automatic loss scaling is not supported for 'staged_vars'.
        dict(auto_loss_scale, staged_vars=True),
    ]

    for flags in invalid_flag_sets:
      params = benchmark_cnn.make_params(**flags)
      with self.assertRaises(ValueError):
        benchmark_cnn.BenchmarkCNN(params)
예제 #15
0
 def _test_learning_rate(self, params, global_step_to_expected_learning_rate):
   """Asserts the learning-rate tensor's value at each given global step.

   Args:
     params: Params used to construct the BenchmarkCNN.
     global_step_to_expected_learning_rate: Dict mapping a global-step value
       to the learning rate expected when that value is fed.
   """
   bench = benchmark_cnn.BenchmarkCNN(params)
   with tf.Graph().as_default() as graph:
     bench._build_model()
     step_tensor = graph.get_tensor_by_name('global_step:0')
     lr_tensor = graph.get_tensor_by_name('learning_rate_tensor:0')
     with self.test_session(graph=graph, use_gpu=True) as sess:
       expectations = global_step_to_expected_learning_rate.items()
       for step_value, expected_lr in expectations:
         # Feed the global step directly instead of training up to it.
         actual_lr = sess.run(lr_tensor, {step_tensor: step_value})
         self.assertAlmostEqual(actual_lr, expected_lr)
def run_with_test_model(params):
  """Runs tf_cnn_benchmarks with a test model.

  Args:
    params: Params used to construct the BenchmarkCNN.
  """
  test_model = test_util.TestCNNModel()
  fake_inputs = test_util.get_fake_var_update_inputs()
  patched = test_util.monkey_patch(benchmark_cnn,
                                   LOSS_AND_ACCURACY_DIGITS_TO_SHOW=15)
  with patched:
    bench = benchmark_cnn.BenchmarkCNN(params, dataset=test_util.TestDataSet(),
                                       model=test_model)
    # The test model ignores labels when computing its loss, so any values
    # work as long as there is one label per input.
    fake_labels = np.array([1] * fake_inputs.shape[0])
    bench.image_preprocessor.set_fake_data(fake_inputs, fake_labels)
    bench.run()
def run_with_real_model(params):
  """Runs tf_cnn_benchmarks with a real model.

  When a test image preprocessor is available for these params, it replaces
  the benchmark's own preprocessor before the run.

  Args:
    params: Params used to construct the BenchmarkCNN.
  """
  bench = benchmark_cnn.BenchmarkCNN(params)
  bench.print_info()
  test_preprocessor = get_test_image_preprocessor(bench.batch_size, params)
  if test_preprocessor is not None:
    # The test image preprocessor needs queue runners. Touching a protected
    # member is acceptable because this file is only used for testing.
    # pylint: disable=protected-access
    bench.dataset._queue_runner_required = True
    # pylint: enable=protected-access
    bench.image_preprocessor = test_preprocessor
  bench.run()
예제 #18
0
 def _run_benchmark_cnn_with_fake_images(self, params, images, labels):
     """Runs BenchmarkCNN on the given fake images and returns its log lines.

     Args:
       params: Params used to construct the BenchmarkCNN.
       images: Fake images handed to the test image preprocessor.
       labels: Fake labels handed to the test image preprocessor.

     Returns:
       The list that the replacement log function appends to.
     """
     captured_logs = []
     # NOTE(review): benchmark_cnn.log_fn is replaced here and is not
     # restored when this method returns.
     benchmark_cnn.log_fn = _print_and_add_to_list(captured_logs)
     bench = benchmark_cnn.BenchmarkCNN(params)

     total_batch_size = params.batch_size * params.num_gpus
     preprocessor = preprocessing.TestImagePreprocessor(
         227, 227, total_batch_size, params.num_gpus,
         benchmark_cnn.get_data_type(params))
     bench.image_preprocessor = preprocessor
     bench.dataset._queue_runner_required = True
     bench.image_preprocessor.set_fake_data(images, labels)
     if params.eval:
         bench.image_preprocessor.expected_subset = 'validation'
     else:
         bench.image_preprocessor.expected_subset = 'train'
     bench.run()
     return captured_logs
def main(_):
    """Builds the benchmark_cnn model and trains it through parallax.

    A single-GPU graph is built from the flag-derived params and handed to
    `parallax.parallel_run` together with a `run` callback that drives the
    training loop and logs progress.
    """
    # Build benchmark_cnn model
    params = benchmark_cnn.make_params_from_flags()
    params, sess_config = benchmark_cnn.setup(params)
    bench = benchmark_cnn.BenchmarkCNN(params)

    # Print information
    tfversion = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow:  %i.%i' % (tfversion[0], tfversion[1]))
    bench.print_info()

    # Build single-GPU benchmark_cnn model
    with tf.Graph().as_default() as single_gpu_graph:
        bench.build_model()

    def run(sess, num_iters, tensor_or_op_name_to_replica_names, num_workers,
            worker_id, num_replicas_per_worker):
        """Runs `num_iters` training steps, logging every log_frequency steps.

        Args:
            sess: Session used to run the fetches.
            num_iters: Number of steps to execute.
            tensor_or_op_name_to_replica_names: Mapping from a name in the
                single-GPU graph to its replica names; the first replica of
                each entry is fetched.
            num_workers: Not used by this callback.
            worker_id: Not used by this callback.
            num_replicas_per_worker: Not used by this callback.
        """
        fetches = {
            'global_step':
            tensor_or_op_name_to_replica_names[bench.global_step.name][0],
            'cost':
            tensor_or_op_name_to_replica_names[bench.cost.name][0],
            'train_op':
            tensor_or_op_name_to_replica_names[bench.train_op.name][0],
        }
        # The learning rate is only fetched when it is a tensor; otherwise
        # bench.lr itself is logged.
        if isinstance(bench.lr, tf.Tensor):
            fetches['lr'] = tensor_or_op_name_to_replica_names[
                bench.lr.name][0]

        start = time.time()
        # Steps executed since the last log line. The first line is emitted
        # after one step, so the throughput must not assume a full window.
        steps_in_window = 0
        for i in range(num_iters):
            results = sess.run(fetches)
            steps_in_window += 1
            if i % FLAGS.log_frequency == 0:
                end = time.time()
                throughput = float(steps_in_window) / float(end - start)
                parallax.log.info(
                    "global step: %d, lr: %f, loss: %f, "
                    "throughput: %f steps/sec" %
                    (results['global_step'], results['lr'] if 'lr' in results
                     else bench.lr, results['cost'], throughput))
                start = time.time()
                steps_in_window = 0

    config = parallax_config.build_config()
    config.sess_config = sess_config

    parallax.parallel_run(single_gpu_graph,
                          run,
                          FLAGS.resource_info_file,
                          FLAGS.max_steps,
                          sync=FLAGS.sync,
                          parallax_config=config)
예제 #20
0
 def testMlPerfCompliance(self):
     """Checks that a short run emits the expected MLPerf compliance logs.

     Log lines written to mlperf_log.LOGGER during the run are matched
     against every regex in EXPECTED_LOG_REGEXES, and the number of matching
     lines per regex must equal the expected count.

     Raises:
       AssertionError: If any regex matched a different number of lines than
         expected; the message lists each mismatch.
     """
     log_buffer = six.StringIO()
     handler = logging.StreamHandler(log_buffer)
     data_dir = test_util.create_black_and_white_images()
     try:
         mlperf_log.LOGGER.addHandler(handler)
         params = benchmark_cnn.make_params(
             data_dir=data_dir,
             data_name='imagenet',
             batch_size=2,
             num_warmup_batches=0,
             num_batches=2,
             num_eval_batches=3,
             eval_during_training_every_n_steps=1,
             distortions=False,
             weight_decay=0.5,
             optimizer='momentum',
             momentum=0.5,
             stop_at_top_1_accuracy=2.0,
             tf_random_seed=9876,
             ml_perf=True)
         with mlperf.mlperf_logger(use_mlperf_logger=True,
                                   model='resnet50_v1.5'):
             bench = benchmark_cnn.BenchmarkCNN(
                 params, model=_MlPerfTestModel())
             bench.run()

         # Count, per expected regex, how many captured lines it matches.
         lines = log_buffer.getvalue().splitlines()
         matches = Counter(
             regex
             for line in lines
             for regex in self.EXPECTED_LOG_REGEXES
             if regex.search(line))

         if matches != self.EXPECTED_LOG_REGEXES:
             # A non-zero entry means found and expected counts differ.
             delta = Counter(matches)
             delta.subtract(self.EXPECTED_LOG_REGEXES)
             differences = []
             for regex, surplus in delta.items():
                 if not surplus:
                     continue
                 found_count = matches[regex]
                 expected_count = self.EXPECTED_LOG_REGEXES[regex]
                 differences.append(
                     '  For regex %s: Found %d lines matching but '
                     'expected to find %d' %
                     (regex.pattern, found_count, expected_count))
             raise AssertionError(
                 'Logs did not match expected logs. Differences:\n'
                 '%s' % '\n'.join(differences))
     finally:
         # Always detach the handler so later tests are not affected.
         mlperf_log.LOGGER.removeHandler(handler)
예제 #21
0
    def testModel(self):
        """Runs a single batch of the model under test.

        Nothing is run when the test case has no model name or reports that
        the model execution test does not apply.
        """
        _check_has_gpu()
        if not (self.get_model_name() and self.model_execution_test()):
            return

        params = benchmark_cnn.make_params(
            model=self.get_model_name(),
            num_batches=1,
            num_intra_threads=1,
            num_inter_threads=12,
            batch_size=2,
            distortions=False)

        # This run goes through a regular session rather than a test session.
        benchmark_cnn.BenchmarkCNN(params).run()
예제 #22
0
def main(positional_arguments):
  """Runs BenchmarkCNN on the Cifar dataset with a custom model.

  Flag-derived params are overridden with values taken from the options
  object before the benchmark is set up and run.

  Args:
    positional_arguments: Argument list. Element 0 must be present; any
      further element is rejected.

  Raises:
    ValueError: If more than one positional argument is received.
  """
  num_arguments = len(positional_arguments)
  assert num_arguments >= 1
  if num_arguments > 1:
    raise ValueError('Received unknown positional arguments: %s'
                     % positional_arguments[1:])

  options = make_options_from_flags(FLAGS)

  # Overrides left disabled in this variant: all_reduce_spec='nccl',
  # bottom_file / affine_files / affine_classes, summary_verbosity and
  # save_summaries_steps.
  params = benchmark_cnn.make_params_from_flags()._replace(
      batch_size=options.batch_size,
      model='MY_GTSRB',
      num_epochs=options.num_epochs,
      num_gpus=options.num_gpus,
      data_format='NHWC',
      train_dir=options.checkpoint_folder,
      allow_growth=True,
      variable_update='replicated',
      local_parameter_device='gpu',
      use_tf_layers=False,
      optimizer=options.optimizer,
      weight_decay=options.weight_decay,
      print_training_accuracy=True,
      backbone_model_path=options.backbone_model_path,
      # Checkpoint once per hour.
      save_model_secs=3600)
  params = benchmark_cnn.setup(params)

  dataset = CifarDataset(options)
  model = Model_Builder(options.model_name, dataset.num_classes, options, params)

  bench = benchmark_cnn.BenchmarkCNN(params, dataset=dataset, model=model)

  version = cnn_util.tensorflow_version_tuple()
  log_fn('TensorFlow:  %i.%i' % (version[0], version[1]))

  bench.print_info()
  bench.run()
예제 #23
0
  def _get_benchmark_cnn_losses(self, inputs, params):
    """Returns the losses of BenchmarkCNN on the given inputs and params.

    Args:
      inputs: Fake input data; its first dimension determines how many
        labels are generated.
      params: Params used to construct the BenchmarkCNN.

    Returns:
      A list with the `loss` of every training output parsed from the logs.
    """
    captured_logs = []
    test_model = test_util.TestCNNModel()
    log_capture = test_util.print_and_add_to_list(captured_logs)
    with test_util.monkey_patch(benchmark_cnn,
                                log_fn=log_capture,
                                LOSS_AND_ACCURACY_DIGITS_TO_SHOW=15):
      bench = benchmark_cnn.BenchmarkCNN(
          params, dataset=test_util.TestDataSet(), model=test_model)
      # The test model ignores labels when computing its loss, so any values
      # work as long as there is one label per input.
      fake_labels = np.array([1] * inputs.shape[0])
      bench.image_preprocessor.set_fake_data(inputs, fake_labels)
      bench.run()

    training_outputs = test_util.get_training_outputs_from_logs(
        captured_logs, params.print_training_accuracy)
    return [output.loss for output in training_outputs]
예제 #24
0
def main(positional_arguments):
    """Builds a BenchmarkCNN from flags and passes it to run_benchmark().

    Args:
        positional_arguments: Argument list. Element 0 must be present; any
            further element is rejected.

    Raises:
        ValueError: If more than one positional argument is received.
    """
    # '--distortions False' parses as '--distortions=True' followed by the
    # positional argument 'False', which would silently run with distortions.
    # Refusing all extra positional arguments turns that mistake into an error.
    num_arguments = len(positional_arguments)
    assert num_arguments >= 1
    if num_arguments > 1:
        raise ValueError('Received unknown positional arguments: %s' %
                         positional_arguments[1:])

    params = benchmark_cnn.setup(benchmark_cnn.make_params_from_flags())
    bench = benchmark_cnn.BenchmarkCNN(params)

    version = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow:  %i.%i' % (version[0], version[1]))

    iters_per_step = absl_flags.FLAGS.iters_per_step
    run_benchmark(bench, iters_per_step)
예제 #25
0
def main(extra_flags):
  """Builds a BenchmarkCNN from flags and runs it.

  Args:
    extra_flags: Command-line arguments not defined in tf.flags.FLAGS.
      extra_flags[0] is always the program name.

  Raises:
    ValueError: If any flag beyond the program name is present, since
      supplying flags that tf.flags.FLAGS does not define is an error.
  """
  num_flags = len(extra_flags)
  assert num_flags >= 1
  if num_flags > 1:
    raise ValueError('Received unknown flags: %s' % extra_flags[1:])

  params = benchmark_cnn.make_params_from_flags()
  # The return value of setup() is not used in this variant.
  benchmark_cnn.setup(params)
  bench = benchmark_cnn.BenchmarkCNN(params)

  version = cnn_util.tensorflow_version_tuple()
  log_fn('TensorFlow:  %i.%i' % (version[0], version[1]))

  bench.print_info()
  bench.run()
def main(positional_arguments):
    """Sets up BenchmarkCNN under the mlperf logger, then runs it.

    Args:
        positional_arguments: Argument list. Element 0 must be present; any
            further element is rejected.

    Raises:
        ValueError: If more than one positional argument is received.
    """
    # '--distortions False' parses as '--distortions=True' followed by the
    # positional argument 'False', which would silently run with distortions.
    # Refusing all extra positional arguments turns that mistake into an error.
    num_arguments = len(positional_arguments)
    assert num_arguments >= 1
    if num_arguments > 1:
        raise ValueError("Received unknown positional arguments: %s" % positional_arguments[1:])

    params = benchmark_cnn.make_params_from_flags()
    compliance_logging = absl_flags.FLAGS.ml_perf_compliance_logging
    # NOTE(review): only setup and construction happen inside the mlperf
    # context; print_info() and run() execute after it has exited. Confirm
    # that this is intended.
    with mlperf.mlperf_logger(compliance_logging, params.model):
        params = benchmark_cnn.setup(params)
        bench = benchmark_cnn.BenchmarkCNN(params)

    version = cnn_util.tensorflow_version_tuple()
    log_fn("TensorFlow:  %i.%i" % (version[0], version[1]))

    bench.print_info()
    bench.run()
def main(_):
    """Build the benchmark_cnn model and run its training op through parallax.

    The model is built inside a dedicated tf.Graph, which is handed to
    parallax.parallel_run together with FLAGS.resource_info_file and
    FLAGS.sync. The returned session is then run for FLAGS.max_steps
    iterations, logging step, loss and throughput every FLAGS.log_frequency
    steps.

    Args:
        _: Unused.
    """
    # Create the benchmark object from the command-line flags.
    params = benchmark_cnn.make_params_from_flags()
    params, sess_config = benchmark_cnn.setup(params)
    bench = benchmark_cnn.BenchmarkCNN(params)

    # Report the TensorFlow version and the benchmark configuration.
    version = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow:  %i.%i' % (version[0], version[1]))
    bench.print_info()

    # Build the single-GPU model in its own graph.
    graph = tf.Graph()
    with graph.as_default():
        bench.build_model()

    plx_config = parallax_config.build_config()
    plx_config.sess_config = sess_config

    # Only the session is used below; the other three values are unpacked to
    # keep the expected four-element result shape.
    sess, _num_workers, _worker_id, _num_replicas_per_worker = \
        parallax.parallel_run(graph,
                              FLAGS.resource_info_file,
                              sync=FLAGS.sync,
                              parallax_config=plx_config)

    fetches = dict(
        global_step=bench.global_step,
        cost=bench.cost,
        train_op=bench.train_op,
    )

    window_start = time.time()
    for step in range(1, FLAGS.max_steps + 1):
        results = sess.run(fetches)
        if step % FLAGS.log_frequency != 0:
            continue

        # Throughput is measured over the steps since the previous log line.
        window_end = time.time()
        throughput = float(FLAGS.log_frequency) / float(window_end - window_start)
        # NOTE(review): fetched values are indexed with [0]; presumably one
        # entry per replica -- confirm against parallax.
        message = ("global step: %d, loss: %f, throughput: %f steps/sec" %
                   (results['global_step'][0] + 1, results['cost'][0],
                    throughput))
        parallax.log.info(message)
        window_start = time.time()
예제 #28
0
    def _run_benchmark(self, params):
        """Execute one CNN benchmark and report its statistics.

        Args:
          params: Params tuple, typically created by benchmark_cnn.make_params
            or benchmark_cnn.make_params_from_flags.
        """
        logging.info('Running benchmark [%s]', self._get_name())
        bench = benchmark_cnn.BenchmarkCNN(benchmark_cnn.setup(params))
        bench.print_info()
        stats = bench.run()

        # examples_per_sec is always reported (None when the stat is absent);
        # the remaining metrics are copied only when the run produced them.
        extras = {'examples_per_sec': stats.get('images_per_sec')}
        optional_keys = ('last_average_loss', 'top_1_accuracy',
                         'top_5_accuracy')
        for optional_key in optional_keys:
            if optional_key in stats:
                extras[optional_key] = stats[optional_key]

        self.report_benchmark(
            iters=stats.get('num_steps'),
            wall_time=stats.get('average_wall_time'),
            extras=extras)
예제 #29
0
    def testSaveLoadModel(self):
        """Check that a model is checkpointed after a run and restored on re-run.

        The test runs one batch, verifies a checkpoint was written to
        params.train_dir, runs again with num_batches=2 and verifies the global
        step was restored, then loads the checkpoint into a fresh graph and
        checks that the batch norm moving averages are not at their default
        values.

        The test returns early without asserting anything when
        get_model_name() is empty or model_save_load_test() is false.
        """
        _check_has_gpu()
        if not self.get_model_name() or not self.model_save_load_test():
            return

        def assert_checkpoint_saved(current_params):
            """Assert the latest checkpoint step is >= current_params.num_batches."""
            ckpt = tf.train.get_checkpoint_state(current_params.train_dir)
            # Fail with a clear assertion instead of an AttributeError when no
            # checkpoint state could be read.
            self.assertIsNotNone(ckpt)
            # Escape the directory prefix so regex metacharacters in the
            # temp path (and the literal dots) are matched verbatim.
            pattern = (
                re.escape(
                    os.path.join(current_params.train_dir, 'model.ckpt-')) +
                r'(\d+)\.index')
            match = re.match(pattern, ckpt.model_checkpoint_path + '.index')
            self.assertIsNotNone(match)
            self.assertGreaterEqual(int(match.group(1)),
                                    current_params.num_batches)

        params = benchmark_cnn.make_params(
            model=self.get_model_name(),
            num_batches=1,
            num_intra_threads=1,
            num_inter_threads=12,
            distortions=False,
            batch_size=2,
            variable_update='replicated',
            num_warmup_batches=0,
            num_gpus=2,
            train_dir=test_util.get_temp_dir('testSaveLoadModel_' +
                                             self.get_model_name()))

        # Run one batch and save the model.
        # Note that this uses a non-test session.
        bench = benchmark_cnn.BenchmarkCNN(params)
        bench.run()
        self.assertEqual(bench.init_global_step, 0)
        # Clear the default graph.
        tf.reset_default_graph()
        # Test if checkpoint had been saved.
        assert_checkpoint_saved(params)

        params = params._replace(num_batches=2)
        # Reload the model
        bench = benchmark_cnn.BenchmarkCNN(params)
        bench.run()
        # Check if global step has been restored.
        self.assertNotEqual(bench.init_global_step, 0)
        assert_checkpoint_saved(params)

        # Check that the batch norm moving averages are restored from checkpoints
        with tf.Graph().as_default():
            bench = benchmark_cnn.BenchmarkCNN(params)
            bench._build_model()
            saver = tf.train.Saver(bench.variable_mgr.savable_variables())
            with tf.Session(
                    config=benchmark_cnn.create_config_proto(params)) as sess:
                benchmark_cnn.load_checkpoint(saver, sess, params.train_dir)
                sess.run(bench.variable_mgr.get_post_init_ops())
                bn_moving_vars = [
                    v for v in tf.global_variables()
                    if '/batchnorm' in v.name and '/moving' in v.name
                ]
                self.assertGreater(len(bn_moving_vars), 0)
                for moving_var in bn_moving_vars:
                    moving_var_value = sess.run(moving_var)
                    # Check that the moving means and moving variances have been restored
                    # by asserting they are not their default values of 0 and 1,
                    # respectively
                    if '/moving_mean' in moving_var.name:
                        self.assertFalse(
                            np.array_equal(
                                moving_var_value,
                                np.zeros(moving_var_value.shape,
                                         moving_var_value.dtype)))
                    else:
                        self.assertIn('/moving_variance', moving_var.name)
                        self.assertFalse(
                            np.array_equal(
                                moving_var_value,
                                np.ones(moving_var_value.shape,
                                        moving_var_value.dtype)))
예제 #30
0
 def _run_benchmark_cnn(self, params):
     """Run BenchmarkCNN with `params` and return the list handed to the log hook.

     benchmark_cnn.log_fn is reassigned at module level and is not restored
     here. Presumably _print_and_add_to_list appends each logged message to
     the list -- confirm against that helper.
     """
     captured = []
     benchmark_cnn.log_fn = _print_and_add_to_list(captured)
     bench = benchmark_cnn.BenchmarkCNN(params)
     bench.run()
     return captured