Example #1
def main(_):
    params = benchmark_cnn.make_params_from_flags()
    benchmark_cnn.setup(params)
    bench = benchmark_cnn.BenchmarkCNN(params)

    tfversion = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow:  %i.%i' % (tfversion[0], tfversion[1]))

    bench.print_info()
    bench.run()
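These main(...) functions are not meant to be called directly; in tf_cnn_benchmarks-style scripts they are handed to the TF 1.x flag-parsing entry point. A minimal sketch, assuming the module defines its flags via tf.flags/absl as these snippets do:

import tensorflow as tf

if __name__ == '__main__':
    # tf.app.run() parses the defined flags and then calls main(argv), where
    # argv[0] is the program name and any remaining entries are positional
    # arguments -- which is exactly what the later examples validate.
    tf.app.run(main)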
Example #2
def main(positional_arguments):
    # Command-line arguments like '--distortions False' are equivalent to
    # '--distortions=True False', where False is a positional argument. To prevent
    # this from silently running with distortions, we do not allow positional
    # arguments.
    assert len(positional_arguments) >= 1
    if len(positional_arguments) > 1:
        raise ValueError('Received unknown positional arguments: %s' %
                         positional_arguments[1:])

    params = benchmark_cnn.make_params_from_flags()
    params = benchmark_cnn.setup(params)
    bench = benchmark_cnn.BenchmarkCNN(params)

    tfversion = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow:  %i.%i' % (tfversion[0], tfversion[1]))

    bench.print_info()
    with log_context(LOGGER_URL,
                     LOGGER_USRENAME,
                     LOGGER_PASSWORD,
                     LOGGER_DB,
                     LOGGER_SERIES,
                     machine=LOGGER_VM):
        bench.run()
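The log_context(...) manager above comes from the snippet's own logging helpers and its implementation is not shown here. As a purely hypothetical sketch of the shape such a context manager might take (connect_logger, start_run and end_run are invented names used only for illustration):

import contextlib

@contextlib.contextmanager
def log_context(url, username, password, db, series, machine=None):
    # Hypothetical: open a logging session before the benchmark runs and make
    # sure it is closed afterwards, even if bench.run() raises.
    logger = connect_logger(url, username, password, db)
    logger.start_run(series, machine=machine)
    try:
        yield logger
    finally:
        logger.end_run()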
Example #3
def main(positional_arguments):
    # Command-line arguments like '--distortions False' are equivalent to
    # '--distortions=True False', where False is a positional argument. To prevent
    # this from silently running with distortions, we do not allow positional
    # arguments.
    assert len(positional_arguments) >= 1
    if len(positional_arguments) > 1:
        raise ValueError('Received unknown positional arguments: %s' %
                         positional_arguments[1:])

    params = benchmark_cnn.make_params_from_flags()

    # Print ENV Variables
    tf.logging.debug('=' * 20 + ' Environment Variables ' + '=' * 20)
    for k, v in os.environ.items():
        tf.logging.debug('{}: {}'.format(k, v))

    with mlperf.mlperf_logger(absl_flags.FLAGS.ml_perf_compliance_logging,
                              params.model):
        params = benchmark_cnn.setup(params)
        bench = benchmark_cnn.BenchmarkCNN(params)

        tfversion = cnn_util.tensorflow_version_tuple()

        log_fn('TensorFlow:  %i.%i' % (tfversion[0], tfversion[1]))

        bench.print_info()
        bench.run()
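A note on the tf.logging.debug() calls above: with the TF 1.x logging API used here, DEBUG messages are suppressed at the default verbosity, so the environment dump only appears if the log level is raised first, e.g.:

import tensorflow as tf

# Raise the log level so the tf.logging.debug() output (the environment dump
# above) actually shows up; the default verbosity hides DEBUG messages.
tf.logging.set_verbosity(tf.logging.DEBUG)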
Example #4
def main(positional_arguments):
    # Command-line arguments like '--distortions False' are equivalent to
    # '--distortions=True False', where False is a positional argument. To prevent
    # this from silently running with distortions, we do not allow positional
    # arguments.
    assert len(positional_arguments) >= 1
    if len(positional_arguments) > 1:
        raise ValueError('Received unknown positional arguments: %s' %
                         positional_arguments[1:])

    params = benchmark_cnn.make_params_from_flags()
    params = benchmark_cnn.setup(params)
    bench = benchmark_cnn.BenchmarkCNN(params)

    tfversion = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow:  %i.%i' % (tfversion[0], tfversion[1]))

    bench.print_info()
    print('num_inter_threads: ' + str(params.num_inter_threads))
    print('num_intra_threads: ' + str(params.num_intra_threads))
    print('datasets_num_private_threads: ' +
          str(params.datasets_num_private_threads))
    print('datasets_use_prefetch: ' + str(params.datasets_use_prefetch))
    print('datasets_prefetch_buffer_size: ' +
          str(params.datasets_prefetch_buffer_size))

    bench.run()
Example #5
def main(positional_arguments):
  # Command-line arguments like '--distortions False' are equivalent to
  # '--distortions=True False', where False is a positional argument. To prevent
  # this from silently running with distortions, we do not allow positional
  # arguments.
  assert len(positional_arguments) >= 1
  if len(positional_arguments) > 1:
    raise ValueError('Received unknown positional arguments: %s'
                     % positional_arguments[1:])

  params = benchmark_cnn.make_params_from_flags()
  params = benchmark_cnn.setup(params)

  import sys
  if params.enable_dmo:
    if not LoadFileSystem():
      sys.exit(-1)
    else:
      print("\n*******DMO enabled********\n")
  #      sys.exit(0)

  bench = benchmark_cnn.BenchmarkCNN(params)

  tfversion = cnn_util.tensorflow_version_tuple()
  log_fn('TensorFlow:  %i.%i' % (tfversion[0], tfversion[1]))

  bench.print_info()
  bench.run()
Example #6
def main(positional_arguments):
    # Command-line arguments like '--distortions False' are equivalent to
    # '--distortions=True False', where False is a positional argument. To prevent
    # this from silently running with distortions, we do not allow positional
    # arguments.
    assert len(positional_arguments) >= 1
    if len(positional_arguments) > 1:
        raise ValueError('Received unknown positional arguments: %s' %
                         positional_arguments[1:])

    params = benchmark_cnn.make_params_from_flags()
    handler = benchmark_handler.Handler(params)
    params = handler.params
    params = benchmark_cnn.setup(params)
    bench = benchmark_cnn.BenchmarkCNN(params,
                                       dataset=handler.dataset,
                                       model=handler.model)
    handler.set_bench(bench)
    if getattr(bench.input_preprocessor, 'set_aug_list', None):
        bench.input_preprocessor.set_aug_list(params.aug_list)
    bench.benchmark_one_step = handler.benchmark_one_step
    bench.print_eval_results = handler.print_eval_results
    bench.check_early_stop = handler.check_early_stop

    bench.accum_grads = handler.accum_grads
    bench.build_fetches_forward = handler.build_fetches_forward
    if params.memory_saving_method == 'recomputing':
        bench.memory_saving = ms.Memory_Saving(benchmark_cnn=bench)


#    tfversion = util.tensorflow_version_tuple()
#    logging.info('TensorFlow:  %i.%i' % (tfversion[0], tfversion[1]))

    bench.print_info()
    bench.run()
Example #7
def main(_):
  params = benchmark_cnn.make_params_from_flags()
  params = benchmark_cnn.setup(params)
  if params.model == 'test_model':
    run_with_test_model(params)
  else:
    run_with_real_model(params)
Example #8
def main(positional_arguments):
    # Command-line arguments like '--distortions False' are equivalent to
    # '--distortions=True False', where False is a positional argument. To prevent
    # this from silently running with distortions, we do not allow positional
    # arguments.
    assert len(positional_arguments) >= 1
    if len(positional_arguments) > 1:
        raise ValueError('Received unknown positional arguments: %s' %
                         positional_arguments[1:])

    options = make_options_from_flags(FLAGS)

    params = benchmark_cnn.make_params_from_flags()
    params = params._replace(batch_size=options.batch_size)
    params = params._replace(model='MY_GTSRB')
    params = params._replace(num_epochs=options.num_epochs)
    params = params._replace(num_gpus=options.num_gpus)
    params = params._replace(data_format='NHWC')
    params = params._replace(train_dir=options.checkpoint_folder)
    params = params._replace(allow_growth=True)
    params = params._replace(variable_update='replicated')
    params = params._replace(local_parameter_device='gpu')
    params = params._replace(use_tf_layers=False)
    # params = params._replace(all_reduce_spec='nccl')

    # params = params._replace(bottom_file=options.bottom_file)
    # params = params._replace(affine_files=options.affine_files)
    # params = params._replace(affine_classes=options.affine_classes)

    params = params._replace(optimizer=options.optimizer)
    params = params._replace(weight_decay=options.weight_decay)

    #params = params._replace(print_training_accuracy=True)
    params = params._replace(backbone_model_path=options.backbone_model_path)
    # Summary and Save & load checkpoints.
    # params = params._replace(summary_verbosity=1)
    # params = params._replace(save_summaries_steps=10)
    # params = params._replace(save_model_secs=3600)  # save every 1 hour
    params = params._replace(save_model_secs=60)  # save every minute
    params = benchmark_cnn.setup(params)

    #testtest(params)
    #exit(0)

    if 'test' in options.data_dir:
        dataset = GTSRBTestDataset(options)
    else:
        dataset = GTSRBDataset(options)
    model = Model_Builder(options.model_name, dataset.num_classes, options,
                          params)

    bench = benchmark_cnn.BenchmarkCNN(params, dataset=dataset, model=model)

    tfversion = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow:  %i.%i' % (tfversion[0], tfversion[1]))

    bench.print_info()
    bench.run()

    tf.reset_default_graph()
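The long chain of params._replace(...) calls above works because the Params object returned by benchmark_cnn.make_params_from_flags() is a namedtuple-style immutable record: _replace returns a new copy with the given fields changed, so the result must be reassigned each time. A minimal standalone illustration:

from collections import namedtuple

# Toy stand-in for the benchmark Params record (the real one has many more fields).
Params = namedtuple('Params', ['batch_size', 'model', 'num_gpus'])

p = Params(batch_size=32, model='resnet50', num_gpus=1)
p = p._replace(batch_size=64, num_gpus=4)  # returns a new Params; p must be rebound
print(p)  # Params(batch_size=64, model='resnet50', num_gpus=4)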
Example #9
def main(extra_flags):
  # extra_flags is a list of command line arguments, excluding those defined
  # in tf.flags.FLAGS. extra_flags[0] is always the program name. It is an error
  # to supply flags not defined with tf.flags.FLAGS, so we raise a ValueError
  # in that case.
  assert len(extra_flags) >= 1
  if len(extra_flags) > 1:
    raise ValueError('Received unknown flags: %s' % extra_flags[1:])

  params = benchmark_cnn.make_params_from_flags()
  benchmark_cnn.setup(params)
  bench = benchmark_cnn.BenchmarkCNN(params)

  tfversion = cnn_util.tensorflow_version_tuple()
  log_fn('TensorFlow:  %i.%i' % (tfversion[0], tfversion[1]))

  bench.print_info()
  bench.run()
Example #10
def get_data(options, dataset=None, model_name='gtsrb', phase='train'):
  if dataset is None:
    if 'gtsrb' == model_name:
      import train_gtsrb
      if 'test' in options.data_dir:
        dataset = train_gtsrb.GTSRBTestDataset(options)
      else:
        dataset = train_gtsrb.GTSRBDataset(options)
    elif 'resnet101' in model_name:
      import train_megaface
      dataset = train_megaface.MegaFaceDataset(options)
    elif 'resnet50' == model_name:
      import train_imagenet
      dataset = train_imagenet.ImageNetDataset(options)
    elif 'cifar10' in model_name:
      import train_cifar10
      dataset = train_cifar10.CifarDataset(options)

  params = benchmark_cnn.make_params()
  params = params._replace(batch_size=options.batch_size)
  params = params._replace(model='MY_'+model_name)
  params = params._replace(num_epochs=options.num_epochs)
  params = params._replace(num_gpus=options.num_gpus)
  params = params._replace(data_format='NHWC')
  params = params._replace(allow_growth=True)
  params = params._replace(use_tf_layers=False)
  params = params._replace(forward_only=True)
  params = benchmark_cnn.setup(params)

  model = Model_Builder(model_name, dataset.num_classes, options, params)

  is_train = (phase=='train')
  p_class = dataset.get_input_preprocessor()
  preprocessor = p_class(options.batch_size,
                         model.get_input_shapes(phase),
                         options.batch_size,
                         model.data_type,
                         is_train,
                         distortions=params.distortions,
                         resize_method='bilinear')
  ds = preprocessor.create_dataset(batch_size=options.batch_size,
                                   num_splits=1,
                                   batch_size_per_split=options.batch_size,
                                   dataset=dataset,
                                   subset=phase,
                                   train=is_train,
                                   # datasets_repeat_cached_sample=params.datasets_repeat_cached_sample,
                                   datasets_repeat_cached_sample=False)
  ds_iter = preprocessor.create_iterator(ds)
  input_list = ds_iter.get_next()
  return model, dataset, input_list
Example #11
def main(_):
    # Build benchmark_cnn model
    params = benchmark_cnn.make_params_from_flags()
    params, sess_config = benchmark_cnn.setup(params)
    bench = benchmark_cnn.BenchmarkCNN(params)

    # Print information
    tfversion = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow:  %i.%i' % (tfversion[0], tfversion[1]))
    bench.print_info()

    # Build single-GPU benchmark_cnn model
    with tf.Graph().as_default() as single_gpu_graph:
        bench.build_model()

    def run(sess, num_iters, tensor_or_op_name_to_replica_names, num_workers,
            worker_id, num_replicas_per_worker):
        fetches = {
            'global_step':
            tensor_or_op_name_to_replica_names[bench.global_step.name][0],
            'cost':
            tensor_or_op_name_to_replica_names[bench.cost.name][0],
            'train_op':
            tensor_or_op_name_to_replica_names[bench.train_op.name][0],
        }
        if isinstance(bench.lr, tf.Tensor):
            fetches['lr'] = tensor_or_op_name_to_replica_names[
                bench.lr.name][0]

        start = time.time()
        for i in range(num_iters):
            results = sess.run(fetches)
            if i % FLAGS.log_frequency == 0:
                end = time.time()
                throughput = float(FLAGS.log_frequency) / float(end - start)
                parallax.log.info(
                    "global step: %d, lr: %f, loss: %f, "
                    "throughput: %f steps/sec" %
                    (results['global_step'], results['lr'] if 'lr' in results
                     else bench.lr, results['cost'], throughput))
                start = time.time()

    config = parallax_config.build_config()
    config.sess_config = sess_config

    parallax.parallel_run(single_gpu_graph,
                          run,
                          FLAGS.resource_info_file,
                          FLAGS.max_steps,
                          sync=FLAGS.sync,
                          parallax_config=config)
Example #12
def main(positional_arguments):
  assert len(positional_arguments) >= 1
  if len(positional_arguments) > 1:
    raise ValueError('Received unknown positional arguments: %s'
                     % positional_arguments[1:])

  options = make_options_from_flags(FLAGS)

  params = benchmark_cnn.make_params_from_flags()
  params = params._replace(batch_size=options.batch_size)
  params = params._replace(model='MY_GTSRB')
  params = params._replace(num_epochs=options.num_epochs)
  params = params._replace(num_gpus=options.num_gpus)
  params = params._replace(data_format='NHWC')
  params = params._replace(train_dir=options.checkpoint_folder)
  params = params._replace(allow_growth=True)
  params = params._replace(variable_update='replicated')
  params = params._replace(local_parameter_device='gpu')
  params = params._replace(use_tf_layers=False)
  # params = params._replace(all_reduce_spec='nccl')

  # params = params._replace(bottom_file=options.bottom_file)
  # params = params._replace(affine_files=options.affine_files)
  # params = params._replace(affine_classes=options.affine_classes)

  params = params._replace(optimizer=options.optimizer)
  params = params._replace(weight_decay=options.weight_decay)

  params = params._replace(print_training_accuracy=True)
  params = params._replace(backbone_model_path=options.backbone_model_path)
  # Summary and Save & load checkpoints.
  # params = params._replace(summary_verbosity=1)
  # params = params._replace(save_summaries_steps=10)
  params = params._replace(save_model_secs=3600)  # save every 1 hour
  # params = params._replace(save_model_secs=300) #save every 5 min
  params = benchmark_cnn.setup(params)

  dataset = CifarDataset(options)
  model = Model_Builder(options.model_name, dataset.num_classes, options, params)

  bench = benchmark_cnn.BenchmarkCNN(params, dataset=dataset, model=model)

  tfversion = cnn_util.tensorflow_version_tuple()
  log_fn('TensorFlow:  %i.%i' % (tfversion[0], tfversion[1]))

  bench.print_info()
  bench.run()
Example #13
def main(positional_arguments):
    # Command-line arguments like '--distortions False' are equivalent to
    # '--distortions=True False', where False is a positional argument. To prevent
    # this from silently running with distortions, we do not allow positional
    # arguments.
    assert len(positional_arguments) >= 1
    if len(positional_arguments) > 1:
        raise ValueError('Received unknown positional arguments: %s' %
                         positional_arguments[1:])

    params = benchmark_cnn.make_params_from_flags()
    params = benchmark_cnn.setup(params)
    bench = benchmark_cnn.BenchmarkCNN(params)

    tfversion = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow:  %i.%i' % (tfversion[0], tfversion[1]))

    run_benchmark(bench, absl_flags.FLAGS.iters_per_step)
Example #14
def main(positional_arguments):
    # Command-line arguments like '--distortions False' are equivalent to
    # '--distortions=True False', where False is a positional argument. To prevent
    # this from silently running with distortions, we do not allow positional
    # arguments.
    assert len(positional_arguments) >= 1
    if len(positional_arguments) > 1:
        raise ValueError("Received unknown positional arguments: %s" % positional_arguments[1:])

    params = benchmark_cnn.make_params_from_flags()
    with mlperf.mlperf_logger(absl_flags.FLAGS.ml_perf_compliance_logging, params.model):
        params = benchmark_cnn.setup(params)
        bench = benchmark_cnn.BenchmarkCNN(params)

    tfversion = cnn_util.tensorflow_version_tuple()
    log_fn("TensorFlow:  %i.%i" % (tfversion[0], tfversion[1]))

    bench.print_info()
    bench.run()
Example #15
def main(_):
    # Build benchmark_cnn model
    params = benchmark_cnn.make_params_from_flags()
    params, sess_config = benchmark_cnn.setup(params)
    bench = benchmark_cnn.BenchmarkCNN(params)

    # Print information
    tfversion = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow:  %i.%i' % (tfversion[0], tfversion[1]))
    bench.print_info()

    # Build single-GPU benchmark_cnn model
    single_gpu_graph = tf.Graph()
    with single_gpu_graph.as_default():
        bench.build_model()

    config = parallax_config.build_config()
    config.sess_config = sess_config

    sess, num_workers, worker_id, num_replicas_per_worker = \
        parallax.parallel_run(single_gpu_graph,
                              FLAGS.resource_info_file,
                              sync=FLAGS.sync,
                              parallax_config=config)

    fetches = {
        'global_step': bench.global_step,
        'cost': bench.cost,
        'train_op': bench.train_op,
    }

    start = time.time()
    for i in range(FLAGS.max_steps):
        results = sess.run(fetches)
        if (i + 1) % FLAGS.log_frequency == 0:
            end = time.time()
            throughput = float(FLAGS.log_frequency) / float(end - start)
            parallax.log.info(
                "global step: %d, loss: %f, throughput: %f steps/sec" %
                (results['global_step'][0] + 1, results['cost'][0],
                 throughput))
            start = time.time()
Example #16
    def _run_benchmark(self, params):
        """Run a CNN benchmark and report its results.

        Args:
          params: Params tuple, typically created by benchmark_cnn.make_params
            or benchmark_cnn.make_params_from_flags.
        """
        logging.info('Running benchmark [%s]', self._get_name())
        params = benchmark_cnn.setup(params)
        bench = benchmark_cnn.BenchmarkCNN(params)
        bench.print_info()
        stats = bench.run()
        extras = {}
        extras['examples_per_sec'] = stats.get('images_per_sec')
        if 'last_average_loss' in stats:
            extras['last_average_loss'] = stats['last_average_loss']
        if 'top_1_accuracy' in stats:
            extras['top_1_accuracy'] = stats['top_1_accuracy']
        if 'top_5_accuracy' in stats:
            extras['top_5_accuracy'] = stats['top_5_accuracy']
        self.report_benchmark(iters=stats.get('num_steps'),
                              wall_time=stats.get('average_wall_time'),
                              extras=extras)
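_run_benchmark above calls self.report_benchmark(), so it is written as a method of a tf.test.Benchmark-style harness. A minimal sketch of how such a method is typically invoked (the class and benchmark names are illustrative, and _run_benchmark is assumed to be defined on the same class as above):

import tensorflow as tf

class TfCnnBenchmarkSketch(tf.test.Benchmark):
    # _run_benchmark() from the example above is assumed to be defined here.

    def benchmark_synthetic_resnet50(self):
        # make_params() accepts keyword overrides of the Params fields.
        params = benchmark_cnn.make_params(model='resnet50',
                                           batch_size=32,
                                           num_batches=10)
        self._run_benchmark(params)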
Example #17
    def setUp(self):
        super(VariableUpdateTest, self).setUp()
        _check_has_gpu()
        benchmark_cnn.setup(benchmark_cnn.make_params())
Example #18
    def setUp(self):
        super(TfCnnBenchmarksModelTest, self).setUp()
        benchmark_cnn.setup(benchmark_cnn.make_params())
Example #19
    def setUp(self):
        super(TfCnnBenchmarksTest, self).setUp()
        _check_has_gpu()
        benchmark_cnn.setup(benchmark_cnn.make_params())
Example #20
def main(_):
  FLAGS.eval = True
  params = benchmark_cnn.make_params_from_flags()
  params, config = benchmark_cnn.setup(params)
  bench = benchmark_cnn.BenchmarkCNN(params)
  bench.evaluate()
Example #21
def train(train_args):
    """
    Train network
    train_args : dict
        Json dict with the user's configuration parameters.
        Can be loaded with json.loads() or with yaml.safe_load()    
    """

    run_results = {
        "status": "ok",
        "user_args": train_args,
        "machine_config": {},
        "training": {},
        "evaluation": {}
    }

    # Remove possible existing model and log files
    for f in os.listdir(cfg.MODELS_DIR):
        file_path = os.path.join(cfg.MODELS_DIR, f)
        try:
            if os.path.isfile(file_path):
                os.unlink(file_path)
        except Exception as e:
            print(e)

    # Declare training arguments
    kwargs = {
        'model': yaml.safe_load(train_args.model).split(' ')[0],
        'num_gpus': yaml.safe_load(train_args.num_gpus),
        'num_epochs': yaml.safe_load(train_args.num_epochs),
        'batch_size': yaml.safe_load(train_args.batch_size_per_device),
        'optimizer': yaml.safe_load(train_args.optimizer),
        'local_parameter_device': 'cpu',
        'variable_update': 'parameter_server'
    }

    # Locate training data and check if the selected network fits it
    # For real data check whether the right data was mounted to the right place and if not download it (cifar10 only)
    if yaml.safe_load(train_args.dataset) != 'Synthetic data':
        data_name = yaml.safe_load(train_args.dataset)
        if data_name == 'cifar10':
            locate_cifar10()
        if data_name == 'imagenet':
            locate_imagenet()

        kwargs['data_name'] = data_name
        if data_name == 'imagenet_mini':
            locate_imagenet_mini()
            kwargs['data_name'] = 'imagenet'
        verify_selected_model(kwargs['model'], kwargs['data_name'])
        kwargs['data_dir'] = '{}/{}'.format(cfg.DATA_DIR, data_name)
    else:
        verify_selected_model(kwargs['model'], 'imagenet')

    # If no GPU is available or the gpu option is set to 0 run CPU mode
    if num_local_gpus == 0 or kwargs['num_gpus'] == 0:
        kwargs['device'] = 'cpu'
        kwargs['data_format'] = 'NHWC'  # cpu data format
        # Important: TensorFlow also uses this to specify the number of CPUs
        kwargs['num_gpus'] = 1
    else:
        kwargs['device'] = 'gpu'
        kwargs['data_format'] = 'NCHW'

    # Add training info to run_results but not the directories
    run_results["training"].update(kwargs)
    if run_results["training"]["device"] == "cpu":
        del run_results["training"]["num_gpus"]  # avoid misleading info
    kwargs['train_dir'] = cfg.MODELS_DIR
    kwargs['benchmark_log_dir'] = cfg.MODELS_DIR

    # Setup and run the benchmark model
    params = benchmark.make_params(**kwargs)
    try:
        params = benchmark.setup(params)
        bench = benchmark.BenchmarkCNN(params)
    except ValueError as param_ex:
        raise BadRequest(
            "ValueError in parameter setup: {}. Params: {}".format(
                param_ex, params))

    tf_version = '.'.join(
        [str(x) for x in cnn_util.tensorflow_version_tuple()])
    run_results["training"]["tf_version"] = tf_version

    # Run benchmark and measure total execution time
    bench.print_info()
    start_time_global = datetime.datetime.now().strftime(time_fmt)
    try:
        bench.run()
    except ValueError as ve:
        raise BadRequest('ValueError in benchmark execution: {}'.format(ve))
    end_time_global = datetime.datetime.now().strftime(time_fmt)

    # Read training and metric log files and store training results
    training_file = '{}/training.log'.format(cfg.MODELS_DIR)
    os.rename('{}/benchmark_run.log'.format(cfg.MODELS_DIR), training_file)
    run_parameters, machine_config = parse_logfile_training(training_file)
    run_results['training'].update(run_parameters)
    run_results["machine_config"] = machine_config

    metric_file = '{}/metric.log'.format(cfg.MODELS_DIR)
    run_results['training']['result'] = {}
    run_results['training']['result']['global_start_time'] = start_time_global
    run_results['training']['result']['global_end_time'] = end_time_global
    start, end, avg_examples = parse_metric_file(metric_file)
    run_results["training"]["result"][
        "average_examples_per_sec"] = avg_examples
    run_results['training']['result']['execution_start_time'] = start
    run_results['training']['result']['execution_end_time'] = end

    ## Evaluation ##
    if yaml.safe_load(train_args.evaluation):
        run_results["evaluation"] = {}

        kwargs_eval = {
            'model': kwargs['model'],
            'num_gpus': kwargs['num_gpus'],
            'device': kwargs['device'],
            'data_format': kwargs['data_format'],
            'benchmark_log_dir': kwargs['benchmark_log_dir'],
            'train_dir': kwargs['train_dir'],
            'eval': True
            # 'eval_dir': cfg.DATA_DIR,
        }
        run_results['evaluation']['device'] = kwargs_eval['device']
        if run_results['evaluation']['device'] == 'gpu':
            run_results['evaluation']['num_gpus'] = kwargs_eval[
                'num_gpus']  # only for GPU to avoid confusion

        # Locate data
        if yaml.safe_load(train_args.dataset) != 'Synthetic data':
            kwargs_eval['data_name'] = kwargs['data_name']
            kwargs_eval['data_dir'] = kwargs['data_dir']

        # Setup and run the evaluation
        params_eval = benchmark.make_params(**kwargs_eval)
        try:
            params_eval = benchmark.setup(params_eval)
            evaluation = benchmark.BenchmarkCNN(params_eval)
        except ValueError as param_ex:
            raise BadRequest("ValueError: {}".format(param_ex))

        evaluation.print_info()
        start_time_global = datetime.datetime.now().strftime(time_fmt)
        evaluation.run()
        end_time_global = datetime.datetime.now().strftime(time_fmt)

        # Read log files and get evaluation results
        os.rename('{}/benchmark_run.log'.format(cfg.MODELS_DIR),
                  '{}/evaluation.log'.format(cfg.MODELS_DIR))
        evaluation_file = '{}/evaluation.log'.format(cfg.MODELS_DIR)
        run_parameters = parse_logfile_evaluation(evaluation_file)
        run_results['evaluation'].update(run_parameters)

        logfile = '{}/metric.log'.format(cfg.MODELS_DIR)
        run_results['evaluation']['result'] = {}
        run_results['evaluation']['result'][
            'global_start_time'] = start_time_global
        run_results['evaluation']['result'][
            'global_end_time'] = end_time_global

        with open(logfile, "r") as f:
            for line in f:
                l = json.loads(line)
                if l["name"] == "eval_average_examples_per_sec":
                    run_results["evaluation"]['result'][
                        "average_examples_per_sec"] = l["value"]
                if l["name"] == "eval_top_1_accuracy":
                    run_results["evaluation"]['result']["top_1_accuracy"] = l[
                        "value"]
                if l["name"] == "eval_top_5_accuracy":
                    run_results["evaluation"]['result']["top_5_accuracy"] = l[
                        "value"]

    return run_results
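A purely hypothetical invocation sketch for train() above: judging from the yaml.safe_load(...) calls, train_args is an object whose attributes hold YAML-encoded strings; the field values below are made up for illustration.

from types import SimpleNamespace

train_args = SimpleNamespace(
    model='resnet50 (ImageNet)',      # yaml.safe_load() keeps the string, split() takes 'resnet50'
    num_gpus='1',
    num_epochs='1',
    batch_size_per_device='64',
    optimizer='sgd',
    dataset='Synthetic data',         # skips the data-locating branch
    evaluation='false',               # YAML 'false' -> False, so no evaluation pass
)

run_results = train(train_args)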
Example #22
def train(train_args, kwargs, run_results):
    """Function for training and evalution used in the "pro" flavor
    Example of run_results, fields filled by this function:
    
    {
      "machine_config": {}, # filled in deep_api.py
      "benchmark": {}, # filled in deep_api.py
      "training": {
        "allow_growth": true,
        "batch_size": 64,
        "batch_size_per_device": 64,
        "data_format": "NCHW",
        "device": "gpu",
        "local_parameter_device": "cpu",
        "model": "resnet50",
        "num_batches": 100,
        "num_epochs": 0,
        "num_gpus": 1,
        "optimizer": "sgd",
        "use_fp16": false,
        "variable_update": "parameter_server",
        "weight_decay": 0.00004,
        "result": {
          "average_examples_per_sec": 124.41983172966508,
          "execution_start_time": "2021-02-10T22:59:17.434987Z",
          "execution_end_time": "2021-02-10T23:00:08.358017Z",
          "execution_time_sec": 50.92302989959717
        }
      },
      "evaluation": {
        "batch_size": 64,
        "batch_size_per_device": 64,
        "data_format": "NCHW",
        "device": "gpu",
        "model": "resnet50",
        "num_batches": 100,
        "num_gpus": 1,
        "result": {
          "average_examples_per_sec": 401.17907755615994,
          "top_1_accuracy": 0.0015625,
          "top_5_accuracy": 0.00609375
        }
      },
      ...
    }        
    """

    # Add more training arguments
    kwargs['batch_size'] = train_args['batch_size_per_device']
    kwargs['model'] = train_args['model'].split(' ')[0]
    kwargs['weight_decay'] = train_args['weight_decay']

    # Log additional arguments in run_results[]
    run_results['training']['models'].append(kwargs['model'])
    run_results["training"]['num_epochs'] = kwargs['num_epochs']
    run_results['training']['weight_decay'] = kwargs['weight_decay']

    # Check if the selected network fits the dataset
    dataset_name = (kwargs['data_name']
                    if 'data_name' in kwargs.keys() else 'synthetic_data')
    if dataset_name != 'synthetic_data':
        mutils.verify_selected_model(kwargs['model'], kwargs['data_name'])
    else:
        mutils.verify_selected_model(kwargs['model'], 'imagenet')

    # Create Train_Run_Dir to store training data
    Train_Run_Dir, _ = mutils.create_train_run_dir(kwargs)
    kwargs['train_dir'] = Train_Run_Dir
    kwargs['benchmark_log_dir'] = Train_Run_Dir

    # Log training directories, if they are not deleted later
    if not train_args['if_cleanup']:
        run_results['training']['train_dir'] = kwargs['train_dir']
        run_results['training']['benchmark_log_dir'] = kwargs[
            'benchmark_log_dir']

    # Setup and run the benchmark model
    print("[DEBUG] benchmark kwargs: %s" % (kwargs)) if cfg.DEBUG_MODEL else ''
    params = benchmark.make_params(**kwargs)
    try:
        params = benchmark.setup(params)
        bench = benchmark.BenchmarkCNN(params)
    except ValueError as param_ex:
        raise BadRequest(
            "ValueError in parameter setup: {}. Params: {}".format(
                param_ex, params))

    # Run benchmark for Training
    bench.print_info()
    try:
        bench.run()
    except ValueError as ve:
        raise BadRequest('ValueError in benchmark execution: {}'.format(ve))

    # Read training and metric log files and store training results
    training_file = os.path.join(Train_Run_Dir, 'training.log')
    os.rename(os.path.join(Train_Run_Dir, 'benchmark_run.log'), training_file)
    run_parameters = mutils.parse_logfile_training(training_file)
    run_results['training'].update(run_parameters)

    # sort the dictionary alphabetically
    run_results['training'] = OrderedDict(
        sorted(run_results['training'].items(), key=lambda t: t[0]))

    metric_file = os.path.join(Train_Run_Dir, 'metric.log')
    # It seems that, with synthetic_data, we need a delay until metric.log is closed
    mutils.wait_final_read(metric_file, "average_examples_per_sec")
    start, end, avg_examples = mutils.parse_metric_file(metric_file)
    run_results['training']['result'] = {}
    run_results["training"]["result"][
        "average_examples_per_sec"] = avg_examples
    run_results['training']['result']['execution_start_time'] = start
    run_results['training']['result']['execution_end_time'] = end
    start_sec = mutils.timestr_to_stamp(start, cfg.TIME_FORMAT)
    end_sec = mutils.timestr_to_stamp(end, cfg.TIME_FORMAT)
    run_results['training']['result'][
        'execution_time_sec'] = end_sec - start_sec

    ## Evaluation ##
    if train_args['evaluation']:
        run_results["evaluation"] = {}

        kwargs_eval = {
            'model': kwargs['model'],
            'num_gpus': kwargs['num_gpus'],
            'device': kwargs['device'],
            'data_format': kwargs['data_format'],
            'benchmark_log_dir': kwargs['benchmark_log_dir'],
            'train_dir': kwargs['train_dir'],
            'eval': True
            # 'eval_dir': Eval_Dir,
        }

        if kwargs_eval['device'] == 'cpu':
            kwargs_eval['batch_size'] = cfg.BATCH_SIZE_CPU

        run_results['evaluation']['device'] = kwargs_eval['device']
        if run_results['evaluation']['device'] == 'gpu':
            run_results['evaluation']['num_gpus'] = kwargs_eval[
                'num_gpus']  # only for GPU to avoid confusion

        # Locate data
        if dataset_name != 'synthetic_data':
            kwargs_eval['data_name'] = kwargs['data_name']
            kwargs_eval['data_dir'] = kwargs['data_dir']

        # Setup and run the evaluation
        params_eval = benchmark.make_params(**kwargs_eval)
        try:
            params_eval = benchmark.setup(params_eval)
            evaluation = benchmark.BenchmarkCNN(params_eval)
        except ValueError as param_ex:
            raise BadRequest("ValueError: {}".format(param_ex))

        evaluation.print_info()
        evaluation.run()

        # Read log files and get evaluation results
        evaluation_file = os.path.join(Train_Run_Dir, 'evaluation.log')
        os.rename(os.path.join(Train_Run_Dir, 'benchmark_run.log'),
                  evaluation_file)
        run_parameters = mutils.parse_logfile_evaluation(evaluation_file)
        run_results['evaluation'].update(run_parameters)

        # sort the dictionary alphabetically
        run_results['evaluation'] = OrderedDict(
            sorted(run_results['evaluation'].items(), key=lambda t: t[0]))

        logfile = os.path.join(Train_Run_Dir, 'metric.log')
        run_results['evaluation']['result'] = {}

        # It seems that, with synthetic_data, we need a delay until evaluation.log is closed
        mutils.wait_final_read(logfile, "eval_average_examples_per_sec")

        with open(logfile, "r") as f:
            for line in f:
                l = json.loads(line)
                if l["name"] == "eval_average_examples_per_sec":
                    run_results["evaluation"]['result'][
                        "average_examples_per_sec"] = l["value"]
                if l["name"] == "eval_top_1_accuracy":
                    run_results["evaluation"]['result']["top_1_accuracy"] = l[
                        "value"]
                if l["name"] == "eval_top_5_accuracy":
                    run_results["evaluation"]['result']["top_5_accuracy"] = l[
                        "value"]

    if train_args['if_cleanup']:
        shutil.rmtree(Train_Run_Dir)
Example #23
    def setUp(self):
        super(MlPerfComplianceTest, self).setUp()
        benchmark_cnn.setup(benchmark_cnn.make_params())
Example #24
def main(positional_arguments):
    # Command-line arguments like '--distortions False' are equivalent to
    # '--distortions=True False', where False is a positional argument. To prevent
    # this from silently running with distortions, we do not allow positional
    # arguments.

    # For DGX servers use hierarchical_copy=True argument

    assert len(positional_arguments) >= 1
    if len(positional_arguments) > 1:
        raise ValueError('Received unknown positional arguments: %s' %
                         positional_arguments[1:])

    tests_models = [
        {
            'num_gpus': None,
            'batch_size': 64,
            'variable_update': 'parameter_server',
            'model': 'inception3'
        },
        {
            'num_gpus': None,
            'batch_size': 64,
            'variable_update': 'parameter_server',
            'model': 'resnet50'
        },
        {
            'num_gpus': None,
            'batch_size': 32,
            'variable_update': 'parameter_server',
            'model': 'resnet152'
        },  #batch=64 crashes
        {
            'num_gpus': None,
            'batch_size': 64,
            'variable_update': 'replicated',
            'model': 'vgg16'
        },
        {
            'num_gpus': None,
            'batch_size': 512,
            'variable_update': 'replicated',
            'model': 'alexnet'
        }
    ]

    test_gpus = [1, 2, 4, 8]

    stats = []
    for test in tests_models:
        for num_gpus in test_gpus:
            test['num_gpus'] = num_gpus

            params = benchmark_cnn.make_params_from_flags()
            params = benchmark_cnn.setup(params)

            # force --hierarchical_copy to False when using 1 GPU
            if num_gpus == 1:
                params = params._replace(hierarchical_copy=False)

            params = params._replace(num_gpus=test['num_gpus'],
                                     batch_size=test['batch_size'],
                                     model=test['model'],
                                     variable_update=test['variable_update'])

            bench = benchmark_cnn.BenchmarkCNN(params)

            tfversion = cnn_util.tensorflow_version_tuple()
            log_fn('TensorFlow:  %i.%i' % (tfversion[0], tfversion[1]))

            bench.print_info()
            results = bench.run()
            # result
            # {
            #     'average_wall_time': 0.6646941304206848,
            #     'images_per_sec': 385.1395525908701,
            #     'last_average_loss': 7.256145,
            #     'num_steps': 100,
            #     'num_workers': 1
            # }
            stats.append({'test': test.copy(), 'result': results})

    # summary
    print('summary:')
    print('==========')
    pprint.pprint(stats)

    print('==========')
    s = ''
    for i in range(len(test_gpus)):
        for j in range(len(tests_models)):
            s += str(stats[i + j * len(test_gpus)]['result']['images_per_sec'])
            s += ', '
        s += '\n'
    print(s)
    print('==========')
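The flat stats list above is filled model-major (outer loop over tests_models, inner loop over test_gpus), so stats[i + j * len(test_gpus)] is the result of tests_models[j] on test_gpus[i] GPUs. An equivalent, labeled variant of the summary loop, reusing the variables from the example above:

for j, test in enumerate(tests_models):
    # One row per model, keyed by GPU count.
    row = [stats[i + j * len(test_gpus)]['result']['images_per_sec']
           for i in range(len(test_gpus))]
    print(test['model'], dict(zip(test_gpus, row)))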
Example #25
def train(kwargs, run_results):
    """Function to perform training in the case of 
    'synthetic'/'dataset' flavor.
    Updates run_results{}
    """

    cnn_score = 0.
    # sort the dictionary alphabetically
    run_results['training'] = OrderedDict(
        sorted(run_results['training'].items(), key=lambda t: t[0]))

    # calculate "GPU memory scale" for the batch_size
    num_local_gpus, gpu_model, gpu_memory = mutils.get_available_gpus()
    m4gb = 4000000000.
    if kwargs['device'] == 'gpu':
        quotient = gpu_memory // m4gb
        remainder = gpu_memory % m4gb
        rest = remainder / m4gb

        if rest > 0.4 and rest <= 0.75:
            memory_scale = quotient + 0.5
        elif rest > 0.75:
            memory_scale = quotient + 1
        else:
            memory_scale = quotient
    else:
        memory_scale = 1.

    print("[DEBUG] GPU Memory scale = {}".format(memory_scale))
    # Setup and run the benchmark model
    for model, batch_size in cfg.MODELS.items():
        print()
        print("[INFO] Testing {} model ...".format(model))

        kwargs['model'] = model
        # in the CPU case, use the smaller cfg.BATCH_SIZE_CPU
        if kwargs['device'] == 'gpu':
            kwargs['batch_size'] = int(batch_size * memory_scale)
        else:
            kwargs['batch_size'] = cfg.BATCH_SIZE_CPU

        # Check if the selected network fits the dataset
        if 'data_name' in kwargs.keys():
            if kwargs['data_name'] != 'synthetic_data':
                mutils.verify_selected_model(kwargs['model'],
                                             kwargs['data_name'])
        else:
            mutils.verify_selected_model(kwargs['model'], 'imagenet')

        # Create Train_Run_Dir to store training data.
        # In the 'benchmark' case, we do not log directory names
        Train_Run_Dir, _ = mutils.create_train_run_dir(kwargs)
        kwargs['train_dir'] = Train_Run_Dir
        kwargs['benchmark_log_dir'] = Train_Run_Dir

        print("[DEBUG] benchmark kwargs: %s" %
              (kwargs)) if cfg.DEBUG_MODEL else ''
        params = benchmark.make_params(**kwargs)
        try:
            params = benchmark.setup(params)
            bench = benchmark.BenchmarkCNN(params)
        except ValueError as param_ex:
            raise BadRequest(
                "ValueError in parameter setup: {}. Params: {}".format(
                    param_ex, params))

        # Run benchmark and measure total execution time
        bench.print_info()

        try:
            bench.run()
        except ValueError as ve:
            raise BadRequest(
                'ValueError in benchmark execution: {}'.format(ve))

        # Read training and metric log files and store training results
        training_file = os.path.join(Train_Run_Dir, 'training.log')
        os.rename(os.path.join(Train_Run_Dir, 'benchmark_run.log'),
                  training_file)
        run_parameters = mutils.parse_logfile_training(training_file)

        metric_file = os.path.join(Train_Run_Dir, 'metric.log')
        # It seems that, with synthetic_data, we need a delay until metric.log is closed
        mutils.wait_final_read(metric_file, "average_examples_per_sec")
        run_results['training']['models'].append(kwargs['model'])
        run_results['training'][model] = {}
        run_results['training'][model].update(run_parameters)
        run_results['training'][model]['num_epochs'] = kwargs['num_epochs']
        start, end, avg_examples = mutils.parse_metric_file(metric_file)
        print(start, end, avg_examples)
        cnn_score += avg_examples
        start = mutils.timestr_to_stamp(start, cfg.TIME_FORMAT)
        end = mutils.timestr_to_stamp(end, cfg.TIME_FORMAT)
        run_results["training"][model][
            "average_examples_per_sec"] = avg_examples
        run_results['training'][model]['execution_time_sec'] = end - start

        # if_cleanup = true: delete training directory
        if cfg.IF_CLEANUP:
            shutil.rmtree(Train_Run_Dir)

    run_results['training']['score'] = cnn_score
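For reference, the GPU-memory scaling rule computed near the top of this example can be written as a small standalone helper; a sketch only, assuming gpu_memory is reported in bytes as above:

def memory_scale_for(gpu_memory_bytes, unit=4e9):
    # Multiples of 4 GB, with the fractional part rounded to 0, 0.5 or 1,
    # mirroring the quotient/remainder logic in the example above.
    quotient, remainder = divmod(gpu_memory_bytes, unit)
    rest = remainder / unit
    if 0.4 < rest <= 0.75:
        return quotient + 0.5
    return quotient + 1 if rest > 0.75 else quotient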