Ejemplo n.º 1
0
def main(positional_arguments):
    """Run the 'MY_GTSRB' model through the benchmark_cnn driver.

    Flag-derived parameters are overridden with values taken from the
    options object, a GTSRB dataset is selected based on the data directory
    name, and the resulting benchmark is executed.

    Args:
        positional_arguments: Argument list left over after flag parsing.
            At least one element is required; anything past the first
            element is rejected.

    Raises:
        ValueError: If more than one positional argument is present.
    """
    # A flag written as '--distortions False' parses as '--distortions=True'
    # followed by the positional argument 'False'. Rejecting stray
    # positionals keeps such a typo from silently enabling the flag.
    assert len(positional_arguments) >= 1
    unexpected = positional_arguments[1:]
    if unexpected:
        raise ValueError('Received unknown positional arguments: %s' %
                         unexpected)

    options = make_options_from_flags(FLAGS)

    # Apply every experiment-specific override in one replacement.
    # Overrides that are currently disabled: all_reduce_spec='nccl',
    # bottom_file / affine_files / affine_classes from options,
    # print_training_accuracy, summary_verbosity and save_summaries_steps.
    params = benchmark_cnn.make_params_from_flags()._replace(
        batch_size=options.batch_size,
        model='MY_GTSRB',
        num_epochs=options.num_epochs,
        num_gpus=options.num_gpus,
        data_format='NHWC',
        train_dir=options.checkpoint_folder,
        allow_growth=True,
        variable_update='replicated',
        local_parameter_device='gpu',
        use_tf_layers=False,
        optimizer=options.optimizer,
        weight_decay=options.weight_decay,
        backbone_model_path=options.backbone_model_path,
        save_model_secs=60,  # checkpoint interval of 60 seconds
    )
    params = benchmark_cnn.setup(params)

    # A data directory whose path contains 'test' selects the test dataset.
    dataset_cls = (GTSRBTestDataset
                   if 'test' in options.data_dir else GTSRBDataset)
    dataset = dataset_cls(options)
    model = Model_Builder(options.model_name, dataset.num_classes, options,
                          params)

    bench = benchmark_cnn.BenchmarkCNN(params, dataset=dataset, model=model)

    tf_major_minor = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow:  %i.%i' % (tf_major_minor[0], tf_major_minor[1]))

    bench.print_info()
    bench.run()

    # Clear the default graph once the run has finished.
    tf.reset_default_graph()
Ejemplo n.º 2
0
def main(positional_arguments):
    """Run the CNN benchmark inside the ``log_context`` context manager.

    Args:
        positional_arguments: Argument list left over after flag parsing.
            At least one element is required; anything past the first
            element is rejected.

    Raises:
        ValueError: If more than one positional argument is present.
    """
    # A flag written as '--distortions False' parses as '--distortions=True'
    # followed by the positional argument 'False'. Rejecting stray
    # positionals keeps such a typo from silently enabling the flag.
    assert len(positional_arguments) >= 1
    unexpected = positional_arguments[1:]
    if unexpected:
        raise ValueError('Received unknown positional arguments: %s' %
                         unexpected)

    params = benchmark_cnn.setup(benchmark_cnn.make_params_from_flags())
    bench = benchmark_cnn.BenchmarkCNN(params)

    version = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow:  %i.%i' % (version[0], version[1]))

    bench.print_info()

    # Only the run itself happens while the logging context is open.
    # NOTE(review): LOGGER_* are module-level settings defined elsewhere;
    # presumably they identify a remote metrics store - confirm.
    logging_scope = log_context(LOGGER_URL,
                                LOGGER_USRENAME,
                                LOGGER_PASSWORD,
                                LOGGER_DB,
                                LOGGER_SERIES,
                                machine=LOGGER_VM)
    with logging_scope:
        bench.run()
Ejemplo n.º 3
0
def main(positional_arguments):
  """Run the CNN benchmark, loading the DMO file system first if enabled.

  Args:
    positional_arguments: Argument list left over after flag parsing. At
      least one element is required; anything past the first is rejected.

  Raises:
    ValueError: If more than one positional argument is present.
    SystemExit: With status -1 when DMO is enabled and ``LoadFileSystem()``
      returns a value equal to ``False``.
  """
  # A flag written as '--distortions False' parses as '--distortions=True'
  # followed by the positional argument 'False'. Rejecting stray positionals
  # keeps such a typo from silently enabling the flag.
  assert len(positional_arguments) >= 1
  unexpected = positional_arguments[1:]
  if unexpected:
    raise ValueError('Received unknown positional arguments: %s'
                     % unexpected)

  params = benchmark_cnn.setup(benchmark_cnn.make_params_from_flags())

  import sys
  # The explicit comparisons against True/False are intentional: only those
  # exact values (or values equal to them) take the respective branch.
  if params.enable_dmo == True:
    load_succeeded = LoadFileSystem()
    if load_succeeded == False:
      sys.exit(-1)
    print("\n*******DMO enabled********\n")

  bench = benchmark_cnn.BenchmarkCNN(params)

  version = cnn_util.tensorflow_version_tuple()
  log_fn('TensorFlow:  %i.%i' % (version[0], version[1]))

  bench.print_info()
  bench.run()
Ejemplo n.º 4
0
def main(positional_arguments):
    """Run the CNN benchmark after printing threading/prefetch settings.

    Args:
        positional_arguments: Argument list left over after flag parsing.
            At least one element is required; anything past the first
            element is rejected.

    Raises:
        ValueError: If more than one positional argument is present.
    """
    # A flag written as '--distortions False' parses as '--distortions=True'
    # followed by the positional argument 'False'. Rejecting stray
    # positionals keeps such a typo from silently enabling the flag.
    assert len(positional_arguments) >= 1
    unexpected = positional_arguments[1:]
    if unexpected:
        raise ValueError('Received unknown positional arguments: %s' %
                         unexpected)

    params = benchmark_cnn.setup(benchmark_cnn.make_params_from_flags())
    bench = benchmark_cnn.BenchmarkCNN(params)

    version = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow:  %i.%i' % (version[0], version[1]))

    bench.print_info()

    # Echo the thread-pool and input-pipeline parameters, one per line, in
    # the form '<name>: <value>'.
    reported_fields = (
        'num_inter_threads',
        'num_intra_threads',
        'datasets_num_private_threads',
        'datasets_use_prefetch',
        'datasets_prefetch_buffer_size',
    )
    for field in reported_fields:
        print(field + ': ' + str(getattr(params, field)))

    bench.run()
Ejemplo n.º 5
0
def main(positional_arguments):
    """Run the CNN benchmark entirely inside the mlperf logger context.

    All environment variables are written to the debug log before the
    benchmark is set up.

    Args:
        positional_arguments: Argument list left over after flag parsing.
            At least one element is required; anything past the first
            element is rejected.

    Raises:
        ValueError: If more than one positional argument is present.
    """
    # A flag written as '--distortions False' parses as '--distortions=True'
    # followed by the positional argument 'False'. Rejecting stray
    # positionals keeps such a typo from silently enabling the flag.
    assert len(positional_arguments) >= 1
    unexpected = positional_arguments[1:]
    if unexpected:
        raise ValueError('Received unknown positional arguments: %s' %
                         unexpected)

    params = benchmark_cnn.make_params_from_flags()

    # Dump the process environment at debug level, framed by a banner.
    rule = '=' * 20
    tf.logging.debug(rule + ' Environment Variables ' + rule)
    for name, value in os.environ.items():
        tf.logging.debug('{}: {}'.format(name, value))

    # Setup, construction and the run all happen while the logger is open.
    compliance_logger = mlperf.mlperf_logger(
        absl_flags.FLAGS.ml_perf_compliance_logging, params.model)
    with compliance_logger:
        params = benchmark_cnn.setup(params)
        bench = benchmark_cnn.BenchmarkCNN(params)

        version = cnn_util.tensorflow_version_tuple()
        log_fn('TensorFlow:  %i.%i' % (version[0], version[1]))

        bench.print_info()
        bench.run()
Ejemplo n.º 6
0
def main(_):
  """Call ``setup()``, then build, describe and run a ``BenchmarkCNN``.

  Args:
    _: Ignored.
  """
  setup()
  bench = BenchmarkCNN()

  # Only the first two components of the version tuple are reported.
  version = cnn_util.tensorflow_version_tuple()
  major = version[0]
  minor = version[1]
  log_fn('TensorFlow:  %i.%i' % (major, minor))

  bench.print_info()
  bench.run()
Ejemplo n.º 7
0
def main(_):
    """Build a ``BenchmarkCNN`` from flag-derived parameters and run it.

    Args:
        _: Ignored.
    """
    params = benchmark_cnn.setup(benchmark_cnn.make_params_from_flags())
    bench = benchmark_cnn.BenchmarkCNN(params)

    # Only the first two components of the version tuple are reported.
    version = cnn_util.tensorflow_version_tuple()
    major = version[0]
    minor = version[1]
    log_fn('TensorFlow:  %i.%i' % (major, minor))

    bench.print_info()
    bench.run()
def main(_):
    """Build the benchmark_cnn model in its own graph and run it via Parallax.

    The graph and the nested ``run`` callback are handed to
    ``parallax.parallel_run`` together with the resource file, the step
    budget and the sync mode taken from ``FLAGS``.

    Args:
        _: Ignored.
    """
    # Build benchmark_cnn model
    params = benchmark_cnn.make_params_from_flags()
    params, sess_config = benchmark_cnn.setup(params)
    bench = benchmark_cnn.BenchmarkCNN(params)

    # Print information
    tfversion = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow:  %i.%i' % (tfversion[0], tfversion[1]))
    bench.print_info()

    # Build single-GPU benchmark_cnn model
    with tf.Graph().as_default() as single_gpu_graph:
        bench.build_model()

    def run(sess, num_iters, tensor_or_op_name_to_replica_names, num_workers,
            worker_id, num_replicas_per_worker):
        """Execute ``num_iters`` training steps and log progress.

        Args:
            sess: Session object used to run the fetches.
            num_iters: Number of training steps to execute.
            tensor_or_op_name_to_replica_names: Mapping from an original
                tensor/op name to its replica names; the first replica of
                each entry is fetched.
            num_workers: Unused here.
            worker_id: Unused here.
            num_replicas_per_worker: Unused here.
        """
        fetches = {
            'global_step':
            tensor_or_op_name_to_replica_names[bench.global_step.name][0],
            'cost':
            tensor_or_op_name_to_replica_names[bench.cost.name][0],
            'train_op':
            tensor_or_op_name_to_replica_names[bench.train_op.name][0],
        }
        # The learning rate is fetched only when it is a tensor; otherwise
        # bench.lr itself is logged.
        if isinstance(bench.lr, tf.Tensor):
            fetches['lr'] = tensor_or_op_name_to_replica_names[
                bench.lr.name][0]

        steps_since_log = 0
        start = time.time()
        for i in range(num_iters):
            results = sess.run(fetches)
            steps_since_log += 1
            if i % FLAGS.log_frequency == 0:
                end = time.time()
                # Divide by the steps actually executed in this window. The
                # first window (i == 0) contains a single step, so dividing
                # FLAGS.log_frequency by the elapsed time would overstate
                # the throughput by a factor of log_frequency.
                throughput = float(steps_since_log) / float(end - start)
                parallax.log.info(
                    "global step: %d, lr: %f, loss: %f, "
                    "throughput: %f steps/sec" %
                    (results['global_step'], results['lr'] if 'lr' in results
                     else bench.lr, results['cost'], throughput))
                steps_since_log = 0
                start = time.time()

    config = parallax_config.build_config()
    config.sess_config = sess_config

    parallax.parallel_run(single_gpu_graph,
                          run,
                          FLAGS.resource_info_file,
                          FLAGS.max_steps,
                          sync=FLAGS.sync,
                          parallax_config=config)
Ejemplo n.º 9
0
def main(positional_arguments):
  """Run the 'MY_GTSRB'-configured benchmark on a ``CifarDataset``.

  Flag-derived parameters are overridden with values from the options
  object before the benchmark is built and executed.

  Args:
    positional_arguments: Argument list left over after flag parsing. At
      least one element is required; anything past the first is rejected.

  Raises:
    ValueError: If more than one positional argument is present.
  """
  assert len(positional_arguments) >= 1
  unexpected = positional_arguments[1:]
  if unexpected:
    raise ValueError('Received unknown positional arguments: %s'
                     % unexpected)

  options = make_options_from_flags(FLAGS)

  # Apply every experiment-specific override in one replacement.
  # Overrides that are currently disabled: all_reduce_spec='nccl',
  # bottom_file / affine_files / affine_classes from options,
  # summary_verbosity, save_summaries_steps and a 300-second checkpoint
  # interval.
  params = benchmark_cnn.make_params_from_flags()._replace(
      batch_size=options.batch_size,
      model='MY_GTSRB',
      num_epochs=options.num_epochs,
      num_gpus=options.num_gpus,
      data_format='NHWC',
      train_dir=options.checkpoint_folder,
      allow_growth=True,
      variable_update='replicated',
      local_parameter_device='gpu',
      use_tf_layers=False,
      optimizer=options.optimizer,
      weight_decay=options.weight_decay,
      print_training_accuracy=True,
      backbone_model_path=options.backbone_model_path,
      save_model_secs=3600,  # checkpoint interval of one hour
  )
  params = benchmark_cnn.setup(params)

  dataset = CifarDataset(options)
  model = Model_Builder(
      options.model_name, dataset.num_classes, options, params)

  bench = benchmark_cnn.BenchmarkCNN(params, dataset=dataset, model=model)

  tf_major_minor = cnn_util.tensorflow_version_tuple()
  log_fn('TensorFlow:  %i.%i' % (tf_major_minor[0], tf_major_minor[1]))

  bench.print_info()
  bench.run()
Ejemplo n.º 10
0
def main(positional_arguments):
    """Build a ``BenchmarkCNN`` and hand it to ``run_benchmark``.

    Args:
        positional_arguments: Argument list left over after flag parsing.
            At least one element is required; anything past the first
            element is rejected.

    Raises:
        ValueError: If more than one positional argument is present.
    """
    # A flag written as '--distortions False' parses as '--distortions=True'
    # followed by the positional argument 'False'. Rejecting stray
    # positionals keeps such a typo from silently enabling the flag.
    assert len(positional_arguments) >= 1
    unexpected = positional_arguments[1:]
    if unexpected:
        raise ValueError('Received unknown positional arguments: %s' %
                         unexpected)

    params = benchmark_cnn.setup(benchmark_cnn.make_params_from_flags())
    bench = benchmark_cnn.BenchmarkCNN(params)

    version = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow:  %i.%i' % (version[0], version[1]))

    # The iters_per_step flag is forwarded unchanged to the runner.
    iters_per_step = absl_flags.FLAGS.iters_per_step
    run_benchmark(bench, iters_per_step)
def main(_):
    """Export TensorFlow tuning variables from FLAGS, then run BenchmarkCNN.

    Environment variables written:
      * TF_ENABLE_WINOGRAD_NONFUSED: set to '1' or removed, depending on
        ``FLAGS.winograd_nonfused``.
      * TF_AUTOTUNE_THRESHOLD: set only when ``FLAGS.autotune_threshold``
        is truthy.
      * TF_SYNC_ON_FINISH: always set, to '0' or '1'.

    Args:
        _: Ignored.
    """
    if not FLAGS.winograd_nonfused:
        os.environ.pop('TF_ENABLE_WINOGRAD_NONFUSED', None)
    else:
        os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    autotune_threshold = FLAGS.autotune_threshold
    if autotune_threshold:
        os.environ['TF_AUTOTUNE_THRESHOLD'] = str(autotune_threshold)

    sync_flag = int(FLAGS.sync_on_finish)
    os.environ['TF_SYNC_ON_FINISH'] = str(sync_flag)

    # NOTE(review): this parser is constructed and immediately discarded;
    # it looks like leftover code - confirm before removing.
    argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    bench = BenchmarkCNN()

    version = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow:  %i.%i' % (version[0], version[1]))

    bench.print_info()
    bench.run()
Ejemplo n.º 12
0
def main(extra_flags):
  """Build a ``BenchmarkCNN`` from flag-derived parameters and run it.

  Args:
    extra_flags: Command-line arguments that are not defined in
      tf.flags.FLAGS. The first element is always the program name, so any
      further element is an undefined flag.

  Raises:
    ValueError: If any flag beyond the program name was supplied.
  """
  assert len(extra_flags) >= 1
  unknown_flags = extra_flags[1:]
  if unknown_flags:
    raise ValueError('Received unknown flags: %s' % unknown_flags)

  params = benchmark_cnn.make_params_from_flags()
  # The return value of setup() is not used here; the benchmark is built
  # from the params object created above.
  benchmark_cnn.setup(params)
  bench = benchmark_cnn.BenchmarkCNN(params)

  version = cnn_util.tensorflow_version_tuple()
  log_fn('TensorFlow:  %i.%i' % (version[0], version[1]))

  bench.print_info()
  bench.run()
def main(positional_arguments):
    """Set up the benchmark inside the mlperf logger context, then run it.

    Args:
        positional_arguments: Argument list left over after flag parsing.
            At least one element is required; anything past the first
            element is rejected.

    Raises:
        ValueError: If more than one positional argument is present.
    """
    # A flag written as '--distortions False' parses as '--distortions=True'
    # followed by the positional argument 'False'. Rejecting stray
    # positionals keeps such a typo from silently enabling the flag.
    assert len(positional_arguments) >= 1
    unexpected = positional_arguments[1:]
    if unexpected:
        raise ValueError("Received unknown positional arguments: %s" % unexpected)

    params = benchmark_cnn.make_params_from_flags()

    # Only setup and construction happen while the mlperf logger is open.
    # NOTE(review): print_info() and run() execute after the context has
    # closed - confirm that compliance logging is not needed during the run.
    compliance_logger = mlperf.mlperf_logger(
        absl_flags.FLAGS.ml_perf_compliance_logging, params.model)
    with compliance_logger:
        params = benchmark_cnn.setup(params)
        bench = benchmark_cnn.BenchmarkCNN(params)

    version = cnn_util.tensorflow_version_tuple()
    log_fn("TensorFlow:  %i.%i" % (version[0], version[1]))

    bench.print_info()
    bench.run()
def main(_):
    """Build the benchmark_cnn model and train it in a Parallax session.

    ``parallax.parallel_run`` returns a session plus worker information; the
    training loop then runs ``FLAGS.max_steps`` steps on that session and
    logs loss and throughput every ``FLAGS.log_frequency`` steps.

    Args:
        _: Ignored.
    """
    # Build benchmark_cnn model
    params, sess_config = benchmark_cnn.setup(
        benchmark_cnn.make_params_from_flags())
    bench = benchmark_cnn.BenchmarkCNN(params)

    # Print information
    version = cnn_util.tensorflow_version_tuple()
    log_fn('TensorFlow:  %i.%i' % (version[0], version[1]))
    bench.print_info()

    # Build single-GPU benchmark_cnn model
    with tf.Graph().as_default() as single_gpu_graph:
        bench.build_model()

    config = parallax_config.build_config()
    config.sess_config = sess_config

    # The worker-related return values are unpacked but not used below.
    sess, num_workers, worker_id, num_replicas_per_worker = \
        parallax.parallel_run(single_gpu_graph,
                              FLAGS.resource_info_file,
                              sync=FLAGS.sync,
                              parallax_config=config)

    fetches = dict(global_step=bench.global_step,
                   cost=bench.cost,
                   train_op=bench.train_op)

    window_start = time.time()
    for completed in range(1, FLAGS.max_steps + 1):
        results = sess.run(fetches)
        if completed % FLAGS.log_frequency != 0:
            continue
        elapsed = time.time() - window_start
        throughput = float(FLAGS.log_frequency) / float(elapsed)
        # The first element of each fetched value is the one reported.
        parallax.log.info(
            "global step: %d, loss: %f, throughput: %f steps/sec" %
            (results['global_step'][0] + 1, results['cost'][0],
             throughput))
        window_start = time.time()
Ejemplo n.º 15
0
def train(train_args):
    """
    Train a network with the benchmark driver and optionally evaluate it.

    train_args : object
        User configuration. It is read by attribute access (``model``,
        ``num_gpus``, ``num_epochs``, ``batch_size_per_device``,
        ``optimizer``, ``dataset``, ``evaluation``) and every value is
        passed through ``yaml.safe_load()`` before use.

    Returns a dict with the keys "status", "user_args", "machine_config",
    "training" and "evaluation".

    Raises ``BadRequest`` when parameter setup or the training run raises a
    ``ValueError``, and when the evaluation parameter setup does.
    """

    run_results = {
        "status": "ok",
        "user_args": train_args,
        "machine_config": {},
        "training": {},
        "evaluation": {}
    }

    # Remove possible existing model and log files
    # (plain files directly inside MODELS_DIR only; failures are printed and
    # otherwise ignored)
    for f in os.listdir(cfg.MODELS_DIR):
        file_path = os.path.join(cfg.MODELS_DIR, f)
        try:
            if os.path.isfile(file_path):
                os.unlink(file_path)
        except Exception as e:
            print(e)

    # Declare training arguments
    # Only the first space-separated token of the model value is used.
    kwargs = {
        'model': yaml.safe_load(train_args.model).split(' ')[0],
        'num_gpus': yaml.safe_load(train_args.num_gpus),
        'num_epochs': yaml.safe_load(train_args.num_epochs),
        'batch_size': yaml.safe_load(train_args.batch_size_per_device),
        'optimizer': yaml.safe_load(train_args.optimizer),
        'local_parameter_device': 'cpu',
        'variable_update': 'parameter_server'
    }

    # Locate training data and check if the selected network fits it
    # For real data check whether the right data was mounted to the right place and if not download it (cifar10 only)
    if yaml.safe_load(train_args.dataset) != 'Synthetic data':
        data_name = yaml.safe_load(train_args.dataset)
        if data_name == 'cifar10':
            locate_cifar10()
        if data_name == 'imagenet':
            locate_imagenet()

        kwargs['data_name'] = data_name
        # 'imagenet_mini' is passed on under the data name 'imagenet', while
        # data_dir below still points at the 'imagenet_mini' folder.
        if data_name == 'imagenet_mini':
            locate_imagenet_mini()
            kwargs['data_name'] = 'imagenet'
        verify_selected_model(kwargs['model'], kwargs['data_name'])
        kwargs['data_dir'] = '{}/{}'.format(cfg.DATA_DIR, data_name)
    else:
        # Synthetic data: the model is verified against 'imagenet'.
        verify_selected_model(kwargs['model'], 'imagenet')

    # If no GPU is available or the gpu option is set to 0 run CPU mode
    # (num_local_gpus is a module-level name defined outside this function)
    if num_local_gpus == 0 or kwargs['num_gpus'] == 0:
        kwargs['device'] = 'cpu'
        kwargs['data_format'] = 'NHWC'  # cpu data format
        kwargs[
            'num_gpus'] = 1  # Important: tensorflow uses this also to specify the number of CPUs
    else:
        kwargs['device'] = 'gpu'
        kwargs['data_format'] = 'NCHW'

    # Add training info to run_results but not the directories
    # (the directory entries are added to kwargs only after this copy)
    run_results["training"].update(kwargs)
    if run_results["training"]["device"] == "cpu":
        del run_results["training"]["num_gpus"]  # avoid misleading info
    kwargs['train_dir'] = cfg.MODELS_DIR
    kwargs['benchmark_log_dir'] = cfg.MODELS_DIR

    # Setup and run the benchmark model
    params = benchmark.make_params(**kwargs)
    try:
        params = benchmark.setup(params)
        bench = benchmark.BenchmarkCNN(params)
    except ValueError as param_ex:
        raise BadRequest(
            "ValueError in parameter setup: {}. Params: {}".format(
                param_ex, params))

    tf_version = '.'.join(
        [str(x) for x in cnn_util.tensorflow_version_tuple()])
    run_results["training"]["tf_version"] = tf_version

    # Run benchmark and measure total execution time
    # (time_fmt is a module-level name defined outside this function)
    bench.print_info()
    start_time_global = datetime.datetime.now().strftime(time_fmt)
    try:
        bench.run()
    except ValueError as ve:
        raise BadRequest('ValueError in benchmark execution: {}'.format(ve))
    end_time_global = datetime.datetime.now().strftime(time_fmt)

    # Read training and metric log files and store training results
    # benchmark_run.log is renamed to training.log before it is parsed; the
    # evaluation step below renames a benchmark_run.log again.
    training_file = '{}/training.log'.format(cfg.MODELS_DIR)
    os.rename('{}/benchmark_run.log'.format(cfg.MODELS_DIR), training_file)
    run_parameters, machine_config = parse_logfile_training(training_file)
    run_results['training'].update(run_parameters)
    run_results["machine_config"] = machine_config

    metric_file = '{}/metric.log'.format(cfg.MODELS_DIR)
    run_results['training']['result'] = {}
    run_results['training']['result']['global_start_time'] = start_time_global
    run_results['training']['result']['global_end_time'] = end_time_global
    start, end, avg_examples = parse_metric_file(metric_file)
    run_results["training"]["result"][
        "average_examples_per_sec"] = avg_examples
    run_results['training']['result']['execution_start_time'] = start
    run_results['training']['result']['execution_end_time'] = end

    ## Evaluation ##
    # Runs only when the 'evaluation' argument parses to a truthy value.
    if yaml.safe_load(train_args.evaluation):
        run_results["evaluation"] = {}

        # Reuse the training model, device settings and directories, with
        # the 'eval' parameter switched on.
        kwargs_eval = {
            'model': kwargs['model'],
            'num_gpus': kwargs['num_gpus'],
            'device': kwargs['device'],
            'data_format': kwargs['data_format'],
            'benchmark_log_dir': kwargs['benchmark_log_dir'],
            'train_dir': kwargs['train_dir'],
            'eval': True
            # 'eval_dir': cfg.DATA_DIR,
        }
        run_results['evaluation']['device'] = kwargs_eval['device']
        if run_results['evaluation']['device'] == 'gpu':
            run_results['evaluation']['num_gpus'] = kwargs_eval[
                'num_gpus']  # only for GPU to avoid confusion

        # Locate data
        if yaml.safe_load(train_args.dataset) != 'Synthetic data':
            kwargs_eval['data_name'] = kwargs['data_name']
            kwargs_eval['data_dir'] = kwargs['data_dir']

        # Setup and run the evaluation
        params_eval = benchmark.make_params(**kwargs_eval)
        try:
            params_eval = benchmark.setup(params_eval)
            evaluation = benchmark.BenchmarkCNN(params_eval)
        except ValueError as param_ex:
            raise BadRequest("ValueError: {}".format(param_ex))

        # Unlike the training run, a ValueError raised by the evaluation run
        # is not converted to BadRequest.
        evaluation.print_info()
        start_time_global = datetime.datetime.now().strftime(time_fmt)
        evaluation.run()
        end_time_global = datetime.datetime.now().strftime(time_fmt)

        # Read log files and get evaluation results
        os.rename('{}/benchmark_run.log'.format(cfg.MODELS_DIR),
                  '{}/evaluation.log'.format(cfg.MODELS_DIR))
        evaluation_file = '{}/evaluation.log'.format(cfg.MODELS_DIR)
        run_parameters = parse_logfile_evaluation(evaluation_file)
        run_results['evaluation'].update(run_parameters)

        logfile = '{}/metric.log'.format(cfg.MODELS_DIR)
        run_results['evaluation']['result'] = {}
        run_results['evaluation']['result'][
            'global_start_time'] = start_time_global
        run_results['evaluation']['result'][
            'global_end_time'] = end_time_global

        # metric.log is read as one JSON object per line; the three
        # eval_* metrics are copied into the evaluation result.
        with open(logfile, "r") as f:
            for line in f:
                l = json.loads(line)
                if l["name"] == "eval_average_examples_per_sec":
                    run_results["evaluation"]['result'][
                        "average_examples_per_sec"] = l["value"]
                if l["name"] == "eval_top_1_accuracy":
                    run_results["evaluation"]['result']["top_1_accuracy"] = l[
                        "value"]
                if l["name"] == "eval_top_5_accuracy":
                    run_results["evaluation"]['result']["top_5_accuracy"] = l[
                        "value"]

    return run_results
Ejemplo n.º 16
0
def train(**train_kwargs):
    """
    Run one benchmark training session and return its result record.

    train_kwargs : keyword arguments
        Raw training options; they are deserialized with the schema returned
        by ``cfg.get_train_args_schema()``.

    Returns the ``run_results`` dict with the sections 'machine_config',
    'benchmark' and 'training', plus the global start/end time strings and
    the total execution time in seconds.
    """
    if cfg.DEBUG_MODEL:
        print("[DEBUG] train(**train_kwargs) - train_kwargs: %s" %
              (train_kwargs))

    # Deserialize the keyword arguments through the training schema.
    train_args = cfg.get_train_args_schema().load(train_kwargs)

    # Pick the benchmark flavor and dataset name.
    # Dataset options: ['synthetic_data', 'imagnet_mini', 'imagenet']
    # NOTE(review): benchmark_flavor stays unassigned when BENCHMARK_TYPE is
    # 'benchmark' with an unrecognised flavor, or is neither 'benchmark' nor
    # 'pro' - presumably the schema/config rule that out; confirm.
    dataset_name = 'synthetic_data'
    if cfg.BENCHMARK_TYPE == 'benchmark':
        requested_flavor = train_args['flavor']
        if requested_flavor == 'synthetic':
            benchmark_flavor, dataset_name = 'synthetic', 'synthetic_data'
        elif requested_flavor == 'dataset':
            benchmark_flavor, dataset_name = 'dataset', 'imagenet_mini'

    if cfg.BENCHMARK_TYPE == 'pro':
        benchmark_flavor = 'pro'
        dataset_name = train_args['dataset']

    # TensorFlow version as a dotted string
    tf_version = '.'.join(map(str, cnn_util.tensorflow_version_tuple()))

    # Training arguments for tf_cnn_benchmarks; values missing from the
    # request fall back to the defaults in config.py (0 for num_gpus).
    num_epochs = train_args.get('num_epochs', cfg.NUM_EPOCHS)
    kwargs = {
        'num_gpus': train_args.get('num_gpus', 0),
        'num_epochs': num_epochs,
        'optimizer': train_args.get('optimizer', cfg.OPTIMIZER),
        'use_fp16': train_args.get('use_fp16', cfg.USE_FP16),
        'local_parameter_device': 'cpu',
        'variable_update': 'parameter_server',
        'allow_growth': True,
        'print_training_accuracy': True,
        # training info is printed more often for runs shorter than an epoch
        'display_every': 10 if num_epochs < 1.0 else 100,
    }

    # Fall back to CPU mode when no local GPU exists or zero GPUs were asked.
    cpu_mode = num_local_gpus == 0 or kwargs['num_gpus'] == 0
    if cpu_mode:
        # Important: tensorflow uses num_gpus also to specify the number of
        # CPUs, hence the value 1 here.
        kwargs.update(device='cpu', data_format='NHWC', num_gpus=1)
    else:
        kwargs.update(device='gpu', data_format='NCHW')

    if dataset_name != 'synthetic_data':
        # 'imagenet_mini' is passed on under the data name 'imagenet' but
        # keeps its own directory.
        if dataset_name == 'imagenet_mini':
            kwargs['data_name'] = 'imagenet'
        else:
            kwargs['data_name'] = dataset_name
        kwargs['data_dir'] = os.path.join(cfg.DATA_DIR, dataset_name)

    # Result record, pre-filled with placeholders for the training section.
    run_results = {
        'machine_config': {},
        'benchmark': {
            'version': get_metadata()['Version'],
            'flavor': benchmark_flavor,
            'docker_base_image': cfg.DOCKER_BASE_IMAGE,
            'dataset': dataset_name,
            'tf_version': tf_version
        },
        'training': {
            'num_gpus': 0,
            'optimizer': '',
            'use_fp16': '',
            'local_parameter_device': '',
            'variable_update': '',
            'allow_growth': '',
            'device': '',
            'data_format': '',
            'models': []
        },
    }

    # Copy over every placeholder that has a configured value in kwargs.
    training_info = run_results['training']
    training_info.update(
        {key: kwargs[key] for key in training_info if key in kwargs})
    # kwargs carries num_gpus=1 in CPU mode; report 0 to avoid misleading info
    if training_info['device'] == 'cpu':
        training_info['num_gpus'] = 0

    # Information about the machine (CPU, GPU, memory)
    run_results['machine_config'] = mutils.get_machine_config()

    # The total time includes locating/downloading the data.
    started_at = datetime.datetime.now().strftime(cfg.TIME_FORMAT)

    # For real data, check whether the data was mounted to the right place
    # and if not, download it (imagenet_mini, cifar10, NOT imagenet!)
    if dataset_name == 'cifar10':
        mutils.locate_cifar10()
    elif dataset_name == 'imagenet_mini':
        mutils.locate_imagenet_mini()
    elif dataset_name == 'imagenet':
        mutils.locate_imagenet()

    if cfg.BENCHMARK_TYPE == 'pro':
        train_pro.train(train_args, kwargs, run_results)
    else:
        train_sd.train(kwargs, run_results)

    finished_at = datetime.datetime.now().strftime(cfg.TIME_FORMAT)
    run_results['global_start_time'] = started_at
    run_results['global_end_time'] = finished_at

    # The duration is computed from the formatted strings converted back to
    # timestamps.
    finished_stamp = mutils.timestr_to_stamp(finished_at, cfg.TIME_FORMAT)
    started_stamp = mutils.timestr_to_stamp(started_at, cfg.TIME_FORMAT)
    run_results['global_execution_time_sec'] = (finished_stamp -
                                                started_stamp)

    return run_results
Ejemplo n.º 17
0
def main(positional_arguments):
    """Benchmark several models on 1, 2, 4 and 8 GPUs and print a summary.

    Every model configuration is run once per GPU count. The collected
    results are pretty-printed, followed by a comma-separated table of
    images/sec with one row per GPU count and one column per model.

    For DGX servers use the hierarchical_copy=True argument.

    Args:
        positional_arguments: Argument list left over after flag parsing.
            At least one element is required; anything past the first
            element is rejected.

    Raises:
        ValueError: If more than one positional argument is present.
    """
    # A flag written as '--distortions False' parses as '--distortions=True'
    # followed by the positional argument 'False'. Rejecting stray
    # positionals keeps such a typo from silently enabling the flag.
    assert len(positional_arguments) >= 1
    unexpected = positional_arguments[1:]
    if unexpected:
        raise ValueError('Received unknown positional arguments: %s' %
                         unexpected)

    # (model, batch_size, variable_update) per benchmarked configuration.
    model_specs = (
        ('inception3', 64, 'parameter_server'),
        ('resnet50', 64, 'parameter_server'),
        ('resnet152', 32, 'parameter_server'),  # batch=64 crashes
        ('vgg16', 64, 'replicated'),
        ('alexnet', 512, 'replicated'),
    )
    tests_models = [{
        'num_gpus': None,
        'batch_size': batch_size,
        'variable_update': variable_update,
        'model': model_name
    } for model_name, batch_size, variable_update in model_specs]

    test_gpus = [1, 2, 4, 8]

    stats = []
    for test in tests_models:
        for num_gpus in test_gpus:
            test['num_gpus'] = num_gpus

            params = benchmark_cnn.setup(
                benchmark_cnn.make_params_from_flags())

            # force --hierarchical_copy to False when using 1 GPU
            if num_gpus == 1:
                params = params._replace(hierarchical_copy=False)

            # The test dict holds exactly the four fields to override.
            params = params._replace(**test)

            bench = benchmark_cnn.BenchmarkCNN(params)

            version = cnn_util.tensorflow_version_tuple()
            log_fn('TensorFlow:  %i.%i' % (version[0], version[1]))

            bench.print_info()
            results = bench.run()
            # Example result:
            # {
            #     'average_wall_time': 0.6646941304206848,
            #     'images_per_sec': 385.1395525908701,
            #     'last_average_loss': 7.256145,
            #     'num_steps': 100,
            #     'num_workers': 1
            # }
            # Store a snapshot, since 'test' is mutated on the next pass.
            stats.append({'test': dict(test), 'result': results})

    # summary
    print('summary:')
    print('==========')
    pprint.pprint(stats)

    print('==========')
    # stats is ordered model-major, so the entry for (gpu row, model column)
    # sits at gpu_idx + model_idx * gpu_count.
    gpu_count = len(test_gpus)
    table_rows = []
    for gpu_idx in range(gpu_count):
        cells = [
            str(stats[gpu_idx + model_idx * gpu_count]['result']
                ['images_per_sec']) + ', '
            for model_idx in range(len(tests_models))
        ]
        table_rows.append(''.join(cells) + '\n')
    print(''.join(table_rows))
    print('==========')