def Run(benchmark_spec):
  """Run MXNet on the cluster for each model specified.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
      required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vm = benchmark_spec.vms[0]
  benchmark_dir = 'incubator-mxnet/example/image-classification'
  samples = []
  for model in FLAGS.mx_models:
    layers = _GetNumLayers(model)
    batch = _GetBatchSize(model, layers)
    # Record the per-model run configuration on the spec for sample metadata.
    benchmark_spec.model = model
    benchmark_spec.batch_size = batch
    benchmark_spec.num_layers = layers
    benchmark_spec.image_shape = _GetImageShape(model)
    cmd = ' '.join([
        'python train_imagenet.py',
        '--benchmark=1',
        '--network=%s' % model,
        '--batch-size=%s' % batch,
        '--image-shape=%s' % benchmark_spec.image_shape,
        '--num-epochs=%s' % benchmark_spec.num_epochs,
        '--dtype=%s' % benchmark_spec.precision,
        '--kv-store=%s' % benchmark_spec.key_value_store,
    ])
    if benchmark_spec.device == GPU:
      gpu_count = cuda_toolkit.QueryNumberOfGpus(vm)
      device_list = ','.join(str(i) for i in range(gpu_count))
      cmd = '%s %s --gpus %s' % (mxnet.GetEnvironmentVars(vm), cmd,
                                 device_list)
    elif benchmark_spec.device == CPU:
      # Half the vCPUs as OMP threads for the CPU test, per the MXNet
      # performance FAQ: https://mxnet.incubator.apache.org/faq/perf.html
      cmd = 'OMP_NUM_THREADS=%s %s' % (vm.NumCpusForBenchmark() // 2, cmd)

    if layers:
      cmd = '%s --num-layers %s' % (cmd, layers)
    stdout, stderr = vm.RobustRemoteCommand(
        'cd %s && %s' % (benchmark_dir, cmd), should_log=True)

    # In benchmark mode the throughput lines may land on either stream.
    samples.append(_MakeSamplesFromOutput(benchmark_spec, stdout or stderr))

  return samples
Example #2
0
def _RunModelOnVm(vm, model, benchmark_spec, args='', job_name=''):
    """Runs a TensorFlow benchmark on a single VM.

    Args:
      vm: VM to run on
      model: string, the name of model to run
      benchmark_spec: BenchmarkSpec object
      args: string, distributed arguments
      job_name: string, distributed job name

    Returns:
      a Sample containing the TensorFlow throughput or the process
      identification number from TensorFlow parameter server.
    """
    benchmark_dir = 'benchmarks/scripts/tf_cnn_benchmarks'
    batch_size = _GetBatchSize(model)
    cmd = ' '.join([
        'python tf_cnn_benchmarks.py',
        '--local_parameter_device=%s' % benchmark_spec.local_parameter_device,
        '--batch_size=%s' % batch_size,
        '--model=%s' % model,
        '--data_name=%s' % benchmark_spec.data_name,
        '--variable_update=%s' % benchmark_spec.variable_update,
        '--distortions=%s' % benchmark_spec.distortions,
        '--device=%s' % benchmark_spec.device,
        '--data_format=%s' % benchmark_spec.data_format,
        '--forward_only=%s' % benchmark_spec.forward_only,
        '--flush_stdout=true',
    ])
    num_gpus = 0
    if benchmark_spec.device == GPU:
        num_gpus = cuda_toolkit.QueryNumberOfGpus(vm)
        cmd = '%s %s --num_gpus=%s' % (tensorflow.GetEnvironmentVars(vm), cmd,
                                       num_gpus)
    if args:
        # Distributed run: attach the job name and the cluster arguments.
        cmd = '%s --job_name=%s %s' % (cmd, job_name, args)
    run_command = 'cd {path} ; {cmd}'.format(path=benchmark_dir, cmd=cmd)
    output, _ = vm.RobustRemoteCommand(run_command, should_log=True)
    # Parameter servers never terminate on their own; return their PID so the
    # caller can kill them once the workers finish.
    if job_name == 'ps':
        return _ExtractTfParameterServerPid(output)
    return _MakeSamplesFromOutput(benchmark_spec, output, model, batch_size,
                                  num_gpus)
def Prepare(benchmark_spec):
  """Install SHOC and push the machinefile.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.
  """
  vms = benchmark_spec.vms
  vm_util.RunThreaded(_InstallAndAuthenticateVm, vms)

  head_node = vms[0]
  benchmark_spec.num_gpus = cuda_toolkit.QueryNumberOfGpus(head_node)
  # Every host gets the same slot count: the GPU count of the head node.
  hpc_util.CreateMachineFile(vms, lambda _: benchmark_spec.num_gpus,
                             MACHINEFILE)
def _UpdateBenchmarkSpecWithFlags(benchmark_spec):
    """Update the benchmark_spec with supplied command line flags.

    Args:
      benchmark_spec: benchmark specification to update
    """
    vms = benchmark_spec.vms
    per_node = cuda_toolkit.QueryNumberOfGpus(vms[0])

    # Topology: assume every node matches the first node's GPU count.
    benchmark_spec.gpus_per_node = per_node
    benchmark_spec.num_vms = len(vms)
    benchmark_spec.total_gpus = per_node * len(vms)
    # Run configuration from Horovod command-line flags.
    benchmark_spec.model = FLAGS.horovod_model
    benchmark_spec.batch_size = FLAGS.horovod_batch_size
    benchmark_spec.deep_learning_examples_commit = (
        FLAGS.horovod_deep_learning_examples_commit)
def _UpdateBenchmarkSpecWithFlags(benchmark_spec):
    """Update the benchmark_spec with supplied command line flags.

    Args:
      benchmark_spec: benchmark specification to update
    """
    gpus_per_node = (FLAGS.hpcg_gpus_per_node
                     or cuda_toolkit.QueryNumberOfGpus(benchmark_spec.vms[0]))
    # One MPI rank per GPU; give each rank an equal share of the node's CPUs.
    # Integer floor division (//) replaces int(a / b): no float round-trip,
    # and identical results for the positive counts involved here.
    cpus_per_rank = benchmark_spec.vms[0].num_cpus // gpus_per_node
    num_vms = len(benchmark_spec.vms)
    total_gpus = gpus_per_node * num_vms

    # Topology derived above.
    benchmark_spec.gpus_per_node = gpus_per_node
    benchmark_spec.cpus_per_rank = cpus_per_rank
    benchmark_spec.num_vms = num_vms
    benchmark_spec.total_gpus = total_gpus
    # HPCG run configuration from command-line flags.
    benchmark_spec.hpcg_problem_size = FLAGS.hpcg_problem_size
    benchmark_spec.hpcg_runtime = FLAGS.hpcg_runtime
    benchmark_spec.run_as_root = FLAGS.mpirun_allow_run_as_root
Example #6
0
def _UpdateBenchmarkSpecWithFlags(benchmark_spec):
  """Update the benchmark_spec with supplied command line flags.

  Args:
    benchmark_spec: benchmark specification to update
  """
  node_gpus = cuda_toolkit.QueryNumberOfGpus(benchmark_spec.vms[0])
  vm_count = len(benchmark_spec.vms)

  # Topology: assume every node matches the first node's GPU count.
  benchmark_spec.gpus_per_node = node_gpus
  benchmark_spec.num_vms = vm_count
  benchmark_spec.total_gpus = node_gpus * vm_count
  # Run configuration from Horovod command-line flags.
  benchmark_spec.model = FLAGS.horovod_model
  benchmark_spec.batch_size = FLAGS.horovod_batch_size
  benchmark_spec.num_epochs = FLAGS.horovod_num_epochs
  benchmark_spec.precision = FLAGS.horovod_precision
  benchmark_spec.max_seq_len = int(FLAGS.horovod_max_seq_len)
  benchmark_spec.bert_finetune = FLAGS.horovod_bert_finetune
  # NOTE(review): 'horovod_timelime' is presumably a typo for 'timeline' at
  # the flag's definition site; the spelling must match it, so keep as-is.
  benchmark_spec.timeline = FLAGS.horovod_timelime
  benchmark_spec.nccl_net_plugin = FLAGS.nccl_net_plugin
  benchmark_spec.cuda_visible_devices = FLAGS.horovod_cuda_visible_devices
def Run(benchmark_spec):
    """Run MXNet on the cluster for each model specified.

    Args:
      benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.

    Returns:
      A list of sample.Sample objects.
    """
    _UpdateBenchmarkSpecWithFlags(benchmark_spec)
    vm = benchmark_spec.vms[0]
    benchmark_dir = 'incubator-mxnet/example/image-classification'
    samples = []
    for model in FLAGS.mx_models:
        layers = _GetNumLayers(model)
        batch = _GetBatchSize(model, layers)
        # Record the per-model run configuration for sample metadata.
        benchmark_spec.model = model
        benchmark_spec.batch_size = batch
        benchmark_spec.num_layers = layers
        cmd = ('python train_imagenet.py --benchmark 1 --network %s '
               '--batch-size %s --image-shape %s --num-epochs %s '
               '--kv-store device') % (model, batch, IMAGENET_SHAPE,
                                       benchmark_spec.num_epochs)
        if benchmark_spec.device == GPU:
            gpu_count = cuda_toolkit.QueryNumberOfGpus(vm)
            device_list = ','.join(str(i) for i in range(gpu_count))
            cmd = '%s %s --gpus %s' % (mxnet.GetEnvironmentVars(vm), cmd,
                                       device_list)
        if layers:
            cmd = '%s --num-layers %s' % (cmd, layers)
        stdout, stderr = vm.RobustRemoteCommand(
            'cd %s && %s' % (benchmark_dir, cmd), should_log=True)

        # In benchmark mode the throughput lines may land on either stream.
        samples.append(_MakeSamplesFromOutput(benchmark_spec,
                                              stdout or stderr))

    return samples
def _GetTfCnnBenchmarkCommand(vm,
                              model,
                              batch_size,
                              benchmark_spec,
                              args='',
                              job_name=''):
    """Create the command used to run the tf_cnn_benchmarks script.

    The command is either formulated using flag values stored on the
    benchmark_spec, or is essentially provided outright through the
    benchmark_args flag.

    Args:
      vm: the VM to run on.
      model: name of the model to run.
      batch_size: batch size to use for training.
      benchmark_spec: the benchmark spec object.
      args: string, distributed arguments
      job_name: string, distributed job name

    Returns:
      A string that runs the tf_cnn_benchmarks.py script
      with the desired arguments.
    """
    has_gpu = cuda_toolkit.CheckNvidiaGpuExists(vm)
    num_gpus = cuda_toolkit.QueryNumberOfGpus(vm) if has_gpu else 0

    # Pass-through mode: the user supplied the full argument string.
    if benchmark_spec.benchmark_args is not None:
        cmd = 'python tf_cnn_benchmarks.py ' + benchmark_spec.benchmark_args
        # If the user didn't specify num_gpus in the benchmark_args string,
        # use all the GPUs on the system.
        if num_gpus and '--num_gpus' not in benchmark_spec.benchmark_args:
            cmd = '{cmd} --num_gpus={num_gpus}'.format(cmd=cmd,
                                                       num_gpus=num_gpus)
        return cmd

    benchmark_spec.local_parameter_device = FLAGS.tf_local_parameter_device
    benchmark_spec.device = FLAGS.tf_device
    benchmark_spec.data_format = FLAGS.tf_data_format
    if not num_gpus:
        # No GPU present: fall back to CPU devices and the NHWC data format.
        benchmark_spec.local_parameter_device = CPU
        benchmark_spec.device = CPU
        benchmark_spec.data_format = NHWC

    cmd = ' '.join([
        'python tf_cnn_benchmarks.py',
        '--local_parameter_device=%s' % benchmark_spec.local_parameter_device,
        '--batch_size=%s' % batch_size,
        '--model=%s' % model,
        '--data_name=%s' % benchmark_spec.data_name,
        '--variable_update=%s' % benchmark_spec.variable_update,
        '--distortions=%s' % benchmark_spec.distortions,
        '--device=%s' % benchmark_spec.device,
        '--data_format=%s' % benchmark_spec.data_format,
        '--forward_only=%s' % benchmark_spec.forward_only,
        '--use_fp16=%s' % (benchmark_spec.precision == FP16),
    ])
    if benchmark_spec.device == GPU:
        cmd = '%s %s --num_gpus=%s' % (tensorflow.GetEnvironmentVars(vm), cmd,
                                       num_gpus)
    if args:
        # Distributed run: attach the job name and the cluster arguments.
        cmd = '%s --job_name=%s %s' % (cmd, job_name, args)
    return cmd
def Run(benchmark_spec):
    """Run ResNet on the cluster.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
    _UpdateBenchmarkSpecWithFlags(benchmark_spec)
    vm = benchmark_spec.vms[0]
    # Build the base command once; the train/eval loop below appends
    # per-iteration flags to it. TPU runs use the Cloud TPU reference model
    # (tpu/models/official/resnet); otherwise the TF official models repo
    # (models/official/r1/resnet) is used, which takes different flags.
    if benchmark_spec.tpus:
        resnet_benchmark_script = 'resnet_main.py'
        resnet_benchmark_cmd = (
            '{env_cmd} && '
            'cd tpu/models && '
            # NOTE(review): no space after this '&&' — harmless to the shell
            # ('&&' is an operator token), but worth normalizing someday.
            'export PYTHONPATH=$(pwd) &&'
            'cd official/resnet && '
            'python {script} '
            '--use_tpu={use_tpu} '
            '--data_dir={data_dir} '
            '--model_dir={model_dir} '
            '--resnet_depth={depth} '
            '--train_batch_size={train_batch_size} '
            '--eval_batch_size={eval_batch_size} '
            '--iterations_per_loop={iterations} '
            '--data_format={data_format} '
            '--precision={precision} '
            '--skip_host_call={skip_host_call} '
            '--num_train_images={num_train_images} '
            '--num_eval_images={num_eval_images}'.format(
                env_cmd=benchmark_spec.env_cmd,
                script=resnet_benchmark_script,
                use_tpu=bool(benchmark_spec.tpus),
                data_dir=benchmark_spec.data_dir,
                model_dir=benchmark_spec.model_dir,
                depth=benchmark_spec.depth,
                train_batch_size=benchmark_spec.train_batch_size,
                eval_batch_size=benchmark_spec.eval_batch_size,
                iterations=benchmark_spec.iterations,
                data_format=benchmark_spec.data_format,
                precision=benchmark_spec.precision,
                skip_host_call=benchmark_spec.skip_host_call,
                num_train_images=benchmark_spec.num_train_images,
                num_eval_images=benchmark_spec.num_eval_images))
    else:
        resnet_benchmark_script = 'imagenet_main.py'
        resnet_benchmark_cmd = ('{env_cmd} && '
                                'cd models && '
                                'export PYTHONPATH=$(pwd) && '
                                'cd official/r1/resnet && '
                                'python {script} '
                                '--data_dir=/data/imagenet '
                                '--model_dir={model_dir} '
                                '--resnet_size={resnet_size} '
                                '--batch_size={batch_size} '
                                '--data_format={data_format} '.format(
                                    env_cmd=benchmark_spec.env_cmd,
                                    script=resnet_benchmark_script,
                                    model_dir=benchmark_spec.model_dir,
                                    resnet_size=benchmark_spec.depth,
                                    batch_size=benchmark_spec.train_batch_size,
                                    data_format=benchmark_spec.data_format))
        precision = '{precision}'.format(precision=benchmark_spec.precision)
        # A 'bfloat16' request is mapped to --dtype=fp16 here — presumably
        # because the r1 script offers no bfloat16 dtype; confirm upstream.
        if precision == 'bfloat16':
            resnet_benchmark_cmd = '{cmd} --dtype=fp16'.format(
                cmd=resnet_benchmark_cmd)
        else:
            resnet_benchmark_cmd = '{cmd} --dtype=fp32'.format(
                cmd=resnet_benchmark_cmd)

        if cuda_toolkit.CheckNvidiaGpuExists(vm):
            resnet_benchmark_cmd = '{env} {cmd} --num_gpus={num_gpus}'.format(
                env=tensorflow.GetEnvironmentVars(vm),
                cmd=resnet_benchmark_cmd,
                num_gpus=cuda_toolkit.QueryNumberOfGpus(vm))

    samples = []
    metadata = _CreateMetadataDict(benchmark_spec)
    # Wall-clock training time accumulated across iterations; eval samples
    # reuse the total so far as their timestamp offset.
    elapsed_seconds = 0
    steps_per_eval = benchmark_spec.steps_per_eval
    train_steps = benchmark_spec.train_steps
    # Train in increments of steps_per_eval, optionally evaluating after each
    # increment. The final increment is clamped to train_steps.
    for step in range(steps_per_eval, train_steps + steps_per_eval,
                      steps_per_eval):
        step = min(step, train_steps)
        resnet_benchmark_cmd_step = '{cmd} --train_steps={step}'.format(
            cmd=resnet_benchmark_cmd, step=step)

        if benchmark_spec.mode in ('train', 'train_and_eval'):
            if benchmark_spec.tpus:
                tpu = benchmark_spec.tpu_groups['train'].GetName()
                num_cores = '--num_cores={}'.format(
                    benchmark_spec.tpu_groups['train'].GetNumShards())
                resnet_benchmark_train_cmd = (
                    '{cmd} --tpu={tpu} --mode=train {num_cores}'.format(
                        cmd=resnet_benchmark_cmd_step,
                        tpu=tpu,
                        num_cores=num_cores))
            else:
                # Non-TPU path: built from the base cmd (not the _step
                # variant) because the step bound is passed explicitly via
                # --max_train_steps instead of --train_steps.
                resnet_benchmark_train_cmd = (
                    '{cmd} --max_train_steps={max_train_steps} '
                    '--train_epochs={train_epochs} --noeval_only'.format(
                        cmd=resnet_benchmark_cmd,
                        train_epochs=benchmark_spec.epochs_per_eval,
                        max_train_steps=step))

            start = time.time()
            stdout, stderr = vm.RobustRemoteCommand(resnet_benchmark_train_cmd,
                                                    should_log=True)
            elapsed_seconds += (time.time() - start)
            samples.extend(
                mnist_benchmark.MakeSamplesFromTrainOutput(
                    metadata, stdout + stderr, elapsed_seconds, step))

        if benchmark_spec.mode in ('train_and_eval', 'eval'):
            if benchmark_spec.tpus:
                tpu = benchmark_spec.tpu_groups['eval'].GetName()
                num_cores = '--num_cores={}'.format(
                    benchmark_spec.tpu_groups['eval'].GetNumShards())
                resnet_benchmark_eval_cmd = (
                    '{cmd} --tpu={tpu} --mode=eval {num_cores}'.format(
                        cmd=resnet_benchmark_cmd_step,
                        tpu=tpu,
                        num_cores=num_cores))
            else:
                resnet_benchmark_eval_cmd = ('{cmd} --eval_only'.format(
                    cmd=resnet_benchmark_cmd))

            stdout, stderr = vm.RobustRemoteCommand(resnet_benchmark_eval_cmd,
                                                    should_log=True)
            samples.extend(
                MakeSamplesFromEvalOutput(metadata,
                                          stdout + stderr,
                                          elapsed_seconds,
                                          use_tpu=bool(benchmark_spec.tpus)))
    return samples
Example #10
0
 def testQueryNumberOfGpus(self):
     """A RemoteCommand reply of 'count' then '8' is parsed as 8 GPUs."""
     fake_vm = mock.MagicMock()
     # Stub the remote call: first element is stdout, second is stderr.
     fake_vm.RemoteCommand = mock.MagicMock(return_value=('count\n8', None))
     self.assertEqual(8, cuda_toolkit.QueryNumberOfGpus(fake_vm))