Beispiel #1
0
def Install(vm):
    """Installs TensorFlow on the VM."""
    has_gpu = nvidia_driver.CheckNvidiaGpuExists(vm)
    tf_pip_package = (FLAGS.tf_gpu_pip_package
                      if has_gpu else FLAGS.tf_cpu_pip_package)

    if has_gpu:
        vm.Install('cuda_toolkit')
        vm.Install('nccl')
        vm.Install('cudnn')

    vm.Install('pip')
    vm.RemoteCommand('sudo pip install requests')
    vm.RemoteCommand('sudo pip install --upgrade absl-py')
    vm.RemoteCommand('sudo pip install --upgrade %s' % tf_pip_package,
                     should_log=True)
    vm.RemoteCommand('sudo pip install --upgrade %s' % FLAGS.t2t_pip_package,
                     should_log=True)
    vm.InstallPackages('git')
    _, _, retcode = vm.RemoteHostCommandWithReturnCode('test -d benchmarks',
                                                       ignore_failure=True,
                                                       suppress_warning=True)
    if retcode != 0:
        vm.RemoteCommand(
            'git clone https://github.com/tensorflow/benchmarks.git',
            should_log=True)
    vm.RemoteCommand('cd benchmarks && git checkout {}'.format(
        FLAGS.tf_cnn_benchmarks_branch))
    if FLAGS.cloud == 'AWS' and FLAGS.tf_data_dir and (
            not FLAGS.tf_use_local_data):
        vm.Install('aws_credentials')
def Prepare(benchmark_spec):
  """Install and set up ResNet on the target vm.

  Args:
    benchmark_spec: The benchmark specification

  Raises:
    errors.Config.InvalidValue upon both GPUs and TPUs appear in the config
  """
  vm = benchmark_spec.vms[0]

  if (bool(benchmark_spec.tpus) and nvidia_driver.CheckNvidiaGpuExists(vm)):
    raise errors.Config.InvalidValue(
        'Invalid configuration. GPUs and TPUs can not both present in the config.'
    )

  mnist_benchmark.Prepare(benchmark_spec)
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)

  vm.Install('pyyaml')
  # To correctly install the requests lib, otherwise the experiment won't run
  vm.RemoteCommand('sudo pip uninstall -y requests')
  vm.RemoteCommand('sudo pip install requests')

  if not benchmark_spec.tpus:
    local_data_path = posixpath.join('/data', 'imagenet')
    vm.RemoteCommand('sudo mkdir -p {data_path} && '
                     'sudo chmod a+w {data_path} && '
                     'gsutil -m cp -r {data_dir}/* {data_path}'.format(
                         data_dir=benchmark_spec.data_dir,
                         data_path=local_data_path))
def _CollectGpuSamples(
        vm: virtual_machine.BaseVirtualMachine) -> List[sample.Sample]:
    """Run CUDA memcopy on the cluster.

  Args:
    vm: The virtual machine to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
    if not nvidia_driver.CheckNvidiaGpuExists(vm):
        return []
    if not nvidia_driver.CheckNvidiaSmiExists(vm):
        return []
    global_metadata = _MetadataFromFlags()
    global_metadata.update(cuda_toolkit.GetMetadata(vm))
    global_cmd = [
        BANDWIDTH_TEST_PATH, '--csv', f'--memory={_MEMORY.value}',
        f'--mode={_MODE.value}'
    ]
    if _HTOD.value:
        global_cmd.append('--htod')
    if _DTOH.value:
        global_cmd.append('--dtoh')
    if _DTOD.value:
        global_cmd.append('--dtod')
    if _WC.value:
        global_cmd.append('--wc')

    num_gpus = nvidia_driver.QueryNumberOfGpus(vm)
    devices = list(range(num_gpus)) + (['all'] if num_gpus > 1 else [])
    samples = []
    for device in devices:
        cmd = ' '.join(global_cmd + [f'--device={device}'])
        stdout, stderr, exit_code = vm.RemoteCommandWithReturnCode(
            cmd, ignore_failure=True)
        if exit_code:
            logging.warning('Error with getting GPU stats: %s', stderr)
            continue
        results = regex_util.ExtractAllMatches(
            r'bandwidthTest-(\S+), '
            r'Bandwidth = ([\d\.]+) (\S+), '
            r'Time = ([\d\.]+) s, '
            r'Size = (\d+) bytes, '
            r'NumDevsUsed = (\d+)', stdout)

        for metric, bandwidth, unit, time, size, num_devs_used in results:
            metadata = {
                'time': float(time),
                'size': int(size),
                'NumDevsUsed': num_devs_used,
                'device': device,
                'command': cmd,
            }
            metadata.update(global_metadata)
            samples.append(
                sample.Sample(metric, float(bandwidth), unit, metadata))
    return samples
Beispiel #4
0
def _PrepareVm(benchmark_spec, rank):
    vm = benchmark_spec.vms[rank]
    vm.InstallPackages('python3-pip')
    if nvidia_driver.CheckNvidiaGpuExists(vm):
        vm.Install('cuda_toolkit')
        vm.AuthenticateVm()
        vm.Install('openmpi')
        vm.Install('nccl')
    _DownloadData(benchmark_spec, rank)
def Prepare(bm_spec: benchmark_spec.BenchmarkSpec) -> None:
  """Install and set up MLPerf Inference on the target vm.

  Args:
    bm_spec: The benchmark specification

  Raises:
    errors.Config.InvalidValue upon both GPUs and TPUs appear in the config
  """
  vm = bm_spec.vms[0]

  repository = f'inference_results_{MLPERF_INFERENCE_VERSION}'
  vm.RemoteCommand(
      f'git clone https://github.com/mlcommons/{repository}.git',
      should_log=True)

  makefile = f'{repository}/closed/NVIDIA/Makefile'
  vm_util.ReplaceText(vm, 'shell uname -p', 'shell uname -m', makefile)

  requirements = f'{repository}/closed/NVIDIA/docker/requirements.1'
  vm_util.ReplaceText(vm, 'opencv-python-headless==4.5.2.52',
                      'opencv-python-headless==4.5.3.56', requirements)

  if nvidia_driver.CheckNvidiaGpuExists(vm):
    vm.Install('cuda_toolkit')
    vm.Install('nvidia_driver')
    vm.Install('nvidia_docker')

  benchmark = FLAGS.mlperf_benchmark
  bm_spec.env_cmd = ('export MLPERF_SCRATCH_PATH=/scratch && '
                     f'cd {repository}/closed/NVIDIA')
  vm.RobustRemoteCommand(
      f'{bm_spec.env_cmd} && '
      'make build_docker NO_BUILD=1 && '
      'make docker_add_user && '
      'make launch_docker DOCKER_COMMAND="echo $MLPERF_SCRATCH_PATH" && '
      'make launch_docker DOCKER_COMMAND="ls -al $MLPERF_SCRATCH_PATH" && '
      'make launch_docker DOCKER_COMMAND="make clean" && '
      'make launch_docker DOCKER_COMMAND="make link_dirs" && '
      'make launch_docker DOCKER_COMMAND="ls -al build/"',
      should_log=True)
  vm.RobustRemoteCommand(
      f'{bm_spec.env_cmd} && '
      'make launch_docker DOCKER_COMMAND='
      f'"make download_data BENCHMARKS={benchmark}"',
      should_log=True)
  vm.RobustRemoteCommand(
      f'{bm_spec.env_cmd} && '
      'make launch_docker DOCKER_COMMAND='
      f'"make download_model BENCHMARKS={benchmark}" && '
      'make launch_docker DOCKER_COMMAND='
      f'"make preprocess_data BENCHMARKS={benchmark}" && '
      f'make launch_docker DOCKER_COMMAND="make build"',
      should_log=True)
Beispiel #6
0
def Install(vm):
    """Installs PyTorch on the VM."""
    vm.Install('pip3')
    toolkit = 'cpu'
    if nvidia_driver.CheckNvidiaGpuExists(vm):
        # Translates --cuda_toolkit_version=10.2 to "cu102" for the toolkit to
        # install
        toolkit = f'cu{"".join(FLAGS.cuda_toolkit_version.split("."))}'
    vm.RemoteCommand(f'{FLAGS.torch_env} python3 -m pip install '
                     f'torch=={FLAGS.torch_version}+{toolkit} '
                     f'torchvision=={FLAGS.torchvision_version}+{toolkit} '
                     f'torchaudio=={FLAGS.torchaudio_version} '
                     f'-f {_PYTORCH_WHL}')
def Prepare(benchmark_spec):
    """Install and set up TensorFlow on the target vm.

  Args:
    benchmark_spec: The benchmark specification
  """
    _UpdateBenchmarkSpecWithFlags(benchmark_spec)
    vms = benchmark_spec.vms
    vm_util.RunThreaded(_PrepareVm, vms)
    benchmark_spec.tensorflow_version = tensorflow.GetTensorFlowVersion(vms[0])

    if nvidia_driver.CheckNvidiaGpuExists(vms[0]):
        benchmark_spec.gpu_type = nvidia_driver.GetGpuType(vms[0])
def _CreateMetadataDict(benchmark_spec, model, batch_size):
    """Create metadata dict to be used in run results.

  Args:
    benchmark_spec: benchmark spec
    model: model which was run
    batch_size: batch sized used

  Returns:
    metadata dict
  """
    vm = benchmark_spec.vms[0]
    metadata = {}
    if nvidia_driver.CheckNvidiaGpuExists(vm):
        metadata.update(nvidia_driver.GetMetadata(vm))

    metadata['command_line'] = benchmark_spec.tf_cnn_benchmark_cmd
    metadata['cnn_benchmarks_branch'] = benchmark_spec.cnn_benchmarks_branch
    metadata['tensorflow_version'] = benchmark_spec.tensorflow_version
    metadata['tensorflow_cpu_pip_package'] = (
        benchmark_spec.tensorflow_cpu_pip_package)
    metadata['tensorflow_gpu_pip_package'] = (
        benchmark_spec.tensorflow_gpu_pip_package)
    # If we ran a custom command-line through the benchmark_args flag,
    # add the metadata from that command and return. We don't need anymore
    # metadata from this function as it is likely invalid.
    if getattr(benchmark_spec, 'benchmark_args', None):
        metadata.update(
            _GetMetadataFromBenchmarkArgs(benchmark_spec.benchmark_args))
        return metadata

    metadata['model'] = model
    metadata['batch_size'] = batch_size
    metadata['forward_only'] = benchmark_spec.forward_only
    metadata['data_name'] = benchmark_spec.data_name
    metadata['data_dir'] = benchmark_spec.data_dir
    metadata['use_local_data'] = benchmark_spec.use_local_data
    metadata['variable_update'] = benchmark_spec.variable_update
    metadata['local_parameter_device'] = benchmark_spec.local_parameter_device
    metadata['device'] = benchmark_spec.device
    metadata['data_format'] = benchmark_spec.data_format
    metadata['distortions'] = benchmark_spec.distortions
    metadata['distributed'] = benchmark_spec.distributed
    metadata['precision'] = benchmark_spec.precision
    metadata['num_gpus'] = benchmark_spec.num_gpus
    return metadata
Beispiel #9
0
def Prepare(benchmark_spec):
    """Install and set up RoBERTa mmlm on the target vm..

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.
  """
    _UpdateBenchmarkSpecWithFlags(benchmark_spec)
    vms = benchmark_spec.vms
    benchmark_spec.always_call_cleanup = True
    list_params = [((benchmark_spec, rank), {})
                   for rank in range(benchmark_spec.num_vms)]
    vm_util.RunThreaded(_PrepareVm, list_params)
    master = vms[0]
    if nvidia_driver.CheckNvidiaGpuExists(master):
        gpus_per_vm = nvidia_driver.QueryNumberOfGpus(master)
        hpc_util.CreateMachineFile(vms, lambda _: gpus_per_vm, HOSTFILE)
def PrepareBenchmark(benchmark_spec, vm=None):
    """Install and set up MLPerf on the target vm.

  Args:
    benchmark_spec: The benchmark specification
    vm: The VM to work on

  Raises:
    errors.Config.InvalidValue upon both GPUs and TPUs appear in the config
  """
    _UpdateBenchmarkSpecWithFlags(benchmark_spec)
    vm = vm or benchmark_spec.vms[0]

    if (bool(benchmark_spec.tpus) and nvidia_driver.CheckNvidiaGpuExists(vm)):
        raise errors.Config.InvalidValue(
            'Invalid configuration. GPUs and TPUs can not both present in the config.'
        )

    vm.RemoteCommand(
        f'if [ ! -d "$HOME/training_results_{MLPERF_VERSION}" ]; then '
        f'  git clone https://github.com/mlcommons/training_results_{MLPERF_VERSION}.git ; '
        'fi',
        should_log=True)
    vm.Install('pip3')
    if not HYPERTHREADS.value:
        if BERT in benchmark_spec.benchmark:
            vm_util.ReplaceText(
                vm, "'bind_pyt'", "'bind_pyt' '--no_hyperthreads'",
                f'training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/bert/'
                'implementations/pytorch/run_with_docker.sh')
        elif MASK in benchmark_spec.benchmark:
            vm_util.ReplaceText(
                vm, "'bind_launch'", "'bind_launch' '--no_hyperthreads'",
                f'training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/maskrcnn/'
                'implementations/pytorch/run_and_time.sh')
        elif RESNET in benchmark_spec.benchmark:
            vm_util.ReplaceText(
                vm, '--cpu=exclusive', '--cpu=exclusive,nosmt',
                f'training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/resnet/'
                'implementations/mxnet/run_and_time.sh')
Beispiel #11
0
def GetEnvironmentVars(vm):
    """Return a string containing TensorFlow-related environment variables.

  Args:
    vm: vm to get environment varibles

  Returns:
    string of environment variables
  """
    env_vars = []
    if nvidia_driver.CheckNvidiaGpuExists(vm):
        output, _ = vm.RemoteCommand('getconf LONG_BIT', should_log=True)
        long_bit = output.strip()
        lib_name = 'lib' if long_bit == '32' else 'lib64'
        env_vars.extend([
            'PATH=%s${PATH:+:${PATH}}' %
            posixpath.join(cuda_toolkit.CUDA_HOME, 'bin'),
            'CUDA_HOME=%s' % cuda_toolkit.CUDA_HOME,
            'LD_LIBRARY_PATH=%s${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}' %
            posixpath.join(cuda_toolkit.CUDA_HOME, lib_name)
        ])
    if FLAGS.aws_s3_region:
        env_vars.append('AWS_REGION={}'.format(FLAGS.aws_s3_region))
    return ' '.join(env_vars)
def Run(benchmark_spec):
    """Run MLPerf on the cluster.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
      required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
    _UpdateBenchmarkSpecWithFlags(benchmark_spec)
    vms = benchmark_spec.vms
    master_vm = vms[0]
    benchmark = benchmark_spec.benchmark

    env_params = {}
    env_params['SLURM_JOB_ID'] = r'{uri}'.format(uri=FLAGS.run_uri)
    env_params['PULL'] = 0
    env_params['DGXSYSTEM'] = DGXSYSTEM
    env_params['NEXP'] = 1
    env_params['LOGDIR'] = posixpath.join(vm_util.VM_TMP_DIR, benchmark)

    script_path = ('$HOME/training_results_{version}/NVIDIA/benchmarks/{model}'
                   r'/implementations/{framework}'.format(
                       version=mlperf_benchmark.MLPERF_VERSION,
                       model='maskrcnn'
                       if mlperf_benchmark.MASK in benchmark else benchmark,
                       framework='mxnet'
                       if mlperf_benchmark.RESNET in benchmark else 'pytorch'))

    benchmark_env_params = {
        mlperf_benchmark.TRANSFORMER: {
            'CONT': r'"mlperf-nvidia:translation"',
            'DATADIR': r'/data/wmt/utf8'
        },
        mlperf_benchmark.SSD: {
            'CONT': r'"mlperf-nvidia:single_stage_detector"',
            'DATADIR': '/data'
        },
        mlperf_benchmark.GNMT: {
            'CONT': r'"mlperf-nvidia:rnn_translator"',
            'DATADIR': r'/data/gnmt'
        },
        mlperf_benchmark.MASK: {},
        mlperf_benchmark.RESNET: {},
        mlperf_benchmark.BERT: {},
    }
    env_params.update(benchmark_env_params.get(benchmark, {}))
    if mlperf_benchmark.RESNET in benchmark:
        env_params['SLURM_JOB_NUM_NODES'] = benchmark_spec.num_vms

    env = r''
    if nvidia_driver.CheckNvidiaGpuExists(master_vm):
        env = tensorflow.GetEnvironmentVars(master_vm)

    cmd = (f'cd {script_path} && '
           f'{env} {_DictToString(env_params)} '
           f'{FLAGS.nccl_mpi} '
           '--allow-run-as-root '
           '-hostfile $HOME/HOSTFILE '
           '--mca pml ^cm '
           '--mca btl tcp,self '
           '--mca btl_tcp_if_exclude docker0,lo '
           '--bind-to none '
           '-N 1 '
           './run_with_docker1.sh')
    if (mlperf_benchmark.NVPROF in FLAGS.mlperf_profiler
            or FLAGS.mlperf_keep_nccl_log):
        cmd += (r' && cp /tmp/pkb/cmd* {logdir}'.format(
            logdir=posixpath.join(vm_util.VM_TMP_DIR, benchmark)))

    samples = []
    metadata = _CreateMetadataDict(benchmark_spec)
    stdout, _ = master_vm.RobustRemoteCommand(cmd, should_log=True)
    if mlperf_benchmark.NONE in FLAGS.mlperf_profiler:
        samples.extend(MakeSamplesFromOutput(metadata, stdout,
                                             model=benchmark))

    if (mlperf_benchmark.NVPROF in FLAGS.mlperf_profiler
            or FLAGS.mlperf_keep_nccl_log):
        master_vm.RemoteCommand(
            r'mkdir -p /data/aggregated/{model}'.format(model=benchmark))
        master_vm.RemoteCommand(
            r'mpirun -hostfile $HOME/{hostfile} -N 1 scp -r {logdir} '
            r'{master_ip}:/data/aggregated/'.format(
                hostfile=HOSTFILE,
                logdir=posixpath.join(vm_util.VM_TMP_DIR, benchmark),
                master_ip=master_vm.internal_ip))

    return samples
def Prepare(bm_spec: benchmark_spec.BenchmarkSpec) -> None:
  """Installs and sets up MLPerf Inference on the target vm.

  Args:
    bm_spec: The benchmark specification

  Raises:
    errors.Config.InvalidValue upon both GPUs and TPUs appear in the config
  """
  vm = bm_spec.vms[0]

  repository = f'inference_results_{MLPERF_INFERENCE_VERSION}'
  vm.RemoteCommand(f'git clone https://github.com/mlcommons/{repository}.git')

  makefile = f'{repository}/closed/NVIDIA/Makefile'
  vm_util.ReplaceText(vm, 'shell uname -p', 'shell uname -m', makefile)

  requirements1 = f'{repository}/closed/NVIDIA/docker/requirements.1'
  vm_util.ReplaceText(vm, 'opencv-python-headless==4.5.2.52',
                      'opencv-python-headless==4.5.3.56', requirements1)
  requirements2 = f'{repository}/closed/NVIDIA/docker/requirements.2'

  benchmark = FLAGS.mlperf_benchmark
  if _SERVER_TARGET_QPS.value:
    config = f'{repository}/closed/NVIDIA/configs/{benchmark}/Server/__init__.py'
    vm_util.ReplaceText(vm, 'server_target_qps = .*',
                        f'server_target_qps = {_SERVER_TARGET_QPS.value}',
                        config)

  for requirements in (requirements1, requirements2):
    vm_util.ReplaceText(vm, 'git:', 'https:', requirements)

  if nvidia_driver.CheckNvidiaGpuExists(vm):
    vm.Install('cuda_toolkit')
    vm.Install('nvidia_driver')
    vm.Install('nvidia_docker')

  bm_spec.env_cmd = (f'export MLPERF_SCRATCH_PATH={_MLPERF_SCRATCH_PATH} && '
                     f'cd {repository}/closed/NVIDIA')
  docker.AddUser(vm)
  vm.RobustRemoteCommand(
      f'{bm_spec.env_cmd} && '
      'make build_docker NO_BUILD=1 && '
      'make docker_add_user && '
      'make launch_docker DOCKER_COMMAND="make clean" && '
      'make launch_docker DOCKER_COMMAND="make link_dirs"',
      should_log=True)
  if benchmark == mlperf_benchmark.DLRM:
    # Download data
    data_dir = posixpath.join(_MLPERF_SCRATCH_PATH, 'data', _DLRM_DATA_MODULE)
    vm.DownloadPreprovisionedData(data_dir, _DLRM_DATA_MODULE, _DLRM_DATA)
    vm.RemoteCommand(f'cd {data_dir} && gzip -d {_DLRM_DATA}')

    # Download model
    model_dir = posixpath.join(_MLPERF_SCRATCH_PATH, 'models', benchmark)
    vm.DownloadPreprovisionedData(model_dir, benchmark, _DLRM_MODEL)
    vm.RemoteCommand(f'cd {model_dir} && '
                     f'tar -zxvf {_DLRM_MODEL} && '
                     f'rm -f {_DLRM_MODEL}')
    vm.DownloadPreprovisionedData(model_dir, benchmark, _DLRM_ROW_FREQ)

    # Preprocess Data
    preprocessed_data_dir = posixpath.join(_MLPERF_SCRATCH_PATH,
                                           'preprocessed_data',
                                           _DLRM_DATA_MODULE)
    vm.DownloadPreprovisionedData(preprocessed_data_dir, _DLRM_DATA_MODULE,
                                  _DLRM_PREPROCESSED_DATA)
    vm.RemoteCommand(f'cd {preprocessed_data_dir} && '
                     f'tar -zxvf {_DLRM_PREPROCESSED_DATA} && '
                     f'rm -f {_DLRM_PREPROCESSED_DATA}')
  elif benchmark == mlperf_benchmark.BERT:
    # Download data
    data_dir = posixpath.join(_MLPERF_SCRATCH_PATH, 'data', 'squad')
    vm.DownloadPreprovisionedData(data_dir, benchmark, 'dev-v1.1.json')

    # Download model
    model_dir = posixpath.join(_MLPERF_SCRATCH_PATH, 'models', benchmark)
    vm.DownloadPreprovisionedData(model_dir, benchmark, 'bert_large_v1_1.onnx')
    vm.DownloadPreprovisionedData(model_dir, benchmark,
                                  'bert_large_v1_1_fake_quant.onnx')
    vm.DownloadPreprovisionedData(model_dir, benchmark, 'vocab.txt')

    # Preprocess Data
    preprocessed_data_dir = posixpath.join(_MLPERF_SCRATCH_PATH,
                                           'preprocessed_data',
                                           'squad_tokenized')
    vm.DownloadPreprovisionedData(preprocessed_data_dir, benchmark,
                                  'input_ids.npy')
    vm.DownloadPreprovisionedData(preprocessed_data_dir, benchmark,
                                  'input_mask.npy')
    vm.DownloadPreprovisionedData(preprocessed_data_dir, benchmark,
                                  'segment_ids.npy')
  else:
    vm.RobustRemoteCommand(
        f'{bm_spec.env_cmd} && '
        'make launch_docker DOCKER_COMMAND='
        f'"make download_data BENCHMARKS={benchmark}"',
        should_log=True)
    vm.RobustRemoteCommand(
        f'{bm_spec.env_cmd} && '
        'make launch_docker DOCKER_COMMAND='
        f'"make download_model BENCHMARKS={benchmark}"',
        should_log=True)
    vm.RobustRemoteCommand(
        f'{bm_spec.env_cmd} && '
        'make launch_docker DOCKER_COMMAND='
        f'"make preprocess_data BENCHMARKS={benchmark}"',
        should_log=True)

  vm.RobustRemoteCommand(
      f'{bm_spec.env_cmd} && '
      'make launch_docker DOCKER_COMMAND='
      '"make build" && '
      'make launch_docker DOCKER_COMMAND='
      '"make generate_engines RUN_ARGS=\''
      f'--benchmarks={FLAGS.mlperf_benchmark} '
      f'--scenarios={_SCENARIOS.value}\'"',
      should_log=True)
def _GetTfCnnBenchmarkCommand(vm,
                              model,
                              batch_size,
                              benchmark_spec,
                              args='',
                              job_name=''):
    """Create the command used to run the tf_cnn_benchmarks script.

  The command is either formulated using flag values stored on the
  benchmark_spec, or is essentially provided outright through the
  benchmark_args flag.

  Args:
    vm: the VM to run on.
    model: name of the model to run.
    batch_size: batch size to use for training.
    benchmark_spec: the benchmark spec object.
    args: string, distributed arguments
    job_name: string, distributed job name

  Returns:
    A string that runs the tf_cnn_benchmarks.py script
    with the desired arguments.
  """
    num_gpus = (nvidia_driver.QueryNumberOfGpus(vm)
                if nvidia_driver.CheckNvidiaGpuExists(vm) else 0)
    benchmark_spec.num_gpus = num_gpus

    if benchmark_spec.benchmark_args is not None:
        cmd = 'python tf_cnn_benchmarks.py ' + benchmark_spec.benchmark_args
        # If the user didn't specify num_gpus in the benchmark_args string,
        # use all the GPUs on the system.
        if '--num_gpus' not in benchmark_spec.benchmark_args and num_gpus:
            cmd = '{cmd} --num_gpus={num_gpus}'.format(cmd=cmd,
                                                       num_gpus=num_gpus)
        return cmd

    benchmark_spec.local_parameter_device = FLAGS.tf_local_parameter_device
    benchmark_spec.device = FLAGS.tf_device
    benchmark_spec.data_format = FLAGS.tf_data_format
    if num_gpus == 0:
        benchmark_spec.local_parameter_device = CPU
        benchmark_spec.device = CPU
        benchmark_spec.data_format = NHWC

    cmd = ('{env_vars} python tf_cnn_benchmarks.py '
           '--local_parameter_device={local_parameter_device} '
           '--batch_size={batch_size} '
           '--model={model} '
           '{data} '
           '--data_name={data_name} '
           '--variable_update={variable_update} '
           '--distortions={distortions} '
           '--device={device} '
           '--data_format={data_format} '
           '--forward_only={forward_only} '
           '--use_fp16={use_fp16} '
           '{num_gpus} '
           '{job_name}'.format(
               env_vars=tensorflow.GetEnvironmentVars(vm),
               local_parameter_device=benchmark_spec.local_parameter_device,
               batch_size=batch_size,
               model=model,
               data=('--data_dir={}'.format(benchmark_spec.data_dir)
                     if benchmark_spec.data_dir else ''),
               data_name=benchmark_spec.data_name,
               variable_update=benchmark_spec.variable_update,
               distortions=benchmark_spec.distortions,
               device=benchmark_spec.device,
               data_format=benchmark_spec.data_format,
               forward_only=benchmark_spec.forward_only,
               use_fp16=(benchmark_spec.precision == FP16),
               num_gpus='--num_gpus={}'.format(num_gpus) if num_gpus else '',
               job_name='--job_name={0} {1}'.format(job_name, args)
               if args else ''))
    return cmd
def Run(benchmark_spec):
    """Run MLPerf on the cluster.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
      required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
    _UpdateBenchmarkSpecWithFlags(benchmark_spec)
    vm = benchmark_spec.vms[0]
    if benchmark_spec.tpus:
        # For MLPerf 1.0, the benchmake code of different hardware are different.
        if (benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-32'
                or benchmark_spec.tpu_groups['train'].GetAcceleratorType()
                == 'v3-128'
                or benchmark_spec.tpu_groups['train'].GetAcceleratorType()
                == 'v3-256'
                or benchmark_spec.tpu_groups['train'].GetAcceleratorType()
                == 'v3-512'
                or benchmark_spec.tpu_groups['train'].GetAcceleratorType()
                == 'v3-1024'
                or benchmark_spec.tpu_groups['train'].GetAcceleratorType()
                == 'v3-2048'):
            run_path = (
                '$HOME/training_results_{version}/Google/benchmarks/{model}/tpu-{tpus}'
                .format(version=MLPERF_VERSION,
                        model=benchmark_spec.benchmark,
                        tpus=benchmark_spec.tpu_groups['train'].
                        GetAcceleratorType()))
            code_path = (
                '$HOME/training_results_{version}/Google/benchmarks/{model}/implementations/tpu-{tpus}-{model}'
                .format(version=MLPERF_VERSION,
                        model=benchmark_spec.benchmark,
                        tpus=benchmark_spec.tpu_groups['train'].
                        GetAcceleratorType()))

            if MASK in benchmark_spec.benchmark:
                model = 'mask_rcnn'
            elif GNMT in benchmark_spec.benchmark:
                model = 'nmt'
            else:
                model = benchmark_spec.benchmark

            mlperf_benchmark_cmd = (
                'cd {code_path} && '
                'export PYTHONPATH=$(pwd):$(pwd)/{model} && '
                'cd {model} && '
                '{run_path}/run_and_time.sh'.format(code_path=code_path,
                                                    model=model,
                                                    run_path=run_path))

            if SSD in benchmark_spec.benchmark:
                mlperf_benchmark_cmd = (
                    'export '
                    'MLP_GCS_RESNET_CHECKPOINT={checkpoint}'
                    ' && {cmd}'.format(
                        checkpoint=FLAGS.mlperf_gcs_resnet_checkpoint,
                        cmd=mlperf_benchmark_cmd))
        else:
            raise ValueError(
                'MLPerf configurations do not support the hardware in PKB. PKB may '
                'need to be updated if this is a new TPU type.')

    else:
        run_sub_paths = {
            RESNET: 'resnet/implementations/mxnet',
            TRANSFORMER: 'transformer/implementations/pytorch',
            MINIGO: 'minigo/implementations/tensorflow',
            MASK: 'maskrcnn/implementations/pytorch',
            GNMT: 'gnmt/implementations/pytorch',
            SSD: 'ssd/implementations/pytorch',
            BERT: 'bert/implementations/pytorch',
        }
        benchmark_path = f'$HOME/training_results_{MLPERF_VERSION}/NVIDIA/benchmarks'
        run_path = posixpath.join(benchmark_path,
                                  run_sub_paths[benchmark_spec.benchmark])
        env = {
            'DGXSYSTEM': DGXSYSTEM,
            'NEXP': 1,
            'PULL': 0,
            'LOGDIR': f'/tmp/{benchmark_spec.benchmark}',
        }
        envs = {
            RESNET: {},
            TRANSFORMER: {
                'DATADIR': '/data/wmt/utf8'
            },
            MINIGO: {
                'CONT': 'mlperf-nvidia:minigo'
            },
            MASK: {},
            GNMT: {
                'DATADIR': '/data/gnmt'
            },
            SSD: {
                'DATADIR': '/data'
            },
            BERT: {}
        }
        env.update(envs[benchmark_spec.benchmark])

        run_script = posixpath.join(run_path, 'run_with_docker.sh')
        vm_util.ReplaceText(vm, 'SYSLOGGING=1', 'SYSLOGGING=0', run_script)
        vm_util.ReplaceText(vm, 'docker exec -it', 'docker exec -t',
                            run_script)
        vm_util.ReplaceText(vm, 'nvidia-docker', 'sudo nvidia-docker',
                            run_script)
        vm_util.ReplaceText(vm, 'docker exec', 'sudo docker exec', run_script)
        vm_util.ReplaceText(vm, 'docker container', 'sudo docker container',
                            run_script)
        if benchmark_spec.benchmark == MASK:
            vm_util.ReplaceText(
                vm, r'_cont_mounts=\(',
                r'_cont_mounts=\(\"--volume=\${PKLDIR}:\/pkl_coco\" ',
                run_script)

        env = ' '.join(f'{key}={value}' for key, value in env.items())
        if nvidia_driver.CheckNvidiaGpuExists(vm):
            env = f'{tensorflow.GetEnvironmentVars(vm)} {env}'

        mlperf_benchmark_cmd = (f'chmod 755 {run_script} && '
                                f'cd {run_path} && '
                                f'{env} {run_script}')

    samples = []
    metadata = _CreateMetadataDict(benchmark_spec)
    stdout, _ = vm.RobustRemoteCommand(mlperf_benchmark_cmd, should_log=True)
    if NONE in FLAGS.mlperf_profiler:
        samples.extend(
            MakeSamplesFromOutput(metadata,
                                  stdout,
                                  use_tpu=bool(benchmark_spec.tpus),
                                  model=benchmark_spec.benchmark))
    return samples
def PrepareRunner(benchmark_spec, vm=None):
    """Install and set up MLPerf on the target vm.

    For TPU runs this stages the GCS output bucket, patches the Google
    reference scripts for this run's bucket/dataset/TPU, and installs the
    python dependencies.  For GPU runs it builds the per-benchmark NVIDIA
    docker images and downloads the datasets under /data.

    Args:
      benchmark_spec: The benchmark specification
      vm: The VM to work on

    Raises:
      ValueError: if the TPU type is not supported or the benchmark name is
        not recognized.
    """
    vm = vm or benchmark_spec.vms[0]
    if benchmark_spec.tpus:
        # Only the first VM creates/owns the shared GCS bucket used as the
        # model output directory.
        if vm == benchmark_spec.vms[0]:
            storage_service = gcs.GoogleCloudStorageService()
            benchmark_spec.storage_service = storage_service
            if FLAGS.mlperf_bucket:
                bucket = FLAGS.mlperf_bucket
                benchmark_spec.model_dir = f'gs://{bucket}/pkb-{FLAGS.run_uri}'
            else:
                # The f-string already interpolates run_uri; the former
                # trailing .format(uri=...) call was a no-op.
                bucket = f'pkb-{FLAGS.run_uri}'
                benchmark_spec.model_dir = f'gs://{bucket}'

            benchmark_spec.bucket = bucket
            location = benchmark_spec.tpu_groups['train'].GetZone()
            storage_service.PrepareService(util.GetRegionFromZone(location))
            storage_service.MakeBucket(bucket)
            storage_service.AclBucket(benchmark_spec.gcp_service_account,
                                      gcs.WRITER, bucket)

        # For MLPerf 1.0, the benchmark code of different hardware are
        # different; only these v3 TPU slice sizes have reference code.
        accelerator_type = (
            benchmark_spec.tpu_groups['train'].GetAcceleratorType())
        if accelerator_type in ('v3-32', 'v3-128', 'v3-256', 'v3-512',
                                'v3-1024', 'v3-2048'):
            run_path = (
                '$HOME/training_results_{version}/Google/benchmarks/{model}/tpu-{tpus}'
                .format(version=MLPERF_VERSION,
                        model=benchmark_spec.benchmark,
                        tpus=accelerator_type))
        else:
            raise ValueError(
                'MLPerf configurations do not support the hardware in PKB. PKB may '
                'need to be updated if this is a new TPU type.')

        # A few models use a different directory name in the reference code.
        if MASK in benchmark_spec.benchmark:
            model = 'mask_rcnn'
        elif GNMT in benchmark_spec.benchmark:
            model = 'nmt'
        else:
            model = benchmark_spec.benchmark

        code_path = (
            '$HOME/training_results_{version}/Google/benchmarks/{model}/implementations/tpu-{tpus}-{model}'
            .format(
                version=MLPERF_VERSION,
                model=benchmark_spec.benchmark,
                tpus=accelerator_type))

        vm.RemoteCommand('pip3 install --upgrade pyyaml==3.13 ')
        vm.RemoteCommand('pip3 install cloud-tpu-profiler==1.12')
        if (MASK in benchmark_spec.benchmark
                or SSD in benchmark_spec.benchmark):
            # Install the coco package, to load the coco dataset for Mask-RCNN
            # and SSD benchmarks.
            # TODO(user): coco whl package for python 3.5
            vm.RemoteCommand(
                'cd /tmp && '
                f'wget https://storage.cloud.google.com/mlperf_artifcats/{MLPERF_VERSION}_training/coco-1.1-cp36-cp36m-linux_x86_64.whl'
            )

        # The reference setup.sh assumes python2 'pip'; switch it to pip3 and
        # drop the '--progress-bar off' option it passes.
        setup_script = posixpath.join(run_path, 'setup.sh')
        vm_util.ReplaceText(vm, '--progress-bar off', ' ', setup_script)
        vm_util.ReplaceText(vm, 'pip ', 'pip3 ', setup_script)
        vm.RemoteCommand(
            'chmod 755 {script} && {script}'.format(script=setup_script))

        if MASK not in benchmark_spec.benchmark:
            vm.RemoteCommand(
                'pip3 uninstall -y tf-estimator-nightly && '
                'pip3 install tf-estimator-nightly==1.14.0.dev2019051801')

        # Pick the dataset location that matches the benchmark.
        if RESNET in benchmark_spec.benchmark:
            data_dir = benchmark_spec.imagenet_data_dir
        elif TRANSFORMER in benchmark_spec.benchmark:
            data_dir = benchmark_spec.wmt_data_dir
        elif MASK in benchmark_spec.benchmark:
            data_dir = benchmark_spec.coco_data_dir
        elif GNMT in benchmark_spec.benchmark:
            data_dir = benchmark_spec.gnmt_data_dir
        elif SSD in benchmark_spec.benchmark:
            data_dir = benchmark_spec.coco_data_dir
        elif BERT in benchmark_spec.benchmark:
            data_dir = benchmark_spec.bert_data_dir
        else:
            raise ValueError(
                'Unknown operation, cannot find {} in benchmark'.format(
                    benchmark_spec.benchmark))

        # Rewrite the flags baked into run_and_time.sh to point at this run's
        # bucket, dataset and TPU.  Forward slashes are escaped to '\/'
        # because ReplaceText feeds these values into a sed substitution.
        run_script = posixpath.join(run_path, 'run_and_time.sh')
        data_dir = data_dir.replace('/', r'\/')
        checkpoint = FLAGS.mlperf_gcs_resnet_checkpoint.replace('/', r'\/')
        decode_dir = FLAGS.mlperf_transformer_decode_dir.replace('/', r'\/')
        tpu = benchmark_spec.tpu_groups['train'].GetName()
        vm_util.ReplaceText(vm, '--model_dir=.*',
                            r'--model_dir=gs:\/\/{} \\\\'.format(bucket),
                            run_script)
        vm_util.ReplaceText(vm, '--data_dir=.*',
                            r'--data_dir={} \\\\'.format(data_dir), run_script)
        vm_util.ReplaceText(
            vm, '--training_file_pattern=.*',
            r'--training_file_pattern={}\/train-* \\\\'.format(data_dir),
            run_script)
        vm_util.ReplaceText(
            vm, '--validation_file_pattern=.*',
            r'--validation_file_pattern={}\/val-* \\\\'.format(data_dir),
            run_script)
        vm_util.ReplaceText(
            vm, '--val_json_file=.*',
            r'--val_json_file={}\/instances_val2017.json \\\\'.format(
                data_dir), run_script)
        vm_util.ReplaceText(vm, '--resnet_checkpoint=.*',
                            r'--resnet_checkpoint={} \\\\'.format(checkpoint),
                            run_script)
        vm_util.ReplaceText(
            vm, '--decode_from_file=.*',
            r'--decode_from_file={}\/wmt14-en-de.src \\\\'.format(decode_dir),
            run_script)
        vm_util.ReplaceText(
            vm, '--decode_reference=.*',
            r'--decode_reference={}\/wmt14-en-de.ref \\\\'.format(decode_dir),
            run_script)
        vm_util.ReplaceText(
            vm, '--decode_to_file=.*',
            r'--decode_to_file={}\/decode.transformer_mlperf_tpu.'
            r'translate_ende_wmt32k_packed.2x2_log_1018_2 \\\\'.format(bucket),
            run_script)
        vm_util.ReplaceText(vm, '--tpu=.*', r'--tpu={} \\\\'.format(tpu),
                            run_script)
        vm_util.ReplaceText(vm, '--output_dir=.*',
                            r'--output_dir=gs:\/\/{} \\\\'.format(bucket),
                            run_script)
        vm_util.ReplaceText(vm, '--cloud_tpu_name=.*',
                            r'--cloud_tpu_name={} \\\\'.format(tpu),
                            run_script)
        vm_util.ReplaceText(vm, '--out_dir=.*',
                            r'--out_dir=gs:\/\/{} \\\\'.format(bucket),
                            run_script)
        vm_util.ReplaceText(vm, '--tpu_name=.*',
                            r'--tpu_name={} \\\\'.format(tpu), run_script)
        vm.RemoteCommand('chmod 755 {}'.format(run_script))

        if GNMT in benchmark_spec.benchmark:
            # GNMT's metric script shells out to 'sacrebleu'; invoke it via
            # python3 -m so it works without a console entry point.
            metric_script = posixpath.join(code_path, model, 'metric.py')
            vm_util.ReplaceText(vm, ' sacrebleu -t',
                                ' python3 -m sacrebleu -t', metric_script)
    else:
        # GPU run: install docker tooling, expose /scratch as /data, then
        # build the per-benchmark NVIDIA docker image and fetch its dataset.
        benchmark_spec.model_dir = '/tmp'

        has_gpu = nvidia_driver.CheckNvidiaGpuExists(vm)
        if has_gpu:
            vm.Install('cuda_toolkit')

        vm.Install('nvidia_docker')
        vm.RemoteCommand(
            'if [ ! -d "/data" ]; then sudo ln -s /scratch /data; fi')

        if RESNET in benchmark_spec.benchmark:
            # NOTE(review): run_script is assigned here but not used again in
            # this function — confirm whether a caller relies on it.
            run_script = f'training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/resnet/implementations/mxnet/run_and_time.sh'
            vm.RemoteCommand(
                f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/resnet/implementations/mxnet &&'
                ' sudo docker build --network=host . -t mlperf-nvidia:image_classification',
                should_log=True)
            _DownloadData(benchmark_spec.imagenet_data_dir,
                          posixpath.join('/data', 'imagenet'), vm)

        if TRANSFORMER in benchmark_spec.benchmark:
            vm.RemoteCommand(
                f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/transformer/implementations/pytorch &&'
                ' sudo docker build --network=host . -t mlperf-nvidia:translation',
                should_log=True)
            _DownloadData(benchmark_spec.wmt_data_dir,
                          posixpath.join('/data', 'wmt'), vm)

        if MINIGO in benchmark_spec.benchmark:
            build_path = f'training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/minigo/implementations/tensorflow'
            run_script = posixpath.join(build_path, 'run_and_time.sh')
            # Point get_data.py at the pre-trained minigo model directory.
            vm_util.ReplaceText(
                vm, 'get_data.py', 'get_data.py --src_dir={}'.format(
                    FLAGS.minigo_model_dir.replace('/', r'\/')), run_script)
            vm.RemoteCommand('cd {} && sudo docker build --network=host -t '
                             'mlperf-nvidia:minigo .'.format(build_path),
                             should_log=True)

        if MASK in benchmark_spec.benchmark:
            vm.RemoteCommand(
                f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/maskrcnn/implementations/pytorch && '
                'sudo docker build --network=host -t mlperf-nvidia:object_detection . ',
                should_log=True)
            _DownloadData(benchmark_spec.coco_data_dir,
                          posixpath.join('/data', 'coco2017'), vm)

        if GNMT in benchmark_spec.benchmark:
            vm.RemoteCommand(
                f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/gnmt/implementations/pytorch && '
                'sudo docker build --network=host -t mlperf-nvidia:rnn_translator . ',
                should_log=True)
            _DownloadData(benchmark_spec.gnmt_data_dir,
                          posixpath.join('/data', 'gnmt'), vm)

        if SSD in benchmark_spec.benchmark:
            vm.RemoteCommand(
                f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/ssd/implementations/pytorch && '
                'sudo docker build --network=host -t mlperf-nvidia:single_stage_detector . ',
                should_log=True)
            _DownloadData(benchmark_spec.coco_data_dir,
                          posixpath.join('/data', 'coco2017'), vm)

        if BERT in benchmark_spec.benchmark:
            vm.RemoteCommand(
                f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/bert/implementations/pytorch && '
                'sudo docker build --network=host -t mlperf-nvidia:language_model . ',
                should_log=True)
            _DownloadData(benchmark_spec.bert_data_dir,
                          posixpath.join('/data', 'bert_data'), vm)
def Run(benchmark_spec):
    """Run MNIST on the cluster.

    Args:
      benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.

    Returns:
      A list of sample.Sample objects.
    """
    _UpdateBenchmarkSpecWithFlags(benchmark_spec)
    vm = benchmark_spec.vms[0]

    # Build the base command: the TPU path drives mnist_tpu.py from the tpu/
    # checkout; the GPU/CPU path drives mnist.py from the models/ checkout.
    if benchmark_spec.tpus:
        mnist_benchmark_script = 'mnist_tpu.py'
        mnist_benchmark_cmd = ('cd tpu/models && '
                               'export PYTHONPATH=$(pwd) && '
                               'cd official/mnist && '
                               'python {script} '
                               '--data_dir={data_dir} '
                               '--iterations={iterations} '
                               '--model_dir={model_dir} '
                               '--batch_size={batch_size}'.format(
                                   script=mnist_benchmark_script,
                                   data_dir=benchmark_spec.data_dir,
                                   iterations=benchmark_spec.iterations,
                                   model_dir=benchmark_spec.model_dir,
                                   batch_size=benchmark_spec.batch_size))
    else:
        mnist_benchmark_script = 'mnist.py'
        mnist_benchmark_cmd = ('cd models && '
                               'export PYTHONPATH=$(pwd) && '
                               'cd official/mnist && '
                               'python {script} '
                               '--data_dir={data_dir} '
                               '--model_dir={model_dir} '
                               '--batch_size={batch_size} '.format(
                                   script=mnist_benchmark_script,
                                   data_dir=benchmark_spec.data_dir,
                                   model_dir=benchmark_spec.model_dir,
                                   batch_size=benchmark_spec.batch_size))

    if nvidia_driver.CheckNvidiaGpuExists(vm):
        mnist_benchmark_cmd = '{env} {cmd}'.format(
            env=tensorflow.GetEnvironmentVars(vm), cmd=mnist_benchmark_cmd)
    samples = []
    metadata = CreateMetadataDict(benchmark_spec)
    # Initialize before the phases so the eval phase below has a defined
    # value even when training is skipped (train_steps <= 0).  Previously an
    # eval-only run raised NameError on this variable.
    elapsed_seconds = 0

    if benchmark_spec.train_steps > 0:
        if benchmark_spec.tpus:
            tpu = benchmark_spec.tpu_groups['train'].GetName()
            num_shards = '--num_shards={}'.format(
                benchmark_spec.tpu_groups['train'].GetNumShards())
        else:
            tpu = num_shards = ''

        if benchmark_spec.tpus:
            mnist_benchmark_train_cmd = (
                '{cmd} --tpu={tpu} --use_tpu={use_tpu} --train_steps={train_steps} '
                '{num_shards} --noenable_predict'.format(
                    cmd=mnist_benchmark_cmd,
                    tpu=tpu,
                    use_tpu=bool(benchmark_spec.tpus),
                    train_steps=benchmark_spec.train_steps,
                    num_shards=num_shards))
        else:
            mnist_benchmark_train_cmd = (
                '{cmd} --train_epochs={train_epochs} '.format(
                    cmd=mnist_benchmark_cmd,
                    train_epochs=benchmark_spec.train_epochs))

        start = time.time()
        stdout, stderr = vm.RobustRemoteCommand(mnist_benchmark_train_cmd,
                                                should_log=True)
        elapsed_seconds = (time.time() - start)
        samples.extend(
            MakeSamplesFromTrainOutput(metadata, stdout + stderr,
                                       elapsed_seconds,
                                       benchmark_spec.train_steps))

    if benchmark_spec.eval_steps > 0:
        if benchmark_spec.tpus:
            mnist_benchmark_eval_cmd = (
                '{cmd} --tpu={tpu} --use_tpu={use_tpu} --eval_steps={eval_steps}'
                .format(cmd=mnist_benchmark_cmd,
                        use_tpu=bool(benchmark_spec.tpus),
                        tpu=benchmark_spec.tpu_groups['eval'].GetName(),
                        eval_steps=benchmark_spec.eval_steps))
        else:
            mnist_benchmark_eval_cmd = (
                '{cmd} --eval_steps={eval_steps}'.format(
                    cmd=mnist_benchmark_cmd,
                    eval_steps=benchmark_spec.eval_steps))

        stdout, stderr = vm.RobustRemoteCommand(mnist_benchmark_eval_cmd,
                                                should_log=True)
        samples.extend(
            MakeSamplesFromEvalOutput(metadata, stdout + stderr,
                                      elapsed_seconds))
    return samples
# Beispiel #18
# 0
def Run(benchmark_spec):
  """Run MLPerf on the cluster.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
      required to run the benchmark.

  Returns:
    A list of sample.Sample objects.

  Raises:
    ValueError: if the TPU type or benchmark name is not supported.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vm = benchmark_spec.vms[0]
  if benchmark_spec.tpus:
    # For MLPerf v0.6, the benchmark code of different hardware are different.
    if (benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-32' or
        benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-128' or
        benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-256' or
        benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-512' or
        benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-1024' or
        benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-2048'):
      run_path = (
          '$HOME/training_results_v0.6/Google/benchmarks/{model}/tpu-{tpus}'
          .format(
              model=benchmark_spec.benchmark,
              tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType()))
      code_path = (
          '$HOME/training_results_v0.6/Google/benchmarks/{model}/implementations/tpu-{tpus}-{model}'
          .format(
              model=benchmark_spec.benchmark,
              tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType()))

      # A few models use a different directory name in the reference code.
      if 'mask' in benchmark_spec.benchmark:
        model = 'mask_rcnn'
      elif 'gnmt' in benchmark_spec.benchmark:
        model = 'nmt'
      else:
        model = benchmark_spec.benchmark

      mlperf_benchmark_cmd = ('cd {code_path} && '
                              'export PYTHONPATH=$(pwd):$(pwd)/{model} && '
                              'cd {model} && '
                              '{run_path}/run_and_time.sh'.format(
                                  code_path=code_path,
                                  model=model,
                                  run_path=run_path))

      if 'ssd' in benchmark_spec.benchmark:
        mlperf_benchmark_cmd = (
            'export '
            'MLP_GCS_RESNET_CHECKPOINT={checkpoint}'
            ' && {cmd}'.format(
                checkpoint=FLAGS.mlperf_gcs_resnet_checkpoint,
                cmd=mlperf_benchmark_cmd))
    else:
      raise ValueError(
          'MLPerf configurations do not support the hardware in PKB. PKB may '
          'need to be updated if this is a new TPU type.')

  else:
    benchmark_path = '$HOME/training_results_v0.6/NVIDIA/benchmarks'
    common_env = 'DGXSYSTEM=DGX1 NEXP=1'
    if 'resnet' in benchmark_spec.benchmark:
      run_path = posixpath.join(benchmark_path, 'resnet/implementations/mxnet')
      env = 'DATADIR=/data/imagenet LOGDIR=/tmp/resnet PULL=0'
    elif 'transformer' in benchmark_spec.benchmark:
      run_path = posixpath.join(benchmark_path,
                                'transformer/implementations/pytorch')
      env = 'DATADIR=/data/wmt/utf8 LOGDIR=/tmp/transformer PULL=0'
    elif 'minigo' in benchmark_spec.benchmark:
      run_path = posixpath.join(benchmark_path,
                                'minigo/implementations/tensorflow')
      env = 'LOGDIR=/tmp/minigo CONT=mlperf-nvidia:minigo'
    elif 'mask' in benchmark_spec.benchmark:
      run_path = posixpath.join(benchmark_path,
                                'maskrcnn/implementations/pytorch')
      env = 'LOGDIR=/tmp/mask DATADIR=/data PULL=0'
    elif 'gnmt' in benchmark_spec.benchmark:
      run_path = posixpath.join(benchmark_path, 'gnmt/implementations/pytorch')
      env = 'LOGDIR=/tmp/gnmt DATADIR=/data/gnmt PULL=0'
    elif 'ssd' in benchmark_spec.benchmark:
      run_path = posixpath.join(benchmark_path, 'ssd/implementations/pytorch')
      env = 'LOGDIR=/tmp/ssd DATADIR=/data PULL=0'
    else:
      # Previously an unrecognized benchmark left run_path/env unbound and
      # crashed below with NameError; fail explicitly instead, matching the
      # TPU branch's error-handling style.
      raise ValueError(
          'Unknown operation, cannot find {} in benchmark'.format(
              benchmark_spec.benchmark))

    run_script = posixpath.join(run_path, 'run.sub')
    vm_util.ReplaceText(vm, 'SYSLOGGING=1', 'SYSLOGGING=0', run_script)
    mlperf_benchmark_cmd = (
        'cd {run_path} && chmod 755 run.sub && sudo {common_env} {env} '
        './run.sub'.format(run_path=run_path, common_env=common_env, env=env))

  if nvidia_driver.CheckNvidiaGpuExists(vm):
    mlperf_benchmark_cmd = '{env} {cmd}'.format(
        env=tensorflow.GetEnvironmentVars(vm), cmd=mlperf_benchmark_cmd)

  samples = []
  metadata = _CreateMetadataDict(benchmark_spec)
  stdout, _ = vm.RobustRemoteCommand(mlperf_benchmark_cmd, should_log=True)
  samples.extend(
      MakeSamplesFromOutput(
          metadata,
          stdout,
          use_tpu=bool(benchmark_spec.tpus),
          model=benchmark_spec.benchmark))
  return samples
def Run(benchmark_spec):
  """Run ResNet on the cluster.

  Builds the benchmark command once, then alternates training and evaluation
  every `steps_per_eval` steps until `train_steps` is reached.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vm = benchmark_spec.vms[0]
  if benchmark_spec.tpus:
    # TPU path: drive the official TPU trainer (resnet_main.py) with batch
    # sizes, precision and image counts taken from the benchmark spec.
    resnet_benchmark_script = 'resnet_main.py'
    resnet_benchmark_cmd = (
        '{env_cmd} && '
        'cd tpu/models && '
        'export PYTHONPATH=$(pwd) &&'
        'cd official/resnet && '
        'python {script} '
        '--use_tpu={use_tpu} '
        '--data_dir={data_dir} '
        '--model_dir={model_dir} '
        '--resnet_depth={depth} '
        '--train_batch_size={train_batch_size} '
        '--eval_batch_size={eval_batch_size} '
        '--iterations_per_loop={iterations} '
        '--data_format={data_format} '
        '--precision={precision} '
        '--skip_host_call={skip_host_call} '
        '--num_train_images={num_train_images} '
        '--num_eval_images={num_eval_images}'.format(
            env_cmd=benchmark_spec.env_cmd,
            script=resnet_benchmark_script,
            use_tpu=bool(benchmark_spec.tpus),
            data_dir=benchmark_spec.data_dir,
            model_dir=benchmark_spec.model_dir,
            depth=benchmark_spec.depth,
            train_batch_size=benchmark_spec.train_batch_size,
            eval_batch_size=benchmark_spec.eval_batch_size,
            iterations=benchmark_spec.iterations,
            data_format=benchmark_spec.data_format,
            precision=benchmark_spec.precision,
            skip_host_call=benchmark_spec.skip_host_call,
            num_train_images=benchmark_spec.num_train_images,
            num_eval_images=benchmark_spec.num_eval_images))
  else:
    # GPU/CPU path: drive imagenet_main.py from the models/ checkout; data is
    # expected pre-staged under /data/imagenet.
    resnet_benchmark_script = 'imagenet_main.py'
    resnet_benchmark_cmd = ('{env_cmd} && '
                            'cd models && '
                            'export PYTHONPATH=$(pwd) && '
                            'cd official/r1/resnet && '
                            'python {script} '
                            '--data_dir=/data/imagenet '
                            '--model_dir={model_dir} '
                            '--resnet_size={resnet_size} '
                            '--batch_size={batch_size} '
                            '--data_format={data_format} '.format(
                                env_cmd=benchmark_spec.env_cmd,
                                script=resnet_benchmark_script,
                                model_dir=benchmark_spec.model_dir,
                                resnet_size=benchmark_spec.depth,
                                batch_size=benchmark_spec.train_batch_size,
                                data_format=benchmark_spec.data_format))
    # Effectively str(benchmark_spec.precision); kept as-is.
    precision = '{precision}'.format(precision=benchmark_spec.precision)
    if precision == 'bfloat16':
      # bfloat16 is a TPU precision name; on GPU it is mapped to fp16.
      resnet_benchmark_cmd = '{cmd} --dtype=fp16'.format(
          cmd=resnet_benchmark_cmd)
    else:
      resnet_benchmark_cmd = '{cmd} --dtype=fp32'.format(
          cmd=resnet_benchmark_cmd)

    if nvidia_driver.CheckNvidiaGpuExists(vm):
      resnet_benchmark_cmd = '{env} {cmd} --num_gpus={num_gpus}'.format(
          env=tensorflow.GetEnvironmentVars(vm),
          cmd=resnet_benchmark_cmd,
          num_gpus=nvidia_driver.QueryNumberOfGpus(vm))

  samples = []
  metadata = _CreateMetadataDict(benchmark_spec)
  elapsed_seconds = 0
  steps_per_eval = benchmark_spec.steps_per_eval
  train_steps = benchmark_spec.train_steps
  # Train in steps_per_eval-sized increments; the final iteration is clamped
  # to train_steps so the checkpoint never overshoots the target.
  for step in range(steps_per_eval, train_steps + steps_per_eval,
                    steps_per_eval):
    step = min(step, train_steps)
    resnet_benchmark_cmd_step = '{cmd} --train_steps={step}'.format(
        cmd=resnet_benchmark_cmd, step=step)

    if benchmark_spec.mode in ('train', 'train_and_eval'):
      if benchmark_spec.tpus:
        tpu = benchmark_spec.tpu_groups['train'].GetName()
        num_cores = '--num_cores={}'.format(
            benchmark_spec.tpu_groups['train'].GetNumShards())
        resnet_benchmark_train_cmd = (
            '{cmd} --tpu={tpu} --mode=train {num_cores}'.format(
                cmd=resnet_benchmark_cmd_step, tpu=tpu, num_cores=num_cores))
      else:
        resnet_benchmark_train_cmd = (
            '{cmd} --max_train_steps={max_train_steps} '
            '--train_epochs={train_epochs} --noeval_only'.format(
                cmd=resnet_benchmark_cmd,
                train_epochs=benchmark_spec.epochs_per_eval,
                max_train_steps=step))

      # Accumulate wall-clock training time across increments for reporting.
      start = time.time()
      stdout, stderr = vm.RobustRemoteCommand(resnet_benchmark_train_cmd,
                                              should_log=True)
      elapsed_seconds += (time.time() - start)
      samples.extend(mnist_benchmark.MakeSamplesFromTrainOutput(
          metadata, stdout + stderr, elapsed_seconds, step))

    if benchmark_spec.mode in ('train_and_eval', 'eval'):
      if benchmark_spec.tpus:
        tpu = benchmark_spec.tpu_groups['eval'].GetName()
        num_cores = '--num_cores={}'.format(
            benchmark_spec.tpu_groups['eval'].GetNumShards())
        resnet_benchmark_eval_cmd = (
            '{cmd} --tpu={tpu} --mode=eval {num_cores}'.format(
                cmd=resnet_benchmark_cmd_step, tpu=tpu, num_cores=num_cores))
      else:
        # NOTE(review): the non-TPU eval command is built from the base cmd
        # without --train_steps (unlike the TPU eval) — presumably intentional
        # since --eval_only skips training; confirm against the trainer flags.
        resnet_benchmark_eval_cmd = ('{cmd} --eval_only'.format(
            cmd=resnet_benchmark_cmd))

      stdout, stderr = vm.RobustRemoteCommand(resnet_benchmark_eval_cmd,
                                              should_log=True)
      samples.extend(
          MakeSamplesFromEvalOutput(
              metadata,
              stdout + stderr,
              elapsed_seconds,
              use_tpu=bool(benchmark_spec.tpus)))
  return samples