Example #1
def Install(vm):
    """Installs TensorFlow on the VM."""
    has_gpu = cuda_toolkit.CheckNvidiaGpuExists(vm)
    tf_pip_package = (FLAGS.tf_gpu_pip_package
                      if has_gpu else FLAGS.tf_cpu_pip_package)

    if has_gpu:
        vm.Install('cuda_toolkit')
        vm.Install('cudnn')

        # TODO(ferneyhough): Move NCCL installation to its own package.
        # Currently this is dependent on CUDA 9 being installed.
        vm.RemoteCommand('wget %s' % NCCL_URL)
        vm.RemoteCommand('sudo dpkg -i %s' % NCCL_PACKAGE)
        vm.RemoteCommand('sudo apt install libnccl2=2.3.5-2+cuda9.0 '
                         'libnccl-dev=2.3.5-2+cuda9.0')

    vm.Install('pip')
    vm.RemoteCommand('sudo pip install requests')
    vm.RemoteCommand('sudo pip install --upgrade absl-py')
    vm.RemoteCommand('sudo pip install --upgrade %s' % tf_pip_package,
                     should_log=True)
    vm.RemoteCommand('sudo pip install --upgrade %s' % FLAGS.t2t_pip_package,
                     should_log=True)
    vm.InstallPackages('git')
    vm.RemoteCommand('git clone https://github.com/tensorflow/benchmarks.git',
                     should_log=True)
    vm.RemoteCommand('cd benchmarks && git checkout {}'.format(
        FLAGS.tf_cnn_benchmarks_branch))
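These Install functions run against PerfKitBenchmarker's virtual-machine abstraction rather than a raw SSH client. As a minimal sketch (FakeVm is purely illustrative and not part of PKB), a stub exposing just the methods called above is enough to trace the install flow without provisioning anything:

class FakeVm(object):
    """Illustrative stand-in that records the calls Install(vm) makes."""

    def Install(self, package_name):
        print('vm.Install(%r)' % package_name)

    def InstallPackages(self, packages):
        print('vm.InstallPackages(%r)' % packages)

    def RemoteCommand(self, cmd, should_log=False):
        print('vm.RemoteCommand(%r)' % cmd)
        return '', ''  # mirrors the (stdout, stderr) tuple the real method returns

Passing such a stub to Install prints every pip and git command in order, assuming the surrounding PKB modules (FLAGS, cuda_toolkit) are importable.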
Example #2
def Run(benchmark_spec):
    """Run MLPerf on the cluster.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
      required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
    _UpdateBenchmarkSpecWithFlags(benchmark_spec)
    vm = benchmark_spec.vms[0]
    if benchmark_spec.tpus:
        # For MLPerf v0.5, the benchmark code differs for each hardware type.
        if benchmark_spec.tpu_groups['train'].GetNumShards() > 8:
            code_path = 'cloud_v2.512/resnet-tpuv2-512/code/resnet/model'
        elif benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v2-8':
            code_path = 'cloud_v2.8/resnet-tpuv2-8/code/resnet/model'
        elif benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-8':
            code_path = 'cloud_v3.8/resnet-tpuv3-8/code/resnet/model'
        else:
            raise ValueError(
                'MLPerf configurations do not support the hardware in PKB. PKB may '
                'need to be updated if this is a new TPU type.')
        cmd = 'bash run_helper.sh 2>&1 | tee output.txt'
    else:
        code_path = 'cloud_v100x8/code/resnet'
        cmd = ('sudo nvidia-docker build . -t foo && '
               'sudo nvidia-docker run -v $MLP_HOST_DATA_DIR:/data -v '
               '$MLP_HOST_OUTPUT_DIR:/output -v /proc:/host_proc -t '
               'foo:latest run_helper_8xV100.sh 2>&1 | tee output.txt')
    mlperf_benchmark_cmd = (
        'export MLP_GCS_MODEL_DIR={model_dir} && '
        'export MLP_PATH_GCS_IMAGENET={data_dir} && '
        'export MLP_TPU_NAME={tpu_train} && '
        'export MLP_PATH_GCS_EUW_IMAGENET={data_dir} && '
        'export MLP_GCS_EUW_MODEL_DIR={model_dir} && '
        'export MLP_TPU_SIDECAR_NAME={tpu_eval} && '
        'export MLP_HOST_DATA_DIR=/data && '
        'export MLP_HOST_OUTPUT_DIR=`pwd`/output && '
        'export PYTHONPATH=$PYTHONPATH:$PWD/tpu/models && '
        'cd results/v0.5.0/google/{code_path} && '
        'sed -i "s/python /python3 /g" run_helper*.sh && '
        'mkdir -p $MLP_HOST_OUTPUT_DIR && '
        '{cmd}'.format(model_dir=benchmark_spec.model_dir,
                       data_dir=benchmark_spec.data_dir,
                       tpu_train=(benchmark_spec.tpu_groups['train'].GetName()
                                  if benchmark_spec.tpus else ''),
                       tpu_eval=(benchmark_spec.tpu_groups['eval'].GetName()
                                 if benchmark_spec.tpus else ''),
                       code_path=code_path,
                       cmd=cmd))
    if cuda_toolkit.CheckNvidiaGpuExists(vm):
        mlperf_benchmark_cmd = '{env} {cmd}'.format(
            env=tensorflow.GetEnvironmentVars(vm), cmd=mlperf_benchmark_cmd)
    samples = []
    metadata = _CreateMetadataDict(benchmark_spec)
    stdout, _ = vm.RobustRemoteCommand(mlperf_benchmark_cmd, should_log=True)
    samples.extend(MakeSamplesFromOutput(metadata, stdout))
    return samples
def Run(benchmark_spec):
    """Run MNIST on the cluster.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
    _UpdateBenchmarkSpecWithFlags(benchmark_spec)
    vm = benchmark_spec.vms[0]
    mnist_benchmark_dir = 'tpu-demos/cloud_tpu/models/mnist'
    mnist_benchmark_cmd = (
        'python mnist.py --master={master} --train_file={train_file} '
        '--use_tpu={use_tpu} '
        '--train_steps={train_steps}'.format(
            master=benchmark_spec.master,
            train_file=benchmark_spec.train_file,
            use_tpu=benchmark_spec.use_tpu,
            train_steps=benchmark_spec.train_steps))
    if benchmark_spec.model_dir:
        mnist_benchmark_cmd = '{cmd} --model_dir {model_dir}'.format(
            cmd=mnist_benchmark_cmd, model_dir=benchmark_spec.model_dir)
    if cuda_toolkit.CheckNvidiaGpuExists(vm):
        mnist_benchmark_cmd = '%s %s' % (tensorflow.GetEnvironmentVars(vm),
                                         mnist_benchmark_cmd)
    run_command = 'cd %s && %s' % (mnist_benchmark_dir, mnist_benchmark_cmd)
    stdout, stderr = vm.RobustRemoteCommand(run_command, should_log=True)
    return _MakeSamplesFromOutput(benchmark_spec, stdout + stderr)
Example #4
def Run(benchmark_spec):
    """Run MNIST on the cluster.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
    _UpdateBenchmarkSpecWithFlags(benchmark_spec)
    vm = benchmark_spec.vms[0]
    mnist_benchmark_script = 'tpu/cloud_tpu/models/mnist/mnist.py'
    mnist_benchmark_cmd = ('python {script} '
                           '--master={master} '
                           '--train_file={train_file} '
                           '--use_tpu={use_tpu} '
                           '--train_steps={train_steps} '
                           '--iterations={iterations} '
                           '--model_dir={model_dir}'.format(
                               script=mnist_benchmark_script,
                               master=benchmark_spec.master,
                               train_file=benchmark_spec.train_file,
                               use_tpu=benchmark_spec.use_tpu,
                               train_steps=benchmark_spec.train_steps,
                               iterations=benchmark_spec.iterations,
                               model_dir=benchmark_spec.model_dir))
    if cuda_toolkit.CheckNvidiaGpuExists(vm):
        mnist_benchmark_cmd = '{env} {cmd}'.format(
            env=tensorflow.GetEnvironmentVars(vm), cmd=mnist_benchmark_cmd)
    stdout, stderr = vm.RobustRemoteCommand(mnist_benchmark_cmd,
                                            should_log=True)
    return MakeSamplesFromOutput(_CreateMetadataDict(benchmark_spec),
                                 stdout + stderr)
def Prepare(benchmark_spec):
    """Install and set up ResNet on the target vm.

  Args:
    benchmark_spec: The benchmark specification

  Raises:
    errors.Config.InvalidValue: if both GPUs and TPUs are present in the config.
  """
    vm = benchmark_spec.vms[0]

    if (bool(benchmark_spec.tpus) and cuda_toolkit.CheckNvidiaGpuExists(vm)):
        raise errors.Config.InvalidValue(
            'Invalid configuration. GPUs and TPUs cannot both be present in the config.'
        )

    mnist_benchmark.Prepare(benchmark_spec)
    _UpdateBenchmarkSpecWithFlags(benchmark_spec)

    vm.Install('pyyaml')
    # Reinstall the requests lib so it is set up correctly; otherwise the experiment won't run.
    vm.RemoteCommand('sudo pip uninstall -y requests')
    vm.RemoteCommand('sudo pip install requests')

    if not benchmark_spec.tpus:
        local_data_path = posixpath.join('/data', 'imagenet')
        vm.RemoteCommand('sudo mkdir -p {data_path} && '
                         'sudo chmod a+w {data_path} && '
                         'gsutil -m cp -r {data_dir}/* {data_path}'.format(
                             data_dir=benchmark_spec.data_dir,
                             data_path=local_data_path))
Example #6
def _CreateMetadataDict(benchmark_spec, model, batch_size, num_gpus):
  """Create metadata dict to be used in run results.

  Args:
    benchmark_spec: benchmark spec
    model: model which was run
    batch_size: batch size used
    num_gpus: number of GPUs used

  Returns:
    metadata dict
  """
  vm = benchmark_spec.vms[0]
  metadata = dict()
  if cuda_toolkit.CheckNvidiaGpuExists(vm):
    metadata.update(cuda_toolkit.GetMetadata(vm))
    metadata['num_gpus'] = num_gpus
  metadata['model'] = model
  metadata['batch_size'] = batch_size
  metadata['forward_only'] = benchmark_spec.forward_only
  metadata['data_name'] = benchmark_spec.data_name
  metadata['variable_update'] = benchmark_spec.variable_update
  metadata['local_parameter_device'] = benchmark_spec.local_parameter_device
  metadata['device'] = benchmark_spec.device
  metadata['data_format'] = benchmark_spec.data_format
  metadata['distortions'] = benchmark_spec.distortions
  metadata['benchmarks_commit_hash'] = benchmark_spec.benchmarks_commit_hash
  metadata['tensorflow_version'] = benchmark_spec.tensorflow_version
  metadata['tensorflow_cpu_pip_package'] = (
      benchmark_spec.tensorflow_cpu_pip_package)
  metadata['tensorflow_gpu_pip_package'] = (
      benchmark_spec.tensorflow_gpu_pip_package)
  metadata['distributed'] = benchmark_spec.distributed
  return metadata
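The metadata dict produced here is attached to every measurement the benchmark emits. Assuming PKB's sample.Sample(metric, value, unit, metadata) constructor, a results helper typically wraps a parsed throughput number like this (the metric name and values are made up for illustration):

from perfkitbenchmarker import sample

metadata = {'model': 'resnet50', 'batch_size': 64, 'num_gpus': 1}
throughput_sample = sample.Sample('images_per_second', 1423.7, 'images/sec',
                                  metadata)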
Example #7
def Run(benchmark_spec):
  """Run MNIST on the cluster.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vm = benchmark_spec.vms[0]
  mnist_benchmark_script = 'mnist_tpu.py'
  mnist_benchmark_cmd = (
      'cd models/official/mnist && '
      'python {script} '
      '--data_dir={data_dir} '
      '--iterations={iterations} '
      '--model_dir={model_dir} '
      '--batch_size={batch_size}'.format(
          script=mnist_benchmark_script,
          data_dir=benchmark_spec.data_dir,
          iterations=benchmark_spec.iterations,
          model_dir=benchmark_spec.model_dir,
          batch_size=benchmark_spec.batch_size))
  if cuda_toolkit.CheckNvidiaGpuExists(vm):
    mnist_benchmark_cmd = '{env} {cmd}'.format(
        env=tensorflow.GetEnvironmentVars(vm), cmd=mnist_benchmark_cmd)
  samples = []
  metadata = CreateMetadataDict(benchmark_spec)
  elapsed_seconds = 0  # Stays 0 if only eval runs, so eval samples still get a value.
  if benchmark_spec.train_steps:
    if benchmark_spec.tpus:
      tpu = benchmark_spec.tpu_groups['train'].GetName()
      num_shards = '--num_shards={}'.format(
          benchmark_spec.tpu_groups['train'].GetNumShards())
    else:
      tpu = num_shards = ''
    mnist_benchmark_train_cmd = (
        '{cmd} --tpu={tpu} --use_tpu={use_tpu} --train_steps={train_steps} '
        '{num_shards} --noenable_predict'.format(
            cmd=mnist_benchmark_cmd, tpu=tpu, use_tpu=bool(benchmark_spec.tpus),
            train_steps=benchmark_spec.train_steps, num_shards=num_shards))
    start = time.time()
    stdout, stderr = vm.RobustRemoteCommand(mnist_benchmark_train_cmd,
                                            should_log=True)
    elapsed_seconds = (time.time() - start)
    samples.extend(MakeSamplesFromTrainOutput(
        metadata, stdout + stderr, elapsed_seconds, benchmark_spec.train_steps))
  if benchmark_spec.eval_steps:
    mnist_benchmark_eval_cmd = (
        '{cmd} --tpu="" --use_tpu=False --eval_steps={eval_steps}'.format(
            cmd=mnist_benchmark_cmd, eval_steps=benchmark_spec.eval_steps))
    stdout, stderr = vm.RobustRemoteCommand(mnist_benchmark_eval_cmd,
                                            should_log=True)
    samples.extend(MakeSamplesFromEvalOutput(metadata, stdout + stderr,
                                             elapsed_seconds))
  return samples
Example #8
def Install(vm):
    """Installs TensorFlow on the VM."""
    has_gpu = cuda_toolkit.CheckNvidiaGpuExists(vm)
    tf_pip_package = (FLAGS.tf_gpu_pip_package
                      if has_gpu else FLAGS.tf_cpu_pip_package)

    if has_gpu:
        vm.Install('cuda_toolkit')
        vm.Install('cudnn')

    vm.Install('pip')
    vm.RemoteCommand('sudo pip install --upgrade %s' % tf_pip_package,
                     should_log=True)
def Prepare(benchmark_spec):
    """Install and set up TensorFlow on the target vm.

  Args:
    benchmark_spec: The benchmark specification
  """
    _UpdateBenchmarkSpecWithFlags(benchmark_spec)
    vms = benchmark_spec.vms
    vm_util.RunThreaded(_PrepareVm, vms)
    benchmark_spec.tensorflow_version = tensorflow.GetTensorFlowVersion(vms[0])

    if cuda_toolkit.CheckNvidiaGpuExists(vms[0]):
        benchmark_spec.gpu_type = cuda_toolkit.GetGpuType(vms[0])
def _CreateMetadataDict(benchmark_spec, model, batch_size):
    """Create metadata dict to be used in run results.

  Args:
    benchmark_spec: benchmark spec
    model: model which was run
    batch_size: batch size used

  Returns:
    metadata dict
  """
    vm = benchmark_spec.vms[0]
    metadata = {}
    if cuda_toolkit.CheckNvidiaGpuExists(vm):
        metadata.update(cuda_toolkit.GetMetadata(vm))

    metadata['command_line'] = benchmark_spec.tf_cnn_benchmark_cmd
    metadata['cnn_benchmarks_branch'] = benchmark_spec.cnn_benchmarks_branch
    metadata['tensorflow_version'] = benchmark_spec.tensorflow_version
    metadata['tensorflow_cpu_pip_package'] = (
        benchmark_spec.tensorflow_cpu_pip_package)
    metadata['tensorflow_gpu_pip_package'] = (
        benchmark_spec.tensorflow_gpu_pip_package)
    # If we ran a custom command line through the benchmark_args flag,
    # add the metadata from that command and return. We don't need any more
    # metadata from this function, as it is likely invalid.
    if getattr(benchmark_spec, 'benchmark_args', None):
        metadata.update(
            _GetMetadataFromBenchmarkArgs(benchmark_spec.benchmark_args))
        return metadata

    metadata['model'] = model
    metadata['batch_size'] = batch_size
    metadata['forward_only'] = benchmark_spec.forward_only
    metadata['data_name'] = benchmark_spec.data_name
    metadata['data_dir'] = benchmark_spec.data_dir
    metadata['use_local_data'] = benchmark_spec.use_local_data
    metadata['variable_update'] = benchmark_spec.variable_update
    metadata['local_parameter_device'] = benchmark_spec.local_parameter_device
    metadata['device'] = benchmark_spec.device
    metadata['data_format'] = benchmark_spec.data_format
    metadata['distortions'] = benchmark_spec.distortions
    metadata['distributed'] = benchmark_spec.distributed
    metadata['precision'] = benchmark_spec.precision
    metadata['num_gpus'] = benchmark_spec.num_gpus
    return metadata
Example #11
def Install(vm):
    """Installs TensorFlow on the VM."""
    has_gpu = cuda_toolkit.CheckNvidiaGpuExists(vm)
    tf_pip_package = (FLAGS.tf_gpu_pip_package
                      if has_gpu else FLAGS.tf_cpu_pip_package)
    commit_hash = FLAGS.tf_benchmarks_commit_hash

    if has_gpu:
        vm.Install('cuda_toolkit')
        vm.Install('cudnn')

    vm.Install('pip')
    vm.RemoteCommand('sudo pip install --upgrade absl-py')
    vm.RemoteCommand('sudo pip install --upgrade %s' % tf_pip_package,
                     should_log=True)
    vm.RemoteCommand('git clone https://github.com/tensorflow/benchmarks.git',
                     should_log=True)
    vm.RemoteCommand('cd benchmarks && git checkout {}'.format(commit_hash))
Example #12
def GetEnvironmentVars(vm):
    """Return a string containing TensorFlow-related environment variables.

  Args:
    vm: vm to get environment variables from

  Returns:
    string of environment variables
  """
    if not cuda_toolkit.CheckNvidiaGpuExists(vm):
        return ''
    output, _ = vm.RemoteCommand('getconf LONG_BIT', should_log=True)
    long_bit = output.strip()
    lib_name = 'lib' if long_bit == '32' else 'lib64'
    return ' '.join([
        'PATH=%s${PATH:+:${PATH}}' %
        posixpath.join(FLAGS.cuda_toolkit_installation_dir, 'bin'),
        'CUDA_HOME=%s' % FLAGS.cuda_toolkit_installation_dir,
        'LD_LIBRARY_PATH=%s${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}' %
        posixpath.join(FLAGS.cuda_toolkit_installation_dir, lib_name),
    ])
def GetEnvironmentVars(vm):
  """Return a string containing TensorFlow-related environment variables.

  Args:
    vm: vm to get environment variables from

  Returns:
    string of environment variables
  """
  env_vars = []
  if cuda_toolkit.CheckNvidiaGpuExists(vm):
    output, _ = vm.RemoteCommand('getconf LONG_BIT', should_log=True)
    long_bit = output.strip()
    lib_name = 'lib' if long_bit == '32' else 'lib64'
    env_vars.extend([
        'PATH=%s${PATH:+:${PATH}}' %
        posixpath.join(FLAGS.cuda_toolkit_installation_dir, 'bin'),
        'CUDA_HOME=%s' % FLAGS.cuda_toolkit_installation_dir,
        'LD_LIBRARY_PATH=%s${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}' %
        posixpath.join(FLAGS.cuda_toolkit_installation_dir, lib_name)])
  if FLAGS.aws_s3_region:
    env_vars.append('AWS_REGION={}'.format(FLAGS.aws_s3_region))
  return ' '.join(env_vars)
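The returned string is never exported on the VM; callers simply prepend it to the benchmark command, as the '{env} {cmd}' patterns elsewhere in these examples show. For a 64-bit VM with the toolkit under /usr/local/cuda (an illustrative path, not necessarily the cuda_toolkit_installation_dir default), the prefix behaves roughly like this:

env = ('PATH=/usr/local/cuda/bin${PATH:+:${PATH}} '
       'CUDA_HOME=/usr/local/cuda '
       'LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}')
cmd = 'python tf_cnn_benchmarks.py --model=resnet50'
print('{env} {cmd}'.format(env=env, cmd=cmd))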
def Install(vm):
  """Installs TensorFlow on the VM."""
  has_gpu = cuda_toolkit.CheckNvidiaGpuExists(vm)
  tf_pip_package = (FLAGS.tf_gpu_pip_package if has_gpu else
                    FLAGS.tf_cpu_pip_package)

  if has_gpu:
    vm.Install('cuda_toolkit')
    vm.Install('cudnn')

    # TODO(ferneyhough): Move NCCL installation to its own package.
    # Currently this is dependent on CUDA 9 being installed.
    vm.RemoteCommand('wget %s' % NCCL_URL)
    vm.RemoteCommand('sudo dpkg -i %s' % NCCL_PACKAGE)
    vm.RemoteCommand('sudo apt install libnccl2=2.3.5-2+cuda9.0 '
                     'libnccl-dev=2.3.5-2+cuda9.0')

  vm.Install('pip')
  vm.RemoteCommand('sudo pip install requests')
  vm.RemoteCommand('sudo pip install --upgrade absl-py')
  vm.RemoteCommand('sudo pip install --upgrade %s' % tf_pip_package,
                   should_log=True)
  vm.RemoteCommand(
      'sudo pip install --upgrade %s' % FLAGS.t2t_pip_package, should_log=True)
  vm.InstallPackages('git')
  _, _, retcode = vm.RemoteHostCommandWithReturnCode(
      'test -d benchmarks', ignore_failure=True, suppress_warning=True)
  if retcode != 0:
    vm.RemoteCommand(
        'git clone https://github.com/tensorflow/benchmarks.git',
        should_log=True)
  vm.RemoteCommand(
      'cd benchmarks && git checkout {}'.format(FLAGS.tf_cnn_benchmarks_branch)
  )
  if FLAGS.cloud == 'AWS' and FLAGS.tf_data_dir and (
      not FLAGS.tf_use_local_data):
    vm.Install('aws_credentials')
def Run(benchmark_spec):
    """Run MLPerf on the cluster.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
      required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
    _UpdateBenchmarkSpecWithFlags(benchmark_spec)
    vm = benchmark_spec.vms[0]
    if benchmark_spec.tpus:
        # For MLPerf v0.6, the benchmark code differs for each hardware type.
        if (benchmark_spec.tpu_groups['train'].GetAcceleratorType() in
                ('v3-32', 'v3-128', 'v3-256', 'v3-512', 'v3-1024', 'v3-2048')):
            run_path = (
                '$HOME/training_results_v0.6/Google/benchmarks/{model}/tpu-{tpus}'
                .format(model=benchmark_spec.benchmark,
                        tpus=benchmark_spec.tpu_groups['train'].
                        GetAcceleratorType()))
            code_path = (
                '$HOME/training_results_v0.6/Google/benchmarks/{model}/implementations/tpu-{tpus}-{model}'
                .format(model=benchmark_spec.benchmark,
                        tpus=benchmark_spec.tpu_groups['train'].
                        GetAcceleratorType()))

            if 'mask' in benchmark_spec.benchmark:
                model = 'mask_rcnn'
            elif 'gnmt' in benchmark_spec.benchmark:
                model = 'nmt'
            else:
                model = benchmark_spec.benchmark

            mlperf_benchmark_cmd = (
                'cd {code_path} && '
                'export PYTHONPATH=$(pwd):$(pwd)/{model} && '
                'cd {model} && '
                '{run_path}/run_and_time1.sh'.format(code_path=code_path,
                                                     model=model,
                                                     run_path=run_path))

            if 'ssd' in benchmark_spec.benchmark:
                mlperf_benchmark_cmd = (
                    'export '
                    'MLP_GCS_RESNET_CHECKPOINT=gs://download.tensorflow.org/models/mlperf/v0.5.0/resnet34_ssd_checkpoint'
                    ' && {cmd}'.format(cmd=mlperf_benchmark_cmd))

        else:
            raise ValueError(
                'MLPerf configurations do not support the hardware in PKB. PKB may '
                'need to be updated if this is a new TPU type.')

    else:
        if 'resnet' in benchmark_spec.benchmark:
            mlperf_benchmark_cmd = (
                'cd '
                'training_results_v0.6/NVIDIA/benchmarks/resnet/implementations/mxnet'
                ' && sed \'s/SYSLOGGING=1/SYSLOGGING=0/g\' ./run.sub > ./run1.sub &&'
                ' chmod 755 ./run1.sub && sudo DATADIR=/data/imagenet '
                'LOGDIR=/tmp/resnet PULL=0 DGXSYSTEM=DGX1 NEXP=1 ./run1.sub ')

        if 'transformer' in benchmark_spec.benchmark:
            mlperf_benchmark_cmd = (
                'cd '
                'training_results_v0.6/NVIDIA/benchmarks/transformer/implementations/pytorch'
                ' && sed \'s/SYSLOGGING=1/SYSLOGGING=0/g\' ./run.sub > ./run1.sub &&'
                ' chmod 755 ./run1.sub && sudo DATADIR=/data/wmt/utf8 '
                'LOGDIR=/tmp/transformer PULL=0 DGXSYSTEM=DGX1 NEXP=1 ./run1.sub '
            )

        if 'minigo' in benchmark_spec.benchmark:
            mlperf_benchmark_cmd = (
                'cd '
                '$HOME/training_results_v0.6/NVIDIA/benchmarks/minigo/implementations/tensorflow'
                ' && sed \'s/SYSLOGGING=1/SYSLOGGING=0/g\' ./run.sub > run1.sub && '
                'chmod 755 ./run1.sub && sudo LOGDIR=/tmp/minigo '
                'CONT=mlperf-nvidia:minigo DGXSYSTEM=DGX1 NEXP=1 ./run1.sub ')

        if 'mask' in benchmark_spec.benchmark:
            mlperf_benchmark_cmd = (
                'cd '
                '$HOME/training_results_v0.6/NVIDIA/benchmarks/maskrcnn/implementations/pytorch'
                ' && sed "s/SYSLOGGING=1/SYSLOGGING=0/g" ./run.sub > ./run1.sub && '
                'chmod 755 ./run1.sub && sudo LOGDIR=/tmp/mask DATADIR=/data PULL=0 '
                'DGXSYSTEM=DGX1 NEXP=1 ./run1.sub ')

        if 'gnmt' in benchmark_spec.benchmark:
            mlperf_benchmark_cmd = (
                'cd '
                '$HOME/training_results_v0.6/NVIDIA/benchmarks/gnmt/implementations/pytorch'
                ' && sed "s/SYSLOGGING=1/SYSLOGGING=0/g" ./run.sub > ./run1.sub && '
                'chmod 755 ./run1.sub && sudo LOGDIR=/tmp/gnmt DATADIR=/data/gnmt '
                'PULL=0 DGXSYSTEM=DGX1 NEXP=1 ./run1.sub ')

        if 'ssd' in benchmark_spec.benchmark:
            mlperf_benchmark_cmd = (
                'cd '
                '$HOME/training_results_v0.6/NVIDIA/benchmarks/ssd/implementations/pytorch'
                ' && sed "s/SYSLOGGING=1/SYSLOGGING=0/g" ./run.sub > ./run1.sub && '
                'chmod 755 ./run1.sub && sudo LOGDIR=/tmp/ssd DATADIR=/data PULL=0 '
                'DGXSYSTEM=DGX1 NEXP=1 ./run1.sub ')

    if cuda_toolkit.CheckNvidiaGpuExists(vm):
        mlperf_benchmark_cmd = '{env} {cmd}'.format(
            env=tensorflow.GetEnvironmentVars(vm), cmd=mlperf_benchmark_cmd)

    samples = []
    metadata = _CreateMetadataDict(benchmark_spec)
    stdout, _ = vm.RobustRemoteCommand(mlperf_benchmark_cmd, should_log=True)
    samples.extend(
        MakeSamplesFromOutput(metadata,
                              stdout,
                              use_tpu=bool(benchmark_spec.tpus),
                              model=benchmark_spec.benchmark))
    return samples
def Prepare(benchmark_spec, vm=None):
    """Install and set up MLPerf on the target vm.

  Args:
    benchmark_spec: The benchmark specification
    vm: The VM to work on

  Raises:
    errors.Config.InvalidValue: if both GPUs and TPUs are present in the config.
  """
    _UpdateBenchmarkSpecWithFlags(benchmark_spec)
    if vm is None:
        vm = benchmark_spec.vms[0]

    if (bool(benchmark_spec.tpus) and cuda_toolkit.CheckNvidiaGpuExists(vm)):
        raise errors.Config.InvalidValue(
            'Invalid configuration. GPUs and TPUs cannot both be present in the config.'
        )

    vm.RemoteCommand(
        'if [ ! -d "$HOME/training_results_v0.6" ]; then '
        '  git clone https://github.com/mlperf/training_results_v0.6.git ; '
        'fi',
        should_log=True)
    vm.InstallPackages('python3-pip')

    if benchmark_spec.tpus:
        if vm == benchmark_spec.vms[0]:
            storage_service = gcs.GoogleCloudStorageService()
            benchmark_spec.storage_service = storage_service
            bucket = 'pkb{}'.format(FLAGS.run_uri)
            benchmark_spec.bucket = bucket
            benchmark_spec.model_dir = 'gs://{}'.format(bucket)
            location = benchmark_spec.tpu_groups['train'].GetZone()
            storage_service.PrepareService(util.GetRegionFromZone(location))
            storage_service.MakeBucket(bucket)
            storage_service.ChmodBucket(benchmark_spec.gcp_service_account,
                                        'W', bucket)

        # For MLPerf v0.6, the benchmark code differs for each hardware type.
        if (benchmark_spec.tpu_groups['train'].GetAcceleratorType() in
                ('v3-32', 'v3-128', 'v3-256', 'v3-512', 'v3-1024', 'v3-2048')):
            run_path = (
                '$HOME/training_results_v0.6/Google/benchmarks/{model}/tpu-{tpus}'
                .format(model=benchmark_spec.benchmark,
                        tpus=benchmark_spec.tpu_groups['train'].
                        GetAcceleratorType()))
        else:
            raise ValueError(
                'MLPerf configurations do not support the hardware in PKB. PKB may '
                'need to be updated if this is a new TPU type.')

        if 'mask' in benchmark_spec.benchmark:
            model = 'mask_rcnn'
        elif 'gnmt' in benchmark_spec.benchmark:
            model = 'nmt'
        else:
            model = benchmark_spec.benchmark

        code_path = (
            '$HOME/training_results_v0.6/Google/benchmarks/{model}/implementations/tpu-{tpus}-{model}'
            .format(
                model=benchmark_spec.benchmark,
                tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType()))

        vm.RemoteCommand('pip3 install --upgrade pyyaml==3.13 ')
        vm.RemoteCommand('pip3 install cloud-tpu-profiler==1.12')
        if ('mask' in benchmark_spec.benchmark
                or 'ssd' in benchmark_spec.benchmark):
            # TODO(b/141876878): coco whl package for python 3.5
            vm.RemoteCommand(
                'cd /tmp && '
                'wget https://storage.cloud.google.com/mlperf_artifcats/v0.6_training/coco-1.1-cp36-cp36m-linux_x86_64.whl'
            )

            vm.RemoteCommand('cd {path} && '
                             'sed "s/--progress-bar off/ /g" ./setup.sh | '
                             'sed "s/pip /pip3 /g" > ./setup1.sh && '
                             'chmod 755 ./setup1.sh && '
                             './setup1.sh'.format(path=run_path))
        else:
            vm.RemoteCommand(
                'cd {path} && '
                'sed "s/--progress-bar off/ /g" ./setup.sh > ./setup1.sh && '
                'chmod 755 ./setup1.sh && '
                './setup1.sh'.format(path=run_path))

        if 'mask' not in benchmark_spec.benchmark:
            vm.RemoteCommand(
                'pip3 uninstall -y tf-estimator-nightly && '
                'pip3 install tf-estimator-nightly==1.14.0.dev2019051801')

        vm.RemoteCommand(
            r'cd {path} && '
            r'sed "s/--model_dir=.*/--model_dir=gs:\/\/{bucket} \\\/g" run_and_time.sh | '
            r'sed "s/--tpu=.*/--tpu={tpu} \\\/g" | '
            r'sed "s/--output_dir=.*/--output_dir=gs:\/\/{bucket} \\\/g" | '
            r'sed "s/--cloud_tpu_name=.*/--cloud_tpu_name={tpu} \\\/g" | '
            r'sed "s/--out_dir=.*/--out_dir=gs:\/\/{bucket} \\\/g" | '
            r'sed "s/--tpu_name=.*/--tpu_name={tpu} \\\/g" > run_and_time1.sh && '
            r'chmod 755 run_and_time1.sh '.format(
                path=run_path,
                bucket=bucket,
                tpu=benchmark_spec.tpu_groups['train'].GetName()))

        if 'gnmt' in benchmark_spec.benchmark:
            vm.RemoteCommand(
                'cd {code_path}/{model} && '
                'cp metric.py metric0.py && '
                'sed "s/ sacrebleu -t/ python3 -m sacrebleu -t/g" metric0.py > metric.py'
                .format(code_path=code_path, model=model))

    else:
        benchmark_spec.model_dir = '/tmp'

        has_gpu = cuda_toolkit.CheckNvidiaGpuExists(vm)
        if has_gpu:
            vm.Install('cuda_toolkit')

        vm.Install('nvidia_docker')
        vm.RemoteCommand(
            'if [ ! -d "/data" ]; then sudo ln -s /scratch /data; fi')

        if 'resnet' in benchmark_spec.benchmark:
            vm.RemoteCommand(
                'cd training_results_v0.6/NVIDIA/benchmarks/resnet/implementations/mxnet &&'
                ' sudo docker build --pull --network=host . -t mlperf-nvidia:image_classification',
                should_log=True)
            _DownloadData(benchmark_spec.imagenet_data_dir,
                          posixpath.join('/data', 'imagenet'), vm)

        if 'transformer' in benchmark_spec.benchmark:
            vm.RemoteCommand(
                'cd training_results_v0.6/NVIDIA/benchmarks/transformer/implementations/pytorch &&'
                ' sudo docker build --pull --network=host . -t mlperf-nvidia:translation',
                should_log=True)
            _DownloadData(benchmark_spec.wmt_data_dir,
                          posixpath.join('/data', 'wmt'), vm)

        if 'minigo' in benchmark_spec.benchmark:
            vm.RemoteCommand(
                'cd training_results_v0.6/NVIDIA/benchmarks/minigo/implementations/tensorflow && '
                'sudo docker build --pull --network=host -t mlperf-nvidia:minigo .',
                should_log=True)

        if 'mask' in benchmark_spec.benchmark:
            vm.RemoteCommand(
                'cd training_results_v0.6/NVIDIA/benchmarks/maskrcnn/implementations/pytorch && '
                'sudo docker build --pull --network=host -t mlperf-nvidia:object_detection . ',
                should_log=True)
            _DownloadData(benchmark_spec.coco2017_data_dir,
                          posixpath.join('/data', 'coco2017'), vm)

        if 'gnmt' in benchmark_spec.benchmark:
            vm.RemoteCommand(
                'cd training_results_v0.6/NVIDIA/benchmarks/gnmt/implementations/pytorch && '
                'sudo docker build --pull --network=host -t mlperf-nvidia:rnn_translator . ',
                should_log=True)
            _DownloadData(benchmark_spec.gnmt_data_dir,
                          posixpath.join('/data', 'gnmt'), vm)

        if 'ssd' in benchmark_spec.benchmark:
            vm.RemoteCommand(
                'cd training_results_v0.6/NVIDIA/benchmarks/ssd/implementations/pytorch && '
                'sudo docker build --pull --network=host -t mlperf-nvidia:single_stage_detector . ',
                should_log=True)
            _DownloadData(benchmark_spec.coco2017_data_dir,
                          posixpath.join('/data', 'coco2017'), vm)
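The sed pipeline in the TPU branch above rewrites run_and_time.sh so that each relevant flag points at the PKB-created bucket and TPU while preserving the trailing line-continuation backslash. A rough Python equivalent for a single flag (the bucket name is made up) is:

import re

line = '--model_dir=gs://some-default-bucket \\'
bucket = 'pkb-example-bucket'
rewritten = re.sub(r'--model_dir=.*', '--model_dir=gs://%s \\\\' % bucket, line)
print(rewritten)  # --model_dir=gs://pkb-example-bucket \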
Example #17
def _RunModelOnVm(vm, model, benchmark_spec, args='', job_name=''):
  """Runs a TensorFlow benchmark on a single VM.

  Args:
    vm: VM to run on
    model: string, the name of model to run
    benchmark_spec: BenchmarkSpec object
    args: string, distributed arguments
    job_name: string, distributed job name

  Returns:
    a Sample containing the TensorFlow throughput, or the process identification
    number of the TensorFlow parameter server.
  """
  tf_cnn_benchmark_dir = 'benchmarks/scripts/tf_cnn_benchmarks'
  batch_size = _GetBatchSize(model)
  benchmark_spec.local_parameter_device = FLAGS.tf_local_parameter_device
  benchmark_spec.device = FLAGS.tf_device
  benchmark_spec.data_format = FLAGS.tf_data_format
  if not cuda_toolkit.CheckNvidiaGpuExists(vm):
    benchmark_spec.local_parameter_device = CPU
    benchmark_spec.device = CPU
    benchmark_spec.data_format = NHWC
  tf_cnn_benchmark_cmd = (
      'python tf_cnn_benchmarks.py '
      '--local_parameter_device={local_parameter_device} '
      '--batch_size={batch_size} '
      '--model={model} '
      '--data_name={data_name} '
      '--variable_update={variable_update} '
      '--distortions={distortions} '
      '--device={device} '
      '--data_format={data_format} '
      '--forward_only={forward_only}'.format(
          local_parameter_device=benchmark_spec.local_parameter_device,
          batch_size=batch_size,
          model=model,
          data_name=benchmark_spec.data_name,
          variable_update=benchmark_spec.variable_update,
          distortions=benchmark_spec.distortions,
          device=benchmark_spec.device,
          data_format=benchmark_spec.data_format,
          forward_only=benchmark_spec.forward_only))
  if benchmark_spec.device == GPU:
    num_gpus = cuda_toolkit.QueryNumberOfGpus(vm)
    tf_cnn_benchmark_cmd = '{env} {cmd} --num_gpus={gpus}'.format(
        env=tensorflow.GetEnvironmentVars(vm),
        cmd=tf_cnn_benchmark_cmd,
        gpus=num_gpus)
  else:
    num_gpus = 0
  if args:
    tf_cnn_benchmark_cmd = '{cmd} --job_name={job} {args}'.format(
        cmd=tf_cnn_benchmark_cmd, job=job_name, args=args)
  run_command = 'cd {path} ; {cmd}'.format(path=tf_cnn_benchmark_dir,
                                           cmd=tf_cnn_benchmark_cmd)
  output, _ = vm.RobustRemoteCommand(run_command, should_log=True)
  if job_name == 'ps':
    return _ExtractTfParameterServerPid(output)
  else:
    return _MakeSamplesFromOutput(benchmark_spec, output, model, batch_size,
                                  num_gpus)
def _GetTfCnnBenchmarkCommand(vm,
                              model,
                              batch_size,
                              benchmark_spec,
                              args='',
                              job_name=''):
    """Create the command used to run the tf_cnn_benchmarks script.

  The command is either formulated using flag values stored on the
  benchmark_spec, or is essentially provided outright through the
  benchmark_args flag.

  Args:
    vm: the VM to run on.
    model: name of the model to run.
    batch_size: batch size to use for training.
    benchmark_spec: the benchmark spec object.
    args: string, distributed arguments
    job_name: string, distributed job name

  Returns:
    A string that runs the tf_cnn_benchmarks.py script
    with the desired arguments.
  """
    num_gpus = (cuda_toolkit.QueryNumberOfGpus(vm)
                if cuda_toolkit.CheckNvidiaGpuExists(vm) else 0)

    if benchmark_spec.benchmark_args is not None:
        cmd = 'python tf_cnn_benchmarks.py ' + benchmark_spec.benchmark_args
        # If the user didn't specify num_gpus in the benchmark_args string,
        # use all the GPUs on the system.
        if '--num_gpus' not in benchmark_spec.benchmark_args and num_gpus:
            cmd = '{cmd} --num_gpus={num_gpus}'.format(cmd=cmd,
                                                       num_gpus=num_gpus)
        return cmd

    benchmark_spec.local_parameter_device = FLAGS.tf_local_parameter_device
    benchmark_spec.device = FLAGS.tf_device
    benchmark_spec.data_format = FLAGS.tf_data_format
    if num_gpus == 0:
        benchmark_spec.local_parameter_device = CPU
        benchmark_spec.device = CPU
        benchmark_spec.data_format = NHWC

    cmd = ('python tf_cnn_benchmarks.py '
           '--local_parameter_device={local_parameter_device} '
           '--batch_size={batch_size} '
           '--model={model} '
           '--data_name={data_name} '
           '--variable_update={variable_update} '
           '--distortions={distortions} '
           '--device={device} '
           '--data_format={data_format} '
           '--forward_only={forward_only} '
           '--use_fp16={use_fp16}'.format(
               local_parameter_device=benchmark_spec.local_parameter_device,
               batch_size=batch_size,
               model=model,
               data_name=benchmark_spec.data_name,
               variable_update=benchmark_spec.variable_update,
               distortions=benchmark_spec.distortions,
               device=benchmark_spec.device,
               data_format=benchmark_spec.data_format,
               forward_only=benchmark_spec.forward_only,
               use_fp16=(benchmark_spec.precision == FP16)))
    if benchmark_spec.device == GPU:
        cmd = '{env} {cmd} --num_gpus={gpus}'.format(
            env=tensorflow.GetEnvironmentVars(vm), cmd=cmd, gpus=num_gpus)
    if args:
        cmd = '{cmd} --job_name={job} {args}'.format(cmd=cmd,
                                                     job=job_name,
                                                     args=args)
    return cmd
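When the benchmark_args flag supplies the whole command line, the function above only appends --num_gpus if the caller did not set it. A standalone illustration of that branch (the argument values are made up):

benchmark_args = '--model=vgg16 --batch_size=128'
num_gpus = 8
cmd = 'python tf_cnn_benchmarks.py ' + benchmark_args
if '--num_gpus' not in benchmark_args and num_gpus:
    cmd = '{cmd} --num_gpus={num_gpus}'.format(cmd=cmd, num_gpus=num_gpus)
print(cmd)  # python tf_cnn_benchmarks.py --model=vgg16 --batch_size=128 --num_gpus=8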
def Run(benchmark_spec):
    """Run ResNet on the cluster.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
    _UpdateBenchmarkSpecWithFlags(benchmark_spec)
    vm = benchmark_spec.vms[0]
    if benchmark_spec.tpus:
        resnet_benchmark_script = 'resnet_main.py'
        resnet_benchmark_cmd = (
            '{env_cmd} && '
            'cd tpu/models && '
            'export PYTHONPATH=$(pwd) && '
            'cd official/resnet && '
            'python {script} '
            '--use_tpu={use_tpu} '
            '--data_dir={data_dir} '
            '--model_dir={model_dir} '
            '--resnet_depth={depth} '
            '--train_batch_size={train_batch_size} '
            '--eval_batch_size={eval_batch_size} '
            '--iterations_per_loop={iterations} '
            '--data_format={data_format} '
            '--precision={precision} '
            '--skip_host_call={skip_host_call} '
            '--num_train_images={num_train_images} '
            '--num_eval_images={num_eval_images}'.format(
                env_cmd=benchmark_spec.env_cmd,
                script=resnet_benchmark_script,
                use_tpu=bool(benchmark_spec.tpus),
                data_dir=benchmark_spec.data_dir,
                model_dir=benchmark_spec.model_dir,
                depth=benchmark_spec.depth,
                train_batch_size=benchmark_spec.train_batch_size,
                eval_batch_size=benchmark_spec.eval_batch_size,
                iterations=benchmark_spec.iterations,
                data_format=benchmark_spec.data_format,
                precision=benchmark_spec.precision,
                skip_host_call=benchmark_spec.skip_host_call,
                num_train_images=benchmark_spec.num_train_images,
                num_eval_images=benchmark_spec.num_eval_images))
    else:
        resnet_benchmark_script = 'imagenet_main.py'
        resnet_benchmark_cmd = ('{env_cmd} && '
                                'cd models && '
                                'export PYTHONPATH=$(pwd) && '
                                'cd official/r1/resnet && '
                                'python {script} '
                                '--data_dir=/data/imagenet '
                                '--model_dir={model_dir} '
                                '--resnet_size={resnet_size} '
                                '--batch_size={batch_size} '
                                '--data_format={data_format} '.format(
                                    env_cmd=benchmark_spec.env_cmd,
                                    script=resnet_benchmark_script,
                                    model_dir=benchmark_spec.model_dir,
                                    resnet_size=benchmark_spec.depth,
                                    batch_size=benchmark_spec.train_batch_size,
                                    data_format=benchmark_spec.data_format))
        precision = '{precision}'.format(precision=benchmark_spec.precision)
        if precision == 'bfloat16':
            resnet_benchmark_cmd = '{cmd} --dtype=fp16'.format(
                cmd=resnet_benchmark_cmd)
        else:
            resnet_benchmark_cmd = '{cmd} --dtype=fp32'.format(
                cmd=resnet_benchmark_cmd)

        if cuda_toolkit.CheckNvidiaGpuExists(vm):
            resnet_benchmark_cmd = '{env} {cmd} --num_gpus={num_gpus}'.format(
                env=tensorflow.GetEnvironmentVars(vm),
                cmd=resnet_benchmark_cmd,
                num_gpus=cuda_toolkit.QueryNumberOfGpus(vm))

    samples = []
    metadata = _CreateMetadataDict(benchmark_spec)
    elapsed_seconds = 0
    steps_per_eval = benchmark_spec.steps_per_eval
    train_steps = benchmark_spec.train_steps
    for step in range(steps_per_eval, train_steps + steps_per_eval,
                      steps_per_eval):
        step = min(step, train_steps)
        resnet_benchmark_cmd_step = '{cmd} --train_steps={step}'.format(
            cmd=resnet_benchmark_cmd, step=step)

        if benchmark_spec.mode in ('train', 'train_and_eval'):
            if benchmark_spec.tpus:
                tpu = benchmark_spec.tpu_groups['train'].GetName()
                num_cores = '--num_cores={}'.format(
                    benchmark_spec.tpu_groups['train'].GetNumShards())
                resnet_benchmark_train_cmd = (
                    '{cmd} --tpu={tpu} --mode=train {num_cores}'.format(
                        cmd=resnet_benchmark_cmd_step,
                        tpu=tpu,
                        num_cores=num_cores))
            else:
                resnet_benchmark_train_cmd = (
                    '{cmd} --max_train_steps={max_train_steps} '
                    '--train_epochs={train_epochs} --noeval_only'.format(
                        cmd=resnet_benchmark_cmd,
                        train_epochs=benchmark_spec.epochs_per_eval,
                        max_train_steps=step))

            start = time.time()
            stdout, stderr = vm.RobustRemoteCommand(resnet_benchmark_train_cmd,
                                                    should_log=True)
            elapsed_seconds += (time.time() - start)
            samples.extend(
                mnist_benchmark.MakeSamplesFromTrainOutput(
                    metadata, stdout + stderr, elapsed_seconds, step))

        if benchmark_spec.mode in ('train_and_eval', 'eval'):
            if benchmark_spec.tpus:
                tpu = benchmark_spec.tpu_groups['eval'].GetName()
                num_cores = '--num_cores={}'.format(
                    benchmark_spec.tpu_groups['eval'].GetNumShards())
                resnet_benchmark_eval_cmd = (
                    '{cmd} --tpu={tpu} --mode=eval {num_cores}'.format(
                        cmd=resnet_benchmark_cmd_step,
                        tpu=tpu,
                        num_cores=num_cores))
            else:
                resnet_benchmark_eval_cmd = ('{cmd} --eval_only'.format(
                    cmd=resnet_benchmark_cmd))

            stdout, stderr = vm.RobustRemoteCommand(resnet_benchmark_eval_cmd,
                                                    should_log=True)
            samples.extend(
                MakeSamplesFromEvalOutput(metadata,
                                          stdout + stderr,
                                          elapsed_seconds,
                                          use_tpu=bool(benchmark_spec.tpus)))
    return samples
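The loop at the end of this Run function trains in increments of steps_per_eval, evaluates after each increment, and clamps the final increment to train_steps. A quick illustration of the step schedule it produces (the numbers are arbitrary):

steps_per_eval, train_steps = 2000, 5000
schedule = [min(step, train_steps)
            for step in range(steps_per_eval, train_steps + steps_per_eval,
                              steps_per_eval)]
print(schedule)  # [2000, 4000, 5000]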