def Install(vm):
  """Installs TensorFlow on the VM.

  Installs CUDA/cuDNN/NCCL first when a GPU is present, then the TensorFlow
  and tensor2tensor pip packages, and finally clones the tf_cnn_benchmarks
  repository at the configured branch.

  Args:
    vm: virtual machine on which to install TensorFlow.
  """
  has_gpu = cuda_toolkit.CheckNvidiaGpuExists(vm)
  # Pick the GPU or CPU pip package depending on the detected hardware.
  tf_pip_package = (FLAGS.tf_gpu_pip_package if has_gpu
                    else FLAGS.tf_cpu_pip_package)
  if has_gpu:
    vm.Install('cuda_toolkit')
    vm.Install('cudnn')
    # TODO(ferneyhough): Move NCCL installation to its own package.
    # Currently this is dependent on CUDA 9 being installed.
    vm.RemoteCommand('wget %s' % NCCL_URL)
    vm.RemoteCommand('sudo dpkg -i %s' % NCCL_PACKAGE)
    vm.RemoteCommand('sudo apt install libnccl2=2.3.5-2+cuda9.0 '
                     'libnccl-dev=2.3.5-2+cuda9.0')
  vm.Install('pip')
  vm.RemoteCommand('sudo pip install requests')
  vm.RemoteCommand('sudo pip install --upgrade absl-py')
  vm.RemoteCommand('sudo pip install --upgrade %s' % tf_pip_package,
                   should_log=True)
  vm.RemoteCommand('sudo pip install --upgrade %s' % FLAGS.t2t_pip_package,
                   should_log=True)
  vm.InstallPackages('git')
  # Fetch the benchmark harness at the configured branch.
  vm.RemoteCommand('git clone https://github.com/tensorflow/benchmarks.git',
                   should_log=True)
  vm.RemoteCommand('cd benchmarks && git checkout {}'.format(
      FLAGS.tf_cnn_benchmarks_branch))
def Run(benchmark_spec):
  """Run MLPerf on the cluster.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vm = benchmark_spec.vms[0]
  if benchmark_spec.tpus:
    # For MLPerf v0.5, the benchmake code of different hardware are different.
    # Pick the submission code directory matching the TPU type/size.
    if benchmark_spec.tpu_groups['train'].GetNumShards() > 8:
      code_path = 'cloud_v2.512/resnet-tpuv2-512/code/resnet/model'
    elif benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v2-8':
      code_path = 'cloud_v2.8/resnet-tpuv2-8/code/resnet/model'
    elif benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-8':
      code_path = 'cloud_v3.8/resnet-tpuv3-8/code/resnet/model'
    else:
      raise ValueError(
          'MLPerf configurations do not support the hardware in PKB. PKB may '
          'need to be updated if this is a new TPU type.')
    cmd = 'bash run_helper.sh 2>&1 | tee output.txt'
  else:
    # GPU path: build and run the 8xV100 reference inside nvidia-docker.
    code_path = 'cloud_v100x8/code/resnet'
    cmd = ('sudo nvidia-docker build . -t foo && '
           'sudo nvidia-docker run -v $MLP_HOST_DATA_DIR:/data -v '
           '$MLP_HOST_OUTPUT_DIR:/output -v /proc:/host_proc -t '
           'foo:latest run_helper_8xV100.sh 2>&1 | tee output.txt')
  # Export the environment variables the MLPerf run scripts expect, then run
  # the hardware-specific helper from inside the submission code directory.
  mlperf_benchmark_cmd = (
      'export MLP_GCS_MODEL_DIR={model_dir} && '
      'export MLP_PATH_GCS_IMAGENET={data_dir} && '
      'export MLP_TPU_NAME={tpu_train} && '
      'export MLP_PATH_GCS_EUW_IMAGENET={data_dir} && '
      'export MLP_GCS_EUW_MODEL_DIR={model_dir} && '
      'export MLP_TPU_SIDECAR_NAME={tpu_eval} && '
      'export MLP_HOST_DATA_DIR=/data && '
      'export MLP_HOST_OUTPUT_DIR=`pwd`/output && '
      'export PYTHONPATH=$PYTHONPATH:$PWD/tpu/models && '
      'cd results/v0.5.0/google/{code_path} && '
      'sed -i "s/python /python3 /g" run_helper*.sh && '
      'mkdir -p $MLP_HOST_OUTPUT_DIR && '
      '{cmd}'.format(model_dir=benchmark_spec.model_dir,
                     data_dir=benchmark_spec.data_dir,
                     tpu_train=(benchmark_spec.tpu_groups['train'].GetName()
                                if benchmark_spec.tpus else ''),
                     tpu_eval=(benchmark_spec.tpu_groups['eval'].GetName()
                               if benchmark_spec.tpus else ''),
                     code_path=code_path,
                     cmd=cmd))
  if cuda_toolkit.CheckNvidiaGpuExists(vm):
    # Prefix CUDA-related environment variables when a GPU is present.
    mlperf_benchmark_cmd = '{env} {cmd}'.format(
        env=tensorflow.GetEnvironmentVars(vm), cmd=mlperf_benchmark_cmd)
  samples = []
  metadata = _CreateMetadataDict(benchmark_spec)
  stdout, _ = vm.RobustRemoteCommand(mlperf_benchmark_cmd, should_log=True)
  samples.extend(MakeSamplesFromOutput(metadata, stdout))
  return samples
def Run(benchmark_spec):
  """Run MNIST on the cluster.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vm = benchmark_spec.vms[0]
  benchmark_dir = 'tpu-demos/cloud_tpu/models/mnist'
  # Assemble the mnist.py invocation from the (flag-updated) spec.
  cmd = ' '.join([
      'python mnist.py',
      '--master={}'.format(benchmark_spec.master),
      '--train_file={}'.format(benchmark_spec.train_file),
      '--use_tpu={}'.format(benchmark_spec.use_tpu),
      '--train_steps={}'.format(benchmark_spec.train_steps),
  ])
  if benchmark_spec.model_dir:
    cmd += ' --model_dir {}'.format(benchmark_spec.model_dir)
  if cuda_toolkit.CheckNvidiaGpuExists(vm):
    # Prepend CUDA environment variables for GPU runs.
    cmd = tensorflow.GetEnvironmentVars(vm) + ' ' + cmd
  full_command = 'cd {} && {}'.format(benchmark_dir, cmd)
  stdout, stderr = vm.RobustRemoteCommand(full_command, should_log=True)
  return _MakeSamplesFromOutput(benchmark_spec, stdout + stderr)
def Run(benchmark_spec):
  """Run MNIST on the cluster.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vm = benchmark_spec.vms[0]
  script = 'tpu/cloud_tpu/models/mnist/mnist.py'
  # Command-line flags, in order, sourced from the benchmark spec.
  flag_values = [
      ('master', benchmark_spec.master),
      ('train_file', benchmark_spec.train_file),
      ('use_tpu', benchmark_spec.use_tpu),
      ('train_steps', benchmark_spec.train_steps),
      ('iterations', benchmark_spec.iterations),
      ('model_dir', benchmark_spec.model_dir),
  ]
  cmd = 'python {} {}'.format(
      script, ' '.join('--{}={}'.format(k, v) for k, v in flag_values))
  if cuda_toolkit.CheckNvidiaGpuExists(vm):
    # Prepend CUDA environment variables for GPU runs.
    cmd = '{} {}'.format(tensorflow.GetEnvironmentVars(vm), cmd)
  stdout, stderr = vm.RobustRemoteCommand(cmd, should_log=True)
  return MakeSamplesFromOutput(_CreateMetadataDict(benchmark_spec),
                               stdout + stderr)
def Prepare(benchmark_spec):
  """Install and set up ResNet on the target vm.

  Args:
    benchmark_spec: The benchmark specification

  Raises:
    errors.Config.InvalidValue upon both GPUs and TPUs appear in the config
  """
  vm = benchmark_spec.vms[0]
  # A run may use TPUs or GPUs, never both.
  if (bool(benchmark_spec.tpus) and cuda_toolkit.CheckNvidiaGpuExists(vm)):
    raise errors.Config.InvalidValue(
        'Invalid configuration. GPUs and TPUs can not both present in the config.'
    )
  mnist_benchmark.Prepare(benchmark_spec)
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vm.Install('pyyaml')
  # To correctly install the requests lib, otherwise the experiment won't run
  vm.RemoteCommand('sudo pip uninstall -y requests')
  vm.RemoteCommand('sudo pip install requests')
  if not benchmark_spec.tpus:
    # GPU/CPU runs read ImageNet from local disk; copy it down from GCS.
    local_data_path = posixpath.join('/data', 'imagenet')
    vm.RemoteCommand('sudo mkdir -p {data_path} && '
                     'sudo chmod a+w {data_path} && '
                     'gsutil -m cp -r {data_dir}/* {data_path}'.format(
                         data_dir=benchmark_spec.data_dir,
                         data_path=local_data_path))
def _CreateMetadataDict(benchmark_spec, model, batch_size, num_gpus):
  """Create metadata dict to be used in run results.

  Args:
    benchmark_spec: benchmark spec
    model: model which was run
    batch_size: batch sized used
    num_gpus: number of GPUs used

  Returns:
    metadata dict
  """
  spec = benchmark_spec
  vm = spec.vms[0]
  metadata = {}
  # GPU details (driver/toolkit info) only apply when a GPU is present.
  if cuda_toolkit.CheckNvidiaGpuExists(vm):
    metadata.update(cuda_toolkit.GetMetadata(vm))
  metadata.update({
      'num_gpus': num_gpus,
      'model': model,
      'batch_size': batch_size,
      'forward_only': spec.forward_only,
      'data_name': spec.data_name,
      'variable_update': spec.variable_update,
      'local_parameter_device': spec.local_parameter_device,
      'device': spec.device,
      'data_format': spec.data_format,
      'distortions': spec.distortions,
      'benchmarks_commit_hash': spec.benchmarks_commit_hash,
      'tensorflow_version': spec.tensorflow_version,
      'tensorflow_cpu_pip_package': spec.tensorflow_cpu_pip_package,
      'tensorflow_gpu_pip_package': spec.tensorflow_gpu_pip_package,
      'distributed': spec.distributed,
  })
  return metadata
def Run(benchmark_spec):
  """Run MNIST on the cluster.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vm = benchmark_spec.vms[0]
  mnist_benchmark_script = 'mnist_tpu.py'
  # Base command shared by the training and eval phases below.
  mnist_benchmark_cmd = (
      'cd models/official/mnist && '
      'python {script} '
      '--data_dir={data_dir} '
      '--iterations={iterations} '
      '--model_dir={model_dir} '
      '--batch_size={batch_size}'.format(
          script=mnist_benchmark_script,
          data_dir=benchmark_spec.data_dir,
          iterations=benchmark_spec.iterations,
          model_dir=benchmark_spec.model_dir,
          batch_size=benchmark_spec.batch_size))
  if cuda_toolkit.CheckNvidiaGpuExists(vm):
    # Prepend CUDA environment variables for GPU runs.
    mnist_benchmark_cmd = '{env} {cmd}'.format(
        env=tensorflow.GetEnvironmentVars(vm), cmd=mnist_benchmark_cmd)
  samples = []
  metadata = CreateMetadataDict(benchmark_spec)
  # Bug fix: previously elapsed_seconds was only bound inside the training
  # branch, raising NameError when eval_steps was set without train_steps.
  elapsed_seconds = 0
  if benchmark_spec.train_steps:
    if benchmark_spec.tpus:
      tpu = benchmark_spec.tpu_groups['train'].GetName()
      num_shards = '--num_shards={}'.format(
          benchmark_spec.tpu_groups['train'].GetNumShards())
    else:
      tpu = num_shards = ''
    mnist_benchmark_train_cmd = (
        '{cmd} --tpu={tpu} --use_tpu={use_tpu} --train_steps={train_steps} '
        '{num_shards} --noenable_predict'.format(
            cmd=mnist_benchmark_cmd,
            tpu=tpu,
            use_tpu=bool(benchmark_spec.tpus),
            train_steps=benchmark_spec.train_steps,
            num_shards=num_shards))
    start = time.time()
    stdout, stderr = vm.RobustRemoteCommand(mnist_benchmark_train_cmd,
                                            should_log=True)
    elapsed_seconds = (time.time() - start)
    samples.extend(
        MakeSamplesFromTrainOutput(metadata, stdout + stderr, elapsed_seconds,
                                   benchmark_spec.train_steps))
  if benchmark_spec.eval_steps:
    # Evaluation always runs on CPU/GPU (no TPU), hence --tpu="" below.
    mnist_benchmark_eval_cmd = (
        '{cmd} --tpu="" --use_tpu=False --eval_steps={eval_steps}'.format(
            cmd=mnist_benchmark_cmd, eval_steps=benchmark_spec.eval_steps))
    stdout, stderr = vm.RobustRemoteCommand(mnist_benchmark_eval_cmd,
                                            should_log=True)
    samples.extend(
        MakeSamplesFromEvalOutput(metadata, stdout + stderr, elapsed_seconds))
  return samples
def Install(vm):
  """Installs TensorFlow on the VM."""
  gpu_present = cuda_toolkit.CheckNvidiaGpuExists(vm)
  if gpu_present:
    # GPU builds require the CUDA toolkit and cuDNN before TensorFlow.
    vm.Install('cuda_toolkit')
    vm.Install('cudnn')
    pip_package = FLAGS.tf_gpu_pip_package
  else:
    pip_package = FLAGS.tf_cpu_pip_package
  vm.Install('pip')
  vm.RemoteCommand('sudo pip install --upgrade %s' % pip_package,
                   should_log=True)
def Prepare(benchmark_spec):
  """Install and set up TensorFlow on the target vm.

  Args:
    benchmark_spec: The benchmark specification
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vms = benchmark_spec.vms
  head_vm = vms[0]
  # Set up every VM in parallel, then record version/GPU info from the head.
  vm_util.RunThreaded(_PrepareVm, vms)
  benchmark_spec.tensorflow_version = tensorflow.GetTensorFlowVersion(head_vm)
  if cuda_toolkit.CheckNvidiaGpuExists(head_vm):
    benchmark_spec.gpu_type = cuda_toolkit.GetGpuType(head_vm)
def _CreateMetadataDict(benchmark_spec, model, batch_size):
  """Create metadata dict to be used in run results.

  Args:
    benchmark_spec: benchmark spec
    model: model which was run
    batch_size: batch sized used

  Returns:
    metadata dict
  """
  spec = benchmark_spec
  vm = spec.vms[0]
  metadata = {}
  # GPU details (driver/toolkit info) only apply when a GPU is present.
  if cuda_toolkit.CheckNvidiaGpuExists(vm):
    metadata.update(cuda_toolkit.GetMetadata(vm))
  metadata.update({
      'command_line': spec.tf_cnn_benchmark_cmd,
      'cnn_benchmarks_branch': spec.cnn_benchmarks_branch,
      'tensorflow_version': spec.tensorflow_version,
      'tensorflow_cpu_pip_package': spec.tensorflow_cpu_pip_package,
      'tensorflow_gpu_pip_package': spec.tensorflow_gpu_pip_package,
  })
  # If we ran a custom command-line through the benchmark_args flag,
  # add the metadata from that command and return. We don't need anymore
  # metadata from this function as it is likely invalid.
  if getattr(spec, 'benchmark_args', None):
    metadata.update(_GetMetadataFromBenchmarkArgs(spec.benchmark_args))
    return metadata
  metadata.update({
      'model': model,
      'batch_size': batch_size,
      'forward_only': spec.forward_only,
      'data_name': spec.data_name,
      'data_dir': spec.data_dir,
      'use_local_data': spec.use_local_data,
      'variable_update': spec.variable_update,
      'local_parameter_device': spec.local_parameter_device,
      'device': spec.device,
      'data_format': spec.data_format,
      'distortions': spec.distortions,
      'distributed': spec.distributed,
      'precision': spec.precision,
      'num_gpus': spec.num_gpus,
  })
  return metadata
def Install(vm):
  """Installs TensorFlow on the VM."""
  gpu_present = cuda_toolkit.CheckNvidiaGpuExists(vm)
  pip_package = (FLAGS.tf_gpu_pip_package if gpu_present
                 else FLAGS.tf_cpu_pip_package)
  if gpu_present:
    # GPU builds require the CUDA toolkit and cuDNN before TensorFlow.
    vm.Install('cuda_toolkit')
    vm.Install('cudnn')
  vm.Install('pip')
  vm.RemoteCommand('sudo pip install --upgrade absl-py')
  vm.RemoteCommand('sudo pip install --upgrade %s' % pip_package,
                   should_log=True)
  # Fetch the tf_cnn_benchmarks suite at the pinned commit.
  vm.RemoteCommand('git clone https://github.com/tensorflow/benchmarks.git',
                   should_log=True)
  vm.RemoteCommand('cd benchmarks && git checkout {}'.format(
      FLAGS.tf_benchmarks_commit_hash))
def GetEnvironmentVars(vm):
  """Return a string containing TensorFlow-related environment variables.

  Args:
    vm: vm to get environment varibles

  Returns:
    string of environment variables
  """
  # Without a GPU there is nothing CUDA-related to export.
  if not cuda_toolkit.CheckNvidiaGpuExists(vm):
    return ''
  stdout, _ = vm.RemoteCommand('getconf LONG_BIT', should_log=True)
  # 64-bit systems keep CUDA libraries under lib64.
  lib_dir = 'lib' if stdout.strip() == '32' else 'lib64'
  cuda_dir = FLAGS.cuda_toolkit_installation_dir
  env = [
      'PATH=%s${PATH:+:${PATH}}' % posixpath.join(cuda_dir, 'bin'),
      'CUDA_HOME=%s' % cuda_dir,
      'LD_LIBRARY_PATH=%s${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}' %
      posixpath.join(cuda_dir, lib_dir),
  ]
  return ' '.join(env)
def GetEnvironmentVars(vm):
  """Return a string containing TensorFlow-related environment variables.

  Args:
    vm: vm to get environment varibles

  Returns:
    string of environment variables
  """
  exports = []
  if cuda_toolkit.CheckNvidiaGpuExists(vm):
    stdout, _ = vm.RemoteCommand('getconf LONG_BIT', should_log=True)
    # 64-bit systems keep CUDA libraries under lib64.
    lib_dir = 'lib' if stdout.strip() == '32' else 'lib64'
    cuda_dir = FLAGS.cuda_toolkit_installation_dir
    exports.append('PATH=%s${PATH:+:${PATH}}' %
                   posixpath.join(cuda_dir, 'bin'))
    exports.append('CUDA_HOME=%s' % cuda_dir)
    exports.append('LD_LIBRARY_PATH=%s${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}'
                   % posixpath.join(cuda_dir, lib_dir))
  if FLAGS.aws_s3_region:
    # Needed for reading training data directly from S3.
    exports.append('AWS_REGION={}'.format(FLAGS.aws_s3_region))
  return ' '.join(exports)
def Install(vm):
  """Installs TensorFlow on the VM.

  Installs CUDA/cuDNN/NCCL when a GPU is present, the TensorFlow and
  tensor2tensor pip packages, and the tf_cnn_benchmarks repository (cloned
  only if not already present). On AWS with remote data, also installs AWS
  credentials.

  Args:
    vm: virtual machine on which to install TensorFlow.
  """
  has_gpu = cuda_toolkit.CheckNvidiaGpuExists(vm)
  # Pick the GPU or CPU pip package depending on the detected hardware.
  tf_pip_package = (FLAGS.tf_gpu_pip_package if has_gpu
                    else FLAGS.tf_cpu_pip_package)
  if has_gpu:
    vm.Install('cuda_toolkit')
    vm.Install('cudnn')
    # TODO(ferneyhough): Move NCCL installation to its own package.
    # Currently this is dependent on CUDA 9 being installed.
    vm.RemoteCommand('wget %s' % NCCL_URL)
    vm.RemoteCommand('sudo dpkg -i %s' % NCCL_PACKAGE)
    vm.RemoteCommand('sudo apt install libnccl2=2.3.5-2+cuda9.0 '
                     'libnccl-dev=2.3.5-2+cuda9.0')
  vm.Install('pip')
  vm.RemoteCommand('sudo pip install requests')
  vm.RemoteCommand('sudo pip install --upgrade absl-py')
  vm.RemoteCommand('sudo pip install --upgrade %s' % tf_pip_package,
                   should_log=True)
  vm.RemoteCommand(
      'sudo pip install --upgrade %s' % FLAGS.t2t_pip_package, should_log=True)
  vm.InstallPackages('git')
  # Only clone the benchmarks repo when absent so repeat installs are
  # idempotent; the checkout still runs to pin the configured branch.
  _, _, retcode = vm.RemoteHostCommandWithReturnCode(
      'test -d benchmarks', ignore_failure=True, suppress_warning=True)
  if retcode != 0:
    vm.RemoteCommand(
        'git clone https://github.com/tensorflow/benchmarks.git',
        should_log=True)
  vm.RemoteCommand(
      'cd benchmarks && git checkout {}'.format(FLAGS.tf_cnn_benchmarks_branch)
  )
  if FLAGS.cloud == 'AWS' and FLAGS.tf_data_dir and (
      not FLAGS.tf_use_local_data):
    # Reading training data from S3 requires credentials on the VM.
    vm.Install('aws_credentials')
def Run(benchmark_spec):
  """Run MLPerf on the cluster.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vm = benchmark_spec.vms[0]
  if benchmark_spec.tpus:
    # For MLPerf v0.6, the benchmake code of different hardware are different.
    if (benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-32' or
        benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-128' or
        benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-256' or
        benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-512' or
        benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-1024' or
        benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-2048'):
      # Google's v0.6 submission directories are keyed by TPU size.
      run_path = (
          '$HOME/training_results_v0.6/Google/benchmarks/{model}/tpu-{tpus}'
          .format(model=benchmark_spec.benchmark,
                  tpus=benchmark_spec.tpu_groups['train']
                  .GetAcceleratorType()))
      code_path = (
          '$HOME/training_results_v0.6/Google/benchmarks/{model}/implementations/tpu-{tpus}-{model}'
          .format(model=benchmark_spec.benchmark,
                  tpus=benchmark_spec.tpu_groups['train']
                  .GetAcceleratorType()))
      # Map PKB benchmark names to the directory names used in the repo.
      if 'mask' in benchmark_spec.benchmark:
        model = 'mask_rcnn'
      elif 'gnmt' in benchmark_spec.benchmark:
        model = 'nmt'
      else:
        model = benchmark_spec.benchmark
      # run_and_time1.sh is the flag-patched run script created in Prepare.
      mlperf_benchmark_cmd = (
          'cd {code_path} && '
          'export PYTHONPATH=$(pwd):$(pwd)/{model} && '
          'cd {model} && '
          '{run_path}/run_and_time1.sh'.format(code_path=code_path,
                                               model=model,
                                               run_path=run_path))
      if 'ssd' in benchmark_spec.benchmark:
        # SSD starts from a pre-trained ResNet-34 checkpoint.
        mlperf_benchmark_cmd = (
            'export '
            'MLP_GCS_RESNET_CHECKPOINT=gs://download.tensorflow.org/models/mlperf/v0.5.0/resnet34_ssd_checkpoint'
            ' && {cmd}'.format(cmd=mlperf_benchmark_cmd))
    else:
      raise ValueError(
          'MLPerf configurations do not support the hardware in PKB. PKB may '
          'need to be updated if this is a new TPU type.')
  else:
    # GPU path: run NVIDIA's v0.6 submission via run.sub, with syslog-based
    # logging disabled (the SYSLOGGING=0 sed edit) and a single experiment.
    if 'resnet' in benchmark_spec.benchmark:
      mlperf_benchmark_cmd = (
          'cd '
          'training_results_v0.6/NVIDIA/benchmarks/resnet/implementations/mxnet'
          ' && sed \'s/SYSLOGGING=1/SYSLOGGING=0/g\' ./run.sub > ./run1.sub &&'
          ' chmod 755 ./run1.sub && sudo DATADIR=/data/imagenet '
          'LOGDIR=/tmp/resnet PULL=0 DGXSYSTEM=DGX1 NEXP=1 ./run1.sub ')
    if 'transformer' in benchmark_spec.benchmark:
      mlperf_benchmark_cmd = (
          'cd '
          'training_results_v0.6/NVIDIA/benchmarks/transformer/implementations/pytorch'
          ' && sed \'s/SYSLOGGING=1/SYSLOGGING=0/g\' ./run.sub > ./run1.sub &&'
          ' chmod 755 ./run1.sub && sudo DATADIR=/data/wmt/utf8 '
          'LOGDIR=/tmp/transformer PULL=0 DGXSYSTEM=DGX1 NEXP=1 ./run1.sub '
      )
    if 'minigo' in benchmark_spec.benchmark:
      mlperf_benchmark_cmd = (
          'cd '
          '$HOME/training_results_v0.6/NVIDIA/benchmarks/minigo/implementations/tensorflow'
          ' && sed \'s/SYSLOGGING=1/SYSLOGGING=0/g\' ./run.sub > run1.sub && '
          'chmod 755 ./run1.sub && sudo LOGDIR=/tmp/minigo '
          'CONT=mlperf-nvidia:minigo DGXSYSTEM=DGX1 NEXP=1 ./run1.sub ')
    if 'mask' in benchmark_spec.benchmark:
      mlperf_benchmark_cmd = (
          'cd '
          '$HOME/training_results_v0.6/NVIDIA/benchmarks/maskrcnn/implementations/pytorch'
          ' && sed "s/SYSLOGGING=1/SYSLOGGING=0/g" ./run.sub > ./run1.sub && '
          'chmod 755 ./run1.sub && sudo LOGDIR=/tmp/mask DATADIR=/data PULL=0 '
          'DGXSYSTEM=DGX1 NEXP=1 ./run1.sub ')
    if 'gnmt' in benchmark_spec.benchmark:
      mlperf_benchmark_cmd = (
          'cd '
          '$HOME/training_results_v0.6/NVIDIA/benchmarks/gnmt/implementations/pytorch'
          ' && sed "s/SYSLOGGING=1/SYSLOGGING=0/g" ./run.sub > ./run1.sub && '
          'chmod 755 ./run1.sub && sudo LOGDIR=/tmp/gnmt DATADIR=/data/gnmt '
          'PULL=0 DGXSYSTEM=DGX1 NEXP=1 ./run1.sub ')
    if 'ssd' in benchmark_spec.benchmark:
      mlperf_benchmark_cmd = (
          'cd '
          '$HOME/training_results_v0.6/NVIDIA/benchmarks/ssd/implementations/pytorch'
          ' && sed "s/SYSLOGGING=1/SYSLOGGING=0/g" ./run.sub > ./run1.sub && '
          'chmod 755 ./run1.sub && sudo LOGDIR=/tmp/ssd DATADIR=/data PULL=0 '
          'DGXSYSTEM=DGX1 NEXP=1 ./run1.sub ')
  if cuda_toolkit.CheckNvidiaGpuExists(vm):
    # Prefix CUDA-related environment variables when a GPU is present.
    mlperf_benchmark_cmd = '{env} {cmd}'.format(
        env=tensorflow.GetEnvironmentVars(vm), cmd=mlperf_benchmark_cmd)
  samples = []
  metadata = _CreateMetadataDict(benchmark_spec)
  stdout, _ = vm.RobustRemoteCommand(mlperf_benchmark_cmd, should_log=True)
  samples.extend(
      MakeSamplesFromOutput(metadata,
                            stdout,
                            use_tpu=bool(benchmark_spec.tpus),
                            model=benchmark_spec.benchmark))
  return samples
def Prepare(benchmark_spec, vm=None):
  """Install and set up MLPerf on the target vm.

  Args:
    benchmark_spec: The benchmark specification
    vm: The VM to work on

  Raises:
    errors.Config.InvalidValue upon both GPUs and TPUs appear in the config
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  if vm is None:
    vm = benchmark_spec.vms[0]
  # A run may use TPUs or GPUs, never both.
  if (bool(benchmark_spec.tpus) and cuda_toolkit.CheckNvidiaGpuExists(vm)):
    raise errors.Config.InvalidValue(
        'Invalid configuration. GPUs and TPUs can not both present in the config.'
    )
  # Clone the MLPerf v0.6 results repo only if it is not already present.
  vm.RemoteCommand(
      'if [ ! -d "$HOME/training_results_v0.6" ]; then '
      '  git clone https://github.com/mlperf/training_results_v0.6.git ; '
      'fi',
      should_log=True)
  vm.InstallPackages('python3-pip')
  if benchmark_spec.tpus:
    # NOTE(review): the TPU setup below is reconstructed as nested under the
    # head-VM check because it reads `bucket`, which is only bound here.
    if vm == benchmark_spec.vms[0]:
      # Head VM creates the GCS bucket used as the model/output directory.
      storage_service = gcs.GoogleCloudStorageService()
      benchmark_spec.storage_service = storage_service
      bucket = 'pkb{}'.format(FLAGS.run_uri)
      benchmark_spec.bucket = bucket
      benchmark_spec.model_dir = 'gs://{}'.format(bucket)
      location = benchmark_spec.tpu_groups['train'].GetZone()
      storage_service.PrepareService(util.GetRegionFromZone(location))
      storage_service.MakeBucket(bucket)
      storage_service.ChmodBucket(benchmark_spec.gcp_service_account, 'W',
                                  bucket)
      # For MLPerf v0.6, the benchmake code of different hardware are
      # different.
      if (benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-32'
          or
          benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-128'
          or
          benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-256'
          or
          benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-512'
          or
          benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-1024'
          or
          benchmark_spec.tpu_groups['train'].GetAcceleratorType() ==
          'v3-2048'):
        run_path = (
            '$HOME/training_results_v0.6/Google/benchmarks/{model}/tpu-{tpus}'
            .format(model=benchmark_spec.benchmark,
                    tpus=benchmark_spec.tpu_groups['train']
                    .GetAcceleratorType()))
      else:
        raise ValueError(
            'MLPerf configurations do not support the hardware in PKB. PKB may '
            'need to be updated if this is a new TPU type.')
      # Map PKB benchmark names to the directory names used in the repo.
      if 'mask' in benchmark_spec.benchmark:
        model = 'mask_rcnn'
      elif 'gnmt' in benchmark_spec.benchmark:
        model = 'nmt'
      else:
        model = benchmark_spec.benchmark
      code_path = (
          '$HOME/training_results_v0.6/Google/benchmarks/{model}/implementations/tpu-{tpus}-{model}'
          .format(
              model=benchmark_spec.benchmark,
              tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType()))
      vm.RemoteCommand('pip3 install --upgrade pyyaml==3.13 ')
      vm.RemoteCommand('pip3 install cloud-tpu-profiler==1.12')
      if ('mask' in benchmark_spec.benchmark or
          'ssd' in benchmark_spec.benchmark):
        # TODO(b/141876878): coco whl package for python 3.5
        vm.RemoteCommand(
            'cd /tmp && '
            'wget https://storage.cloud.google.com/mlperf_artifcats/v0.6_training/coco-1.1-cp36-cp36m-linux_x86_64.whl'
        )
        # Patch setup.sh to use pip3 and drop the progress-bar flag.
        vm.RemoteCommand('cd {path} && '
                         'sed "s/--progress-bar off/ /g" ./setup.sh | '
                         'sed "s/pip /pip3 /g" > ./setup1.sh && '
                         'chmod 755 ./setup1.sh && '
                         './setup1.sh'.format(path=run_path))
      else:
        vm.RemoteCommand(
            'cd {path} && '
            'sed "s/--progress-bar off/ /g" ./setup.sh > ./setup1.sh && '
            'chmod 755 ./setup1.sh && '
            './setup1.sh'.format(path=run_path))
      if 'mask' not in benchmark_spec.benchmark:
        vm.RemoteCommand(
            'pip3 uninstall -y tf-estimator-nightly && '
            'pip3 install tf-estimator-nightly==1.14.0.dev2019051801')
      # Rewrite run_and_time.sh flags to point at our bucket and TPU; the
      # patched script is what Run() invokes as run_and_time1.sh.
      vm.RemoteCommand(
          r'cd {path} && '
          r'sed "s/--model_dir=.*/--model_dir=gs:\/\/{bucket} \\\/g" run_and_time.sh | '
          r'sed "s/--tpu=.*/--tpu={tpu} \\\/g" | '
          r'sed "s/--output_dir=.*/--output_dir=gs:\/\/{bucket} \\\/g" | '
          r'sed "s/--cloud_tpu_name=.*/--cloud_tpu_name={tpu} \\\/g" | '
          r'sed "s/--out_dir=.*/--out_dir=gs:\/\/{bucket} \\\/g" | '
          r'sed "s/--tpu_name=.*/--tpu_name={tpu} \\\/g" > run_and_time1.sh && '
          r'chmod 755 run_and_time1.sh '.format(
              path=run_path,
              bucket=bucket,
              tpu=benchmark_spec.tpu_groups['train'].GetName()))
      if 'gnmt' in benchmark_spec.benchmark:
        # Run sacrebleu through python3 in the metric script.
        vm.RemoteCommand(
            'cd {code_path}/{model} && '
            'cp metric.py metric0.py && '
            'sed "s/ sacrebleu -t/ python3 -m sacrebleu -t/g" metric0.py > metric.py'
            .format(code_path=code_path, model=model))
  else:
    benchmark_spec.model_dir = '/tmp'
    has_gpu = cuda_toolkit.CheckNvidiaGpuExists(vm)
    if has_gpu:
      vm.Install('cuda_toolkit')
    vm.Install('nvidia_docker')
    # The NVIDIA submissions expect data under /data.
    vm.RemoteCommand(
        'if [ ! -d "/data" ]; then sudo ln -s /scratch /data; fi')
    # Build the per-benchmark docker image and stage its dataset.
    if 'resnet' in benchmark_spec.benchmark:
      vm.RemoteCommand(
          'cd training_results_v0.6/NVIDIA/benchmarks/resnet/implementations/mxnet &&'
          ' sudo docker build --pull --network=host . -t mlperf-nvidia:image_classification',
          should_log=True)
      _DownloadData(benchmark_spec.imagenet_data_dir,
                    posixpath.join('/data', 'imagenet'), vm)
    if 'transformer' in benchmark_spec.benchmark:
      vm.RemoteCommand(
          'cd training_results_v0.6/NVIDIA/benchmarks/transformer/implementations/pytorch &&'
          ' sudo docker build --pull --network=host . -t mlperf-nvidia:translation',
          should_log=True)
      _DownloadData(benchmark_spec.wmt_data_dir,
                    posixpath.join('/data', 'wmt'), vm)
    if 'minigo' in benchmark_spec.benchmark:
      vm.RemoteCommand(
          'cd training_results_v0.6/NVIDIA/benchmarks/minigo/implementations/tensorflow && '
          'sudo docker build --pull --network=host -t mlperf-nvidia:minigo .',
          should_log=True)
    if 'mask' in benchmark_spec.benchmark:
      vm.RemoteCommand(
          'cd training_results_v0.6/NVIDIA/benchmarks/maskrcnn/implementations/pytorch && '
          'sudo docker build --pull --network=host -t mlperf-nvidia:object_detection . ',
          should_log=True)
      _DownloadData(benchmark_spec.coco2017_data_dir,
                    posixpath.join('/data', 'coco2017'), vm)
    if 'gnmt' in benchmark_spec.benchmark:
      vm.RemoteCommand(
          'cd training_results_v0.6/NVIDIA/benchmarks/gnmt/implementations/pytorch && '
          'sudo docker build --pull --network=host -t mlperf-nvidia:rnn_translator . ',
          should_log=True)
      _DownloadData(benchmark_spec.gnmt_data_dir,
                    posixpath.join('/data', 'gnmt'), vm)
    if 'ssd' in benchmark_spec.benchmark:
      vm.RemoteCommand(
          'cd training_results_v0.6/NVIDIA/benchmarks/ssd/implementations/pytorch && '
          'sudo docker build --pull --network=host -t mlperf-nvidia:single_stage_detector . ',
          should_log=True)
      _DownloadData(benchmark_spec.coco2017_data_dir,
                    posixpath.join('/data', 'coco2017'), vm)
def _RunModelOnVm(vm, model, benchmark_spec, args='', job_name=''):
  """Runs a TensorFlow benchmark on a single VM.

  Args:
    vm: VM to run on
    model: string, the name of model to run
    benchmark_spec: BenchmarkSpec object
    args: string, distributed arguments
    job_name: string, distributed job name

  Returns:
    a Sample containing the TensorFlow throughput or the process
    identification number from TensorFlow parameter server.
  """
  tf_cnn_benchmark_dir = 'benchmarks/scripts/tf_cnn_benchmarks'
  batch_size = _GetBatchSize(model)
  benchmark_spec.local_parameter_device = FLAGS.tf_local_parameter_device
  benchmark_spec.device = FLAGS.tf_device
  benchmark_spec.data_format = FLAGS.tf_data_format
  if not cuda_toolkit.CheckNvidiaGpuExists(vm):
    # No GPU: force CPU device and the CPU-friendly NHWC data format.
    benchmark_spec.local_parameter_device = CPU
    benchmark_spec.device = CPU
    benchmark_spec.data_format = NHWC
  tf_cnn_benchmark_cmd = (
      'python tf_cnn_benchmarks.py '
      '--local_parameter_device={local_parameter_device} '
      '--batch_size={batch_size} '
      '--model={model} '
      '--data_name={data_name} '
      '--variable_update={variable_update} '
      '--distortions={distortions} '
      '--device={device} '
      '--data_format={data_format} '
      '--forward_only={forward_only}'.format(
          local_parameter_device=benchmark_spec.local_parameter_device,
          batch_size=batch_size,
          model=model,
          data_name=benchmark_spec.data_name,
          variable_update=benchmark_spec.variable_update,
          distortions=benchmark_spec.distortions,
          device=benchmark_spec.device,
          data_format=benchmark_spec.data_format,
          forward_only=benchmark_spec.forward_only))
  if benchmark_spec.device == GPU:
    # GPU runs need CUDA environment variables and an explicit GPU count.
    num_gpus = cuda_toolkit.QueryNumberOfGpus(vm)
    tf_cnn_benchmark_cmd = '{env} {cmd} --num_gpus={gpus}'.format(
        env=tensorflow.GetEnvironmentVars(vm),
        cmd=tf_cnn_benchmark_cmd,
        gpus=num_gpus)
  else:
    num_gpus = 0
  if args:
    # Distributed mode: append the job name and per-job arguments.
    tf_cnn_benchmark_cmd = '{cmd} --job_name={job} {args}'.format(
        cmd=tf_cnn_benchmark_cmd, job=job_name, args=args)
  run_command = 'cd {path} ; {cmd}'.format(path=tf_cnn_benchmark_dir,
                                           cmd=tf_cnn_benchmark_cmd)
  output, _ = vm.RobustRemoteCommand(run_command, should_log=True)
  if job_name == 'ps':
    # Parameter servers do not produce throughput; return their PID so the
    # caller can terminate them after the workers finish.
    return _ExtractTfParameterServerPid(output)
  else:
    return _MakeSamplesFromOutput(benchmark_spec, output, model, batch_size,
                                  num_gpus)
def _GetTfCnnBenchmarkCommand(vm, model, batch_size, benchmark_spec, args='',
                              job_name=''):
  """Create the command used to run the tf_cnn_benchmarks script.

  The command is either formulated using flag values stored on the
  benchmark_spec, or is essentially provided outright through the
  benchmark_args flag.

  Args:
    vm: the VM to run on.
    model: name of the model to run.
    batch_size: batch size to use for training.
    benchmark_spec: the benchmark spec object.
    args: string, distributed arguments
    job_name: string, distributed job name

  Returns:
    A string that runs the tf_cnn_benchmarks.py script with the desired
    arguments.
  """
  num_gpus = (cuda_toolkit.QueryNumberOfGpus(vm)
              if cuda_toolkit.CheckNvidiaGpuExists(vm) else 0)
  if benchmark_spec.benchmark_args is not None:
    # Caller supplied a raw argument string: pass it through unchanged.
    cmd = 'python tf_cnn_benchmarks.py ' + benchmark_spec.benchmark_args
    # If the user didn't specify num_gpus in the benchmark_args string,
    # use all the GPUs on the system.
    if '--num_gpus' not in benchmark_spec.benchmark_args and num_gpus:
      cmd = '{cmd} --num_gpus={num_gpus}'.format(cmd=cmd, num_gpus=num_gpus)
    return cmd
  benchmark_spec.local_parameter_device = FLAGS.tf_local_parameter_device
  benchmark_spec.device = FLAGS.tf_device
  benchmark_spec.data_format = FLAGS.tf_data_format
  if num_gpus == 0:
    # No GPU: force CPU device and the CPU-friendly NHWC data format.
    benchmark_spec.local_parameter_device = CPU
    benchmark_spec.device = CPU
    benchmark_spec.data_format = NHWC
  cmd = ('python tf_cnn_benchmarks.py '
         '--local_parameter_device={local_parameter_device} '
         '--batch_size={batch_size} '
         '--model={model} '
         '--data_name={data_name} '
         '--variable_update={variable_update} '
         '--distortions={distortions} '
         '--device={device} '
         '--data_format={data_format} '
         '--forward_only={forward_only} '
         '--use_fp16={use_fp16}'.format(
             local_parameter_device=benchmark_spec.local_parameter_device,
             batch_size=batch_size,
             model=model,
             data_name=benchmark_spec.data_name,
             variable_update=benchmark_spec.variable_update,
             distortions=benchmark_spec.distortions,
             device=benchmark_spec.device,
             data_format=benchmark_spec.data_format,
             forward_only=benchmark_spec.forward_only,
             use_fp16=(benchmark_spec.precision == FP16)))
  if benchmark_spec.device == GPU:
    # GPU runs need CUDA environment variables and an explicit GPU count.
    cmd = '{env} {cmd} --num_gpus={gpus}'.format(
        env=tensorflow.GetEnvironmentVars(vm), cmd=cmd, gpus=num_gpus)
  if args:
    # Distributed mode: append the job name and per-job arguments.
    cmd = '{cmd} --job_name={job} {args}'.format(cmd=cmd, job=job_name,
                                                 args=args)
  return cmd
def Run(benchmark_spec):
  """Run ResNet on the cluster.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vm = benchmark_spec.vms[0]
  if benchmark_spec.tpus:
    # TPU path: run the Cloud TPU reference resnet_main.py.
    resnet_benchmark_script = 'resnet_main.py'
    resnet_benchmark_cmd = (
        '{env_cmd} && '
        'cd tpu/models && '
        'export PYTHONPATH=$(pwd) &&'
        'cd official/resnet && '
        'python {script} '
        '--use_tpu={use_tpu} '
        '--data_dir={data_dir} '
        '--model_dir={model_dir} '
        '--resnet_depth={depth} '
        '--train_batch_size={train_batch_size} '
        '--eval_batch_size={eval_batch_size} '
        '--iterations_per_loop={iterations} '
        '--data_format={data_format} '
        '--precision={precision} '
        '--skip_host_call={skip_host_call} '
        '--num_train_images={num_train_images} '
        '--num_eval_images={num_eval_images}'.format(
            env_cmd=benchmark_spec.env_cmd,
            script=resnet_benchmark_script,
            use_tpu=bool(benchmark_spec.tpus),
            data_dir=benchmark_spec.data_dir,
            model_dir=benchmark_spec.model_dir,
            depth=benchmark_spec.depth,
            train_batch_size=benchmark_spec.train_batch_size,
            eval_batch_size=benchmark_spec.eval_batch_size,
            iterations=benchmark_spec.iterations,
            data_format=benchmark_spec.data_format,
            precision=benchmark_spec.precision,
            skip_host_call=benchmark_spec.skip_host_call,
            num_train_images=benchmark_spec.num_train_images,
            num_eval_images=benchmark_spec.num_eval_images))
  else:
    # GPU path: run the TF official-models imagenet_main.py.
    resnet_benchmark_script = 'imagenet_main.py'
    resnet_benchmark_cmd = ('{env_cmd} && '
                            'cd models && '
                            'export PYTHONPATH=$(pwd) && '
                            'cd official/r1/resnet && '
                            'python {script} '
                            '--data_dir=/data/imagenet '
                            '--model_dir={model_dir} '
                            '--resnet_size={resnet_size} '
                            '--batch_size={batch_size} '
                            '--data_format={data_format} '.format(
                                env_cmd=benchmark_spec.env_cmd,
                                script=resnet_benchmark_script,
                                model_dir=benchmark_spec.model_dir,
                                resnet_size=benchmark_spec.depth,
                                batch_size=benchmark_spec.train_batch_size,
                                data_format=benchmark_spec.data_format))
    precision = '{precision}'.format(precision=benchmark_spec.precision)
    # imagenet_main.py takes fp16/fp32 dtypes; map bfloat16 to fp16 on GPU.
    if precision == 'bfloat16':
      resnet_benchmark_cmd = '{cmd} --dtype=fp16'.format(
          cmd=resnet_benchmark_cmd)
    else:
      resnet_benchmark_cmd = '{cmd} --dtype=fp32'.format(
          cmd=resnet_benchmark_cmd)
    if cuda_toolkit.CheckNvidiaGpuExists(vm):
      # Prepend CUDA environment variables and use all available GPUs.
      resnet_benchmark_cmd = '{env} {cmd} --num_gpus={num_gpus}'.format(
          env=tensorflow.GetEnvironmentVars(vm),
          cmd=resnet_benchmark_cmd,
          num_gpus=cuda_toolkit.QueryNumberOfGpus(vm))
  samples = []
  metadata = _CreateMetadataDict(benchmark_spec)
  elapsed_seconds = 0
  steps_per_eval = benchmark_spec.steps_per_eval
  train_steps = benchmark_spec.train_steps
  # Alternate train and eval phases every steps_per_eval steps.
  for step in range(steps_per_eval, train_steps + steps_per_eval,
                    steps_per_eval):
    step = min(step, train_steps)
    resnet_benchmark_cmd_step = '{cmd} --train_steps={step}'.format(
        cmd=resnet_benchmark_cmd, step=step)
    if benchmark_spec.mode in ('train', 'train_and_eval'):
      if benchmark_spec.tpus:
        tpu = benchmark_spec.tpu_groups['train'].GetName()
        num_cores = '--num_cores={}'.format(
            benchmark_spec.tpu_groups['train'].GetNumShards())
        resnet_benchmark_train_cmd = (
            '{cmd} --tpu={tpu} --mode=train {num_cores}'.format(
                cmd=resnet_benchmark_cmd_step, tpu=tpu, num_cores=num_cores))
      else:
        resnet_benchmark_train_cmd = (
            '{cmd} --max_train_steps={max_train_steps} '
            '--train_epochs={train_epochs} --noeval_only'.format(
                cmd=resnet_benchmark_cmd,
                train_epochs=benchmark_spec.epochs_per_eval,
                max_train_steps=step))
      start = time.time()
      stdout, stderr = vm.RobustRemoteCommand(resnet_benchmark_train_cmd,
                                              should_log=True)
      # Accumulate wall-clock training time across all train phases.
      elapsed_seconds += (time.time() - start)
      samples.extend(
          mnist_benchmark.MakeSamplesFromTrainOutput(
              metadata, stdout + stderr, elapsed_seconds, step))
    if benchmark_spec.mode in ('train_and_eval', 'eval'):
      if benchmark_spec.tpus:
        tpu = benchmark_spec.tpu_groups['eval'].GetName()
        num_cores = '--num_cores={}'.format(
            benchmark_spec.tpu_groups['eval'].GetNumShards())
        resnet_benchmark_eval_cmd = (
            '{cmd} --tpu={tpu} --mode=eval {num_cores}'.format(
                cmd=resnet_benchmark_cmd_step, tpu=tpu, num_cores=num_cores))
      else:
        resnet_benchmark_eval_cmd = ('{cmd} --eval_only'.format(
            cmd=resnet_benchmark_cmd))
      stdout, stderr = vm.RobustRemoteCommand(resnet_benchmark_eval_cmd,
                                              should_log=True)
      samples.extend(
          MakeSamplesFromEvalOutput(metadata, stdout + stderr,
                                    elapsed_seconds,
                                    use_tpu=bool(benchmark_spec.tpus)))
  return samples