def Install(vm): """Installs TensorFlow on the VM.""" has_gpu = nvidia_driver.CheckNvidiaGpuExists(vm) tf_pip_package = (FLAGS.tf_gpu_pip_package if has_gpu else FLAGS.tf_cpu_pip_package) if has_gpu: vm.Install('cuda_toolkit') vm.Install('nccl') vm.Install('cudnn') vm.Install('pip') vm.RemoteCommand('sudo pip install requests') vm.RemoteCommand('sudo pip install --upgrade absl-py') vm.RemoteCommand('sudo pip install --upgrade %s' % tf_pip_package, should_log=True) vm.RemoteCommand('sudo pip install --upgrade %s' % FLAGS.t2t_pip_package, should_log=True) vm.InstallPackages('git') _, _, retcode = vm.RemoteHostCommandWithReturnCode('test -d benchmarks', ignore_failure=True, suppress_warning=True) if retcode != 0: vm.RemoteCommand( 'git clone https://github.com/tensorflow/benchmarks.git', should_log=True) vm.RemoteCommand('cd benchmarks && git checkout {}'.format( FLAGS.tf_cnn_benchmarks_branch)) if FLAGS.cloud == 'AWS' and FLAGS.tf_data_dir and ( not FLAGS.tf_use_local_data): vm.Install('aws_credentials')
def Prepare(benchmark_spec): """Install and set up ResNet on the target vm. Args: benchmark_spec: The benchmark specification Raises: errors.Config.InvalidValue upon both GPUs and TPUs appear in the config """ vm = benchmark_spec.vms[0] if (bool(benchmark_spec.tpus) and nvidia_driver.CheckNvidiaGpuExists(vm)): raise errors.Config.InvalidValue( 'Invalid configuration. GPUs and TPUs can not both present in the config.' ) mnist_benchmark.Prepare(benchmark_spec) _UpdateBenchmarkSpecWithFlags(benchmark_spec) vm.Install('pyyaml') # To correctly install the requests lib, otherwise the experiment won't run vm.RemoteCommand('sudo pip uninstall -y requests') vm.RemoteCommand('sudo pip install requests') if not benchmark_spec.tpus: local_data_path = posixpath.join('/data', 'imagenet') vm.RemoteCommand('sudo mkdir -p {data_path} && ' 'sudo chmod a+w {data_path} && ' 'gsutil -m cp -r {data_dir}/* {data_path}'.format( data_dir=benchmark_spec.data_dir, data_path=local_data_path))
def _CollectGpuSamples(
    vm: virtual_machine.BaseVirtualMachine) -> List[sample.Sample]:
  """Runs the CUDA memory-copy bandwidth test on the VM.

  Args:
    vm: The virtual machine to run the benchmark on.

  Returns:
    A list of sample.Sample objects.
  """
  if not nvidia_driver.CheckNvidiaGpuExists(vm):
    return []
  if not nvidia_driver.CheckNvidiaSmiExists(vm):
    return []
  global_metadata = _MetadataFromFlags()
  global_metadata.update(cuda_toolkit.GetMetadata(vm))
  global_cmd = [
      BANDWIDTH_TEST_PATH, '--csv', f'--memory={_MEMORY.value}',
      f'--mode={_MODE.value}'
  ]
  if _HTOD.value:
    global_cmd.append('--htod')
  if _DTOH.value:
    global_cmd.append('--dtoh')
  if _DTOD.value:
    global_cmd.append('--dtod')
  if _WC.value:
    global_cmd.append('--wc')

  num_gpus = nvidia_driver.QueryNumberOfGpus(vm)
  devices = list(range(num_gpus)) + (['all'] if num_gpus > 1 else [])

  samples = []
  for device in devices:
    cmd = ' '.join(global_cmd + [f'--device={device}'])
    stdout, stderr, exit_code = vm.RemoteCommandWithReturnCode(
        cmd, ignore_failure=True)
    if exit_code:
      logging.warning('Error with getting GPU stats: %s', stderr)
      continue
    results = regex_util.ExtractAllMatches(
        r'bandwidthTest-(\S+), '
        r'Bandwidth = ([\d\.]+) (\S+), '
        r'Time = ([\d\.]+) s, '
        r'Size = (\d+) bytes, '
        r'NumDevsUsed = (\d+)', stdout)

    for metric, bandwidth, unit, time, size, num_devs_used in results:
      metadata = {
          'time': float(time),
          'size': int(size),
          'NumDevsUsed': num_devs_used,
          'device': device,
          'command': cmd,
      }
      metadata.update(global_metadata)
      samples.append(
          sample.Sample(metric, float(bandwidth), unit, metadata))
  return samples

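# Illustrative only, not part of the original module: a self-contained sanity check of the
# CSV parsing above. The sample bandwidthTest output line is invented for the example; real
# output can differ slightly between CUDA versions.
def _example_parse_bandwidth_line():
  import re
  line = ('bandwidthTest-H2D-Pinned, Bandwidth = 11.8 GB/s, '
          'Time = 0.00284 s, Size = 33554432 bytes, NumDevsUsed = 1')
  pattern = (r'bandwidthTest-(\S+), Bandwidth = ([\d\.]+) (\S+), '
             r'Time = ([\d\.]+) s, Size = (\d+) bytes, NumDevsUsed = (\d+)')
  # Returns [('H2D-Pinned', '11.8', 'GB/s', '0.00284', '33554432', '1')]
  return re.findall(pattern, line)
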
def _PrepareVm(benchmark_spec, rank):
  """Installs prerequisites and downloads data on the VM with the given rank."""
  vm = benchmark_spec.vms[rank]
  vm.InstallPackages('python3-pip')
  if nvidia_driver.CheckNvidiaGpuExists(vm):
    vm.Install('cuda_toolkit')
  vm.AuthenticateVm()
  vm.Install('openmpi')
  vm.Install('nccl')
  _DownloadData(benchmark_spec, rank)

def Prepare(bm_spec: benchmark_spec.BenchmarkSpec) -> None:
  """Install and set up MLPerf Inference on the target vm.

  Args:
    bm_spec: The benchmark specification

  Raises:
    errors.Config.InvalidValue: if both GPUs and TPUs are present in the config
  """
  vm = bm_spec.vms[0]

  repository = f'inference_results_{MLPERF_INFERENCE_VERSION}'
  vm.RemoteCommand(
      f'git clone https://github.com/mlcommons/{repository}.git',
      should_log=True)

  makefile = f'{repository}/closed/NVIDIA/Makefile'
  vm_util.ReplaceText(vm, 'shell uname -p', 'shell uname -m', makefile)

  requirements = f'{repository}/closed/NVIDIA/docker/requirements.1'
  vm_util.ReplaceText(vm, 'opencv-python-headless==4.5.2.52',
                      'opencv-python-headless==4.5.3.56', requirements)

  if nvidia_driver.CheckNvidiaGpuExists(vm):
    vm.Install('cuda_toolkit')
    vm.Install('nvidia_driver')
    vm.Install('nvidia_docker')

  benchmark = FLAGS.mlperf_benchmark
  bm_spec.env_cmd = ('export MLPERF_SCRATCH_PATH=/scratch && '
                     f'cd {repository}/closed/NVIDIA')
  vm.RobustRemoteCommand(
      f'{bm_spec.env_cmd} && '
      'make build_docker NO_BUILD=1 && '
      'make docker_add_user && '
      'make launch_docker DOCKER_COMMAND="echo $MLPERF_SCRATCH_PATH" && '
      'make launch_docker DOCKER_COMMAND="ls -al $MLPERF_SCRATCH_PATH" && '
      'make launch_docker DOCKER_COMMAND="make clean" && '
      'make launch_docker DOCKER_COMMAND="make link_dirs" && '
      'make launch_docker DOCKER_COMMAND="ls -al build/"',
      should_log=True)
  vm.RobustRemoteCommand(
      f'{bm_spec.env_cmd} && '
      'make launch_docker DOCKER_COMMAND='
      f'"make download_data BENCHMARKS={benchmark}"',
      should_log=True)
  vm.RobustRemoteCommand(
      f'{bm_spec.env_cmd} && '
      'make launch_docker DOCKER_COMMAND='
      f'"make download_model BENCHMARKS={benchmark}" && '
      'make launch_docker DOCKER_COMMAND='
      f'"make preprocess_data BENCHMARKS={benchmark}" && '
      f'make launch_docker DOCKER_COMMAND="make build"',
      should_log=True)

def Install(vm): """Installs PyTorch on the VM.""" vm.Install('pip3') toolkit = 'cpu' if nvidia_driver.CheckNvidiaGpuExists(vm): # Translates --cuda_toolkit_version=10.2 to "cu102" for the toolkit to # install toolkit = f'cu{"".join(FLAGS.cuda_toolkit_version.split("."))}' vm.RemoteCommand(f'{FLAGS.torch_env} python3 -m pip install ' f'torch=={FLAGS.torch_version}+{toolkit} ' f'torchvision=={FLAGS.torchvision_version}+{toolkit} ' f'torchaudio=={FLAGS.torchaudio_version} ' f'-f {_PYTORCH_WHL}')
def Prepare(benchmark_spec): """Install and set up TensorFlow on the target vm. Args: benchmark_spec: The benchmark specification """ _UpdateBenchmarkSpecWithFlags(benchmark_spec) vms = benchmark_spec.vms vm_util.RunThreaded(_PrepareVm, vms) benchmark_spec.tensorflow_version = tensorflow.GetTensorFlowVersion(vms[0]) if nvidia_driver.CheckNvidiaGpuExists(vms[0]): benchmark_spec.gpu_type = nvidia_driver.GetGpuType(vms[0])
def _CreateMetadataDict(benchmark_spec, model, batch_size):
  """Create metadata dict to be used in run results.

  Args:
    benchmark_spec: benchmark spec
    model: model which was run
    batch_size: batch size used

  Returns:
    metadata dict
  """
  vm = benchmark_spec.vms[0]
  metadata = {}
  if nvidia_driver.CheckNvidiaGpuExists(vm):
    metadata.update(nvidia_driver.GetMetadata(vm))

  metadata['command_line'] = benchmark_spec.tf_cnn_benchmark_cmd
  metadata['cnn_benchmarks_branch'] = benchmark_spec.cnn_benchmarks_branch
  metadata['tensorflow_version'] = benchmark_spec.tensorflow_version
  metadata['tensorflow_cpu_pip_package'] = (
      benchmark_spec.tensorflow_cpu_pip_package)
  metadata['tensorflow_gpu_pip_package'] = (
      benchmark_spec.tensorflow_gpu_pip_package)

  # If we ran a custom command line through the benchmark_args flag, add the
  # metadata from that command and return. We don't need any more metadata
  # from this function as it is likely invalid.
  if getattr(benchmark_spec, 'benchmark_args', None):
    metadata.update(
        _GetMetadataFromBenchmarkArgs(benchmark_spec.benchmark_args))
    return metadata

  metadata['model'] = model
  metadata['batch_size'] = batch_size
  metadata['forward_only'] = benchmark_spec.forward_only
  metadata['data_name'] = benchmark_spec.data_name
  metadata['data_dir'] = benchmark_spec.data_dir
  metadata['use_local_data'] = benchmark_spec.use_local_data
  metadata['variable_update'] = benchmark_spec.variable_update
  metadata['local_parameter_device'] = benchmark_spec.local_parameter_device
  metadata['device'] = benchmark_spec.device
  metadata['data_format'] = benchmark_spec.data_format
  metadata['distortions'] = benchmark_spec.distortions
  metadata['distributed'] = benchmark_spec.distributed
  metadata['precision'] = benchmark_spec.precision
  metadata['num_gpus'] = benchmark_spec.num_gpus
  return metadata

def Prepare(benchmark_spec): """Install and set up RoBERTa mmlm on the target vm.. Args: benchmark_spec: The benchmark specification. Contains all data that is required to run the benchmark. """ _UpdateBenchmarkSpecWithFlags(benchmark_spec) vms = benchmark_spec.vms benchmark_spec.always_call_cleanup = True list_params = [((benchmark_spec, rank), {}) for rank in range(benchmark_spec.num_vms)] vm_util.RunThreaded(_PrepareVm, list_params) master = vms[0] if nvidia_driver.CheckNvidiaGpuExists(master): gpus_per_vm = nvidia_driver.QueryNumberOfGpus(master) hpc_util.CreateMachineFile(vms, lambda _: gpus_per_vm, HOSTFILE)
def PrepareBenchmark(benchmark_spec, vm=None):
  """Install and set up MLPerf on the target vm.

  Args:
    benchmark_spec: The benchmark specification
    vm: The VM to work on

  Raises:
    errors.Config.InvalidValue: if both GPUs and TPUs are present in the config
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vm = vm or benchmark_spec.vms[0]

  if (bool(benchmark_spec.tpus) and nvidia_driver.CheckNvidiaGpuExists(vm)):
    raise errors.Config.InvalidValue(
        'Invalid configuration. GPUs and TPUs cannot both be present in the '
        'config.')

  vm.RemoteCommand(
      f'if [ ! -d "$HOME/training_results_{MLPERF_VERSION}" ]; then '
      f' git clone https://github.com/mlcommons/training_results_{MLPERF_VERSION}.git ; '
      'fi',
      should_log=True)
  vm.Install('pip3')

  if not HYPERTHREADS.value:
    if BERT in benchmark_spec.benchmark:
      vm_util.ReplaceText(
          vm, "'bind_pyt'", "'bind_pyt' '--no_hyperthreads'",
          f'training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/bert/'
          'implementations/pytorch/run_with_docker.sh')
    elif MASK in benchmark_spec.benchmark:
      vm_util.ReplaceText(
          vm, "'bind_launch'", "'bind_launch' '--no_hyperthreads'",
          f'training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/maskrcnn/'
          'implementations/pytorch/run_and_time.sh')
    elif RESNET in benchmark_spec.benchmark:
      vm_util.ReplaceText(
          vm, '--cpu=exclusive', '--cpu=exclusive,nosmt',
          f'training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/resnet/'
          'implementations/mxnet/run_and_time.sh')

def GetEnvironmentVars(vm):
  """Return a string containing TensorFlow-related environment variables.

  Args:
    vm: vm to get environment variables for

  Returns:
    string of environment variables
  """
  env_vars = []
  if nvidia_driver.CheckNvidiaGpuExists(vm):
    output, _ = vm.RemoteCommand('getconf LONG_BIT', should_log=True)
    long_bit = output.strip()
    lib_name = 'lib' if long_bit == '32' else 'lib64'
    env_vars.extend([
        'PATH=%s${PATH:+:${PATH}}' %
        posixpath.join(cuda_toolkit.CUDA_HOME, 'bin'),
        'CUDA_HOME=%s' % cuda_toolkit.CUDA_HOME,
        'LD_LIBRARY_PATH=%s${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}' %
        posixpath.join(cuda_toolkit.CUDA_HOME, lib_name)
    ])
  if FLAGS.aws_s3_region:
    env_vars.append('AWS_REGION={}'.format(FLAGS.aws_s3_region))
  return ' '.join(env_vars)

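# Illustrative only, not part of the original module: on a 64-bit GPU VM, and assuming
# cuda_toolkit.CUDA_HOME resolves to /usr/local/cuda, GetEnvironmentVars(vm) returns a
# space-separated string of the form
#   PATH=/usr/local/cuda/bin${PATH:+:${PATH}} CUDA_HOME=/usr/local/cuda
#   LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
# which callers prepend to remote commands so TensorFlow can locate the CUDA libraries.
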
def Run(benchmark_spec): """Run MLPerf on the cluster. Args: benchmark_spec: The benchmark specification. Contains all data that is required to run the benchmark. Returns: A list of sample.Sample objects. """ _UpdateBenchmarkSpecWithFlags(benchmark_spec) vms = benchmark_spec.vms master_vm = vms[0] benchmark = benchmark_spec.benchmark env_params = {} env_params['SLURM_JOB_ID'] = r'{uri}'.format(uri=FLAGS.run_uri) env_params['PULL'] = 0 env_params['DGXSYSTEM'] = DGXSYSTEM env_params['NEXP'] = 1 env_params['LOGDIR'] = posixpath.join(vm_util.VM_TMP_DIR, benchmark) script_path = ('$HOME/training_results_{version}/NVIDIA/benchmarks/{model}' r'/implementations/{framework}'.format( version=mlperf_benchmark.MLPERF_VERSION, model='maskrcnn' if mlperf_benchmark.MASK in benchmark else benchmark, framework='mxnet' if mlperf_benchmark.RESNET in benchmark else 'pytorch')) benchmark_env_params = { mlperf_benchmark.TRANSFORMER: { 'CONT': r'"mlperf-nvidia:translation"', 'DATADIR': r'/data/wmt/utf8' }, mlperf_benchmark.SSD: { 'CONT': r'"mlperf-nvidia:single_stage_detector"', 'DATADIR': '/data' }, mlperf_benchmark.GNMT: { 'CONT': r'"mlperf-nvidia:rnn_translator"', 'DATADIR': r'/data/gnmt' }, mlperf_benchmark.MASK: {}, mlperf_benchmark.RESNET: {}, mlperf_benchmark.BERT: {}, } env_params.update(benchmark_env_params.get(benchmark, {})) if mlperf_benchmark.RESNET in benchmark: env_params['SLURM_JOB_NUM_NODES'] = benchmark_spec.num_vms env = r'' if nvidia_driver.CheckNvidiaGpuExists(master_vm): env = tensorflow.GetEnvironmentVars(master_vm) cmd = (f'cd {script_path} && ' f'{env} {_DictToString(env_params)} ' f'{FLAGS.nccl_mpi} ' '--allow-run-as-root ' '-hostfile $HOME/HOSTFILE ' '--mca pml ^cm ' '--mca btl tcp,self ' '--mca btl_tcp_if_exclude docker0,lo ' '--bind-to none ' '-N 1 ' './run_with_docker1.sh') if (mlperf_benchmark.NVPROF in FLAGS.mlperf_profiler or FLAGS.mlperf_keep_nccl_log): cmd += (r' && cp /tmp/pkb/cmd* {logdir}'.format( logdir=posixpath.join(vm_util.VM_TMP_DIR, benchmark))) samples = [] metadata = _CreateMetadataDict(benchmark_spec) stdout, _ = master_vm.RobustRemoteCommand(cmd, should_log=True) if mlperf_benchmark.NONE in FLAGS.mlperf_profiler: samples.extend(MakeSamplesFromOutput(metadata, stdout, model=benchmark)) if (mlperf_benchmark.NVPROF in FLAGS.mlperf_profiler or FLAGS.mlperf_keep_nccl_log): master_vm.RemoteCommand( r'mkdir -p /data/aggregated/{model}'.format(model=benchmark)) master_vm.RemoteCommand( r'mpirun -hostfile $HOME/{hostfile} -N 1 scp -r {logdir} ' r'{master_ip}:/data/aggregated/'.format( hostfile=HOSTFILE, logdir=posixpath.join(vm_util.VM_TMP_DIR, benchmark), master_ip=master_vm.internal_ip)) return samples
def Prepare(bm_spec: benchmark_spec.BenchmarkSpec) -> None:
  """Installs and sets up MLPerf Inference on the target vm.

  Args:
    bm_spec: The benchmark specification

  Raises:
    errors.Config.InvalidValue: if both GPUs and TPUs are present in the config
  """
  vm = bm_spec.vms[0]

  repository = f'inference_results_{MLPERF_INFERENCE_VERSION}'
  vm.RemoteCommand(f'git clone https://github.com/mlcommons/{repository}.git')

  makefile = f'{repository}/closed/NVIDIA/Makefile'
  vm_util.ReplaceText(vm, 'shell uname -p', 'shell uname -m', makefile)

  requirements1 = f'{repository}/closed/NVIDIA/docker/requirements.1'
  vm_util.ReplaceText(vm, 'opencv-python-headless==4.5.2.52',
                      'opencv-python-headless==4.5.3.56', requirements1)
  requirements2 = f'{repository}/closed/NVIDIA/docker/requirements.2'

  benchmark = FLAGS.mlperf_benchmark
  if _SERVER_TARGET_QPS.value:
    config = f'{repository}/closed/NVIDIA/configs/{benchmark}/Server/__init__.py'
    vm_util.ReplaceText(vm, 'server_target_qps = .*',
                        f'server_target_qps = {_SERVER_TARGET_QPS.value}',
                        config)

  for requirements in (requirements1, requirements2):
    vm_util.ReplaceText(vm, 'git:', 'https:', requirements)

  if nvidia_driver.CheckNvidiaGpuExists(vm):
    vm.Install('cuda_toolkit')
    vm.Install('nvidia_driver')
    vm.Install('nvidia_docker')

  bm_spec.env_cmd = (f'export MLPERF_SCRATCH_PATH={_MLPERF_SCRATCH_PATH} && '
                     f'cd {repository}/closed/NVIDIA')
  docker.AddUser(vm)
  vm.RobustRemoteCommand(
      f'{bm_spec.env_cmd} && '
      'make build_docker NO_BUILD=1 && '
      'make docker_add_user && '
      'make launch_docker DOCKER_COMMAND="make clean" && '
      'make launch_docker DOCKER_COMMAND="make link_dirs"',
      should_log=True)

  if benchmark == mlperf_benchmark.DLRM:
    # Download data
    data_dir = posixpath.join(_MLPERF_SCRATCH_PATH, 'data', _DLRM_DATA_MODULE)
    vm.DownloadPreprovisionedData(data_dir, _DLRM_DATA_MODULE, _DLRM_DATA)
    vm.RemoteCommand(f'cd {data_dir} && gzip -d {_DLRM_DATA}')

    # Download model
    model_dir = posixpath.join(_MLPERF_SCRATCH_PATH, 'models', benchmark)
    vm.DownloadPreprovisionedData(model_dir, benchmark, _DLRM_MODEL)
    vm.RemoteCommand(f'cd {model_dir} && '
                     f'tar -zxvf {_DLRM_MODEL} && '
                     f'rm -f {_DLRM_MODEL}')
    vm.DownloadPreprovisionedData(model_dir, benchmark, _DLRM_ROW_FREQ)

    # Preprocess data
    preprocessed_data_dir = posixpath.join(_MLPERF_SCRATCH_PATH,
                                           'preprocessed_data',
                                           _DLRM_DATA_MODULE)
    vm.DownloadPreprovisionedData(preprocessed_data_dir, _DLRM_DATA_MODULE,
                                  _DLRM_PREPROCESSED_DATA)
    vm.RemoteCommand(f'cd {preprocessed_data_dir} && '
                     f'tar -zxvf {_DLRM_PREPROCESSED_DATA} && '
                     f'rm -f {_DLRM_PREPROCESSED_DATA}')
  elif benchmark == mlperf_benchmark.BERT:
    # Download data
    data_dir = posixpath.join(_MLPERF_SCRATCH_PATH, 'data', 'squad')
    vm.DownloadPreprovisionedData(data_dir, benchmark, 'dev-v1.1.json')

    # Download model
    model_dir = posixpath.join(_MLPERF_SCRATCH_PATH, 'models', benchmark)
    vm.DownloadPreprovisionedData(model_dir, benchmark, 'bert_large_v1_1.onnx')
    vm.DownloadPreprovisionedData(model_dir, benchmark,
                                  'bert_large_v1_1_fake_quant.onnx')
    vm.DownloadPreprovisionedData(model_dir, benchmark, 'vocab.txt')

    # Preprocess data
    preprocessed_data_dir = posixpath.join(_MLPERF_SCRATCH_PATH,
                                           'preprocessed_data',
                                           'squad_tokenized')
    vm.DownloadPreprovisionedData(preprocessed_data_dir, benchmark,
                                  'input_ids.npy')
    vm.DownloadPreprovisionedData(preprocessed_data_dir, benchmark,
                                  'input_mask.npy')
    vm.DownloadPreprovisionedData(preprocessed_data_dir, benchmark,
                                  'segment_ids.npy')
  else:
    vm.RobustRemoteCommand(
        f'{bm_spec.env_cmd} && '
        'make launch_docker DOCKER_COMMAND='
        f'"make download_data BENCHMARKS={benchmark}"',
        should_log=True)
    vm.RobustRemoteCommand(
        f'{bm_spec.env_cmd} && '
        'make launch_docker DOCKER_COMMAND='
        f'"make download_model BENCHMARKS={benchmark}"',
        should_log=True)
    vm.RobustRemoteCommand(
        f'{bm_spec.env_cmd} && '
        'make launch_docker DOCKER_COMMAND='
        f'"make preprocess_data BENCHMARKS={benchmark}"',
        should_log=True)

  vm.RobustRemoteCommand(
      f'{bm_spec.env_cmd} && '
      'make launch_docker DOCKER_COMMAND='
      '"make build" && '
      'make launch_docker DOCKER_COMMAND='
      '"make generate_engines RUN_ARGS=\''
      f'--benchmarks={FLAGS.mlperf_benchmark} '
      f'--scenarios={_SCENARIOS.value}\'"',
      should_log=True)

def _GetTfCnnBenchmarkCommand(vm, model, batch_size, benchmark_spec,
                              args='', job_name=''):
  """Create the command used to run the tf_cnn_benchmarks script.

  The command is either formulated using flag values stored on the
  benchmark_spec, or is essentially provided outright through the
  benchmark_args flag.

  Args:
    vm: the VM to run on.
    model: name of the model to run.
    batch_size: batch size to use for training.
    benchmark_spec: the benchmark spec object.
    args: string, distributed arguments
    job_name: string, distributed job name

  Returns:
    A string that runs the tf_cnn_benchmarks.py script with the desired
    arguments.
  """
  num_gpus = (nvidia_driver.QueryNumberOfGpus(vm)
              if nvidia_driver.CheckNvidiaGpuExists(vm) else 0)
  benchmark_spec.num_gpus = num_gpus

  if benchmark_spec.benchmark_args is not None:
    cmd = 'python tf_cnn_benchmarks.py ' + benchmark_spec.benchmark_args
    # If the user didn't specify num_gpus in the benchmark_args string,
    # use all the GPUs on the system.
    if '--num_gpus' not in benchmark_spec.benchmark_args and num_gpus:
      cmd = '{cmd} --num_gpus={num_gpus}'.format(cmd=cmd, num_gpus=num_gpus)
    return cmd

  benchmark_spec.local_parameter_device = FLAGS.tf_local_parameter_device
  benchmark_spec.device = FLAGS.tf_device
  benchmark_spec.data_format = FLAGS.tf_data_format
  if num_gpus == 0:
    benchmark_spec.local_parameter_device = CPU
    benchmark_spec.device = CPU
    benchmark_spec.data_format = NHWC

  cmd = ('{env_vars} python tf_cnn_benchmarks.py '
         '--local_parameter_device={local_parameter_device} '
         '--batch_size={batch_size} '
         '--model={model} '
         '{data} '
         '--data_name={data_name} '
         '--variable_update={variable_update} '
         '--distortions={distortions} '
         '--device={device} '
         '--data_format={data_format} '
         '--forward_only={forward_only} '
         '--use_fp16={use_fp16} '
         '{num_gpus} '
         '{job_name}'.format(
             env_vars=tensorflow.GetEnvironmentVars(vm),
             local_parameter_device=benchmark_spec.local_parameter_device,
             batch_size=batch_size,
             model=model,
             data=('--data_dir={}'.format(benchmark_spec.data_dir)
                   if benchmark_spec.data_dir else ''),
             data_name=benchmark_spec.data_name,
             variable_update=benchmark_spec.variable_update,
             distortions=benchmark_spec.distortions,
             device=benchmark_spec.device,
             data_format=benchmark_spec.data_format,
             forward_only=benchmark_spec.forward_only,
             use_fp16=(benchmark_spec.precision == FP16),
             num_gpus='--num_gpus={}'.format(num_gpus) if num_gpus else '',
             job_name='--job_name={0} {1}'.format(job_name, args)
             if args else ''))
  return cmd

def Run(benchmark_spec): """Run MLPerf on the cluster. Args: benchmark_spec: The benchmark specification. Contains all data that is required to run the benchmark. Returns: A list of sample.Sample objects. """ _UpdateBenchmarkSpecWithFlags(benchmark_spec) vm = benchmark_spec.vms[0] if benchmark_spec.tpus: # For MLPerf 1.0, the benchmake code of different hardware are different. if (benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-32' or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-128' or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-256' or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-512' or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-1024' or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-2048'): run_path = ( '$HOME/training_results_{version}/Google/benchmarks/{model}/tpu-{tpus}' .format(version=MLPERF_VERSION, model=benchmark_spec.benchmark, tpus=benchmark_spec.tpu_groups['train']. GetAcceleratorType())) code_path = ( '$HOME/training_results_{version}/Google/benchmarks/{model}/implementations/tpu-{tpus}-{model}' .format(version=MLPERF_VERSION, model=benchmark_spec.benchmark, tpus=benchmark_spec.tpu_groups['train']. GetAcceleratorType())) if MASK in benchmark_spec.benchmark: model = 'mask_rcnn' elif GNMT in benchmark_spec.benchmark: model = 'nmt' else: model = benchmark_spec.benchmark mlperf_benchmark_cmd = ( 'cd {code_path} && ' 'export PYTHONPATH=$(pwd):$(pwd)/{model} && ' 'cd {model} && ' '{run_path}/run_and_time.sh'.format(code_path=code_path, model=model, run_path=run_path)) if SSD in benchmark_spec.benchmark: mlperf_benchmark_cmd = ( 'export ' 'MLP_GCS_RESNET_CHECKPOINT={checkpoint}' ' && {cmd}'.format( checkpoint=FLAGS.mlperf_gcs_resnet_checkpoint, cmd=mlperf_benchmark_cmd)) else: raise ValueError( 'MLPerf configurations do not support the hardware in PKB. 
PKB may ' 'need to be updated if this is a new TPU type.') else: run_sub_paths = { RESNET: 'resnet/implementations/mxnet', TRANSFORMER: 'transformer/implementations/pytorch', MINIGO: 'minigo/implementations/tensorflow', MASK: 'maskrcnn/implementations/pytorch', GNMT: 'gnmt/implementations/pytorch', SSD: 'ssd/implementations/pytorch', BERT: 'bert/implementations/pytorch', } benchmark_path = f'$HOME/training_results_{MLPERF_VERSION}/NVIDIA/benchmarks' run_path = posixpath.join(benchmark_path, run_sub_paths[benchmark_spec.benchmark]) env = { 'DGXSYSTEM': DGXSYSTEM, 'NEXP': 1, 'PULL': 0, 'LOGDIR': f'/tmp/{benchmark_spec.benchmark}', } envs = { RESNET: {}, TRANSFORMER: { 'DATADIR': '/data/wmt/utf8' }, MINIGO: { 'CONT': 'mlperf-nvidia:minigo' }, MASK: {}, GNMT: { 'DATADIR': '/data/gnmt' }, SSD: { 'DATADIR': '/data' }, BERT: {} } env.update(envs[benchmark_spec.benchmark]) run_script = posixpath.join(run_path, 'run_with_docker.sh') vm_util.ReplaceText(vm, 'SYSLOGGING=1', 'SYSLOGGING=0', run_script) vm_util.ReplaceText(vm, 'docker exec -it', 'docker exec -t', run_script) vm_util.ReplaceText(vm, 'nvidia-docker', 'sudo nvidia-docker', run_script) vm_util.ReplaceText(vm, 'docker exec', 'sudo docker exec', run_script) vm_util.ReplaceText(vm, 'docker container', 'sudo docker container', run_script) if benchmark_spec.benchmark == MASK: vm_util.ReplaceText( vm, r'_cont_mounts=\(', r'_cont_mounts=\(\"--volume=\${PKLDIR}:\/pkl_coco\" ', run_script) env = ' '.join(f'{key}={value}' for key, value in env.items()) if nvidia_driver.CheckNvidiaGpuExists(vm): env = f'{tensorflow.GetEnvironmentVars(vm)} {env}' mlperf_benchmark_cmd = (f'chmod 755 {run_script} && ' f'cd {run_path} && ' f'{env} {run_script}') samples = [] metadata = _CreateMetadataDict(benchmark_spec) stdout, _ = vm.RobustRemoteCommand(mlperf_benchmark_cmd, should_log=True) if NONE in FLAGS.mlperf_profiler: samples.extend( MakeSamplesFromOutput(metadata, stdout, use_tpu=bool(benchmark_spec.tpus), model=benchmark_spec.benchmark)) return samples
def PrepareRunner(benchmark_spec, vm=None):
  """Install and set up MLPerf on the target vm.

  Args:
    benchmark_spec: The benchmark specification
    vm: The VM to work on

  Raises:
    errors.Config.InvalidValue: if both GPUs and TPUs are present in the config
  """
  vm = vm or benchmark_spec.vms[0]
  if benchmark_spec.tpus:
    if vm == benchmark_spec.vms[0]:
      storage_service = gcs.GoogleCloudStorageService()
      benchmark_spec.storage_service = storage_service
      if FLAGS.mlperf_bucket:
        bucket = FLAGS.mlperf_bucket
        benchmark_spec.model_dir = f'gs://{bucket}/pkb-{FLAGS.run_uri}'
      else:
        bucket = f'pkb-{FLAGS.run_uri}'
        benchmark_spec.model_dir = f'gs://{bucket}'
      benchmark_spec.bucket = bucket
      location = benchmark_spec.tpu_groups['train'].GetZone()
      storage_service.PrepareService(util.GetRegionFromZone(location))
      storage_service.MakeBucket(bucket)
      storage_service.AclBucket(benchmark_spec.gcp_service_account,
                                gcs.WRITER, bucket)

    # For MLPerf 1.0, the benchmark code differs between hardware types.
    if (benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-32' or
        benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-128' or
        benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-256' or
        benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-512' or
        benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-1024' or
        benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-2048'):
      run_path = (
          '$HOME/training_results_{version}/Google/benchmarks/{model}/tpu-{tpus}'
          .format(
              version=MLPERF_VERSION,
              model=benchmark_spec.benchmark,
              tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType()))
    else:
      raise ValueError(
          'MLPerf configurations do not support the hardware in PKB. PKB may '
          'need to be updated if this is a new TPU type.')

    if MASK in benchmark_spec.benchmark:
      model = 'mask_rcnn'
    elif GNMT in benchmark_spec.benchmark:
      model = 'nmt'
    else:
      model = benchmark_spec.benchmark

    code_path = (
        '$HOME/training_results_{version}/Google/benchmarks/{model}/implementations/tpu-{tpus}-{model}'
        .format(
            version=MLPERF_VERSION,
            model=benchmark_spec.benchmark,
            tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType()))

    vm.RemoteCommand('pip3 install --upgrade pyyaml==3.13 ')
    vm.RemoteCommand('pip3 install cloud-tpu-profiler==1.12')
    if (MASK in benchmark_spec.benchmark or SSD in benchmark_spec.benchmark):
      # Install the coco package, to load the coco dataset for Mask-RCNN
      # and SSD benchmarks.
      # TODO(user): coco whl package for python 3.5
      vm.RemoteCommand(
          'cd /tmp && '
          f'wget https://storage.cloud.google.com/mlperf_artifcats/{MLPERF_VERSION}_training/coco-1.1-cp36-cp36m-linux_x86_64.whl'
      )

    setup_script = posixpath.join(run_path, 'setup.sh')
    vm_util.ReplaceText(vm, '--progress-bar off', ' ', setup_script)
    vm_util.ReplaceText(vm, 'pip ', 'pip3 ', setup_script)
    vm.RemoteCommand(
        'chmod 755 {script} && {script}'.format(script=setup_script))

    if MASK not in benchmark_spec.benchmark:
      vm.RemoteCommand(
          'pip3 uninstall -y tf-estimator-nightly && '
          'pip3 install tf-estimator-nightly==1.14.0.dev2019051801')

    if RESNET in benchmark_spec.benchmark:
      data_dir = benchmark_spec.imagenet_data_dir
    elif TRANSFORMER in benchmark_spec.benchmark:
      data_dir = benchmark_spec.wmt_data_dir
    elif MASK in benchmark_spec.benchmark:
      data_dir = benchmark_spec.coco_data_dir
    elif GNMT in benchmark_spec.benchmark:
      data_dir = benchmark_spec.gnmt_data_dir
    elif SSD in benchmark_spec.benchmark:
      data_dir = benchmark_spec.coco_data_dir
    elif BERT in benchmark_spec.benchmark:
      data_dir = benchmark_spec.bert_data_dir
    else:
      raise ValueError('Unknown operation, cannot find {} in benchmark'.format(
          benchmark_spec.benchmark))

    run_script = posixpath.join(run_path, 'run_and_time.sh')
    data_dir = data_dir.replace('/', r'\/')
    checkpoint = FLAGS.mlperf_gcs_resnet_checkpoint.replace('/', r'\/')
    decode_dir = FLAGS.mlperf_transformer_decode_dir.replace('/', r'\/')
    tpu = benchmark_spec.tpu_groups['train'].GetName()
    vm_util.ReplaceText(vm, '--model_dir=.*',
                        r'--model_dir=gs:\/\/{} \\\\'.format(bucket),
                        run_script)
    vm_util.ReplaceText(vm, '--data_dir=.*',
                        r'--data_dir={} \\\\'.format(data_dir), run_script)
    vm_util.ReplaceText(
        vm, '--training_file_pattern=.*',
        r'--training_file_pattern={}\/train-* \\\\'.format(data_dir),
        run_script)
    vm_util.ReplaceText(
        vm, '--validation_file_pattern=.*',
        r'--validation_file_pattern={}\/val-* \\\\'.format(data_dir),
        run_script)
    vm_util.ReplaceText(
        vm, '--val_json_file=.*',
        r'--val_json_file={}\/instances_val2017.json \\\\'.format(data_dir),
        run_script)
    vm_util.ReplaceText(vm, '--resnet_checkpoint=.*',
                        r'--resnet_checkpoint={} \\\\'.format(checkpoint),
                        run_script)
    vm_util.ReplaceText(
        vm, '--decode_from_file=.*',
        r'--decode_from_file={}\/wmt14-en-de.src \\\\'.format(decode_dir),
        run_script)
    vm_util.ReplaceText(
        vm, '--decode_reference=.*',
        r'--decode_reference={}\/wmt14-en-de.ref \\\\'.format(decode_dir),
        run_script)
    vm_util.ReplaceText(
        vm, '--decode_to_file=.*',
        r'--decode_to_file={}\/decode.transformer_mlperf_tpu.'
        r'translate_ende_wmt32k_packed.2x2_log_1018_2 \\\\'.format(bucket),
        run_script)
    vm_util.ReplaceText(vm, '--tpu=.*', r'--tpu={} \\\\'.format(tpu),
                        run_script)
    vm_util.ReplaceText(vm, '--output_dir=.*',
                        r'--output_dir=gs:\/\/{} \\\\'.format(bucket),
                        run_script)
    vm_util.ReplaceText(vm, '--cloud_tpu_name=.*',
                        r'--cloud_tpu_name={} \\\\'.format(tpu), run_script)
    vm_util.ReplaceText(vm, '--out_dir=.*',
                        r'--out_dir=gs:\/\/{} \\\\'.format(bucket), run_script)
    vm_util.ReplaceText(vm, '--tpu_name=.*', r'--tpu_name={} \\\\'.format(tpu),
                        run_script)
    vm.RemoteCommand('chmod 755 {}'.format(run_script))

    if GNMT in benchmark_spec.benchmark:
      metric_script = posixpath.join(code_path, model, 'metric.py')
      vm_util.ReplaceText(vm, ' sacrebleu -t', ' python3 -m sacrebleu -t',
                          metric_script)
  else:
    benchmark_spec.model_dir = '/tmp'

    has_gpu = nvidia_driver.CheckNvidiaGpuExists(vm)
    if has_gpu:
      vm.Install('cuda_toolkit')

    vm.Install('nvidia_docker')
    vm.RemoteCommand('if [ ! -d "/data" ]; then sudo ln -s /scratch /data; fi')

    if RESNET in benchmark_spec.benchmark:
      run_script = (
          f'training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/resnet/'
          'implementations/mxnet/run_and_time.sh')
      vm.RemoteCommand(
          f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/resnet/implementations/mxnet &&'
          ' sudo docker build --network=host . -t mlperf-nvidia:image_classification',
          should_log=True)
      _DownloadData(benchmark_spec.imagenet_data_dir,
                    posixpath.join('/data', 'imagenet'), vm)

    if TRANSFORMER in benchmark_spec.benchmark:
      vm.RemoteCommand(
          f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/transformer/implementations/pytorch &&'
          ' sudo docker build --network=host . -t mlperf-nvidia:translation',
          should_log=True)
      _DownloadData(benchmark_spec.wmt_data_dir, posixpath.join('/data', 'wmt'),
                    vm)

    if MINIGO in benchmark_spec.benchmark:
      build_path = (
          f'training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/minigo/'
          'implementations/tensorflow')
      run_script = posixpath.join(build_path, 'run_and_time.sh')
      vm_util.ReplaceText(
          vm, 'get_data.py', 'get_data.py --src_dir={}'.format(
              FLAGS.minigo_model_dir.replace('/', r'\/')), run_script)
      vm.RemoteCommand(
          'cd {} && sudo docker build --network=host -t '
          'mlperf-nvidia:minigo .'.format(build_path),
          should_log=True)

    if MASK in benchmark_spec.benchmark:
      vm.RemoteCommand(
          f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/maskrcnn/implementations/pytorch && '
          'sudo docker build --network=host -t mlperf-nvidia:object_detection . ',
          should_log=True)
      _DownloadData(benchmark_spec.coco_data_dir,
                    posixpath.join('/data', 'coco2017'), vm)

    if GNMT in benchmark_spec.benchmark:
      vm.RemoteCommand(
          f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/gnmt/implementations/pytorch && '
          'sudo docker build --network=host -t mlperf-nvidia:rnn_translator . ',
          should_log=True)
      _DownloadData(benchmark_spec.gnmt_data_dir,
                    posixpath.join('/data', 'gnmt'), vm)

    if SSD in benchmark_spec.benchmark:
      vm.RemoteCommand(
          f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/ssd/implementations/pytorch && '
          'sudo docker build --network=host -t mlperf-nvidia:single_stage_detector . ',
          should_log=True)
      _DownloadData(benchmark_spec.coco_data_dir,
                    posixpath.join('/data', 'coco2017'), vm)

    if BERT in benchmark_spec.benchmark:
      vm.RemoteCommand(
          f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/bert/implementations/pytorch && '
          'sudo docker build --network=host -t mlperf-nvidia:language_model . ',
          should_log=True)
      _DownloadData(benchmark_spec.bert_data_dir,
                    posixpath.join('/data', 'bert_data'), vm)

def Run(benchmark_spec): """Run MNIST on the cluster. Args: benchmark_spec: The benchmark specification. Contains all data that is required to run the benchmark. Returns: A list of sample.Sample objects. """ _UpdateBenchmarkSpecWithFlags(benchmark_spec) vm = benchmark_spec.vms[0] if benchmark_spec.tpus: mnist_benchmark_script = 'mnist_tpu.py' mnist_benchmark_cmd = ('cd tpu/models && ' 'export PYTHONPATH=$(pwd) && ' 'cd official/mnist && ' 'python {script} ' '--data_dir={data_dir} ' '--iterations={iterations} ' '--model_dir={model_dir} ' '--batch_size={batch_size}'.format( script=mnist_benchmark_script, data_dir=benchmark_spec.data_dir, iterations=benchmark_spec.iterations, model_dir=benchmark_spec.model_dir, batch_size=benchmark_spec.batch_size)) else: mnist_benchmark_script = 'mnist.py' mnist_benchmark_cmd = ('cd models && ' 'export PYTHONPATH=$(pwd) && ' 'cd official/mnist && ' 'python {script} ' '--data_dir={data_dir} ' '--model_dir={model_dir} ' '--batch_size={batch_size} '.format( script=mnist_benchmark_script, data_dir=benchmark_spec.data_dir, model_dir=benchmark_spec.model_dir, batch_size=benchmark_spec.batch_size)) if nvidia_driver.CheckNvidiaGpuExists(vm): mnist_benchmark_cmd = '{env} {cmd}'.format( env=tensorflow.GetEnvironmentVars(vm), cmd=mnist_benchmark_cmd) samples = [] metadata = CreateMetadataDict(benchmark_spec) if benchmark_spec.train_steps > 0: if benchmark_spec.tpus: tpu = benchmark_spec.tpu_groups['train'].GetName() num_shards = '--num_shards={}'.format( benchmark_spec.tpu_groups['train'].GetNumShards()) else: tpu = num_shards = '' if benchmark_spec.tpus: mnist_benchmark_train_cmd = ( '{cmd} --tpu={tpu} --use_tpu={use_tpu} --train_steps={train_steps} ' '{num_shards} --noenable_predict'.format( cmd=mnist_benchmark_cmd, tpu=tpu, use_tpu=bool(benchmark_spec.tpus), train_steps=benchmark_spec.train_steps, num_shards=num_shards)) else: mnist_benchmark_train_cmd = ( '{cmd} --train_epochs={train_epochs} '.format( cmd=mnist_benchmark_cmd, train_epochs=benchmark_spec.train_epochs)) start = time.time() stdout, stderr = vm.RobustRemoteCommand(mnist_benchmark_train_cmd, should_log=True) elapsed_seconds = (time.time() - start) samples.extend( MakeSamplesFromTrainOutput(metadata, stdout + stderr, elapsed_seconds, benchmark_spec.train_steps)) if benchmark_spec.eval_steps > 0: if benchmark_spec.tpus: mnist_benchmark_eval_cmd = ( '{cmd} --tpu={tpu} --use_tpu={use_tpu} --eval_steps={eval_steps}' .format(cmd=mnist_benchmark_cmd, use_tpu=bool(benchmark_spec.tpus), tpu=benchmark_spec.tpu_groups['eval'].GetName(), eval_steps=benchmark_spec.eval_steps)) else: mnist_benchmark_eval_cmd = ( '{cmd} --eval_steps={eval_steps}'.format( cmd=mnist_benchmark_cmd, eval_steps=benchmark_spec.eval_steps)) stdout, stderr = vm.RobustRemoteCommand(mnist_benchmark_eval_cmd, should_log=True) samples.extend( MakeSamplesFromEvalOutput(metadata, stdout + stderr, elapsed_seconds)) return samples
def Run(benchmark_spec): """Run MLPerf on the cluster. Args: benchmark_spec: The benchmark specification. Contains all data that is required to run the benchmark. Returns: A list of sample.Sample objects. """ _UpdateBenchmarkSpecWithFlags(benchmark_spec) vm = benchmark_spec.vms[0] if benchmark_spec.tpus: # For MLPerf v0.6, the benchmake code of different hardware are different. if (benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-32' or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-128' or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-256' or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-512' or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-1024' or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-2048'): run_path = ( '$HOME/training_results_v0.6/Google/benchmarks/{model}/tpu-{tpus}' .format( model=benchmark_spec.benchmark, tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType())) code_path = ( '$HOME/training_results_v0.6/Google/benchmarks/{model}/implementations/tpu-{tpus}-{model}' .format( model=benchmark_spec.benchmark, tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType())) if 'mask' in benchmark_spec.benchmark: model = 'mask_rcnn' elif 'gnmt' in benchmark_spec.benchmark: model = 'nmt' else: model = benchmark_spec.benchmark mlperf_benchmark_cmd = ('cd {code_path} && ' 'export PYTHONPATH=$(pwd):$(pwd)/{model} && ' 'cd {model} && ' '{run_path}/run_and_time.sh'.format( code_path=code_path, model=model, run_path=run_path)) if 'ssd' in benchmark_spec.benchmark: mlperf_benchmark_cmd = ( 'export ' 'MLP_GCS_RESNET_CHECKPOINT={checkpoint}' ' && {cmd}'.format( checkpoint=FLAGS.mlperf_gcs_resnet_checkpoint, cmd=mlperf_benchmark_cmd)) else: raise ValueError( 'MLPerf configurations do not support the hardware in PKB. 
PKB may ' 'need to be updated if this is a new TPU type.') else: benchmark_path = '$HOME/training_results_v0.6/NVIDIA/benchmarks' common_env = 'DGXSYSTEM=DGX1 NEXP=1' if 'resnet' in benchmark_spec.benchmark: run_path = posixpath.join(benchmark_path, 'resnet/implementations/mxnet') env = 'DATADIR=/data/imagenet LOGDIR=/tmp/resnet PULL=0' elif 'transformer' in benchmark_spec.benchmark: run_path = posixpath.join(benchmark_path, 'transformer/implementations/pytorch') env = 'DATADIR=/data/wmt/utf8 LOGDIR=/tmp/transformer PULL=0' elif 'minigo' in benchmark_spec.benchmark: run_path = posixpath.join(benchmark_path, 'minigo/implementations/tensorflow') env = 'LOGDIR=/tmp/minigo CONT=mlperf-nvidia:minigo' elif 'mask' in benchmark_spec.benchmark: run_path = posixpath.join(benchmark_path, 'maskrcnn/implementations/pytorch') env = 'LOGDIR=/tmp/mask DATADIR=/data PULL=0' elif 'gnmt' in benchmark_spec.benchmark: run_path = posixpath.join(benchmark_path, 'gnmt/implementations/pytorch') env = 'LOGDIR=/tmp/gnmt DATADIR=/data/gnmt PULL=0' elif 'ssd' in benchmark_spec.benchmark: run_path = posixpath.join(benchmark_path, 'ssd/implementations/pytorch') env = 'LOGDIR=/tmp/ssd DATADIR=/data PULL=0' run_script = posixpath.join(run_path, 'run.sub') vm_util.ReplaceText(vm, 'SYSLOGGING=1', 'SYSLOGGING=0', run_script) mlperf_benchmark_cmd = ( 'cd {run_path} && chmod 755 run.sub && sudo {common_env} {env} ' './run.sub'.format(run_path=run_path, common_env=common_env, env=env)) if nvidia_driver.CheckNvidiaGpuExists(vm): mlperf_benchmark_cmd = '{env} {cmd}'.format( env=tensorflow.GetEnvironmentVars(vm), cmd=mlperf_benchmark_cmd) samples = [] metadata = _CreateMetadataDict(benchmark_spec) stdout, _ = vm.RobustRemoteCommand(mlperf_benchmark_cmd, should_log=True) samples.extend( MakeSamplesFromOutput( metadata, stdout, use_tpu=bool(benchmark_spec.tpus), model=benchmark_spec.benchmark)) return samples
def Run(benchmark_spec): """Run ResNet on the cluster. Args: benchmark_spec: The benchmark specification. Contains all data that is required to run the benchmark. Returns: A list of sample.Sample objects. """ _UpdateBenchmarkSpecWithFlags(benchmark_spec) vm = benchmark_spec.vms[0] if benchmark_spec.tpus: resnet_benchmark_script = 'resnet_main.py' resnet_benchmark_cmd = ( '{env_cmd} && ' 'cd tpu/models && ' 'export PYTHONPATH=$(pwd) &&' 'cd official/resnet && ' 'python {script} ' '--use_tpu={use_tpu} ' '--data_dir={data_dir} ' '--model_dir={model_dir} ' '--resnet_depth={depth} ' '--train_batch_size={train_batch_size} ' '--eval_batch_size={eval_batch_size} ' '--iterations_per_loop={iterations} ' '--data_format={data_format} ' '--precision={precision} ' '--skip_host_call={skip_host_call} ' '--num_train_images={num_train_images} ' '--num_eval_images={num_eval_images}'.format( env_cmd=benchmark_spec.env_cmd, script=resnet_benchmark_script, use_tpu=bool(benchmark_spec.tpus), data_dir=benchmark_spec.data_dir, model_dir=benchmark_spec.model_dir, depth=benchmark_spec.depth, train_batch_size=benchmark_spec.train_batch_size, eval_batch_size=benchmark_spec.eval_batch_size, iterations=benchmark_spec.iterations, data_format=benchmark_spec.data_format, precision=benchmark_spec.precision, skip_host_call=benchmark_spec.skip_host_call, num_train_images=benchmark_spec.num_train_images, num_eval_images=benchmark_spec.num_eval_images)) else: resnet_benchmark_script = 'imagenet_main.py' resnet_benchmark_cmd = ('{env_cmd} && ' 'cd models && ' 'export PYTHONPATH=$(pwd) && ' 'cd official/r1/resnet && ' 'python {script} ' '--data_dir=/data/imagenet ' '--model_dir={model_dir} ' '--resnet_size={resnet_size} ' '--batch_size={batch_size} ' '--data_format={data_format} '.format( env_cmd=benchmark_spec.env_cmd, script=resnet_benchmark_script, model_dir=benchmark_spec.model_dir, resnet_size=benchmark_spec.depth, batch_size=benchmark_spec.train_batch_size, data_format=benchmark_spec.data_format)) precision = '{precision}'.format(precision=benchmark_spec.precision) if precision == 'bfloat16': resnet_benchmark_cmd = '{cmd} --dtype=fp16'.format( cmd=resnet_benchmark_cmd) else: resnet_benchmark_cmd = '{cmd} --dtype=fp32'.format( cmd=resnet_benchmark_cmd) if nvidia_driver.CheckNvidiaGpuExists(vm): resnet_benchmark_cmd = '{env} {cmd} --num_gpus={num_gpus}'.format( env=tensorflow.GetEnvironmentVars(vm), cmd=resnet_benchmark_cmd, num_gpus=nvidia_driver.QueryNumberOfGpus(vm)) samples = [] metadata = _CreateMetadataDict(benchmark_spec) elapsed_seconds = 0 steps_per_eval = benchmark_spec.steps_per_eval train_steps = benchmark_spec.train_steps for step in range(steps_per_eval, train_steps + steps_per_eval, steps_per_eval): step = min(step, train_steps) resnet_benchmark_cmd_step = '{cmd} --train_steps={step}'.format( cmd=resnet_benchmark_cmd, step=step) if benchmark_spec.mode in ('train', 'train_and_eval'): if benchmark_spec.tpus: tpu = benchmark_spec.tpu_groups['train'].GetName() num_cores = '--num_cores={}'.format( benchmark_spec.tpu_groups['train'].GetNumShards()) resnet_benchmark_train_cmd = ( '{cmd} --tpu={tpu} --mode=train {num_cores}'.format( cmd=resnet_benchmark_cmd_step, tpu=tpu, num_cores=num_cores)) else: resnet_benchmark_train_cmd = ( '{cmd} --max_train_steps={max_train_steps} ' '--train_epochs={train_epochs} --noeval_only'.format( cmd=resnet_benchmark_cmd, train_epochs=benchmark_spec.epochs_per_eval, max_train_steps=step)) start = time.time() stdout, stderr = vm.RobustRemoteCommand(resnet_benchmark_train_cmd, 
should_log=True) elapsed_seconds += (time.time() - start) samples.extend(mnist_benchmark.MakeSamplesFromTrainOutput( metadata, stdout + stderr, elapsed_seconds, step)) if benchmark_spec.mode in ('train_and_eval', 'eval'): if benchmark_spec.tpus: tpu = benchmark_spec.tpu_groups['eval'].GetName() num_cores = '--num_cores={}'.format( benchmark_spec.tpu_groups['eval'].GetNumShards()) resnet_benchmark_eval_cmd = ( '{cmd} --tpu={tpu} --mode=eval {num_cores}'.format( cmd=resnet_benchmark_cmd_step, tpu=tpu, num_cores=num_cores)) else: resnet_benchmark_eval_cmd = ('{cmd} --eval_only'.format( cmd=resnet_benchmark_cmd)) stdout, stderr = vm.RobustRemoteCommand(resnet_benchmark_eval_cmd, should_log=True) samples.extend( MakeSamplesFromEvalOutput( metadata, stdout + stderr, elapsed_seconds, use_tpu=bool(benchmark_spec.tpus))) return samples