def _UpdateBenchmarkSpecWithFlags(benchmark_spec):
  """Update the benchmark_spec with supplied command line flags.

  Queries the first VM for its GPU count, derives the cluster-wide GPU total
  from the number of VMs, and copies the Horovod and NCCL flag values onto the
  benchmark spec.

  Args:
    benchmark_spec: benchmark specification to update
  """
  gpus_per_node = nvidia_driver.QueryNumberOfGpus(benchmark_spec.vms[0])
  num_vms = len(benchmark_spec.vms)
  total_gpus = gpus_per_node * num_vms

  benchmark_spec.gpus_per_node = gpus_per_node
  benchmark_spec.num_vms = num_vms
  benchmark_spec.total_gpus = total_gpus
  benchmark_spec.model = FLAGS.horovod_model
  benchmark_spec.batch_size = FLAGS.horovod_batch_size
  benchmark_spec.num_steps = FLAGS.horovod_num_steps
  benchmark_spec.precision = FLAGS.horovod_precision
  benchmark_spec.max_seq_len = int(FLAGS.horovod_max_seq_len)
  benchmark_spec.bert_finetune = FLAGS.horovod_bert_finetune
  # The flag is named horovod_timeline; the former spelling
  # "horovod_timelime" did not match the name used when running the benchmark.
  benchmark_spec.timeline = FLAGS.horovod_timeline
  benchmark_spec.synthetic = FLAGS.horovod_synthetic
  benchmark_spec.cuda_visible_devices = FLAGS.nccl_cuda_visible_devices
  benchmark_spec.nccl_version = FLAGS.nccl_version
  benchmark_spec.nccl_net_plugin = FLAGS.nccl_net_plugin
  benchmark_spec.nccl_extra_params = FLAGS.nccl_extra_params
def _UpdateBenchmarkSpecWithFlags(benchmark_spec):
  """Populate the benchmark_spec from command line flags and VM state.

  Records GPU/VM counts and zones, delegates the remaining flag handling to
  mlperf_benchmark, attaches a GCS storage service, and derives the bucket and
  model directory from the mlperf_bucket flag.

  Args:
    benchmark_spec: benchmark specification to update
  """
  worker_vms = benchmark_spec.vms
  gpus_on_first_vm = nvidia_driver.QueryNumberOfGpus(worker_vms[0])
  vm_count = len(worker_vms)

  benchmark_spec.gpus_per_vm = gpus_on_first_vm
  benchmark_spec.num_vms = vm_count
  benchmark_spec.total_num_gpus = gpus_on_first_vm * vm_count
  benchmark_spec.zones = FLAGS.zones

  # pylint: disable=protected-access
  mlperf_benchmark._UpdateBenchmarkSpecWithFlags(benchmark_spec)
  # pylint: enable=protected-access

  benchmark_spec.storage_service = gcs.GoogleCloudStorageService()

  bucket = FLAGS.mlperf_bucket
  if not bucket:
    # Without a bucket there is no remote model directory either.
    benchmark_spec.bucket = None
    benchmark_spec.model_dir = None
    return

  benchmark_spec.bucket = bucket
  benchmark_spec.model_dir = 'gs://{bucket}/pkb-{uri}'.format(
      bucket=bucket, uri=FLAGS.run_uri)
def _CreateMetadataDict(benchmark_spec):
  """Build the metadata dict attached to run results.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
      required to run the benchmark.

  Returns:
    metadata dict
  """
  metadata = dict(
      use_tpu=bool(benchmark_spec.tpus),
      model_dir=benchmark_spec.model_dir,
      model=benchmark_spec.benchmark,
      version=MLPERF_VERSION,
  )

  all_vms = benchmark_spec.vms
  first_vm = all_vms[0]
  # GPU count is sampled on the first VM and scaled by the number of VMs.
  gpu_total = nvidia_driver.QueryNumberOfGpus(first_vm) * len(all_vms)
  metadata.update(cuda_toolkit.GetMetadata(first_vm))
  metadata['total_gpus'] = gpu_total

  if not benchmark_spec.tpus:
    return metadata

  train_group = benchmark_spec.tpu_groups['train']
  metadata['train_tpu_num_shards'] = train_group.GetNumShards()
  metadata['train_tpu_accelerator_type'] = train_group.GetAcceleratorType()
  return metadata
def _CreateMetadataDict(vms):
  """Build the metadata dict attached to run results.

  Args:
    vms: A list of worker VMs.

  Returns:
    metadata dict
  """
  first_vm = vms[0]
  node_count = len(vms)
  gpu_total = nvidia_driver.QueryNumberOfGpus(first_vm) * node_count

  # Start from the CUDA toolkit metadata, then layer benchmark settings on top
  # so that they win on any key collision.
  metadata = {}
  metadata.update(cuda_toolkit.GetMetadata(first_vm))
  metadata.update({
      'benchmark_version': BENCHMARK_VERSION,
      'num_nodes': node_count,
      'total_gpus': int(gpu_total),
      'model': FLAGS.horovod_model,
      'batch_size': FLAGS.horovod_batch_size,
      'num_steps': FLAGS.horovod_num_steps,
      'synthetic': FLAGS.horovod_synthetic,
      'precision': FLAGS.horovod_precision,
      'max_seq_len': int(FLAGS.horovod_max_seq_len),
      'nccl_version': FLAGS.nccl_version,
      'nccl_net_plugin': FLAGS.nccl_net_plugin,
      'cuda_visible_devices': FLAGS.nccl_cuda_visible_devices,
      'nccl_extra_params': FLAGS.nccl_extra_params,
  })
  return metadata
def _UpdateBenchmarkSpecWithFlags(benchmark_spec):
  """Copy RoBERTa mmlm flag values and derived sizes onto the benchmark_spec.

  When the update-frequency or num-copies flags are unset (falsy), defaults
  are derived from the global batch size and the total accelerator count.

  Args:
    benchmark_spec: benchmark specification to update
  """
  direct_settings = (
      ('max_sentences', FLAGS.robertammlm_max_sentences),
      ('nproc_per_node', FLAGS.robertammlm_nproc_per_node),
      ('log_interval', FLAGS.robertammlm_log_interval),
      ('profiler', FLAGS.robertammlm_profiler),
      ('max_epoch', FLAGS.robertammlm_max_epoch),
  )
  for attr_name, flag_value in direct_settings:
    setattr(benchmark_spec, attr_name, flag_value)

  worker_vms = benchmark_spec.vms
  vm_count = len(worker_vms)
  benchmark_spec.num_vms = vm_count
  benchmark_spec.global_batch_size = FLAGS.robertammlm_global_batch_size

  # GPU count is sampled on the first VM and scaled by the number of VMs.
  accelerator_count = nvidia_driver.QueryNumberOfGpus(
      worker_vms[0]) * vm_count
  benchmark_spec.num_accelerators = accelerator_count

  update_freq = FLAGS.robertammlm_update_freq
  if not update_freq:
    per_step_sentences = benchmark_spec.max_sentences * accelerator_count
    update_freq = benchmark_spec.global_batch_size // per_step_sentences
  benchmark_spec.update_freq = update_freq

  num_copies = FLAGS.robertammlm_num_copies
  if not num_copies:
    num_copies = max(1, accelerator_count // 32)
  benchmark_spec.num_copies = num_copies
def Install(vm):
  """Installs XGBoost on the VM.

  Clones the XGBoost repository, builds it with cmake/make and installs the
  Python package. CUDA support is enabled only when an NVIDIA GPU is detected
  on the VM, and NCCL support only when more than one GPU is present.

  Args:
    vm: The virtual machine to install XGBoost on.
  """
  vm.Install('build_tools')
  install_dir = posixpath.join(linux_packages.INSTALL_DIR, 'xgboost')
  vm.RemoteCommand(
      f'git clone --recursive https://github.com/dmlc/xgboost {install_dir}')

  cuda_env = ''
  cuda_make_option = ''
  cuda_install_option = ''
  nccl_make_option = ''
  nccl_install_option = ''
  # CheckNvidiaGpuExists must be *called*; testing the bare function object is
  # always truthy and would enable CUDA on every VM.
  if nvidia_driver.CheckNvidiaGpuExists(vm):
    cuda_make_option = '-DUSE_CUDA=ON'
    cuda_env = 'CUDACXX=/usr/local/cuda/bin/nvcc'
    cuda_install_option = '--use-cuda'
    # Only query the GPU count once a GPU is known to exist.
    # NOTE(review): presumably the count query needs the NVIDIA driver tools
    # to be present -- confirm against nvidia_driver.QueryNumberOfGpus.
    if nvidia_driver.QueryNumberOfGpus(vm) > 1:
      nccl_make_option = '-DUSE_NCCL=ON -DNCCL_ROOT=/usr/local/nccl2'
      nccl_install_option = '--use-nccl'

  build_dir = posixpath.join(install_dir, 'build')
  package_dir = posixpath.join(install_dir, 'python-package')
  vm.RemoteCommand(f'mkdir -p {build_dir}')
  vm.RemoteCommand(f'cd {build_dir} && '
                   f'{cuda_env} cmake .. {cuda_make_option} {nccl_make_option}')
  vm.RemoteCommand(f'cd {build_dir} && make -j4')
  vm.RemoteCommand(f'cd {package_dir} && '
                   f'{_ENV.value} python3 setup.py install '
                   f'{cuda_install_option} {nccl_install_option}')
def _CollectGpuSamples(
    vm: virtual_machine.BaseVirtualMachine) -> List[sample.Sample]:
  """Run the CUDA bandwidth test on a VM and parse its CSV output.

  The test is run once per GPU index and, when the VM has more than one GPU,
  once more with device 'all'. Runs that exit non-zero are logged and skipped.

  Args:
    vm: The virtual machine to run the benchmark.

  Returns:
    A list of sample.Sample objects; empty when the VM has no NVIDIA GPU or
    nvidia-smi is not available.
  """
  gpu_ready = (
      nvidia_driver.CheckNvidiaGpuExists(vm) and
      nvidia_driver.CheckNvidiaSmiExists(vm))
  if not gpu_ready:
    return []

  shared_metadata = _MetadataFromFlags()
  shared_metadata.update(cuda_toolkit.GetMetadata(vm))

  base_args = [
      BANDWIDTH_TEST_PATH,
      '--csv',
      f'--memory={_MEMORY.value}',
      f'--mode={_MODE.value}',
  ]
  optional_switches = (
      (_HTOD, '--htod'),
      (_DTOH, '--dtoh'),
      (_DTOD, '--dtod'),
      (_WC, '--wc'),
  )
  base_args.extend(
      switch for flag_holder, switch in optional_switches if flag_holder.value)

  gpu_count = nvidia_driver.QueryNumberOfGpus(vm)
  targets = list(range(gpu_count))
  if gpu_count > 1:
    targets.append('all')

  line_pattern = (r'bandwidthTest-(\S+), '
                  r'Bandwidth = ([\d\.]+) (\S+), '
                  r'Time = ([\d\.]+) s, '
                  r'Size = (\d+) bytes, '
                  r'NumDevsUsed = (\d+)')

  collected = []
  for target in targets:
    command = ' '.join(base_args + [f'--device={target}'])
    stdout, stderr, exit_code = vm.RemoteCommandWithReturnCode(
        command, ignore_failure=True)
    if exit_code:
      logging.warning('Error with getting GPU stats: %s', stderr)
      continue

    matches = regex_util.ExtractAllMatches(line_pattern, stdout)
    for metric, bandwidth, unit, elapsed, size, devs_used in matches:
      # Per-row fields come first; shared metadata overrides on collision.
      row_metadata = {
          'time': float(elapsed),
          'size': int(size),
          'NumDevsUsed': devs_used,
          'device': target,
          'command': command,
      }
      row_metadata.update(shared_metadata)
      collected.append(
          sample.Sample(metric, float(bandwidth), unit, row_metadata))
  return collected
def Run(benchmark_spec):
  """Run MXNet on the cluster for each model specified.

  For every model in the mx_models flag, the model-specific settings are
  written to the benchmark spec, a train_imagenet.py command is assembled for
  the configured device, and the command is run on the first VM.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
      required to run the benchmark.

  Returns:
    A list with one entry per model, each produced by _MakeSamplesFromOutput.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vm = benchmark_spec.vms[0]
  working_dir = 'incubator-mxnet/example/image-classification'

  collected = []
  for network in FLAGS.mx_models:
    layer_count = _GetNumLayers(network)
    batch_size = _GetBatchSize(network, layer_count)
    benchmark_spec.model = network
    benchmark_spec.batch_size = batch_size
    benchmark_spec.num_layers = layer_count
    benchmark_spec.image_shape = _GetImageShape(network)

    train_cmd = ' '.join([
        'python train_imagenet.py',
        '--benchmark=1',
        '--network={}'.format(network),
        '--batch-size={}'.format(batch_size),
        '--image-shape={}'.format(benchmark_spec.image_shape),
        '--num-epochs={}'.format(benchmark_spec.num_epochs),
        '--dtype={}'.format(benchmark_spec.precision),
        '--kv-store={}'.format(benchmark_spec.key_value_store),
    ])

    if benchmark_spec.device == GPU:
      gpu_count = nvidia_driver.QueryNumberOfGpus(vm)
      env_vars = mxnet.GetEnvironmentVars(vm)
      gpu_ids = ','.join(str(index) for index in range(gpu_count))
      train_cmd = '{} {} --gpus {}'.format(env_vars, train_cmd, gpu_ids)
    elif benchmark_spec.device == CPU:
      # Specifies the number of threads to use in CPU test.
      # https://mxnet.incubator.apache.org/faq/perf.html
      thread_count = vm.NumCpusForBenchmark() // 2
      train_cmd = 'OMP_NUM_THREADS={} {}'.format(thread_count, train_cmd)

    if layer_count:
      train_cmd = '{} --num-layers {}'.format(train_cmd, layer_count)

    full_command = 'cd {} && {}'.format(working_dir, train_cmd)
    stdout, stderr = vm.RobustRemoteCommand(full_command, should_log=True)

    # Fall back to stderr when nothing was written to stdout.
    collected.append(_MakeSamplesFromOutput(benchmark_spec, stdout or stderr))

  return collected
def Prepare(benchmark_spec):
  """Install SHOC and push the machinefile.

  Every VM is installed/authenticated in parallel, the GPU count of the first
  VM is stored on the spec, and a machine file is created that uses that count
  for every VM.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
      required to run the benchmark.
  """
  all_vms = benchmark_spec.vms
  vm_util.RunThreaded(_InstallAndAuthenticateVm, all_vms)

  benchmark_spec.num_gpus = nvidia_driver.QueryNumberOfGpus(all_vms[0])

  def _SlotsForVm(unused_vm):
    """Returns the slot count for a VM: the first VM's GPU count."""
    return benchmark_spec.num_gpus

  hpc_util.CreateMachineFile(benchmark_spec.vms, _SlotsForVm, MACHINEFILE)
def Prepare(benchmark_spec):
  """Install and set up RoBERTa mmlm on the target vm.

  Updates the spec from flags, prepares every VM in parallel (one call per
  rank), and writes the host file when the first VM has an NVIDIA GPU.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
      required to run the benchmark.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  worker_vms = benchmark_spec.vms
  benchmark_spec.always_call_cleanup = True

  per_rank_args = []
  for rank in range(benchmark_spec.num_vms):
    per_rank_args.append(((benchmark_spec, rank), {}))
  vm_util.RunThreaded(_PrepareVm, per_rank_args)

  head_vm = worker_vms[0]
  if not nvidia_driver.CheckNvidiaGpuExists(head_vm):
    return

  # Every VM gets as many slots as the first VM has GPUs.
  gpus_per_vm = nvidia_driver.QueryNumberOfGpus(head_vm)
  hpc_util.CreateMachineFile(worker_vms, lambda _: gpus_per_vm, HOSTFILE)
def _UpdateBenchmarkSpecWithFlags(benchmark_spec):
  """Copy HPCG flag values and derived GPU/CPU counts onto the benchmark_spec.

  The per-node GPU count comes from the hpcg_gpus_per_node flag when it is set
  (truthy); otherwise it is queried from the first VM.

  Args:
    benchmark_spec: benchmark specification to update
  """
  all_vms = benchmark_spec.vms
  first_vm = all_vms[0]

  gpu_count = FLAGS.hpcg_gpus_per_node
  if not gpu_count:
    gpu_count = nvidia_driver.QueryNumberOfGpus(first_vm)

  # CPUs available to each rank, truncated to a whole number.
  cpus_for_each_rank = int(first_vm.NumCpusForBenchmark() / gpu_count)
  vm_count = len(benchmark_spec.vms)
  gpu_total = gpu_count * vm_count

  benchmark_spec.gpus_per_node = gpu_count
  benchmark_spec.cpus_per_rank = cpus_for_each_rank
  benchmark_spec.num_vms = vm_count
  benchmark_spec.total_gpus = gpu_total
  benchmark_spec.hpcg_problem_size = FLAGS.hpcg_problem_size
  benchmark_spec.hpcg_runtime = FLAGS.hpcg_runtime
  benchmark_spec.run_as_root = FLAGS.mpirun_allow_run_as_root
def _CreateMetadataDict(
    bm_spec: benchmark_spec.BenchmarkSpec) -> Dict[str, Any]:
  """Build the metadata dict attached to run results.

  Args:
    bm_spec: The benchmark specification. Contains all data that is required
      to run the benchmark.

  Returns:
    metadata dict
  """
  metadata = dict(
      model=FLAGS.mlperf_benchmark,
      version=MLPERF_INFERENCE_VERSION,
  )

  all_vms = bm_spec.vms
  first_vm = all_vms[0]
  # GPU count is sampled on the first VM and scaled by the number of VMs.
  gpu_total = nvidia_driver.QueryNumberOfGpus(first_vm) * len(all_vms)

  metadata.update(cuda_toolkit.GetMetadata(first_vm))
  metadata['total_gpus'] = gpu_total
  return metadata
def _GetTfCnnBenchmarkCommand(vm, model, batch_size, benchmark_spec, args='',
                              job_name=''):
  """Create the command used to run the tf_cnn_benchmarks script.

  The command is either formulated using flag values stored on the
  benchmark_spec, or is essentially provided outright through the
  benchmark_args flag. As a side effect, num_gpus (and, on the flag-driven
  path, local_parameter_device, device and data_format) are written to the
  benchmark_spec.

  Args:
    vm: the VM to run on.
    model: name of the model to run.
    batch_size: batch size to use for training.
    benchmark_spec: the benchmark spec object.
    args: string, distributed arguments
    job_name: string, distributed job name

  Returns:
    A string that runs the tf_cnn_benchmarks.py script with the desired
    arguments.
  """
  gpu_count = 0
  if nvidia_driver.CheckNvidiaGpuExists(vm):
    gpu_count = nvidia_driver.QueryNumberOfGpus(vm)
  benchmark_spec.num_gpus = gpu_count

  user_args = benchmark_spec.benchmark_args
  if user_args is not None:
    command = 'python tf_cnn_benchmarks.py ' + user_args
    # If the user didn't specify num_gpus in the benchmark_args string,
    # use all the GPUs on the system.
    if '--num_gpus' not in user_args and gpu_count:
      command = '{cmd} --num_gpus={num_gpus}'.format(
          cmd=command, num_gpus=gpu_count)
    return command

  benchmark_spec.local_parameter_device = FLAGS.tf_local_parameter_device
  benchmark_spec.device = FLAGS.tf_device
  benchmark_spec.data_format = FLAGS.tf_data_format
  if gpu_count == 0:
    # Without GPUs everything runs on the CPU.
    benchmark_spec.local_parameter_device = CPU
    benchmark_spec.device = CPU
    benchmark_spec.data_format = NHWC

  env_vars = tensorflow.GetEnvironmentVars(vm)

  data_arg = ''
  if benchmark_spec.data_dir:
    data_arg = '--data_dir={}'.format(benchmark_spec.data_dir)

  gpu_arg = ''
  if gpu_count:
    gpu_arg = '--num_gpus={}'.format(gpu_count)

  job_arg = ''
  if args:
    job_arg = '--job_name={0} {1}'.format(job_name, args)

  # Empty optional pieces are kept in the list on purpose: each still
  # contributes its separating space to the joined command.
  pieces = [
      f'{env_vars}',
      'python tf_cnn_benchmarks.py',
      f'--local_parameter_device={benchmark_spec.local_parameter_device}',
      f'--batch_size={batch_size}',
      f'--model={model}',
      data_arg,
      f'--data_name={benchmark_spec.data_name}',
      f'--variable_update={benchmark_spec.variable_update}',
      f'--distortions={benchmark_spec.distortions}',
      f'--device={benchmark_spec.device}',
      f'--data_format={benchmark_spec.data_format}',
      f'--forward_only={benchmark_spec.forward_only}',
      f'--use_fp16={benchmark_spec.precision == FP16}',
      gpu_arg,
      job_arg,
  ]
  return ' '.join(pieces)
def testQueryNumberOfGpus(self):
  """QueryNumberOfGpus reports 8 for a stubbed 'count\\n8' command output."""
  stubbed_output = ('count\n8', None)
  fake_vm = mock.MagicMock()
  fake_vm.RemoteCommand = mock.MagicMock(return_value=stubbed_output)

  gpu_count = nvidia_driver.QueryNumberOfGpus(fake_vm)

  self.assertEqual(8, gpu_count)
def _FormatRunFlags(run_flags):
  """Renders a dict of script flags as a command line fragment.

  Args:
    run_flags: dict mapping flag name to value. A value of None produces a
      bare '--name' switch; any other value produces '--name=value'.

  Returns:
    The rendered flags joined by single spaces, ordered by flag name.
  """
  return ' '.join(
      '--{}'.format(key) if value is None else '--{}={}'.format(key, value)
      for key, value in sorted(run_flags.items()))


def RunWithVMs(vms, extra_envs=None):
  """Run Horovod on the cluster.

  Builds an mpirun command line for the model selected by the horovod_model
  flag (resnet-50, resnext-101, bert-*, or MaskRCNN for anything else), runs
  it on the first VM and parses the output into samples.

  Args:
    vms: A list of worker VMs.
    extra_envs: A dictionary of environment variables. Entries override the
      default NCCL/Horovod environment assembled here.

  Returns:
    A list of sample.Sample objects.

  Raises:
    NotImplementedError: if a BERT model is requested without the
      horovod_bert_finetune flag.
  """
  # Start every run from a clean model/results directory on all workers.
  vm_util.RunThreaded(lambda vm: vm.RemoteCommand('rm -rf /tmp/models'), vms)
  master_vm = vms[0]

  gpus_per_node = nvidia_driver.QueryNumberOfGpus(master_vm)
  num_vms = len(vms)
  total_gpus = gpus_per_node * num_vms

  # GCP should work out of the box with the deep learning image but the AWS
  # image requires us to use the correct Tensorflow Python environment.
  if FLAGS.cloud == 'AWS':
    master_vm.RobustRemoteCommand(
        '. anaconda3/bin/activate tensorflow_p37')
    python_interpreter = 'anaconda3/envs/tensorflow_p37/bin/python'
  else:
    python_interpreter = '/opt/conda/bin/python'

  nccl_params = {
      'TF_CPP_MIN_LOG_LEVEL': 0,
      'NCCL_SOCKET_IFNAME': '^lo,docker0',
      'NCCL_DEBUG': 'INFO',
  }

  if FLAGS.horovod_timeline:
    nccl_params['HOROVOD_TIMELINE_MARK_CYCLES'] = 1
    nccl_params['HOROVOD_TIMELINE'] = f'{vm_util.VM_TMP_DIR}/timeline.json'

  if FLAGS.nccl_cuda_visible_devices:
    nccl_params['CUDA_VISIBLE_DEVICES'] = FLAGS.nccl_cuda_visible_devices

  if FLAGS.nccl_extra_params:
    for extra_param in FLAGS.nccl_extra_params:
      # Split on the first '=' only so values may themselves contain '='.
      k, v = extra_param.split('=', 1)
      nccl_params[k] = v

  if extra_envs:
    nccl_params.update(extra_envs)

  run_command = ('{mpi} -np {num_gpus} -hostfile {host_file} '
                 '-mca plm_rsh_no_tree_spawn 1 '
                 '--allow-run-as-root '
                 '-bind-to socket -map-by slot '
                 '{nccl_params} '
                 '-mca pml ob1 -mca btl ^openib '
                 '-mca btl_tcp_if_exclude lo,docker0 '
                 '{python} ').format(
                     mpi=FLAGS.nccl_mpi,
                     num_gpus=total_gpus,
                     host_file=MACHINEFILE,
                     python=python_interpreter,
                     nccl_params=' '.join([
                         f'-x {key}={value}'
                         for key, value in nccl_params.items()
                     ]))

  if FLAGS.horovod_model == 'resnet-50':
    run_flags = {
        'arch': 'resnet50',
        'mode': 'training_benchmark',
        'warmup_steps': 101,
        'results_dir': '/tmp/models',
        'gpu_memory_fraction': 0.95,
        'static_loss_scale': 128,
        'lr_init': 0.016,
        'lr_warmup_epochs': 8,
        'momentum': 0.875,
        'weight_decay': 3.0517578125e-05,
        'iter_unit': 'batch'
    }
    run_flags.update({
        'batch_size': FLAGS.horovod_batch_size,
        'num_iter': FLAGS.horovod_num_steps,
    })
    if FLAGS.horovod_precision == 'fp16':
      # None renders as a bare '--amp' switch.
      run_flags['amp'] = None

    # Load ImageNet training data from GCS if benchmark is not in synthetic
    # mode
    if not FLAGS.horovod_synthetic:
      run_flags[
          'data_dir'] = 'gs://cloud-ml-nas-public/classification/imagenet'

    run_command += 'DeepLearningExamples/TensorFlow/Classification/ConvNets/main.py '
    run_command += _FormatRunFlags(run_flags)
  elif FLAGS.horovod_model == 'resnext-101':
    run_flags = {
        'arch': 'resnext101-32x4d',
        'mode': 'training_benchmark',
        'warmup_steps': 101,
        'results_dir': '/tmp/models',
        'gpu_memory_fraction': 0.95,
        'use_static_loss_scaling': None,
        'loss_scale': 128,
        'lr_init': 0.016,
        'lr_warmup_epochs': 8,
        'momentum': 0.875,
        'weight_decay': 3.0517578125e-05,
        'weight_init': 'fan_in',
        'iter_unit': 'batch'
    }
    run_flags.update({
        'precision': FLAGS.horovod_precision,
        'batch_size': FLAGS.horovod_batch_size,
        'num_iter': FLAGS.horovod_num_steps,
    })

    # Load ImageNet training data from GCS if benchmark is not in synthetic
    # mode
    if not FLAGS.horovod_synthetic:
      run_flags[
          'data_dir'] = 'gs://cloud-ml-nas-public/classification/imagenet'

    run_command += 'DeepLearningExamples/TensorFlow/Classification/ConvNets/main.py '
    run_command += _FormatRunFlags(run_flags)
  elif FLAGS.horovod_model.startswith('bert'):  # bert
    if not FLAGS.horovod_bert_finetune:
      raise NotImplementedError('BERT pretraining is not supported.')
    bert_dir = 'DeepLearningExamples/TensorFlow/LanguageModeling/BERT/data/download/google_pretrained_weights/{}'.format(
        'uncased_L-12_H-768_A-12' if FLAGS.horovod_model ==
        'bert-base' else 'uncased_L-24_H-1024_A-16')
    squad_train_file = 'DeepLearningExamples/TensorFlow/LanguageModeling/BERT/data/download/squad/v1.1/train-v1.1.json'
    # Normalise to int before comparing: comparing the raw flag value with
    # 128 is always False when the flag holds a string such as '128'.
    max_seq_len = int(FLAGS.horovod_max_seq_len)
    run_flags = {
        'vocab_file': '{}/vocab.txt'.format(bert_dir),
        'bert_config_file': '{}/bert_config.json'.format(bert_dir),
        'init_checkpoint': '{}/bert_model.ckpt'.format(bert_dir),
        'do_train': None,
        'train_file': squad_train_file,
        'learning_rate': 5e-6,
        'output_dir': '/tmp/models',
        'horovod': None,
        'dllog_path': '/tmp/bert_dllog.json',
        'save_checkpoints_steps': 0,
    }
    run_flags.update({
        'precision': FLAGS.horovod_precision,
        'train_batch_size': FLAGS.horovod_batch_size,
        'num_train_epochs': FLAGS.horovod_num_steps,
        'max_seq_length': FLAGS.horovod_max_seq_len,
        'doc_stride': 64 if max_seq_len == 128 else 128,
        'amp': FLAGS.horovod_precision == 'fp16'
    })
    run_command += 'DeepLearningExamples/TensorFlow/LanguageModeling/BERT/run_squad.py '
    run_command += _FormatRunFlags(run_flags)
  else:
    run_command += (
        'tensorpack/examples/FasterRCNN/train.py --config '
        'BACKBONE.WEIGHTS=ImageNet-R50-AlignPadding.npz '
        'DATA.BASEDIR=coco '
        'TRAINER=horovod '
        'TRAIN.EVAL_PERIOD=0 '
        # LR_SCHEDULE means equivalent steps when the total batch size is 8.
        'TRAIN.LR_SCHEDULE="[{step}, {step}, {step}]" '
        '--logdir {log_dir}/maskrcnn ').format(
            log_dir=vm_util.VM_TMP_DIR,
            step=FLAGS.horovod_num_steps * total_gpus // 8)

  stdout, stderr = master_vm.RobustRemoteCommand(run_command, should_log=True)

  if FLAGS.horovod_timeline:
    master_vm.PullFile(vm_util.GetTempDir(),
                       '{}/timeline.json'.format(vm_util.VM_TMP_DIR))
  return _MakeSamplesFromOutput(vms, stdout, stderr)
def Run(benchmark_spec):
  """Run ResNet on the cluster.

  Assembles either the TPU (resnet_main.py) or the non-TPU (imagenet_main.py)
  command line, then runs training and/or evaluation in increments of
  steps_per_eval until train_steps is reached, collecting samples after each
  increment.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
      required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vm = benchmark_spec.vms[0]

  if benchmark_spec.tpus:
    # NOTE: there is intentionally no space after the second '&&' below; the
    # emitted command matches the original byte-for-byte.
    shell_setup = ('{} && '
                   'cd tpu/models && '
                   'export PYTHONPATH=$(pwd) &&'
                   'cd official/resnet && ').format(benchmark_spec.env_cmd)
    script_args = [
        'python resnet_main.py',
        '--use_tpu={}'.format(bool(benchmark_spec.tpus)),
        '--data_dir={}'.format(benchmark_spec.data_dir),
        '--model_dir={}'.format(benchmark_spec.model_dir),
        '--resnet_depth={}'.format(benchmark_spec.depth),
        '--train_batch_size={}'.format(benchmark_spec.train_batch_size),
        '--eval_batch_size={}'.format(benchmark_spec.eval_batch_size),
        '--iterations_per_loop={}'.format(benchmark_spec.iterations),
        '--data_format={}'.format(benchmark_spec.data_format),
        '--precision={}'.format(benchmark_spec.precision),
        '--skip_host_call={}'.format(benchmark_spec.skip_host_call),
        '--num_train_images={}'.format(benchmark_spec.num_train_images),
        '--num_eval_images={}'.format(benchmark_spec.num_eval_images),
    ]
    resnet_benchmark_cmd = shell_setup + ' '.join(script_args)
  else:
    shell_setup = ('{} && '
                   'cd models && '
                   'export PYTHONPATH=$(pwd) && '
                   'cd official/r1/resnet && ').format(benchmark_spec.env_cmd)
    script_args = [
        'python imagenet_main.py',
        '--data_dir=/data/imagenet',
        '--model_dir={}'.format(benchmark_spec.model_dir),
        '--resnet_size={}'.format(benchmark_spec.depth),
        '--batch_size={}'.format(benchmark_spec.train_batch_size),
        '--data_format={}'.format(benchmark_spec.data_format),
    ]
    # The trailing space is part of the original command text.
    resnet_benchmark_cmd = shell_setup + ' '.join(script_args) + ' '

    # A bfloat16 precision setting maps to fp16 here; everything else to fp32.
    precision_name = '{precision}'.format(precision=benchmark_spec.precision)
    dtype_name = 'fp16' if precision_name == 'bfloat16' else 'fp32'
    resnet_benchmark_cmd = '{} --dtype={}'.format(resnet_benchmark_cmd,
                                                  dtype_name)

    if nvidia_driver.CheckNvidiaGpuExists(vm):
      gpu_env = tensorflow.GetEnvironmentVars(vm)
      gpu_count = nvidia_driver.QueryNumberOfGpus(vm)
      resnet_benchmark_cmd = '{} {} --num_gpus={}'.format(
          gpu_env, resnet_benchmark_cmd, gpu_count)

  samples = []
  metadata = _CreateMetadataDict(benchmark_spec)
  elapsed_seconds = 0
  steps_per_eval = benchmark_spec.steps_per_eval
  train_steps = benchmark_spec.train_steps
  uses_tpu = bool(benchmark_spec.tpus)

  for checkpoint in range(steps_per_eval, train_steps + steps_per_eval,
                          steps_per_eval):
    # The final increment may overshoot; clamp it to train_steps.
    step = min(checkpoint, train_steps)
    step_cmd = '{} --train_steps={}'.format(resnet_benchmark_cmd, step)

    if benchmark_spec.mode in ('train', 'train_and_eval'):
      if benchmark_spec.tpus:
        train_group = benchmark_spec.tpu_groups['train']
        tpu_name = train_group.GetName()
        cores_arg = '--num_cores={}'.format(train_group.GetNumShards())
        train_cmd = '{} --tpu={} --mode=train {}'.format(
            step_cmd, tpu_name, cores_arg)
      else:
        # The non-TPU script is driven by --max_train_steps rather than
        # --train_steps, so the base command is used here.
        train_cmd = ('{} --max_train_steps={} '
                     '--train_epochs={} --noeval_only').format(
                         resnet_benchmark_cmd, step,
                         benchmark_spec.epochs_per_eval)

      started_at = time.time()
      stdout, stderr = vm.RobustRemoteCommand(train_cmd, should_log=True)
      elapsed_seconds += (time.time() - started_at)
      samples.extend(
          mnist_benchmark.MakeSamplesFromTrainOutput(
              metadata, stdout + stderr, elapsed_seconds, step))

    if benchmark_spec.mode in ('train_and_eval', 'eval'):
      if benchmark_spec.tpus:
        eval_group = benchmark_spec.tpu_groups['eval']
        tpu_name = eval_group.GetName()
        cores_arg = '--num_cores={}'.format(eval_group.GetNumShards())
        eval_cmd = '{} --tpu={} --mode=eval {}'.format(
            step_cmd, tpu_name, cores_arg)
      else:
        eval_cmd = '{} --eval_only'.format(resnet_benchmark_cmd)

      stdout, stderr = vm.RobustRemoteCommand(eval_cmd, should_log=True)
      samples.extend(
          MakeSamplesFromEvalOutput(
              metadata, stdout + stderr, elapsed_seconds, use_tpu=uses_tpu))

  return samples