def _CreateMetadataDict(benchmark_spec):
  """Create metadata dict to be used in run results.

  Args:
    benchmark_spec: benchmark spec

  Returns:
    metadata dict
  """
  vm = benchmark_spec.vms[0]
  metadata = dict()
  metadata.update(cuda_toolkit.GetMetadata(vm))
  metadata['benchmark_version'] = BENCHMARK_VERSION
  metadata['num_nodes'] = len(benchmark_spec.vms)
  metadata['total_gpus'] = int(benchmark_spec.total_gpus)
  metadata['model'] = benchmark_spec.model
  metadata['batch_size'] = benchmark_spec.batch_size
  metadata['num_steps'] = benchmark_spec.num_steps
  metadata['synthetic'] = benchmark_spec.synthetic
  metadata['precision'] = benchmark_spec.precision
  metadata['max_seq_len'] = benchmark_spec.max_seq_len
  metadata['nccl_version'] = benchmark_spec.nccl_version
  metadata['nccl_net_plugin'] = benchmark_spec.nccl_net_plugin
  metadata['cuda_visible_devices'] = benchmark_spec.cuda_visible_devices
  metadata['nccl_extra_params'] = benchmark_spec.nccl_extra_params
  return metadata
def Run(bm_spec: benchmark_spec.BenchmarkSpec) -> List[sample.Sample]:
  """Run GPU PingPong test.

  Tests the latency between 2 GPUs in 2 VMs using the TensorFlow gRPC
  servers that were started during the prepare phase.

  Args:
    bm_spec: The benchmark specification.

  Returns:
    A list of sample.Sample objects.
  """
  client_vm, server_vm = bm_spec.vms
  server_address = _SERVER_ADDR.format(hostname=server_vm.hostname, port=_PORT)
  base_metadata = cuda_toolkit.GetMetadata(client_vm)
  samples = []
  bws = _RunGpuPingpong(client_vm, server_address)
  for ping_bw, pong_bw in bws[1:]:
    metadata = {'ping': 32 / ping_bw, 'pong': 32 / pong_bw}
    metadata.update(base_metadata)
    samples.append(
        sample.Sample('latency', 32 / ping_bw + 32 / pong_bw, 'microseconds',
                      metadata))
  return samples
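# Worked example (illustrative only): the 32/bw terms in Run() above assume
# _RunGpuPingpong reports bandwidths in units where 32/bandwidth is a one-way
# transfer time in microseconds; the reported latency sample is the
# round-trip sum. The helper below is hypothetical and exists only to show
# the arithmetic.
def _ExampleLatencyFromBandwidths(ping_bw: float = 4.0,
                                  pong_bw: float = 2.0) -> float:
  """Hypothetical demo of the latency computation in Run()."""
  ping_us = 32 / ping_bw    # one-way time: 8.0 us with the default above
  pong_us = 32 / pong_bw    # one-way time: 16.0 us with the default above
  return ping_us + pong_us  # round trip: 24.0 us, the value sampled in Run()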
def _CreateMetadataDict(benchmark_spec, model, batch_size, num_gpus):
  """Create metadata dict to be used in run results.

  Args:
    benchmark_spec: benchmark spec
    model: model which was run
    batch_size: batch size used
    num_gpus: number of GPUs used

  Returns:
    metadata dict
  """
  vm = benchmark_spec.vms[0]
  metadata = dict()
  if cuda_toolkit.CheckNvidiaGpuExists(vm):
    metadata.update(cuda_toolkit.GetMetadata(vm))
  metadata['num_gpus'] = num_gpus
  metadata['model'] = model
  metadata['batch_size'] = batch_size
  metadata['forward_only'] = benchmark_spec.forward_only
  metadata['data_name'] = benchmark_spec.data_name
  metadata['variable_update'] = benchmark_spec.variable_update
  metadata['local_parameter_device'] = benchmark_spec.local_parameter_device
  metadata['device'] = benchmark_spec.device
  metadata['data_format'] = benchmark_spec.data_format
  metadata['distortions'] = benchmark_spec.distortions
  metadata['benchmarks_commit_hash'] = benchmark_spec.benchmarks_commit_hash
  metadata['tensorflow_version'] = benchmark_spec.tensorflow_version
  metadata['tensorflow_cpu_pip_package'] = (
      benchmark_spec.tensorflow_cpu_pip_package)
  metadata['tensorflow_gpu_pip_package'] = (
      benchmark_spec.tensorflow_gpu_pip_package)
  metadata['distributed'] = benchmark_spec.distributed
  return metadata
def Run(benchmark_spec): """Runs the Stencil2D benchmark. GPU clock speeds must be set already. Args: benchmark_spec: The benchmark specification. Contains all data that is required to run the benchmark. Returns: A list of sample.Sample objects. """ vms = benchmark_spec.vms num_gpus = benchmark_spec.num_gpus master_vm = vms[0] num_iterations = FLAGS.stencil2d_iterations problem_sizes = FLAGS.stencil2d_problem_sizes num_processes = len(vms) * num_gpus metadata = {} metadata.update(cuda_toolkit.GetMetadata(master_vm)) metadata['benchmark_version'] = BENCHMARK_VERSION metadata['num_iterations'] = num_iterations metadata['num_nodes'] = len(vms) metadata['num_processes'] = num_processes results = [] for problem_size in problem_sizes: results.extend( _RunSingleIteration(master_vm, problem_size, num_processes, num_iterations, metadata)) return results
def _CreateMetadataDict(benchmark_spec): """Create metadata dict to be used in run results. Args: benchmark_spec: The benchmark specification. Contains all data that is required to run the benchmark. Returns: metadata dict """ metadata = { 'use_tpu': bool(benchmark_spec.tpus), 'model_dir': benchmark_spec.model_dir, 'model': benchmark_spec.benchmark, 'version': MLPERF_VERSION, } vms = benchmark_spec.vms num_vms = len(vms) vm = vms[0] gpus_per_node = nvidia_driver.QueryNumberOfGpus(vm) total_gpus = gpus_per_node * num_vms metadata.update(cuda_toolkit.GetMetadata(vm)) metadata['total_gpus'] = total_gpus if benchmark_spec.tpus: metadata.update({ 'train_tpu_num_shards': benchmark_spec.tpu_groups['train'].GetNumShards(), 'train_tpu_accelerator_type': benchmark_spec.tpu_groups['train'].GetAcceleratorType() }) return metadata
def _CreateMetadataDict(benchmark_spec): """Create metadata dict to be used in run results. Args: benchmark_spec: benchmark spec Returns: metadata dict """ vm = benchmark_spec.vms[0] metadata = { 'batch_size': benchmark_spec.batch_size, 'num_epochs': benchmark_spec.num_epochs, 'device': benchmark_spec.device, 'num_layers': benchmark_spec.num_layers, 'model': benchmark_spec.model, 'mxnet_version': benchmark_spec.mxnet_version, 'precision': benchmark_spec.precision, 'key_value_store': benchmark_spec.key_value_store, 'image_shape': benchmark_spec.image_shape, 'commit': mxnet_cnn.GetCommit(vm) } if benchmark_spec.device == GPU: metadata.update(cuda_toolkit.GetMetadata(vm)) return metadata
def _CollectGpuSamples(
    vm: virtual_machine.BaseVirtualMachine) -> List[sample.Sample]:
  """Run XGBoost on the cluster.

  Args:
    vm: The virtual machine to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  cmd = [
      f'{FLAGS.xgboost_env}',
      'python3',
      f'{linux_packages.INSTALL_DIR}/xgboost/tests/benchmark/benchmark_tree.py',
      f'--tree_method={_TREE_METHOD.value}',
      f'--sparsity={_SPARSITY.value}',
      f'--rows={_ROWS.value}',
      f'--columns={_COLUMNS.value}',
      f'--iterations={_ITERATIONS.value}',
      f'--test_size={_TEST_SIZE.value}',
  ]
  if _PARAMS.value:
    cmd.append(f'--params="{_PARAMS.value}"')
  metadata = _MetadataFromFlags(vm)
  metadata.update(cuda_toolkit.GetMetadata(vm))
  metadata['command'] = ' '.join(cmd)

  stdout, stderr, exit_code = vm.RemoteCommandWithReturnCode(
      metadata['command'], ignore_failure=True)
  if exit_code:
    logging.warning('Error with getting XGBoost stats: %s', stderr)
  training_time = regex_util.ExtractFloat(
      r'Train Time: ([\d\.]+) seconds', stdout)
  # Return a single-element list to match the declared List[sample.Sample]
  # return type and docstring.
  return [sample.Sample('training_time', training_time, 'seconds', metadata)]
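# Minimal sketch of the training-time extraction above, using the standard
# re module in place of PKB's regex_util.ExtractFloat. The default stdout
# line is a hypothetical example of benchmark_tree.py output, not captured
# output.
import re


def _ExampleExtractTrainTime(
    stdout: str = 'Train Time: 12.34 seconds') -> float:
  """Hypothetical demo of the ExtractFloat call in _CollectGpuSamples."""
  match = re.search(r'Train Time: ([\d\.]+) seconds', stdout)
  if match is None:
    raise ValueError('no train time found in benchmark output')
  return float(match.group(1))  # 12.34 for the default example line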
def _CreateMetadataDict(vms):
  """Create metadata dict to be used in run results.

  Args:
    vms: A list of worker VMs.

  Returns:
    metadata dict
  """
  vm = vms[0]
  gpus_per_node = nvidia_driver.QueryNumberOfGpus(vm)
  num_vms = len(vms)
  total_gpus = gpus_per_node * num_vms
  metadata = dict()
  metadata.update(cuda_toolkit.GetMetadata(vm))
  metadata['benchmark_version'] = BENCHMARK_VERSION
  metadata['num_nodes'] = len(vms)
  metadata['total_gpus'] = int(total_gpus)
  metadata['model'] = FLAGS.horovod_model
  metadata['batch_size'] = FLAGS.horovod_batch_size
  metadata['num_steps'] = FLAGS.horovod_num_steps
  metadata['synthetic'] = FLAGS.horovod_synthetic
  metadata['precision'] = FLAGS.horovod_precision
  metadata['max_seq_len'] = int(FLAGS.horovod_max_seq_len)
  metadata['nccl_version'] = FLAGS.nccl_version
  metadata['nccl_net_plugin'] = FLAGS.nccl_net_plugin
  metadata['cuda_visible_devices'] = FLAGS.nccl_cuda_visible_devices
  metadata['nccl_extra_params'] = FLAGS.nccl_extra_params
  return metadata
def _CollectGpuSamples(
    vm: virtual_machine.BaseVirtualMachine) -> List[sample.Sample]:
  """Run CUDA memcopy on the cluster.

  Args:
    vm: The virtual machine to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  if not nvidia_driver.CheckNvidiaGpuExists(vm):
    return []
  if not nvidia_driver.CheckNvidiaSmiExists(vm):
    return []
  global_metadata = _MetadataFromFlags()
  global_metadata.update(cuda_toolkit.GetMetadata(vm))
  global_cmd = [
      BANDWIDTH_TEST_PATH, '--csv', f'--memory={_MEMORY.value}',
      f'--mode={_MODE.value}'
  ]
  if _HTOD.value:
    global_cmd.append('--htod')
  if _DTOH.value:
    global_cmd.append('--dtoh')
  if _DTOD.value:
    global_cmd.append('--dtod')
  if _WC.value:
    global_cmd.append('--wc')

  num_gpus = nvidia_driver.QueryNumberOfGpus(vm)
  devices = list(range(num_gpus)) + (['all'] if num_gpus > 1 else [])

  samples = []
  for device in devices:
    cmd = ' '.join(global_cmd + [f'--device={device}'])
    stdout, stderr, exit_code = vm.RemoteCommandWithReturnCode(
        cmd, ignore_failure=True)
    if exit_code:
      logging.warning('Error with getting GPU stats: %s', stderr)
      continue
    results = regex_util.ExtractAllMatches(
        r'bandwidthTest-(\S+), '
        r'Bandwidth = ([\d\.]+) (\S+), '
        r'Time = ([\d\.]+) s, '
        r'Size = (\d+) bytes, '
        r'NumDevsUsed = (\d+)', stdout)
    for metric, bandwidth, unit, time, size, num_devs_used in results:
      metadata = {
          'time': float(time),
          'size': int(size),
          'NumDevsUsed': num_devs_used,
          'device': device,
          'command': cmd,
      }
      metadata.update(global_metadata)
      samples.append(sample.Sample(metric, float(bandwidth), unit, metadata))
  return samples
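# Sketch of the bandwidthTest CSV parse above, applied to a single fabricated
# line. The line's exact shape is an assumption modeled on the regex in
# _CollectGpuSamples, not verbatim bandwidthTest output.
import re


def _ExampleParseBandwidthLine():
  """Hypothetical demo of the regex used in _CollectGpuSamples."""
  line = ('bandwidthTest-H2D-Pinned, Bandwidth = 12.3 GB/s, '
          'Time = 0.00273 s, Size = 33554432 bytes, NumDevsUsed = 1')
  pattern = (r'bandwidthTest-(\S+), '
             r'Bandwidth = ([\d\.]+) (\S+), '
             r'Time = ([\d\.]+) s, '
             r'Size = (\d+) bytes, '
             r'NumDevsUsed = (\d+)')
  metric, bandwidth, unit, time, size, num_devs_used = re.findall(
      pattern, line)[0]
  return metric, float(bandwidth), unit  # ('H2D-Pinned', 12.3, 'GB/s')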
def _CreateMetadataDict(benchmark_spec, model, batch_size):
  """Create metadata dict to be used in run results.

  Args:
    benchmark_spec: benchmark spec
    model: model which was run
    batch_size: batch size used

  Returns:
    metadata dict
  """
  vm = benchmark_spec.vms[0]
  metadata = {}
  if cuda_toolkit.CheckNvidiaGpuExists(vm):
    metadata.update(cuda_toolkit.GetMetadata(vm))

  metadata['command_line'] = benchmark_spec.tf_cnn_benchmark_cmd
  metadata['cnn_benchmarks_branch'] = benchmark_spec.cnn_benchmarks_branch
  metadata['tensorflow_version'] = benchmark_spec.tensorflow_version
  metadata['tensorflow_cpu_pip_package'] = (
      benchmark_spec.tensorflow_cpu_pip_package)
  metadata['tensorflow_gpu_pip_package'] = (
      benchmark_spec.tensorflow_gpu_pip_package)

  # If we ran a custom command line through the benchmark_args flag, add the
  # metadata from that command and return. We don't need any more metadata
  # from this function, as it is likely invalid.
  if getattr(benchmark_spec, 'benchmark_args', None):
    metadata.update(
        _GetMetadataFromBenchmarkArgs(benchmark_spec.benchmark_args))
    return metadata

  metadata['model'] = model
  metadata['batch_size'] = batch_size
  metadata['forward_only'] = benchmark_spec.forward_only
  metadata['data_name'] = benchmark_spec.data_name
  metadata['data_dir'] = benchmark_spec.data_dir
  metadata['use_local_data'] = benchmark_spec.use_local_data
  metadata['variable_update'] = benchmark_spec.variable_update
  metadata['local_parameter_device'] = benchmark_spec.local_parameter_device
  metadata['device'] = benchmark_spec.device
  metadata['data_format'] = benchmark_spec.data_format
  metadata['distortions'] = benchmark_spec.distortions
  metadata['distributed'] = benchmark_spec.distributed
  metadata['precision'] = benchmark_spec.precision
  metadata['num_gpus'] = benchmark_spec.num_gpus
  return metadata
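# Purely hypothetical sketch of what a helper like _GetMetadataFromBenchmarkArgs
# could do: turn a '--flag=value' argument string into metadata pairs. The real
# helper is not shown in this snippet and its behavior may differ.
def _ExampleMetadataFromArgs(benchmark_args: str) -> dict:
  """Hypothetical parser for a benchmark_args command-line string."""
  metadata = {}
  for arg in benchmark_args.split():
    if arg.startswith('--') and '=' in arg:
      key, _, value = arg[2:].partition('=')
      metadata[key] = value
  # e.g. '--model=resnet50 --batch_size=64' ->
  # {'model': 'resnet50', 'batch_size': '64'}
  return metadata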
def Run(benchmark_spec): """Sets the GPU clock speed and runs the CUDA PCIe benchmark. Args: benchmark_spec: The benchmark specification. Contains all data that is required to run the benchmark. Returns: A list of sample.Sample objects. """ vm = benchmark_spec.vms[0] # Note: The clock speed is set in this function rather than Prepare() # so that the user can perform multiple runs with a specified # clock speed without having to re-prepare the VM. cuda_toolkit.SetAndConfirmGpuClocks(vm) num_iterations = FLAGS.gpu_pcie_bandwidth_iterations mode = FLAGS.gpu_pcie_bandwidth_mode transfer_size_range = FLAGS.gpu_pcie_bandwidth_transfer_sizes raw_results = [] metadata = {} metadata.update(cuda_toolkit.GetMetadata(vm)) metadata['num_iterations'] = num_iterations metadata['mode'] = mode if mode == 'range': metadata['range_start'] = transfer_size_range[0] metadata['range_stop'] = transfer_size_range[1] metadata['range_step'] = transfer_size_range[2] run_command = ('%s/extras/demo_suite/bandwidthTest --device=all' % cuda_toolkit.CUDA_TOOLKIT_INSTALL_DIR) if mode == 'range': run_command += ( ' --mode=range --start={0} --end={1} --increment={2}'.format( transfer_size_range[0], transfer_size_range[1], transfer_size_range[2])) for i in range(num_iterations): stdout, _ = vm.RemoteCommand(run_command, should_log=True) raw_results.append(_ParseOutputFromSingleIteration(stdout)) if 'device_info' not in metadata: metadata['device_info'] = _ParseDeviceInfo(stdout) return _CalculateMetricsOverAllIterations(raw_results, metadata)
def _CreateMetadataDict(benchmark_spec): """Create metadata dict to be used in run results. Args: benchmark_spec: benchmark spec Returns: metadata dict """ vm = benchmark_spec.vms[0] metadata = dict() metadata.update(cuda_toolkit.GetMetadata(vm)) metadata['benchmark_version'] = BENCHMARK_VERSION metadata['num_nodes'] = len(benchmark_spec.vms) metadata['total_gpus'] = int(benchmark_spec.total_gpus) metadata['model'] = benchmark_spec.model metadata['batch_size'] = benchmark_spec.batch_size metadata['deep_learning_examples_commit'] = ( benchmark_spec.deep_learning_examples_commit) return metadata
def _CreateMetadataDict(benchmark_spec): """Create metadata dict to be used in run results. Args: benchmark_spec: benchmark spec Returns: metadata dict """ vm = benchmark_spec.vms[0] metadata = dict() if benchmark_spec.device == GPU: metadata.update(cuda_toolkit.GetMetadata(vm)) metadata['batch_size'] = benchmark_spec.batch_size metadata['num_epochs'] = benchmark_spec.num_epochs metadata['device'] = benchmark_spec.device metadata['num_layers'] = benchmark_spec.num_layers metadata['model'] = benchmark_spec.model metadata['mxnet_version'] = benchmark_spec.mxnet_version metadata['commit'] = mxnet_cnn.GetCommit(vm) return metadata
def _CreateMetadataDict(
    bm_spec: benchmark_spec.BenchmarkSpec) -> Dict[str, Any]:
  """Creates metadata dict to be used in run results.

  Args:
    bm_spec: The benchmark specification. Contains all data that is required
      to run the benchmark.

  Returns:
    metadata dict
  """
  metadata = {
      'model': FLAGS.mlperf_benchmark,
      'version': MLPERF_INFERENCE_VERSION,
  }
  vms = bm_spec.vms
  num_vms = len(vms)
  vm = vms[0]
  gpus_per_node = nvidia_driver.QueryNumberOfGpus(vm)
  total_gpus = gpus_per_node * num_vms
  metadata.update(cuda_toolkit.GetMetadata(vm))
  metadata['total_gpus'] = total_gpus
  return metadata
def _CreateMetadataDict(benchmark_spec): """Create metadata dict to be used in run results. Args: benchmark_spec: benchmark spec Returns: metadata dict """ vm = benchmark_spec.vms[0] metadata = dict() metadata.update(cuda_toolkit.GetMetadata(vm)) metadata['num_nodes'] = len(benchmark_spec.vms) metadata['cpus_per_rank'] = int(benchmark_spec.cpus_per_rank) metadata['total_gpus'] = int(benchmark_spec.total_gpus) metadata['benchmark_version'] = BENCHMARK_VERSION metadata['runtime'] = int(benchmark_spec.hpcg_runtime) metadata['run_as_root'] = benchmark_spec.run_as_root metadata['problem_size'] = '%s,%s,%s' % (benchmark_spec.hpcg_problem_size[0], benchmark_spec.hpcg_problem_size[1], benchmark_spec.hpcg_problem_size[2]) return metadata
def Run(benchmark_spec): """Sets the GPU clock speed and runs the CUDA PCIe benchmark. Args: benchmark_spec: The benchmark specification. Contains all data that is required to run the benchmark. Returns: A list of sample.Sample objects. """ vm = benchmark_spec.vms[0] num_iterations = FLAGS.gpu_pcie_bandwidth_iterations mode = FLAGS.gpu_pcie_bandwidth_mode transfer_size_range = FLAGS.gpu_pcie_bandwidth_transfer_sizes raw_results = [] metadata = {} metadata.update(cuda_toolkit.GetMetadata(vm)) metadata['num_iterations'] = num_iterations metadata['mode'] = mode if mode == 'range': metadata['range_start'] = transfer_size_range[0] metadata['range_stop'] = transfer_size_range[1] metadata['range_step'] = transfer_size_range[2] run_command = ('%s/extras/demo_suite/bandwidthTest --device=all' % metadata['cuda_toolkit_home']) if mode == 'range': run_command += (' --mode=range --start={0} --end={1} --increment={2}' .format(transfer_size_range[0], transfer_size_range[1], transfer_size_range[2])) for i in range(num_iterations): stdout, _ = vm.RemoteCommand(run_command, should_log=True) raw_results.append(_ParseOutputFromSingleIteration(stdout)) if 'device_info' not in metadata: metadata['device_info'] = _ParseDeviceInfo(stdout) return _CalculateMetricsOverAllIterations(raw_results, metadata)