Beispiel #1
0
def _SetAndConfirmGpuClocks(vm):
    """Apply the flag-specified GPU clocks and verify each GPU accepted them.

    The target clocks are read from the gpu_pcie_bandwidth_clock_speeds
    flag: element 0 is used as the memory clock and element 1 as the
    graphics clock. After the clocks are set, every GPU on the VM is queried
    and its reported clock pair is compared against the requested pair.

    Args:
      vm: the virtual machine to operate on.

    Raises:
      UnsupportedClockSpeedException: if any GPU reports a clock pair that
        differs from the one that was just requested.
    """
    memory_clock = FLAGS.gpu_pcie_bandwidth_clock_speeds[0]
    graphics_clock = FLAGS.gpu_pcie_bandwidth_clock_speeds[1]
    requested_clocks = (memory_clock, graphics_clock)

    cuda_toolkit_8.SetGpuClockSpeed(vm, memory_clock, graphics_clock)

    gpu_count = cuda_toolkit_8.QueryNumberOfGpus(vm)
    for gpu_index in range(gpu_count):
        reported_clocks = cuda_toolkit_8.QueryGpuClockSpeed(vm, gpu_index)
        if reported_clocks != requested_clocks:
            # Fail on the first GPU that did not take the requested clocks.
            message = ('Unrecoverable error setting '
                       'GPU #{} clock speed to {},{}').format(
                           gpu_index, memory_clock, graphics_clock)
            raise UnsupportedClockSpeedException(message)
def Run(benchmark_spec):
    """Run the tf_cnn_benchmarks script on the first VM of the cluster.

    The benchmark_spec is first refreshed from flags, then a single
    tf_cnn_benchmarks.py command line is built from its attributes. When the
    device is GPU, the GPU count is queried from the VM, stored on the
    benchmark_spec, and the command is prefixed with the environment
    variables and suffixed with --num_gpus.

    Args:
      benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.

    Returns:
      Whatever _MakeSamplesFromOutput produces for the command's stdout
      (documented by the original as a list of sample.Sample objects).
    """
    _UpdateBenchmarkSpecWithFlags(benchmark_spec)
    master_vm = benchmark_spec.vms[0]

    command_template = (
        'python tf_cnn_benchmarks.py --local_parameter_device=%s '
        '--batch_size=%s --model=%s --data_name=%s --variable_update=%s '
        '--use_nccl=%s --distortions=%s --device=%s --data_format=%s '
        '--forward_only=%s')
    command_args = (
        benchmark_spec.local_parameter_device,
        benchmark_spec.batch_size,
        benchmark_spec.model,
        benchmark_spec.data_name,
        benchmark_spec.variable_update,
        benchmark_spec.use_nccl,
        benchmark_spec.distortions,
        benchmark_spec.device,
        benchmark_spec.data_format,
        benchmark_spec.forward_only,
    )
    benchmark_command = command_template % command_args

    if benchmark_spec.device == GPU:
        gpu_count = cuda_toolkit_8.QueryNumberOfGpus(master_vm)
        benchmark_spec.num_gpus = gpu_count
        environment = _GetEnvironmentVars(master_vm)
        benchmark_command = '%s %s --num_gpus=%s' % (
            environment, benchmark_command, gpu_count)

    # The script is invoked from inside its own directory on the VM.
    script_dir = 'benchmarks/scripts/tf_cnn_benchmarks'
    remote_command = 'cd %s && %s' % (script_dir, benchmark_command)
    stdout, _ = master_vm.RobustRemoteCommand(remote_command, should_log=True)
    return _MakeSamplesFromOutput(benchmark_spec, stdout)
Beispiel #3
0
def Run(benchmark_spec):
    """Set the GPU clock speed and run the CUDA PCIe bandwidth test.

    The bandwidthTest binary from the CUDA toolkit install directory is run
    gpu_pcie_bandwidth_iterations times on the first VM. Each run's stdout is
    parsed, and the device info is parsed from the first run whose metadata
    does not already contain a 'device_info' entry.

    Args:
      benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.

    Returns:
      The result of _CalculateMetricsOverAllIterations over the parsed runs
      (documented by the original as a list of sample.Sample objects).
    """
    vm = benchmark_spec.vms[0]

    # The clocks are set here instead of in Prepare() so that repeated runs
    # at a given clock speed do not require re-preparing the VM.
    cuda_toolkit_8.SetAndConfirmGpuClocks(vm)

    iteration_count = FLAGS.gpu_pcie_bandwidth_iterations

    metadata = dict(cuda_toolkit_8.GetMetadataFromFlags())
    metadata['num_iterations'] = iteration_count
    metadata['num_gpus'] = cuda_toolkit_8.QueryNumberOfGpus(vm)

    bandwidth_test_command = (
        '%s/extras/demo_suite/bandwidthTest --device=all' %
        cuda_toolkit_8.CUDA_TOOLKIT_INSTALL_DIR)

    per_iteration_results = []
    for _ in range(iteration_count):
        stdout, _stderr = vm.RemoteCommand(bandwidth_test_command,
                                           should_log=True)
        per_iteration_results.append(_ParseOutputFromSingleIteration(stdout))
        if 'device_info' not in metadata:
            metadata['device_info'] = _ParseDeviceInfo(stdout)

    return _CalculateMetricsOverAllIterations(per_iteration_results, metadata)
Beispiel #4
0
def Prepare(benchmark_spec):
    """Install SHOC on every VM and push the machinefile.

    _InstallAndAuthenticateVm is run across all VMs via vm_util.RunThreaded.
    The GPU count is then queried from the first VM, recorded on the
    benchmark_spec as num_gpus, and passed to _CreateAndPushMachineFile.

    Args:
      benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.
    """
    vm_util.RunThreaded(_InstallAndAuthenticateVm, benchmark_spec.vms)

    first_vm = benchmark_spec.vms[0]
    gpus_on_first_vm = cuda_toolkit_8.QueryNumberOfGpus(first_vm)
    benchmark_spec.num_gpus = gpus_on_first_vm
    _CreateAndPushMachineFile(benchmark_spec.vms, benchmark_spec.num_gpus)
Beispiel #5
0
def Prepare(benchmark_spec):
  """Install the CUDA toolkit, cuDNN and TensorFlow on the first VM.

  The benchmark_spec is refreshed from flags first. After the CUDA toolkit
  is installed, the GPU count is queried from the VM and stored on the
  benchmark_spec as num_gpus; cuDNN and TensorFlow are installed afterwards.

  Args:
    benchmark_spec: The benchmark specification
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  target_vm = benchmark_spec.vms[0]

  logging.info('Installing CUDA Toolkit 8.0 on %s', target_vm)
  target_vm.Install('cuda_toolkit_8')
  benchmark_spec.num_gpus = cuda_toolkit_8.QueryNumberOfGpus(target_vm)

  # Remaining packages, installed in this order after the GPU query.
  for package_name in ('cudnn', 'tensorflow'):
    target_vm.Install(package_name)
Beispiel #6
0
def AssertCorrectNumberOfGpus(vm):
    """Assert that the VM is reporting the correct number of GPUs.

    The expected count is looked up in num_gpus_map_util.gpus_per_vm using
    the VM's machine_type, and compared with the count queried from the VM.

    Args:
      vm: the virtual machine to check; its machine_type attribute is used
        as the lookup key for the expected GPU count.

    Returns:
      The number of GPUs reported by the VM.

    Raises:
      Exception: if the VM reports a different number of GPUs than expected.
    """
    expected_num_gpus = num_gpus_map_util.gpus_per_vm[vm.machine_type]
    actual_num_gpus = cuda_toolkit_8.QueryNumberOfGpus(vm)
    if actual_num_gpus != expected_num_gpus:
        # The two literals are joined into ONE message string. A comma
        # between them would pass two separate args to Exception, and the
        # error would render as a tuple repr instead of a sentence.
        raise Exception(
            'VM reported incorrect number of GPUs. '
            'Expected %s, received %s' % (expected_num_gpus, actual_num_gpus))
    return actual_num_gpus
Beispiel #7
0
def Prepare(benchmark_spec):
    """Install TensorFlow and its GPU stack, then fetch the benchmark scripts.

    The benchmark_spec is refreshed from flags first. On the first VM the
    CUDA toolkit is installed, the GPU count is queried and stored on the
    benchmark_spec as num_gpus, cuDNN and TensorFlow are installed, and the
    tensorflow/benchmarks repository is cloned.

    Args:
      benchmark_spec: The benchmark specification
    """
    _UpdateBenchmarkSpecWithFlags(benchmark_spec)
    target_vm = benchmark_spec.vms[0]

    logging.info('Installing CUDA Toolkit 8.0 on %s', target_vm)
    target_vm.Install('cuda_toolkit_8')
    benchmark_spec.num_gpus = cuda_toolkit_8.QueryNumberOfGpus(target_vm)

    # Remaining packages, installed in this order after the GPU query.
    for package_name in ('cudnn', 'tensorflow'):
        target_vm.Install(package_name)

    clone_command = 'git clone https://github.com/tensorflow/benchmarks.git'
    target_vm.RemoteCommand(clone_command, should_log=True)
Beispiel #8
0
def _UpdateBenchmarkSpecWithFlags(benchmark_spec):
    """Copy HPCG settings from flags (and derived values) onto benchmark_spec.

    gpus_per_node comes from the hpcg_gpus_per_node flag when it is truthy;
    otherwise it is queried from the first VM. cpus_per_rank is the first
    VM's num_cpus divided by gpus_per_node, truncated with int(), and
    total_gpus is gpus_per_node times the number of VMs.

    Args:
      benchmark_spec: benchmark specification to update
    """
    first_vm = benchmark_spec.vms[0]

    gpus_per_node = FLAGS.hpcg_gpus_per_node
    if not gpus_per_node:
        # No explicit flag value: fall back to what the first VM reports.
        gpus_per_node = cuda_toolkit_8.QueryNumberOfGpus(first_vm)

    cpus_per_rank = int(first_vm.num_cpus / gpus_per_node)
    vm_count = len(benchmark_spec.vms)
    gpu_total = gpus_per_node * vm_count

    benchmark_spec.gpus_per_node = gpus_per_node
    benchmark_spec.cpus_per_rank = cpus_per_rank
    benchmark_spec.num_vms = vm_count
    benchmark_spec.total_gpus = gpu_total
    benchmark_spec.hpcg_problem_size = FLAGS.hpcg_problem_size
    benchmark_spec.hpcg_runtime = FLAGS.hpcg_runtime
Beispiel #9
0
def _RunOnVm(vm, benchmark_spec):
    """Run tf_cnn_benchmarks on one VM for every model in FLAGS.tf_models.

    For each model a batch size is obtained from _GetBatchSize and a
    tf_cnn_benchmarks.py command line is built from the benchmark_spec. When
    the device is GPU, the GPU count is queried from the VM and the command
    is prefixed with the environment variables and suffixed with --num_gpus;
    otherwise the GPU count passed to the sample builder is 0.

    Args:
      vm: VM to run on
      benchmark_spec: benchmark_spec object

    Returns:
      A list accumulating the samples produced for each model.
    """
    script_dir = 'benchmarks/scripts/tf_cnn_benchmarks'
    command_template = (
        'python tf_cnn_benchmarks.py --local_parameter_device=%s '
        '--batch_size=%s --model=%s --data_name=%s --variable_update=%s '
        '--use_nccl=%s --distortions=%s --device=%s --data_format=%s '
        '--forward_only=%s')

    all_samples = []
    for model in FLAGS.tf_models:
        batch_size = _GetBatchSize(model)
        benchmark_command = command_template % (
            benchmark_spec.local_parameter_device,
            batch_size,
            model,
            benchmark_spec.data_name,
            benchmark_spec.variable_update,
            benchmark_spec.use_nccl,
            benchmark_spec.distortions,
            benchmark_spec.device,
            benchmark_spec.data_format,
            benchmark_spec.forward_only,
        )

        num_gpus = 0
        if benchmark_spec.device == GPU:
            num_gpus = cuda_toolkit_8.QueryNumberOfGpus(vm)
            environment = tensorflow._GetEnvironmentVars(vm)
            benchmark_command = '%s %s --num_gpus=%s' % (
                environment, benchmark_command, num_gpus)

        remote_command = 'cd %s && %s' % (script_dir, benchmark_command)
        stdout, _ = vm.RobustRemoteCommand(remote_command, should_log=True)
        model_samples = _MakeSamplesFromOutput(
            benchmark_spec, stdout, model, batch_size, num_gpus)
        all_samples.extend(model_samples)

    return all_samples
Beispiel #10
0
def Run(benchmark_spec):
    """Run the MXNet image-classification benchmark for each requested model.

    The benchmark_spec is refreshed from flags, then for every model in
    FLAGS.mx_models a train_imagenet.py command is built and run on the first
    VM. The current model, batch size and (when set) layer count are written
    back onto the benchmark_spec before the samples are built.

    Args:
      benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.

    Returns:
      A list with one entry per model, each being the value returned by
      _MakeSamplesFromOutput.
    """
    _UpdateBenchmarkSpecWithFlags(benchmark_spec)
    vm = benchmark_spec.vms[0]
    example_dir = 'incubator-mxnet/example/image-classification'

    collected = []
    for model in FLAGS.mx_models:
        num_layers = _GetNumLayers(model)
        batch_size = _GetBatchSize(model, num_layers)
        benchmark_spec.model = model
        benchmark_spec.batch_size = batch_size

        command_template = (
            'python train_imagenet.py --benchmark 1 --network %s --batch-size %s '
            '--image-shape %s --num-epochs %s --kv-store device')
        benchmark_command = command_template % (
            model, batch_size, IMAGENET_SHAPE, benchmark_spec.num_epochs)

        if benchmark_spec.device == GPU:
            gpu_count = cuda_toolkit_8.QueryNumberOfGpus(vm)
            environment = mxnet.GetEnvironmentVars(vm)
            # Comma-separated GPU indices, e.g. "0,1,2" for three GPUs.
            gpu_ids = ','.join(map(str, range(gpu_count)))
            benchmark_command = '%s %s --gpus %s' % (
                environment, benchmark_command, gpu_ids)

        if num_layers:
            benchmark_command = '%s --num-layers %s' % (benchmark_command,
                                                        num_layers)
            benchmark_spec.num_layers = num_layers

        remote_command = 'cd %s && %s' % (example_dir, benchmark_command)
        stdout, stderr = vm.RobustRemoteCommand(remote_command,
                                                should_log=True)

        # Parse stdout when it is non-empty, otherwise fall back to stderr.
        # NOTE(review): append() stores the helper's return value as a single
        # element; if _MakeSamplesFromOutput returns a list this nests it --
        # confirm against the helper.
        collected.append(_MakeSamplesFromOutput(benchmark_spec, stdout
                                                or stderr))

    return collected
 def testQueryNumberOfGpus(self):
     """QueryNumberOfGpus returns 8 when the VM command outputs "count\\n8"."""
     canned_stdout = "count\n8"
     vm = mock.MagicMock()
     vm.RemoteCommand = mock.MagicMock(return_value=(canned_stdout, None))
     reported_gpus = cuda_toolkit_8.QueryNumberOfGpus(vm)
     self.assertEqual(8, reported_gpus)