Example #1
0
def AddUser(vm: virtual_machine.BaseVirtualMachine) -> None:
  """Lets the VM's login user run Docker without root.

  Follows the post-install steps documented at
  https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user

  Args:
    vm: The VM to work on.
  """
  # Step 1: make sure the docker group exists. Failure is tolerated
  # (ignore_failure=True), e.g. when the group is already present.
  create_group_cmd = 'sudo groupadd docker'
  vm.RemoteCommand(create_group_cmd, ignore_failure=True)

  # Step 2: append the docker group to the user's supplementary groups.
  join_group_cmd = f'sudo usermod -aG docker {vm.user_name}'
  vm.RemoteCommand(join_group_cmd)

  # Step 3: kill every process owned by the user so that the next login
  # re-evaluates group membership. Failure is tolerated here as well.
  end_sessions_cmd = f'pkill -KILL -u {vm.user_name}'
  vm.RemoteCommand(end_sessions_cmd, ignore_failure=True)
Example #2
0
def _CollectGpuSamples(
        vm: virtual_machine.BaseVirtualMachine) -> List[sample.Sample]:
    """Run the XGBoost tree benchmark on the VM and report its training time.

    The benchmark command is assembled from the module's flags, executed on
    the VM, and the training time is parsed out of the command's stdout.

    Args:
      vm: The virtual machine to run the benchmark.

    Returns:
      A list containing a single 'training_time' sample.Sample (in seconds)
      whose metadata includes the exact command that was run.
    """
    cmd = [
        f'{FLAGS.xgboost_env}',
        'python3',
        f'{linux_packages.INSTALL_DIR}/xgboost/tests/benchmark/benchmark_tree.py',
        f'--tree_method={_TREE_METHOD.value}',
        f'--sparsity={_SPARSITY.value}',
        f'--rows={_ROWS.value}',
        f'--columns={_COLUMNS.value}',
        f'--iterations={_ITERATIONS.value}',
        f'--test_size={_TEST_SIZE.value}',
    ]
    # Extra parameters are optional and are passed through as one quoted value.
    if _PARAMS.value:
        cmd.append(f'--params="{_PARAMS.value}"')

    metadata = _MetadataFromFlags(vm)
    metadata.update(cuda_toolkit.GetMetadata(vm))
    # Record the full command line so each sample documents how it was made.
    metadata['command'] = ' '.join(cmd)

    # ignore_failure=True lets us log stderr ourselves on a non-zero exit.
    stdout, stderr, exit_code = vm.RemoteCommandWithReturnCode(
        metadata['command'], ignore_failure=True)
    if exit_code:
        logging.warning('Error with getting XGBoost stats: %s', stderr)
    # NOTE(review): after a failed run stdout may lack the "Train Time" line;
    # how ExtractFloat reacts to a missing match is not visible here - confirm.
    training_time = regex_util.ExtractFloat(r'Train Time: ([\d\.]+) seconds',
                                            stdout)
    # The declared contract is a list of samples, so wrap the single result.
    return [sample.Sample('training_time', training_time, 'seconds', metadata)]
def _CollectGpuSamples(
        vm: virtual_machine.BaseVirtualMachine) -> List[sample.Sample]:
    """Run the CUDA bandwidth test on the VM and collect its results.

    The test is run once per GPU index and, when more than one GPU is
    reported, one additional time with --device=all. Runs that exit with a
    non-zero code are logged and skipped.

    Args:
      vm: The virtual machine to run the benchmark.

    Returns:
      A list of sample.Sample objects, one per result line matched in the
      test output. Empty if either Nvidia check on the VM fails.
    """
    # Both checks must pass; the second is only evaluated if the first does.
    if not (nvidia_driver.CheckNvidiaGpuExists(vm)
            and nvidia_driver.CheckNvidiaSmiExists(vm)):
        return []

    shared_metadata = _MetadataFromFlags()
    shared_metadata.update(cuda_toolkit.GetMetadata(vm))

    base_args = [
        BANDWIDTH_TEST_PATH,
        '--csv',
        f'--memory={_MEMORY.value}',
        f'--mode={_MODE.value}',
    ]
    # Each boolean flag, when set, contributes its command-line switch.
    optional_switches = (
        (_HTOD, '--htod'),
        (_DTOH, '--dtoh'),
        (_DTOD, '--dtod'),
        (_WC, '--wc'),
    )
    for flag, switch in optional_switches:
        if flag.value:
            base_args.append(switch)

    gpu_count = nvidia_driver.QueryNumberOfGpus(vm)
    targets = list(range(gpu_count))
    if gpu_count > 1:
        # With several GPUs, also measure all of them together.
        targets.append('all')

    result_pattern = (
        r'bandwidthTest-(\S+), '
        r'Bandwidth = ([\d\.]+) (\S+), '
        r'Time = ([\d\.]+) s, '
        r'Size = (\d+) bytes, '
        r'NumDevsUsed = (\d+)')

    collected = []
    for target in targets:
        command = ' '.join([*base_args, f'--device={target}'])
        stdout, stderr, exit_code = vm.RemoteCommandWithReturnCode(
            command, ignore_failure=True)
        if exit_code:
            logging.warning('Error with getting GPU stats: %s', stderr)
            continue

        rows = regex_util.ExtractAllMatches(result_pattern, stdout)
        for metric, bandwidth, unit, elapsed, size, devs_used in rows:
            # Per-result fields come first; shared metadata is applied on
            # top, so it wins on any key collision.
            sample_metadata = {
                'time': float(elapsed),
                'size': int(size),
                'NumDevsUsed': devs_used,
                'device': target,
                'command': command,
            }
            sample_metadata.update(shared_metadata)
            collected.append(
                sample.Sample(metric, float(bandwidth), unit,
                              sample_metadata))
    return collected
def _RunGpuPingpong(vm: virtual_machine.BaseVirtualMachine,
                    addr: str) -> List[Tuple[float, float]]:
    """Runs the pingpong test script on the VM and pairs its readings.

    Args:
      vm: The virtual machine on which the test script is executed.
      addr: Address passed to the test script as its only argument.

    Returns:
      A list of (ping, pong) float tuples, paired in the order the values
      appear in the command output. Pairing stops at the shorter series.
    """
    command = f'{_ENV.value} python {_TEST_SCRIPT} {addr}'
    stdout, stderr = vm.RemoteCommand(command)
    # Both streams are searched, so values are found wherever they are printed.
    combined_output = stdout + stderr

    ping_values = list(
        map(float, regex_util.ExtractAllMatches(_TIMELINE_PING,
                                                combined_output)))
    pong_values = list(
        map(float, regex_util.ExtractAllMatches(_TIMELINE_PONG,
                                                combined_output)))
    return list(zip(ping_values, pong_values))
def ParseCsvResultsIntoMetadata(vm: virtual_machine.BaseVirtualMachine,
                                path: str) -> List[Dict[str, Any]]:
  """Reads a cloud harmony CSV from the VM and converts it into metadata.

  The file at `path` on the VM is read with `cat`, and its contents are handed
  to ParseCsvResultsFromString, which produces the per-row metadata.

  Args:
     vm: the Virtual Machine that has run a cloud harmony benchmark
     path: The path inside of VM which has the CSV file which should be loaded
  Returns:
     A list of metadata outputs that should be appended to the samples that are
     produced by a cloud harmony benchmark.
  """
  read_command = f'cat {path}'
  # RemoteCommand yields a pair; only the first element (the file contents)
  # is needed here.
  csv_string, _ = vm.RemoteCommand(read_command)
  parsed_metadata = ParseCsvResultsFromString(csv_string)
  return parsed_metadata