Example #1
def test_create_run_official(mocker, kubernetes_api_client_node_port):
    """Create a run using one of the official benchmark images."""
    # Stub out kubeconfig loading and the executor that issues the HTTP
    # request, so no real cluster or dashboard is needed.
    mocker.patch('kubernetes.config.load_kube_config')
    rg = mocker.patch('concurrent.futures.ThreadPoolExecutor')
    rg.return_value.submit.return_value.result.return_value.json.return_value = "a"

    client = ApiClient(in_cluster=False)

    result = client.create_run("test_run",
                               5,
                               num_cpus=4.1,
                               max_bandwidth=10000,
                               image='PyTorch Cifar-10 ResNet-20 Open-MPI')

    assert result is not None
    assert result.result().json() == "a"
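The fixture kubernetes_api_client_node_port used by these tests comes from the project's conftest.py and is not shown here. A minimal sketch of what such a fixture might look like, assuming it only has to keep ApiClient from contacting a real cluster (the patched names below are an assumption, not the project's actual fixture):

import pytest

@pytest.fixture
def kubernetes_api_client_node_port(mocker):
    # Illustrative stand-in: stub the Kubernetes client objects that ApiClient
    # would otherwise query to resolve the dashboard's NodePort URL.
    mocker.patch("kubernetes.client.CoreV1Api")
    mocker.patch("kubernetes.config.load_kube_config")
    yield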
Example #2
def test_create_run_custom(mocker, kubernetes_api_client_node_port):
    """Create a run using a custom worker image and command."""
    # Same mocking setup as above: no real cluster or dashboard is contacted.
    mocker.patch('kubernetes.config.load_kube_config')
    rg = mocker.patch('concurrent.futures.ThreadPoolExecutor')
    rg.return_value.submit.return_value.result.return_value.json.return_value = "a"

    client = ApiClient(in_cluster=False)

    result = client.create_run(
        "test_run",
        5,
        num_cpus=4.1,
        max_bandwidth=10000,
        custom_image_name="localhost:5000/mlbench_worker:latest",
        custom_image_command="/.openmpi/bin/mpirun /app/main.py",
        custom_image_all_nodes=False)

    assert result is not None
    assert result.result().json() == "a"
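Both tests treat the return value of create_run as a concurrent.futures.Future wrapping the HTTP response, which is why they unwrap it with result.result().json(). Outside a test, a caller would block on the future the same way; a small illustrative sketch reusing the arguments from Example #1:

client = ApiClient(in_cluster=False)
future = client.create_run("test_run",
                           5,
                           num_cpus=4.1,
                           max_bandwidth=10000,
                           image='PyTorch Cifar-10 ResNet-20 Open-MPI')
response = future.result()    # wait for the POST issued by the executor thread
print(response.json())        # JSON payload returned by the dashboard API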
Example #3
import json
import os
import pickle
from pathlib import Path

import click

# ApiClient, MLBENCH_IMAGES, MLBENCH_BACKENDS and setup_client_from_config are
# provided by the surrounding mlbench CLI module and are not shown in this snippet.


def run(name, num_workers, gpu, num_cpus, light, dashboard_url):
    """Start a new run for a benchmark image."""
    current_run_inputs = {}

    last_run_inputs_dir_location = os.path.join(os.environ["HOME"], ".local",
                                                "share", "mlbench")
    Path(last_run_inputs_dir_location).mkdir(parents=True, exist_ok=True)

    last_run_inputs_file_location = os.path.join(last_run_inputs_dir_location,
                                                 "last_run_inputs.pkl")

    try:
        with open(last_run_inputs_file_location, "rb") as f:
            last_run_inputs = pickle.load(f)
    except FileNotFoundError:
        last_run_inputs = {}

    images = list(MLBENCH_IMAGES.keys())

    text_prompt = "Benchmark: \n\n"

    text_prompt += "\n".join("[{}]\t{}".format(i, t)
                             for i, t in enumerate(images))
    text_prompt += "\n[{}]\tCustom Image".format(len(images))

    text_prompt += "\n\nSelection"

    selection = click.prompt(
        text_prompt,
        type=click.IntRange(0, len(images)),
        default=last_run_inputs.get("benchmark", 0),
    )
    current_run_inputs["benchmark"] = selection

    if selection == len(images):
        # run custom image
        image = click.prompt("Image",
                             type=str,
                             default=last_run_inputs.get("image", None))
        current_run_inputs["image"] = image
        image_command = click.prompt("Command",
                                     type=str,
                                     default=last_run_inputs.get(
                                         "image_command", None))
        current_run_inputs["image_command"] = image_command
        benchmark = {
            "custom_image_name": image,
            "custom_image_command": image_command,
        }
    else:
        benchmark = {"image": images[selection]}

    # Backend Prompt
    text_prompt = "Backend: \n\n"
    text_prompt += "\n".join("[{}]\t{}".format(i, t)
                             for i, t in enumerate(MLBENCH_BACKENDS))
    text_prompt += "\n[{}]\tCustom Backend".format(len(MLBENCH_BACKENDS))
    text_prompt += "\n\nSelection"

    selection = click.prompt(
        text_prompt,
        type=click.IntRange(0, len(MLBENCH_BACKENDS)),
        default=last_run_inputs.get("backend", 0),
    )
    current_run_inputs["backend"] = selection

    if selection == len(MLBENCH_BACKENDS):
        backend = click.prompt("Backend",
                               type=str,
                               default=last_run_inputs.get(
                                   "custom_backend", None))
        current_run_inputs["custom_backend"] = backend
        run_on_all = click.confirm(
            "Run command on all nodes (otherwise just first node)",
            default=last_run_inputs.get("run_on_all", None),
        )
        current_run_inputs["run_on_all"] = run_on_all
        benchmark["custom_backend"] = backend
        benchmark["run_all_nodes"] = run_on_all
    else:
        benchmark["backend"] = MLBENCH_BACKENDS[selection]

    with open(last_run_inputs_file_location, "wb") as f:
        pickle.dump(current_run_inputs, f)

    benchmark["gpu_enabled"] = gpu
    benchmark["light_target"] = light
    benchmark["num_cpus"] = num_cpus - 1

    loaded = setup_client_from_config()

    client = ApiClient(in_cluster=False,
                       url=dashboard_url,
                       load_config=not loaded)

    results = []

    for num_w in num_workers:
        current_name = "{}-{}".format(name, num_w)

        res = client.create_run(current_name, num_w, **benchmark)
        results.append(res)

    for res in results:
        act_result = res.result()
        if act_result.status_code > 201:
            try:
                click.echo("Couldn't start run: {}".format(
                    act_result.json()["message"]))
            except json.JSONDecodeError:
                click.echo(str(act_result.text))
                click.echo("Couldn't start run: Status {} for request".format(
                    act_result.status_code))
            return

        click.echo("Run started with name {}".format(
            act_result.json()["name"]))