Ejemplo n.º 1
0
def run(rank, size, run_id):
    """Distributed synchronous SGD example.

    Trains a ``Net`` for 10 epochs over the data returned by
    ``partition_dataset()``. ``average_gradients(model)`` is called between
    the backward pass and the optimizer step of every batch. After each
    epoch the mean batch loss is logged at debug level and posted through
    the API client.

    Args:
        rank: Rank of this worker; used in the name of the posted metric.
        size: Not referenced in the body.
        run_id: Id of the run the metric is posted for.
    """
    torch.manual_seed(1234)
    train_set, bsz = partition_dataset()
    model = Net()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

    api_client = ApiClient(
        in_cluster=True,
        k8s_namespace='default',
        label_selector='component=master,app=mlbench',
    )

    num_batches = ceil(len(train_set.dataset) / float(bsz))
    metric_name = "Rank {} loss".format(rank)

    for epoch in range(10):
        running_loss = 0.0
        for inputs, labels in train_set:
            optimizer.zero_grad()
            batch_loss = F.nll_loss(model(inputs), labels)
            running_loss += batch_loss.data.item()
            batch_loss.backward()
            average_gradients(model)
            optimizer.step()

        mean_loss = running_loss / num_batches
        logging.debug('Rank %s, epoch %s: %s', dist.get_rank(), epoch,
                      mean_loss)

        api_client.post_metric(run_id, metric_name, mean_loss)
Ejemplo n.º 2
0
def log_metrics(run_id, rank, epoch, metric_name, value, tracker=None, time=None):
    """Log a metric to the mlbench master/dashboard and, optionally, a tracker.

    The metric is posted through ``ApiClient`` only when the environment
    variable ``MLBENCH_IN_DOCKER`` is unset. Independently of that, a record
    is appended to ``tracker.records`` when both ``tracker`` and ``time`` are
    truthy.

    Args:
        run_id (str): The id of the current run
        rank (int): The rank of the current worker
        epoch (int): The current epoch
        metric_name (str): The name of the metric to save; the rank is
            appended as ``"<metric_name> @ <rank>"``
        value (Any): The metric value
        tracker: Object with a ``records`` list to append to (optional)
        time: Time value stored (as a string) in the tracker record (optional)
    """
    qualified_name = "{} @ {}".format(metric_name, rank)
    metadata = "{{rank: {}, epoch:{}}}".format(rank, epoch)

    # An unset MLBENCH_IN_DOCKER is treated as "running inside the cluster".
    if os.getenv("MLBENCH_IN_DOCKER") is None:
        ApiClient().post_metric(
            run_id,
            qualified_name,
            value,
            metadata=metadata,
        )

    if not (tracker and time):
        return

    record = {
        "run_id": run_id,
        "name": qualified_name,
        "cumulative": True,
        "date": str(datetime.datetime.now()),
        "time": str(time),
        "value": str(value),
        "metadata": metadata,
    }
    tracker.records.append(record)
Ejemplo n.º 3
0
def test_get_run(mocker, kubernetes_api_client_node_port):
    """get_run returns an object whose result().json() is the stubbed value."""
    mocker.patch("kubernetes.config.load_kube_config")
    executor_cls = mocker.patch("concurrent.futures.ThreadPoolExecutor")
    submitted = executor_cls.return_value.submit.return_value
    submitted.result.return_value.json.return_value = "a"

    api_client = ApiClient(in_cluster=False, service_name="rel-mlbench-master")
    pending = api_client.get_run("1")

    assert pending is not None
    payload = pending.result().json()
    assert payload == "a"
Ejemplo n.º 4
0
def test_post_metrics(mocker, kubernetes_api_client_node_port):
    """post_metric returns an object whose result().json() is the stubbed value."""
    mocker.patch("kubernetes.config.load_kube_config")
    executor_cls = mocker.patch("concurrent.futures.ThreadPoolExecutor")
    submitted = executor_cls.return_value.submit.return_value
    submitted.result.return_value.json.return_value = "a"

    api_client = ApiClient(in_cluster=False)
    pending = api_client.post_metric("1", "loss", 10.0, cumulative=False)

    assert pending is not None
    payload = pending.result().json()
    assert payload == "a"
Ejemplo n.º 5
0
def test_get_worker_pods(mocker, kubernetes_api_client_node_port):
    """get_worker_pods returns an object whose result().json() is the stubbed value."""
    mocker.patch("kubernetes.config.load_kube_config")
    executor_cls = mocker.patch("concurrent.futures.ThreadPoolExecutor")
    submitted = executor_cls.return_value.submit.return_value
    submitted.result.return_value.json.return_value = "a"

    api_client = ApiClient(in_cluster=False)
    pending = api_client.get_worker_pods()

    assert pending is not None
    payload = pending.result().json()
    assert payload == "a"
Ejemplo n.º 6
0
def status(name, dashboard_url):
    """Get the status of a benchmark run, or all runs if no name is given"""
    # The docstring above is kept verbatim: it may double as CLI help text.
    loaded = setup_client_from_config()

    client = ApiClient(
        in_cluster=False,
        url=dashboard_url,
        load_config=not loaded,
    )

    runs = client.get_runs().result().json()

    if name is None:
        # No name given: print every run as one table, minus the job fields.
        for entry in runs:
            del entry["job_id"]
            del entry["job_metadata"]

        click.echo(tabulate(runs, headers="keys"))
        return

    # Pick the first run whose name matches exactly.
    for candidate in runs:
        if candidate["name"] == name:
            run = candidate
            break
    else:
        click.echo("Run not found")
        return

    del run["job_id"]
    del run["job_metadata"]

    click.echo(tabulate([run], headers="keys"))

    # (metric key, message when a value exists, message when it does not)
    reports = (
        (
            "val_global_loss @ 0",
            "Current Global Loss: {0:.2f} ({1})",
            "No Validation Loss Data yet",
        ),
        (
            "val_global_Prec@1 @ 0",
            "Current Global Precision: {0:.2f} ({1})",
            "No Validation Precision Data yet",
        ),
    )

    # Issue both requests first, then resolve them, then report.
    pending = [
        client.get_run_metrics(run["id"], metric_filter=key, last_n=1)
        for key, _, _ in reports
    ]
    responses = [request.result() for request in pending]

    for (key, found_template, missing_message), response in zip(reports,
                                                                responses):
        if response.status_code < 300 and key in response.json():
            val = response.json()[key][0]
            click.echo(found_template.format(float(val["value"]),
                                             val["date"]))
        else:
            click.echo(missing_message)
Ejemplo n.º 7
0
def test_get_pod_metrics(mocker, kubernetes_api_client_node_port):
    """get_pod_metrics returns an object whose result().json() is the stubbed value."""
    mocker.patch("kubernetes.config.load_kube_config")
    executor_cls = mocker.patch("concurrent.futures.ThreadPoolExecutor")
    submitted = executor_cls.return_value.submit.return_value
    submitted.result.return_value.json.return_value = "a"

    api_client = ApiClient(in_cluster=False)
    pending = api_client.get_pod_metrics(
        "rel-mlbench-worker-0",
        since=datetime.datetime.now(),
        summarize=100,
    )

    assert pending is not None
    payload = pending.result().json()
    assert payload == "a"
Ejemplo n.º 8
0
def test_create_run_official(mocker, kubernetes_api_client_node_port):
    """create_run with a named image returns an object exposing the stubbed JSON."""
    mocker.patch("kubernetes.config.load_kube_config")
    executor_cls = mocker.patch("concurrent.futures.ThreadPoolExecutor")
    submitted = executor_cls.return_value.submit.return_value
    submitted.result.return_value.json.return_value = "a"

    api_client = ApiClient(in_cluster=False)
    run_options = {
        "num_cpus": 4.1,
        "max_bandwidth": 10000,
        "image": "PyTorch Cifar-10 ResNet-20 Open-MPI",
    }
    pending = api_client.create_run("test_run", 5, **run_options)

    assert pending is not None
    payload = pending.result().json()
    assert payload == "a"
Ejemplo n.º 9
0
def test_instantiation_incluster(mocker, kubernetes_api_client_incluster):
    """An in-cluster client can be built and reports the expected endpoint."""
    mocker.patch("kubernetes.config.load_incluster_config")
    expected_endpoint = "http://1.1.1.1:80/api/"

    api_client = ApiClient(in_cluster=True)

    assert api_client is not None
    assert api_client.endpoint == expected_endpoint
Ejemplo n.º 10
0
def test_instantiation_loadbalancer(mocker,
                                    kubernetes_api_client_loadbalancer):
    """A client built for a named service reports the expected endpoint."""
    mocker.patch("kubernetes.config.load_kube_config")
    expected_endpoint = "http://1.1.1.1:12345/api/"

    api_client = ApiClient(in_cluster=False, service_name="rel-mlbench-master")

    assert api_client is not None
    assert api_client.endpoint == expected_endpoint
Ejemplo n.º 11
0
class LogMetrics(object):
    """Posts metric values through a class-level API client.

    ``in_cluster`` is evaluated once, when the class body runs, from the
    ``KUBERNETES_SERVICE_HOST`` environment variable. The ``api`` attribute
    only exists when ``in_cluster`` is true.
    """

    in_cluster = "KUBERNETES_SERVICE_HOST" in os.environ

    if in_cluster:
        api = ApiClient()

    @staticmethod
    def log(run_id, rank, epoch, metric_name, value, tracker=None, time=None):
        """Post a metric and, if requested, append it to a tracker.

        Does nothing at all (including the tracker update) when
        ``LogMetrics.in_cluster`` is false. The tracker record is appended
        only when both ``tracker`` and ``time`` are truthy.
        """
        if not LogMetrics.in_cluster:
            return

        qualified_name = "{} @ {}".format(metric_name, rank)
        metadata = "{{rank: {}, epoch:{}}}".format(rank, epoch)

        LogMetrics.api.post_metric(
            run_id,
            qualified_name,
            value,
            metadata=metadata,
        )

        if not (tracker and time):
            return

        record = {
            "run_id": run_id,
            "name": qualified_name,
            "cumulative": True,
            "date": str(datetime.datetime.now()),
            "time": str(time),
            "value": str(value),
            "metadata": metadata,
        }
        tracker.records.append(record)
Ejemplo n.º 12
0
def test_instantiation_nodeport_internal(
        mocker, kubernetes_api_client_node_port_internal):
    """An out-of-cluster client can be built and reports the expected endpoint."""
    mocker.patch("kubernetes.config.load_kube_config")
    expected_endpoint = "http://1.1.1.1:12345/api/"

    api_client = ApiClient(in_cluster=False)

    assert api_client is not None
    assert api_client.endpoint == expected_endpoint
Ejemplo n.º 13
0
def test_instantiation_loadbalancer(mocker,
                                    kubernetes_api_client_loadbalancer):
    """An out-of-cluster client can be built and reports the expected endpoint."""
    mocker.patch("kubernetes.config.load_kube_config")
    expected_endpoint = "http://1.1.1.1:12345/api/"

    api_client = ApiClient(in_cluster=False)

    assert api_client is not None
    assert api_client.endpoint == expected_endpoint
Ejemplo n.º 14
0
def download(name, output, dashboard_url):
    """Download the results of a benchmark run"""
    # The docstring above is kept verbatim: it may double as CLI help text.
    #
    # name:          exact name of the run whose metrics are downloaded
    # output:        path of the file the downloaded bytes are written to
    # dashboard_url: passed to ApiClient as ``url``
    loaded = setup_client_from_config()

    client = ApiClient(in_cluster=False,
                       url=dashboard_url,
                       load_config=not loaded)

    runs = client.get_runs().result().json()

    # An unknown name used to escape as a bare StopIteration; report it the
    # same way the other run commands do.
    run = next((r for r in runs if r["name"] == name), None)
    if run is None:
        click.echo("Run not found")
        return

    # Resolve the download before opening the file so that a failed request
    # does not leave an empty output file behind.
    content = client.download_run_metrics(run["id"]).result().content

    with open(output, "wb") as f:
        f.write(content)
Ejemplo n.º 15
0
def test_create_run_custom(mocker, kubernetes_api_client_node_port):
    """create_run with a custom image returns an object exposing the stubbed JSON."""
    mocker.patch("kubernetes.config.load_kube_config")
    executor_cls = mocker.patch("concurrent.futures.ThreadPoolExecutor")
    submitted = executor_cls.return_value.submit.return_value
    submitted.result.return_value.json.return_value = "a"

    api_client = ApiClient(in_cluster=False)
    run_options = {
        "num_cpus": 4.1,
        "max_bandwidth": 10000,
        "custom_image_name": "localhost:5000/mlbench_worker:latest",
        "custom_image_command": "/.openmpi/bin/mpirun /app/main.py",
        "custom_image_all_nodes": False,
    }
    pending = api_client.create_run("test_run", 5, **run_options)

    assert pending is not None
    payload = pending.result().json()
    assert payload == "a"
Ejemplo n.º 16
0
class LogMetrics(object):
    """Writes metric values to the Dashboard API and to Trackers.

    The API client is created once at class level and reused for every call.
    """

    # Evaluated once, when the class body runs.
    in_cluster = os.getenv('KUBERNETES_SERVICE_HOST') is not None

    # ``api`` is only defined when running inside a cluster.
    if in_cluster:
        api = ApiClient()

    @staticmethod
    def log(run_id, rank, epoch, metric_name, value, tracker=None, time=None):
        """Log a metric to the Metrics API.

        Nothing is logged (neither to the API nor to the tracker) outside of
        a cluster.

        Args:
            run_id (str): The id of the run in the dashboard
            rank (int): Rank of the current worker node
            epoch (float): The current epoch (fractional)
            metric_name (str): The name of the metric; posted as
                ``"<metric_name> @ <rank>"``
            value (float / int / str): The metric value to write
            tracker(:obj:`mlbench_core.utils.Tracker`): The value Tracker;
                a record is appended only if ``tracker`` and ``time`` are
                both truthy
            time (float): The current time (used for Tracker)

        """
        if not LogMetrics.in_cluster:
            return

        tagged_name = "{} @ {}".format(metric_name, rank)
        meta = "{{rank: {}, epoch:{}}}".format(rank, epoch)

        LogMetrics.api.post_metric(run_id, tagged_name, value, metadata=meta)

        if tracker and time:
            entry = dict(run_id=run_id, name=tagged_name, cumulative=True)
            entry["date"] = str(datetime.datetime.now())
            entry["time"] = str(time)
            entry["value"] = str(value)
            entry["metadata"] = meta
            tracker.records.append(entry)
Ejemplo n.º 17
0
def get_dashboard_url():
    """Returns the dashboard URL of the current cluster"""
    # The docstring above is kept verbatim: it may double as CLI help text.
    loaded = setup_client_from_config()

    if not loaded:
        click.echo("No Cluster config found")
        return

    client = ApiClient(in_cluster=False, load_config=False)

    # Strip only the trailing "api/" path segment. A blanket
    # str.replace('api/', '') would also eat an "api/" that happens to occur
    # earlier in the URL (e.g. a host name ending in "api").
    endpoint = client.endpoint
    suffix = 'api/'
    if endpoint.endswith(suffix):
        endpoint = endpoint[:-len(suffix)]

    click.echo(endpoint)
Ejemplo n.º 18
0
def delete(name, dashboard_url):
    """Delete a benchmark run"""
    # The docstring above is kept verbatim: it may double as CLI help text.
    #
    # name:          exact name of the run to delete
    # dashboard_url: passed to ApiClient as ``url``
    loaded = setup_client_from_config()

    client = ApiClient(in_cluster=False,
                       url=dashboard_url,
                       load_config=not loaded)

    runs = client.get_runs().result().json()

    run = next((r for r in runs if r["name"] == name), None)
    if run is None:
        click.echo("Run not found")
        return

    # Only the id is needed. The former `del run["job_id"]` /
    # `del run["job_metadata"]` statements had no effect on the request and
    # raised KeyError when a key was absent, so they are gone.
    client.delete_run(run["id"])
Ejemplo n.º 19
0
def status(name, dashboard_url):
    """Get the status of a benchmark run"""
    # The docstring above is kept verbatim: it may double as CLI help text.
    loaded = setup_client_from_config()

    client = ApiClient(in_cluster=False, url=dashboard_url,
                       load_config=not loaded)

    runs = client.get_runs().result().json()

    # Pick the first run whose name matches exactly.
    for candidate in runs:
        if candidate['name'] == name:
            run = candidate
            break
    else:
        click.echo('Run not found')
        return

    for hidden_key in ('job_id', 'job_metadata'):
        del run[hidden_key]

    click.echo(tabulate([run], headers='keys'))

    loss_key = 'val_global_loss @ 0'
    prec_key = 'val_global_Prec@1 @ 0'

    # Both requests are issued before either one is resolved.
    loss_request = client.get_run_metrics(run['id'], metric_filter=loss_key,
                                          last_n=1)
    prec_request = client.get_run_metrics(run['id'], metric_filter=prec_key,
                                          last_n=1)
    loss_response = loss_request.result()
    prec_response = prec_request.result()

    def _report(response, key, found_template, missing_message):
        """Echo the first value stored under ``key``, or the fallback text."""
        if response.status_code < 300 and key in response.json():
            val = response.json()[key][0]
            click.echo(found_template.format(float(val['value']),
                                             val['date']))
        else:
            click.echo(missing_message)

    _report(loss_response, loss_key,
            "Current Global Loss: {0:.2f} ({1})",
            "No Validation Loss Data yet")
    _report(prec_response, prec_key,
            "Current Global Precision: {0:.2f} ({1})",
            "No Validation Precision Data yet")
Ejemplo n.º 20
0
def charts(folder, filter, dashboard_url):
    """Chart the results of benchmark runs

    Save generated charts in FOLDER
    """
    # The docstring above is kept verbatim: it may double as CLI help text.
    #
    # folder:        directory the PNG charts are written to (created if
    #                missing)
    # filter:        optional substring; only runs whose name contains it are
    #                offered for selection
    # dashboard_url: passed to ApiClient as ``url``
    folder = Path(folder)
    folder.mkdir(parents=True, exist_ok=True)
    loaded = setup_client_from_config()

    client = ApiClient(in_cluster=False,
                       url=dashboard_url,
                       load_config=not loaded)

    ret = client.get_runs()
    runs = ret.result().json()
    runs = [r for r in runs if r["state"] == "finished"]

    if filter:
        runs = [r for r in runs if filter in r["name"]]

    options = dict(enumerate(runs))

    if len(options) < 2:
        click.echo("At least two finished runs are needed to create a summary")
        return

    options["all"] = {"name": "*all runs*"}

    prompt = 'Select the runs to generate a summary for (e.g. "0 1 2"): \n\t{}'.format(
        "\n\t".join("{} [{}]".format(r["name"], i)
                    for i, r in options.items()))

    def _parse_selection(raw):
        """Convert the prompt answer into a list of run dicts.

        Raises click.UsageError for anything that is not a listed index, so
        click shows the error and asks again instead of crashing.
        """
        # The default is an int; user input is a str. Normalise both.
        text = str(raw)
        if "all" in text:
            return runs

        selected = []
        # split() without an argument tolerates repeated/trailing blanks.
        for token in text.split():
            try:
                selected.append(options[int(token)])
            except (ValueError, KeyError):
                raise click.UsageError(
                    "Invalid run selection: {!r}".format(token))
        return selected

    # No `type=` here: click ignores it whenever value_proc is supplied, and
    # the former click.Choice([options.keys()]) was malformed anyway.
    choice = click.prompt(
        prompt,
        default=0,
        show_choices=False,
        value_proc=_parse_selection,
    )

    # Depending on the click version, an accepted default may be returned
    # as-is instead of being passed through value_proc.
    if not isinstance(choice, list):
        choice = _parse_selection(choice)

    if len(choice) < 2:
        click.echo("At least two finished runs are needed to create a summary")
        return

    results = []

    def _get_metric(name, run):
        """Gets a metric from the dashboard."""
        name = "global_cum_{} @ 0".format(name)
        return float(
            client.get_run_metrics(run["id"], metric_filter=name,
                                   last_n=1).result().json()[name][0]["value"])

    for run in choice:
        agg = _get_metric("agg", run)
        backprop = _get_metric("backprop", run)
        batch_load = _get_metric("batch_load", run)
        comp_loss = _get_metric("comp_loss", run)
        comp_metrics = _get_metric("comp_metrics", run)
        fwd_pass = _get_metric("fwd_pass", run)
        opt_step = _get_metric("opt_step", run)

        # With a single worker the aggregation time is counted as compute;
        # otherwise it is the communication time.
        single_worker = run["num_workers"] == 1
        compute = (fwd_pass + comp_loss + backprop + opt_step +
                   (agg if single_worker else 0))
        communicate = 0 if single_worker else agg

        results.append((
            run["name"],
            compute,
            communicate,
            comp_metrics,
            batch_load,
            # Kept as a string so it is used as a bar label below.
            str(run["num_workers"]),
        ))

    # Sort numerically: sorting the string labels would put "16" before "2"
    # and make the speedup baseline (first entry) the wrong run.
    results = sorted(results, key=lambda row: int(row[5]))
    names, compute, communicate, metrics, batch_load, num_workers = zip(
        *results)

    width = 0.35
    fig, ax = plt.subplots()

    # NOTE(review): these bars are drawn at the same x position without
    # `bottom=`, so they overlap rather than stack — confirm this is intended.
    ax.bar(num_workers, compute, width, label="Compute")
    ax.bar(num_workers, communicate, width, label="Communication")

    ax.set_ylabel("Time (s)")
    ax.set_title("Total time by number of workers")
    ax.legend()
    plt.savefig(folder / "total_time.png", dpi=150)

    fig, ax = plt.subplots()

    combined = [c + r for _, c, r, _, _, _ in results]

    # Speedup relative to the run with the fewest workers.
    speedup = [combined[0] / c for c in combined]

    ax.bar(num_workers, speedup, width)

    ax.set_ylabel("Speedup factor")
    ax.set_title("Speedup")
    plt.savefig(folder / "speedup.png", dpi=150)

    fig, ax = plt.subplots()

    ax.bar(num_workers, compute, width, label="Compute")
    ax.bar(num_workers, communicate, width, label="Communication")
    ax.bar(num_workers, metrics, width, label="Metrics Computation")
    ax.bar(num_workers, batch_load, width, label="Batch Loading")

    ax.set_ylabel("Time (s)")
    ax.set_title("Total time by number of workers")
    ax.legend()
    plt.savefig(folder / "time_for_all_phases.png", dpi=150)

    click.echo("Summary created in {}".format(folder))
Ejemplo n.º 21
0
def create_gcloud(num_workers, release, kubernetes_version, machine_type,
                  disk_size, num_cpus, num_gpus, gpu_type, zone, project,
                  preemptible, custom_value):
    # Creates a Google Cloud container cluster and deploys MLBench onto it.
    #
    # Steps, in order:
    #   1. resolve default Google credentials / project
    #   2. delete a firewall rule named "<release>-<num_workers>-firewall"
    #      if one is listed for the project
    #   3. create the cluster and poll the operation until it is done
    #   4. point the kubernetes client configuration at the new cluster
    #   5. if GPUs were requested, create the daemon set loaded from
    #      GCLOUD_NVIDIA_DAEMONSET
    #   6. create the tiller service account, role binding, service and
    #      deployment, then wait for the tiller pod to leave 'Pending'
    #   7. port-forward to tiller with kubectl and install the mlbench-helm
    #      chart
    #   8. insert a firewall rule for the MLBench client port and persist the
    #      cluster endpoint in the config
    #
    # Parameters:
    #   num_workers:   initial node count; the chart is installed with
    #                  num_workers - 1 workers
    #   release:       prefix of the cluster / release / firewall name
    #   kubernetes_version: not referenced in this function body
    #   machine_type, disk_size, preemptible: forwarded to NodeConfig
    #   num_cpus, num_gpus: forwarded to the chart as resource limits;
    #                  num_gpus > 0 also adds accelerators and the daemon set
    #   gpu_type:      accelerator type used when num_gpus > 0
    #   zone, project: used to build the parent path; project falls back to
    #                  the default project of the credentials
    #   custom_value:  iterable of "dotted.key=value" strings merged into the
    #                  chart values
    #
    # Raises ValueError when the create operation does not end in DONE.
    #
    # NOTE: a docstring is deliberately not added here in case this function
    # is registered as a CLI command, where it would become help text.
    from google.cloud import container_v1
    import google.auth
    from googleapiclient import discovery, http

    credentials, default_project = google.auth.default()

    if not project:
        project = default_project

    # create cluster
    gclient = container_v1.ClusterManagerClient()

    name = '{}-{}'.format(release, num_workers)
    # NOTE(review): name_path already ends with '/', and the later
    # `name_path + '/' + ...` expressions therefore contain '//' — confirm
    # the API accepts this.
    name_path = 'projects/{}/locations/{}/'.format(project, zone)

    # Optional NodeConfig keyword arguments (only accelerators, if any).
    extraargs = {}

    if num_gpus > 0:
        extraargs['accelerators'] = [container_v1.types.AcceleratorConfig(
            accelerator_count=num_gpus, accelerator_type=gpu_type)]

    # delete existing firewall, if any
    firewalls = discovery.build(
        'compute', 'v1', cache_discovery=False).firewalls()

    existing_firewalls = firewalls.list(project=project).execute()
    fw_name = '{}-firewall'.format(name)

    if any(f['name'] == fw_name for f in existing_firewalls['items']):
        response = {}
        # Retry the delete until it yields an object with a `status`
        # attribute, or until the API answers 404 (rule already gone).
        # NOTE(review): a plain dict never has a `status` attribute, so if
        # execute() returns a dict this loop only ends via the 404 branch —
        # confirm the return type.
        while not hasattr(response, 'status'):
            try:
                response = firewalls.delete(
                    project=project, firewall=fw_name).execute()
            except http.HttpError as e:
                if e.resp.status == 404:
                    response = {}
                    break
                click.echo("Wait for firewall to be available for deletion")
                sleep(5)
                response = {}
        # Poll the delete operation until it reports DONE.
        while hasattr(response, 'status') and response.status < response.DONE:
            response = gclient.get_operation(
                None, None, None, name=response.selfLink)
            sleep(1)

    # create cluster
    # Add-ons: load balancing, pod autoscaling and the dashboard are
    # disabled, the network policy config is enabled; logging and monitoring
    # services are passed as None.
    cluster = container_v1.types.Cluster(
            name=name,
            initial_node_count=num_workers,
            node_config=container_v1.types.NodeConfig(
                machine_type=machine_type,
                disk_size_gb=disk_size,
                preemptible=preemptible,
                oauth_scopes=[
                    'https://www.googleapis.com/auth/devstorage.full_control',
                ],
                **extraargs
            ),
            addons_config=container_v1.types.AddonsConfig(
                http_load_balancing=container_v1.types.HttpLoadBalancing(
                    disabled=True,
                ),
                horizontal_pod_autoscaling=
                    container_v1.types.HorizontalPodAutoscaling(
                        disabled=True,
                    ),
                kubernetes_dashboard=container_v1.types.KubernetesDashboard(
                    disabled=True,
                ),
                network_policy_config=container_v1.types.NetworkPolicyConfig(
                    disabled=False,
                ),
            ),
            logging_service=None,
            monitoring_service=None
        )
    response = gclient.create_cluster(None, None, cluster, parent=name_path)

    # wait for cluster to load
    while response.status < response.DONE:
        response = gclient.get_operation(
            None, None, None, name=name_path + '/' + response.name)
        sleep(1)

    if response.status != response.DONE:
        raise ValueError('Cluster creation failed!')

    # From here on `cluster` is the created cluster as returned by the API,
    # no longer the request object built above.
    cluster = gclient.get_cluster(
        None, None, None, name=name_path + '/' + name)

    # NOTE(review): only `import google.auth` is visible in this function;
    # google.auth.transport.requests must be imported elsewhere — confirm.
    auth_req = google.auth.transport.requests.Request()
    credentials.refresh(auth_req)
    # `client` is a name from the enclosing module (presumably
    # kubernetes.client — confirm). The configuration below becomes its
    # default: cluster endpoint, bearer token, TLS verification disabled.
    configuration = client.Configuration()
    configuration.host = f'https://{cluster.endpoint}:443'
    configuration.verify_ssl = False
    configuration.api_key = {'authorization': 'Bearer ' + credentials.token}
    client.Configuration.set_default(configuration)

    if num_gpus > 0:
        with request.urlopen(GCLOUD_NVIDIA_DAEMONSET) as r:
            dep = yaml.safe_load(r)
            # Set the selector to the template's own labels before creating
            # the daemon set through AppsV1Api.
            dep['spec']['selector'] = {
                'matchLabels': dep['spec']['template']['metadata']['labels']
                }
            # Uses ApiClient's name-mangled private __deserialize method to
            # turn the dict into a V1DaemonSet.
            dep = client.ApiClient()._ApiClient__deserialize(dep, 'V1DaemonSet')
            k8s_client = client.AppsV1Api()
            k8s_client.create_namespaced_daemon_set('kube-system', body=dep)

    # create tiller service account
    client.CoreV1Api().create_namespaced_service_account(
        'kube-system',
        {
            'apiVersion': 'v1',
            'kind': 'ServiceAccount',
            'metadata': {
                'name': 'tiller',
                'generateName': 'tiller',
                'namespace': 'kube-system',
            },
        })

    # Bind the tiller service account to the cluster-admin cluster role.
    client.RbacAuthorizationV1beta1Api().create_cluster_role_binding(
        {
            'apiVersion': 'rbac.authorization.k8s.io/v1beta1',
            'kind': 'ClusterRoleBinding',
            'metadata': {
                'name': 'tiller'
            },
            'roleRef': {
                'apiGroup': 'rbac.authorization.k8s.io',
                'kind': 'ClusterRole',
                'name': 'cluster-admin'
            },
            'subjects': [
                {
                    'kind': 'ServiceAccount',
                    'name': 'tiller',
                    'namespace': 'kube-system'
                }
            ]
        })

    # deploy tiller
    tiller_service = yaml.safe_load(TILLER_MANIFEST_SERVICE)
    tiller_dep = yaml.safe_load(TILLER_MANIFEST_DEPLOYMENT)
    client.CoreV1Api().create_namespaced_service(
        'kube-system',
        tiller_service)
    client.ExtensionsV1beta1Api().create_namespaced_deployment(
        'kube-system',
        tiller_dep)

    sleep(1)

    pods = client.CoreV1Api().list_namespaced_pod(
        namespace='kube-system',
        label_selector='app=helm'
    )

    # NOTE(review): indexing [0] raises IndexError if no pod with the label
    # is listed yet after the one-second sleep — confirm this cannot happen.
    tiller_pod = pods.items[0]

    while True:
        # Wait for tiller
        resp = client.CoreV1Api().read_namespaced_pod(
            namespace='kube-system',
            name=tiller_pod.metadata.name
        )
        # Any phase other than 'Pending' ends the wait.
        if resp.status.phase != 'Pending':
            break
        sleep(5)

    # kubernetes python doesn't currently support port forward
    # https://github.com/kubernetes-client/python/issues/166
    ports = 44134

    # resp = stream(
    #     client.CoreV1Api().connect_get_namespaced_pod_portforward,
    #     name=tiller_pod.metadata.name,
    #     namespace=tiller_pod.metadata.namespace,
    #     ports=ports
    #     )

    # Forward local port 44134 to the tiller pod with a kubectl subprocess;
    # the bearer token is passed on the command line and TLS verification is
    # skipped.
    with subprocess.Popen([
            'kubectl',
            'port-forward',
            '--namespace={}'.format(tiller_pod.metadata.namespace),
            tiller_pod.metadata.name, '{0}:{0}'.format(ports),
            '--server={}'.format(configuration.host),
            '--token={}'.format(credentials.token),
            '--insecure-skip-tls-verify=true']) as portforward:

        # Fixed pause before talking to tiller through the forwarded port.
        sleep(5)
        # install chart
        tiller = Tiller('localhost')
        chart = ChartBuilder(
            {
                'name': 'mlbench-helm',
                'source': {
                    'type': 'git',
                    'location': 'https://github.com/mlbench/mlbench-helm'
                }})

        values = {
            'limits': {
                'workers': num_workers - 1,
                'gpu': num_gpus,
                'cpu': num_cpus
            }
        }

        if custom_value:
            # merge custom values with values
            for cv in custom_value:
                # Split on the first '=' only, so the value may contain '='.
                key, v = cv.split("=", 1)

                current = values
                key_path = key.split(".")

                # Walk (and create) the nested dicts named by the dotted key.
                for k in key_path[:-1]:
                    if k not in current:
                        current[k] = {}

                    current = current[k]

                # The value is stored as the raw string from the argument.
                current[key_path[-1]] = v

        tiller.install_release(
            chart.get_helm_chart(),
            name=name,
            wait=True,
            dry_run=False,
            namespace='default',
            values=values)

        portforward.terminate()

    # open port in firewall
    mlbench_client = ApiClient(in_cluster=False, load_config=False)
    # NOTE(review): "sourceRanges" is given as a single string here; the
    # Compute firewall resource presumably expects a list — confirm.
    firewall_body = {
        "name": fw_name,
        "direction": "INGRESS",
        "sourceRanges": "0.0.0.0/0",
        "allowed": [
            {"IPProtocol": "tcp", "ports": [mlbench_client.port]}
        ]
    }

    firewalls.insert(project=project, body=firewall_body).execute()

    # Persist provider and cluster endpoint in the config.
    config = get_config()

    config.set('general', 'provider', 'gke')

    config.set('gke', 'cluster', cluster.endpoint)

    write_config(config)

    click.echo("MLBench successfully deployed")
Ejemplo n.º 22
0
def run(name, num_workers, gpu, light, dashboard_url):
    """Start a new run for a benchmark image"""
    # The docstring above is kept verbatim: it may double as CLI help text.
    #
    # name:          prefix of the run name; each run is "<name>-<workers>"
    # num_workers:   iterable of worker counts, one run is started per entry
    # gpu, light:    forwarded as gpu_enabled / light_target
    # dashboard_url: passed to ApiClient as ``url``
    images = list(MLBENCH_IMAGES.keys())

    text_prompt = 'Benchmark: \n\n'

    text_prompt += '\n'.join(
        '[{}]\t{}'.format(i, t) for i, t in enumerate(images)
    )
    # One extra entry past the known images selects a custom image.
    text_prompt += '\n[{}]\tCustom Image'.format(len(images))

    text_prompt += '\n\nSelection'

    selection = click.prompt(
        text_prompt,
        type=click.IntRange(0, len(images)),
        default=0
    )

    if selection == len(images):
        # run custom image
        # click appends its own ': ' suffix, so the texts carry no colon.
        image = click.prompt('Image', type=str)
        image_command = click.prompt('Command', type=str)
        # click.confirm() has no `type` parameter; passing one raised
        # TypeError before the question was ever asked.
        run_on_all = click.confirm(
            'Run command on all nodes (otherwise just first node)')
        benchmark = {
            'custom_image_name': image,
            'custom_image_command': image_command,
            'custom_image_all_nodes': run_on_all
        }
    else:
        benchmark = {'image': images[selection]}

    benchmark['gpu_enabled'] = gpu
    benchmark['light_target'] = light

    loaded = setup_client_from_config()

    client = ApiClient(in_cluster=False, url=dashboard_url,
                       load_config=not loaded)

    # Submit every run first, then wait for the responses in order.
    results = [
        client.create_run('{}-{}'.format(name, num_w), num_w, **benchmark)
        for num_w in num_workers
    ]

    for res in results:
        act_result = res.result()
        if act_result.status_code > 201:
            try:
                message = act_result.json()['message']
            except (ValueError, KeyError, TypeError):
                # ValueError covers every JSON decode error flavour; the
                # other two cover a body without a usable 'message' entry.
                click.echo(str(act_result.text))
                click.echo(
                    'Couldn\'t start run: Status {} for request'.format(
                        act_result.status_code))
            else:
                click.echo('Couldn\'t start run: {}'.format(message))
            return

        click.echo('Run started with name {}'.format(
            act_result.json()['name']))
Ejemplo n.º 23
0
def test_instantiation(mocker, kubernetes_api_client_node_port):
    """The client works as a context manager and reports the expected endpoint."""
    mocker.patch("kubernetes.config.load_kube_config")
    expected_endpoint = "http://1.1.1.1:12345/api/"

    with ApiClient(in_cluster=False) as api_client:
        assert api_client is not None
        assert api_client.endpoint == expected_endpoint
Ejemplo n.º 24
0
def run(name, num_workers, gpu, light, dashboard_url):
    """Start a new run for a benchmark image"""
    # The docstring above is kept verbatim: it may double as CLI help text.
    #
    # name:          prefix of the run name; each run is "<name>-<workers>"
    # num_workers:   iterable of worker counts, one run is started per entry
    # gpu, light:    forwarded as gpu_enabled / light_target
    # dashboard_url: passed to ApiClient as ``url``
    images = list(MLBENCH_IMAGES.keys())

    text_prompt = "Benchmark: \n\n"

    text_prompt += "\n".join("[{}]\t{}".format(i, t)
                             for i, t in enumerate(images))
    # One extra entry past the known images selects a custom image.
    text_prompt += "\n[{}]\tCustom Image".format(len(images))

    text_prompt += "\n\nSelection"

    selection = click.prompt(text_prompt,
                             type=click.IntRange(0, len(images)),
                             default=0)

    if selection == len(images):
        # run custom image
        image = click.prompt("Image", type=str)
        image_command = click.prompt("Command", type=str)
        run_on_all = click.confirm(
            "Run command on all nodes (otherwise just first node)")
        benchmark = {
            "custom_image_name": image,
            "custom_image_command": image_command,
            "custom_image_all_nodes": run_on_all,
        }
    else:
        benchmark = {"image": images[selection]}

    benchmark["gpu_enabled"] = gpu
    benchmark["light_target"] = light

    loaded = setup_client_from_config()

    client = ApiClient(in_cluster=False,
                       url=dashboard_url,
                       load_config=not loaded)

    # Submit every run first, then wait for the responses in order.
    results = [
        client.create_run("{}-{}".format(name, num_w), num_w, **benchmark)
        for num_w in num_workers
    ]

    for res in results:
        act_result = res.result()
        if act_result.status_code > 201:
            try:
                message = act_result.json()["message"]
            except (ValueError, KeyError, TypeError):
                # ValueError covers every JSON decode error flavour; the
                # other two cover a body without a usable "message" entry.
                click.echo(str(act_result.text))
                click.echo("Couldn't start run: Status {} for request".format(
                    act_result.status_code))
            else:
                click.echo("Couldn't start run: {}".format(message))
            return

        click.echo("Run started with name {}".format(
            act_result.json()["name"]))
Ejemplo n.º 25
0
def create_gcloud(
    num_workers,
    release,
    kubernetes_version,
    machine_type,
    disk_size,
    num_cpus,
    num_gpus,
    gpu_type,
    zone,
    project,
    preemptible,
    custom_value,
):
    """Create a GKE cluster and deploy MLBench onto it through tiller.

    Steps, in order: resolve the gcloud credentials, delete a left-over
    firewall rule named ``<release>-<num_workers>-firewall``, create the
    cluster and wait for the operation, point the kubernetes client at the
    new cluster, install the NVIDIA daemonset when GPUs are requested,
    install tiller, install the ``mlbench-helm`` chart through a
    ``kubectl port-forward`` to the tiller pod, open the dashboard port in
    the firewall and store the cluster endpoint in the config.

    Args:
        num_workers: Initial node count of the cluster, at least 2. The
            chart receives ``num_workers - 1`` as its worker limit.
        release: Release name; cluster and helm release are both named
            ``<release>-<num_workers>``.
        kubernetes_version: Not used by this function.
        machine_type: Machine type of the cluster nodes.
        disk_size: Node disk size, passed as ``disk_size_gb``.
        num_cpus: CPU limit handed to the chart.
        num_gpus: Accelerators per node and GPU limit handed to the chart;
            ``0`` skips the accelerator config and the NVIDIA daemonset.
        gpu_type: Accelerator type, only used when ``num_gpus > 0``.
        zone: Location the cluster is created in.
        project: Google Cloud project; falls back to the default project
            of the credentials when falsy.
        preemptible: Whether the nodes are preemptible.
        custom_value: Optional iterable of ``dotted.key=value`` strings that
            are merged into the chart values.

    Raises:
        click.UsageError: If no gcloud credentials are found or fewer than
            two workers are requested.
        ValueError: If the cluster creation operation does not end up in
            the ``DONE`` state.
    """
    from google.cloud import container_v1
    import google.auth
    from google.auth.exceptions import DefaultCredentialsError
    from googleapiclient import discovery, http

    try:
        credentials, default_project = google.auth.default()
    except DefaultCredentialsError:
        raise click.UsageError(
            "Couldn't find gcloud credentials. Install the gcloud"
            " sdk ( https://cloud.google.com/sdk/docs/quickstart-linux ) and "
            "run 'gcloud auth application-default login' to login and create "
            "your credentials.")

    # an explicit check instead of ``assert`` so it survives ``python -O``
    if num_workers < 2:
        raise click.UsageError("Number of workers should be at least 2")

    if not project:
        project = default_project

    # create cluster
    gclient = container_v1.ClusterManagerClient()

    name = "{}-{}".format(release, num_workers)
    name_path = "projects/{}/locations/{}/".format(project, zone)

    extraargs = {}

    if num_gpus > 0:
        extraargs["accelerators"] = [
            container_v1.types.AcceleratorConfig(accelerator_count=num_gpus,
                                                 accelerator_type=gpu_type)
        ]

    # delete existing firewall, if any
    firewalls = discovery.build("compute", "v1",
                                cache_discovery=False).firewalls()

    existing_firewalls = firewalls.list(project=project).execute()
    fw_name = "{}-firewall".format(name)

    # the list response carries no "items" key when the project has no
    # firewall rules at all, so do not index it directly
    if any(f["name"] == fw_name
           for f in existing_firewalls.get("items", [])):
        # NOTE(review): execute() presumably returns a plain dict, in which
        # case hasattr(response, "status") stays False and this loop only
        # ends through the 404 branch below - confirm.
        response = {}
        while not hasattr(response, "status"):
            try:
                response = firewalls.delete(project=project,
                                            firewall=fw_name).execute()
            except http.HttpError as e:
                if e.resp.status == 404:
                    # the rule is gone, nothing left to wait for
                    response = {}
                    break
                click.echo("Wait for firewall to be available for deletion")
                sleep(5)
                response = {}
        while hasattr(response, "status") and response.status < response.DONE:
            response = gclient.get_operation(None,
                                             None,
                                             None,
                                             name=response.selfLink)
            sleep(1)

    # create cluster
    cluster = container_v1.types.Cluster(
        name=name,
        initial_node_count=num_workers,
        node_config=container_v1.types.NodeConfig(
            machine_type=machine_type,
            disk_size_gb=disk_size,
            preemptible=preemptible,
            oauth_scopes=[
                "https://www.googleapis.com/auth/devstorage.full_control",
            ],
            **extraargs,
        ),
        addons_config=container_v1.types.AddonsConfig(
            http_load_balancing=container_v1.types.HttpLoadBalancing(
                disabled=True, ),
            horizontal_pod_autoscaling=container_v1.types.
            HorizontalPodAutoscaling(disabled=True, ),
            kubernetes_dashboard=container_v1.types.KubernetesDashboard(
                disabled=True, ),
            network_policy_config=container_v1.types.NetworkPolicyConfig(
                disabled=False, ),
        ),
        logging_service=None,
        monitoring_service=None,
    )
    response = gclient.create_cluster(None, None, cluster, parent=name_path)

    # wait for cluster to load
    # NOTE(review): name_path already ends with "/", so the names built
    # below contain a double slash - confirm the API accepts them.
    while response.status < response.DONE:
        response = gclient.get_operation(None,
                                         None,
                                         None,
                                         name=name_path + "/" + response.name)
        sleep(1)

    if response.status != response.DONE:
        raise ValueError("Cluster creation failed!")

    cluster = gclient.get_cluster(None,
                                  None,
                                  None,
                                  name=name_path + "/" + name)

    # point the default kubernetes client configuration at the new cluster,
    # authenticated with a freshly refreshed gcloud token
    auth_req = google.auth.transport.requests.Request()
    credentials.refresh(auth_req)
    configuration = client.Configuration()
    configuration.host = f"https://{cluster.endpoint}:443"
    configuration.verify_ssl = False
    configuration.api_key = {"authorization": "Bearer " + credentials.token}
    client.Configuration.set_default(configuration)

    if num_gpus > 0:
        with request.urlopen(GCLOUD_NVIDIA_DAEMONSET) as r:
            dep = yaml.safe_load(r)
            # add a selector built from the pod template labels
            dep["spec"]["selector"] = {
                "matchLabels": dep["spec"]["template"]["metadata"]["labels"]
            }
            dep = client.ApiClient()._ApiClient__deserialize(
                dep, "V1DaemonSet")
            k8s_client = client.AppsV1Api()
            k8s_client.create_namespaced_daemon_set("kube-system", body=dep)

    # create tiller service account
    client.CoreV1Api().create_namespaced_service_account(
        "kube-system",
        {
            "apiVersion": "v1",
            "kind": "ServiceAccount",
            "metadata": {
                "name": "tiller",
                "generateName": "tiller",
                "namespace": "kube-system",
            },
        },
    )

    # bind the tiller service account to the cluster-admin role
    client.RbacAuthorizationV1beta1Api().create_cluster_role_binding({
        "apiVersion":
        "rbac.authorization.k8s.io/v1beta1",
        "kind":
        "ClusterRoleBinding",
        "metadata": {
            "name": "tiller"
        },
        "roleRef": {
            "apiGroup": "rbac.authorization.k8s.io",
            "kind": "ClusterRole",
            "name": "cluster-admin",
        },
        "subjects": [{
            "kind": "ServiceAccount",
            "name": "tiller",
            "namespace": "kube-system"
        }],
    })

    # deploy tiller
    tiller_service = yaml.safe_load(TILLER_MANIFEST_SERVICE)
    tiller_dep = yaml.safe_load(TILLER_MANIFEST_DEPLOYMENT)
    client.CoreV1Api().create_namespaced_service("kube-system", tiller_service)
    client.ExtensionsV1beta1Api().create_namespaced_deployment(
        "kube-system", tiller_dep)

    sleep(1)

    pods = client.CoreV1Api().list_namespaced_pod(namespace="kube-system",
                                                  label_selector="app=helm")

    # NOTE(review): raises IndexError when no tiller pod is listed yet after
    # the one second pause above - confirm this cannot happen in practice.
    tiller_pod = pods.items[0]

    while True:
        # Wait for tiller
        resp = client.CoreV1Api().read_namespaced_pod(
            namespace="kube-system", name=tiller_pod.metadata.name)
        if resp.status.phase != "Pending":
            break
        sleep(5)

    # kubernetes python doesn't currently support port forward
    # https://github.com/kubernetes-client/python/issues/166
    ports = 44134

    # resp = stream(
    #     client.CoreV1Api().connect_get_namespaced_pod_portforward,
    #     name=tiller_pod.metadata.name,
    #     namespace=tiller_pod.metadata.namespace,
    #     ports=ports
    #     )

    with subprocess.Popen([
            "kubectl",
            "port-forward",
            "--namespace={}".format(tiller_pod.metadata.namespace),
            tiller_pod.metadata.name,
            "{0}:{0}".format(ports),
            "--server={}".format(configuration.host),
            "--token={}".format(credentials.token),
            "--insecure-skip-tls-verify=true",
    ]) as portforward:
        try:
            # give kubectl some time to establish the forward
            sleep(5)
            # install chart
            tiller = Tiller("localhost")
            chart = ChartBuilder({
                "name": "mlbench-helm",
                "source": {
                    "type": "git",
                    "location": "https://github.com/mlbench/mlbench-helm",
                },
            })

            values = {
                "limits": {
                    "workers": num_workers - 1,
                    "gpu": num_gpus,
                    "cpu": num_cpus
                }
            }

            if custom_value:
                # merge custom values with values
                for cv in custom_value:
                    key, v = cv.split("=", 1)

                    current = values
                    key_path = key.split(".")

                    # walk (and create) the nested dicts of the dotted key
                    for k in key_path[:-1]:
                        if k not in current:
                            current[k] = {}

                        current = current[k]

                    current[key_path[-1]] = v

            tiller.install_release(
                chart.get_helm_chart(),
                name=name,
                wait=True,
                dry_run=False,
                namespace="default",
                values=values,
            )
        finally:
            # Leaving the ``with`` block waits for the child process, and
            # ``kubectl port-forward`` never exits on its own, so it has to
            # be terminated even when the installation fails.
            portforward.terminate()

    # open port in firewall
    mlbench_client = ApiClient(in_cluster=False, load_config=False)
    firewall_body = {
        "name": fw_name,
        "direction": "INGRESS",
        "sourceRanges": "0.0.0.0/0",
        "allowed": [{
            "IPProtocol": "tcp",
            "ports": [mlbench_client.port]
        }],
    }

    firewalls.insert(project=project, body=firewall_body).execute()

    # remember the deployed cluster in the mlbench config
    config = get_config()

    config.set("general", "provider", "gke")

    config.set("gke", "cluster", cluster.endpoint)

    write_config(config)

    click.echo("MLBench successfully deployed")
Ejemplo n.º 26
0
def run(name, num_workers, gpu, num_cpus, light, dashboard_url):
    """Start a new run for a benchmark image.

    Interactively asks for the benchmark image and the backend. The answers
    are stored in ``last_run_inputs.pkl`` below ``~/.local/share/mlbench``
    and offered as prompt defaults on the next invocation. A run named
    ``<name>-<num_w>`` is then requested for every entry of ``num_workers``
    and the outcome of each request is echoed; reporting stops at the first
    response whose status code is above 201.

    Args:
        name: Prefix for the name of every created run.
        num_workers: Iterable of worker counts, one run is created per entry.
        gpu: Forwarded to ``create_run`` as ``gpu_enabled``.
        num_cpus: ``num_cpus - 1`` is forwarded to ``create_run`` as
            ``num_cpus``.
        light: Forwarded to ``create_run`` as ``light_target``.
        dashboard_url: Passed to ``ApiClient`` as ``url``.
    """
    current_run_inputs = {}

    # fall back to the expanded "~" when HOME is not set instead of failing
    # with a KeyError
    home_dir = os.environ.get("HOME", os.path.expanduser("~"))
    last_run_inputs_dir_location = os.path.join(home_dir, ".local", "share",
                                                "mlbench")
    Path(last_run_inputs_dir_location).mkdir(parents=True, exist_ok=True)

    last_run_inputs_file_location = os.path.join(last_run_inputs_dir_location,
                                                 "last_run_inputs.pkl")

    # NOTE: the cache is a pickle; it is written by this command into the
    # user's own home directory and must not be pointed at untrusted files.
    try:
        with open(last_run_inputs_file_location, "rb") as cache_file:
            last_run_inputs = pickle.load(cache_file)
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        # no cache yet, or a truncated/corrupt one: start without defaults
        last_run_inputs = {}

    if not isinstance(last_run_inputs, dict):
        last_run_inputs = {}

    images = list(MLBENCH_IMAGES.keys())

    text_prompt = "Benchmark: \n\n"

    text_prompt += "\n".join("[{}]\t{}".format(i, t)
                             for i, t in enumerate(images))
    text_prompt += "\n[{}]\tCustom Image".format(len(images))

    text_prompt += "\n\nSelection"

    selection = click.prompt(
        text_prompt,
        type=click.IntRange(0, len(images)),
        default=last_run_inputs.get("benchmark", 0),
    )
    current_run_inputs["benchmark"] = selection

    if selection == len(images):
        # run custom image
        image = click.prompt("Image",
                             type=str,
                             default=last_run_inputs.get("image", None))
        current_run_inputs["image"] = image
        image_command = click.prompt("Command",
                                     type=str,
                                     default=last_run_inputs.get(
                                         "image_command", None))
        current_run_inputs["image_command"] = image_command
        benchmark = {
            "custom_image_name": image,
            "custom_image_command": image_command,
        }
    else:
        benchmark = {"image": images[selection]}

    # Backend Prompt
    text_prompt = "Backend: \n\n"
    text_prompt += "\n".join("[{}]\t{}".format(i, t)
                             for i, t in enumerate(MLBENCH_BACKENDS))
    text_prompt += "\n[{}]\tCustom Backend".format(len(MLBENCH_BACKENDS))
    text_prompt += "\n\nSelection"

    selection = click.prompt(
        text_prompt,
        type=click.IntRange(0, len(MLBENCH_BACKENDS)),
        default=last_run_inputs.get("backend", 0),
    )
    current_run_inputs["backend"] = selection

    if selection == len(MLBENCH_BACKENDS):
        backend = click.prompt("Backend",
                               type=str,
                               default=last_run_inputs.get(
                                   "custom_backend", None))
        current_run_inputs["custom_backend"] = backend
        run_on_all = click.confirm(
            "Run command on all nodes (otherwise just first node)",
            default=last_run_inputs.get("run_on_all", None),
        )
        current_run_inputs["run_on_all"] = run_on_all
        benchmark["custom_backend"] = backend
        benchmark["run_all_nodes"] = run_on_all
    else:
        benchmark["backend"] = MLBENCH_BACKENDS[selection]

    # persist the answers; the context manager guarantees the file is
    # flushed and closed before the requests are sent
    with open(last_run_inputs_file_location, "wb") as cache_file:
        pickle.dump(current_run_inputs, cache_file)

    benchmark["gpu_enabled"] = gpu
    benchmark["light_target"] = light
    benchmark["num_cpus"] = num_cpus - 1

    loaded = setup_client_from_config()

    client = ApiClient(in_cluster=False,
                       url=dashboard_url,
                       load_config=not loaded)

    results = []

    # issue every request first, the responses are collected afterwards
    for num_w in num_workers:
        current_name = "{}-{}".format(name, num_w)

        res = client.create_run(current_name, num_w, **benchmark)
        results.append(res)

    for res in results:
        act_result = res.result()
        if act_result.status_code > 201:
            try:
                click.echo("Couldn't start run: {}".format(
                    act_result.json()["message"]))
            except json.JSONDecodeError:
                # the error body is not JSON: dump it raw, then report status
                print(str(act_result.text))
                click.echo("Couldn't start run: Status {} for request".format(
                    act_result.status_code))
            return

        click.echo("Run started with name {}".format(
            act_result.json()["name"]))
Ejemplo n.º 27
0
def test_instantiation_url():
    """A client built from a bare ``host:port`` url exposes the HTTP API endpoint."""
    api = ApiClient(url="1.1.1.1:12345")

    assert api is not None
    assert api.endpoint == "http://1.1.1.1:12345/api/"
Ejemplo n.º 28
0
def create_gcloud(
    num_workers,
    release,
    kubernetes_version,
    machine_type,
    disk_size,
    num_cpus,
    num_gpus,
    gpu_type,
    zone,
    project,
    preemptible,
    custom_value,
    chart_location,
):
    """Create a GKE cluster, deploy the MLBench chart and open its port.

    The cluster and the helm release are both named
    ``<release>-<num_workers>``. Cluster creation, kube client setup, the
    NVIDIA daemonset and the chart deployment are delegated to helper
    functions; this function wires them together, inserts the firewall rule
    for the dashboard port and registers the cluster.

    Args:
        num_workers: Node count of the cluster; the chart is deployed with
            ``num_workers - 1`` workers.
        release: Release name used as prefix of the cluster name.
        kubernetes_version: Forwarded to ``gcloud_create_cluster``.
        machine_type: Forwarded to ``gcloud_create_cluster``.
        disk_size: Forwarded to ``gcloud_create_cluster``.
        num_cpus: The chart is deployed with ``num_cpus - 1`` cpus.
        num_gpus: GPU count; a positive value also deploys the NVIDIA
            daemonset.
        gpu_type: Forwarded to ``gcloud_create_cluster``.
        zone: Location used in the cluster resource path.
        project: Google Cloud project; the default project of the
            credentials is used when falsy.
        preemptible: Forwarded to ``gcloud_create_cluster``.
        custom_value: Forwarded to ``deploy_chart``.
        chart_location: Directory of the chart to deploy, or ``None`` to use
            the mlbench-helm git repository.

    Raises:
        click.UsageError: If no gcloud credentials are found.
    """
    import google.auth
    from google.auth.exceptions import DefaultCredentialsError

    try:
        _credentials, default_project = google.auth.default()
    except DefaultCredentialsError:
        raise click.UsageError(
            "Couldn't find gcloud credentials. Install the gcloud"
            " sdk ( https://cloud.google.com/sdk/docs/quickstart-linux ) and "
            "run 'gcloud auth application-default login' to login and create "
            "your credentials.")

    project = project or default_project

    name = "{}-{}".format(release, num_workers)
    name_path = "projects/{}/locations/{}/clusters/".format(project, zone)

    click.echo("Creating Cluster")

    cluster_settings = {
        "name": name,
        "name_path": name_path,
        "num_workers": num_workers,
        "num_gpus": num_gpus,
        "gpu_type": gpu_type,
        "machine_type": machine_type,
        "disk_size": disk_size,
        "preemptible": preemptible,
        "kubernetes_version": kubernetes_version,
        "project": project,
    }
    gclient, fw_name, firewalls = gcloud_create_cluster(**cluster_settings)

    cluster_resource = os.path.join(name_path, name)
    cluster = gclient.get_cluster(None, None, None, name=cluster_resource)
    kube_context = setup_gcloud_kube_client(cluster.endpoint, cluster.name,
                                            cluster.zone, project)

    if num_gpus > 0:
        deploy_nvidia_daemonset()

    # without an explicit chart location the chart comes from the git repo
    if chart_location is None:
        chart_source = {
            "type": "git",
            "location": "https://github.com/mlbench/mlbench-helm",
        }
    else:
        chart_source = {
            "type": "directory",
            "location": chart_location,
        }
    custom_chart = {"name": "mlbench-helm", "source": chart_source}

    click.echo("Deploying chart")
    deploy_chart(
        num_workers=num_workers - 1,
        num_gpus=num_gpus,
        num_cpus=num_cpus - 1,
        release_name=name,
        custom_value=custom_value,
        custom_chart=custom_chart,
        kube_context=kube_context,
    )

    # open port in firewall
    mlbench_client = ApiClient(in_cluster=False, load_config=False)
    tcp_rule = {"IPProtocol": "tcp", "ports": [mlbench_client.port]}
    firewall_body = {
        "name": fw_name,
        "direction": "INGRESS",
        "sourceRanges": "0.0.0.0/0",
        "allowed": [tcp_rule],
    }
    insert_request = firewalls.insert(project=project, body=firewall_body)
    insert_request.execute()

    add_gcloud_cluster(name, cluster, project)

    click.echo("MLBench successfully deployed")
Ejemplo n.º 29
0
def create_aws(
    num_workers,
    release,
    kubernetes_version,
    machine_type,
    num_cpus,
    num_gpus,
    custom_value,
    ami_id,
    ssh_key,
):
    """Create an AWS cluster, deploy the MLBench chart and open its port.

    The cluster is named ``<release>-<num_workers>`` and its node group
    ``<release>-<num_workers>-node-group``. After the chart is deployed the
    dashboard port is opened for ``0.0.0.0/0`` on the ``NodeSecurityGroup``
    of the created stack and the cluster is registered.

    Args:
        num_workers: Node count handed to ``aws_create_cluster``; the chart
            is deployed with ``num_workers - 1`` workers.
        release: Release name used as prefix of the cluster name.
        kubernetes_version: Forwarded to ``aws_create_cluster``.
        machine_type: Forwarded to ``aws_create_cluster``.
        num_cpus: The chart is deployed with ``num_cpus - 1`` cpus.
        num_gpus: GPU count; a positive value also deploys the NVIDIA
            daemonset.
        custom_value: Forwarded to ``deploy_chart``.
        ami_id: Forwarded to ``aws_create_cluster``.
        ssh_key: Forwarded to ``aws_create_cluster``.

    Raises:
        click.UsageError: If the AWS credentials are missing or rejected.
    """
    sts = boto3.client("sts")
    try:
        # cheap authenticated call, only used to verify the credentials
        sts.get_caller_identity()
    except (
            botocore.exceptions.ClientError,
            # missing credentials are reported as NoCredentialsError, which
            # is not a ClientError
            botocore.exceptions.NoCredentialsError,
    ) as err:
        raise click.UsageError(
            "Couldn't find aws credentials. Install the aws"
            " sdk ( https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2-linux.html ) and "
            "run 'aws configure' to login and create "
            "your credentials.") from err
    name = "{}-{}".format(release, num_workers)
    node_group_name = name + "-node-group"

    kube_context, cf_client, stack_name, cluster = aws_create_cluster(
        name,
        node_group_name,
        num_workers,
        machine_type,
        ssh_key,
        ami_id,
        kubernetes_version,
    )
    kube_config.load_kube_config(context=kube_context)

    if num_gpus > 0:
        deploy_nvidia_daemonset_aws()

    deploy_chart(
        num_workers=num_workers - 1,
        num_gpus=num_gpus,
        num_cpus=num_cpus - 1,
        release_name=name,
        custom_value=custom_value,
        kube_context=kube_context,
    )

    # open port in firewall
    mlbench_client = ApiClient(in_cluster=False, load_config=False)
    mlbench_port = mlbench_client.port
    # look up the security group of the worker nodes in the created stack
    r = cf_client.describe_stack_resources(
        StackName=stack_name, LogicalResourceId="NodeSecurityGroup")
    sec_group_id = r["StackResources"][0]["PhysicalResourceId"]

    ec2 = boto3.client("ec2")
    ec2.authorize_security_group_ingress(
        GroupId=sec_group_id,
        IpPermissions=[
            {
                "FromPort": mlbench_port,
                "IpProtocol": "tcp",
                "IpRanges": [
                    {
                        "CidrIp": "0.0.0.0/0",
                    },
                ],
                "ToPort": mlbench_port,
            },
        ],
    )
    add_aws_cluster(name, cluster)
    click.echo("MLBench successfully deployed")