def test_instantiation_nodeport_internal(
        mocker, kubernetes_api_client_node_port_internal):
    mocker.patch('kubernetes.config.load_kube_config')

    client = ApiClient(in_cluster=False)

    assert client is not None
    assert client.endpoint == "http://1.1.1.1:12345/api/"

def run(rank, size, run_id):
    """ Distributed Synchronous SGD Example """
    torch.manual_seed(1234)
    train_set, bsz = partition_dataset()
    model = Net()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

    api_client = ApiClient(in_cluster=True, k8s_namespace='default',
                           label_selector='component=master,app=mlbench')

    num_batches = ceil(len(train_set.dataset) / float(bsz))

    for epoch in range(10):
        epoch_loss = 0.0
        for data, target in train_set:
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            epoch_loss += loss.data.item()
            loss.backward()
            average_gradients(model)
            optimizer.step()

        logging.debug('Rank %s, epoch %s: %s',
                      dist.get_rank(), epoch, epoch_loss / num_batches)

        api_client.post_metric(run_id,
                               "Rank {} loss".format(rank),
                               epoch_loss / num_batches)

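The loop above calls an average_gradients helper that is not shown in this snippet. A minimal sketch of what it typically looks like, assuming the standard torch.distributed all-reduce pattern (as in the PyTorch distributed tutorial) rather than mlbench's actual implementation:

import torch.distributed as dist

def average_gradients(model):
    """Sum each parameter's gradient across all workers, then average."""
    world_size = float(dist.get_world_size())
    for param in model.parameters():
        dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
        param.grad.data /= world_size
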
def log_metrics(run_id, rank, epoch, metric_name, value, tracker=None,
                time=None):
    """ Log metrics to the mlbench master/dashboard

    Args:
        run_id (str): The id of the current run
        rank (int): The rank of the current worker
        epoch (int): The current epoch
        metric_name (str): The name of the metric to save
        value (Any): The metric value
        tracker (:obj:`mlbench_core.utils.Tracker`): The value Tracker
        time (float): The current time (used for Tracker)
    """
    in_cluster = os.getenv("MLBENCH_IN_DOCKER") is None

    metric_name = "{} @ {}".format(metric_name, rank)

    if in_cluster:
        api = ApiClient()

        api.post_metric(
            run_id,
            metric_name,
            value,
            metadata="{{rank: {}, epoch:{}}}".format(rank, epoch),
        )

    if tracker and time:
        tracker.records.append(
            {
                "run_id": run_id,
                "name": metric_name,
                "cumulative": True,
                "date": str(datetime.datetime.now()),
                "time": str(time),
                "value": str(value),
                "metadata": "{{rank: {}, epoch:{}}}".format(rank, epoch),
            }
        )

class LogMetrics(object):
    in_cluster = os.getenv("KUBERNETES_SERVICE_HOST") is not None

    if in_cluster:
        api = ApiClient()

    @staticmethod
    def log(run_id, rank, epoch, metric_name, value, tracker=None, time=None):
        if not LogMetrics.in_cluster:
            return

        metric_name = "{} @ {}".format(metric_name, rank)

        LogMetrics.api.post_metric(
            run_id,
            metric_name,
            value,
            metadata="{{rank: {}, epoch:{}}}".format(rank, epoch),
        )

        if tracker and time:
            tracker.records.append(
                {
                    "run_id": run_id,
                    "name": metric_name,
                    "cumulative": True,
                    "date": str(datetime.datetime.now()),
                    "time": str(time),
                    "value": str(value),
                    "metadata": "{{rank: {}, epoch:{}}}".format(rank, epoch),
                }
            )

def test_instantiation_incluster(mocker, kubernetes_api_client_incluster):
    mocker.patch("kubernetes.config.load_incluster_config")

    client = ApiClient(in_cluster=True)

    assert client is not None
    assert client.endpoint == "http://1.1.1.1:80/api/"

def test_instantiation_loadbalancer(mocker, kubernetes_api_client_loadbalancer):
    mocker.patch("kubernetes.config.load_kube_config")

    client = ApiClient(in_cluster=False)

    assert client is not None
    assert client.endpoint == "http://1.1.1.1:12345/api/"

def test_instantiation_loadbalancer(mocker, kubernetes_api_client_loadbalancer):
    mocker.patch('kubernetes.config.load_kube_config')

    client = ApiClient(in_cluster=False, service_name="rel-mlbench-master")

    assert client is not None
    assert client.endpoint == "http://1.1.1.1:12345/api/"

class LogMetrics(object): """ Use to write metric values to the Dashboard API and to Trackers Caches API client for performance reasons """ in_cluster = os.getenv('KUBERNETES_SERVICE_HOST') is not None if in_cluster: api = ApiClient() @staticmethod def log(run_id, rank, epoch, metric_name, value, tracker=None, time=None): """ Logs metrics to the Metrics API Currently only logs inside of a cluster Args: run_id (str): The id of the run in the dashboard rank (int): Rank of the current worker node epoch (float): The current epoch (fractional) metric_name (str): The name of the metric value (float / int / str): The metric value to write tracker(:obj:`mlbench_core.utils.Tracker`): The value Tracker time (float): The current time (used for Tracker) """ if not LogMetrics.in_cluster: return metric_name = "{} @ {}".format(metric_name, rank) LogMetrics.api.post_metric(run_id, metric_name, value, metadata="{{rank: {}, epoch:{}}}".format( rank, epoch)) if tracker and time: tracker.records.append({ "run_id": run_id, "name": metric_name, "cumulative": True, "date": str(datetime.datetime.now()), "time": str(time), "value": str(value), "metadata": "{{rank: {}, epoch:{}}}".format(rank, epoch) })
def test_post_metrics(mocker, kubernetes_api_client_node_port):
    mocker.patch("kubernetes.config.load_kube_config")
    rg = mocker.patch("concurrent.futures.ThreadPoolExecutor")
    rg.return_value.submit.return_value.result.return_value.json.return_value = "a"

    client = ApiClient(in_cluster=False)

    result = client.post_metric("1", "loss", 10.0, cumulative=False)

    assert result is not None
    assert result.result().json() == "a"

def test_get_run(mocker, kubernetes_api_client_node_port):
    mocker.patch('kubernetes.config.load_kube_config')
    rg = mocker.patch('concurrent.futures.ThreadPoolExecutor')
    rg.return_value.submit.return_value.result.return_value.json.return_value = "a"

    client = ApiClient(in_cluster=False, service_name="rel-mlbench-master")

    result = client.get_run("1")

    assert result is not None
    assert result.result().json() == "a"

def test_get_worker_pods(mocker, kubernetes_api_client_node_port):
    mocker.patch("kubernetes.config.load_kube_config")
    rg = mocker.patch("concurrent.futures.ThreadPoolExecutor")
    rg.return_value.submit.return_value.result.return_value.json.return_value = "a"

    client = ApiClient(in_cluster=False)

    result = client.get_worker_pods()

    assert result is not None
    assert result.result().json() == "a"

def get_dashboard_url():
    """Returns the dashboard URL of the current cluster"""
    loaded = setup_client_from_config()

    if not loaded:
        click.echo("No Cluster config found")
        return

    client = ApiClient(in_cluster=False, load_config=False)

    click.echo(client.endpoint.replace('api/', ''))

def status(name, dashboard_url):
    """Get the status of a benchmark run, or all runs if no name is given"""
    loaded = setup_client_from_config()

    client = ApiClient(in_cluster=False, url=dashboard_url,
                       load_config=not loaded)

    ret = client.get_runs()
    runs = ret.result().json()

    if name is None:  # List all runs
        for run in runs:
            del run["job_id"]
            del run["job_metadata"]

        click.echo(tabulate(runs, headers="keys"))
        return

    try:
        run = next(r for r in runs if r["name"] == name)
    except StopIteration:
        click.echo("Run not found")
        return

    del run["job_id"]
    del run["job_metadata"]

    click.echo(tabulate([run], headers="keys"))

    loss = client.get_run_metrics(
        run["id"], metric_filter="val_global_loss @ 0", last_n=1)
    prec = client.get_run_metrics(
        run["id"], metric_filter="val_global_Prec@1 @ 0", last_n=1)

    loss = loss.result()
    prec = prec.result()

    if loss.status_code < 300 and "val_global_loss @ 0" in loss.json():
        val = loss.json()["val_global_loss @ 0"][0]
        click.echo("Current Global Loss: {0:.2f} ({1})".format(
            float(val["value"]), val["date"]))
    else:
        click.echo("No Validation Loss Data yet")

    if prec.status_code < 300 and "val_global_Prec@1 @ 0" in prec.json():
        val = prec.json()["val_global_Prec@1 @ 0"][0]
        click.echo("Current Global Precision: {0:.2f} ({1})".format(
            float(val["value"]), val["date"]))
    else:
        click.echo("No Validation Precision Data yet")

def test_get_pod_metrics(mocker, kubernetes_api_client_node_port):
    mocker.patch("kubernetes.config.load_kube_config")
    rg = mocker.patch("concurrent.futures.ThreadPoolExecutor")
    rg.return_value.submit.return_value.result.return_value.json.return_value = "a"

    client = ApiClient(in_cluster=False)

    result = client.get_pod_metrics(
        "rel-mlbench-worker-0", since=datetime.datetime.now(), summarize=100)

    assert result is not None
    assert result.result().json() == "a"

def test_create_run_official(mocker, kubernetes_api_client_node_port):
    mocker.patch('kubernetes.config.load_kube_config')
    rg = mocker.patch('concurrent.futures.ThreadPoolExecutor')
    rg.return_value.submit.return_value.result.return_value.json.return_value = "a"

    client = ApiClient(in_cluster=False)

    result = client.create_run(
        "test_run", 5, num_cpus=4.1, max_bandwidth=10000,
        image='PyTorch Cifar-10 ResNet-20 Open-MPI')

    assert result is not None
    assert result.result().json() == "a"

def download(name, output, dashboard_url):
    """Download the results of a benchmark run"""
    loaded = setup_client_from_config()

    client = ApiClient(in_cluster=False, url=dashboard_url,
                       load_config=not loaded)

    ret = client.get_runs()
    runs = ret.result().json()

    run = next(r for r in runs if r['name'] == name)

    ret = client.download_run_metrics(run['id'])

    with open(output, 'wb') as f:
        f.write(ret.result().content)

def test_create_run_custom(mocker, kubernetes_api_client_node_port):
    mocker.patch('kubernetes.config.load_kube_config')
    rg = mocker.patch('concurrent.futures.ThreadPoolExecutor')
    rg.return_value.submit.return_value.result.return_value.json.return_value = "a"

    client = ApiClient(in_cluster=False)

    result = client.create_run(
        "test_run", 5, num_cpus=4.1, max_bandwidth=10000,
        custom_image_name="localhost:5000/mlbench_worker:latest",
        custom_image_command="/.openmpi/bin/mpirun /app/main.py",
        custom_image_all_nodes=False)

    assert result is not None
    assert result.result().json() == "a"

def delete(name, dashboard_url):
    """Delete a benchmark run"""
    loaded = setup_client_from_config()

    client = ApiClient(in_cluster=False, url=dashboard_url,
                       load_config=not loaded)

    ret = client.get_runs()
    runs = ret.result().json()

    try:
        run = next(r for r in runs if r['name'] == name)
    except StopIteration:
        click.echo('Run not found')
        return

    del run['job_id']
    del run['job_metadata']

    client.delete_run(run['id'])

def status(name, dashboard_url):
    """Get the status of a benchmark run"""
    loaded = setup_client_from_config()

    client = ApiClient(in_cluster=False, url=dashboard_url,
                       load_config=not loaded)

    ret = client.get_runs()
    runs = ret.result().json()

    try:
        run = next(r for r in runs if r['name'] == name)
    except StopIteration:
        click.echo('Run not found')
        return

    del run['job_id']
    del run['job_metadata']

    click.echo(tabulate([run], headers='keys'))

    loss = client.get_run_metrics(
        run['id'], metric_filter='val_global_loss @ 0', last_n=1)
    prec = client.get_run_metrics(
        run['id'], metric_filter='val_global_Prec@1 @ 0', last_n=1)

    loss = loss.result()
    prec = prec.result()

    if loss.status_code < 300 and 'val_global_loss @ 0' in loss.json():
        val = loss.json()['val_global_loss @ 0'][0]
        click.echo("Current Global Loss: {0:.2f} ({1})".format(
            float(val['value']), val['date']))
    else:
        click.echo("No Validation Loss Data yet")

    if prec.status_code < 300 and 'val_global_Prec@1 @ 0' in prec.json():
        val = prec.json()['val_global_Prec@1 @ 0'][0]
        click.echo("Current Global Precision: {0:.2f} ({1})".format(
            float(val['value']), val['date']))
    else:
        click.echo("No Validation Precision Data yet")

def create_gcloud(
    num_workers,
    release,
    kubernetes_version,
    machine_type,
    disk_size,
    num_cpus,
    num_gpus,
    gpu_type,
    zone,
    project,
    preemptible,
    custom_value,
):
    from google.cloud import container_v1
    import google.auth
    from google.auth.exceptions import DefaultCredentialsError
    from googleapiclient import discovery, http

    try:
        credentials, default_project = google.auth.default()
    except DefaultCredentialsError:
        raise click.UsageError(
            "Couldn't find gcloud credentials. Install the gcloud"
            " sdk ( https://cloud.google.com/sdk/docs/quickstart-linux ) and "
            "run 'gcloud auth application-default login' to login and create "
            "your credentials.")

    assert num_workers >= 2, "Number of workers should be at least 2"

    if not project:
        project = default_project

    # create cluster
    gclient = container_v1.ClusterManagerClient()

    name = "{}-{}".format(release, num_workers)
    name_path = "projects/{}/locations/{}/".format(project, zone)

    extraargs = {}

    if num_gpus > 0:
        extraargs["accelerators"] = [
            container_v1.types.AcceleratorConfig(
                accelerator_count=num_gpus, accelerator_type=gpu_type)
        ]

    # delete existing firewall, if any
    firewalls = discovery.build(
        "compute", "v1", cache_discovery=False).firewalls()

    existing_firewalls = firewalls.list(project=project).execute()
    fw_name = "{}-firewall".format(name)

    if any(f["name"] == fw_name for f in existing_firewalls["items"]):
        response = {}
        while not hasattr(response, "status"):
            try:
                response = firewalls.delete(
                    project=project, firewall=fw_name).execute()
            except http.HttpError as e:
                if e.resp.status == 404:
                    response = {}
                    break

                click.echo("Wait for firewall to be available for deletion")
                sleep(5)
                response = {}

        while hasattr(response, "status") and response.status < response.DONE:
            response = gclient.get_operation(
                None, None, None, name=response.selfLink)
            sleep(1)

    # create cluster
    cluster = container_v1.types.Cluster(
        name=name,
        initial_node_count=num_workers,
        node_config=container_v1.types.NodeConfig(
            machine_type=machine_type,
            disk_size_gb=disk_size,
            preemptible=preemptible,
            oauth_scopes=[
                "https://www.googleapis.com/auth/devstorage.full_control",
            ],
            **extraargs,
        ),
        addons_config=container_v1.types.AddonsConfig(
            http_load_balancing=container_v1.types.HttpLoadBalancing(
                disabled=True,
            ),
            horizontal_pod_autoscaling=container_v1.types.HorizontalPodAutoscaling(
                disabled=True,
            ),
            kubernetes_dashboard=container_v1.types.KubernetesDashboard(
                disabled=True,
            ),
            network_policy_config=container_v1.types.NetworkPolicyConfig(
                disabled=False,
            ),
        ),
        logging_service=None,
        monitoring_service=None,
    )
    response = gclient.create_cluster(None, None, cluster, parent=name_path)

    # wait for cluster to load
    while response.status < response.DONE:
        response = gclient.get_operation(
            None, None, None, name=name_path + "/" + response.name)
        sleep(1)

    if response.status != response.DONE:
        raise ValueError("Cluster creation failed!")

    cluster = gclient.get_cluster(None, None, None, name=name_path + "/" + name)

    auth_req = google.auth.transport.requests.Request()
    credentials.refresh(auth_req)
    configuration = client.Configuration()
    configuration.host = f"https://{cluster.endpoint}:443"
    configuration.verify_ssl = False
    configuration.api_key = {"authorization": "Bearer " + credentials.token}
    client.Configuration.set_default(configuration)

    if num_gpus > 0:
        with request.urlopen(GCLOUD_NVIDIA_DAEMONSET) as r:
            dep = yaml.safe_load(r)
            dep["spec"]["selector"] = {
                "matchLabels": dep["spec"]["template"]["metadata"]["labels"]
            }
            dep = client.ApiClient()._ApiClient__deserialize(dep, "V1DaemonSet")
            k8s_client = client.AppsV1Api()
            k8s_client.create_namespaced_daemon_set("kube-system", body=dep)

    # create tiller service account
    client.CoreV1Api().create_namespaced_service_account(
        "kube-system",
        {
            "apiVersion": "v1",
            "kind": "ServiceAccount",
            "metadata": {
                "name": "tiller",
                "generateName": "tiller",
                "namespace": "kube-system",
            },
        },
    )

    client.RbacAuthorizationV1beta1Api().create_cluster_role_binding({
        "apiVersion": "rbac.authorization.k8s.io/v1beta1",
        "kind": "ClusterRoleBinding",
        "metadata": {
            "name": "tiller"
        },
        "roleRef": {
            "apiGroup": "rbac.authorization.k8s.io",
            "kind": "ClusterRole",
            "name": "cluster-admin",
        },
        "subjects": [{
            "kind": "ServiceAccount",
            "name": "tiller",
            "namespace": "kube-system"
        }],
    })

    # deploy tiller
    tiller_service = yaml.safe_load(TILLER_MANIFEST_SERVICE)
    tiller_dep = yaml.safe_load(TILLER_MANIFEST_DEPLOYMENT)
    client.CoreV1Api().create_namespaced_service("kube-system", tiller_service)
    client.ExtensionsV1beta1Api().create_namespaced_deployment(
        "kube-system", tiller_dep)

    sleep(1)

    pods = client.CoreV1Api().list_namespaced_pod(
        namespace="kube-system", label_selector="app=helm")

    tiller_pod = pods.items[0]

    while True:
        # Wait for tiller
        resp = client.CoreV1Api().read_namespaced_pod(
            namespace="kube-system", name=tiller_pod.metadata.name)

        if resp.status.phase != "Pending":
            break
        sleep(5)

    # kubernetes python doesn't currently support port forward
    # https://github.com/kubernetes-client/python/issues/166
    ports = 44134

    # resp = stream(
    #     client.CoreV1Api().connect_get_namespaced_pod_portforward,
    #     name=tiller_pod.metadata.name,
    #     namespace=tiller_pod.metadata.namespace,
    #     ports=ports
    # )

    with subprocess.Popen([
            "kubectl",
            "port-forward",
            "--namespace={}".format(tiller_pod.metadata.namespace),
            tiller_pod.metadata.name,
            "{0}:{0}".format(ports),
            "--server={}".format(configuration.host),
            "--token={}".format(credentials.token),
            "--insecure-skip-tls-verify=true",
    ]) as portforward:
        sleep(5)

        # install chart
        tiller = Tiller("localhost")
        chart = ChartBuilder({
            "name": "mlbench-helm",
            "source": {
                "type": "git",
                "location": "https://github.com/mlbench/mlbench-helm",
            },
        })

        values = {
            "limits": {
                "workers": num_workers - 1,
                "gpu": num_gpus,
                "cpu": num_cpus
            }
        }

        if custom_value:
            # merge custom values with values
            for cv in custom_value:
                key, v = cv.split("=", 1)

                current = values
                key_path = key.split(".")

                for k in key_path[:-1]:
                    if k not in current:
                        current[k] = {}

                    current = current[k]

                current[key_path[-1]] = v

        tiller.install_release(
            chart.get_helm_chart(),
            name=name,
            wait=True,
            dry_run=False,
            namespace="default",
            values=values,
        )

        portforward.terminate()

    # open port in firewall
    mlbench_client = ApiClient(in_cluster=False, load_config=False)
    firewall_body = {
        "name": fw_name,
        "direction": "INGRESS",
        "sourceRanges": "0.0.0.0/0",
        "allowed": [{
            "IPProtocol": "tcp",
            "ports": [mlbench_client.port]
        }],
    }

    firewalls.insert(project=project, body=firewall_body).execute()

    config = get_config()

    config.set("general", "provider", "gke")
    config.set("gke", "cluster", cluster.endpoint)

    write_config(config)

    click.echo("MLBench successfully deployed")

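For reference, the dotted-key merge in the custom_value handling above expands entries like "limits.bandwidth=100" into nested dicts. A small self-contained illustration with a hypothetical input (note that merged leaf values stay strings):

values = {"limits": {"workers": 1, "gpu": 0, "cpu": 1}}

for cv in ["limits.bandwidth=100"]:  # hypothetical --custom-value input
    key, v = cv.split("=", 1)
    current = values
    key_path = key.split(".")
    # walk/create intermediate dicts, then assign the leaf
    for k in key_path[:-1]:
        if k not in current:
            current[k] = {}
        current = current[k]
    current[key_path[-1]] = v

print(values)
# {'limits': {'workers': 1, 'gpu': 0, 'cpu': 1, 'bandwidth': '100'}}
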
def create_gcloud(
    num_workers,
    release,
    kubernetes_version,
    machine_type,
    disk_size,
    num_cpus,
    num_gpus,
    gpu_type,
    zone,
    project,
    preemptible,
    custom_value,
    chart_location,
):
    import google.auth
    from google.auth.exceptions import DefaultCredentialsError

    try:
        credentials, default_project = google.auth.default()
    except DefaultCredentialsError:
        raise click.UsageError(
            "Couldn't find gcloud credentials. Install the gcloud"
            " sdk ( https://cloud.google.com/sdk/docs/quickstart-linux ) and "
            "run 'gcloud auth application-default login' to login and create "
            "your credentials.")

    if not project:
        project = default_project

    name = "{}-{}".format(release, num_workers)
    name_path = "projects/{}/locations/{}/clusters/".format(project, zone)

    click.echo("Creating Cluster")
    gclient, fw_name, firewalls = gcloud_create_cluster(
        name=name,
        name_path=name_path,
        num_workers=num_workers,
        num_gpus=num_gpus,
        gpu_type=gpu_type,
        machine_type=machine_type,
        disk_size=disk_size,
        preemptible=preemptible,
        kubernetes_version=kubernetes_version,
        project=project,
    )

    cluster = gclient.get_cluster(
        None, None, None, name=os.path.join(name_path, name))

    kube_context = setup_gcloud_kube_client(
        cluster.endpoint, cluster.name, cluster.zone, project)

    if num_gpus > 0:
        deploy_nvidia_daemonset()

    custom_chart = {
        "name": "mlbench-helm",
        "source": {
            "type": "git" if chart_location is None else "directory",
            "location": "https://github.com/mlbench/mlbench-helm"
            if chart_location is None else chart_location,
        },
    }

    click.echo("Deploying chart")
    deploy_chart(
        num_workers=num_workers - 1,
        num_gpus=num_gpus,
        num_cpus=num_cpus - 1,
        release_name=name,
        custom_value=custom_value,
        custom_chart=custom_chart,
        kube_context=kube_context,
    )

    # open port in firewall
    mlbench_client = ApiClient(in_cluster=False, load_config=False)
    firewall_body = {
        "name": fw_name,
        "direction": "INGRESS",
        "sourceRanges": "0.0.0.0/0",
        "allowed": [{
            "IPProtocol": "tcp",
            "ports": [mlbench_client.port]
        }],
    }

    firewalls.insert(project=project, body=firewall_body).execute()

    add_gcloud_cluster(name, cluster, project)

    click.echo("MLBench successfully deployed")

def run(name, num_workers, gpu, light, dashboard_url):
    """Start a new run for a benchmark image"""
    images = list(MLBENCH_IMAGES.keys())

    text_prompt = 'Benchmark: \n\n'
    text_prompt += '\n'.join(
        '[{}]\t{}'.format(i, t) for i, t in enumerate(images)
    )
    text_prompt += '\n[{}]\tCustom Image'.format(len(images))

    text_prompt += '\n\nSelection'

    selection = click.prompt(
        text_prompt,
        type=click.IntRange(0, len(images)),
        default=0
    )

    if selection == len(images):
        # run custom image
        image = click.prompt('Image:', type=str)
        image_command = click.prompt('Command:', type=str)
        run_on_all = click.confirm(
            'Run command on all nodes (otherwise just first node):')

        benchmark = {
            'custom_image_name': image,
            'custom_image_command': image_command,
            'custom_image_all_nodes': run_on_all
        }
    else:
        benchmark = {'image': images[selection]}

    benchmark['gpu_enabled'] = gpu
    benchmark['light_target'] = light

    loaded = setup_client_from_config()

    client = ApiClient(in_cluster=False, url=dashboard_url,
                       load_config=not loaded)

    results = []

    for num_w in num_workers:
        current_name = '{}-{}'.format(name, num_w)

        res = client.create_run(current_name, num_w, **benchmark)
        results.append(res)

    for res in results:
        act_result = res.result()

        if act_result.status_code > 201:
            try:
                click.echo('Couldn\'t start run: {}'.format(
                    act_result.json()['message']))
            except json.JSONDecodeError:
                print(str(act_result.text))
                click.echo(
                    'Couldn\'t start run: Status {} for request'.format(
                        act_result.status_code))
            return

        click.echo('Run started with name {}'.format(
            act_result.json()['name']))

def test_instantiation(mocker, kubernetes_api_client_node_port):
    mocker.patch("kubernetes.config.load_kube_config")

    with ApiClient(in_cluster=False) as client:
        assert client is not None
        assert client.endpoint == "http://1.1.1.1:12345/api/"

def create_aws(
    num_workers,
    release,
    kubernetes_version,
    machine_type,
    num_cpus,
    num_gpus,
    custom_value,
    ami_id,
    ssh_key,
):
    sts = boto3.client("sts")
    try:
        sts.get_caller_identity()
    except botocore.exceptions.ClientError:
        raise click.UsageError(
            "Couldn't find aws credentials. Install the aws"
            " sdk ( https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2-linux.html ) and "
            "run 'aws configure' to login and create "
            "your credentials.")

    name = "{}-{}".format(release, num_workers)
    nodeGroupName = name + "-node-group"

    kube_context, cf_client, stackName, cluster = aws_create_cluster(
        name,
        nodeGroupName,
        num_workers,
        machine_type,
        ssh_key,
        ami_id,
        kubernetes_version,
    )

    kube_config.load_kube_config(context=kube_context)

    if num_gpus > 0:
        deploy_nvidia_daemonset_aws()

    deploy_chart(
        num_workers=num_workers - 1,
        num_gpus=num_gpus,
        num_cpus=num_cpus - 1,
        release_name=name,
        custom_value=custom_value,
        kube_context=kube_context,
    )

    # open port in firewall
    mlbench_client = ApiClient(in_cluster=False, load_config=False)
    mlbench_port = mlbench_client.port

    r = cf_client.describe_stack_resources(
        StackName=stackName, LogicalResourceId="NodeSecurityGroup")
    secGroupId = r["StackResources"][0]["PhysicalResourceId"]

    ec2 = boto3.client("ec2")
    ec2.authorize_security_group_ingress(
        GroupId=secGroupId,
        IpPermissions=[
            {
                "FromPort": mlbench_port,
                "IpProtocol": "tcp",
                "IpRanges": [
                    {
                        "CidrIp": "0.0.0.0/0",
                    },
                ],
                "ToPort": mlbench_port,
            },
        ],
    )

    add_aws_cluster(name, cluster)

    click.echo("MLBench successfully deployed")

def run(name, num_workers, gpu, num_cpus, light, dashboard_url):
    """Start a new run for a benchmark image"""
    current_run_inputs = {}

    last_run_inputs_dir_location = os.path.join(
        os.environ["HOME"], ".local", "share", "mlbench")
    Path(last_run_inputs_dir_location).mkdir(parents=True, exist_ok=True)

    last_run_inputs_file_location = os.path.join(
        last_run_inputs_dir_location, "last_run_inputs.pkl")

    try:
        last_run_inputs = pickle.load(open(last_run_inputs_file_location, "rb"))
    except FileNotFoundError:
        last_run_inputs = {}

    images = list(MLBENCH_IMAGES.keys())

    text_prompt = "Benchmark: \n\n"
    text_prompt += "\n".join(
        "[{}]\t{}".format(i, t) for i, t in enumerate(images))
    text_prompt += "\n[{}]\tCustom Image".format(len(images))

    text_prompt += "\n\nSelection"

    selection = click.prompt(
        text_prompt,
        type=click.IntRange(0, len(images)),
        default=last_run_inputs.get("benchmark", 0),
    )
    current_run_inputs["benchmark"] = selection

    if selection == len(images):
        # run custom image
        image = click.prompt(
            "Image", type=str, default=last_run_inputs.get("image", None))
        current_run_inputs["image"] = image

        image_command = click.prompt(
            "Command", type=str,
            default=last_run_inputs.get("image_command", None))
        current_run_inputs["image_command"] = image_command

        benchmark = {
            "custom_image_name": image,
            "custom_image_command": image_command,
        }
    else:
        benchmark = {"image": images[selection]}

    # Backend Prompt
    text_prompt = "Backend: \n\n"
    text_prompt += "\n".join(
        "[{}]\t{}".format(i, t) for i, t in enumerate(MLBENCH_BACKENDS))
    text_prompt += "\n[{}]\tCustom Backend".format(len(MLBENCH_BACKENDS))
    text_prompt += "\n\nSelection"

    selection = click.prompt(
        text_prompt,
        type=click.IntRange(0, len(MLBENCH_BACKENDS)),
        default=last_run_inputs.get("backend", 0),
    )
    current_run_inputs["backend"] = selection

    if selection == len(MLBENCH_BACKENDS):
        backend = click.prompt(
            "Backend", type=str,
            default=last_run_inputs.get("custom_backend", None))
        current_run_inputs["custom_backend"] = backend

        run_on_all = click.confirm(
            "Run command on all nodes (otherwise just first node)",
            default=last_run_inputs.get("run_on_all", None),
        )
        current_run_inputs["run_on_all"] = run_on_all

        benchmark["custom_backend"] = backend
        benchmark["run_all_nodes"] = run_on_all
    else:
        benchmark["backend"] = MLBENCH_BACKENDS[selection]

    pickle.dump(current_run_inputs, open(last_run_inputs_file_location, "wb"))

    benchmark["gpu_enabled"] = gpu
    benchmark["light_target"] = light
    benchmark["num_cpus"] = num_cpus - 1

    loaded = setup_client_from_config()

    client = ApiClient(in_cluster=False, url=dashboard_url,
                       load_config=not loaded)

    results = []

    for num_w in num_workers:
        current_name = "{}-{}".format(name, num_w)

        res = client.create_run(current_name, num_w, **benchmark)
        results.append(res)

    for res in results:
        act_result = res.result()

        if act_result.status_code > 201:
            try:
                click.echo("Couldn't start run: {}".format(
                    act_result.json()["message"]))
            except json.JSONDecodeError:
                print(str(act_result.text))
                click.echo("Couldn't start run: Status {} for request".format(
                    act_result.status_code))
            return

        click.echo("Run started with name {}".format(
            act_result.json()["name"]))

def create_gcloud(num_workers, release, kubernetes_version, machine_type,
                  disk_size, num_cpus, num_gpus, gpu_type, zone, project,
                  preemptible, custom_value):
    from google.cloud import container_v1
    import google.auth
    from googleapiclient import discovery, http

    credentials, default_project = google.auth.default()

    if not project:
        project = default_project

    # create cluster
    gclient = container_v1.ClusterManagerClient()

    name = '{}-{}'.format(release, num_workers)
    name_path = 'projects/{}/locations/{}/'.format(project, zone)

    extraargs = {}

    if num_gpus > 0:
        extraargs['accelerators'] = [container_v1.types.AcceleratorConfig(
            accelerator_count=num_gpus, accelerator_type=gpu_type)]

    # delete existing firewall, if any
    firewalls = discovery.build(
        'compute', 'v1', cache_discovery=False).firewalls()

    existing_firewalls = firewalls.list(project=project).execute()
    fw_name = '{}-firewall'.format(name)

    if any(f['name'] == fw_name for f in existing_firewalls['items']):
        response = {}
        while not hasattr(response, 'status'):
            try:
                response = firewalls.delete(
                    project=project, firewall=fw_name).execute()
            except http.HttpError as e:
                if e.resp.status == 404:
                    response = {}
                    break

                click.echo("Wait for firewall to be available for deletion")
                sleep(5)
                response = {}

        while hasattr(response, 'status') and response.status < response.DONE:
            response = gclient.get_operation(
                None, None, None, name=response.selfLink)
            sleep(1)

    # create cluster
    cluster = container_v1.types.Cluster(
        name=name,
        initial_node_count=num_workers,
        node_config=container_v1.types.NodeConfig(
            machine_type=machine_type,
            disk_size_gb=disk_size,
            preemptible=preemptible,
            oauth_scopes=[
                'https://www.googleapis.com/auth/devstorage.full_control',
            ],
            **extraargs
        ),
        addons_config=container_v1.types.AddonsConfig(
            http_load_balancing=container_v1.types.HttpLoadBalancing(
                disabled=True,
            ),
            horizontal_pod_autoscaling=
            container_v1.types.HorizontalPodAutoscaling(
                disabled=True,
            ),
            kubernetes_dashboard=container_v1.types.KubernetesDashboard(
                disabled=True,
            ),
            network_policy_config=container_v1.types.NetworkPolicyConfig(
                disabled=False,
            ),
        ),
        logging_service=None,
        monitoring_service=None
    )
    response = gclient.create_cluster(None, None, cluster, parent=name_path)

    # wait for cluster to load
    while response.status < response.DONE:
        response = gclient.get_operation(
            None, None, None, name=name_path + '/' + response.name)
        sleep(1)

    if response.status != response.DONE:
        raise ValueError('Cluster creation failed!')

    cluster = gclient.get_cluster(
        None, None, None, name=name_path + '/' + name)

    auth_req = google.auth.transport.requests.Request()
    credentials.refresh(auth_req)
    configuration = client.Configuration()
    configuration.host = f'https://{cluster.endpoint}:443'
    configuration.verify_ssl = False
    configuration.api_key = {'authorization': 'Bearer ' + credentials.token}
    client.Configuration.set_default(configuration)

    if num_gpus > 0:
        with request.urlopen(GCLOUD_NVIDIA_DAEMONSET) as r:
            dep = yaml.safe_load(r)
            dep['spec']['selector'] = {
                'matchLabels': dep['spec']['template']['metadata']['labels']
            }
            dep = client.ApiClient()._ApiClient__deserialize(dep, 'V1DaemonSet')
            k8s_client = client.AppsV1Api()
            k8s_client.create_namespaced_daemon_set('kube-system', body=dep)

    # create tiller service account
    client.CoreV1Api().create_namespaced_service_account(
        'kube-system',
        {
            'apiVersion': 'v1',
            'kind': 'ServiceAccount',
            'metadata': {
                'name': 'tiller',
                'generateName': 'tiller',
                'namespace': 'kube-system',
            },
        })

    client.RbacAuthorizationV1beta1Api().create_cluster_role_binding(
        {
            'apiVersion': 'rbac.authorization.k8s.io/v1beta1',
            'kind': 'ClusterRoleBinding',
            'metadata': {
                'name': 'tiller'
            },
            'roleRef': {
                'apiGroup': 'rbac.authorization.k8s.io',
                'kind': 'ClusterRole',
                'name': 'cluster-admin'
            },
            'subjects': [
                {
                    'kind': 'ServiceAccount',
                    'name': 'tiller',
                    'namespace': 'kube-system'
                }
            ]
        })

    # deploy tiller
    tiller_service = yaml.safe_load(TILLER_MANIFEST_SERVICE)
    tiller_dep = yaml.safe_load(TILLER_MANIFEST_DEPLOYMENT)
    client.CoreV1Api().create_namespaced_service(
        'kube-system', tiller_service)
    client.ExtensionsV1beta1Api().create_namespaced_deployment(
        'kube-system', tiller_dep)

    sleep(1)

    pods = client.CoreV1Api().list_namespaced_pod(
        namespace='kube-system', label_selector='app=helm'
    )

    tiller_pod = pods.items[0]

    while True:
        # Wait for tiller
        resp = client.CoreV1Api().read_namespaced_pod(
            namespace='kube-system', name=tiller_pod.metadata.name
        )

        if resp.status.phase != 'Pending':
            break
        sleep(5)

    # kubernetes python doesn't currently support port forward
    # https://github.com/kubernetes-client/python/issues/166
    ports = 44134

    # resp = stream(
    #     client.CoreV1Api().connect_get_namespaced_pod_portforward,
    #     name=tiller_pod.metadata.name,
    #     namespace=tiller_pod.metadata.namespace,
    #     ports=ports
    # )

    with subprocess.Popen([
            'kubectl',
            'port-forward',
            '--namespace={}'.format(tiller_pod.metadata.namespace),
            tiller_pod.metadata.name,
            '{0}:{0}'.format(ports),
            '--server={}'.format(configuration.host),
            '--token={}'.format(credentials.token),
            '--insecure-skip-tls-verify=true']) as portforward:
        sleep(5)

        # install chart
        tiller = Tiller('localhost')
        chart = ChartBuilder(
            {
                'name': 'mlbench-helm',
                'source': {
                    'type': 'git',
                    'location': 'https://github.com/mlbench/mlbench-helm'
                }})

        values = {
            'limits': {
                'workers': num_workers - 1,
                'gpu': num_gpus,
                'cpu': num_cpus
            }
        }

        if custom_value:
            # merge custom values with values
            for cv in custom_value:
                key, v = cv.split("=", 1)

                current = values
                key_path = key.split(".")

                for k in key_path[:-1]:
                    if k not in current:
                        current[k] = {}

                    current = current[k]

                current[key_path[-1]] = v

        tiller.install_release(
            chart.get_helm_chart(),
            name=name,
            wait=True,
            dry_run=False,
            namespace='default',
            values=values)

        portforward.terminate()

    # open port in firewall
    mlbench_client = ApiClient(in_cluster=False, load_config=False)
    firewall_body = {
        "name": fw_name,
        "direction": "INGRESS",
        "sourceRanges": "0.0.0.0/0",
        "allowed": [
            {"IPProtocol": "tcp", "ports": [mlbench_client.port]}
        ]
    }

    firewalls.insert(project=project, body=firewall_body).execute()

    config = get_config()

    config.set('general', 'provider', 'gke')
    config.set('gke', 'cluster', cluster.endpoint)

    write_config(config)

    click.echo("MLBench successfully deployed")

def run(name, num_workers, gpu, light, dashboard_url):
    """Start a new run for a benchmark image"""
    images = list(MLBENCH_IMAGES.keys())

    text_prompt = "Benchmark: \n\n"
    text_prompt += "\n".join(
        "[{}]\t{}".format(i, t) for i, t in enumerate(images))
    text_prompt += "\n[{}]\tCustom Image".format(len(images))

    text_prompt += "\n\nSelection"

    selection = click.prompt(
        text_prompt,
        type=click.IntRange(0, len(images)),
        default=0)

    if selection == len(images):
        # run custom image
        image = click.prompt("Image", type=str)
        image_command = click.prompt("Command", type=str)
        run_on_all = click.confirm(
            "Run command on all nodes (otherwise just first node)")

        benchmark = {
            "custom_image_name": image,
            "custom_image_command": image_command,
            "custom_image_all_nodes": run_on_all,
        }
    else:
        benchmark = {"image": images[selection]}

    benchmark["gpu_enabled"] = gpu
    benchmark["light_target"] = light

    loaded = setup_client_from_config()

    client = ApiClient(in_cluster=False, url=dashboard_url,
                       load_config=not loaded)

    results = []

    for num_w in num_workers:
        current_name = "{}-{}".format(name, num_w)

        res = client.create_run(current_name, num_w, **benchmark)
        results.append(res)

    for res in results:
        act_result = res.result()

        if act_result.status_code > 201:
            try:
                click.echo("Couldn't start run: {}".format(
                    act_result.json()["message"]))
            except json.JSONDecodeError:
                print(str(act_result.text))
                click.echo("Couldn't start run: Status {} for request".format(
                    act_result.status_code))
            return

        click.echo("Run started with name {}".format(
            act_result.json()["name"]))

def test_instantiation_url():
    client = ApiClient(url="1.1.1.1:12345")

    assert client is not None
    assert client.endpoint == "http://1.1.1.1:12345/api/"

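The test implies that ApiClient normalizes a bare host:port into a full endpoint URL. A minimal sketch of that normalization, written as a hypothetical standalone helper that mirrors what the tests assert, not the library's actual implementation:

def normalize_endpoint(url):
    """Hypothetical helper: prepend a scheme and append /api/ as the tests imply."""
    if not url.startswith("http://") and not url.startswith("https://"):
        url = "http://" + url
    if not url.endswith("/"):
        url += "/"
    return url + "api/"

assert normalize_endpoint("1.1.1.1:12345") == "http://1.1.1.1:12345/api/"
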
def charts(folder, filter, dashboard_url):
    """Chart the results of benchmark runs

    Save generated charts in FOLDER
    """
    folder = Path(folder)
    if not folder.exists():
        folder.mkdir(parents=True)

    loaded = setup_client_from_config()

    client = ApiClient(in_cluster=False, url=dashboard_url,
                       load_config=not loaded)

    ret = client.get_runs()
    runs = ret.result().json()
    runs = [r for r in runs if r["state"] == "finished"]

    if filter:
        runs = [r for r in runs if filter in r["name"]]

    options = {i: r for i, r in enumerate(runs, start=0)}

    if len(options) < 2:
        click.echo("At least two finished runs are needed to create a summary")
        return

    options["all"] = {"name": "*all runs*"}

    prompt = 'Select the runs to generate a summary for (e.g. "0 1 2"): \n\t{}'.format(
        "\n\t".join("{} [{}]".format(r["name"], i) for i, r in options.items()))

    choice = click.prompt(
        prompt,
        default=0,
        type=click.Choice([options.keys()]),
        show_choices=False,
        value_proc=lambda x: runs
        if "all" in x else [options[int(i)] for i in x.split(" ")],
    )

    if len(choice) < 2:
        click.echo("At least two finished runs are needed to create a summary")
        return

    results = []

    def _get_metric(name, run):
        """Gets a metric from the dashboard."""
        name = "global_cum_{} @ 0".format(name)
        return float(
            client.get_run_metrics(
                run["id"], metric_filter=name, last_n=1
            ).result().json()[name][0]["value"])

    for run in choice:
        agg = _get_metric("agg", run)
        backprop = _get_metric("backprop", run)
        batch_load = _get_metric("batch_load", run)
        comp_loss = _get_metric("comp_loss", run)
        comp_metrics = _get_metric("comp_metrics", run)
        fwd_pass = _get_metric("fwd_pass", run)
        opt_step = _get_metric("opt_step", run)

        compute = (fwd_pass + comp_loss + backprop + opt_step +
                   (agg if run["num_workers"] == 1 else 0))
        communicate = agg if run["num_workers"] != 1 else 0

        results.append((
            run["name"],
            compute,
            communicate,
            comp_metrics,
            batch_load,
            str(run["num_workers"]),
        ))

    results = sorted(results, key=lambda x: x[5])

    names, compute, communicate, metrics, batch_load, num_workers = zip(
        *results)

    width = 0.35

    fig, ax = plt.subplots()
    ax.bar(num_workers, compute, width, label="Compute")
    ax.bar(num_workers, communicate, width, label="Communication")
    ax.set_ylabel("Time (s)")
    ax.set_title("Total time by number of workers")
    ax.legend()
    plt.savefig(folder / "total_time.png", dpi=150)

    fig, ax = plt.subplots()
    combined = [c + r for _, c, r, _, _, _ in results]
    speedup = [combined[0] / c for c in combined]
    ax.bar(num_workers, speedup, width)
    ax.set_ylabel("Speedup factor")
    ax.set_title("Speedup")
    plt.savefig(folder / "speedup.png", dpi=150)

    fig, ax = plt.subplots()
    ax.bar(num_workers, compute, width, label="Compute")
    ax.bar(num_workers, communicate, width, label="Communication")
    ax.bar(num_workers, metrics, width, label="Metrics Computation")
    ax.bar(num_workers, batch_load, width, label="Batch Loading")
    ax.set_ylabel("Time (s)")
    ax.set_title("Total time by number of workers")
    ax.legend()
    plt.savefig(folder / "time_for_all_phases.png", dpi=150)

    click.echo("Summary created in {}".format(folder))