from time import sleep

from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from dask_kubernetes import KubeCluster

# Note: default_worker_spec_fname and default_worker_spec are assumed to be
# defined elsewhere in this module.


def launch_dask(n_gpus, min_gpus, k8s, adapt, worker_spec):
    if k8s:
        if worker_spec is None:
            worker_spec = default_worker_spec_fname
            print(f'Creating a default K8S worker spec at {worker_spec}')
            with open(worker_spec, "w") as yaml_file:
                yaml_file.write(default_worker_spec)
        cluster = KubeCluster.from_yaml(worker_spec)
        if adapt:
            cluster.adapt(minimum=min_gpus, maximum=n_gpus)
            print(
                f'Launching Adaptive K8S Dask cluster with [{min_gpus}, {n_gpus}] workers'
            )
        else:
            cluster.scale(n_gpus)
            print(f'Launching K8S Dask cluster with {n_gpus} workers')
        sleep(10)
    else:
        cluster = LocalCUDACluster(ip="", n_workers=n_gpus)
        print(f'Launching Local Dask cluster with {n_gpus} GPUs')
    client = Client(cluster)
    print(client)
    print(cluster)
    return client, cluster
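# A minimal usage sketch for launch_dask, assuming a machine with GPUs for the
# local path; the argument values are illustrative, not from the original.
if __name__ == "__main__":
    # Local dask-cuda cluster with 4 GPU workers (k8s=False skips Kubernetes)
    client, cluster = launch_dask(
        n_gpus=4, min_gpus=1, k8s=False, adapt=False, worker_spec=None
    )
    client.close()
    cluster.close()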
def test_pod_from_yaml_expand_env_vars(image_name, loop, ns):
    try:
        os.environ["FOO_IMAGE"] = image_name
        test_yaml = {
            "kind": "Pod",
            "metadata": {"labels": {"app": "dask", "component": "dask-worker"}},
            "spec": {
                "containers": [
                    {
                        "args": [
                            "dask-worker",
                            "$(DASK_SCHEDULER_ADDRESS)",
                            "--nthreads",
                            "1",
                        ],
                        "image": "${FOO_IMAGE}",
                        "imagePullPolicy": "IfNotPresent",
                        "name": "dask-worker",
                    }
                ]
            },
        }

        with tmpfile(extension="yaml") as fn:
            with open(fn, mode="w") as f:
                yaml.dump(test_yaml, f)
            with KubeCluster.from_yaml(f.name, loop=loop, namespace=ns) as cluster:
                assert cluster.pod_template.spec.containers[0].image == image_name
    finally:
        del os.environ["FOO_IMAGE"]
async def test_pod_from_yaml_expand_env_vars(image_name, ns, auth):
    try:
        os.environ["FOO_IMAGE"] = image_name
        test_yaml = {
            "kind": "Pod",
            "metadata": {"labels": {"app": "dask", "component": "dask-worker"}},
            "spec": {
                "containers": [
                    {
                        "args": [
                            "dask-worker",
                            "$(DASK_SCHEDULER_ADDRESS)",
                            "--nthreads",
                            "1",
                        ],
                        "image": "${FOO_IMAGE}",
                        "imagePullPolicy": "IfNotPresent",
                        "name": "dask-worker",
                    }
                ]
            },
        }

        with tmpfile(extension="yaml") as fn:
            with open(fn, mode="w") as f:
                yaml.dump(test_yaml, f)
            async with KubeCluster.from_yaml(
                f.name, namespace=ns, auth=auth, **cluster_kwargs
            ) as cluster:
                assert cluster.pod_template.spec.containers[0].image == image_name
    finally:
        del os.environ["FOO_IMAGE"]
def make_kube(pod_spec, **kws):
    """Create a dask_kubernetes.KubeCluster.

    pod_spec is either the name of a YAML file containing the worker pod
    specification or a dict containing the specification directly. kws is
    passed to KubeCluster.from_yaml or .from_dict.
    """
    from dask_kubernetes import KubeCluster

    if isinstance(pod_spec, str):
        return KubeCluster.from_yaml(pod_spec, **kws)
    else:
        return KubeCluster.from_dict(pod_spec, **kws)
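# Usage sketch for make_kube; the YAML filename and image are placeholders,
# and the dict form mirrors the pod specs used in the tests in this collection.
cluster = make_kube("worker-spec.yaml", namespace="dask")
cluster.scale(2)

# Or hand the pod specification over directly as a dict:
pod_spec = {
    "kind": "Pod",
    "spec": {
        "containers": [
            {
                "args": ["dask-worker", "$(DASK_SCHEDULER_ADDRESS)", "--nthreads", "1"],
                "image": "daskdev/dask:latest",
                "name": "dask-worker",
            }
        ]
    },
}
cluster = make_kube(pod_spec)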
async def test_pod_from_yaml(image_name, ns, auth):
    test_yaml = {
        "kind": "Pod",
        "metadata": {"labels": {"app": "dask", "component": "dask-worker"}},
        "spec": {
            "containers": [
                {
                    "args": [
                        "dask-worker",
                        "$(DASK_SCHEDULER_ADDRESS)",
                        "--nthreads",
                        "1",
                    ],
                    "image": image_name,
                    "imagePullPolicy": "IfNotPresent",
                    "name": "dask-worker",
                }
            ]
        },
    }

    with tmpfile(extension="yaml") as fn:
        with open(fn, mode="w") as f:
            yaml.dump(test_yaml, f)
        async with KubeCluster.from_yaml(
            f.name, namespace=ns, auth=auth, **cluster_kwargs
        ) as cluster:
            assert cluster.namespace == ns
            cluster.scale(2)
            await cluster
            async with Client(cluster, asynchronous=True) as client:
                future = client.submit(lambda x: x + 1, 10)
                result = await future.result(timeout=10)
                assert result == 11

                start = time()
                while len(cluster.scheduler_info["workers"]) < 2:
                    await asyncio.sleep(0.1)
                    assert time() < start + 20, "timeout"

                # Ensure that inter-worker communication works well
                futures = client.map(lambda x: x + 1, range(10))
                total = client.submit(sum, futures)
                assert (await total) == sum(map(lambda x: x + 1, range(10)))
                assert all((await client.has_what()).values())
def test_pod_from_yaml(image_name, loop, ns):
    test_yaml = {
        "kind": "Pod",
        "metadata": {"labels": {"app": "dask", "component": "dask-worker"}},
        "spec": {
            "containers": [
                {
                    "args": [
                        "dask-worker",
                        "$(DASK_SCHEDULER_ADDRESS)",
                        "--nthreads",
                        "1",
                    ],
                    "image": image_name,
                    "imagePullPolicy": "IfNotPresent",
                    "name": "dask-worker",
                }
            ]
        },
    }

    with tmpfile(extension="yaml") as fn:
        with open(fn, mode="w") as f:
            yaml.dump(test_yaml, f)
        with KubeCluster.from_yaml(f.name, loop=loop, namespace=ns) as cluster:
            assert cluster.namespace == ns
            cluster.scale(2)
            with Client(cluster) as client:
                future = client.submit(lambda x: x + 1, 10)
                result = future.result(timeout=10)
                assert result == 11

                start = time()
                while len(cluster.scheduler.workers) < 2:
                    sleep(0.1)
                    assert time() < start + 10, "timeout"

                # Ensure that inter-worker communication works well
                futures = client.map(lambda x: x + 1, range(10))
                total = client.submit(sum, futures)
                assert total.result() == sum(map(lambda x: x + 1, range(10)))
                assert all(client.has_what().values())
#!/usr/bin/env python
import logging

import distributed
import dask.array as da
from dask_kubernetes import KubeCluster

logging.basicConfig(level=logging.INFO)

with KubeCluster.from_yaml('/usr/src/app/specs/worker-spec.yaml') as cluster:
    cluster.scale(4)

    # Connect dask to the cluster
    client = distributed.Client(cluster)

    # Create an array and calculate the mean
    array = da.ones((1000, 1000, 1000), chunks=(100, 100, 10))
    print(array.mean().compute())  # Should print 1.0
#!/usr/bin/env python
# coding: utf-8

# In[14]:

from dask.distributed import Client

# In[15]:

from dask_kubernetes import KubeCluster

# In[16]:

cluster = KubeCluster.from_yaml("worker-spec.yml")
cluster.scale(1)

# In[13]:

cluster.close()

# In[ ]:
import torch
import ase
from ase.db import connect
from al_mlp.offline_active_learner import OfflineActiveLearner
from al_mlp.base_calcs.morse import MultiMorse
from al_mlp.atomistic_methods import Relaxation
from amptorch.trainer import AtomsTrainer
from dask_kubernetes import KubeCluster
from dask.distributed import Client

cluster = KubeCluster.from_yaml(
    "/home/jovyan/al_mlp/examples/offline_al_dask_example/dask-worker-cpu-spec.yml"
)
client = Client(cluster)
cluster.adapt(minimum=0, maximum=4)

# Only necessary to upload the egg file to workers
# if al_mlp is not in the workers' environment
files_list = ["al_mlp-0.1-py3.6.egg"]
# for i in range(len(files_list)):
#     fname = files_list[i]
#     with open(fname, "rb") as f:
#         data = f.read()
#     def _worker_upload(dask_worker, *, data, fname):
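# A sketch completing the commented-out upload loop above, following the same
# pattern as the deap_ga example later in this collection; it assumes
# distributed's Client.register_worker_callbacks and Worker.upload_file APIs.
import functools

def _worker_upload(dask_worker, *, data, fname):
    dask_worker.loop.add_callback(
        callback=dask_worker.upload_file,
        comm=None,  # not used
        filename=fname,
        data=data,
        load=True,
    )

for fname in files_list:
    with open(fname, "rb") as f:
        data = f.read()
    # Install the upload as a setup callback so new workers also receive it
    client.register_worker_callbacks(
        setup=functools.partial(_worker_upload, data=data, fname=fname)
    )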
#!/usr/bin/env python
# coding: utf-8

# In[2]:

from dask_kubernetes import KubeCluster

# In[ ]:

# Initial attempt at creation (also failed due to networking)
#tag::create_in_default[]
cluster = KubeCluster.from_yaml('worker-spec.yaml')
#end::create_in_default[]

# In[5]:

# This one failed because I was running it outside the cluster,
# so it could not communicate with the scheduler
#tag::create_in_namespace[]
cluster = KubeCluster.from_yaml('worker-spec.yaml', namespace='dask')
#end::create_in_namespace[]

# In[3]:

cluster.adapt(minimum=1, maximum=100)
def _get_kubernetes_cluster(worker_template_path=WORKER_TEMPLATE_PATH):
    from dask_kubernetes import KubeCluster
    from dask.distributed import Client

    cluster = KubeCluster.from_yaml(worker_template_path)
    return Client(cluster)
from dask_kubernetes import KubeCluster
from dask.distributed import Client
import os

os.environ['SKLEARN_SITE_JOBLIB'] = "1"

from sklearn import datasets, linear_model
from sklearn.model_selection import cross_validate
# pyitab's cross_validate deliberately shadows the sklearn import above
from pyitab.ext.sklearn._validation import cross_validate
import joblib

cluster = KubeCluster.from_yaml('pods.yml')
pods = cluster.scale(6)
client = Client(cluster.scheduler_address)

diabetes = datasets.load_diabetes()

from dask.array import from_array
X = from_array(diabetes.data, chunks='auto')
y = from_array(diabetes.target, chunks='auto')

model = linear_model.LinearRegression()

with joblib.parallel_backend('dask', scatter=[model, X, y]):
    cv_results = cross_validate(model, X, y,
                                cv=10,
                                return_train_score=False,
                                verbose=100)
from dask_kubernetes import KubeCluster
import os
import time

cluster = KubeCluster.from_yaml('worker-spec.yaml', port=8786, diagnostics_port=8787)

# Get ENV variables for cluster scaling config
# More info: https://github.com/dask/distributed/blob/master/distributed/deploy/adaptive.py
min_workers_number = int(os.getenv('DASK_CLUSTER_MIN_WORKERS', 1))
max_workers_number = int(os.getenv('DASK_CLUSTER_MAX_WORKERS', 5))
startup_cost = os.getenv('DASK_CLUSTER_STARTUP_COST', '10s')
target_duration = os.getenv('DASK_CLUSTER_TARGET_DURATION', '10s')
wait_count = int(os.getenv('DASK_CLUSTER_WAIT_COUNT', 3))
check_interval = os.getenv('DASK_CLUSTER_CHECK_INTERVAL', '5s')

cluster.adapt(minimum=min_workers_number, maximum=max_workers_number,
              startup_cost=startup_cost, target_duration=target_duration,
              wait_count=wait_count, interval=check_interval)

# Keep the process alive so the adaptive cluster stays up
while True:
    time.sleep(0.1)
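# Illustrative only: the environment variables read by the script above can be
# set before it runs; the names come from the script, the values are made up.
import os

os.environ["DASK_CLUSTER_MIN_WORKERS"] = "2"
os.environ["DASK_CLUSTER_MAX_WORKERS"] = "20"
os.environ["DASK_CLUSTER_TARGET_DURATION"] = "30s"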
from dask_kubernetes import KubeCluster
import dask.config
import numpy as np

# In[ ]:

#tag::remote_lb_deploy[]

# In[2]:

# Specify a remote deployment using a load balancer, necessary for
# communication between the notebook and the cluster
dask.config.set({"kubernetes.scheduler-service-type": "LoadBalancer"})

# In[4]:

cluster = KubeCluster.from_yaml('worker-spec.yaml', namespace='dask', deploy_mode='remote')

# In[ ]:

#end::remote_lb_deploy[]

# In[5]:

cluster.adapt(minimum=1, maximum=100)

# In[6]:

# Example usage
from dask.distributed import Client
import dask.array as da
    sigma=0.2,
    ibrion=2,
    nsw=1000,
    # lorbit=11,
    potim=0.2,
    isif=0,
    # ediffg=-0.02,
    # ediff=1e-6,
    lcharg=False,
    lwave=False,
    lreal=False,
    ispin=2,
    isym=0)

# Run between 0 and 4 1-core/1-gpu workers on the kube cluster
cluster = KubeCluster.from_yaml('worker-cpu-spec.yml')
client = Client(cluster)
# cluster.adapt(minimum=0, maximum=10)
cluster.scale(10)

files_list = ['deap_ga.py', 'fillPool.py', 'mutations.py', 'utils.py']
for i in range(len(files_list)):
    fname = files_list[i]
    with open(fname, 'rb') as f:
        data = f.read()

    def _worker_upload(dask_worker, *, data, fname):
        dask_worker.loop.add_callback(
            callback=dask_worker.upload_file,
            comm=None,  # not used
            filename=fname,
from dask_kubernetes import KubeCluster
import dask.config
# import dask.distributed

dask.config.set({'kubernetes.name': 'myproject'})

cluster = KubeCluster.from_yaml('/worker-spec.yml')
cluster.scale_up(2)
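# Note: scale_up is the older scaling entry point; on newer dask_kubernetes
# releases the equivalent (and preferred) call is scale:
cluster.scale(2)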