def test_from_application_id(skein_client, conda_env):
    with YarnCluster(
        environment=conda_env,
        worker_memory="512 MiB",
        scheduler_memory="512 MiB",
        name="test-from-application-id",
        skein_client=skein_client,
    ) as cluster:
        # Connect to the application with the application id
        cluster2 = YarnCluster.from_application_id(cluster.app_id, skein_client)

        cluster2.scale(1)
        start = time.time()
        while len(cluster2.workers()) != 1:
            time.sleep(0.1)
            assert time.time() < start + 30, "timeout cluster.scale(1)"

        del cluster2

        # Cluster is still running, finalizer not run in cluster2
        assert len(cluster.workers()) == 1

    check_is_shutdown(skein_client, cluster.app_id)
def test_from_current(skein_client, conda_env, monkeypatch, tmpdir):
    # Not running in a container
    with pytest.raises(ValueError) as exc:
        YarnCluster.from_current()
    assert str(exc.value) == "Not running inside a container"

    with YarnCluster(
        environment=conda_env,
        worker_memory="512 MiB",
        scheduler_memory="512 MiB",
        name="test-from-current",
        skein_client=skein_client,
    ) as cluster:
        # Patch environment so it looks like a container
        container_id = "container_1526134340424_0012_01_000005"
        cont_dir = tmpdir.mkdir(container_id)
        with open(str(cont_dir.join(".skein.crt")), "wb") as fil:
            fil.write(skein_client.security._get_bytes("cert"))
        with open(str(cont_dir.join(".skein.pem")), "wb") as fil:
            fil.write(skein_client.security._get_bytes("key"))
        for key, val in [
            ("SKEIN_APPLICATION_ID", cluster.app_id),
            ("CONTAINER_ID", container_id),
            ("SKEIN_APPMASTER_ADDRESS", cluster.application_client.address),
            ("LOCAL_DIRS", str(tmpdir)),
        ]:
            monkeypatch.setenv(key, val)

        import skein.core

        monkeypatch.setattr(skein.core, "properties", skein.core.Properties())

        cluster2 = YarnCluster.from_current()
        assert cluster2.app_id == cluster.app_id
        assert cluster2.scheduler_address == cluster.scheduler_address

        # Smoketest method
        cluster2.scale(1)
        start = time.time()
        while len(cluster2.workers()) != 1:
            time.sleep(0.1)
            assert time.time() < start + 30, "timeout cluster.scale(1)"

        del cluster2

        # Cluster is still running, finalizer not run in cluster2
        assert len(cluster.workers()) == 1

    check_is_shutdown(skein_client, cluster.app_id)
def test_dask_yarn():
    try:
        from dask_yarn import YarnCluster
    except ImportError:
        return

    # Validate dask_yarn configuration
    cluster = YarnCluster()
    client = Client(cluster)
    cluster.scale(4)

    x = da.sum(np.ones(5))
    x.compute()
def test_from_specification_errors():
    bad_spec = skein.ApplicationSpec.from_yaml("""
        name: bad_spec
        services:
          bad:
            resources:
              memory: 1 GiB
              vcores: 1
            script: exit 1
    """)

    with pytest.raises(ValueError):
        YarnCluster.from_specification(bad_spec)

    with pytest.raises(TypeError):
        YarnCluster.from_specification(object())
def test_widget_and_html_reprs(skein_client, conda_env):
    pytest.importorskip("ipywidgets")
    with YarnCluster(
        environment=conda_env,
        deploy_mode="local",
        worker_memory="256 MiB",
        name="test-widget",
        skein_client=skein_client,
    ) as cluster:
        # Smoke test widget
        cluster._widget()

        # Test non-widget html repr
        assert cluster.app_id in cluster._repr_html_()

        assert "0" in cluster._widget_status()

        # Scale up and wait
        cluster.scale(1)
        start = time.time()
        while len(cluster._observed) != 1:
            time.sleep(0.1)
            assert time.time() < start + 30, "timeout cluster.scale(1)"

        assert "1" in cluster._widget_status()

        # Scale down
        cluster.scale(1)

    check_is_shutdown(skein_client, cluster.app_id)
def test_logs(conda_env, skein_client):
    with YarnCluster(
        environment=conda_env,
        deploy_mode="local",
        worker_memory="256 MiB",
        name="test-widget",
        skein_client=skein_client,
    ) as cluster:
        cluster.scale(2)
        start = time.time()
        while len(cluster._observed) != 2:
            time.sleep(0.1)
            assert time.time() < start + 30, "timeout cluster.scale(2)"

        logs = cluster.logs()
        assert len(logs) == 3

        logs = cluster.logs(scheduler=True, workers=False)
        assert len(logs) == 1

        logs = cluster.logs(scheduler=False, workers=False)
        assert len(logs) == 0

        logs = cluster.logs(scheduler=False, workers=True)
        assert len(logs) == 2

    check_is_shutdown(skein_client, cluster.app_id)
def test_basic(skein_client, conda_env):
    with YarnCluster(environment=conda_env,
                     worker_memory='512 MiB',
                     scheduler_memory='512 MiB',
                     name='test-basic',
                     skein_client=skein_client) as cluster:
        # Smoketest repr
        repr(cluster)

        # Scale up
        cluster.scale(2)

        with Client(cluster) as client:
            future = client.submit(inc, 10)
            assert future.result() == 11

            client.get_versions(check=True)

        # Check that 2 workers exist
        start = time.time()
        while len(cluster.workers()) != 2:
            time.sleep(0.1)
            assert time.time() < start + 5, "timeout cluster.scale(2)"

        # Scale down
        cluster.scale(1)
        start = time.time()
        while len(cluster.workers()) != 1:
            time.sleep(0.1)
            assert time.time() < start + 5, "timeout cluster.scale(1)"

    check_is_shutdown(skein_client, cluster.app_id)
def test_basic(deploy_mode, skein_client, conda_env):
    with YarnCluster(
        environment=conda_env,
        deploy_mode=deploy_mode,
        worker_memory="512 MiB",
        scheduler_memory="512 MiB",
        name="test-basic",
        skein_client=skein_client,
        dashboard_address=":8787",
        port=8786,
        worker_options={"resources": {"FOO": "BAZ"}},
        worker_class="dask.distributed.Nanny",
    ) as cluster:
        # Smoketest repr
        repr(cluster)

        if bokeh_installed:
            assert cluster.dashboard_link is not None
            if deploy_mode == "local":
                assert ":8787" in cluster.dashboard_link

        if deploy_mode == "local":
            assert ":8786" in cluster.scheduler_address

        # Scale up
        cluster.scale(2)

        with Client(cluster) as client:
            future = client.submit(inc, 10)
            assert future.result() == 11

            client.get_versions(check=True)

            resource_tags = client.run(lambda dask_worker: dask_worker.total_resources)
            assert {"FOO": "BAZ"} in list(resource_tags.values())

        # Check that 2 workers exist
        start = time.time()
        while len(cluster.workers()) != 2:
            time.sleep(0.1)
            assert time.time() < start + 30, "timeout cluster.scale(2)"

        # Scale down
        cluster.scale(1)
        start = time.time()
        while len(cluster.workers()) != 1:
            time.sleep(0.1)
            assert time.time() < start + 30, "timeout cluster.scale(1)"

    check_is_shutdown(skein_client, cluster.app_id)
def do_postprocessing(s3_bucket, s3_bucket_prefix):
    fs = S3FileSystem()
    with fs.open(f'{s3_bucket}/{s3_bucket_prefix}/config.json', 'r') as f:
        cfg = json.load(f)

    ec2 = boto3.client('ec2')

    with open('/mnt/var/lib/info/job-flow.json', 'r') as f:
        job_flow_info = json.load(f)

    for instance_group in job_flow_info['instanceGroups']:
        if instance_group['instanceRole'].lower() == 'core':
            instance_type = instance_group['instanceType']
            instance_count = instance_group['requestedInstanceCount']

    instance_info = ec2.describe_instance_types(InstanceTypes=[instance_type])

    dask_worker_vcores = cfg['aws'].get('emr', {}).get('dask_worker_vcores', 2)
    instance_memory = instance_info['InstanceTypes'][0]['MemoryInfo']['SizeInMiB']
    instance_ncpus = instance_info['InstanceTypes'][0]['VCpuInfo']['DefaultVCpus']
    n_dask_workers = instance_count * instance_ncpus // dask_worker_vcores
    worker_memory = round(instance_memory / instance_ncpus * dask_worker_vcores * 0.95)

    cluster = YarnCluster(
        deploy_mode='local',
        worker_vcores=dask_worker_vcores,
        worker_memory='{} MiB'.format(worker_memory),
        n_workers=n_dask_workers
    )
    client = Client(cluster)  # noqa E841

    results_s3_loc = f'{s3_bucket}/{s3_bucket_prefix}/results'

    combine_results(fs, results_s3_loc, cfg)

    aws_conf = cfg.get('postprocessing', {}).get('aws', {})
    if 'athena' in aws_conf:
        tbl_prefix = s3_bucket_prefix.split('/')[-1]
        if not tbl_prefix:
            tbl_prefix = cfg['aws']['job_identifier']
        create_athena_tables(
            aws_conf,
            tbl_prefix,
            s3_bucket,
            f'{s3_bucket_prefix}/results/parquet'
        )

    keep_individual_timeseries = cfg.get('postprocessing', {}).get('keep_individual_timeseries', False)
    remove_intermediate_files(fs, results_s3_loc, keep_individual_timeseries)
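For orientation, here is a small walk-through of the worker-sizing arithmetic used above with hypothetical numbers; the instance figures (2 core instances, 4 vCPUs, 16384 MiB each) are illustrative assumptions, not values read from EMR.

```python
# Hypothetical numbers, only to illustrate the sizing formula in do_postprocessing.
instance_count = 2        # assumed number of core instances
instance_ncpus = 4        # assumed vCPUs per instance
instance_memory = 16384   # assumed MiB of memory per instance
dask_worker_vcores = 2    # the default used when the config omits it

n_dask_workers = instance_count * instance_ncpus // dask_worker_vcores
worker_memory = round(instance_memory / instance_ncpus * dask_worker_vcores * 0.95)

print(n_dask_workers)   # 4 workers across the core instances
print(worker_memory)    # 7782 MiB per worker (95% of its share of the instance memory)
```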
def test_from_specification(skein_client, conda_env, tmpdir, loop):
    spec = _make_specification(environment=conda_env,
                               worker_memory='512 MB',
                               scheduler_memory='512 MB',
                               name=APPNAME)
    fn = os.path.join(str(tmpdir), 'spec.yaml')
    with open(fn, 'w') as f:
        f.write(spec.to_yaml())

    with YarnCluster.from_specification(fn, skein_client=skein_client) as cluster:
        with Client(cluster, loop=loop):
            pass

    check_is_shutdown(skein_client, cluster.app_id)
def test_from_specification(skein_client, conda_env, tmpdir):
    spec = _make_specification(
        environment=conda_env,
        worker_memory="512 MiB",
        scheduler_memory="512 MiB",
        name="dask-yarn-test-from-specification",
    )
    fn = os.path.join(str(tmpdir), "spec.yaml")
    with open(fn, "w") as f:
        f.write(spec.to_yaml())

    with YarnCluster.from_specification(fn, skein_client=skein_client) as cluster:
        with Client(cluster):
            pass

    check_is_shutdown(skein_client, cluster.app_id)
def test_basic(deploy_mode, skein_client, conda_env):
    with YarnCluster(
        environment=conda_env,
        deploy_mode=deploy_mode,
        worker_memory="512 MiB",
        scheduler_memory="512 MiB",
        name="test-basic",
        skein_client=skein_client,
        dashboard_address=":8787",
        port=8786,
    ) as cluster:
        # Smoketest repr
        repr(cluster)

        if bokeh_installed:
            assert cluster.dashboard_link is not None
            if deploy_mode == "local":
                assert ":8787" in cluster.dashboard_link

        if deploy_mode == "local":
            assert ":8786" in cluster.scheduler_address

        # Scale up
        cluster.scale(2)

        with Client(cluster) as client:
            future = client.submit(inc, 10)
            assert future.result() == 11

            client.get_versions(check=True)

        # Check that 2 workers exist
        start = time.time()
        while len(cluster.workers()) != 2:
            time.sleep(0.1)
            assert time.time() < start + 30, "timeout cluster.scale(2)"

        # Scale down
        cluster.scale(1)
        start = time.time()
        while len(cluster.workers()) != 1:
            time.sleep(0.1)
            assert time.time() < start + 30, "timeout cluster.scale(1)"

    check_is_shutdown(skein_client, cluster.app_id)
def test_basic(skein_client, conda_env, loop):
    with YarnCluster(environment=conda_env,
                     worker_memory='512 MB',
                     scheduler_memory='512 MB',
                     name=APPNAME,
                     skein_client=skein_client) as cluster:
        cluster.scale(2)

        with Client(cluster, loop=loop) as client:
            future = client.submit(inc, 10)
            assert future.result() == 11

            start = time.time()
            while len(client.scheduler_info()['workers']) < 2:
                time.sleep(0.1)
                assert time.time() < start + 5

            client.get_versions(check=True)

    check_is_shutdown(skein_client, cluster.app_id)
def test_adapt(skein_client, conda_env):
    with YarnCluster(
        environment=conda_env,
        deploy_mode="local",
        worker_memory="256 MiB",
        name="test-adapt",
        skein_client=skein_client,
    ) as cluster:
        cluster.adapt()

        with Client(cluster) as client:
            future = client.submit(lambda x: x + 1, 10)
            result = future.result()
            assert result == 11

        start = time.time()
        while cluster.workers():
            time.sleep(0.1)
            assert time.time() < start + 30, "auto-scaledown timeout"

    check_is_shutdown(skein_client, cluster.app_id)
async def test_basic_async(deploy_mode, skein_client, conda_env):
    async with YarnCluster(
        environment=conda_env,
        deploy_mode=deploy_mode,
        worker_memory="512 MiB",
        scheduler_memory="512 MiB",
        name="test-basic-async",
        skein_client=skein_client,
        asynchronous=True,
    ) as cluster:
        # Smoketest repr
        repr(cluster)

        if bokeh_installed:
            assert cluster.dashboard_link is not None

        # Scale up
        await cluster.scale(2)

        async with Client(cluster, asynchronous=True) as client:
            result = await client.submit(inc, 10)
            assert result == 11

            await client.get_versions(check=True)

        # Check that 2 workers exist
        start = time.time()
        while len(await cluster.workers()) != 2:
            await asyncio.sleep(0.1)
            assert time.time() < start + 30, "timeout cluster.scale(2)"

        # Scale down
        await cluster.scale(1)
        start = time.time()
        while len(await cluster.workers()) != 1:
            await asyncio.sleep(0.1)
            assert time.time() < start + 30, "timeout cluster.scale(1)"

    check_is_shutdown(skein_client, cluster.app_id)
def execute(pipeline_context, execution_plan):
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

    dask_config = pipeline_context.executor_config

    check.param_invariant(
        isinstance(pipeline_context.executor_config, DaskConfig),
        'pipeline_context',
        'Expected executor_config to be DaskConfig got {}'.format(
            pipeline_context.executor_config),
    )

    # Checks to ensure storage is compatible with Dask configuration
    storage = pipeline_context.environment_dict.get('storage')
    check.invariant(storage.keys(), 'Must specify storage to use Dask execution')

    check.invariant(
        pipeline_context.instance.is_persistent,
        'Dask execution requires a persistent DagsterInstance',
    )

    # https://github.com/dagster-io/dagster/issues/2440
    check.invariant(
        pipeline_context.system_storage_def.is_persistent,
        'Cannot use in-memory storage with Dask, use filesystem, S3, or GCS',
    )

    step_levels = execution_plan.execution_step_levels()

    pipeline_name = pipeline_context.pipeline_def.name

    instance = pipeline_context.instance

    cluster_type = dask_config.cluster_type
    if cluster_type == 'local':
        from dask.distributed import LocalCluster

        cluster = LocalCluster(**dask_config.build_dict(pipeline_name))
    elif cluster_type == 'yarn':
        from dask_yarn import YarnCluster

        cluster = YarnCluster(**dask_config.build_dict(pipeline_name))
    elif cluster_type == 'ssh':
        from dask.distributed import SSHCluster

        cluster = SSHCluster(**dask_config.build_dict(pipeline_name))
    elif cluster_type == 'pbs':
        from dask_jobqueue import PBSCluster

        cluster = PBSCluster(**dask_config.build_dict(pipeline_name))
    elif cluster_type == 'kube':
        from dask_kubernetes import KubeCluster

        cluster = KubeCluster(**dask_config.build_dict(pipeline_name))
    else:
        raise ValueError(
            f"Must be providing one of the following ('local', 'yarn', 'ssh', 'pbs', 'kube') not {cluster_type}"
        )

    with dask.distributed.Client(cluster) as client:
        execution_futures = []
        execution_futures_dict = {}

        for step_level in step_levels:
            for step in step_level:
                # We ensure correctness in sequencing by letting Dask schedule futures and
                # awaiting dependencies within each step.
                dependencies = []
                for step_input in step.step_inputs:
                    for key in step_input.dependency_keys:
                        dependencies.append(execution_futures_dict[key])

                environment_dict = dict(pipeline_context.environment_dict,
                                        execution={'in_process': {}})
                variables = {
                    'executionParams': {
                        'selector': {'name': pipeline_name},
                        'runConfigData': environment_dict,
                        'mode': pipeline_context.mode_def.name,
                        'executionMetadata': {'runId': pipeline_context.pipeline_run.run_id},
                        'stepKeys': [step.key],
                    }
                }

                dask_task_name = '%s.%s' % (pipeline_name, step.key)

                future = client.submit(
                    query_on_dask_worker,
                    pipeline_context.pipeline.get_reconstructable_repository(),
                    variables,
                    dependencies,
                    instance.get_ref(),
                    key=dask_task_name,
                    resources=get_dask_resource_requirements(step.tags),
                )

                execution_futures.append(future)
                execution_futures_dict[step.key] = future

        # This tells Dask to await the step executions and retrieve their results on the
        # master
        for future in dask.distributed.as_completed(execution_futures):
            for step_event in future.result():
                check.inst(step_event, DagsterEvent)

                yield step_event
        tf_addrs[i] = "%s:%d" % (tf_ip, ports[tf_ip])

    tf_spec = {'ps': tf_addrs[0:nps], 'worker': tf_addrs[nps:n_tf_servers]}
    return tf_spec, dask_spec


if __name__ == "__main__":
    mdl_args_str = sys.argv[1]
    env_pack_path = sys.argv[2]

    cluster_mode = 'local'  # default cluster mode is local
    if len(sys.argv) >= 4 and sys.argv[3] == 'remote':
        cluster_mode = 'remote'

    # create dask cluster from yarn or local
    cluster = YarnCluster(environment=env_pack_path,
                          n_workers=2,
                          worker_vcores=1,
                          worker_memory='500MiB',
                          deploy_mode=cluster_mode)
    time.sleep(3)  # wait a while for cluster setup to finish

    # create spec for tf servers
    nps, nworker = 1, 1
    tf_spec, dask_spec = create_spec(cluster, nps=nps, nworker=nworker)
    print(tf_spec, dask_spec)

    # create client for cluster to submit job
    client = Client(cluster)

    # submit job
    ps_jobs = [
        client.submit(tf_job, tf_spec,
from dask_yarn import YarnCluster
from dask.distributed import Client
import dask.array as da
import numpy as np

cluster = YarnCluster()
client = Client(cluster)

x = da.sum(np.ones(5))
x.compute()
# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'
# %%
import dask.dataframe as dd
import pandas as pd
from datetime import datetime
import numpy as np

# Create a client for the visualization tools
from dask.distributed import Client, progress
from dask_yarn import YarnCluster

cluster = YarnCluster()
client = Client(cluster)
cluster.adapt()
client

# %% [markdown]
# # Loading the datasets

# %% [markdown]
# ## YouGov - Wearing Mask in public

# %%
start = datetime.now()

## Load the dataset
df = dd.read_csv("gs://dask-vs-koalas/wearing_face_mask_public.csv", sep=";")

## Transform the dataset: one row per date/country
from dask_yarn import YarnCluster
from dask.distributed import Client
import dask.dataframe as dd
import gcsfs

# Create a cluster where each worker has two cores and 3 GB of memory
cluster = YarnCluster(environment='environnement.tar.gz',
                      worker_vcores=2,
                      worker_memory="3GB")

# Scale out to two such workers
cluster.scale(2)

# Connect to the cluster
client = Client(cluster)

df = dd.read_parquet('hdfs:///raw/sample.parquet', engine="pyarrow")
df = df.persist()
print(df)

# gcloud compute ssh hadoop-formation-m --project=after-yesterday-217007 -- -L 1080:hadoop-formation-m:8088 -N -n
class DualClientFuture():
    def __init__(self, local_client_n_workers, local_client_threads_per_worker,
                 yarn_client_n_workers, yarn_client_worker_vcores,
                 yarn_client_worker_memory, verbose=False):
        host_ip = get_host_ip_address()

        self.local_cluster = LocalCluster(
            n_workers=local_client_n_workers,
            threads_per_worker=local_client_threads_per_worker,
            processes=True,
            host=host_ip)
        self.local_client = Client(address=self.local_cluster, timeout='2s')

        self.yarn_cluster = YarnCluster(
            n_workers=yarn_client_n_workers,
            worker_vcores=yarn_client_worker_vcores,
            worker_memory=yarn_client_worker_memory,
            environment="python:///usr/bin/python3")
        self.yarn_client = Client(self.yarn_cluster)
        self.wait_container_resource_alloc()

        self.local_client_n_workers = local_client_n_workers
        self.yarn_client_n_workers = yarn_client_n_workers
        self.task_counter = -1
        self.verbose = verbose

    def wait_container_resource_alloc(self):
        # Block until no YARN container is still waiting for resources
        while True:
            waiting_containers = [
                yarn_container_obj
                for yarn_container_obj in self.yarn_cluster.workers()
                if str(yarn_container_obj.state) == 'WAITING'
            ]
            if len(waiting_containers) == 0:
                break
            time.sleep(1.0)

    def submit(self, func, *args, **kwargs):
        if self.verbose:
            print('total n workers: {}'.format(self.local_client_n_workers +
                                               self.yarn_client_n_workers))
        self.task_counter += 1
        remainder = self.task_counter % (self.local_client_n_workers +
                                         self.yarn_client_n_workers)

        # if remainder <= (self.local_client_n_workers-1):
        #     if self.verbose==True:
        #         print('remainder: {}, n_local_worker: {}, running on local'.format(remainder, self.local_client_n_workers))
        #     future = self.local_client.submit(func, *args, **kwargs)
        # else:
        #     if self.verbose==True:
        #         print('remainder: {}, n_local_worker: {}, running on remote'.format(remainder, self.local_client_n_workers))
        #     func = yarn_directory_normalizer(func)
        #     future = self.yarn_client.submit(func, None, *args, **kwargs)

        if remainder <= (self.yarn_client_n_workers - 1):
            if self.verbose:
                print('remainder: {}, n_local_worker: {}, running on remote'.format(
                    remainder, self.local_client_n_workers))
            func = yarn_directory_normalizer(func)
            future = self.yarn_client.submit(func, None, *args, **kwargs)
        else:
            if self.verbose:
                print('remainder: {}, n_local_worker: {}, running on local'.format(
                    remainder, self.local_client_n_workers))
            future = self.local_client.submit(func, *args, **kwargs)

        return future.result()

    def get_worker_ip_addresses(self):
        # Wait until all YARN containers are up, then collect their node IPs
        while True:
            yarn_container_objects = self.yarn_cluster.workers()
            if len(yarn_container_objects) == self.yarn_client_n_workers:
                break
            time.sleep(0.1)

        ip_addrs = set()
        for yarn_container_object in yarn_container_objects:
            ip_addrs.add(
                yarn_container_object.yarn_node_http_address.split('.')[0]
                .replace('-', '.')[3:])
        return list(ip_addrs)

    def submit_per_node(self, func, *args, **kwargs):
        func = yarn_directory_normalizer(func)
        ip_addrs = self.get_worker_ip_addresses()
        futures = list()
        for ip_addr in ip_addrs:
            futures.append(
                self.yarn_client.submit(func, ip_addr, *args, **kwargs,
                                        workers=ip_addr))
        return self.yarn_client.gather(futures)

    def get_dashboard_link(self):
        print('local cluster: ', self.local_cluster.dashboard_link)
        print('yarn cluster: ', self.yarn_cluster.dashboard_link)
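A minimal usage sketch for the class above, assuming DualClientFuture and its helpers (get_host_ip_address, yarn_directory_normalizer) are importable from the surrounding module and that /usr/bin/python3 exists on the YARN nodes; the worker counts, memory, and the report function are illustrative only. Note that functions routed through submit() to the YARN side receive an extra leading None argument, while submit_per_node() injects the node IP as the first argument.

```python
# Illustrative sketch only; not part of the original module.
dual = DualClientFuture(
    local_client_n_workers=2,
    local_client_threads_per_worker=1,
    yarn_client_n_workers=2,
    yarn_client_worker_vcores=1,
    yarn_client_worker_memory='1 GiB',
    verbose=True,
)

def report(ip_addr):
    # submit_per_node() passes the target node's IP as the first argument
    return 'hello from {}'.format(ip_addr)

print(dual.submit_per_node(report))   # one result per YARN node
dual.get_dashboard_link()             # prints both dashboard URLs
```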
import time

from dask_yarn import YarnCluster
from dask.distributed import Client

# Create a cluster
cluster = YarnCluster()

# Connect to the cluster
# client = Client(cluster)

cluster.scale(2)

time.sleep(100000)
def execute(self, pipeline_context, execution_plan):
    check.inst_param(pipeline_context, "pipeline_context", SystemPipelineExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)
    check.param_invariant(
        isinstance(pipeline_context.executor, DaskExecutor),
        "pipeline_context",
        "Expected executor to be DaskExecutor got {}".format(pipeline_context.executor),
    )

    check.invariant(
        pipeline_context.instance.is_persistent,
        "Dask execution requires a persistent DagsterInstance",
    )

    step_levels = execution_plan.execution_step_levels()

    pipeline_name = pipeline_context.pipeline_def.name

    instance = pipeline_context.instance

    cluster_type = self.cluster_type
    if cluster_type == "local":
        from dask.distributed import LocalCluster

        cluster = LocalCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "yarn":
        from dask_yarn import YarnCluster

        cluster = YarnCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "ssh":
        from dask.distributed import SSHCluster

        cluster = SSHCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "pbs":
        from dask_jobqueue import PBSCluster

        cluster = PBSCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "moab":
        from dask_jobqueue import MoabCluster

        cluster = MoabCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "sge":
        from dask_jobqueue import SGECluster

        cluster = SGECluster(**self.build_dict(pipeline_name))
    elif cluster_type == "lsf":
        from dask_jobqueue import LSFCluster

        cluster = LSFCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "slurm":
        from dask_jobqueue import SLURMCluster

        cluster = SLURMCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "oar":
        from dask_jobqueue import OARCluster

        cluster = OARCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "kube":
        from dask_kubernetes import KubeCluster

        cluster = KubeCluster(**self.build_dict(pipeline_name))
    else:
        raise ValueError(
            f"Must be providing one of the following ('local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube') not {cluster_type}"
        )

    with dask.distributed.Client(cluster) as client:
        execution_futures = []
        execution_futures_dict = {}

        for step_level in step_levels:
            for step in step_level:
                # We ensure correctness in sequencing by letting Dask schedule futures and
                # awaiting dependencies within each step.
                dependencies = []
                for step_input in step.step_inputs:
                    for key in step_input.dependency_keys:
                        dependencies.append(execution_futures_dict[key])

                run_config = dict(pipeline_context.run_config, execution={"in_process": {}})
                recon_repo = pipeline_context.pipeline.get_reconstructable_repository()

                dask_task_name = "%s.%s" % (pipeline_name, step.key)

                recon_pipeline = recon_repo.get_reconstructable_pipeline(pipeline_name)

                future = client.submit(
                    query_on_dask_worker,
                    dependencies,
                    recon_pipeline,
                    pipeline_context.pipeline_run,
                    run_config,
                    [step.key],
                    pipeline_context.mode_def.name,
                    instance.get_ref(),
                    key=dask_task_name,
                    resources=get_dask_resource_requirements(step.tags),
                )

                execution_futures.append(future)
                execution_futures_dict[step.key] = future

        # This tells Dask to await the step executions and retrieve their results on the
        # master
        futures = dask.distributed.as_completed(execution_futures, with_results=True)

        # Allow interrupts while waiting for the results from Dask
        for future, result in iterate_with_context(raise_interrupts_immediately, futures):
            for step_event in result:
                check.inst(step_event, DagsterEvent)

                yield step_event
def execute(self, pipeline_context, execution_plan):
    check.inst_param(pipeline_context, "pipeline_context", SystemPipelineExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)
    check.param_invariant(
        isinstance(pipeline_context.executor, DaskExecutor),
        "pipeline_context",
        "Expected executor to be DaskExecutor got {}".format(pipeline_context.executor),
    )

    # Checks to ensure storage is compatible with Dask configuration
    storage = pipeline_context.run_config.get("storage")
    check.invariant(storage.keys(), "Must specify storage to use Dask execution")

    check.invariant(
        pipeline_context.instance.is_persistent,
        "Dask execution requires a persistent DagsterInstance",
    )

    # https://github.com/dagster-io/dagster/issues/2440
    check.invariant(
        pipeline_context.system_storage_def.is_persistent,
        "Cannot use in-memory storage with Dask, use filesystem, S3, or GCS",
    )

    step_levels = execution_plan.execution_step_levels()

    pipeline_name = pipeline_context.pipeline_def.name

    instance = pipeline_context.instance

    cluster_type = self.cluster_type
    if cluster_type == "local":
        from dask.distributed import LocalCluster

        cluster = LocalCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "yarn":
        from dask_yarn import YarnCluster

        cluster = YarnCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "ssh":
        from dask.distributed import SSHCluster

        cluster = SSHCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "pbs":
        from dask_jobqueue import PBSCluster

        cluster = PBSCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "moab":
        from dask_jobqueue import MoabCluster

        cluster = MoabCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "sge":
        from dask_jobqueue import SGECluster

        cluster = SGECluster(**self.build_dict(pipeline_name))
    elif cluster_type == "lsf":
        from dask_jobqueue import LSFCluster

        cluster = LSFCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "slurm":
        from dask_jobqueue import SLURMCluster

        cluster = SLURMCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "oar":
        from dask_jobqueue import OARCluster

        cluster = OARCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "kube":
        from dask_kubernetes import KubeCluster

        cluster = KubeCluster(**self.build_dict(pipeline_name))
    else:
        raise ValueError(
            f"Must be providing one of the following ('local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube') not {cluster_type}"
        )

    with dask.distributed.Client(cluster) as client:
        execution_futures = []
        execution_futures_dict = {}

        for step_level in step_levels:
            for step in step_level:
                # We ensure correctness in sequencing by letting Dask schedule futures and
                # awaiting dependencies within each step.
                dependencies = []
                for step_input in step.step_inputs:
                    for key in step_input.dependency_keys:
                        dependencies.append(execution_futures_dict[key])

                run_config = dict(pipeline_context.run_config, execution={"in_process": {}})
                recon_repo = pipeline_context.pipeline.get_reconstructable_repository()
                variables = {
                    "executionParams": {
                        "selector": {
                            "pipelineName": pipeline_name,
                            "repositoryName": recon_repo.get_definition().name,
                            "repositoryLocationName": "<<in_process>>",
                        },
                        "runConfigData": run_config,
                        "mode": pipeline_context.mode_def.name,
                        "executionMetadata": {"runId": pipeline_context.pipeline_run.run_id},
                        "stepKeys": [step.key],
                    }
                }

                dask_task_name = "%s.%s" % (pipeline_name, step.key)

                workspace = create_in_process_ephemeral_workspace(
                    pointer=pipeline_context.pipeline.get_reconstructable_repository().pointer
                )

                future = client.submit(
                    query_on_dask_worker,
                    workspace,
                    variables,
                    dependencies,
                    instance.get_ref(),
                    key=dask_task_name,
                    resources=get_dask_resource_requirements(step.tags),
                )

                execution_futures.append(future)
                execution_futures_dict[step.key] = future

        # This tells Dask to await the step executions and retrieve their results on the
        # master
        for future in dask.distributed.as_completed(execution_futures):
            for step_event in future.result():
                check.inst(step_event, DagsterEvent)

                yield step_event