def execute(self, pipeline_context, execution_plan):
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.param_invariant(
        isinstance(pipeline_context.executor, DaskExecutor),
        'pipeline_context',
        'Expected executor to be DaskExecutor got {}'.format(pipeline_context.executor),
    )

    # Checks to ensure storage is compatible with Dask configuration
    storage = pipeline_context.run_config.get('storage')
    check.invariant(storage.keys(), 'Must specify storage to use Dask execution')

    check.invariant(
        pipeline_context.instance.is_persistent,
        'Dask execution requires a persistent DagsterInstance',
    )

    # https://github.com/dagster-io/dagster/issues/2440
    check.invariant(
        pipeline_context.system_storage_def.is_persistent,
        'Cannot use in-memory storage with Dask, use filesystem, S3, or GCS',
    )

    step_levels = execution_plan.execution_step_levels()

    pipeline_name = pipeline_context.pipeline_def.name

    instance = pipeline_context.instance

    cluster_type = self.cluster_type
    if cluster_type == 'local':
        from dask.distributed import LocalCluster

        cluster = LocalCluster(**self.build_dict(pipeline_name))
    elif cluster_type == 'yarn':
        from dask_yarn import YarnCluster

        cluster = YarnCluster(**self.build_dict(pipeline_name))
    elif cluster_type == 'ssh':
        from dask.distributed import SSHCluster

        cluster = SSHCluster(**self.build_dict(pipeline_name))
    elif cluster_type == 'pbs':
        from dask_jobqueue import PBSCluster

        cluster = PBSCluster(**self.build_dict(pipeline_name))
    elif cluster_type == 'moab':
        from dask_jobqueue import MoabCluster

        cluster = MoabCluster(**self.build_dict(pipeline_name))
    elif cluster_type == 'sge':
        from dask_jobqueue import SGECluster

        cluster = SGECluster(**self.build_dict(pipeline_name))
    elif cluster_type == 'lsf':
        from dask_jobqueue import LSFCluster

        cluster = LSFCluster(**self.build_dict(pipeline_name))
    elif cluster_type == 'slurm':
        from dask_jobqueue import SLURMCluster

        cluster = SLURMCluster(**self.build_dict(pipeline_name))
    elif cluster_type == 'oar':
        from dask_jobqueue import OARCluster

        cluster = OARCluster(**self.build_dict(pipeline_name))
    elif cluster_type == 'kube':
        from dask_kubernetes import KubeCluster

        cluster = KubeCluster(**self.build_dict(pipeline_name))
    else:
        raise ValueError(
            "Must provide one of the following cluster types: ('local', 'yarn', 'ssh', 'pbs', "
            f"'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube'), not {cluster_type}"
        )

    with dask.distributed.Client(cluster) as client:
        execution_futures = []
        execution_futures_dict = {}

        for step_level in step_levels:
            for step in step_level:
                # We ensure correctness in sequencing by letting Dask schedule futures and
                # awaiting dependencies within each step.
                dependencies = []
                for step_input in step.step_inputs:
                    for key in step_input.dependency_keys:
                        dependencies.append(execution_futures_dict[key])

                run_config = dict(pipeline_context.run_config, execution={'in_process': {}})
                recon_repo = pipeline_context.pipeline.get_reconstructable_repository()
                variables = {
                    'executionParams': {
                        'selector': {
                            'pipelineName': pipeline_name,
                            'repositoryName': recon_repo.get_definition().name,
                            'repositoryLocationName': '<<in_process>>',
                        },
                        'runConfigData': run_config,
                        'mode': pipeline_context.mode_def.name,
                        'executionMetadata': {'runId': pipeline_context.pipeline_run.run_id},
                        'stepKeys': [step.key],
                    }
                }

                dask_task_name = '%s.%s' % (pipeline_name, step.key)

                workspace = create_in_process_ephemeral_workspace(
                    pointer=pipeline_context.pipeline.get_reconstructable_repository().pointer
                )

                future = client.submit(
                    query_on_dask_worker,
                    workspace,
                    variables,
                    dependencies,
                    instance.get_ref(),
                    key=dask_task_name,
                    resources=get_dask_resource_requirements(step.tags),
                )

                execution_futures.append(future)
                execution_futures_dict[step.key] = future

        # This tells Dask to await the step executions and retrieve their results to the
        # master
        for future in dask.distributed.as_completed(execution_futures):
            for step_event in future.result():
                check.inst(step_event, DagsterEvent)

                yield step_event
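# The sequencing in `execute` above relies on documented dask.distributed behavior: when a
# Future is passed as an argument to `client.submit`, Dask runs the dependent task only after
# that future completes and its result is available on the worker. A minimal, self-contained
# sketch of that pattern, independent of Dagster (the `load`/`double` names and the helper
# `_example_future_dependencies` are illustrative only, not part of the executor code):
def _example_future_dependencies():
    from dask.distributed import Client, LocalCluster

    def load():
        return [1, 2, 3]

    def double(upstream):
        return [x * 2 for x in upstream]

    with Client(LocalCluster(n_workers=1)) as client:
        load_future = client.submit(load, key="example.load")
        # Passing `load_future` as an argument makes Dask schedule `double` only after
        # `load` has finished, mirroring the `dependencies` list built per step above.
        double_future = client.submit(double, load_future, key="example.double")
        return double_future.result()  # [2, 4, 6]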
def execute(self, pipeline_context, execution_plan):
    check.inst_param(pipeline_context, "pipeline_context", SystemPipelineExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)
    check.param_invariant(
        isinstance(pipeline_context.executor, DaskExecutor),
        "pipeline_context",
        "Expected executor to be DaskExecutor got {}".format(pipeline_context.executor),
    )

    check.invariant(
        pipeline_context.instance.is_persistent,
        "Dask execution requires a persistent DagsterInstance",
    )

    step_levels = execution_plan.execution_step_levels()

    pipeline_name = pipeline_context.pipeline_def.name

    instance = pipeline_context.instance

    cluster_type = self.cluster_type
    if cluster_type == "local":
        from dask.distributed import LocalCluster

        cluster = LocalCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "yarn":
        from dask_yarn import YarnCluster

        cluster = YarnCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "ssh":
        from dask.distributed import SSHCluster

        cluster = SSHCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "pbs":
        from dask_jobqueue import PBSCluster

        cluster = PBSCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "moab":
        from dask_jobqueue import MoabCluster

        cluster = MoabCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "sge":
        from dask_jobqueue import SGECluster

        cluster = SGECluster(**self.build_dict(pipeline_name))
    elif cluster_type == "lsf":
        from dask_jobqueue import LSFCluster

        cluster = LSFCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "slurm":
        from dask_jobqueue import SLURMCluster

        cluster = SLURMCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "oar":
        from dask_jobqueue import OARCluster

        cluster = OARCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "kube":
        from dask_kubernetes import KubeCluster

        cluster = KubeCluster(**self.build_dict(pipeline_name))
    else:
        raise ValueError(
            "Must provide one of the following cluster types: ('local', 'yarn', 'ssh', 'pbs', "
            f"'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube'), not {cluster_type}"
        )

    with dask.distributed.Client(cluster) as client:
        execution_futures = []
        execution_futures_dict = {}

        for step_level in step_levels:
            for step in step_level:
                # We ensure correctness in sequencing by letting Dask schedule futures and
                # awaiting dependencies within each step.
                dependencies = []
                for step_input in step.step_inputs:
                    for key in step_input.dependency_keys:
                        dependencies.append(execution_futures_dict[key])

                run_config = dict(pipeline_context.run_config, execution={"in_process": {}})
                recon_repo = pipeline_context.pipeline.get_reconstructable_repository()

                dask_task_name = "%s.%s" % (pipeline_name, step.key)

                recon_pipeline = recon_repo.get_reconstructable_pipeline(pipeline_name)

                future = client.submit(
                    query_on_dask_worker,
                    dependencies,
                    recon_pipeline,
                    pipeline_context.pipeline_run,
                    run_config,
                    [step.key],
                    pipeline_context.mode_def.name,
                    instance.get_ref(),
                    key=dask_task_name,
                    resources=get_dask_resource_requirements(step.tags),
                )

                execution_futures.append(future)
                execution_futures_dict[step.key] = future

        # This tells Dask to await the step executions and retrieve their results to the
        # master
        futures = dask.distributed.as_completed(execution_futures, with_results=True)

        # Allow interrupts while waiting for the results from Dask
        for future, result in iterate_with_context(raise_interrupts_immediately, futures):
            for step_event in result:
                check.inst(step_event, DagsterEvent)

                yield step_event
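# Both versions of `execute` dispatch on `cluster_type` with a long if/elif chain so that each
# cluster backend is imported lazily, only when selected. The same behavior can be written as a
# lookup table; this is only a sketch of that alternative layout (the `_build_cluster` helper is
# hypothetical and not part of either version above):
def _build_cluster(cluster_type, cluster_config):
    import importlib

    # Map each supported cluster type to the module and class used in `execute` above.
    cluster_types = {
        'local': ('dask.distributed', 'LocalCluster'),
        'yarn': ('dask_yarn', 'YarnCluster'),
        'ssh': ('dask.distributed', 'SSHCluster'),
        'pbs': ('dask_jobqueue', 'PBSCluster'),
        'moab': ('dask_jobqueue', 'MoabCluster'),
        'sge': ('dask_jobqueue', 'SGECluster'),
        'lsf': ('dask_jobqueue', 'LSFCluster'),
        'slurm': ('dask_jobqueue', 'SLURMCluster'),
        'oar': ('dask_jobqueue', 'OARCluster'),
        'kube': ('dask_kubernetes', 'KubeCluster'),
    }
    if cluster_type not in cluster_types:
        raise ValueError(
            f"Must provide one of the following cluster types: {tuple(cluster_types)}, "
            f"not {cluster_type}"
        )
    module_name, class_name = cluster_types[cluster_type]
    # Import lazily, as the if/elif chain does, so optional backends remain optional.
    module = importlib.import_module(module_name)
    return getattr(module, class_name)(**cluster_config)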
def run_dask(options: dict,
             docker_username: str = None,
             docker_password: str = None,
             docker: bool = False,
             slurm_job_array: bool = False):
    try:
        # Build the cluster: a LocalCluster by default, or a dask-jobqueue cluster when a
        # 'jobqueue' section is present in the options.
        if 'jobqueue' not in options:
            cluster = LocalCluster()
        else:
            jobqueue = options['jobqueue']
            gpus = options['gpus'] if 'gpus' in options else 0
            if 'slurm' in jobqueue:
                print("Requesting SLURM cluster:")
                pprint(jobqueue['slurm'])
                cluster = SLURMCluster(job_extra=[f"--gres=gpu:{gpus}"], **jobqueue['slurm']) if gpus \
                    else SLURMCluster(**jobqueue['slurm'])
            elif 'pbs' in jobqueue:
                print("Requesting PBS cluster:")
                pprint(jobqueue['pbs'])
                cluster = PBSCluster(job_extra=[f"--gres=gpu:{gpus}"], **jobqueue['pbs']) if gpus \
                    else PBSCluster(**jobqueue['pbs'])
            elif 'moab' in jobqueue:
                print("Requesting MOAB cluster:")
                pprint(jobqueue['moab'])
                cluster = MoabCluster(job_extra=[f"--gres=gpu:{gpus}"], **jobqueue['moab']) if gpus \
                    else MoabCluster(**jobqueue['moab'])
            elif 'sge' in jobqueue:
                print("Requesting SGE cluster:")
                pprint(jobqueue['sge'])
                cluster = SGECluster(job_extra=[f"--gres=gpu:{gpus}"], **jobqueue['sge']) if gpus \
                    else SGECluster(**jobqueue['sge'])
            elif 'lsf' in jobqueue:
                print("Requesting LSF cluster:")
                pprint(jobqueue['lsf'])
                cluster = LSFCluster(job_extra=[f"--gres=gpu:{gpus}"], **jobqueue['lsf']) if gpus \
                    else LSFCluster(**jobqueue['lsf'])
            elif 'oar' in jobqueue:
                print("Requesting OAR cluster:")
                pprint(jobqueue['oar'])
                cluster = OARCluster(job_extra=[f"--gres=gpu:{gpus}"], **jobqueue['oar']) if gpus \
                    else OARCluster(**jobqueue['oar'])
            else:
                raise ValueError(f"Unsupported jobqueue configuration: {jobqueue}")
            print(f"Cluster job script: {cluster.job_script()}")

        if 'output' in options and 'from' in options['output']:
            output_path = options['output']['from']
        else:
            output_path = '.'

        if 'input' not in options:
            # No input: run a single container.
            env = options['env'] if 'env' in options else []
            params = options['parameters'] if 'parameters' in options else []
            bind_mounts = options['bind_mounts'] if 'bind_mounts' in options else []
            no_cache = options['no_cache'] if 'no_cache' in options else False
            gpus = options['gpus'] if 'gpus' in options else 0

            if 'jobqueue' in options:
                cluster.scale(1)
            with Client(cluster) as client:
                command = prep_command(
                    work_dir=options['workdir'],
                    image=options['image'],
                    command=options['command'],
                    env=env + [{'key': 'INDEX', 'value': 1}],
                    parameters=params + [{'key': 'OUTPUT', 'value': output_path}],
                    bind_mounts=bind_mounts,
                    no_cache=no_cache,
                    gpus=gpus,
                    docker_username=docker_username,
                    docker_password=docker_password,
                    docker=docker)

                logger.info("Submitting container")
                future = submit_command(client, command, options['log_file'] if 'log_file' in options else None, 3)
                future.result()
                if future.status != 'finished':
                    logger.error(f"Container failed: {future.exception}")
                else:
                    logger.info("Container completed")
        elif options['input']['kind'] == InputKind.DIRECTORY:
            # Directory input: run a single container over the whole directory.
            input_path = options['input']['path']
            env = options['env'] if 'env' in options else []
            params = options['parameters'] if 'parameters' in options else []
            bind_mounts = options['bind_mounts'] if 'bind_mounts' in options else []
            no_cache = options['no_cache'] if 'no_cache' in options else False
            gpus = options['gpus'] if 'gpus' in options else 0

            if 'jobqueue' in options:
                cluster.scale(1)
            with Client(cluster) as client:
                command = prep_command(
                    work_dir=options['workdir'],
                    image=options['image'],
                    command=options['command'],
                    env=env + [{'key': 'INDEX', 'value': 1}],
                    parameters=params + [{'key': 'INPUT', 'value': input_path},
                                         {'key': 'OUTPUT', 'value': output_path}],
                    bind_mounts=bind_mounts,
                    no_cache=no_cache,
                    gpus=gpus,
                    docker_username=docker_username,
                    docker_password=docker_password,
                    docker=docker)

                logger.info(f"Submitting container for directory '{input_path}'")
                future = submit_command(client, command, options['log_file'] if 'log_file' in options else None, 3)
                future.result()
                if future.status != 'finished':
                    logger.error(f"Container failed for directory '{input_path}': {future.exception}")
                else:
                    logger.info(f"Container completed for directory '{input_path}'")
        elif options['input']['kind'] == InputKind.FILES:
            input_path = options['input']['path']
            if slurm_job_array:
                # SLURM job array: each array task processes the single file selected by
                # SLURM_ARRAY_TASK_ID.
                files = os.listdir(input_path)
                file_id = int(os.environ.get('SLURM_ARRAY_TASK_ID'))
                current_file = files[file_id]

                env = options['env'] if 'env' in options else []
                params = options['parameters'] if 'parameters' in options else []
                patterns = options['input']['patterns'] if 'patterns' in options['input'] else []
                bind_mounts = options['bind_mounts'] if 'bind_mounts' in options else []
                no_cache = options['no_cache'] if 'no_cache' in options else False
                gpus = options['gpus'] if 'gpus' in options else 0

                if 'jobqueue' in options:
                    cluster.scale(1)
                with Client(cluster) as client:
                    command = prep_command(
                        work_dir=options['workdir'],
                        image=options['image'],
                        command=options['command'],
                        env=env + [{'key': 'INDEX', 'value': file_id}] +
                            [{'key': 'PATTERNS', 'value': ','.join(patterns)}],
                        parameters=params + [{'key': 'INPUT', 'value': join(input_path, current_file)},
                                             {'key': 'OUTPUT', 'value': output_path}],
                        bind_mounts=bind_mounts,
                        no_cache=no_cache,
                        gpus=gpus,
                        docker_username=docker_username,
                        docker_password=docker_password,
                        docker=docker)

                    logger.info(f"Submitting container for file '{input_path}'")
                    future = submit_command(client, command, options['log_file'] if 'log_file' in options else None, 3)
                    future.result()
                    if future.status != 'finished':
                        logger.error(f"Container failed for file '{input_path}': {future.exception}")
                    else:
                        logger.info(f"Container completed for file '{input_path}'")

                    logger.info("Run succeeded")
            else:
                # One container per file, fanned out across the cluster.
                files = os.listdir(input_path)
                count = len(files)
                futures = []

                if 'jobqueue' not in options:
                    logger.info(f"Processing {count} files in '{input_path}'")
                else:
                    logger.info(
                        f"Requesting {count} nodes to process {count} files in '{input_path}' "
                        f"with job script:\n{cluster.job_script()}")
                    cluster.scale(count)

                env = options['env'] if 'env' in options else []
                params = deepcopy(options['parameters']) if 'parameters' in options else []
                patterns = options['input']['patterns'] if 'patterns' in options['input'] else []
                bind_mounts = options['bind_mounts'] if 'bind_mounts' in options else []
                no_cache = options['no_cache'] if 'no_cache' in options else False
                gpus = options['gpus'] if 'gpus' in options else 0

                with Client(cluster) as client:
                    num_files = len(files)
                    for i, current_file in tqdm.tqdm(enumerate(files), total=num_files):
                        command = prep_command(
                            work_dir=options['workdir'],
                            image=options['image'],
                            command=options['command'],
                            env=env + [{'key': 'INDEX', 'value': i}] +
                                [{'key': 'PATTERNS', 'value': ','.join(patterns)}],
                            parameters=params + [{'key': 'INPUT', 'value': join(input_path, current_file)},
                                                 {'key': 'OUTPUT', 'value': output_path}],
                            bind_mounts=bind_mounts,
                            no_cache=no_cache,
                            gpus=gpus,
                            docker_username=docker_username,
                            docker_password=docker_password,
                            docker=docker)

                        logger.info(f"Submitting container for file {i}")
                        futures.append(submit_command(client, command, options['log_file'] if 'log_file' in options else None, 3))

                    finished = 0
                    for future in tqdm.tqdm(as_completed(futures), total=num_files):
                        finished += 1
                        if future.status != 'finished':
                            logger.error(f"Container failed for file {finished}: {future.exception}")
                        else:
                            logger.info(f"Container completed for file {finished}")
        elif options['input']['kind'] == InputKind.FILE:
            # Single-file input: run one container for that file.
            input_path = options['input']['path']
            env = options['env'] if 'env' in options else []
            params = options['parameters'] if 'parameters' in options else []
            patterns = options['input']['patterns'] if 'patterns' in options['input'] else []
            bind_mounts = options['bind_mounts'] if 'bind_mounts' in options else []
            no_cache = options['no_cache'] if 'no_cache' in options else False
            gpus = options['gpus'] if 'gpus' in options else 0

            if 'jobqueue' in options:
                cluster.scale(1)
            with Client(cluster) as client:
                command = prep_command(
                    work_dir=options['workdir'],
                    image=options['image'],
                    command=options['command'],
                    env=env + [{'key': 'INDEX', 'value': 1}] +
                        [{'key': 'PATTERNS', 'value': ','.join(patterns)}],
                    parameters=params + [{'key': 'INPUT', 'value': input_path},
                                         {'key': 'OUTPUT', 'value': output_path}],
                    bind_mounts=bind_mounts,
                    no_cache=no_cache,
                    gpus=gpus,
                    docker_username=docker_username,
                    docker_password=docker_password,
                    docker=docker)

                logger.info("Submitting container for file 1")
                future = submit_command(client, command, options['log_file'] if 'log_file' in options else None, 3)
                future.result()
                if future.status != 'finished':
                    logger.error("Container failed for file 1")
                    logger.error(future.exception)
                else:
                    logger.info("Container completed for file 1")

                logger.info("Run succeeded")
    except:
        logger.error(f"Run failed: {traceback.format_exc()}")
        raise