def init_cluster(num_workers, wait_for_all_workers=True):
    """
    Start up a dask cluster, optionally wait until all workers have been
    launched, and then return the resulting distributed.Client object.

    Args:
        num_workers:
            How many workers to launch.
        wait_for_all_workers:
            If True, pause until all workers have been launched before
            returning.  Otherwise, just wait for a single worker to launch.

    Returns:
        distributed.Client
    """
    # Local import: LSFCluster probably isn't importable on your local machine,
    # so it's nice to avoid importing it when you're just running local tests
    # without a cluster.
    from dask_jobqueue import LSFCluster

    cluster = LSFCluster(ip='0.0.0.0')
    cluster.scale(num_workers)

    required_workers = num_workers if wait_for_all_workers else 1

    client = Client(cluster)
    # The wait runs in both modes; only the threshold differs.  (Previously the
    # loop was skipped entirely when wait_for_all_workers was False, so the
    # documented "wait for a single worker" never happened.)
    while client.status == "running":
        missing = required_workers - len(cluster.scheduler.workers)
        if missing <= 0:
            break
        print(f"Waiting for {missing} workers...")
        time.sleep(1.0)

    return client
def test_informative_errors():
    """The ValueError raised for a None memory/cores value mentions that setting."""
    with pytest.raises(ValueError) as missing_memory:
        LSFCluster(memory=None, cores=4)
    memory_message = str(missing_memory.value)
    assert "memory" in memory_message

    with pytest.raises(ValueError) as missing_cores:
        LSFCluster(memory="1GB", cores=None)
    cores_message = str(missing_cores.value)
    assert "cores" in cores_message
def test_informative_errors():
    """Each invalid constructor call must fail with an error naming the bad option."""
    # (constructor kwargs, word expected in the error text)
    cases = (
        ({'memory': None, 'cores': 4}, 'memory'),
        ({'memory': '1GB', 'cores': None}, 'cores'),
    )
    for ctor_kwargs, keyword in cases:
        with pytest.raises(ValueError) as info:
            LSFCluster(**ctor_kwargs)
        assert keyword in str(info.value)
def adapt(self, minimum, maximum, **kwargs):
    """Create an adaptive LSF cluster and return the client attached to it.

    Args:
        minimum: Lower bound forwarded to the cluster's ``adapt``.
        maximum: Upper bound forwarded to the cluster's ``adapt``.
        **kwargs: Overrides for ``self.cluster_kwargs``; on conflict the
            per-call value wins.

    Returns:
        Whatever ``self.client`` evaluates to after the new client is stored.
    """
    # Start from the stored defaults, then layer the per-call overrides on top
    settings = dict(self.cluster_kwargs)
    settings.update(kwargs)
    # Delegate to the parent implementation first (it checks the values)
    super().adapt(minimum, maximum, **settings)
    # Build the cluster, then a client bound to it
    self._cluster = LSFCluster(**settings)
    self._client = Client(self._cluster)
    # Turn on adaptive scaling within the requested bounds
    self._cluster.adapt(minimum=minimum, maximum=maximum)
    return self.client
def test_header():
    """Check which #BSUB directives appear in the job header for several setups."""

    def _check(header, expectations):
        # Each entry is (fragment, should_be_present), verified in order.
        for fragment, present in expectations:
            assert (fragment in header) == present

    with LSFCluster(walltime="00:02", processes=4, cores=8, memory="8GB") as cluster:
        _check(
            cluster.job_header,
            [
                ("#BSUB", True),
                ("#BSUB -J dask-worker", True),
                ("#BSUB -n 8", True),
                ("#BSUB -M 8000", True),
                ("#BSUB -W 00:02", True),
                ("#BSUB -q", False),
                ("#BSUB -P", False),
            ],
        )
        assert "--name dask-worker--${JOB_ID}--" in cluster.job_script()

    # Explicit ncpus/mem take precedence over cores/memory in the header
    with LSFCluster(
        queue="general",
        project="DaskOnLSF",
        processes=4,
        cores=8,
        memory="28GB",
        ncpus=24,
        mem=100000000000,
    ) as cluster:
        _check(
            cluster.job_header,
            [
                ("#BSUB -q general", True),
                ("#BSUB -J dask-worker", True),
                ("#BSUB -n 24", True),
                ("#BSUB -n 8", False),
                ("#BSUB -M 100000", True),
                ("#BSUB -M 28000", False),
                ("#BSUB -W", True),
                ("#BSUB -P DaskOnLSF", True),
            ],
        )

    with LSFCluster(cores=4, memory="8GB") as cluster:
        _check(
            cluster.job_header,
            [
                ("#BSUB -n", True),
                ("#BSUB -W", True),
                ("#BSUB -M", True),
                ("#BSUB -q", False),
                ("#BSUB -P", False),
            ],
        )

    with LSFCluster(cores=4, memory="8GB", job_extra=["-u [email protected]"]) as cluster:
        _check(
            cluster.job_header,
            [
                ("#BSUB -u [email protected]", True),
                ("#BSUB -n", True),
                ("#BSUB -W", True),
                ("#BSUB -M", True),
                ("#BSUB -q", False),
                ("#BSUB -P", False),
            ],
        )
def test_adaptive_grouped(loop):
    """Adaptive cluster with minimum=1: a job appears, runs work, and workers register."""
    lsf_options = dict(
        walltime="00:02",
        processes=1,
        cores=2,
        memory="2GB",
        local_directory="/tmp",
        loop=loop,
    )
    with LSFCluster(**lsf_options) as cluster:
        cluster.adapt(minimum=1)  # at least 1 worker
        with Client(cluster) as client:
            # A job must be queued or running before the deadline
            deadline = time() + QUEUE_WAIT
            while not cluster.pending_jobs and not cluster.running_jobs:
                sleep(0.100)
                assert time() < deadline

            future = client.submit(lambda x: x + 1, 10)
            assert future.result(QUEUE_WAIT) == 11

            # A job must actually be running before the deadline
            deadline = time() + QUEUE_WAIT
            while not cluster.running_jobs:
                sleep(0.100)
                assert time() < deadline

            # The scheduler must report exactly `worker_processes` workers
            deadline = time() + QUEUE_WAIT
            expected_workers = cluster.worker_processes
            while len(client.scheduler_info()["workers"]) != expected_workers:
                sleep(0.1)
                assert time() < deadline
def test_config_name_lsf_takes_custom_config():
    """A cluster built with config_name picks up `name` from that config section."""
    custom_section = {
        "queue": "myqueue",
        "project": "myproject",
        "ncpus": 1,
        "cores": 1,
        "mem": 2,
        "memory": "2 GB",
        "walltime": "00:02",
        "job-extra": [],
        "lsf-units": "TB",
        "name": "myname",
        "processes": 1,
        "interface": None,
        "death-timeout": None,
        "local-directory": "/foo",
        "extra": [],
        "env-extra": [],
        "log-directory": None,
        "shebang": "#!/usr/bin/env bash",
        "use-stdin": None,
    }
    overrides = {"jobqueue.lsf-config-name": custom_section}

    # The config override is entered first, so the cluster is built under it
    with dask.config.set(overrides), LSFCluster(
        config_name="lsf-config-name"
    ) as cluster:
        assert cluster.job_name == "myname"
def test_config(loop):
    """Walltime and local-directory set via dask config show up in the job script."""
    overrides = {
        "jobqueue.lsf.walltime": "00:02",
        "jobqueue.lsf.local-directory": "/foo",
    }
    # The config override is entered first, so the cluster is built under it
    with dask.config.set(overrides), LSFCluster(
        loop=loop, cores=1, memory="2GB"
    ) as cluster:
        for fragment in ("00:02", "--local-directory /foo"):
            assert fragment in cluster.job_script()
def get_cluster(cluster_type, **kwargs):
    """Generic dask cluster wrapper

    Lower-cases ``cluster_type``, rejects anything other than lsf/pbs/slurm
    with a ValueError, normalises the optional ``config_name`` and
    ``walltime`` keyword arguments through the check helpers, and returns
    the matching cluster object built from ``kwargs``.
    """
    supported = ['lsf','pbs','slurm']

    # check input cluster type
    cluster_type = cluster_type.lower()
    if cluster_type not in supported:
        raise ValueError(
            "Cluster type '{}' not supported".format(cluster_type)
            + '\nsupported cluster types: {}'.format(supported)
        )
    print("Dask cluster type: {}".format(cluster_type))

    # check input config name
    if 'config_name' in kwargs:
        checked_name = check_config_name(kwargs['config_name'], cluster_type)
        kwargs['config_name'] = checked_name
        print("Dask config name: {}".format(kwargs['config_name']))

    # check walltime format for each cluster type
    if 'walltime' in kwargs:
        checked_walltime = check_walltime_format(kwargs["walltime"], cluster_type)
        kwargs['walltime'] = checked_walltime
        print('Dask worker walltime: {}'.format(kwargs['walltime']))

    # initiate cluster object
    if cluster_type == 'lsf':
        return LSFCluster(**kwargs)
    if cluster_type == 'pbs':
        return PBSCluster(**kwargs)
    # only 'slurm' can remain after the validation above
    return SLURMCluster(**kwargs)
def test_basic(loop):
    """Start two workers, run a task, inspect a worker, then stop all workers."""
    with LSFCluster(
        walltime='00:02',
        processes=1,
        cores=2,
        memory='2GB',
        local_directory='/tmp',
        loop=loop,
    ) as cluster, Client(cluster) as client:
        cluster.start_workers(2)
        assert cluster.pending_jobs or cluster.running_jobs

        future = client.submit(lambda x: x + 1, 10)
        assert future.result(QUEUE_WAIT) == 11
        assert cluster.running_jobs

        workers = list(client.scheduler_info()['workers'].values())
        first_worker = workers[0]
        assert first_worker['memory_limit'] == 2e9
        assert first_worker['ncores'] == 2

        cluster.stop_workers(workers)

        # Every worker must have left the scheduler before the deadline
        deadline = time() + QUEUE_WAIT
        while client.scheduler_info()['workers']:
            sleep(0.100)
            assert time() < deadline

        assert not cluster.running_jobs
def test_basic(loop):
    """Scale to two workers, run a task, inspect a worker, then stop all workers."""
    cluster_options = dict(
        walltime="00:02",
        processes=1,
        cores=2,
        memory="2GB",
        local_directory="/tmp",
        loop=loop,
    )
    with LSFCluster(**cluster_options) as cluster:
        with Client(cluster) as client:
            cluster.scale(2)
            assert cluster.pending_jobs or cluster.running_jobs

            future = client.submit(lambda x: x + 1, 10)
            assert future.result(QUEUE_WAIT) == 11
            assert cluster.running_jobs

            workers = list(client.scheduler_info()["workers"].values())
            first_worker = workers[0]
            assert first_worker["memory_limit"] == 2e9
            assert first_worker["nthreads"] == 2

            cluster.stop_workers(workers)

            # Poll until the scheduler reports no workers, bounded by QUEUE_WAIT
            deadline = time() + QUEUE_WAIT
            while client.scheduler_info()["workers"]:
                sleep(0.100)
                assert time() < deadline

            assert not cluster.running_jobs
def test_header():
    """Check which #BSUB directives appear in the job header for several setups."""
    with LSFCluster(walltime='00:02', processes=4, cores=8, memory='8GB') as cluster:
        header = cluster.job_header
        for expected in (
            '#BSUB',
            '#BSUB -J dask-worker',
            '#BSUB -n 8',
            '#BSUB -M 8000',
            '#BSUB -W 00:02',
        ):
            assert expected in header
        for unexpected in ('#BSUB -q', '#BSUB -P'):
            assert unexpected not in header
        assert '--name dask-worker--${JOB_ID}--' in cluster.job_script()

    # Explicit ncpus/mem take precedence over cores/memory in the header
    with LSFCluster(
        queue='general',
        project='DaskOnLSF',
        processes=4,
        cores=8,
        memory='28GB',
        ncpus=24,
        mem=100000000000,
    ) as cluster:
        header = cluster.job_header
        for expected in (
            '#BSUB -q general',
            '#BSUB -J dask-worker',
            '#BSUB -n 24',
            '#BSUB -M 100000',
            '#BSUB -W',
            '#BSUB -P DaskOnLSF',
        ):
            assert expected in header
        for unexpected in ('#BSUB -n 8', '#BSUB -M 28000'):
            assert unexpected not in header

    with LSFCluster(cores=4, memory='8GB') as cluster:
        header = cluster.job_header
        for expected in ('#BSUB -n', '#BSUB -W', '#BSUB -M'):
            assert expected in header
        for unexpected in ('#BSUB -q', '#BSUB -P'):
            assert unexpected not in header

    with LSFCluster(cores=4, memory='8GB', job_extra=['-u [email protected]']) as cluster:
        header = cluster.job_header
        for expected in (
            '#BSUB -u [email protected]',
            '#BSUB -n',
            '#BSUB -W',
            '#BSUB -M',
        ):
            assert expected in header
        for unexpected in ('#BSUB -q', '#BSUB -P'):
            assert unexpected not in header
def test_config(loop):  # noqa: F811
    """Walltime and local-directory set via dask config show up in the job script."""
    config_overrides = {
        'jobqueue.lsf.walltime': '00:02',
        'jobqueue.lsf.local-directory': '/foo',
    }
    with dask.config.set(config_overrides):
        cluster_cm = LSFCluster(loop=loop, cores=1, memory='2GB')
        with cluster_cm as cluster:
            assert '00:02' in cluster.job_script()
            assert '--local-directory /foo' in cluster.job_script()
def initializeLSFCluster(self, cores=1, memory="16GB", processes=1, death_timeout="600s", queue="normal", walltime="1:00", ncpus=1, threads_per_worker=2, mem=16000, **kwargs):
    """
    Initialize a dask_jobqueue.LSFCluster and hand it to ``self.setCluster``.

    inspired by: https://github.com/janelia-cosem/fst/blob/master/fst/distributed.py
    LSFCluster API: https://jobqueue.dask.org/en/latest/generated/dask_jobqueue.LSFCluster.html#dask_jobqueue.LSFCluster

    Args:
        cores, memory, processes, death_timeout, queue, walltime, ncpus, mem:
            Forwarded unchanged to ``LSFCluster``.
        threads_per_worker: Value exported to the BLAS/OpenMP thread-count
            variables in each job; must satisfy ``1 <= value <= 2 * cores``.
        **kwargs: Extra ``LSFCluster`` keyword arguments.  ``local_directory``
            defaults to ``/scratch/$USER/`` and ``log_directory`` to
            ``$HOME/.dask_distributed/`` (created if missing).

    Raises:
        ValueError: If ``threads_per_worker`` is outside ``[1, 2 * cores]``.
    """
    # TODO: add group detection for `project` keyword
    # NOTE(review): the default mem=16000 looks small next to memory="16GB" --
    # confirm the unit LSFCluster expects for `mem`.

    if not 1 <= threads_per_worker <= 2 * cores:
        raise ValueError("Maximum of 2 threads per core")

    tpw = threads_per_worker  # shorthand
    env_extra = [
        # MKL reads MKL_NUM_THREADS; the NUM_MKL_THREADS spelling used before
        # is kept only so existing job scripts stay a superset of the old ones.
        f"export MKL_NUM_THREADS={tpw}",
        f"export NUM_MKL_THREADS={tpw}",
        f"export OPENBLAS_NUM_THREADS={tpw}",
        f"export OPENMP_NUM_THREADS={tpw}",
        f"export OMP_NUM_THREADS={tpw}",
    ]

    # Environment variables are consulted only when the matching default is
    # needed, so callers that pass both directories do not need USER/HOME.
    if "local_directory" not in kwargs:
        kwargs["local_directory"] = f"/scratch/{os.environ['USER']}/"

    if "log_directory" not in kwargs:
        log_dir = f"{os.environ['HOME']}/.dask_distributed/"
        Path(log_dir).mkdir(parents=False, exist_ok=True)
        kwargs["log_directory"] = log_dir

    cluster = LSFCluster(
        queue=queue,
        walltime=walltime,
        cores=cores,
        ncpus=ncpus,
        memory=memory,
        env_extra=env_extra,
        death_timeout=death_timeout,
        processes=processes,
        mem=mem,
        **kwargs,
    )
    self.setCluster(cluster)
def test_job_script():
    """Check the directives and worker command line in the generated job script."""
    worker_command = "{} -m distributed.cli.dask_worker tcp://".format(sys.executable)
    worker_options = "--nthreads 2 --nprocs 4 --memory-limit 7.00GB"

    with LSFCluster(walltime="00:02", processes=4, cores=8, memory="28GB") as cluster:
        job_script = cluster.job_script()

        for directive in (
            "#BSUB",
            "#BSUB -J dask-worker",
            "#BSUB -n 8",
            "#BSUB -M 28000",
            "#BSUB -W 00:02",
        ):
            assert directive in job_script
        for directive in ("#BSUB -q", "#BSUB -P"):
            assert directive not in cluster.job_header

        assert worker_command in job_script
        assert worker_options in job_script

    # Explicit ncpus/mem take precedence over cores/memory in the header
    with LSFCluster(
        queue="general",
        project="DaskOnLSF",
        processes=4,
        cores=8,
        memory="28GB",
        ncpus=24,
        mem=100000000000,
    ) as cluster:
        job_script = cluster.job_script()
        header = cluster.job_header

        for directive in (
            "#BSUB -q general",
            "#BSUB -J dask-worker",
            "#BSUB -n 24",
            "#BSUB -M 100000",
            "#BSUB -W",
            "#BSUB -P DaskOnLSF",
        ):
            assert directive in header
        for directive in ("#BSUB -n 8", "#BSUB -M 28000"):
            assert directive not in header

        assert worker_command in job_script
        assert worker_options in job_script
def get_cluster(type, **kwargs):
    """Build the dask-jobqueue cluster selected by ``type``.

    Args:
        type: One of ``'LSF'``, ``'PBS'`` or ``'SLURM'`` (case-sensitive).
            The name shadows the builtin but is kept for caller compatibility.
        **kwargs: Forwarded unchanged to the chosen cluster class.

    Returns:
        The constructed cluster object.

    Raises:
        ValueError: If ``type`` is not a supported value.  (Previously this
            surfaced as an UnboundLocalError on the return statement.)
    """
    print("Using cluster type: {}".format(type))
    if type == 'LSF':
        return LSFCluster(**kwargs)
    if type == 'PBS':
        return PBSCluster(**kwargs)
    if type == 'SLURM':
        return SLURMCluster(**kwargs)
    raise ValueError(
        "Unsupported cluster type {!r}; expected 'LSF', 'PBS' or 'SLURM'".format(type)
    )
class LSFScheduler(Scheduler):
    """Scheduler that creates a dask-jobqueue LSFCluster when ``adapt`` is called."""

    def __init__(self, min_cores=1, max_cores=1, min_memory='1 GB', max_memory='1 GB', processes=1, walltime='02:00', **kwargs):
        """Store the resource bounds and the default LSFCluster arguments.

        The core/memory bounds go to the parent constructor.  The cluster
        defaults use ``max_memory`` for ``memory`` and ``min_cores`` for
        ``cores``; any extra ``kwargs`` override or extend those defaults.
        """
        super().__init__(
            min_cores=min_cores,
            max_cores=max_cores,
            min_memory=min_memory,
            max_memory=max_memory,
        )
        # Defaults first, then caller-supplied entries on top
        defaults = {
            'memory': max_memory,
            'cores': min_cores,
            'processes': processes,
            'walltime': walltime,
        }
        defaults.update(kwargs)
        self.cluster_kwargs = defaults

    def adapt(self, minimum, maximum, **kwargs):
        """Create an adaptive LSF cluster and return the client attached to it.

        ``kwargs`` override ``self.cluster_kwargs`` for this call only.
        """
        # Stored defaults, overridden by per-call values
        settings = dict(self.cluster_kwargs)
        settings.update(kwargs)
        # Delegate to the parent implementation first (it checks the values)
        super().adapt(minimum, maximum, **settings)
        # Build the cluster, then a client bound to it
        self._cluster = LSFCluster(**settings)
        self._client = Client(self._cluster)
        # Turn on adaptive scaling within the requested bounds
        self._cluster.adapt(minimum=minimum, maximum=maximum)
        return self.client
def start_dask_lsfcluster(cluster_size=5):
    """Start a dask cluster on LSF and wait for its workers.

    Args:
        cluster_size: Number of workers to scale to; must be at least 4.

    Returns:
        A ``(cluster, client)`` tuple.  The function returns only after
        ``client.wait_for_workers`` has seen ``cluster_size`` workers.

    Raises:
        ValueError: If ``cluster_size`` is below 4.  ValueError is a subclass
            of Exception, so callers catching the previous bare Exception
            still work.
    """
    if cluster_size < 4:
        raise ValueError('Too small of a cluster')

    # Settings for Sanger farm
    memory_in_gb = 20
    memory_in_mb = int(memory_in_gb * 1e+3)  # used in the -R resource string
    cluster = LSFCluster(
        queue='normal',
        walltime='00:30',
        log_directory='{}/dask_logs'.format(os.getcwd()),
        cores=4,
        memory='{} Gb'.format(memory_in_gb),
        mem=memory_in_gb * 10**9,  # should be in bytes
        lsf_units='mb',
        job_extra=[
            '-G team152',
            '-g /lt9/dask',
            '-R "select[mem>{}] rusage[mem={}]"'.format(
                memory_in_mb, memory_in_mb)
        ],
        use_stdin=True)

    # Scale cluster to a fixed size
    cluster.scale(cluster_size)

    client = Client(cluster, timeout=120)
    client.wait_for_workers(n_workers=cluster_size)

    return cluster, client
def test_use_stdin(loop, config_value, constructor_value):
    """
    Verify that use-stdin is respected when passed via the config
    OR the LSFCluster() constructor
    """
    # An explicit constructor argument wins; None falls back to the config value
    if constructor_value is None:
        expected = config_value
    else:
        expected = constructor_value

    with dask.config.set({"jobqueue.lsf.use-stdin": config_value}), LSFCluster(
        loop=loop, cores=1, memory="2GB", use_stdin=constructor_value
    ) as cluster:
        assert cluster._dummy_job.use_stdin == expected
def setup_dask_lsf_cluster(
    n_workers: int,
    queue: str,
    memory_gigabytes: int,
    wall_time: str,
    environment_name: str,
) -> "LSFCluster":
    """Set up a dask cluster which integrates with an existing LSF queue manager to
    spawn and manage workers.

    Args:
        n_workers: The number of workers to spawn.
        queue: The queue to submit the workers to.
        memory_gigabytes: The maximum memory to request per worker in GB.
        wall_time: The maximum wall-clock time to spawn each worker for.
        environment_name: The conda environment to activate for each worker.

    Returns:
        The initialized cluster.
    """

    import dask
    from dask_jobqueue import LSFCluster

    # Copy before appending: dask.config.get returns the list stored in the
    # global config, so appending to it directly would add another
    # "conda activate" line to the shared configuration on every call.
    env_extra = list(dask.config.get("jobqueue.lsf.env-extra", default=[]))
    env_extra.append(f"conda activate {environment_name}")

    cluster = LSFCluster(
        queue=queue,
        cores=1,
        memory=f"{memory_gigabytes * 1e9}B",
        walltime=wall_time,
        local_directory="dask-worker-space",
        log_directory="dask-worker-logs",
        env_extra=env_extra,
    )
    cluster.scale(n=n_workers)

    return cluster
def test_job_script():
    """Check the directives and worker command line in the generated job script."""

    def _verify(text, expectations):
        # Each entry is (fragment, should_be_present), verified in order.
        for fragment, present in expectations:
            assert (fragment in text) == present

    worker_command = '{} -m distributed.cli.dask_worker tcp://'.format(
        sys.executable)
    worker_options = '--nthreads 2 --nprocs 4 --memory-limit 7.00GB'

    with LSFCluster(walltime='00:02', processes=4, cores=8, memory='28GB') as cluster:
        job_script = cluster.job_script()
        _verify(job_script, [
            ('#BSUB', True),
            ('#BSUB -J dask-worker', True),
            ('#BSUB -n 8', True),
            ('#BSUB -M 28000', True),
            ('#BSUB -W 00:02', True),
        ])
        _verify(cluster.job_header, [
            ('#BSUB -q', False),
            ('#BSUB -P', False),
        ])
        _verify(job_script, [
            (worker_command, True),
            (worker_options, True),
        ])

    # Explicit ncpus/mem take precedence over cores/memory in the header
    with LSFCluster(queue='general', project='DaskOnLSF', processes=4, cores=8,
                    memory='28GB', ncpus=24, mem=100000000000) as cluster:
        job_script = cluster.job_script()
        _verify(cluster.job_header, [
            ('#BSUB -q general', True),
            ('#BSUB -J dask-worker', True),
            ('#BSUB -n 24', True),
            ('#BSUB -n 8', False),
            ('#BSUB -M 100000', True),
            ('#BSUB -M 28000', False),
            ('#BSUB -W', True),
            ('#BSUB -P DaskOnLSF', True),
        ])
        _verify(job_script, [
            (worker_command, True),
            (worker_options, True),
        ])
def lsf_unit_detection_helper(expected_unit, conf_text=None):
    """Assert that LSFCluster formats ``-M`` using ``expected_unit``.

    Points ``LSF_ENVDIR`` at a fresh temporary directory, optionally writes
    ``conf_text`` to ``lsf.conf`` inside it, and checks that a 13GB cluster's
    job header contains the memory value produced by
    ``lsf.lsf_format_bytes_ceil`` for ``expected_unit``.

    The environment variable and the temporary directory are restored or
    removed even when the check fails, so one failing case cannot leak its
    ``LSF_ENVDIR`` into later tests.

    Args:
        expected_unit: Unit passed as ``lsf_units`` when computing the
            expected memory string.
        conf_text: Contents for ``lsf.conf``; no file is written when None.
    """
    temp_dir = tempfile.mkdtemp()
    current_lsf_envdir = os.environ.get("LSF_ENVDIR", None)
    os.environ["LSF_ENVDIR"] = temp_dir
    try:
        if conf_text is not None:
            with open(os.path.join(temp_dir, "lsf.conf"), "w") as conf_file:
                conf_file.write(conf_text)

        memory_string = "13GB"
        memory_base = parse_bytes(memory_string)
        correct_memory = lsf.lsf_format_bytes_ceil(memory_base, lsf_units=expected_unit)

        with LSFCluster(memory=memory_string, cores=1) as cluster:
            assert "#BSUB -M %s" % correct_memory in cluster.job_header
    finally:
        # Restore the environment first so it is correct even if rmtree fails
        if current_lsf_envdir is None:
            del os.environ["LSF_ENVDIR"]
        else:
            os.environ["LSF_ENVDIR"] = current_lsf_envdir
        rmtree(temp_dir)
def start_dask(num_workers, msg, logger):
    """Context manager used for starting/shutting down dask

    Loads ``dask-config.yaml`` from the current directory into the dask
    configuration, uses the first key of its ``jobqueue`` section as the
    cluster type, starts the cluster and yields a client connected to it.

    Args:
        num_workers (`int`): Number of dask workers
        msg (`str`): Message for timer
        logger: The logger being used

    Yields:
        client: Dask client

    Raises:
        ValueError: If the configured cluster type is not one of
            'local', 'lsf', 'slurm' or 'sge'.
    """

    # Update dask
    with open("dask-config.yaml") as f:
        config = yaml.load(f, Loader=SafeLoader)
    dask.config.update(dask.config.config, config)

    cluster_type = next(iter(dask.config.config['jobqueue']))
    set_local_directory(cluster_type)

    if cluster_type == 'local':
        from dask.distributed import LocalCluster
        cluster = LocalCluster(n_workers=num_workers, threads_per_worker=1)
    else:
        if cluster_type == 'lsf':
            from dask_jobqueue import LSFCluster
            cluster = LSFCluster()
        elif cluster_type == 'slurm':
            from dask_jobqueue import SLURMCluster
            cluster = SLURMCluster()
        elif cluster_type == 'sge':
            from dask_jobqueue import SGECluster
            cluster = SGECluster()
        else:
            # Previously this fell through and failed later on an unbound name
            raise ValueError(f"Unsupported dask cluster type: {cluster_type!r}")
        cluster.scale(num_workers)

    # Stays None if Client(...) raises, so the cleanup below cannot mask the
    # original error with an UnboundLocalError.
    client = None
    try:
        with io_util.Timing_Messager(f"Starting dask cluster for {msg}", logger):
            client = Client(cluster)
        io_util.print_with_datetime(
            f"Check {client.cluster.dashboard_link} for {msg} status.", logger)
        yield client
    finally:
        if client is not None:
            client.shutdown()
            client.close()
        else:
            # No client was created; release the cluster directly
            cluster.close()
def get_jobqueue_cluster(walltime='12:00', ncpus=1, cores=1, local_directory=None, memory='15GB', env_extra=None, **kwargs):
    """
    Instantiate a dask_jobqueue cluster using the LSF scheduler on the Janelia Research Campus compute cluster.
    This function wraps the class dask_jobqueue.LSFCluster and instantiates this class with some sensible defaults.
    Extra kwargs added to this function will be passed to LSFCluster().
    The full API for the LSFCluster object can be found here:
    https://jobqueue.dask.org/en/latest/generated/dask_jobqueue.LSFCluster.html#dask_jobqueue.LSFCluster

    Some of the functions require dask-jobqueue < 0.7

    As a side effect this sets ``jobqueue.lsf.use-stdin`` to True in the
    global dask configuration.  When ``env_extra`` is None the worker
    environment pins the BLAS/OpenMP thread counts to 1; when
    ``local_directory`` is None it defaults to ``/scratch/$USER/``.
    """
    import dask
    # this is necessary to ensure that workers get the job script from stdin
    dask.config.set({"jobqueue.lsf.use-stdin": True})
    from dask_jobqueue import LSFCluster
    import os

    if env_extra is None:
        env_extra = [
            # MKL reads MKL_NUM_THREADS; the NUM_MKL_THREADS spelling used
            # before is kept so the job script stays a superset of the old one.
            "export MKL_NUM_THREADS=1",
            "export NUM_MKL_THREADS=1",
            "export OPENBLAS_NUM_THREADS=1",
            "export OPENMP_NUM_THREADS=1",
            "export OMP_NUM_THREADS=1",
        ]

    if local_directory is None:
        local_directory = '/scratch/' + os.environ['USER'] + '/'

    cluster = LSFCluster(
        queue='normal',
        walltime=walltime,
        ncpus=ncpus,
        cores=cores,
        local_directory=local_directory,
        memory=memory,
        env_extra=env_extra,
        job_extra=["-o /dev/null"],
        **kwargs
    )
    return cluster
def _build_lsf(self):
    """Build an LSFCluster from this object's worker settings.

    ``cores`` and ``memory`` are the per-worker values multiplied by
    ``workers_per_job``; ``ncpus`` is set equal to ``cores``.  A
    ``-R rusage[mem=...]`` entry built from ``self.memory`` is appended to
    any ``job_extra`` already present in ``self.cluster_kwargs``.

    The merge is done on a copy, so ``self.cluster_kwargs`` is left untouched
    and repeated calls do not accumulate duplicate ``-R`` entries.

    Returns:
        The constructed ``dask_jobqueue.LSFCluster``.
    """
    from dask_jobqueue import LSFCluster

    cores = self.workers_per_job * self.cores_per_worker
    memory = self.workers_per_job * self.memory_per_worker

    rusage = '-R rusage[mem={}]'.format(self.memory or "")
    cluster_kwargs = dict(self.cluster_kwargs)
    cluster_kwargs["job_extra"] = list(cluster_kwargs.get("job_extra") or []) + [rusage]

    cluster = LSFCluster(
        n_workers=self.num_workers,
        processes=self.workers_per_job,
        cores=cores,
        memory=memory,
        ncpus=cores,
        **cluster_kwargs
    )
    return cluster
def get_jobqueue_cluster(walltime="12:00", cores=1, local_directory=None, memory="16GB", env_extra=None, **kwargs):
    """
    Instantiate a dask_jobqueue cluster using the LSF scheduler on the Janelia Research Campus compute cluster.
    This function wraps the class dask_jobqueue.LSFCluster and instantiates this class with some sensible defaults.
    Extra kwargs added to this function will be passed to LSFCluster().
    The full API for the LSFCluster object can be found here:
    https://jobqueue.dask.org/en/latest/generated/dask_jobqueue.LSFCluster.html#dask_jobqueue.LSFCluster

    When ``env_extra`` is None the worker environment pins the BLAS/OpenMP
    thread counts to 1; when ``local_directory`` is None it defaults to
    ``/scratch/$USER/``.
    """
    from dask_jobqueue import LSFCluster
    import os

    if env_extra is None:
        env_extra = [
            # MKL reads MKL_NUM_THREADS; the NUM_MKL_THREADS spelling used
            # before is kept so the job script stays a superset of the old one.
            "export MKL_NUM_THREADS=1",
            "export NUM_MKL_THREADS=1",
            "export OPENBLAS_NUM_THREADS=1",
            "export OPENMP_NUM_THREADS=1",
            "export OMP_NUM_THREADS=1",
        ]

    if local_directory is None:
        local_directory = "/scratch/" + os.environ["USER"] + "/"

    cluster = LSFCluster(
        queue="normal",
        walltime=walltime,
        cores=cores,
        local_directory=local_directory,
        memory=memory,
        env_extra=env_extra,
        job_extra=['-o /dev/null'],
        **kwargs
    )
    return cluster
def test_config_name_lsf_takes_custom_config():
    """A cluster built with config_name picks up `name` from that config section."""
    section_name = 'lsf-config-name'
    section = {
        'queue': 'myqueue',
        'project': 'myproject',
        'ncpus': 1,
        'cores': 1,
        'mem': 2,
        'memory': '2 GB',
        'walltime': '00:02',
        'job-extra': [],
        'name': 'myname',
        'processes': 1,
        'interface': None,
        'death-timeout': None,
        'local-directory': '/foo',
        'extra': [],
        'env-extra': [],
        'log-directory': None,
        'shebang': '#!/usr/bin/env bash',
    }

    # Register the section under the jobqueue namespace, then build from it
    with dask.config.set({'jobqueue.' + section_name: section}):
        with LSFCluster(config_name=section_name) as cluster:
            assert cluster.name == 'myname'
def test_adaptive(loop):
    """Adaptive cluster scales up for a task and back down once it is released."""

    def _wait_for(condition, pause):
        # Poll until condition() is truthy; fail once QUEUE_WAIT has elapsed.
        deadline = time() + QUEUE_WAIT
        while not condition():
            sleep(pause)
            assert time() < deadline

    def _worker_count(client):
        return len(client.scheduler_info()['workers'])

    with LSFCluster(
        walltime='00:02',
        processes=1,
        cores=2,
        memory='2GB',
        local_directory='/tmp',
        loop=loop,
    ) as cluster:
        cluster.adapt()
        with Client(cluster) as client:
            future = client.submit(lambda x: x + 1, 10)

            # A job gets queued or started for the pending task
            _wait_for(lambda: cluster.pending_jobs or cluster.running_jobs, 0.100)

            assert future.result(QUEUE_WAIT) == 11

            # The scheduler reports exactly `worker_processes` workers
            processes = cluster.worker_processes
            _wait_for(lambda: _worker_count(client) == processes, 0.1)

            del future

            # With no work left, all workers go away ...
            _wait_for(lambda: _worker_count(client) == 0, 0.100)

            # ... and no job is pending or running any more
            _wait_for(
                lambda: not (cluster.pending_jobs or cluster.running_jobs), 0.100
            )
            assert cluster.finished_jobs
def get_cluster(cluster_type, **kwargs):
    """Generic dask cluster wrapper

    Lower-cases ``cluster_type`` and rejects anything other than
    lsf/pbs/slurm/local with a ValueError.  'local' returns a default
    LocalCluster immediately (``kwargs`` are not used).  For the HPC types the
    optional ``config_name`` and ``walltime`` keyword arguments are normalised
    through the check helpers, the cluster is built from ``kwargs``, and its
    job script is printed and written to ``dask_command_run_from_python.txt``.
    """
    supported = ['lsf','pbs','slurm','local']

    # check input cluster type
    cluster_type = cluster_type.lower()
    if cluster_type not in supported:
        raise ValueError(
            "Cluster type '{}' not supported".format(cluster_type)
            + '\nsupported cluster types: {}'.format(supported)
        )
    print("Dask cluster type: {}".format(cluster_type))

    # No need to do the extra configuration checking if using LocalCluster
    if cluster_type == 'local':
        return LocalCluster()

    # check input config name
    if 'config_name' in kwargs:
        checked_name = check_config_name(kwargs['config_name'], cluster_type)
        kwargs['config_name'] = checked_name
        print("Dask config name: {}".format(kwargs['config_name']))

    # check walltime format for each cluster type
    if 'walltime' in kwargs:
        checked_walltime = check_walltime_format(kwargs["walltime"], cluster_type)
        kwargs['walltime'] = checked_walltime
        print('Dask worker walltime: {}'.format(kwargs['walltime']))

    # initiate cluster object
    if cluster_type == 'lsf':
        cluster = LSFCluster(**kwargs)
    elif cluster_type == 'pbs':
        cluster = PBSCluster(**kwargs)
    elif cluster_type == 'slurm':
        cluster = SLURMCluster(**kwargs)

    # Print and write job command file for HPC cluster types
    print("JOB COMMAND CALLED FROM PYTHON:\n\n", cluster.job_script())
    with open('dask_command_run_from_python.txt', 'w') as command_file:
        command_file.write(cluster.job_script() + '\n')

    return cluster
def execute(self, pipeline_context, execution_plan):
    """Run every step of ``execution_plan`` on a Dask cluster, yielding events.

    A cluster of the kind named by ``self.cluster_type`` is created with the
    arguments from ``self.build_dict(pipeline_name)``.  One Dask future is
    submitted per step, with the futures of the steps it depends on passed
    along so Dask handles the ordering.  Step events are yielded as the
    futures complete.

    Args:
        pipeline_context: A SystemPipelineExecutionContext whose executor is a
            DaskExecutor and whose instance is persistent.
        execution_plan: The ExecutionPlan to run.

    Yields:
        DagsterEvent: Each event returned by the per-step worker calls.

    Raises:
        ValueError: If ``self.cluster_type`` is not a supported cluster kind.
    """
    # Function-scope import keeps this block self-contained
    import importlib

    check.inst_param(pipeline_context, "pipeline_context", SystemPipelineExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)
    check.param_invariant(
        isinstance(pipeline_context.executor, DaskExecutor),
        "pipeline_context",
        "Expected executor to be DaskExecutor got {}".format(pipeline_context.executor),
    )

    check.invariant(
        pipeline_context.instance.is_persistent,
        "Dask execution requires a persistent DagsterInstance",
    )

    step_levels = execution_plan.execution_step_levels()

    pipeline_name = pipeline_context.pipeline_def.name

    instance = pipeline_context.instance

    # cluster type -> (module, class name).  Only the selected backend's
    # module is imported, so the other optional packages need not be installed.
    cluster_classes = {
        "local": ("dask.distributed", "LocalCluster"),
        "yarn": ("dask_yarn", "YarnCluster"),
        "ssh": ("dask.distributed", "SSHCluster"),
        "pbs": ("dask_jobqueue", "PBSCluster"),
        "moab": ("dask_jobqueue", "MoabCluster"),
        "sge": ("dask_jobqueue", "SGECluster"),
        "lsf": ("dask_jobqueue", "LSFCluster"),
        "slurm": ("dask_jobqueue", "SLURMCluster"),
        "oar": ("dask_jobqueue", "OARCluster"),
        "kube": ("dask_kubernetes", "KubeCluster"),
    }

    cluster_type = self.cluster_type
    if cluster_type not in cluster_classes:
        raise ValueError(
            f"Must be providing one of the following ('local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube') not {cluster_type}"
        )
    module_name, class_name = cluster_classes[cluster_type]
    cluster_class = getattr(importlib.import_module(module_name), class_name)
    cluster = cluster_class(**self.build_dict(pipeline_name))

    with dask.distributed.Client(cluster) as client:
        execution_futures = []
        execution_futures_dict = {}

        for step_level in step_levels:
            for step in step_level:
                # We ensure correctness in sequencing by letting Dask schedule futures and
                # awaiting dependencies within each step.
                dependencies = [
                    execution_futures_dict[key]
                    for step_input in step.step_inputs
                    for key in step_input.dependency_keys
                ]

                run_config = dict(pipeline_context.run_config, execution={"in_process": {}})
                recon_repo = pipeline_context.pipeline.get_reconstructable_repository()

                dask_task_name = "%s.%s" % (pipeline_name, step.key)

                recon_pipeline = recon_repo.get_reconstructable_pipeline(pipeline_name)

                future = client.submit(
                    query_on_dask_worker,
                    dependencies,
                    recon_pipeline,
                    pipeline_context.pipeline_run,
                    run_config,
                    [step.key],
                    pipeline_context.mode_def.name,
                    instance.get_ref(),
                    key=dask_task_name,
                    resources=get_dask_resource_requirements(step.tags),
                )

                execution_futures.append(future)
                execution_futures_dict[step.key] = future

        # This tells Dask to await the step executions and retrieve their results to the
        # master
        futures = dask.distributed.as_completed(execution_futures, with_results=True)

        # Allow interrupts while waiting for the results from Dask
        for future, result in iterate_with_context(raise_interrupts_immediately, futures):
            for step_event in result:
                check.inst(step_event, DagsterEvent)
                yield step_event