from __future__ import absolute_import, division, print_function

import logging
import math
import os
import re
import shlex
import subprocess
import sys
import warnings
from contextlib import contextmanager

import dask
import six
from dask.utils import format_bytes, parse_bytes
from distributed import LocalCluster
from distributed.utils import get_ip_interface, tmpfile

# NOTE: ClusterManager, JobQueuePlugin, threads_deprecation_message and
# _job_id_from_worker_name are assumed to be defined elsewhere in this package.

logger = logging.getLogger(__name__)


class JobQueueCluster(ClusterManager):
    """ Base class to launch Dask Clusters for Job queues

    This class should not be used directly. Use an inherited class
    appropriate for your queueing system (e.g. PBSCluster or SLURMCluster).

    Parameters
    ----------
    name : str
        Name of Dask workers.
    cores : int
        Total number of cores per job.
    memory : str
        Total amount of memory per job.
    processes : int
        Number of processes per job.
    interface : str
        Network interface like 'eth0' or 'ib0'.
    death_timeout : float
        Seconds to wait for a scheduler before closing workers.
    local_directory : str
        Dask worker local directory for file spilling.
    extra : list
        Additional arguments to pass to `dask-worker`.
    env_extra : list
        Other commands to add to script before launching worker.
    log_directory : str
        Directory to use for job scheduler logs.
    shebang : str
        Path to desired interpreter for your batch submission script.
    python : str
        Python executable used to launch Dask workers.
    config_name : str
        Section to use from jobqueue.yaml configuration file.
    kwargs : dict
        Additional keyword arguments to pass to `LocalCluster`.

    Attributes
    ----------
    submit_command : str
        Abstract attribute for job scheduler submit command,
        should be overridden.
    cancel_command : str
        Abstract attribute for job scheduler cancel command,
        should be overridden.

    See Also
    --------
    PBSCluster
    SLURMCluster
    SGECluster
    OARCluster
    LSFCluster
    MoabCluster
    """

    _script_template = """
%(shebang)s

%(job_header)s

%(env_header)s

%(worker_command)s
""".lstrip()

    # Following class attributes should be overridden by extending classes.
    submit_command = None
    cancel_command = None
    job_id_regexp = r"(?P<job_id>\d+)"

    def __init__(self,
                 name=None,
                 cores=None,
                 memory=None,
                 processes=None,
                 interface=None,
                 death_timeout=None,
                 local_directory=None,
                 extra=None,
                 env_extra=None,
                 log_directory=None,
                 threads=None,
                 shebang=None,
                 python=sys.executable,
                 config_name=None,
                 **kwargs):
        """This initializer should be considered abstract and never be used directly."""
        super(JobQueueCluster, self).__init__()

        if threads is not None:
            raise ValueError(threads_deprecation_message)

        if config_name is None:
            raise NotImplementedError(
                "JobQueueCluster is an abstract class that should not be instantiated."
            )
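        # Illustrative note: with a hypothetical config_name="pbs", each
        # keyword left as None below falls back to the matching entry in
        # jobqueue.yaml, e.g. cores resolves to
        # dask.config.get("jobqueue.pbs.cores").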
        if name is None:
            name = dask.config.get("jobqueue.%s.name" % config_name)
        if cores is None:
            cores = dask.config.get("jobqueue.%s.cores" % config_name)
        if memory is None:
            memory = dask.config.get("jobqueue.%s.memory" % config_name)
        if processes is None:
            processes = dask.config.get("jobqueue.%s.processes" % config_name)
        if interface is None:
            interface = dask.config.get("jobqueue.%s.interface" % config_name)
        if death_timeout is None:
            death_timeout = dask.config.get("jobqueue.%s.death-timeout" % config_name)
        if local_directory is None:
            local_directory = dask.config.get("jobqueue.%s.local-directory" % config_name)
        if extra is None:
            extra = dask.config.get("jobqueue.%s.extra" % config_name)
        if env_extra is None:
            env_extra = dask.config.get("jobqueue.%s.env-extra" % config_name)
        if log_directory is None:
            log_directory = dask.config.get("jobqueue.%s.log-directory" % config_name)
        if shebang is None:
            shebang = dask.config.get("jobqueue.%s.shebang" % config_name)

        if dask.config.get("jobqueue.%s.threads" % config_name, None):
            warnings.warn(threads_deprecation_message)

        if cores is None:
            raise ValueError(
                "You must specify how many cores to use per job like ``cores=8``"
            )

        if memory is None:
            raise ValueError(
                "You must specify how much memory to use per job like ``memory='24 GB'``"
            )

        # This attribute should be overridden
        self.job_header = None

        if interface:
            extra += ["--interface", interface]
            kwargs.setdefault("ip", get_ip_interface(interface))
        else:
            kwargs.setdefault("ip", "")

        # Bokeh diagnostics server should listen on all interfaces
        kwargs.setdefault("dashboard_address", ("", 8787))
        self.local_cluster = LocalCluster(n_workers=0, **kwargs)

        # Keep information on processes, cores, and memory, for use in subclasses
        self.worker_memory = parse_bytes(memory) if memory is not None else None
        self.worker_processes = processes
        self.worker_cores = cores
        self.name = name

        # Plugin for tracking job status
        self._scheduler_plugin = JobQueuePlugin()
        self.local_cluster.scheduler.add_plugin(self._scheduler_plugin)

        self._adaptive = None
        self.shebang = shebang

        self._env_header = "\n".join(env_extra)

        # Build the dask-worker command line
        dask_worker_command = "%(python)s -m distributed.cli.dask_worker" % dict(python=python)

        command_args = [dask_worker_command, self.scheduler.address]
        command_args += ["--nthreads", self.worker_process_threads]
        if processes is not None and processes > 1:
            command_args += ["--nprocs", processes]
        command_args += ["--memory-limit", self.worker_process_memory]
        command_args += ["--name", "%s--${JOB_ID}--" % name]
        if death_timeout is not None:
            command_args += ["--death-timeout", death_timeout]
        if local_directory is not None:
            command_args += ["--local-directory", local_directory]
        if extra is not None:
            command_args += extra

        self._command_template = " ".join(map(str, command_args))

        self.log_directory = log_directory
        if self.log_directory is not None:
            if not os.path.exists(self.log_directory):
                os.makedirs(self.log_directory)

    def __repr__(self):
        running_workers = self._count_active_workers()
        running_cores = running_workers * self.worker_process_threads
        total_jobs = len(self.pending_jobs) + len(self.running_jobs)
        total_workers = total_jobs * self.worker_processes
        running_memory = running_workers * self.worker_memory / self.worker_processes

        return (self.__class__.__name__ +
                "(cores=%d, memory=%s, workers=%d/%d, jobs=%d/%d)" % (
                    running_cores,
                    format_bytes(running_memory),
                    running_workers,
                    total_workers,
                    len(self.running_jobs),
                    total_jobs,
                ))

    @property
    def pending_jobs(self):
        """ Jobs pending in the queue """
        return self._scheduler_plugin.pending_jobs
    @property
    def running_jobs(self):
        """ Jobs with currently active workers """
        return self._scheduler_plugin.running_jobs

    @property
    def finished_jobs(self):
        """ Jobs that have finished """
        return self._scheduler_plugin.finished_jobs

    @property
    def worker_process_threads(self):
        return int(self.worker_cores / self.worker_processes)

    @property
    def worker_process_memory(self):
        mem = format_bytes(self.worker_memory / self.worker_processes)
        mem = mem.replace(" ", "")
        return mem

    @property
    def jobqueue_worker_spec(self):
        """ Single worker process info needed for scaling on cores or memory """
        return {
            "cores": self.worker_process_threads,
            "memory": self.worker_process_memory,
        }

    @property
    def workers(self):
        """ Workers currently connected to the scheduler """
        return self.scheduler.workers

    def job_script(self):
        """ Construct a job submission script """
        pieces = {
            "shebang": self.shebang,
            "job_header": self.job_header,
            "env_header": self._env_header,
            "worker_command": self._command_template,
        }
        return self._script_template % pieces
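    # Illustrative rendering of job_script() for a hypothetical PBS-style
    # subclass. The header lines and addresses below are made up, and the
    # worker command is wrapped here for readability; real output depends on
    # the subclass that builds job_header:
    #
    #     #!/usr/bin/env bash
    #
    #     #PBS -N dask-worker
    #     #PBS -l select=1:ncpus=8:mem=24GB
    #
    #     export OMP_NUM_THREADS=1
    #
    #     /usr/bin/python -m distributed.cli.dask_worker tcp://10.0.0.1:8786 \
    #         --nthreads 4 --nprocs 2 --memory-limit 12.00GB \
    #         --name dask-worker--${JOB_ID}--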
    @contextmanager
    def job_file(self):
        """ Write job submission script to temporary file """
        with tmpfile(extension="sh") as fn:
            with open(fn, "w") as f:
                logger.debug("writing job script: \n%s", self.job_script())
                f.write(self.job_script())
            yield fn

    def _submit_job(self, script_filename):
        return self._call(shlex.split(self.submit_command) + [script_filename])

    def start_workers(self, n=1):
        """ Start workers and point them to our local scheduler """
        logger.debug("starting %s workers", n)
        num_jobs = int(math.ceil(n / self.worker_processes))
        for _ in range(num_jobs):
            with self.job_file() as fn:
                out = self._submit_job(fn)
                job = self._job_id_from_submit_output(out)
                if not job:
                    raise ValueError("Unable to parse jobid from output of %s" % out)
                logger.debug("started job: %s", job)
                self.pending_jobs[job] = {}

    @property
    def scheduler(self):
        """ The scheduler of this cluster """
        return self.local_cluster.scheduler

    def _call(self, cmd, **kwargs):
        """ Call a command using subprocess.Popen.

        This centralizes calls out to the command line, providing consistent
        outputs, logging, and an opportunity to go asynchronous in the future.

        Parameters
        ----------
        cmd : List(str)
            A command, given as a list of strings to hand to subprocess.Popen.

        Examples
        --------
        >>> self._call(['ls', '/foo'])

        Returns
        -------
        The stdout produced by the command, as a string.

        Raises
        ------
        RuntimeError if the command exits with a non-zero exit code
        """
        cmd_str = " ".join(cmd)
        logger.debug("Executing the following command on the command line\n{}".format(cmd_str))

        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs)

        out, err = proc.communicate()
        if six.PY3:
            out, err = out.decode(), err.decode()
        if proc.returncode != 0:
            raise RuntimeError("Command exited with non-zero exit code.\n"
                               "Exit code: {}\n"
                               "Command:\n{}\n"
                               "stdout:\n{}\n"
                               "stderr:\n{}\n".format(proc.returncode, cmd_str, out, err))
        return out

    def stop_workers(self, workers):
        """ Stop a list of workers """
        logger.debug("Stopping workers: %s", workers)
        if not workers:
            return
        jobs = self._del_pending_jobs()  # stop pending jobs too
        for w in workers:
            if isinstance(w, dict):
                jobs.append(_job_id_from_worker_name(w["name"]))
            else:
                jobs.append(_job_id_from_worker_name(w.name))
        self.stop_jobs(jobs)

    def stop_jobs(self, jobs):
        """ Stop a list of jobs """
        logger.debug("Stopping jobs: %s", jobs)
        if jobs:
            jobs = list(jobs)
            self._call(shlex.split(self.cancel_command) + list(set(jobs)))

        # If any of these jobs were pending, remove them now
        for job_id in jobs:
            if job_id in self.pending_jobs:
                del self.pending_jobs[job_id]

    def scale_up(self, n, **kwargs):
        """ Brings total worker count up to ``n`` """
        active_and_pending = self._count_active_and_pending_workers()
        if n >= active_and_pending:
            logger.debug("Scaling up to %d workers.", n)
            self.start_workers(n - active_and_pending)
        else:
            # scale_up should not be called if n < active + pending workers
            logger.warning("JobQueueCluster.scale_up was called with a"
                           " number of workers lower than what is already"
                           " running or pending")

    def _count_active_and_pending_workers(self):
        active_and_pending = (self._count_active_workers() +
                              self._count_pending_workers())
        logger.debug("Found %d active/pending workers.", active_and_pending)
        assert len(self.scheduler.workers) <= active_and_pending
        return active_and_pending

    def _count_active_workers(self):
        active_workers = sum(len(j) for j in self.running_jobs.values())
        assert len(self.scheduler.workers) == active_workers
        return active_workers

    def _count_pending_workers(self):
        return self.worker_processes * len(self.pending_jobs)

    def scale_down(self, workers, n=None):
        """ Close the workers with the given addresses """
        if n is None:
            # Adaptive currently calls scale_down directly, so we need to
            # handle this case: keep only the active workers minus those
            # adaptive wants to stop.
            n = self._count_active_workers() - len(workers)
        logger.debug("Scaling down to %d workers: %s", n, workers)

        active_and_pending = self._count_active_and_pending_workers()
        n_to_close = active_and_pending - n
        if n_to_close < 0:
            logger.warning("JobQueueCluster.scale_down was called with"
                           " a number of workers greater than what is"
                           " already running or pending.")
        elif n_to_close <= self._count_pending_workers():
            # We only need to cancel some pending jobs. Guard against
            # to_kill == 0, where a [-0:] slice would select every pending job.
            to_kill = int(n_to_close / self.worker_processes)
            if to_kill:
                jobs = list(self.pending_jobs.keys())[-to_kill:]
                logger.debug("%d jobs to stop, stopping jobs %s", to_kill, jobs)
                self.stop_jobs(jobs)
        else:
            worker_states = []
            for w in workers:
                try:
                    # Get the actual WorkerState
                    worker_states.append(self.scheduler.workers[w])
                except KeyError:
                    logger.debug("worker %s is already gone", w)
            self.stop_workers(worker_states)

    def stop_all_jobs(self):
        """ Stops all running and pending jobs """
        jobs = self._del_pending_jobs()
        jobs += list(self.running_jobs.keys())
        self.stop_jobs(set(jobs))
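    # Worked example of the scaling arithmetic above (illustrative numbers):
    # with worker_processes=2, 4 running workers and 3 pending jobs,
    # _count_active_and_pending_workers() == 4 + 2 * 3 == 10. A call to
    # scale_down(..., n=8) gives n_to_close == 2 <= 6 pending workers, so
    # int(2 / 2) == 1 pending job is cancelled and no running worker is
    # touched.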
    def close(self, **kwargs):
        """ Stops all running and pending jobs and stops scheduler """
        self.stop_all_jobs()
        return self.local_cluster.close(**kwargs)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()
        self.local_cluster.__exit__(type, value, traceback)

    def _del_pending_jobs(self):
        jobs = list(self.pending_jobs.keys())
        logger.debug("Deleting pending jobs %s", jobs)
        for job_id in jobs:
            del self.pending_jobs[job_id]
        return jobs

    def _job_id_from_submit_output(self, out):
        match = re.search(self.job_id_regexp, out)
        if match is None:
            msg = ("Could not parse job id from submission command "
                   "output.\nJob id regexp is {!r}\nSubmission command "
                   "output is:\n{}".format(self.job_id_regexp, out))
            raise ValueError(msg)

        job_id = match.groupdict().get("job_id")
        if job_id is None:
            msg = ("You need to use a 'job_id' named group in your regexp, "
                   "e.g. r'(?P<job_id>\\d+)'. Your regexp was: "
                   "{!r}".format(self.job_id_regexp))
            raise ValueError(msg)

        return job_id

    def worker_key(self, worker_state):
        return _job_id_from_worker_name(worker_state.name)
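
# ---------------------------------------------------------------------------
# Minimal sketch of how a scheduler-specific subclass is expected to extend
# JobQueueCluster. This is an illustration, not one of the shipped clusters:
# the SLURM-style commands and the "example" jobqueue.yaml section are
# assumptions, and a real subclass (see PBSCluster or SLURMCluster) builds a
# richer job_header from its own configuration.
# ---------------------------------------------------------------------------
class ExampleCluster(JobQueueCluster):
    submit_command = "sbatch"    # assumed SLURM-style submission command
    cancel_command = "scancel"   # assumed SLURM-style cancellation command

    def __init__(self, queue=None, walltime=None, config_name="example", **kwargs):
        # Assumes a "jobqueue.example" section exists in jobqueue.yaml to
        # provide defaults for cores, memory, processes, etc.
        super(ExampleCluster, self).__init__(config_name=config_name, **kwargs)

        # Build the scheduler-specific header consumed by job_script().
        header_lines = ["#SBATCH -J %s" % self.name,
                        "#SBATCH -n %d" % self.worker_cores]
        if queue is not None:
            header_lines.append("#SBATCH -p %s" % queue)
        if walltime is not None:
            header_lines.append("#SBATCH -t %s" % walltime)
        self.job_header = "\n".join(header_lines)


# Hypothetical usage, assuming the configuration section above exists:
#
#     cluster = ExampleCluster(cores=8, memory="24 GB", processes=2,
#                              queue="debug", walltime="01:00:00")
#     cluster.start_workers(4)       # submits ceil(4 / 2) = 2 jobs
#     print(cluster.job_script())    # shebang + job header + env + worker cmd
#     cluster.close()                # cancels running and pending jobs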