Example #1
class JobQueueCluster(ClusterManager):
    """ Base class to launch Dask Clusters for Job queues

    This class should not be used directly. Use an inherited class appropriate for your queueing system (e.g. PBSCluster
    or SLURMCluster).

    Parameters
    ----------
    name : str
        Name of Dask workers.
    cores : int
        Total number of cores per job
    memory: str
        Total amount of memory per job
    processes : int
        Number of processes per job
    interface : str
        Network interface like 'eth0' or 'ib0'.
    death_timeout : float
        Seconds to wait for a scheduler before closing workers
    local_directory : str
        Dask worker local directory for file spilling.
    extra : list
        Additional arguments to pass to `dask-worker`
    env_extra : list
        Other commands to add to script before launching worker.
    log_directory : str
        Directory to use for job scheduler logs.
    shebang : str
        Path to desired interpreter for your batch submission script.
    python : str
        Python executable used to launch Dask workers.
    config_name : str
        Section to use from jobqueue.yaml configuration file.
    kwargs : dict
        Additional keyword arguments to pass to `LocalCluster`

    Attributes
    ----------
    submit_command: str
        Abstract attribute for job scheduler submit command,
        should be overridden
    cancel_command: str
        Abstract attribute for job scheduler cancel command,
        should be overridden

    See Also
    --------
    PBSCluster
    SLURMCluster
    SGECluster
    OARCluster
    LSFCluster
    MoabCluster
    """

    _script_template = """
%(shebang)s

%(job_header)s

%(env_header)s

%(worker_command)s
""".lstrip()

    # Following class attributes should be overridden by extending classes.
    submit_command = None
    cancel_command = None
    job_id_regexp = r"(?P<job_id>\d+)"

    def __init__(self,
                 name=None,
                 cores=None,
                 memory=None,
                 processes=None,
                 interface=None,
                 death_timeout=None,
                 local_directory=None,
                 extra=None,
                 env_extra=None,
                 log_directory=None,
                 threads=None,
                 shebang=None,
                 python=sys.executable,
                 config_name=None,
                 **kwargs):
        """ """
        # """
        # This initializer should be considered as Abstract, and never used directly.
        # """
        super(JobQueueCluster, self).__init__()

        if threads is not None:
            raise ValueError(threads_deprecation_message)

        if config_name is None:
            raise NotImplementedError(
                "JobQueueCluster is an abstract class that should not be instantiated."
            )

        if name is None:
            name = dask.config.get("jobqueue.%s.name" % config_name)
        if cores is None:
            cores = dask.config.get("jobqueue.%s.cores" % config_name)
        if memory is None:
            memory = dask.config.get("jobqueue.%s.memory" % config_name)
        if processes is None:
            processes = dask.config.get("jobqueue.%s.processes" % config_name)
        if interface is None:
            interface = dask.config.get("jobqueue.%s.interface" % config_name)
        if death_timeout is None:
            death_timeout = dask.config.get("jobqueue.%s.death-timeout" %
                                            config_name)
        if local_directory is None:
            local_directory = dask.config.get("jobqueue.%s.local-directory" %
                                              config_name)
        if extra is None:
            extra = dask.config.get("jobqueue.%s.extra" % config_name)
        if env_extra is None:
            env_extra = dask.config.get("jobqueue.%s.env-extra" % config_name)
        if log_directory is None:
            log_directory = dask.config.get("jobqueue.%s.log-directory" %
                                            config_name)
        if shebang is None:
            shebang = dask.config.get("jobqueue.%s.shebang" % config_name)

        if dask.config.get("jobqueue.%s.threads", None):
            warnings.warn(threads_deprecation_message)

        if cores is None:
            raise ValueError(
                "You must specify how many cores to use per job like ``cores=8``"
            )

        if memory is None:
            raise ValueError(
                "You must specify how much memory to use per job like ``memory='24 GB'``"
            )

        # This attribute should be overridden
        self.job_header = None

        if interface:
            extra += ["--interface", interface]
            kwargs.setdefault("ip", get_ip_interface(interface))
        else:
            kwargs.setdefault("ip", "")

        # Bokeh diagnostics server should listen on all interfaces
        kwargs.setdefault("dashboard_address", ("", 8787))
        self.local_cluster = LocalCluster(n_workers=0, **kwargs)

        # Keep information on process, cores, and memory, for use in subclasses
        self.worker_memory = parse_bytes(
            memory) if memory is not None else None
        self.worker_processes = processes
        self.worker_cores = cores
        self.name = name

        # plugin for tracking job status
        self._scheduler_plugin = JobQueuePlugin()
        self.local_cluster.scheduler.add_plugin(self._scheduler_plugin)

        self._adaptive = None

        self.shebang = shebang

        self._env_header = "\n".join(env_extra)

        # dask-worker command line build
        dask_worker_command = "%(python)s -m distributed.cli.dask_worker" % dict(
            python=python)
        command_args = [dask_worker_command, self.scheduler.address]
        command_args += ["--nthreads", self.worker_process_threads]
        if processes is not None and processes > 1:
            command_args += ["--nprocs", processes]

        command_args += ["--memory-limit", self.worker_process_memory]
        command_args += ["--name", "%s--${JOB_ID}--" % name]

        if death_timeout is not None:
            command_args += ["--death-timeout", death_timeout]
        if local_directory is not None:
            command_args += ["--local-directory", local_directory]
        if extra is not None:
            command_args += extra

        self._command_template = " ".join(map(str, command_args))

        self.log_directory = log_directory
        if self.log_directory is not None:
            if not os.path.exists(self.log_directory):
                os.makedirs(self.log_directory)

    def __repr__(self):
        running_workers = self._count_active_workers()
        running_cores = running_workers * self.worker_process_threads
        total_jobs = len(self.pending_jobs) + len(self.running_jobs)
        total_workers = total_jobs * self.worker_processes
        running_memory = running_workers * self.worker_memory / self.worker_processes

        return (self.__class__.__name__ +
                "(cores=%d, memory=%s, workers=%d/%d, jobs=%d/%d)" % (
                    running_cores,
                    format_bytes(running_memory),
                    running_workers,
                    total_workers,
                    len(self.running_jobs),
                    total_jobs,
                ))

    @property
    def pending_jobs(self):
        """ Jobs pending in the queue """
        return self._scheduler_plugin.pending_jobs

    @property
    def running_jobs(self):
        """ Jobs with currently active workers """
        return self._scheduler_plugin.running_jobs

    @property
    def finished_jobs(self):
        """ Jobs that have finished """
        return self._scheduler_plugin.finished_jobs

    @property
    def worker_process_threads(self):
        return int(self.worker_cores / self.worker_processes)

    @property
    def worker_process_memory(self):
        mem = format_bytes(self.worker_memory / self.worker_processes)
        mem = mem.replace(" ", "")
        return mem

    @property
    def jobqueue_worker_spec(self):
        """ single worker process info needed for scaling on cores or memory """
        return {
            "cores": self.worker_process_threads,
            "memory": self.worker_process_memory,
        }

    @property
    def workers(self):
        """ workers currently connected to the scheduler """
        return self.scheduler.workers

    def job_script(self):
        """ Construct a job submission script """
        pieces = {
            "shebang": self.shebang,
            "job_header": self.job_header,
            "env_header": self._env_header,
            "worker_command": self._command_template,
        }
        return self._script_template % pieces

    @contextmanager
    def job_file(self):
        """ Write job submission script to temporary file """
        with tmpfile(extension="sh") as fn:
            with open(fn, "w") as f:
                logger.debug("writing job script: \n%s", self.job_script())
                f.write(self.job_script())
            yield fn

    def _submit_job(self, script_filename):
        return self._call(shlex.split(self.submit_command) + [script_filename])

    def start_workers(self, n=1):
        """ Start workers and point them to our local scheduler """
        logger.debug("starting %s workers", n)
        num_jobs = int(math.ceil(n / self.worker_processes))
        for _ in range(num_jobs):
            with self.job_file() as fn:
                out = self._submit_job(fn)
                job = self._job_id_from_submit_output(out)
                if not job:
                    raise ValueError(
                        "Unable to parse jobid from output of %s" % out)
                logger.debug("started job: %s", job)
                self.pending_jobs[job] = {}

    @property
    def scheduler(self):
        """ The scheduler of this cluster """
        return self.local_cluster.scheduler

    def _call(self, cmd, **kwargs):
        """ Call a command using subprocess.Popen.

        This centralizes calls out to the command line, providing consistent
        outputs, logging, and an opportunity to go asynchronous in the future.

        Parameters
        ----------
        cmd: List(str)
            A command, given as a list of strings to hand to
            subprocess.Popen

        Examples
        --------
        >>> self._call(['ls', '/foo'])

        Returns
        -------
        The stdout produced by the command, as a string.

        Raises
        ------
        RuntimeError if the command exits with a non-zero exit code
        """
        cmd_str = " ".join(cmd)
        logger.debug(
            "Executing the following command to command line\n{}".format(
                cmd_str))

        proc = subprocess.Popen(cmd,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                **kwargs)

        out, err = proc.communicate()
        if six.PY3:
            out, err = out.decode(), err.decode()
        if proc.returncode != 0:
            raise RuntimeError("Command exited with non-zero exit code.\n"
                               "Exit code: {}\n"
                               "Command:\n{}\n"
                               "stdout:\n{}\n"
                               "stderr:\n{}\n".format(proc.returncode, cmd_str,
                                                      out, err))
        return out

    def stop_workers(self, workers):
        """ Stop a list of workers"""
        logger.debug("Stopping workers: %s", workers)
        if not workers:
            return
        jobs = self._del_pending_jobs()  # stop pending jobs too
        for w in workers:
            if isinstance(w, dict):
                jobs.append(_job_id_from_worker_name(w["name"]))
            else:
                jobs.append(_job_id_from_worker_name(w.name))
        self.stop_jobs(jobs)

    def stop_jobs(self, jobs):
        """ Stop a list of jobs"""
        logger.debug("Stopping jobs: %s", jobs)
        if jobs:
            jobs = list(jobs)
            self._call(shlex.split(self.cancel_command) + list(set(jobs)))

        # if any of these jobs were pending, we should remove those now
        for job_id in jobs:
            if job_id in self.pending_jobs:
                del self.pending_jobs[job_id]

    def scale_up(self, n, **kwargs):
        """ Brings total worker count up to ``n`` """
        active_and_pending = self._count_active_and_pending_workers()
        if n >= active_and_pending:
            logger.debug("Scaling up to %d workers.", n)
            self.start_workers(n - active_and_pending)
        else:
            # scale_up should not be called if n < active + pending jobs
            logger.warning("JobQueueCluster.scale_up was called with a"
                           " number of workers lower that what is already"
                           " running or pending")

    def _count_active_and_pending_workers(self):
        active_and_pending = (self._count_active_workers() +
                              self._count_pending_workers())
        logger.debug("Found %d active/pending workers.", active_and_pending)
        assert len(self.scheduler.workers) <= active_and_pending
        return active_and_pending

    def _count_active_workers(self):
        active_workers = sum([len(j) for j in self.running_jobs.values()])
        assert len(self.scheduler.workers) == active_workers
        return active_workers

    def _count_pending_workers(self):
        return self.worker_processes * len(self.pending_jobs)

    def scale_down(self, workers, n=None):
        """ Close the workers with the given addresses """
        if n is None:
            # Adaptive currently calls scale_down directly; we need to handle this
            # Need to only keep active workers minus those adaptive wants to stop
            n = self._count_active_workers() - len(workers)
        logger.debug("Scaling down to %d Workers: %s", n, workers)
        active_and_pending = self._count_active_and_pending_workers()
        n_to_close = active_and_pending - n
        if n_to_close < 0:
            logger.warning("JobQueueCluster.scale_down was called with"
                           " a number of worker greater than what is"
                           " already running or pending.")
        elif n_to_close <= self._count_pending_workers():
            # We only need to kill some pending jobs,
            to_kill = int(n_to_close / self.worker_processes)
            jobs = list(self.pending_jobs.keys())[-to_kill:]
            logger.debug("%d jobs to stop, stopping jobs %s", to_kill, jobs)
            self.stop_jobs(jobs)
        else:
            worker_states = []
            for w in workers:
                try:
                    # Get the actual WorkerState
                    worker_states.append(self.scheduler.workers[w])
                except KeyError:
                    logger.debug("worker %s is already gone", w)
            self.stop_workers(worker_states)

    def stop_all_jobs(self):
        """ Stops all running and pending jobs """
        jobs = self._del_pending_jobs()
        jobs += list(self.running_jobs.keys())
        self.stop_jobs(set(jobs))

    def close(self, **kwargs):
        """ Stops all running and pending jobs and stops scheduler """
        self.stop_all_jobs()
        return self.local_cluster.close(**kwargs)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()
        self.local_cluster.__exit__(type, value, traceback)

    def _del_pending_jobs(self):
        jobs = list(self.pending_jobs.keys())
        logger.debug("Deleting pending jobs %s" % jobs)
        for job_id in jobs:
            del self.pending_jobs[job_id]
        return jobs

    def _job_id_from_submit_output(self, out):
        match = re.search(self.job_id_regexp, out)
        if match is None:
            msg = ("Could not parse job id from submission command "
                   "output.\nJob id regexp is {!r}\nSubmission command "
                   "output is:\n{}".format(self.job_id_regexp, out))
            raise ValueError(msg)

        job_id = match.groupdict().get("job_id")
        if job_id is None:
            msg = (
                "You need to use a 'job_id' named group in your regexp, e.g. "
                "r'(?P<job_id>\\d+)', in your regexp. Your regexp was: "
                "{!r}".format(self.job_id_regexp))
            raise ValueError(msg)

        return job_id

    def worker_key(self, worker_state):
        return _job_id_from_worker_name(worker_state.name)
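
The class above is abstract; in practice you instantiate one of the scheduler-specific subclasses listed in its docstring (PBSCluster, SLURMCluster, SGECluster, ...). A minimal usage sketch against the API shown in Example #1, assuming dask-jobqueue is installed and a SLURM scheduler is reachable; the queue name and resource values below are hypothetical placeholders:

# Minimal usage sketch, assuming dask-jobqueue and a reachable SLURM scheduler.
# Queue name and resource values are hypothetical placeholders.
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

cluster = SLURMCluster(
    cores=8,            # total cores per job (required)
    memory="24 GB",     # total memory per job (required)
    processes=2,        # dask-worker processes per job
    queue="regular",    # hypothetical queue name
)

print(cluster.job_script())   # inspect the generated submission script
cluster.start_workers(4)      # submits ceil(4 / processes) = 2 jobs to the queue
client = Client(cluster)      # connect a client to the embedded LocalCluster scheduler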
Example #2
class JobQueueCluster(Cluster):
    """ Base class to launch Dask Clusters for Job queues

    This class should not be used directly. Use an inherited class appropriate
    for your queueing system (e.g. PBSCluster or SLURMCluster).

    Parameters
    ----------
    name : str
        Name of Dask workers.
    cores : int
        Total number of cores per job
    memory: str
        Total amount of memory per job
    processes : int
        Number of processes per job
    interface : str
        Network interface like 'eth0' or 'ib0'.
    death_timeout : float
        Seconds to wait for a scheduler before closing workers
    local_directory : str
        Dask worker local directory for file spilling.
    extra : str
        Additional arguments to pass to `dask-worker`
    env_extra : list
        Other commands to add to script before launching worker.
    kwargs : dict
        Additional keyword arguments to pass to `LocalCluster`

    Attributes
    ----------
    submit_command: str
        Abstract attribute for job scheduler submit command,
        should be overridden
    cancel_command: str
        Abstract attribute for job scheduler cancel command,
        should be overridden

    See Also
    --------
    PBSCluster
    SLURMCluster
    """

    _script_template = """
#!/bin/bash

%(job_header)s

%(env_header)s

%(worker_command)s
""".lstrip()

    # Following class attributes should be overridden by extending classes.
    submit_command = None
    cancel_command = None
    scheduler_name = ''
    _adaptive_options = {
        'worker_key': lambda ws: _job_id_from_worker_name(ws.name)
    }

    def __init__(self,
                 name=None,
                 cores=None,
                 memory=None,
                 processes=None,
                 interface=None,
                 death_timeout=None,
                 local_directory=None,
                 extra=None,
                 env_extra=None,
                 walltime=None,
                 threads=None,
                 **kwargs):
        """ """
        # """
        # This initializer should be considered as Abstract, and never used
        # directly.
        # """
        if threads is not None:
            raise ValueError(threads_deprecation_message)

        if not self.scheduler_name:
            raise NotImplementedError('JobQueueCluster is an abstract class '
                                      'that should not be instantiated.')

        if name is None:
            name = dask.config.get('jobqueue.%s.name' % self.scheduler_name)
        if cores is None:
            cores = dask.config.get('jobqueue.%s.cores' % self.scheduler_name)
        if memory is None:
            memory = dask.config.get('jobqueue.%s.memory' %
                                     self.scheduler_name)
        if processes is None:
            processes = dask.config.get('jobqueue.%s.processes' %
                                        self.scheduler_name)
        if interface is None:
            interface = dask.config.get('jobqueue.%s.interface' %
                                        self.scheduler_name)
        if death_timeout is None:
            death_timeout = dask.config.get('jobqueue.%s.death-timeout' %
                                            self.scheduler_name)
        if local_directory is None:
            local_directory = dask.config.get('jobqueue.%s.local-directory' %
                                              self.scheduler_name)
        if extra is None:
            extra = dask.config.get('jobqueue.%s.extra' % self.scheduler_name)
        if interface:
            extra += ' --interface  %s ' % interface
        if env_extra is None:
            env_extra = dask.config.get('jobqueue.%s.env-extra' %
                                        self.scheduler_name)

        if dask.config.get('jobqueue.%s.threads' % self.scheduler_name, None):
            warnings.warn(threads_deprecation_message)

        if cores is None:
            raise ValueError("You must specify how many cores to use per job "
                             "like ``cores=8``")

        if memory is None:
            raise ValueError("You must specify how much memory to use per job "
                             "like ``memory='24 GB'``")

        # This attribute should be overridden
        self.job_header = None

        # Bind to all network addresses by default
        if 'ip' not in kwargs:
            kwargs['ip'] = ''

        self.local_cluster = LocalCluster(n_workers=0, **kwargs)

        # Keep information on process, threads and memory, for use in
        # subclasses
        self.worker_memory = parse_bytes(
            memory) if memory is not None else None
        self.worker_processes = processes
        self.worker_cores = cores
        self.name = name

        # plugin for tracking job status
        self._scheduler_plugin = JobQueuePlugin()
        self.local_cluster.scheduler.add_plugin(self._scheduler_plugin)

        self._adaptive = None

        self._env_header = '\n'.join(env_extra)

        # dask-worker command line build
        dask_worker_command = ('%(python)s -m distributed.cli.dask_worker' %
                               dict(python=sys.executable))
        self._command_template = ' '.join(
            [dask_worker_command, self.scheduler.address])
        self._command_template += " --nthreads %d" % self.worker_threads
        if processes is not None and processes > 1:
            self._command_template += " --nprocs %d" % processes

        mem = format_bytes(self.worker_memory / self.worker_processes)
        mem = mem.replace(' ', '')
        self._command_template += " --memory-limit %s" % mem
        self._command_template += " --name %s--${JOB_ID}--" % name

        if death_timeout is not None:
            self._command_template += " --death-timeout %s" % death_timeout
        if local_directory is not None:
            self._command_template += " --local-directory %s" % local_directory
        if extra is not None:
            self._command_template += extra

    def __repr__(self):
        running_workers = sum(
            len(value) for value in self.running_jobs.values())
        running_cores = running_workers * self.worker_threads
        total_jobs = len(self.pending_jobs) + len(self.running_jobs)
        total_workers = total_jobs * self.worker_processes
        running_memory = running_workers * self.worker_memory / self.worker_processes

        return (self.__class__.__name__ +
                '(cores=%d, memory=%s, workers=%d/%d, jobs=%d/%d)' %
                (running_cores, format_bytes(running_memory), running_workers,
                 total_workers, len(self.running_jobs), total_jobs))

    @property
    def pending_jobs(self):
        """ Jobs pending in the queue """
        return self._scheduler_plugin.pending_jobs

    @property
    def running_jobs(self):
        """ Jobs with currenly active workers """
        return self._scheduler_plugin.running_jobs

    @property
    def finished_jobs(self):
        """ Jobs that have finished """
        return self._scheduler_plugin.finished_jobs

    @property
    def worker_threads(self):
        return int(self.worker_cores / self.worker_processes)

    def job_script(self):
        """ Construct a job submission script """
        pieces = {
            'job_header': self.job_header,
            'env_header': self._env_header,
            'worker_command': self._command_template
        }
        return self._script_template % pieces

    @contextmanager
    def job_file(self):
        """ Write job submission script to temporary file """
        with tmpfile(extension='sh') as fn:
            with open(fn, 'w') as f:
                logger.debug("writing job script: \n%s" % self.job_script())
                f.write(self.job_script())
            yield fn

    def _submit_job(self, script_filename):
        return self._call(shlex.split(self.submit_command) + [script_filename])

    def start_workers(self, n=1):
        """ Start workers and point them to our local scheduler """
        logger.debug('starting %s workers' % n)
        num_jobs = math.ceil(n / self.worker_processes)
        for _ in range(num_jobs):
            with self.job_file() as fn:
                out = self._submit_job(fn)
                job = self._job_id_from_submit_output(out.decode())
                logger.debug("started job: %s" % job)
                self.pending_jobs[job] = {}

    @property
    def scheduler(self):
        """ The scheduler of this cluster """
        return self.local_cluster.scheduler

    def _calls(self, cmds, **kwargs):
        """ Call a command using subprocess.communicate

        This centralizes calls out to the command line, providing consistent
        outputs, logging, and an opportunity to go asynchronous in the future

        Parameters
        ----------
        cmds: List(List(str))
            A list of commands, each of which is a list of strings to hand to
            subprocess.communicate

        Examples
        --------
        >>> self._calls([['ls'], ['ls', '/foo']])

        Returns
        -------
        The stdout result as a string
        Also logs any stderr information
        """
        logger.debug("Submitting the following calls to command line")
        procs = []
        for cmd in cmds:
            logger.debug(' '.join(cmd))
            procs.append(
                subprocess.Popen(cmd,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 **kwargs))

        result = []
        for proc in procs:
            out, err = proc.communicate()
            if err:
                logger.error(err.decode())
            result.append(out)
        return result

    def _call(self, cmd, **kwargs):
        """ Singular version of _calls """
        return self._calls([cmd], **kwargs)[0]

    def stop_workers(self, workers):
        """ Stop a list of workers"""
        logger.debug("Stopping workers: %s" % workers)
        if not workers:
            return
        jobs = self._stop_pending_jobs()  # stop pending jobs too
        for w in workers:
            if isinstance(w, dict):
                jobs.append(_job_id_from_worker_name(w['name']))
            else:
                jobs.append(_job_id_from_worker_name(w.name))
        self.stop_jobs(set(jobs))

    def stop_jobs(self, jobs):
        """ Stop a list of jobs"""
        logger.debug("Stopping jobs: %s" % jobs)
        if jobs:
            jobs = list(jobs)
            self._call([self.cancel_command] + list(set(jobs)))

    def scale_up(self, n, **kwargs):
        """ Brings total worker count up to ``n`` """
        logger.debug("Scaling up to %d workers." % n)
        active_and_pending = sum([len(j) for j in self.running_jobs.values()])
        active_and_pending += self.worker_processes * len(self.pending_jobs)
        logger.debug("Found %d active/pending workers." % active_and_pending)
        self.start_workers(n - active_and_pending)

    def scale_down(self, workers):
        ''' Close the workers with the given addresses '''
        logger.debug("Scaling down. Workers: %s" % workers)
        worker_states = []
        for w in workers:
            try:
                # Get the actual WorkerState
                worker_states.append(self.scheduler.workers[w])
            except KeyError:
                logger.debug('worker %s is already gone' % w)
        self.stop_workers(worker_states)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        jobs = self._stop_pending_jobs()
        jobs += list(self.running_jobs.keys())
        self.stop_jobs(set(jobs))
        self.local_cluster.__exit__(type, value, traceback)

    def _stop_pending_jobs(self):
        jobs = list(self.pending_jobs.keys())
        logger.debug("Stopping pending jobs %s" % jobs)
        for job_id in jobs:
            del self.pending_jobs[job_id]
        return jobs

    def _job_id_from_submit_output(self, out):
        raise NotImplementedError('_job_id_from_submit_output must be '
                                  'implemented when JobQueueCluster is '
                                  'inherited. It should convert the stdout '
                                  'from submit_command to the job id')
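
This version leaves ``_job_id_from_submit_output`` abstract: a subclass must turn the scheduler's submit output into a job id. A minimal sketch of how a hypothetical subclass might do that with a regular expression; the attribute values and the assumed output format ("Submitted batch job 12345") depend on your scheduler, and the real subclasses shipped with dask-jobqueue (PBSCluster, SLURMCluster, ...) already provide this:

import re

class MySLURMCluster(JobQueueCluster):
    # Hypothetical subclass for illustration only; values depend on your site.
    submit_command = 'sbatch'
    cancel_command = 'scancel'
    scheduler_name = 'slurm'

    def _job_id_from_submit_output(self, out):
        # sbatch typically prints something like "Submitted batch job 12345"
        match = re.search(r'\d+', out)
        if match is None:
            raise ValueError('Could not parse job id from: %r' % out)
        return match.group()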
Example #3
class JobQueueCluster(Cluster):
    """ Base class to launch Dask Clusters for Job queues

    This class should not be used directly. Use an inherited class appropriate
    for your queueing system (e.g. PBSCluster or SLURMCluster).

    Parameters
    ----------
    name : str
        Name of Dask workers.
    threads : int
        Number of threads per process.
    processes : int
        Number of processes per node.
    memory : str
        Bytes of memory that the worker can use. This should be a string
        like "7GB" that can be interpretted both by PBS and Dask.
    interface : str
        Network interface like 'eth0' or 'ib0'.
    death_timeout : float
        Seconds to wait for a scheduler before closing workers
    local_directory : str
        Dask worker local directory for file spilling.
    extra : str
        Additional arguments to pass to `dask-worker`
    env_extra : list
        Other commands to add to script before launching worker.
    kwargs : dict
        Additional keyword arguments to pass to `LocalCluster`

    Attributes
    ----------
    submit_command: str
        Abstract attribute for job scheduler submit command,
        should be overridden
    cancel_command: str
        Abstract attribute for job scheduler cancel command,
        should be overridden

    See Also
    --------
    PBSCluster
    SLURMCluster
    """

    _script_template = """
#!/bin/bash

%(job_header)s

%(env_header)s

%(worker_command)s
""".lstrip()

    # Following class attributes should be overridden by extending classes.
    submit_command = None
    cancel_command = None

    def __init__(self,
                 name=dask.config.get('jobqueue.name'),
                 threads=dask.config.get('jobqueue.threads'),
                 processes=dask.config.get('jobqueue.processes'),
                 memory=dask.config.get('jobqueue.memory'),
                 interface=dask.config.get('jobqueue.interface'),
                 death_timeout=dask.config.get('jobqueue.death-timeout'),
                 local_directory=dask.config.get('jobqueue.local-directory'),
                 extra=dask.config.get('jobqueue.extra'),
                 env_extra=dask.config.get('jobqueue.env-extra'),
                 **kwargs):
        """ """
        # """
        # This initializer should be considered as Abstract, and never used
        # directly.
        # """
        if not self.cancel_command or not self.submit_command:
            raise NotImplementedError('JobQueueCluster is an abstract class '
                                      'that should not be instantiated.')

        # This attribute should be overridden
        self.job_header = None

        if interface:
            host = get_ip_interface(interface)
            extra += ' --interface  %s ' % interface
        else:
            host = socket.gethostname()

        self.cluster = LocalCluster(n_workers=0, ip=host, **kwargs)

        # Keep information on process, threads and memory, for use in
        # subclasses
        self.worker_memory = parse_bytes(
            memory) if memory is not None else None
        self.worker_processes = processes
        self.worker_threads = threads
        self.name = name

        self.jobs = dict()
        self.n = 0
        self._adaptive = None

        self._env_header = '\n'.join(env_extra)

        # dask-worker command line build
        dask_worker_command = ('%(python)s -m distributed.cli.dask_worker' %
                               dict(python=sys.executable))
        self._command_template = ' '.join(
            [dask_worker_command, self.scheduler.address])
        if threads is not None:
            self._command_template += " --nthreads %d" % threads
        if processes is not None:
            self._command_template += " --nprocs %d" % processes
        if memory is not None:
            self._command_template += " --memory-limit %s" % memory
        if name is not None:
            self._command_template += " --name %s" % name
            self._command_template += "-%(n)d"  # Keep %(n) to be replaced later
        if death_timeout is not None:
            self._command_template += " --death-timeout %s" % death_timeout
        if local_directory is not None:
            self._command_template += " --local-directory %s" % local_directory
        if extra is not None:
            self._command_template += extra

    def job_script(self):
        """ Construct a job submission script """
        self.n += 1
        template = self._command_template % {'n': self.n}
        return self._script_template % {
            'job_header': self.job_header,
            'env_header': self._env_header,
            'worker_command': template
        }

    @contextmanager
    def job_file(self):
        """ Write job submission script to temporary file """
        with tmpfile(extension='sh') as fn:
            with open(fn, 'w') as f:
                f.write(self.job_script())
            yield fn

    def start_workers(self, n=1):
        """ Start workers and point them to our local scheduler """
        workers = []
        for _ in range(n):
            with self.job_file() as fn:
                out = self._call(shlex.split(self.submit_command) + [fn])
                job = self._job_id_from_submit_output(out.decode())
                self.jobs[self.n] = job
                workers.append(self.n)
        return workers

    @property
    def scheduler(self):
        """ The scheduler of this cluster """
        return self.cluster.scheduler

    def _calls(self, cmds):
        """ Call a command using subprocess.communicate

        This centralizes calls out to the command line, providing consistent
        outputs, logging, and an opportunity to go asynchronous in the future

        Parameters
        ----------
        cmds: List(List(str))
            A list of commands, each of which is a list of strings to hand to
            subprocess.communicate

        Examples
        --------
        >>> self._calls([['ls'], ['ls', '/foo']])

        Returns
        -------
        The stdout result as a string
        Also logs any stderr information
        """
        logger.debug("Submitting the following calls to command line")
        for cmd in cmds:
            logger.debug(' '.join(cmd))
        procs = [
            subprocess.Popen(cmd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE) for cmd in cmds
        ]

        result = []
        for proc in procs:
            out, err = proc.communicate()
            if err:
                logger.error(err.decode())
            result.append(out)
        return result

    def _call(self, cmd):
        """ Singular version of _calls """
        return self._calls([cmd])[0]

    def stop_workers(self, workers):
        """ Stop a list of workers"""
        if not workers:
            return
        workers = list(map(int, workers))
        jobs = [self.jobs[w] for w in workers]
        self._call([self.cancel_command] + list(jobs))
        for w in workers:
            with ignoring(KeyError):
                del self.jobs[w]

    def scale_up(self, n, **kwargs):
        """ Brings total worker count up to ``n`` """
        return self.start_workers(n - len(self.jobs))

    def scale_down(self, workers):
        ''' Close the workers with the given addresses '''
        if isinstance(workers, dict):
            names = {v['name'] for v in workers.values()}
            job_ids = {name.split('-')[-2] for name in names}
            self.stop_workers(job_ids)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.stop_workers(self.jobs)
        self.cluster.__exit__(type, value, traceback)

    def _job_id_from_submit_output(self, out):
        raise NotImplementedError('_job_id_from_submit_output must be '
                                  'implemented when JobQueueCluster is '
                                  'inherited. It should convert the stdout '
                                  'from submit_command to the job id')
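
Example #3 builds the submission script in two %-substitution passes: ``job_script()`` first fills the running counter ``%(n)d`` into the worker command, then splices the job header, env header and worker command into ``_script_template``. A standalone sketch of that mechanism with hypothetical header values:

# Standalone sketch of the two-stage template substitution used in Example #3.
# The header strings and scheduler address are hypothetical.
script_template = """
#!/bin/bash

%(job_header)s

%(env_header)s

%(worker_command)s
""".lstrip()

command_template = "dask-worker tcp://scheduler:8786 --nthreads 4 --name alice-%(n)d"

pieces = {
    'job_header': '#PBS -l select=1:ncpus=4',       # hypothetical PBS resource header
    'env_header': 'module load python',              # hypothetical environment setup
    'worker_command': command_template % {'n': 1},   # fill the worker counter first
}
print(script_template % pieces)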
Example #4
class JobQueueCluster(Cluster):
    """ Base class to launch Dask Clusters for Job queues

    This class should not be used directly. Use an inherited class appropriate for your queueing system (e.g. PBSCluster
    or SLURMCluster).

    Parameters
    ----------
    name : str
        Name of Dask workers.
    cores : int
        Total number of cores per job
    memory: str
        Total amount of memory per job
    processes : int
        Number of processes per job
    interface : str
        Network interface like 'eth0' or 'ib0'.
    death_timeout : float
        Seconds to wait for a scheduler before closing workers
    local_directory : str
        Dask worker local directory for file spilling.
    extra : list
        Additional arguments to pass to `dask-worker`
    env_extra : list
        Other commands to add to script before launching worker.
    python : str
        Python executable used to launch Dask workers.
    kwargs : dict
        Additional keyword arguments to pass to `LocalCluster`

    Attributes
    ----------
    submit_command: str
        Abstract attribute for job scheduler submit command,
        should be overridden
    cancel_command: str
        Abstract attribute for job scheduler cancel command,
        should be overridden

    See Also
    --------
    PBSCluster
    SLURMCluster
    SGECluster
    OARCluster
    LSFCluster
    MoabCluster
    """

    _script_template = """
#!/bin/bash

%(job_header)s

%(env_header)s

%(worker_command)s
""".lstrip()

    # Following class attributes should be overridden by extending classes.
    submit_command = None
    cancel_command = None
    scheduler_name = ''
    _adaptive_options = {
        'worker_key': lambda ws: _job_id_from_worker_name(ws.name)
    }
    job_id_regexp = r'(?P<job_id>\d+)'

    def __init__(self,
                 name=None,
                 cores=None,
                 memory=None,
                 processes=None,
                 interface=None,
                 death_timeout=None,
                 local_directory=None,
                 extra=None,
                 env_extra=None,
                 log_directory=None,
                 walltime=None,
                 threads=None,
                 python=sys.executable,
                 **kwargs):
        """ """
        # """
        # This initializer should be considered as Abstract, and never used directly.
        # """
        if threads is not None:
            raise ValueError(threads_deprecation_message)

        if not self.scheduler_name:
            raise NotImplementedError(
                'JobQueueCluster is an abstract class that should not be instantiated.'
            )

        if name is None:
            name = dask.config.get('jobqueue.%s.name' % self.scheduler_name)
        if cores is None:
            cores = dask.config.get('jobqueue.%s.cores' % self.scheduler_name)
        if memory is None:
            memory = dask.config.get('jobqueue.%s.memory' %
                                     self.scheduler_name)
        if processes is None:
            processes = dask.config.get('jobqueue.%s.processes' %
                                        self.scheduler_name)
        if interface is None:
            interface = dask.config.get('jobqueue.%s.interface' %
                                        self.scheduler_name)
        if death_timeout is None:
            death_timeout = dask.config.get('jobqueue.%s.death-timeout' %
                                            self.scheduler_name)
        if local_directory is None:
            local_directory = dask.config.get('jobqueue.%s.local-directory' %
                                              self.scheduler_name)
        if extra is None:
            extra = dask.config.get('jobqueue.%s.extra' % self.scheduler_name)
        if env_extra is None:
            env_extra = dask.config.get('jobqueue.%s.env-extra' %
                                        self.scheduler_name)
        if log_directory is None:
            log_directory = dask.config.get('jobqueue.%s.log-directory' %
                                            self.scheduler_name)

        if dask.config.get('jobqueue.%s.threads' % self.scheduler_name, None):
            warnings.warn(threads_deprecation_message)

        if cores is None:
            raise ValueError(
                "You must specify how many cores to use per job like ``cores=8``"
            )

        if memory is None:
            raise ValueError(
                "You must specify how much memory to use per job like ``memory='24 GB'``"
            )

        # This attribute should be overridden
        self.job_header = None

        if interface:
            extra += ['--interface', interface]
            kwargs.setdefault('ip', get_ip_interface(interface))
        else:
            kwargs.setdefault('ip', '')

        # Bokeh diagnostics server should listen on all interfaces
        diagnostics_ip_and_port = ('', 8787)
        self.local_cluster = LocalCluster(
            n_workers=0, diagnostics_port=diagnostics_ip_and_port, **kwargs)

        # Keep information on process, cores, and memory, for use in subclasses
        self.worker_memory = parse_bytes(
            memory) if memory is not None else None
        self.worker_processes = processes
        self.worker_cores = cores
        self.name = name

        # plugin for tracking job status
        self._scheduler_plugin = JobQueuePlugin()
        self.local_cluster.scheduler.add_plugin(self._scheduler_plugin)

        self._adaptive = None

        self._env_header = '\n'.join(env_extra)

        # dask-worker command line build
        dask_worker_command = '%(python)s -m distributed.cli.dask_worker' % dict(
            python=python)
        command_args = [dask_worker_command, self.scheduler.address]
        command_args += ['--nthreads', self.worker_threads]
        if processes is not None and processes > 1:
            command_args += ['--nprocs', processes]

        mem = format_bytes(self.worker_memory / self.worker_processes)
        command_args += ['--memory-limit', mem.replace(' ', '')]
        command_args += ['--name', '%s--${JOB_ID}--' % name]

        if death_timeout is not None:
            command_args += ['--death-timeout', death_timeout]
        if local_directory is not None:
            command_args += ['--local-directory', local_directory]
        if extra is not None:
            command_args += extra

        self._command_template = ' '.join(map(str, command_args))

        self._target_scale = 0

        self.log_directory = log_directory
        if self.log_directory is not None:
            if not os.path.exists(self.log_directory):
                os.makedirs(self.log_directory)

    def __repr__(self):
        running_workers = self._count_active_workers()
        running_cores = running_workers * self.worker_threads
        total_jobs = len(self.pending_jobs) + len(self.running_jobs)
        total_workers = total_jobs * self.worker_processes
        running_memory = running_workers * self.worker_memory / self.worker_processes

        return (self.__class__.__name__ +
                '(cores=%d, memory=%s, workers=%d/%d, jobs=%d/%d)' %
                (running_cores, format_bytes(running_memory), running_workers,
                 total_workers, len(self.running_jobs), total_jobs))

    @property
    def pending_jobs(self):
        """ Jobs pending in the queue """
        return self._scheduler_plugin.pending_jobs

    @property
    def running_jobs(self):
        """ Jobs with currenly active workers """
        return self._scheduler_plugin.running_jobs

    @property
    def finished_jobs(self):
        """ Jobs that have finished """
        return self._scheduler_plugin.finished_jobs

    @property
    def worker_threads(self):
        return int(self.worker_cores / self.worker_processes)

    def job_script(self):
        """ Construct a job submission script """
        pieces = {
            'job_header': self.job_header,
            'env_header': self._env_header,
            'worker_command': self._command_template
        }
        return self._script_template % pieces

    @contextmanager
    def job_file(self):
        """ Write job submission script to temporary file """
        with tmpfile(extension='sh') as fn:
            with open(fn, 'w') as f:
                logger.debug("writing job script: \n%s", self.job_script())
                f.write(self.job_script())
            yield fn

    def _submit_job(self, script_filename):
        return self._call(shlex.split(self.submit_command) + [script_filename])

    def start_workers(self, n=1):
        """ Start workers and point them to our local scheduler """
        logger.debug('starting %s workers', n)
        num_jobs = int(math.ceil(n / self.worker_processes))
        for _ in range(num_jobs):
            with self.job_file() as fn:
                out = self._submit_job(fn)
                job = self._job_id_from_submit_output(out.decode())
                if not job:
                    raise ValueError(
                        'Unable to parse jobid from output of %s' % out)
                logger.debug("started job: %s", job)
                self.pending_jobs[job] = {}

    @property
    def scheduler(self):
        """ The scheduler of this cluster """
        return self.local_cluster.scheduler

    def _calls(self, cmds, **kwargs):
        """ Call a command using subprocess.communicate

        This centralizes calls out to the command line, providing consistent outputs, logging, and an opportunity
        to go asynchronous in the future

        Parameters
        ----------
        cmds: List(List(str))
            A list of commands, each of which is a list of strings to hand to subprocess.communicate

        Examples
        --------
        >>> self._calls([['ls'], ['ls', '/foo']])

        Returns
        -------
        The stdout result as a string
        Also logs any stderr information
        """
        logger.debug("Submitting the following calls to command line")
        procs = []
        for cmd in cmds:
            logger.debug(' '.join(cmd))
            procs.append(
                subprocess.Popen(cmd,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 **kwargs))

        result = []
        for proc in procs:
            out, err = proc.communicate()
            if err:
                raise RuntimeError(err.decode())
            result.append(out)
        return result

    def _call(self, cmd, **kwargs):
        """ Singular version of _calls """
        return self._calls([cmd], **kwargs)[0]

    def stop_workers(self, workers):
        """ Stop a list of workers"""
        logger.debug("Stopping workers: %s", workers)
        if not workers:
            return
        jobs = self._del_pending_jobs()  # stop pending jobs too
        for w in workers:
            if isinstance(w, dict):
                jobs.append(_job_id_from_worker_name(w['name']))
            else:
                jobs.append(_job_id_from_worker_name(w.name))
        self.stop_jobs(jobs)

    def stop_jobs(self, jobs):
        """ Stop a list of jobs"""
        logger.debug("Stopping jobs: %s", jobs)
        if jobs:
            jobs = list(jobs)
            self._call([self.cancel_command] + list(set(jobs)))

        # if any of these jobs were pending, we should remove those now
        for job_id in jobs:
            if job_id in self.pending_jobs:
                del self.pending_jobs[job_id]

    def scale_up(self, n, **kwargs):
        """ Brings total worker count up to ``n`` """
        active_and_pending = self._count_active_and_pending_workers()
        if n >= active_and_pending:
            logger.debug("Scaling up to %d workers.", n)
            self.start_workers(n - self._count_active_and_pending_workers())
        else:
            n_to_close = active_and_pending - n
            if n_to_close < self._count_pending_workers():
                # We only need to kill some pending jobs; this is actually a
                # scale down but could not be handled upstream
                to_kill = int(n_to_close / self.worker_processes)
                jobs = list(self.pending_jobs.keys())[-to_kill:]
                self.stop_jobs(jobs)
            else:
                # We should not end up here; a new scale call should not begin
                # until a scale_up or scale_down has ended
                raise RuntimeError('JobQueueCluster.scale_up was called with'
                                   ' a number of workers lower than the '
                                   'number of currently connected workers')

    def _count_active_and_pending_workers(self):
        active_and_pending = (self._count_active_workers() +
                              self._count_pending_workers())
        logger.debug("Found %d active/pending workers.", active_and_pending)
        assert len(self.scheduler.workers) <= active_and_pending
        return active_and_pending

    def _count_active_workers(self):
        active_workers = sum([len(j) for j in self.running_jobs.values()])
        assert len(self.scheduler.workers) == active_workers
        return active_workers

    def _count_pending_workers(self):
        return self.worker_processes * len(self.pending_jobs)

    def scale_down(self, workers):
        ''' Close the workers with the given addresses '''
        logger.debug("Scaling down. Workers: %s", workers)
        worker_states = []
        for w in workers:
            try:
                # Get the actual WorkerState
                worker_states.append(self.scheduler.workers[w])
            except KeyError:
                logger.debug('worker %s is already gone', w)
        self.stop_workers(worker_states)

    def stop_all_jobs(self):
        ''' Stops all running and pending jobs '''
        jobs = self._del_pending_jobs()
        jobs += list(self.running_jobs.keys())
        self.stop_jobs(set(jobs))

    def close(self):
        ''' Stops all running and pending jobs and stops scheduler '''
        self.stop_all_jobs()
        self.local_cluster.close()

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()
        self.local_cluster.__exit__(type, value, traceback)

    def _del_pending_jobs(self):
        jobs = list(self.pending_jobs.keys())
        logger.debug("Deleting pending jobs %s" % jobs)
        for job_id in jobs:
            del self.pending_jobs[job_id]
        return jobs

    def _job_id_from_submit_output(self, out):
        match = re.search(self.job_id_regexp, out)
        if match is None:
            msg = ('Could not parse job id from submission command '
                   "output.\nJob id regexp is {!r}\nSubmission command "
                   'output is:\n{}'.format(self.job_id_regexp, out))
            raise ValueError(msg)

        job_id = match.groupdict().get('job_id')
        if job_id is None:
            msg = (
                "You need to use a 'job_id' named group in your regexp, e.g. "
                "r'(?P<job_id>\\d+)', in your regexp. Your regexp was: "
                "{!r}".format(self.job_id_regexp))
            raise ValueError(msg)

        return job_id
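
Examples #1 and #4 centralize job-id parsing in ``_job_id_from_submit_output``, driven by the class attribute ``job_id_regexp`` and its required ``job_id`` named group. A short standalone sketch of that parsing, using hypothetical submit outputs:

import re

# Sketch of the job_id_regexp parsing used above; the output strings are hypothetical.
job_id_regexp = r'(?P<job_id>\d+)'

for out in ['Submitted batch job 42581', '12345.pbs-server']:
    match = re.search(job_id_regexp, out)
    assert match is not None
    print(match.groupdict()['job_id'])   # -> '42581', then '12345'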